{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999398785546805, "eval_steps": 500, "global_step": 8316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.05522966, "auxiliary_loss_mlp": 0.02590781, "balance_loss_clip": 2.64536452, "balance_loss_mlp": 1.97299075, "epoch": 0.00012024289063909097, "flos": 24932483919360.0, "grad_norm": 39.959389233178946, "language_loss": 2.58020949, "learning_rate": 0.0, "loss": 1.90769851, "num_input_tokens_seen": 20375, "step": 1, "time_per_iteration": 13.852697372436523 }, { "auxiliary_loss_clip": 0.03632502, "auxiliary_loss_mlp": 0.01837029, "balance_loss_clip": 1.75351596, "balance_loss_mlp": 1.40978336, "epoch": 0.00024048578127818193, "flos": 30664624377600.0, "grad_norm": 55.222108495375565, "language_loss": 1.88929272, "learning_rate": 5.021476677069823e-07, "loss": 1.94398785, "num_input_tokens_seen": 39035, "step": 2, "time_per_iteration": 2.606036901473999 }, { "auxiliary_loss_clip": 0.03663505, "auxiliary_loss_mlp": 0.01846548, "balance_loss_clip": 1.75958061, "balance_loss_mlp": 1.42044652, "epoch": 0.0003607286719172729, "flos": 19026227969280.0, "grad_norm": 41.36022516954795, "language_loss": 1.61462593, "learning_rate": 7.958852231401551e-07, "loss": 1.66972649, "num_input_tokens_seen": 57600, "step": 3, "time_per_iteration": 2.5667974948883057 }, { "auxiliary_loss_clip": 0.03664737, "auxiliary_loss_mlp": 0.01813388, "balance_loss_clip": 1.75705397, "balance_loss_mlp": 1.38766789, "epoch": 0.00048097156255636386, "flos": 19316314206720.0, "grad_norm": 37.79247770592117, "language_loss": 1.64462304, "learning_rate": 1.0042953354139647e-06, "loss": 1.69940424, "num_input_tokens_seen": 76465, "step": 4, "time_per_iteration": 2.5185658931732178 }, { "auxiliary_loss_clip": 0.03638942, "auxiliary_loss_mlp": 0.01791419, "balance_loss_clip": 1.75964463, "balance_loss_mlp": 1.36989546, "epoch": 0.0006012144531954548, "flos": 13991264893440.0, "grad_norm": 55.95538768236078, "language_loss": 1.93642855, "learning_rate": 1.1659507774310057e-06, "loss": 1.99073207, "num_input_tokens_seen": 94350, "step": 5, "time_per_iteration": 2.7835872173309326 }, { "auxiliary_loss_clip": 0.0362382, "auxiliary_loss_mlp": 0.01784419, "balance_loss_clip": 1.75020957, "balance_loss_mlp": 1.36213148, "epoch": 0.0007214573438345458, "flos": 23148988225920.0, "grad_norm": 46.647505750652364, "language_loss": 1.61134672, "learning_rate": 1.2980328908471373e-06, "loss": 1.66542923, "num_input_tokens_seen": 114595, "step": 6, "time_per_iteration": 2.7665836811065674 }, { "auxiliary_loss_clip": 0.03182577, "auxiliary_loss_mlp": 0.02561622, "balance_loss_clip": 1.87638187, "balance_loss_mlp": 2.07639289, "epoch": 0.0008417002344736367, "flos": 67663246170240.0, "grad_norm": 4.663428529582813, "language_loss": 0.81484556, "learning_rate": 1.4097067265369432e-06, "loss": 0.87228751, "num_input_tokens_seen": 179590, "step": 7, "time_per_iteration": 3.2300939559936523 }, { "auxiliary_loss_clip": 0.03651837, "auxiliary_loss_mlp": 0.01817808, "balance_loss_clip": 1.7623179, "balance_loss_mlp": 1.38102508, "epoch": 0.0009619431251127277, "flos": 21281381504640.0, "grad_norm": 41.83484627215433, "language_loss": 1.58042669, "learning_rate": 1.506443003120947e-06, "loss": 1.63512313, "num_input_tokens_seen": 195090, "step": 8, "time_per_iteration": 2.833540678024292 }, { "auxiliary_loss_clip": 0.03648767, "auxiliary_loss_mlp": 0.0180018, "balance_loss_clip": 1.7602185, "balance_loss_mlp": 1.37751174, "epoch": 0.0010821860157518186, "flos": 23331342597120.0, "grad_norm": 17.647925931087652, "language_loss": 1.47828412, "learning_rate": 1.5917704462803102e-06, "loss": 1.53277361, "num_input_tokens_seen": 211635, "step": 9, "time_per_iteration": 2.8345413208007812 }, { "auxiliary_loss_clip": 0.03638143, "auxiliary_loss_mlp": 0.01883886, "balance_loss_clip": 1.76045239, "balance_loss_mlp": 1.43642211, "epoch": 0.0012024289063909096, "flos": 17010166337280.0, "grad_norm": 13.441925346691395, "language_loss": 1.52881837, "learning_rate": 1.6680984451379884e-06, "loss": 1.58403873, "num_input_tokens_seen": 224705, "step": 10, "time_per_iteration": 2.7289352416992188 }, { "auxiliary_loss_clip": 0.03630843, "auxiliary_loss_mlp": 0.01831845, "balance_loss_clip": 1.76113594, "balance_loss_mlp": 1.39201081, "epoch": 0.0013226717970300007, "flos": 21288133261440.0, "grad_norm": 88.07430345037243, "language_loss": 1.3239522, "learning_rate": 1.7371455188905097e-06, "loss": 1.37857902, "num_input_tokens_seen": 244635, "step": 11, "time_per_iteration": 2.894083261489868 }, { "auxiliary_loss_clip": 0.0364087, "auxiliary_loss_mlp": 0.01805283, "balance_loss_clip": 1.75509739, "balance_loss_mlp": 1.37727404, "epoch": 0.0014429146876690916, "flos": 27237884935680.0, "grad_norm": 10.99645839364806, "language_loss": 1.25635886, "learning_rate": 1.8001805585541196e-06, "loss": 1.31082046, "num_input_tokens_seen": 265765, "step": 12, "time_per_iteration": 3.017770528793335 }, { "auxiliary_loss_clip": 0.03631688, "auxiliary_loss_mlp": 0.01789453, "balance_loss_clip": 1.76176453, "balance_loss_mlp": 1.37555814, "epoch": 0.0015631575783081825, "flos": 19062174504960.0, "grad_norm": 6.991011445766659, "language_loss": 1.30316961, "learning_rate": 1.8581671739548328e-06, "loss": 1.35738099, "num_input_tokens_seen": 283500, "step": 13, "time_per_iteration": 2.8324432373046875 }, { "auxiliary_loss_clip": 0.03604407, "auxiliary_loss_mlp": 0.01767689, "balance_loss_clip": 1.75366163, "balance_loss_mlp": 1.34540176, "epoch": 0.0016834004689472734, "flos": 48139473985920.0, "grad_norm": 6.5870170858909205, "language_loss": 1.14048624, "learning_rate": 1.9118543942439254e-06, "loss": 1.19420719, "num_input_tokens_seen": 305685, "step": 14, "time_per_iteration": 4.899405002593994 }, { "auxiliary_loss_clip": 0.03581378, "auxiliary_loss_mlp": 0.01852902, "balance_loss_clip": 1.75517178, "balance_loss_mlp": 1.41382992, "epoch": 0.0018036433595863645, "flos": 34970026314240.0, "grad_norm": 5.8363540887820795, "language_loss": 1.13214004, "learning_rate": 1.961836000571161e-06, "loss": 1.18648279, "num_input_tokens_seen": 327340, "step": 15, "time_per_iteration": 4.844825744628906 }, { "auxiliary_loss_clip": 0.03096392, "auxiliary_loss_mlp": 0.0230466, "balance_loss_clip": 1.85399413, "balance_loss_mlp": 1.85300004, "epoch": 0.0019238862502254555, "flos": 59768284440960.0, "grad_norm": 3.8482985469098647, "language_loss": 0.64698875, "learning_rate": 2.0085906708279293e-06, "loss": 0.70099926, "num_input_tokens_seen": 382710, "step": 16, "time_per_iteration": 3.2159314155578613 }, { "auxiliary_loss_clip": 0.0359194, "auxiliary_loss_mlp": 0.01750946, "balance_loss_clip": 1.75005352, "balance_loss_mlp": 1.34429979, "epoch": 0.0020441291408645466, "flos": 20814543417600.0, "grad_norm": 4.559246655102069, "language_loss": 1.16463542, "learning_rate": 2.0525099325728135e-06, "loss": 1.21806431, "num_input_tokens_seen": 400890, "step": 17, "time_per_iteration": 2.783622980117798 }, { "auxiliary_loss_clip": 0.03059251, "auxiliary_loss_mlp": 0.02191847, "balance_loss_clip": 1.84420633, "balance_loss_mlp": 1.7523936, "epoch": 0.0021643720315036373, "flos": 63857001582720.0, "grad_norm": 3.591449019384214, "language_loss": 0.72168851, "learning_rate": 2.0939181139872922e-06, "loss": 0.77419949, "num_input_tokens_seen": 462605, "step": 18, "time_per_iteration": 3.2346296310424805 }, { "auxiliary_loss_clip": 0.03576132, "auxiliary_loss_mlp": 0.01727062, "balance_loss_clip": 1.74956179, "balance_loss_mlp": 1.31698251, "epoch": 0.0022846149221427284, "flos": 31284981192960.0, "grad_norm": 7.0687691182440835, "language_loss": 1.01782107, "learning_rate": 2.1330868934640175e-06, "loss": 1.07085299, "num_input_tokens_seen": 483280, "step": 19, "time_per_iteration": 2.9212772846221924 }, { "auxiliary_loss_clip": 0.03017352, "auxiliary_loss_mlp": 0.02033127, "balance_loss_clip": 1.83129954, "balance_loss_mlp": 1.60588109, "epoch": 0.002404857812781819, "flos": 51083648161920.0, "grad_norm": 3.685133673059821, "language_loss": 0.76683497, "learning_rate": 2.170246112844971e-06, "loss": 0.81733978, "num_input_tokens_seen": 537620, "step": 20, "time_per_iteration": 3.0458717346191406 }, { "auxiliary_loss_clip": 0.03519634, "auxiliary_loss_mlp": 0.01692379, "balance_loss_clip": 1.74292088, "balance_loss_mlp": 1.28725851, "epoch": 0.0025251007034209102, "flos": 15815347309440.0, "grad_norm": 3.834373390180351, "language_loss": 1.02696991, "learning_rate": 2.2055919496770983e-06, "loss": 1.07909012, "num_input_tokens_seen": 555760, "step": 21, "time_per_iteration": 2.820862293243408 }, { "auxiliary_loss_clip": 0.03465839, "auxiliary_loss_mlp": 0.01641152, "balance_loss_clip": 1.73615575, "balance_loss_mlp": 1.24518657, "epoch": 0.0026453435940600014, "flos": 37851857458560.0, "grad_norm": 4.691066382142932, "language_loss": 0.90044791, "learning_rate": 2.2392931865974923e-06, "loss": 0.95151782, "num_input_tokens_seen": 578450, "step": 22, "time_per_iteration": 2.8796117305755615 }, { "auxiliary_loss_clip": 0.03434706, "auxiliary_loss_mlp": 0.01605782, "balance_loss_clip": 1.72172093, "balance_loss_mlp": 1.21000719, "epoch": 0.002765586484699092, "flos": 21141976821120.0, "grad_norm": 4.736230338074221, "language_loss": 1.02327931, "learning_rate": 2.271496085962064e-06, "loss": 1.07368422, "num_input_tokens_seen": 596145, "step": 23, "time_per_iteration": 2.803281307220459 }, { "auxiliary_loss_clip": 0.03431169, "auxiliary_loss_mlp": 0.01613253, "balance_loss_clip": 1.72523308, "balance_loss_mlp": 1.21938539, "epoch": 0.002885829375338183, "flos": 20667381396480.0, "grad_norm": 3.268671694397111, "language_loss": 1.02998853, "learning_rate": 2.3023282262611022e-06, "loss": 1.08043265, "num_input_tokens_seen": 614920, "step": 24, "time_per_iteration": 2.739081621170044 }, { "auxiliary_loss_clip": 0.0340286, "auxiliary_loss_mlp": 0.01605983, "balance_loss_clip": 1.72537446, "balance_loss_mlp": 1.22413158, "epoch": 0.003006072265977274, "flos": 34823869873920.0, "grad_norm": 3.427854410611465, "language_loss": 0.92613679, "learning_rate": 2.3319015548620114e-06, "loss": 0.97622526, "num_input_tokens_seen": 636060, "step": 25, "time_per_iteration": 2.8903372287750244 }, { "auxiliary_loss_clip": 0.03381005, "auxiliary_loss_mlp": 0.01562979, "balance_loss_clip": 1.7168473, "balance_loss_mlp": 1.17693162, "epoch": 0.003126315156616365, "flos": 24422021118720.0, "grad_norm": 2.2682039522436757, "language_loss": 0.93079919, "learning_rate": 2.3603148416618152e-06, "loss": 0.98023909, "num_input_tokens_seen": 655575, "step": 26, "time_per_iteration": 2.868494987487793 }, { "auxiliary_loss_clip": 0.0335297, "auxiliary_loss_mlp": 0.01572116, "balance_loss_clip": 1.71141779, "balance_loss_mlp": 1.20075512, "epoch": 0.003246558047255456, "flos": 23622326674560.0, "grad_norm": 3.5679979332300222, "language_loss": 1.011338, "learning_rate": 2.3876556694204647e-06, "loss": 1.06058884, "num_input_tokens_seen": 675730, "step": 27, "time_per_iteration": 2.8454933166503906 }, { "auxiliary_loss_clip": 0.03305168, "auxiliary_loss_mlp": 0.015601, "balance_loss_clip": 1.70532584, "balance_loss_mlp": 1.17786765, "epoch": 0.003366800937894547, "flos": 17820275725440.0, "grad_norm": 2.6463968480291524, "language_loss": 0.90842229, "learning_rate": 2.414002061950908e-06, "loss": 0.957075, "num_input_tokens_seen": 694605, "step": 28, "time_per_iteration": 2.881150245666504 }, { "auxiliary_loss_clip": 0.03300892, "auxiliary_loss_mlp": 0.01550419, "balance_loss_clip": 1.70263696, "balance_loss_mlp": 1.18191934, "epoch": 0.003487043828533638, "flos": 24426115269120.0, "grad_norm": 2.3301716036438194, "language_loss": 0.99952269, "learning_rate": 2.4394238264681557e-06, "loss": 1.04803574, "num_input_tokens_seen": 714340, "step": 29, "time_per_iteration": 2.9138989448547363 }, { "auxiliary_loss_clip": 0.03258603, "auxiliary_loss_mlp": 0.01529995, "balance_loss_clip": 1.69115734, "balance_loss_mlp": 1.16263986, "epoch": 0.003607286719172729, "flos": 26140311002880.0, "grad_norm": 2.1084129389796704, "language_loss": 0.99653149, "learning_rate": 2.4639836682781433e-06, "loss": 1.04441738, "num_input_tokens_seen": 734470, "step": 30, "time_per_iteration": 2.8211753368377686 }, { "auxiliary_loss_clip": 0.03240719, "auxiliary_loss_mlp": 0.01542159, "balance_loss_clip": 1.69893658, "balance_loss_mlp": 1.16908121, "epoch": 0.00372752960981182, "flos": 20593082113920.0, "grad_norm": 2.5785210322117678, "language_loss": 1.00235105, "learning_rate": 2.487738122623307e-06, "loss": 1.05017972, "num_input_tokens_seen": 753380, "step": 31, "time_per_iteration": 2.789303779602051 }, { "auxiliary_loss_clip": 0.03145214, "auxiliary_loss_mlp": 0.0154576, "balance_loss_clip": 1.67964745, "balance_loss_mlp": 1.17821372, "epoch": 0.003847772500450911, "flos": 22674608282880.0, "grad_norm": 2.794315149536335, "language_loss": 0.99074543, "learning_rate": 2.510738338534912e-06, "loss": 1.03765512, "num_input_tokens_seen": 772105, "step": 32, "time_per_iteration": 2.7724082469940186 }, { "auxiliary_loss_clip": 0.03043265, "auxiliary_loss_mlp": 0.01502257, "balance_loss_clip": 1.65486467, "balance_loss_mlp": 1.14863467, "epoch": 0.003968015391090002, "flos": 17967796882560.0, "grad_norm": 2.417227573981816, "language_loss": 1.02644849, "learning_rate": 2.5330307420306648e-06, "loss": 1.07190371, "num_input_tokens_seen": 788955, "step": 33, "time_per_iteration": 2.7321553230285645 }, { "auxiliary_loss_clip": 0.03016225, "auxiliary_loss_mlp": 0.01518035, "balance_loss_clip": 1.64593124, "balance_loss_mlp": 1.16384006, "epoch": 0.004088258281729093, "flos": 27304103658240.0, "grad_norm": 2.379581634898288, "language_loss": 0.88219488, "learning_rate": 2.554657600279796e-06, "loss": 0.92753744, "num_input_tokens_seen": 810230, "step": 34, "time_per_iteration": 2.9059810638427734 }, { "auxiliary_loss_clip": 0.0295359, "auxiliary_loss_mlp": 0.01476312, "balance_loss_clip": 1.63753057, "balance_loss_mlp": 1.1251694, "epoch": 0.004208501172368184, "flos": 23258587599360.0, "grad_norm": 3.620445545652125, "language_loss": 1.03329837, "learning_rate": 2.5756575039679493e-06, "loss": 1.07759738, "num_input_tokens_seen": 829780, "step": 35, "time_per_iteration": 2.7663886547088623 }, { "auxiliary_loss_clip": 0.0294686, "auxiliary_loss_mlp": 0.01467617, "balance_loss_clip": 1.63004994, "balance_loss_mlp": 1.11647391, "epoch": 0.0043287440630072746, "flos": 17312104062720.0, "grad_norm": 1.9703106266141515, "language_loss": 0.95084453, "learning_rate": 2.5960657816942747e-06, "loss": 0.99498928, "num_input_tokens_seen": 848695, "step": 36, "time_per_iteration": 2.7243058681488037 }, { "auxiliary_loss_clip": 0.02333558, "auxiliary_loss_mlp": 0.01460765, "balance_loss_clip": 1.59866393, "balance_loss_mlp": 1.11591613, "epoch": 0.004448986953646365, "flos": 53092491160320.0, "grad_norm": 1.396783198763894, "language_loss": 0.60995269, "learning_rate": 2.6159148575788668e-06, "loss": 0.64789593, "num_input_tokens_seen": 906730, "step": 37, "time_per_iteration": 3.2962465286254883 }, { "auxiliary_loss_clip": 0.02851891, "auxiliary_loss_mlp": 0.01442643, "balance_loss_clip": 1.61721313, "balance_loss_mlp": 1.11553288, "epoch": 0.004569229844285457, "flos": 13444165866240.0, "grad_norm": 2.227749349295963, "language_loss": 0.98782432, "learning_rate": 2.635234561171e-06, "loss": 1.03076959, "num_input_tokens_seen": 925125, "step": 38, "time_per_iteration": 2.758056640625 }, { "auxiliary_loss_clip": 0.02804274, "auxiliary_loss_mlp": 0.01447186, "balance_loss_clip": 1.6008755, "balance_loss_mlp": 1.11874032, "epoch": 0.0046894727349245475, "flos": 16209609966720.0, "grad_norm": 2.431405386305459, "language_loss": 0.94095802, "learning_rate": 2.6540523970949877e-06, "loss": 0.98347265, "num_input_tokens_seen": 939970, "step": 39, "time_per_iteration": 2.832554817199707 }, { "auxiliary_loss_clip": 0.02782462, "auxiliary_loss_mlp": 0.01440369, "balance_loss_clip": 1.59807849, "balance_loss_mlp": 1.11878967, "epoch": 0.004809715625563638, "flos": 23914244505600.0, "grad_norm": 6.681958807027286, "language_loss": 0.92409486, "learning_rate": 2.6723937805519533e-06, "loss": 0.96632314, "num_input_tokens_seen": 957470, "step": 40, "time_per_iteration": 3.8860015869140625 }, { "auxiliary_loss_clip": 0.02762995, "auxiliary_loss_mlp": 0.0142116, "balance_loss_clip": 1.5884378, "balance_loss_mlp": 1.10358679, "epoch": 0.00492995851620273, "flos": 20773030273920.0, "grad_norm": 2.1348270294182816, "language_loss": 0.93058491, "learning_rate": 2.690282243737839e-06, "loss": 0.97242641, "num_input_tokens_seen": 976405, "step": 41, "time_per_iteration": 4.615433931350708 }, { "auxiliary_loss_clip": 0.02718485, "auxiliary_loss_mlp": 0.014265, "balance_loss_clip": 1.57469618, "balance_loss_mlp": 1.10873628, "epoch": 0.0050502014068418205, "flos": 20338655103360.0, "grad_norm": 2.6724820973458274, "language_loss": 0.99413627, "learning_rate": 2.7077396173840807e-06, "loss": 1.03558612, "num_input_tokens_seen": 994690, "step": 42, "time_per_iteration": 2.8047924041748047 }, { "auxiliary_loss_clip": 0.02702448, "auxiliary_loss_mlp": 0.01407832, "balance_loss_clip": 1.56967223, "balance_loss_mlp": 1.09598076, "epoch": 0.005170444297480911, "flos": 25994872834560.0, "grad_norm": 2.536414945421854, "language_loss": 0.92830485, "learning_rate": 2.7247861909342594e-06, "loss": 0.96940768, "num_input_tokens_seen": 1015615, "step": 43, "time_per_iteration": 2.8611667156219482 }, { "auxiliary_loss_clip": 0.02633912, "auxiliary_loss_mlp": 0.01402096, "balance_loss_clip": 1.55660129, "balance_loss_mlp": 1.09367764, "epoch": 0.005290687188120003, "flos": 20954055841920.0, "grad_norm": 2.262090877455689, "language_loss": 0.8299067, "learning_rate": 2.7414408543044743e-06, "loss": 0.87026674, "num_input_tokens_seen": 1031255, "step": 44, "time_per_iteration": 2.8083693981170654 }, { "auxiliary_loss_clip": 0.02622778, "auxiliary_loss_mlp": 0.01370535, "balance_loss_clip": 1.54817152, "balance_loss_mlp": 1.07756662, "epoch": 0.005410930078759093, "flos": 15851401585920.0, "grad_norm": 3.9235495453116735, "language_loss": 0.79291832, "learning_rate": 2.7577212237113157e-06, "loss": 0.83285141, "num_input_tokens_seen": 1048295, "step": 45, "time_per_iteration": 2.7640068531036377 }, { "auxiliary_loss_clip": 0.0260364, "auxiliary_loss_mlp": 0.01368179, "balance_loss_clip": 1.55213296, "balance_loss_mlp": 1.07120454, "epoch": 0.005531172969398184, "flos": 21104988791040.0, "grad_norm": 1.906808079105271, "language_loss": 1.04227889, "learning_rate": 2.7736437536690466e-06, "loss": 1.08199704, "num_input_tokens_seen": 1067925, "step": 46, "time_per_iteration": 2.784273386001587 }, { "auxiliary_loss_clip": 0.02545901, "auxiliary_loss_mlp": 0.01380655, "balance_loss_clip": 1.53280592, "balance_loss_mlp": 1.08368075, "epoch": 0.005651415860037276, "flos": 20844887431680.0, "grad_norm": 1.9470111141037585, "language_loss": 1.07806277, "learning_rate": 2.789223836941131e-06, "loss": 1.11732841, "num_input_tokens_seen": 1088060, "step": 47, "time_per_iteration": 2.812368869781494 }, { "auxiliary_loss_clip": 0.02515857, "auxiliary_loss_mlp": 0.01374485, "balance_loss_clip": 1.5228256, "balance_loss_mlp": 1.07922733, "epoch": 0.005771658750676366, "flos": 13260195383040.0, "grad_norm": 2.7628809036325483, "language_loss": 1.08543468, "learning_rate": 2.8044758939680847e-06, "loss": 1.12433815, "num_input_tokens_seen": 1104130, "step": 48, "time_per_iteration": 2.7771575450897217 }, { "auxiliary_loss_clip": 0.02495534, "auxiliary_loss_mlp": 0.01357212, "balance_loss_clip": 1.52111685, "balance_loss_mlp": 1.06862974, "epoch": 0.005891901641315457, "flos": 24425396997120.0, "grad_norm": 3.94218489439494, "language_loss": 1.01832974, "learning_rate": 2.8194134530738863e-06, "loss": 1.05685723, "num_input_tokens_seen": 1122900, "step": 49, "time_per_iteration": 2.8108222484588623 }, { "auxiliary_loss_clip": 0.02463959, "auxiliary_loss_mlp": 0.01367268, "balance_loss_clip": 1.51679218, "balance_loss_mlp": 1.09432626, "epoch": 0.006012144531954548, "flos": 23076197314560.0, "grad_norm": 2.6058052112778967, "language_loss": 0.90180928, "learning_rate": 2.834049222568994e-06, "loss": 0.94012153, "num_input_tokens_seen": 1140250, "step": 50, "time_per_iteration": 2.835986614227295 }, { "auxiliary_loss_clip": 0.02443993, "auxiliary_loss_mlp": 0.01360499, "balance_loss_clip": 1.4968822, "balance_loss_mlp": 1.08030939, "epoch": 0.006132387422593639, "flos": 22528775064960.0, "grad_norm": 1.9692993513084256, "language_loss": 0.92425424, "learning_rate": 2.848395155712969e-06, "loss": 0.96229911, "num_input_tokens_seen": 1160470, "step": 51, "time_per_iteration": 2.790130853652954 }, { "auxiliary_loss_clip": 0.02406854, "auxiliary_loss_mlp": 0.01344013, "balance_loss_clip": 1.49677753, "balance_loss_mlp": 1.0764116, "epoch": 0.00625263031323273, "flos": 27628340751360.0, "grad_norm": 2.336739477002857, "language_loss": 0.97854984, "learning_rate": 2.8624625093687977e-06, "loss": 1.01605856, "num_input_tokens_seen": 1177605, "step": 52, "time_per_iteration": 2.8293187618255615 }, { "auxiliary_loss_clip": 0.02419784, "auxiliary_loss_mlp": 0.0134297, "balance_loss_clip": 1.4933517, "balance_loss_mlp": 1.07422471, "epoch": 0.006372873203871821, "flos": 23110671392640.0, "grad_norm": 2.213871365771904, "language_loss": 0.88804996, "learning_rate": 2.876261897070029e-06, "loss": 0.92567742, "num_input_tokens_seen": 1197735, "step": 53, "time_per_iteration": 2.7627129554748535 }, { "auxiliary_loss_clip": 0.02393073, "auxiliary_loss_mlp": 0.01350611, "balance_loss_clip": 1.4872582, "balance_loss_mlp": 1.08739686, "epoch": 0.006493116094510912, "flos": 22856028900480.0, "grad_norm": 3.9318861556549556, "language_loss": 0.92237782, "learning_rate": 2.889803337127447e-06, "loss": 0.95981473, "num_input_tokens_seen": 1216335, "step": 54, "time_per_iteration": 2.8229286670684814 }, { "auxiliary_loss_clip": 0.02349414, "auxiliary_loss_mlp": 0.01339553, "balance_loss_clip": 1.47195947, "balance_loss_mlp": 1.08072627, "epoch": 0.006613358985150003, "flos": 23071708114560.0, "grad_norm": 4.295284844609182, "language_loss": 0.84577686, "learning_rate": 2.903096296321516e-06, "loss": 0.88266653, "num_input_tokens_seen": 1234480, "step": 55, "time_per_iteration": 2.8402016162872314 }, { "auxiliary_loss_clip": 0.02329295, "auxiliary_loss_mlp": 0.01327911, "balance_loss_clip": 1.46879363, "balance_loss_mlp": 1.07080078, "epoch": 0.006733601875789094, "flos": 26537662229760.0, "grad_norm": 2.037345760303718, "language_loss": 0.91645646, "learning_rate": 2.9161497296578907e-06, "loss": 0.95302856, "num_input_tokens_seen": 1253870, "step": 56, "time_per_iteration": 2.9132657051086426 }, { "auxiliary_loss_clip": 0.02312117, "auxiliary_loss_mlp": 0.01324004, "balance_loss_clip": 1.46269095, "balance_loss_mlp": 1.07375956, "epoch": 0.006853844766428185, "flos": 15523178083200.0, "grad_norm": 2.2876120443342147, "language_loss": 0.85939121, "learning_rate": 2.928972116604173e-06, "loss": 0.89575243, "num_input_tokens_seen": 1270145, "step": 57, "time_per_iteration": 2.8697924613952637 }, { "auxiliary_loss_clip": 0.02307321, "auxiliary_loss_mlp": 0.013089, "balance_loss_clip": 1.46140599, "balance_loss_mlp": 1.06266141, "epoch": 0.006974087657067276, "flos": 24243760897920.0, "grad_norm": 2.215868355765223, "language_loss": 1.01893282, "learning_rate": 2.9415714941751377e-06, "loss": 1.05509496, "num_input_tokens_seen": 1291365, "step": 58, "time_per_iteration": 2.884263515472412 }, { "auxiliary_loss_clip": 0.02295593, "auxiliary_loss_mlp": 0.01338668, "balance_loss_clip": 1.45536458, "balance_loss_mlp": 1.08594465, "epoch": 0.007094330547706367, "flos": 25772513690880.0, "grad_norm": 2.157387826079313, "language_loss": 0.93499863, "learning_rate": 2.9539554871897396e-06, "loss": 0.97134125, "num_input_tokens_seen": 1311535, "step": 59, "time_per_iteration": 2.928145170211792 }, { "auxiliary_loss_clip": 0.02250255, "auxiliary_loss_mlp": 0.01314897, "balance_loss_clip": 1.44465089, "balance_loss_mlp": 1.08239126, "epoch": 0.007214573438345458, "flos": 21319015979520.0, "grad_norm": 2.0050131455445106, "language_loss": 0.97233284, "learning_rate": 2.9661313359851253e-06, "loss": 1.0079844, "num_input_tokens_seen": 1329420, "step": 60, "time_per_iteration": 2.79606294631958 }, { "auxiliary_loss_clip": 0.02205418, "auxiliary_loss_mlp": 0.01309723, "balance_loss_clip": 1.42492557, "balance_loss_mlp": 1.07340288, "epoch": 0.007334816328984549, "flos": 24937088192640.0, "grad_norm": 2.5598486852843183, "language_loss": 0.94003093, "learning_rate": 2.978105921839922e-06, "loss": 0.97518241, "num_input_tokens_seen": 1349965, "step": 61, "time_per_iteration": 2.7519116401672363 }, { "auxiliary_loss_clip": 0.0219967, "auxiliary_loss_mlp": 0.01284533, "balance_loss_clip": 1.42898417, "balance_loss_mlp": 1.06137347, "epoch": 0.00745505921962364, "flos": 18510586277760.0, "grad_norm": 3.2075477236786467, "language_loss": 0.72128528, "learning_rate": 2.9898857903302893e-06, "loss": 0.75612736, "num_input_tokens_seen": 1368915, "step": 62, "time_per_iteration": 2.7955236434936523 }, { "auxiliary_loss_clip": 0.02182832, "auxiliary_loss_mlp": 0.01294951, "balance_loss_clip": 1.42026067, "balance_loss_mlp": 1.07427096, "epoch": 0.007575302110262731, "flos": 18477656484480.0, "grad_norm": 8.0312671555668, "language_loss": 0.87814838, "learning_rate": 3.001477172817253e-06, "loss": 0.9129262, "num_input_tokens_seen": 1386805, "step": 63, "time_per_iteration": 2.7082722187042236 }, { "auxiliary_loss_clip": 0.02167074, "auxiliary_loss_mlp": 0.01287938, "balance_loss_clip": 1.41773105, "balance_loss_mlp": 1.06630456, "epoch": 0.007695545000901822, "flos": 24973178382720.0, "grad_norm": 2.646833294908757, "language_loss": 0.9628582, "learning_rate": 3.012886006241894e-06, "loss": 0.99740827, "num_input_tokens_seen": 1406190, "step": 64, "time_per_iteration": 2.841093063354492 }, { "auxiliary_loss_clip": 0.02147076, "auxiliary_loss_mlp": 0.0130192, "balance_loss_clip": 1.40585971, "balance_loss_mlp": 1.09096742, "epoch": 0.007815787891540913, "flos": 21324223451520.0, "grad_norm": 2.0799211071023085, "language_loss": 0.88262475, "learning_rate": 3.0241179513858383e-06, "loss": 0.91711473, "num_input_tokens_seen": 1425500, "step": 65, "time_per_iteration": 2.72639536857605 }, { "auxiliary_loss_clip": 0.02131724, "auxiliary_loss_mlp": 0.01270747, "balance_loss_clip": 1.40083849, "balance_loss_mlp": 1.06484842, "epoch": 0.007936030782180003, "flos": 21575777374080.0, "grad_norm": 5.2575238192100215, "language_loss": 0.87759936, "learning_rate": 3.035178409737647e-06, "loss": 0.91162401, "num_input_tokens_seen": 1442950, "step": 66, "time_per_iteration": 2.805121421813965 }, { "auxiliary_loss_clip": 0.02112536, "auxiliary_loss_mlp": 0.01285102, "balance_loss_clip": 1.38967538, "balance_loss_mlp": 1.08206439, "epoch": 0.008056273672819095, "flos": 20120785159680.0, "grad_norm": 3.088809201374108, "language_loss": 0.88891691, "learning_rate": 3.046072539090907e-06, "loss": 0.92289329, "num_input_tokens_seen": 1460915, "step": 67, "time_per_iteration": 5.871342182159424 }, { "auxiliary_loss_clip": 0.02082619, "auxiliary_loss_mlp": 0.01266017, "balance_loss_clip": 1.38562179, "balance_loss_mlp": 1.06812978, "epoch": 0.008176516563458186, "flos": 18333116156160.0, "grad_norm": 2.37704662054924, "language_loss": 1.04547548, "learning_rate": 3.056805267986779e-06, "loss": 1.07896185, "num_input_tokens_seen": 1478385, "step": 68, "time_per_iteration": 3.702235460281372 }, { "auxiliary_loss_clip": 0.02063017, "auxiliary_loss_mlp": 0.01248474, "balance_loss_clip": 1.37967849, "balance_loss_mlp": 1.06708539, "epoch": 0.008296759454097276, "flos": 21872076664320.0, "grad_norm": 2.154767475985757, "language_loss": 0.95365626, "learning_rate": 3.0673813091022194e-06, "loss": 0.98677111, "num_input_tokens_seen": 1497605, "step": 69, "time_per_iteration": 2.951131820678711 }, { "auxiliary_loss_clip": 0.01867309, "auxiliary_loss_mlp": 0.01492648, "balance_loss_clip": 1.42686749, "balance_loss_mlp": 1.27749944, "epoch": 0.008417002344736368, "flos": 63408228036480.0, "grad_norm": 1.3259522234727676, "language_loss": 0.62134254, "learning_rate": 3.0778051716749317e-06, "loss": 0.65494215, "num_input_tokens_seen": 1561150, "step": 70, "time_per_iteration": 3.535900592803955 }, { "auxiliary_loss_clip": 0.02019836, "auxiliary_loss_mlp": 0.01240379, "balance_loss_clip": 1.35775054, "balance_loss_mlp": 1.06042027, "epoch": 0.008537245235375458, "flos": 22966454286720.0, "grad_norm": 2.562060918397283, "language_loss": 0.90286183, "learning_rate": 3.0880811730470094e-06, "loss": 0.93546402, "num_input_tokens_seen": 1580605, "step": 71, "time_per_iteration": 3.063695192337036 }, { "auxiliary_loss_clip": 0.01839251, "auxiliary_loss_mlp": 0.01405083, "balance_loss_clip": 1.4135735, "balance_loss_mlp": 1.2051934, "epoch": 0.008657488126014549, "flos": 61984046712960.0, "grad_norm": 1.2017501264364403, "language_loss": 0.5858283, "learning_rate": 3.098213449401257e-06, "loss": 0.61827159, "num_input_tokens_seen": 1647535, "step": 72, "time_per_iteration": 3.3229293823242188 }, { "auxiliary_loss_clip": 0.01972364, "auxiliary_loss_mlp": 0.01248226, "balance_loss_clip": 1.34065008, "balance_loss_mlp": 1.06941187, "epoch": 0.00877773101665364, "flos": 30296791152000.0, "grad_norm": 3.108581936499564, "language_loss": 0.98833549, "learning_rate": 3.1082059657570015e-06, "loss": 1.02054143, "num_input_tokens_seen": 1666770, "step": 73, "time_per_iteration": 2.9634714126586914 }, { "auxiliary_loss_clip": 0.01986242, "auxiliary_loss_mlp": 0.01262759, "balance_loss_clip": 1.35082603, "balance_loss_mlp": 1.08280087, "epoch": 0.00889797390729273, "flos": 23514056104320.0, "grad_norm": 4.186286838626158, "language_loss": 0.96756655, "learning_rate": 3.1180625252858496e-06, "loss": 1.00005662, "num_input_tokens_seen": 1685200, "step": 74, "time_per_iteration": 2.863412618637085 }, { "auxiliary_loss_clip": 0.01947833, "auxiliary_loss_mlp": 0.01227638, "balance_loss_clip": 1.33661151, "balance_loss_mlp": 1.06331968, "epoch": 0.009018216797931822, "flos": 23075838178560.0, "grad_norm": 2.9051400859785916, "language_loss": 0.80403531, "learning_rate": 3.1277867780021663e-06, "loss": 0.83579004, "num_input_tokens_seen": 1701835, "step": 75, "time_per_iteration": 2.838237762451172 }, { "auxiliary_loss_clip": 0.01921432, "auxiliary_loss_mlp": 0.01224082, "balance_loss_clip": 1.3289547, "balance_loss_mlp": 1.06100368, "epoch": 0.009138459688570914, "flos": 15918877284480.0, "grad_norm": 2.0142320436065506, "language_loss": 0.95607758, "learning_rate": 3.1373822288779824e-06, "loss": 0.98753262, "num_input_tokens_seen": 1718415, "step": 76, "time_per_iteration": 2.962068796157837 }, { "auxiliary_loss_clip": 0.0193291, "auxiliary_loss_mlp": 0.01235591, "balance_loss_clip": 1.32874894, "balance_loss_mlp": 1.07108223, "epoch": 0.009258702579210003, "flos": 27016531372800.0, "grad_norm": 2.156469271603596, "language_loss": 0.79538465, "learning_rate": 3.1468522454274533e-06, "loss": 0.8270697, "num_input_tokens_seen": 1738770, "step": 77, "time_per_iteration": 2.946021318435669 }, { "auxiliary_loss_clip": 0.01911113, "auxiliary_loss_mlp": 0.01233937, "balance_loss_clip": 1.31441903, "balance_loss_mlp": 1.07314789, "epoch": 0.009378945469849095, "flos": 26903196984960.0, "grad_norm": 2.3542473932729675, "language_loss": 0.91754329, "learning_rate": 3.15620006480197e-06, "loss": 0.9489938, "num_input_tokens_seen": 1758040, "step": 78, "time_per_iteration": 2.9376142024993896 }, { "auxiliary_loss_clip": 0.01897262, "auxiliary_loss_mlp": 0.01222973, "balance_loss_clip": 1.31970334, "balance_loss_mlp": 1.06428146, "epoch": 0.009499188360488187, "flos": 35694236327040.0, "grad_norm": 5.066165223212594, "language_loss": 0.74986589, "learning_rate": 3.1654288004333087e-06, "loss": 0.78106821, "num_input_tokens_seen": 1776705, "step": 79, "time_per_iteration": 3.001807689666748 }, { "auxiliary_loss_clip": 0.01877266, "auxiliary_loss_mlp": 0.01222881, "balance_loss_clip": 1.31136763, "balance_loss_mlp": 1.06752706, "epoch": 0.009619431251127276, "flos": 21503201944320.0, "grad_norm": 3.8060863042077506, "language_loss": 0.75773776, "learning_rate": 3.1745414482589353e-06, "loss": 0.7887392, "num_input_tokens_seen": 1795915, "step": 80, "time_per_iteration": 2.913238286972046 }, { "auxiliary_loss_clip": 0.01857139, "auxiliary_loss_mlp": 0.01215793, "balance_loss_clip": 1.30536604, "balance_loss_mlp": 1.06234694, "epoch": 0.009739674141766368, "flos": 17421056991360.0, "grad_norm": 2.3059884504569412, "language_loss": 0.86917031, "learning_rate": 3.1835408925606204e-06, "loss": 0.8998996, "num_input_tokens_seen": 1814055, "step": 81, "time_per_iteration": 2.8799424171447754 }, { "auxiliary_loss_clip": 0.01839449, "auxiliary_loss_mlp": 0.01227844, "balance_loss_clip": 1.29269958, "balance_loss_mlp": 1.08145475, "epoch": 0.00985991703240546, "flos": 27527109246720.0, "grad_norm": 2.4200606642058435, "language_loss": 0.89101791, "learning_rate": 3.1924299114448214e-06, "loss": 0.92169088, "num_input_tokens_seen": 1834535, "step": 82, "time_per_iteration": 2.987370014190674 }, { "auxiliary_loss_clip": 0.01832308, "auxiliary_loss_mlp": 0.01228095, "balance_loss_clip": 1.28733349, "balance_loss_mlp": 1.07703328, "epoch": 0.00998015992304455, "flos": 13808084509440.0, "grad_norm": 3.1600629257542114, "language_loss": 0.83374465, "learning_rate": 3.2012111819909055e-06, "loss": 0.86434865, "num_input_tokens_seen": 1851865, "step": 83, "time_per_iteration": 2.88316011428833 }, { "auxiliary_loss_clip": 0.01826956, "auxiliary_loss_mlp": 0.0123864, "balance_loss_clip": 1.28376842, "balance_loss_mlp": 1.08958042, "epoch": 0.010100402813683641, "flos": 20191385341440.0, "grad_norm": 2.252570398075834, "language_loss": 0.95134616, "learning_rate": 3.2098872850910627e-06, "loss": 0.98200214, "num_input_tokens_seen": 1868540, "step": 84, "time_per_iteration": 2.958970785140991 }, { "auxiliary_loss_clip": 0.01834933, "auxiliary_loss_mlp": 0.01209588, "balance_loss_clip": 1.28966117, "balance_loss_mlp": 1.0720681, "epoch": 0.010220645704322733, "flos": 17201642762880.0, "grad_norm": 2.002945844679008, "language_loss": 0.89281255, "learning_rate": 3.2184607100038194e-06, "loss": 0.92325777, "num_input_tokens_seen": 1887180, "step": 85, "time_per_iteration": 2.894205093383789 }, { "auxiliary_loss_clip": 0.01805956, "auxiliary_loss_mlp": 0.01189453, "balance_loss_clip": 1.27346087, "balance_loss_mlp": 1.05460358, "epoch": 0.010340888594961822, "flos": 21470415805440.0, "grad_norm": 2.0957407411734192, "language_loss": 0.93386739, "learning_rate": 3.2269338586412414e-06, "loss": 0.96382147, "num_input_tokens_seen": 1904765, "step": 86, "time_per_iteration": 2.872617483139038 }, { "auxiliary_loss_clip": 0.0179302, "auxiliary_loss_mlp": 0.01190949, "balance_loss_clip": 1.28037453, "balance_loss_mlp": 1.06268001, "epoch": 0.010461131485600914, "flos": 23002831785600.0, "grad_norm": 2.291356681908334, "language_loss": 0.96471673, "learning_rate": 3.2353090496083106e-06, "loss": 0.99455643, "num_input_tokens_seen": 1922600, "step": 87, "time_per_iteration": 2.85322642326355 }, { "auxiliary_loss_clip": 0.01782509, "auxiliary_loss_mlp": 0.01206084, "balance_loss_clip": 1.2717135, "balance_loss_mlp": 1.07514477, "epoch": 0.010581374376240005, "flos": 33546850571520.0, "grad_norm": 2.1326039696783066, "language_loss": 0.81210971, "learning_rate": 3.2435885220114572e-06, "loss": 0.84199566, "num_input_tokens_seen": 1943950, "step": 88, "time_per_iteration": 3.0708131790161133 }, { "auxiliary_loss_clip": 0.01759591, "auxiliary_loss_mlp": 0.01204211, "balance_loss_clip": 1.26306844, "balance_loss_mlp": 1.07660925, "epoch": 0.010701617266879095, "flos": 21763087822080.0, "grad_norm": 2.6103310546744627, "language_loss": 0.93947738, "learning_rate": 3.2517744390519113e-06, "loss": 0.96911538, "num_input_tokens_seen": 1962815, "step": 89, "time_per_iteration": 2.915602445602417 }, { "auxiliary_loss_clip": 0.01763082, "auxiliary_loss_mlp": 0.01177221, "balance_loss_clip": 1.26228154, "balance_loss_mlp": 1.05562735, "epoch": 0.010821860157518187, "flos": 19060199256960.0, "grad_norm": 1.9965318271757118, "language_loss": 0.75129604, "learning_rate": 3.259868891418298e-06, "loss": 0.78069913, "num_input_tokens_seen": 1980580, "step": 90, "time_per_iteration": 2.8765599727630615 }, { "auxiliary_loss_clip": 0.01763225, "auxiliary_loss_mlp": 0.0119091, "balance_loss_clip": 1.26460886, "balance_loss_mlp": 1.07198691, "epoch": 0.010942103048157278, "flos": 25447378757760.0, "grad_norm": 1.9872068647820107, "language_loss": 0.8513869, "learning_rate": 3.2678739004917757e-06, "loss": 0.88092828, "num_input_tokens_seen": 2000315, "step": 91, "time_per_iteration": 2.9888498783111572 }, { "auxiliary_loss_clip": 0.01758433, "auxiliary_loss_mlp": 0.0119068, "balance_loss_clip": 1.26671374, "balance_loss_mlp": 1.07318723, "epoch": 0.011062345938796368, "flos": 27493928058240.0, "grad_norm": 2.0174603922058902, "language_loss": 0.92256546, "learning_rate": 3.275791421376029e-06, "loss": 0.95205665, "num_input_tokens_seen": 2023760, "step": 92, "time_per_iteration": 3.0287177562713623 }, { "auxiliary_loss_clip": 0.01737576, "auxiliary_loss_mlp": 0.01178195, "balance_loss_clip": 1.25182486, "balance_loss_mlp": 1.06404042, "epoch": 0.01118258882943546, "flos": 16071210864000.0, "grad_norm": 2.0466235756349445, "language_loss": 0.96122563, "learning_rate": 3.2836233457634622e-06, "loss": 0.99038333, "num_input_tokens_seen": 2041895, "step": 93, "time_per_iteration": 4.931012868881226 }, { "auxiliary_loss_clip": 0.01744352, "auxiliary_loss_mlp": 0.01179062, "balance_loss_clip": 1.25203991, "balance_loss_mlp": 1.05641937, "epoch": 0.011302831720074551, "flos": 20668602458880.0, "grad_norm": 2.3403290890888986, "language_loss": 0.85487479, "learning_rate": 3.2913715046481135e-06, "loss": 0.88410896, "num_input_tokens_seen": 2061640, "step": 94, "time_per_iteration": 4.714641332626343 }, { "auxiliary_loss_clip": 0.01725353, "auxiliary_loss_mlp": 0.01181119, "balance_loss_clip": 1.24459052, "balance_loss_mlp": 1.07335377, "epoch": 0.011423074610713641, "flos": 13072238490240.0, "grad_norm": 2.248882125168948, "language_loss": 0.88781619, "learning_rate": 3.299037670895023e-06, "loss": 0.91688091, "num_input_tokens_seen": 2078255, "step": 95, "time_per_iteration": 2.879807710647583 }, { "auxiliary_loss_clip": 0.01722078, "auxiliary_loss_mlp": 0.01181066, "balance_loss_clip": 1.25010657, "balance_loss_mlp": 1.07234752, "epoch": 0.011543317501352733, "flos": 30335646689280.0, "grad_norm": 1.8291852461407545, "language_loss": 0.80365711, "learning_rate": 3.3066235616750667e-06, "loss": 0.83268857, "num_input_tokens_seen": 2099490, "step": 96, "time_per_iteration": 2.9547457695007324 }, { "auxiliary_loss_clip": 0.01707565, "auxiliary_loss_mlp": 0.01166644, "balance_loss_clip": 1.24320221, "balance_loss_mlp": 1.05940306, "epoch": 0.011663560391991824, "flos": 15522962601600.0, "grad_norm": 2.294569935770283, "language_loss": 0.92498606, "learning_rate": 3.3141308407736276e-06, "loss": 0.95372808, "num_input_tokens_seen": 2116125, "step": 97, "time_per_iteration": 2.952949285507202 }, { "auxiliary_loss_clip": 0.01707781, "auxiliary_loss_mlp": 0.01173527, "balance_loss_clip": 1.23600054, "balance_loss_mlp": 1.06561852, "epoch": 0.011783803282630914, "flos": 19902125116800.0, "grad_norm": 2.180679286989102, "language_loss": 0.86686409, "learning_rate": 3.321561120780869e-06, "loss": 0.89567715, "num_input_tokens_seen": 2134835, "step": 98, "time_per_iteration": 2.817582607269287 }, { "auxiliary_loss_clip": 0.01688073, "auxiliary_loss_mlp": 0.01149586, "balance_loss_clip": 1.23754096, "balance_loss_mlp": 1.0517391, "epoch": 0.011904046173270006, "flos": 22340674517760.0, "grad_norm": 2.5690650511581454, "language_loss": 1.01521993, "learning_rate": 3.3289159651708192e-06, "loss": 1.04359651, "num_input_tokens_seen": 2152410, "step": 99, "time_per_iteration": 2.995007038116455 }, { "auxiliary_loss_clip": 0.01678299, "auxiliary_loss_mlp": 0.01159822, "balance_loss_clip": 1.22949457, "balance_loss_mlp": 1.06092572, "epoch": 0.012024289063909096, "flos": 19100060375040.0, "grad_norm": 2.8054206424523342, "language_loss": 0.97828633, "learning_rate": 3.3361968902759768e-06, "loss": 1.00666761, "num_input_tokens_seen": 2172090, "step": 100, "time_per_iteration": 2.9250128269195557 }, { "auxiliary_loss_clip": 0.01656584, "auxiliary_loss_mlp": 0.01160926, "balance_loss_clip": 1.22481287, "balance_loss_mlp": 1.06565392, "epoch": 0.012144531954548187, "flos": 15012205159680.0, "grad_norm": 2.293345982738518, "language_loss": 0.93933713, "learning_rate": 3.343405367163663e-06, "loss": 0.96751225, "num_input_tokens_seen": 2189020, "step": 101, "time_per_iteration": 2.851440668106079 }, { "auxiliary_loss_clip": 0.01664265, "auxiliary_loss_mlp": 0.01153168, "balance_loss_clip": 1.22618723, "balance_loss_mlp": 1.05832458, "epoch": 0.012264774845187279, "flos": 15122020014720.0, "grad_norm": 2.6353529028408484, "language_loss": 0.81182402, "learning_rate": 3.350542823419951e-06, "loss": 0.83999836, "num_input_tokens_seen": 2205620, "step": 102, "time_per_iteration": 2.8608765602111816 }, { "auxiliary_loss_clip": 0.01666763, "auxiliary_loss_mlp": 0.01172297, "balance_loss_clip": 1.2281512, "balance_loss_mlp": 1.07382965, "epoch": 0.012385017735826368, "flos": 13949248959360.0, "grad_norm": 5.6203958620003, "language_loss": 0.87289381, "learning_rate": 3.3576106448465615e-06, "loss": 0.90128434, "num_input_tokens_seen": 2219000, "step": 103, "time_per_iteration": 2.7945313453674316 }, { "auxiliary_loss_clip": 0.01652662, "auxiliary_loss_mlp": 0.01144464, "balance_loss_clip": 1.21954489, "balance_loss_mlp": 1.05047941, "epoch": 0.01250526062646546, "flos": 23623260428160.0, "grad_norm": 1.9855898120627027, "language_loss": 0.88362908, "learning_rate": 3.3646101770757797e-06, "loss": 0.91160029, "num_input_tokens_seen": 2237790, "step": 104, "time_per_iteration": 2.922579288482666 }, { "auxiliary_loss_clip": 0.01656358, "auxiliary_loss_mlp": 0.01145646, "balance_loss_clip": 1.22854364, "balance_loss_mlp": 1.05800366, "epoch": 0.012625503517104552, "flos": 34640078958720.0, "grad_norm": 1.6866774348870917, "language_loss": 0.85641319, "learning_rate": 3.371542727108104e-06, "loss": 0.88443315, "num_input_tokens_seen": 2259965, "step": 105, "time_per_iteration": 3.0348575115203857 }, { "auxiliary_loss_clip": 0.01641173, "auxiliary_loss_mlp": 0.01158228, "balance_loss_clip": 1.21720636, "balance_loss_mlp": 1.07244515, "epoch": 0.012745746407743641, "flos": 17821891837440.0, "grad_norm": 2.780085351352469, "language_loss": 0.89803177, "learning_rate": 3.3784095647770114e-06, "loss": 0.92602575, "num_input_tokens_seen": 2278610, "step": 106, "time_per_iteration": 2.8809244632720947 }, { "auxiliary_loss_clip": 0.01639032, "auxiliary_loss_mlp": 0.01145384, "balance_loss_clip": 1.21830225, "balance_loss_mlp": 1.05826628, "epoch": 0.012865989298382733, "flos": 20595057361920.0, "grad_norm": 2.087649907910871, "language_loss": 0.88568699, "learning_rate": 3.3852119241449547e-06, "loss": 0.91353118, "num_input_tokens_seen": 2297730, "step": 107, "time_per_iteration": 2.9040722846984863 }, { "auxiliary_loss_clip": 0.01623495, "auxiliary_loss_mlp": 0.01157393, "balance_loss_clip": 1.20883083, "balance_loss_mlp": 1.06841528, "epoch": 0.012986232189021825, "flos": 23948969978880.0, "grad_norm": 2.7151180348919133, "language_loss": 0.96251237, "learning_rate": 3.3919510048344295e-06, "loss": 0.99032116, "num_input_tokens_seen": 2315740, "step": 108, "time_per_iteration": 2.976682424545288 }, { "auxiliary_loss_clip": 0.01621612, "auxiliary_loss_mlp": 0.01146684, "balance_loss_clip": 1.21004117, "balance_loss_mlp": 1.06404793, "epoch": 0.013106475079660914, "flos": 23725425686400.0, "grad_norm": 2.0810585475387993, "language_loss": 0.86756796, "learning_rate": 3.3986279732976907e-06, "loss": 0.89525092, "num_input_tokens_seen": 2334215, "step": 109, "time_per_iteration": 2.9586191177368164 }, { "auxiliary_loss_clip": 0.01610842, "auxiliary_loss_mlp": 0.01151288, "balance_loss_clip": 1.20359468, "balance_loss_mlp": 1.07213354, "epoch": 0.013226717970300006, "flos": 21102438925440.0, "grad_norm": 2.1413498262183324, "language_loss": 0.95229638, "learning_rate": 3.4052439640284983e-06, "loss": 0.97991765, "num_input_tokens_seen": 2353130, "step": 110, "time_per_iteration": 2.9210593700408936 }, { "auxiliary_loss_clip": 0.01604265, "auxiliary_loss_mlp": 0.01143627, "balance_loss_clip": 1.20466113, "balance_loss_mlp": 1.06375635, "epoch": 0.013346960860939098, "flos": 24863902231680.0, "grad_norm": 1.8466351505747316, "language_loss": 0.81126034, "learning_rate": 3.4118000807190217e-06, "loss": 0.83873928, "num_input_tokens_seen": 2374010, "step": 111, "time_per_iteration": 2.925126075744629 }, { "auxiliary_loss_clip": 0.01606946, "auxiliary_loss_mlp": 0.01149007, "balance_loss_clip": 1.19893038, "balance_loss_mlp": 1.07228422, "epoch": 0.013467203751578187, "flos": 28181940140160.0, "grad_norm": 1.7464224524501835, "language_loss": 0.75883412, "learning_rate": 3.4182973973648723e-06, "loss": 0.78639364, "num_input_tokens_seen": 2395220, "step": 112, "time_per_iteration": 2.9905526638031006 }, { "auxiliary_loss_clip": 0.01606466, "auxiliary_loss_mlp": 0.01139405, "balance_loss_clip": 1.20617104, "balance_loss_mlp": 1.06358743, "epoch": 0.013587446642217279, "flos": 18916233546240.0, "grad_norm": 3.0792691767462164, "language_loss": 0.9518193, "learning_rate": 3.424736959321014e-06, "loss": 0.97927803, "num_input_tokens_seen": 2413025, "step": 113, "time_per_iteration": 2.864097833633423 }, { "auxiliary_loss_clip": 0.01605558, "auxiliary_loss_mlp": 0.01135098, "balance_loss_clip": 1.20405293, "balance_loss_mlp": 1.06032944, "epoch": 0.01370768953285637, "flos": 23988615615360.0, "grad_norm": 2.3652837853393467, "language_loss": 0.88670039, "learning_rate": 3.431119784311155e-06, "loss": 0.91410691, "num_input_tokens_seen": 2432700, "step": 114, "time_per_iteration": 2.9298107624053955 }, { "auxiliary_loss_clip": 0.0158398, "auxiliary_loss_mlp": 0.01126197, "balance_loss_clip": 1.19421697, "balance_loss_mlp": 1.05400395, "epoch": 0.01382793242349546, "flos": 39202565512320.0, "grad_norm": 1.8006701202552446, "language_loss": 0.77579975, "learning_rate": 3.43744686339307e-06, "loss": 0.80290157, "num_input_tokens_seen": 2455020, "step": 115, "time_per_iteration": 2.9746525287628174 }, { "auxiliary_loss_clip": 0.01578044, "auxiliary_loss_mlp": 0.01114509, "balance_loss_clip": 1.18546247, "balance_loss_mlp": 1.04555786, "epoch": 0.013948175314134552, "flos": 41353506714240.0, "grad_norm": 2.2262826834304277, "language_loss": 0.90984643, "learning_rate": 3.44371916188212e-06, "loss": 0.93677199, "num_input_tokens_seen": 2475775, "step": 116, "time_per_iteration": 3.1305603981018066 }, { "auxiliary_loss_clip": 0.01572639, "auxiliary_loss_mlp": 0.01129615, "balance_loss_clip": 1.18853831, "balance_loss_mlp": 1.06214309, "epoch": 0.014068418204773643, "flos": 22453542028800.0, "grad_norm": 2.5188555142900704, "language_loss": 0.86250633, "learning_rate": 3.449937620235143e-06, "loss": 0.88952887, "num_input_tokens_seen": 2496370, "step": 117, "time_per_iteration": 3.013335943222046 }, { "auxiliary_loss_clip": 0.01577844, "auxiliary_loss_mlp": 0.01137877, "balance_loss_clip": 1.19208336, "balance_loss_mlp": 1.0687356, "epoch": 0.014188661095412733, "flos": 23805147922560.0, "grad_norm": 2.315204945730483, "language_loss": 0.89433765, "learning_rate": 3.456103154896722e-06, "loss": 0.92149484, "num_input_tokens_seen": 2517645, "step": 118, "time_per_iteration": 3.002483367919922 }, { "auxiliary_loss_clip": 0.01560637, "auxiliary_loss_mlp": 0.01122773, "balance_loss_clip": 1.184443, "balance_loss_mlp": 1.05840015, "epoch": 0.014308903986051825, "flos": 23660248458240.0, "grad_norm": 1.951989050723572, "language_loss": 0.92525482, "learning_rate": 3.462216659109757e-06, "loss": 0.95208895, "num_input_tokens_seen": 2537825, "step": 119, "time_per_iteration": 3.9390597343444824 }, { "auxiliary_loss_clip": 0.0156715, "auxiliary_loss_mlp": 0.01127634, "balance_loss_clip": 1.18784487, "balance_loss_mlp": 1.06488204, "epoch": 0.014429146876690916, "flos": 20667991927680.0, "grad_norm": 2.2343372253899236, "language_loss": 0.85135746, "learning_rate": 3.4682790036921077e-06, "loss": 0.87830526, "num_input_tokens_seen": 2556485, "step": 120, "time_per_iteration": 5.31898307800293 }, { "auxiliary_loss_clip": 0.01547459, "auxiliary_loss_mlp": 0.01110701, "balance_loss_clip": 1.17878556, "balance_loss_mlp": 1.0544343, "epoch": 0.014549389767330006, "flos": 20229199384320.0, "grad_norm": 2.076580246012786, "language_loss": 0.83081311, "learning_rate": 3.4742910377810193e-06, "loss": 0.8573947, "num_input_tokens_seen": 2573945, "step": 121, "time_per_iteration": 3.8063547611236572 }, { "auxiliary_loss_clip": 0.01551106, "auxiliary_loss_mlp": 0.01122669, "balance_loss_clip": 1.18063879, "balance_loss_mlp": 1.06413722, "epoch": 0.014669632657969098, "flos": 18004174381440.0, "grad_norm": 2.5984580920396474, "language_loss": 0.88645601, "learning_rate": 3.4802535895469042e-06, "loss": 0.91319382, "num_input_tokens_seen": 2592695, "step": 122, "time_per_iteration": 2.998000144958496 }, { "auxiliary_loss_clip": 0.01547082, "auxiliary_loss_mlp": 0.01114178, "balance_loss_clip": 1.17821741, "balance_loss_mlp": 1.05438221, "epoch": 0.01478987554860819, "flos": 22741796672640.0, "grad_norm": 1.9777835256725866, "language_loss": 0.89538741, "learning_rate": 3.4861674668779934e-06, "loss": 0.92200005, "num_input_tokens_seen": 2610925, "step": 123, "time_per_iteration": 3.019275426864624 }, { "auxiliary_loss_clip": 0.01542192, "auxiliary_loss_mlp": 0.01116082, "balance_loss_clip": 1.17652106, "balance_loss_mlp": 1.06026793, "epoch": 0.01491011843924728, "flos": 17198590106880.0, "grad_norm": 3.180843152556381, "language_loss": 0.83843601, "learning_rate": 3.492033458037272e-06, "loss": 0.86501873, "num_input_tokens_seen": 2629495, "step": 124, "time_per_iteration": 2.8754453659057617 }, { "auxiliary_loss_clip": 0.01539086, "auxiliary_loss_mlp": 0.01113303, "balance_loss_clip": 1.17411351, "balance_loss_mlp": 1.05531955, "epoch": 0.01503036132988637, "flos": 17673867889920.0, "grad_norm": 2.285821675876931, "language_loss": 0.87035012, "learning_rate": 3.497852332293018e-06, "loss": 0.89687401, "num_input_tokens_seen": 2645070, "step": 125, "time_per_iteration": 2.982187032699585 }, { "auxiliary_loss_clip": 0.01536559, "auxiliary_loss_mlp": 0.0110017, "balance_loss_clip": 1.17597771, "balance_loss_mlp": 1.04650187, "epoch": 0.015150604220525462, "flos": 18878239935360.0, "grad_norm": 2.114891743241444, "language_loss": 0.96628559, "learning_rate": 3.5036248405242356e-06, "loss": 0.99265283, "num_input_tokens_seen": 2663825, "step": 126, "time_per_iteration": 2.9691214561462402 }, { "auxiliary_loss_clip": 0.01536625, "auxiliary_loss_mlp": 0.01115221, "balance_loss_clip": 1.17311704, "balance_loss_mlp": 1.05981278, "epoch": 0.015270847111164552, "flos": 39420184060800.0, "grad_norm": 2.243833788264279, "language_loss": 0.82767516, "learning_rate": 3.509351715802146e-06, "loss": 0.85419363, "num_input_tokens_seen": 2684710, "step": 127, "time_per_iteration": 3.0239593982696533 }, { "auxiliary_loss_clip": 0.01526279, "auxiliary_loss_mlp": 0.01098399, "balance_loss_clip": 1.16820121, "balance_loss_mlp": 1.04384923, "epoch": 0.015391090001803644, "flos": 43762466286720.0, "grad_norm": 2.3056158356229575, "language_loss": 0.78339756, "learning_rate": 3.5150336739488763e-06, "loss": 0.80964434, "num_input_tokens_seen": 2706995, "step": 128, "time_per_iteration": 3.205263137817383 }, { "auxiliary_loss_clip": 0.01524806, "auxiliary_loss_mlp": 0.01098673, "balance_loss_clip": 1.17195201, "balance_loss_mlp": 1.04736555, "epoch": 0.015511332892442733, "flos": 18916341287040.0, "grad_norm": 2.3500595419458845, "language_loss": 0.84062964, "learning_rate": 3.5206714140744143e-06, "loss": 0.86686444, "num_input_tokens_seen": 2727050, "step": 129, "time_per_iteration": 2.9813997745513916 }, { "auxiliary_loss_clip": 0.01521558, "auxiliary_loss_mlp": 0.01112331, "balance_loss_clip": 1.17307746, "balance_loss_mlp": 1.05649352, "epoch": 0.015631575783081827, "flos": 24535283679360.0, "grad_norm": 2.89823533272617, "language_loss": 0.87784785, "learning_rate": 3.5262656190928208e-06, "loss": 0.90418673, "num_input_tokens_seen": 2745350, "step": 130, "time_per_iteration": 3.006838083267212 }, { "auxiliary_loss_clip": 0.01592603, "auxiliary_loss_mlp": 0.01264673, "balance_loss_clip": 1.27016234, "balance_loss_mlp": 1.21050406, "epoch": 0.015751818673720917, "flos": 62328536098560.0, "grad_norm": 1.088784855758675, "language_loss": 0.71525264, "learning_rate": 3.5318169562186737e-06, "loss": 0.74382532, "num_input_tokens_seen": 2814195, "step": 131, "time_per_iteration": 3.419646739959717 }, { "auxiliary_loss_clip": 0.0151849, "auxiliary_loss_mlp": 0.01117467, "balance_loss_clip": 1.16902924, "balance_loss_mlp": 1.0660398, "epoch": 0.015872061564360006, "flos": 23878549365120.0, "grad_norm": 1.8426976570970197, "language_loss": 0.82088989, "learning_rate": 3.5373260774446292e-06, "loss": 0.84724951, "num_input_tokens_seen": 2834645, "step": 132, "time_per_iteration": 2.8674590587615967 }, { "auxiliary_loss_clip": 0.01500596, "auxiliary_loss_mlp": 0.01108687, "balance_loss_clip": 1.16012907, "balance_loss_mlp": 1.06109881, "epoch": 0.0159923044549991, "flos": 23367899664000.0, "grad_norm": 2.5946461974111226, "language_loss": 0.90129054, "learning_rate": 3.542793620000961e-06, "loss": 0.92738342, "num_input_tokens_seen": 2854120, "step": 133, "time_per_iteration": 2.8400206565856934 }, { "auxiliary_loss_clip": 0.01504393, "auxiliary_loss_mlp": 0.01116456, "balance_loss_clip": 1.1627394, "balance_loss_mlp": 1.06273997, "epoch": 0.01611254734563819, "flos": 17858305249920.0, "grad_norm": 3.0853437302716182, "language_loss": 0.87215197, "learning_rate": 3.5482202067978894e-06, "loss": 0.89836043, "num_input_tokens_seen": 2871330, "step": 134, "time_per_iteration": 2.70760440826416 }, { "auxiliary_loss_clip": 0.01506477, "auxiliary_loss_mlp": 0.01117, "balance_loss_clip": 1.16357327, "balance_loss_mlp": 1.06762302, "epoch": 0.01623279023627728, "flos": 20954774113920.0, "grad_norm": 2.05605441492831, "language_loss": 0.76123047, "learning_rate": 3.553606446851471e-06, "loss": 0.78746521, "num_input_tokens_seen": 2888070, "step": 135, "time_per_iteration": 2.7156004905700684 }, { "auxiliary_loss_clip": 0.01498754, "auxiliary_loss_mlp": 0.01108852, "balance_loss_clip": 1.16020346, "balance_loss_mlp": 1.06219351, "epoch": 0.016353033126916373, "flos": 15742412743680.0, "grad_norm": 2.560880034337986, "language_loss": 0.83323073, "learning_rate": 3.5589529356937613e-06, "loss": 0.85930675, "num_input_tokens_seen": 2906465, "step": 136, "time_per_iteration": 2.702470302581787 }, { "auxiliary_loss_clip": 0.01500228, "auxiliary_loss_mlp": 0.01109688, "balance_loss_clip": 1.15976501, "balance_loss_mlp": 1.05993032, "epoch": 0.016473276017555463, "flos": 18807280617600.0, "grad_norm": 1.7293601735154533, "language_loss": 0.76860267, "learning_rate": 3.5642602557679627e-06, "loss": 0.79470181, "num_input_tokens_seen": 2924915, "step": 137, "time_per_iteration": 2.7081241607666016 }, { "auxiliary_loss_clip": 0.01492743, "auxiliary_loss_mlp": 0.01109994, "balance_loss_clip": 1.16454792, "balance_loss_mlp": 1.06500459, "epoch": 0.016593518908194552, "flos": 24352641999360.0, "grad_norm": 2.126630399912691, "language_loss": 0.8397646, "learning_rate": 3.569528976809202e-06, "loss": 0.86579192, "num_input_tokens_seen": 2942130, "step": 138, "time_per_iteration": 2.813283681869507 }, { "auxiliary_loss_clip": 0.01485829, "auxiliary_loss_mlp": 0.01109537, "balance_loss_clip": 1.15150833, "balance_loss_mlp": 1.0652864, "epoch": 0.016713761798833646, "flos": 22346133384960.0, "grad_norm": 2.986741946594054, "language_loss": 0.89888525, "learning_rate": 3.5747596562115522e-06, "loss": 0.9248389, "num_input_tokens_seen": 2962745, "step": 139, "time_per_iteration": 2.7201638221740723 }, { "auxiliary_loss_clip": 0.01497249, "auxiliary_loss_mlp": 0.011025, "balance_loss_clip": 1.15861583, "balance_loss_mlp": 1.05560303, "epoch": 0.016834004689472735, "flos": 17821820010240.0, "grad_norm": 2.2406398640839713, "language_loss": 0.91186523, "learning_rate": 3.5799528393819138e-06, "loss": 0.93786275, "num_input_tokens_seen": 2981825, "step": 140, "time_per_iteration": 2.683584451675415 }, { "auxiliary_loss_clip": 0.01489295, "auxiliary_loss_mlp": 0.01099259, "balance_loss_clip": 1.15328503, "balance_loss_mlp": 1.05403137, "epoch": 0.016954247580111825, "flos": 20519501103360.0, "grad_norm": 2.0120216923942915, "language_loss": 0.87945116, "learning_rate": 3.585109060081286e-06, "loss": 0.90533674, "num_input_tokens_seen": 3001625, "step": 141, "time_per_iteration": 2.6741178035736084 }, { "auxiliary_loss_clip": 0.01488556, "auxiliary_loss_mlp": 0.0110906, "balance_loss_clip": 1.15583694, "balance_loss_mlp": 1.0651437, "epoch": 0.017074490470750915, "flos": 22088869200000.0, "grad_norm": 1.8221244132613748, "language_loss": 0.78338969, "learning_rate": 3.590228840753992e-06, "loss": 0.80936587, "num_input_tokens_seen": 3022055, "step": 142, "time_per_iteration": 2.75569748878479 }, { "auxiliary_loss_clip": 0.01484107, "auxiliary_loss_mlp": 0.01102913, "balance_loss_clip": 1.15521276, "balance_loss_mlp": 1.06040335, "epoch": 0.01719473336139001, "flos": 15997270717440.0, "grad_norm": 2.1854771869513807, "language_loss": 0.87190402, "learning_rate": 3.5953126928453423e-06, "loss": 0.89777422, "num_input_tokens_seen": 3039605, "step": 143, "time_per_iteration": 2.712308883666992 }, { "auxiliary_loss_clip": 0.01475, "auxiliary_loss_mlp": 0.01094653, "balance_loss_clip": 1.14900875, "balance_loss_mlp": 1.05192828, "epoch": 0.017314976252029098, "flos": 22492038430080.0, "grad_norm": 3.496514768598437, "language_loss": 0.80519009, "learning_rate": 3.600361117108239e-06, "loss": 0.8308866, "num_input_tokens_seen": 3059405, "step": 144, "time_per_iteration": 2.7730443477630615 }, { "auxiliary_loss_clip": 0.01471914, "auxiliary_loss_mlp": 0.01106085, "balance_loss_clip": 1.14787221, "balance_loss_mlp": 1.06498122, "epoch": 0.017435219142668188, "flos": 22018053536640.0, "grad_norm": 2.2367888611759486, "language_loss": 0.97172558, "learning_rate": 3.6053746038991616e-06, "loss": 0.99750561, "num_input_tokens_seen": 3078490, "step": 145, "time_per_iteration": 3.653458595275879 }, { "auxiliary_loss_clip": 0.0150124, "auxiliary_loss_mlp": 0.01079383, "balance_loss_clip": 1.22408092, "balance_loss_mlp": 1.04028225, "epoch": 0.01755546203330728, "flos": 72240526149120.0, "grad_norm": 1.0662951904433318, "language_loss": 0.58460283, "learning_rate": 3.6103536334639843e-06, "loss": 0.61040902, "num_input_tokens_seen": 3131755, "step": 146, "time_per_iteration": 4.18355131149292 }, { "auxiliary_loss_clip": 0.01469763, "auxiliary_loss_mlp": 0.01095265, "balance_loss_clip": 1.14717209, "balance_loss_mlp": 1.05647385, "epoch": 0.01767570492394637, "flos": 25337061112320.0, "grad_norm": 4.83975595361538, "language_loss": 0.8541314, "learning_rate": 3.615298676214041e-06, "loss": 0.8797816, "num_input_tokens_seen": 3152035, "step": 147, "time_per_iteration": 4.6036646366119385 }, { "auxiliary_loss_clip": 0.01473019, "auxiliary_loss_mlp": 0.01096699, "balance_loss_clip": 1.15157139, "balance_loss_mlp": 1.05728865, "epoch": 0.01779594781458546, "flos": 20449188230400.0, "grad_norm": 2.281271984941926, "language_loss": 0.8891207, "learning_rate": 3.6202101929928317e-06, "loss": 0.91481781, "num_input_tokens_seen": 3170625, "step": 148, "time_per_iteration": 2.7776453495025635 }, { "auxiliary_loss_clip": 0.01464425, "auxiliary_loss_mlp": 0.01101372, "balance_loss_clip": 1.14377546, "balance_loss_mlp": 1.06322479, "epoch": 0.017916190705224554, "flos": 16253601148800.0, "grad_norm": 3.4690401736473695, "language_loss": 0.88337749, "learning_rate": 3.6250886353337413e-06, "loss": 0.90903544, "num_input_tokens_seen": 3188155, "step": 149, "time_per_iteration": 2.6882412433624268 }, { "auxiliary_loss_clip": 0.01467806, "auxiliary_loss_mlp": 0.0110546, "balance_loss_clip": 1.14835405, "balance_loss_mlp": 1.06602514, "epoch": 0.018036433595863644, "flos": 23330588411520.0, "grad_norm": 1.94800892384711, "language_loss": 0.86613458, "learning_rate": 3.6299344457091488e-06, "loss": 0.89186728, "num_input_tokens_seen": 3209015, "step": 150, "time_per_iteration": 2.730767250061035 }, { "auxiliary_loss_clip": 0.01469869, "auxiliary_loss_mlp": 0.01095764, "balance_loss_clip": 1.1500814, "balance_loss_mlp": 1.0562582, "epoch": 0.018156676486502734, "flos": 18588010043520.0, "grad_norm": 2.308266700390163, "language_loss": 0.93852311, "learning_rate": 3.634748057771256e-06, "loss": 0.96417952, "num_input_tokens_seen": 3224955, "step": 151, "time_per_iteration": 2.6674718856811523 }, { "auxiliary_loss_clip": 0.01462294, "auxiliary_loss_mlp": 0.0109966, "balance_loss_clip": 1.1485858, "balance_loss_mlp": 1.06220424, "epoch": 0.018276919377141827, "flos": 25448707560960.0, "grad_norm": 1.6805552350797621, "language_loss": 0.85580212, "learning_rate": 3.639529896584965e-06, "loss": 0.88142163, "num_input_tokens_seen": 3246330, "step": 152, "time_per_iteration": 2.7233667373657227 }, { "auxiliary_loss_clip": 0.01460216, "auxiliary_loss_mlp": 0.01083393, "balance_loss_clip": 1.14525831, "balance_loss_mlp": 1.04593778, "epoch": 0.018397162267780917, "flos": 20047311889920.0, "grad_norm": 2.7668867162303585, "language_loss": 0.88783538, "learning_rate": 3.6442803788531233e-06, "loss": 0.91327155, "num_input_tokens_seen": 3264290, "step": 153, "time_per_iteration": 2.6788299083709717 }, { "auxiliary_loss_clip": 0.01461056, "auxiliary_loss_mlp": 0.01096144, "balance_loss_clip": 1.14485884, "balance_loss_mlp": 1.05613768, "epoch": 0.018517405158420007, "flos": 27565282425600.0, "grad_norm": 2.6769106753690957, "language_loss": 0.96062255, "learning_rate": 3.6489999131344357e-06, "loss": 0.98619455, "num_input_tokens_seen": 3287065, "step": 154, "time_per_iteration": 2.7335593700408936 }, { "auxiliary_loss_clip": 0.01447188, "auxiliary_loss_mlp": 0.01092256, "balance_loss_clip": 1.14009857, "balance_loss_mlp": 1.05704188, "epoch": 0.0186376480490591, "flos": 19354056422400.0, "grad_norm": 2.2281419393642854, "language_loss": 0.90530956, "learning_rate": 3.653688900054313e-06, "loss": 0.930704, "num_input_tokens_seen": 3305595, "step": 155, "time_per_iteration": 2.718315362930298 }, { "auxiliary_loss_clip": 0.01455481, "auxiliary_loss_mlp": 0.01088818, "balance_loss_clip": 1.1406827, "balance_loss_mlp": 1.05016994, "epoch": 0.01875789093969819, "flos": 26687840993280.0, "grad_norm": 2.702201212820332, "language_loss": 0.75880468, "learning_rate": 3.6583477325089526e-06, "loss": 0.7842477, "num_input_tokens_seen": 3326135, "step": 156, "time_per_iteration": 2.7053611278533936 }, { "auxiliary_loss_clip": 0.014499, "auxiliary_loss_mlp": 0.01091635, "balance_loss_clip": 1.14239812, "balance_loss_mlp": 1.05613422, "epoch": 0.01887813383033728, "flos": 24353001135360.0, "grad_norm": 2.9065995567344816, "language_loss": 1.04048252, "learning_rate": 3.6629767958628916e-06, "loss": 1.06589782, "num_input_tokens_seen": 3343510, "step": 157, "time_per_iteration": 2.7983720302581787 }, { "auxiliary_loss_clip": 0.01454845, "auxiliary_loss_mlp": 0.01087198, "balance_loss_clip": 1.14552665, "balance_loss_mlp": 1.05250812, "epoch": 0.018998376720976373, "flos": 14647532330880.0, "grad_norm": 2.94594324719457, "language_loss": 0.8559975, "learning_rate": 3.667576468140291e-06, "loss": 0.88141793, "num_input_tokens_seen": 3361325, "step": 158, "time_per_iteration": 2.835381031036377 }, { "auxiliary_loss_clip": 0.01445405, "auxiliary_loss_mlp": 0.01079974, "balance_loss_clip": 1.13597167, "balance_loss_mlp": 1.0466665, "epoch": 0.019118619611615463, "flos": 29305261146240.0, "grad_norm": 2.2724649212971495, "language_loss": 0.88665873, "learning_rate": 3.672147120210184e-06, "loss": 0.91191256, "num_input_tokens_seen": 3377925, "step": 159, "time_per_iteration": 2.748196601867676 }, { "auxiliary_loss_clip": 0.01439664, "auxiliary_loss_mlp": 0.01092815, "balance_loss_clip": 1.14013577, "balance_loss_mlp": 1.05922163, "epoch": 0.019238862502254553, "flos": 20886723797760.0, "grad_norm": 2.5933199443254957, "language_loss": 0.8632462, "learning_rate": 3.6766891159659177e-06, "loss": 0.8885709, "num_input_tokens_seen": 3396335, "step": 160, "time_per_iteration": 2.6645824909210205 }, { "auxiliary_loss_clip": 0.01441128, "auxiliary_loss_mlp": 0.01081659, "balance_loss_clip": 1.13997245, "balance_loss_mlp": 1.04654026, "epoch": 0.019359105392893646, "flos": 21360672777600.0, "grad_norm": 2.839714973602095, "language_loss": 0.87683988, "learning_rate": 3.6812028124990075e-06, "loss": 0.90206778, "num_input_tokens_seen": 3413605, "step": 161, "time_per_iteration": 2.698045492172241 }, { "auxiliary_loss_clip": 0.014422, "auxiliary_loss_mlp": 0.01096596, "balance_loss_clip": 1.13958049, "balance_loss_mlp": 1.0643146, "epoch": 0.019479348283532736, "flos": 16283729681280.0, "grad_norm": 3.0117545270398005, "language_loss": 0.81226724, "learning_rate": 3.6856885602676016e-06, "loss": 0.83765519, "num_input_tokens_seen": 3429640, "step": 162, "time_per_iteration": 2.6293506622314453 }, { "auxiliary_loss_clip": 0.01442064, "auxiliary_loss_mlp": 0.01086672, "balance_loss_clip": 1.13909292, "balance_loss_mlp": 1.05133808, "epoch": 0.019599591174171826, "flos": 22091239497600.0, "grad_norm": 2.184984973208769, "language_loss": 0.94247186, "learning_rate": 3.6901467032597733e-06, "loss": 0.96775925, "num_input_tokens_seen": 3448125, "step": 163, "time_per_iteration": 2.742133855819702 }, { "auxiliary_loss_clip": 0.01435696, "auxiliary_loss_mlp": 0.01084459, "balance_loss_clip": 1.13600993, "balance_loss_mlp": 1.05146146, "epoch": 0.01971983406481092, "flos": 19609668581760.0, "grad_norm": 3.3442008013937845, "language_loss": 0.8739903, "learning_rate": 3.694577579151804e-06, "loss": 0.89919186, "num_input_tokens_seen": 3466535, "step": 164, "time_per_iteration": 2.6751654148101807 }, { "auxiliary_loss_clip": 0.0143522, "auxiliary_loss_mlp": 0.01088848, "balance_loss_clip": 1.13783097, "balance_loss_mlp": 1.05513537, "epoch": 0.01984007695545001, "flos": 19099342103040.0, "grad_norm": 2.376916690847514, "language_loss": 0.73701477, "learning_rate": 3.6989815194616703e-06, "loss": 0.76225543, "num_input_tokens_seen": 3483730, "step": 165, "time_per_iteration": 2.679934024810791 }, { "auxiliary_loss_clip": 0.01437746, "auxiliary_loss_mlp": 0.01091085, "balance_loss_clip": 1.13318598, "balance_loss_mlp": 1.05670464, "epoch": 0.0199603198460891, "flos": 20848406964480.0, "grad_norm": 2.447731805180356, "language_loss": 0.79923522, "learning_rate": 3.703358849697888e-06, "loss": 0.82452357, "num_input_tokens_seen": 3503640, "step": 166, "time_per_iteration": 2.7488467693328857 }, { "auxiliary_loss_clip": 0.0142913, "auxiliary_loss_mlp": 0.01089739, "balance_loss_clip": 1.13548303, "balance_loss_mlp": 1.05731404, "epoch": 0.020080562736728192, "flos": 21870747861120.0, "grad_norm": 1.6854257414965002, "language_loss": 0.82556605, "learning_rate": 3.7077098895038803e-06, "loss": 0.85075474, "num_input_tokens_seen": 3523010, "step": 167, "time_per_iteration": 2.7778618335723877 }, { "auxiliary_loss_clip": 0.01431672, "auxiliary_loss_mlp": 0.01087277, "balance_loss_clip": 1.13559365, "balance_loss_mlp": 1.05418468, "epoch": 0.020200805627367282, "flos": 21688788539520.0, "grad_norm": 2.0191756083546335, "language_loss": 0.96805888, "learning_rate": 3.712034952798045e-06, "loss": 0.9932484, "num_input_tokens_seen": 3541125, "step": 168, "time_per_iteration": 2.7300612926483154 }, { "auxiliary_loss_clip": 0.01438125, "auxiliary_loss_mlp": 0.0108565, "balance_loss_clip": 1.13920176, "balance_loss_mlp": 1.05260563, "epoch": 0.02032104851800637, "flos": 33543043729920.0, "grad_norm": 4.736419546988029, "language_loss": 0.84907329, "learning_rate": 3.7163343479096656e-06, "loss": 0.87431103, "num_input_tokens_seen": 3562700, "step": 169, "time_per_iteration": 2.7728898525238037 }, { "auxiliary_loss_clip": 0.01418603, "auxiliary_loss_mlp": 0.01075008, "balance_loss_clip": 1.13079071, "balance_loss_mlp": 1.0450387, "epoch": 0.020441291408645465, "flos": 31686965274240.0, "grad_norm": 2.6545826605372222, "language_loss": 0.82624722, "learning_rate": 3.720608377710802e-06, "loss": 0.8511833, "num_input_tokens_seen": 3582790, "step": 170, "time_per_iteration": 2.7858669757843018 }, { "auxiliary_loss_clip": 0.01425473, "auxiliary_loss_mlp": 0.01084899, "balance_loss_clip": 1.13128436, "balance_loss_mlp": 1.05268836, "epoch": 0.020561534299284555, "flos": 20886687884160.0, "grad_norm": 2.6630208639261768, "language_loss": 0.86393696, "learning_rate": 3.7248573397443277e-06, "loss": 0.88904071, "num_input_tokens_seen": 3601715, "step": 171, "time_per_iteration": 3.777146816253662 }, { "auxiliary_loss_clip": 0.01427273, "auxiliary_loss_mlp": 0.01079752, "balance_loss_clip": 1.1351155, "balance_loss_mlp": 1.04735112, "epoch": 0.020681777189923645, "flos": 20996610480000.0, "grad_norm": 2.2242281725534734, "language_loss": 0.9759025, "learning_rate": 3.729081526348224e-06, "loss": 1.00097275, "num_input_tokens_seen": 3620245, "step": 172, "time_per_iteration": 3.624589443206787 }, { "auxiliary_loss_clip": 0.01425263, "auxiliary_loss_mlp": 0.01073825, "balance_loss_clip": 1.13325489, "balance_loss_mlp": 1.043594, "epoch": 0.020802020080562738, "flos": 28257532312320.0, "grad_norm": 3.4428119864106232, "language_loss": 0.85054302, "learning_rate": 3.7332812247762777e-06, "loss": 0.87553394, "num_input_tokens_seen": 3641545, "step": 173, "time_per_iteration": 3.6111385822296143 }, { "auxiliary_loss_clip": 0.01426019, "auxiliary_loss_mlp": 0.01085102, "balance_loss_clip": 1.13713884, "balance_loss_mlp": 1.05477524, "epoch": 0.020922262971201828, "flos": 19681274344320.0, "grad_norm": 2.3806871475940734, "language_loss": 0.95507079, "learning_rate": 3.737456717315293e-06, "loss": 0.98018205, "num_input_tokens_seen": 3660510, "step": 174, "time_per_iteration": 3.6793758869171143 }, { "auxiliary_loss_clip": 0.01415922, "auxiliary_loss_mlp": 0.01077284, "balance_loss_clip": 1.13341832, "balance_loss_mlp": 1.04893613, "epoch": 0.021042505861840918, "flos": 15666353694720.0, "grad_norm": 1.9092353495111383, "language_loss": 0.90603131, "learning_rate": 3.7416082813989552e-06, "loss": 0.9309634, "num_input_tokens_seen": 3677505, "step": 175, "time_per_iteration": 2.6679985523223877 }, { "auxiliary_loss_clip": 0.01426326, "auxiliary_loss_mlp": 0.01082039, "balance_loss_clip": 1.13472116, "balance_loss_mlp": 1.05044842, "epoch": 0.02116274875248001, "flos": 21142012734720.0, "grad_norm": 13.613996888451393, "language_loss": 0.89402539, "learning_rate": 3.745736189718439e-06, "loss": 0.91910899, "num_input_tokens_seen": 3696760, "step": 176, "time_per_iteration": 2.697988510131836 }, { "auxiliary_loss_clip": 0.01411939, "auxiliary_loss_mlp": 0.01077885, "balance_loss_clip": 1.12874937, "balance_loss_mlp": 1.04898858, "epoch": 0.0212829916431191, "flos": 24715770543360.0, "grad_norm": 3.0267801463555792, "language_loss": 0.73117143, "learning_rate": 3.749840710329894e-06, "loss": 0.75606966, "num_input_tokens_seen": 3717465, "step": 177, "time_per_iteration": 2.755798578262329 }, { "auxiliary_loss_clip": 0.01425637, "auxiliary_loss_mlp": 0.01084445, "balance_loss_clip": 1.13505018, "balance_loss_mlp": 1.05268741, "epoch": 0.02140323453375819, "flos": 16645493508480.0, "grad_norm": 2.8415367298495986, "language_loss": 0.98032123, "learning_rate": 3.7539221067588938e-06, "loss": 1.005422, "num_input_tokens_seen": 3731440, "step": 178, "time_per_iteration": 2.630544900894165 }, { "auxiliary_loss_clip": 0.01419593, "auxiliary_loss_mlp": 0.01072017, "balance_loss_clip": 1.13037694, "balance_loss_mlp": 1.04235792, "epoch": 0.021523477424397284, "flos": 20299332689280.0, "grad_norm": 4.89955276432131, "language_loss": 0.93294257, "learning_rate": 3.757980638101964e-06, "loss": 0.95785862, "num_input_tokens_seen": 3744935, "step": 179, "time_per_iteration": 2.660855293273926 }, { "auxiliary_loss_clip": 0.01418601, "auxiliary_loss_mlp": 0.01074313, "balance_loss_clip": 1.13336265, "balance_loss_mlp": 1.04300904, "epoch": 0.021643720315036374, "flos": 26104005331200.0, "grad_norm": 2.443507878317669, "language_loss": 0.89742619, "learning_rate": 3.7620165591252806e-06, "loss": 0.92235529, "num_input_tokens_seen": 3763035, "step": 180, "time_per_iteration": 2.8360371589660645 }, { "auxiliary_loss_clip": 0.01407166, "auxiliary_loss_mlp": 0.01071396, "balance_loss_clip": 1.1283797, "balance_loss_mlp": 1.04342914, "epoch": 0.021763963205675464, "flos": 24787663614720.0, "grad_norm": 1.9640126866231804, "language_loss": 0.94199884, "learning_rate": 3.766030120360636e-06, "loss": 0.96678448, "num_input_tokens_seen": 3782665, "step": 181, "time_per_iteration": 2.709075689315796 }, { "auxiliary_loss_clip": 0.01406248, "auxiliary_loss_mlp": 0.01072845, "balance_loss_clip": 1.12406206, "balance_loss_mlp": 1.04483044, "epoch": 0.021884206096314557, "flos": 25813559957760.0, "grad_norm": 2.066837876421517, "language_loss": 0.90295351, "learning_rate": 3.7700215681987578e-06, "loss": 0.92774439, "num_input_tokens_seen": 3802435, "step": 182, "time_per_iteration": 2.7456021308898926 }, { "auxiliary_loss_clip": 0.01411272, "auxiliary_loss_mlp": 0.01081514, "balance_loss_clip": 1.12870407, "balance_loss_mlp": 1.05247521, "epoch": 0.022004448986953647, "flos": 20082719721600.0, "grad_norm": 1.7900003913234817, "language_loss": 0.82284099, "learning_rate": 3.7739911449800767e-06, "loss": 0.8477689, "num_input_tokens_seen": 3822490, "step": 183, "time_per_iteration": 2.694955825805664 }, { "auxiliary_loss_clip": 0.01407091, "auxiliary_loss_mlp": 0.01079583, "balance_loss_clip": 1.12529325, "balance_loss_mlp": 1.04920888, "epoch": 0.022124691877592736, "flos": 20480609652480.0, "grad_norm": 2.7839989433001033, "language_loss": 0.80622828, "learning_rate": 3.7779390890830114e-06, "loss": 0.83109504, "num_input_tokens_seen": 3841140, "step": 184, "time_per_iteration": 2.658193826675415 }, { "auxiliary_loss_clip": 0.01408778, "auxiliary_loss_mlp": 0.01076505, "balance_loss_clip": 1.1268189, "balance_loss_mlp": 1.04682207, "epoch": 0.02224493476823183, "flos": 23586847015680.0, "grad_norm": 2.1728194928852482, "language_loss": 0.85912228, "learning_rate": 3.7818656350098723e-06, "loss": 0.88397515, "num_input_tokens_seen": 3862090, "step": 185, "time_per_iteration": 2.769411325454712 }, { "auxiliary_loss_clip": 0.01405809, "auxiliary_loss_mlp": 0.01082886, "balance_loss_clip": 1.12671089, "balance_loss_mlp": 1.05220175, "epoch": 0.02236517765887092, "flos": 16909940413440.0, "grad_norm": 2.7828755030698593, "language_loss": 0.77147359, "learning_rate": 3.7857710134704447e-06, "loss": 0.79636049, "num_input_tokens_seen": 3881025, "step": 186, "time_per_iteration": 2.721472978591919 }, { "auxiliary_loss_clip": 0.01399372, "auxiliary_loss_mlp": 0.01073158, "balance_loss_clip": 1.12324059, "balance_loss_mlp": 1.04497671, "epoch": 0.02248542054951001, "flos": 43508182930560.0, "grad_norm": 2.230348795603599, "language_loss": 0.79168236, "learning_rate": 3.7896554514633234e-06, "loss": 0.81640768, "num_input_tokens_seen": 3905310, "step": 187, "time_per_iteration": 2.8829474449157715 }, { "auxiliary_loss_clip": 0.01398468, "auxiliary_loss_mlp": 0.01066677, "balance_loss_clip": 1.12208045, "balance_loss_mlp": 1.03818607, "epoch": 0.022605663440149103, "flos": 23367648268800.0, "grad_norm": 2.9149948576442837, "language_loss": 0.8440634, "learning_rate": 3.7935191723550955e-06, "loss": 0.86871487, "num_input_tokens_seen": 3924265, "step": 188, "time_per_iteration": 2.670006275177002 }, { "auxiliary_loss_clip": 0.01388246, "auxiliary_loss_mlp": 0.01074799, "balance_loss_clip": 1.11785257, "balance_loss_mlp": 1.04782224, "epoch": 0.022725906330788193, "flos": 29019915504000.0, "grad_norm": 2.0983350916139134, "language_loss": 0.88750267, "learning_rate": 3.797362395957408e-06, "loss": 0.91213316, "num_input_tokens_seen": 3944830, "step": 189, "time_per_iteration": 2.7288918495178223 }, { "auxiliary_loss_clip": 0.01402762, "auxiliary_loss_mlp": 0.01083275, "balance_loss_clip": 1.12559366, "balance_loss_mlp": 1.05428326, "epoch": 0.022846149221427282, "flos": 24496176746880.0, "grad_norm": 5.598758273483956, "language_loss": 0.78658938, "learning_rate": 3.8011853386020055e-06, "loss": 0.81144971, "num_input_tokens_seen": 3965735, "step": 190, "time_per_iteration": 2.697518825531006 }, { "auxiliary_loss_clip": 0.013973, "auxiliary_loss_mlp": 0.01070077, "balance_loss_clip": 1.12217724, "balance_loss_mlp": 1.04287302, "epoch": 0.022966392112066376, "flos": 15523537219200.0, "grad_norm": 3.1072005672095853, "language_loss": 0.89661324, "learning_rate": 3.804988213213804e-06, "loss": 0.92128694, "num_input_tokens_seen": 3983975, "step": 191, "time_per_iteration": 2.742000102996826 }, { "auxiliary_loss_clip": 0.01424385, "auxiliary_loss_mlp": 0.0104792, "balance_loss_clip": 1.20159054, "balance_loss_mlp": 1.02426839, "epoch": 0.023086635002705466, "flos": 55650408433920.0, "grad_norm": 1.0223256900886744, "language_loss": 0.63196737, "learning_rate": 3.808771229382049e-06, "loss": 0.65669036, "num_input_tokens_seen": 4043440, "step": 192, "time_per_iteration": 3.148181438446045 }, { "auxiliary_loss_clip": 0.01390792, "auxiliary_loss_mlp": 0.01068937, "balance_loss_clip": 1.11754155, "balance_loss_mlp": 1.04254425, "epoch": 0.023206877893344555, "flos": 19313441118720.0, "grad_norm": 1.9398014541343245, "language_loss": 0.84545594, "learning_rate": 3.8125345934296324e-06, "loss": 0.87005317, "num_input_tokens_seen": 4061750, "step": 193, "time_per_iteration": 2.655709981918335 }, { "auxiliary_loss_clip": 0.01396234, "auxiliary_loss_mlp": 0.01077713, "balance_loss_clip": 1.12115633, "balance_loss_mlp": 1.04793429, "epoch": 0.02332712078398365, "flos": 23072965090560.0, "grad_norm": 2.0619972709396954, "language_loss": 0.87907934, "learning_rate": 3.81627850848061e-06, "loss": 0.90381879, "num_input_tokens_seen": 4082345, "step": 194, "time_per_iteration": 2.7164971828460693 }, { "auxiliary_loss_clip": 0.01393204, "auxiliary_loss_mlp": 0.01077457, "balance_loss_clip": 1.12242723, "balance_loss_mlp": 1.05020618, "epoch": 0.02344736367462274, "flos": 24425971614720.0, "grad_norm": 2.4219065298047786, "language_loss": 0.86195797, "learning_rate": 3.820003174525994e-06, "loss": 0.88666457, "num_input_tokens_seen": 4101770, "step": 195, "time_per_iteration": 2.713988780975342 }, { "auxiliary_loss_clip": 0.01390099, "auxiliary_loss_mlp": 0.0106924, "balance_loss_clip": 1.12090039, "balance_loss_mlp": 1.04232228, "epoch": 0.02356760656526183, "flos": 21579799697280.0, "grad_norm": 2.366244480208766, "language_loss": 0.82935309, "learning_rate": 3.823708788487851e-06, "loss": 0.85394645, "num_input_tokens_seen": 4118770, "step": 196, "time_per_iteration": 2.701359510421753 }, { "auxiliary_loss_clip": 0.01387122, "auxiliary_loss_mlp": 0.01073895, "balance_loss_clip": 1.12139237, "balance_loss_mlp": 1.0482409, "epoch": 0.02368784945590092, "flos": 25193598192000.0, "grad_norm": 1.876095675334408, "language_loss": 0.84847367, "learning_rate": 3.827395544281781e-06, "loss": 0.87308383, "num_input_tokens_seen": 4141110, "step": 197, "time_per_iteration": 2.6688716411590576 }, { "auxiliary_loss_clip": 0.01386979, "auxiliary_loss_mlp": 0.01070761, "balance_loss_clip": 1.11767662, "balance_loss_mlp": 1.04449916, "epoch": 0.02380809234654001, "flos": 27562481164800.0, "grad_norm": 1.9965360485312935, "language_loss": 0.78865445, "learning_rate": 3.831063632877802e-06, "loss": 0.81323183, "num_input_tokens_seen": 4161430, "step": 198, "time_per_iteration": 4.5386834144592285 }, { "auxiliary_loss_clip": 0.01385574, "auxiliary_loss_mlp": 0.01079191, "balance_loss_clip": 1.12173235, "balance_loss_mlp": 1.05407333, "epoch": 0.0239283352371791, "flos": 18259786540800.0, "grad_norm": 2.580862384267779, "language_loss": 0.75804043, "learning_rate": 3.834713242359712e-06, "loss": 0.78268808, "num_input_tokens_seen": 4179260, "step": 199, "time_per_iteration": 2.6747987270355225 }, { "auxiliary_loss_clip": 0.01388643, "auxiliary_loss_mlp": 0.01083181, "balance_loss_clip": 1.11798477, "balance_loss_mlp": 1.05588186, "epoch": 0.02404857812781819, "flos": 21395110942080.0, "grad_norm": 1.8437114051578152, "language_loss": 0.87251681, "learning_rate": 3.838344557982959e-06, "loss": 0.89723504, "num_input_tokens_seen": 4200640, "step": 200, "time_per_iteration": 3.6128945350646973 }, { "auxiliary_loss_clip": 0.01378977, "auxiliary_loss_mlp": 0.01067945, "balance_loss_clip": 1.11379242, "balance_loss_mlp": 1.04194546, "epoch": 0.024168821018457284, "flos": 16654256426880.0, "grad_norm": 5.059658097319671, "language_loss": 0.84734201, "learning_rate": 3.841957762231063e-06, "loss": 0.87181115, "num_input_tokens_seen": 4218170, "step": 201, "time_per_iteration": 2.698756217956543 }, { "auxiliary_loss_clip": 0.01380125, "auxiliary_loss_mlp": 0.01067146, "balance_loss_clip": 1.1156652, "balance_loss_mlp": 1.04130137, "epoch": 0.024289063909096374, "flos": 22820872464000.0, "grad_norm": 2.114674186601082, "language_loss": 0.87740046, "learning_rate": 3.8455530348706454e-06, "loss": 0.90187317, "num_input_tokens_seen": 4237770, "step": 202, "time_per_iteration": 2.735649347305298 }, { "auxiliary_loss_clip": 0.01382995, "auxiliary_loss_mlp": 0.01065001, "balance_loss_clip": 1.11912084, "balance_loss_mlp": 1.03952575, "epoch": 0.024409306799735464, "flos": 17748598135680.0, "grad_norm": 1.9806504669162635, "language_loss": 0.77653897, "learning_rate": 3.849130553005099e-06, "loss": 0.80101889, "num_input_tokens_seen": 4255985, "step": 203, "time_per_iteration": 2.630460023880005 }, { "auxiliary_loss_clip": 0.0138415, "auxiliary_loss_mlp": 0.01066768, "balance_loss_clip": 1.11832261, "balance_loss_mlp": 1.04070926, "epoch": 0.024529549690374557, "flos": 21616213109760.0, "grad_norm": 1.8619236535920554, "language_loss": 0.83487654, "learning_rate": 3.852690491126933e-06, "loss": 0.85938567, "num_input_tokens_seen": 4276035, "step": 204, "time_per_iteration": 2.7258100509643555 }, { "auxiliary_loss_clip": 0.01381173, "auxiliary_loss_mlp": 0.01068178, "balance_loss_clip": 1.11534286, "balance_loss_mlp": 1.04197586, "epoch": 0.024649792581013647, "flos": 25551662918400.0, "grad_norm": 3.3316653418974655, "language_loss": 0.91263688, "learning_rate": 3.856233021168845e-06, "loss": 0.93713033, "num_input_tokens_seen": 4295730, "step": 205, "time_per_iteration": 2.7292428016662598 }, { "auxiliary_loss_clip": 0.01369062, "auxiliary_loss_mlp": 0.01052449, "balance_loss_clip": 1.11089206, "balance_loss_mlp": 1.02784479, "epoch": 0.024770035471652737, "flos": 34495574544000.0, "grad_norm": 2.166665954917511, "language_loss": 0.91086072, "learning_rate": 3.859758312553544e-06, "loss": 0.93507588, "num_input_tokens_seen": 4317950, "step": 206, "time_per_iteration": 2.786299467086792 }, { "auxiliary_loss_clip": 0.01380371, "auxiliary_loss_mlp": 0.0107929, "balance_loss_clip": 1.11843777, "balance_loss_mlp": 1.0534575, "epoch": 0.02489027836229183, "flos": 21505428587520.0, "grad_norm": 2.0029285186652763, "language_loss": 0.91782427, "learning_rate": 3.8632665322423735e-06, "loss": 0.9424209, "num_input_tokens_seen": 4337605, "step": 207, "time_per_iteration": 2.659464120864868 }, { "auxiliary_loss_clip": 0.01370982, "auxiliary_loss_mlp": 0.01059235, "balance_loss_clip": 1.11124563, "balance_loss_mlp": 1.03426087, "epoch": 0.02501052125293092, "flos": 23219013790080.0, "grad_norm": 1.9477579942988144, "language_loss": 0.86006653, "learning_rate": 3.866757844782762e-06, "loss": 0.88436878, "num_input_tokens_seen": 4358110, "step": 208, "time_per_iteration": 2.7445032596588135 }, { "auxiliary_loss_clip": 0.01374298, "auxiliary_loss_mlp": 0.01075402, "balance_loss_clip": 1.1146307, "balance_loss_mlp": 1.04857945, "epoch": 0.02513076414357001, "flos": 26388920010240.0, "grad_norm": 2.291547664256191, "language_loss": 0.91404188, "learning_rate": 3.870232412354527e-06, "loss": 0.93853891, "num_input_tokens_seen": 4374955, "step": 209, "time_per_iteration": 2.657935619354248 }, { "auxiliary_loss_clip": 0.01373241, "auxiliary_loss_mlp": 0.01072729, "balance_loss_clip": 1.11095214, "balance_loss_mlp": 1.0470041, "epoch": 0.025251007034209103, "flos": 13590430047360.0, "grad_norm": 3.0229336004701084, "language_loss": 0.92506224, "learning_rate": 3.873690394815086e-06, "loss": 0.94952196, "num_input_tokens_seen": 4391535, "step": 210, "time_per_iteration": 2.6614913940429688 }, { "auxiliary_loss_clip": 0.01373158, "auxiliary_loss_mlp": 0.01065687, "balance_loss_clip": 1.11311305, "balance_loss_mlp": 1.04083157, "epoch": 0.025371249924848193, "flos": 15049229103360.0, "grad_norm": 4.957668518303082, "language_loss": 0.91466153, "learning_rate": 3.877131949743587e-06, "loss": 0.93905002, "num_input_tokens_seen": 4408400, "step": 211, "time_per_iteration": 2.619922161102295 }, { "auxiliary_loss_clip": 0.01366479, "auxiliary_loss_mlp": 0.01072741, "balance_loss_clip": 1.11053419, "balance_loss_mlp": 1.04627705, "epoch": 0.025491492815487283, "flos": 25553853648000.0, "grad_norm": 2.1874376359050105, "language_loss": 0.78019929, "learning_rate": 3.880557232483993e-06, "loss": 0.80459154, "num_input_tokens_seen": 4427840, "step": 212, "time_per_iteration": 2.6772422790527344 }, { "auxiliary_loss_clip": 0.01369696, "auxiliary_loss_mlp": 0.0106109, "balance_loss_clip": 1.11089015, "balance_loss_mlp": 1.03572202, "epoch": 0.025611735706126376, "flos": 20630752502400.0, "grad_norm": 2.020593195483851, "language_loss": 0.87089372, "learning_rate": 3.883966396187164e-06, "loss": 0.89520162, "num_input_tokens_seen": 4447110, "step": 213, "time_per_iteration": 2.702227830886841 }, { "auxiliary_loss_clip": 0.01374402, "auxiliary_loss_mlp": 0.01064726, "balance_loss_clip": 1.11721861, "balance_loss_mlp": 1.03894114, "epoch": 0.025731978596765466, "flos": 19062282245760.0, "grad_norm": 2.252954858744666, "language_loss": 0.90110826, "learning_rate": 3.887359591851937e-06, "loss": 0.92549956, "num_input_tokens_seen": 4464715, "step": 214, "time_per_iteration": 2.6565611362457275 }, { "auxiliary_loss_clip": 0.01366586, "auxiliary_loss_mlp": 0.01062174, "balance_loss_clip": 1.11250806, "balance_loss_mlp": 1.03750992, "epoch": 0.025852221487404556, "flos": 22163814927360.0, "grad_norm": 2.4242827601028316, "language_loss": 0.92245996, "learning_rate": 3.890736968365265e-06, "loss": 0.94674766, "num_input_tokens_seen": 4485030, "step": 215, "time_per_iteration": 2.6806750297546387 }, { "auxiliary_loss_clip": 0.01360784, "auxiliary_loss_mlp": 0.01058745, "balance_loss_clip": 1.10600471, "balance_loss_mlp": 1.03335333, "epoch": 0.02597246437804365, "flos": 26541971861760.0, "grad_norm": 1.9749013177225792, "language_loss": 0.85355258, "learning_rate": 3.894098672541412e-06, "loss": 0.87774789, "num_input_tokens_seen": 4505935, "step": 216, "time_per_iteration": 2.6665074825286865 }, { "auxiliary_loss_clip": 0.01366495, "auxiliary_loss_mlp": 0.01063829, "balance_loss_clip": 1.11110401, "balance_loss_mlp": 1.0386517, "epoch": 0.02609270726868274, "flos": 32671671696000.0, "grad_norm": 2.1200752681509476, "language_loss": 0.75447011, "learning_rate": 3.89744484916025e-06, "loss": 0.77877343, "num_input_tokens_seen": 4527045, "step": 217, "time_per_iteration": 2.8706514835357666 }, { "auxiliary_loss_clip": 0.01373167, "auxiliary_loss_mlp": 0.01078324, "balance_loss_clip": 1.11418581, "balance_loss_mlp": 1.05104923, "epoch": 0.02621295015932183, "flos": 26243553669120.0, "grad_norm": 2.0704680257607144, "language_loss": 0.87497735, "learning_rate": 3.900775641004673e-06, "loss": 0.8994922, "num_input_tokens_seen": 4546360, "step": 218, "time_per_iteration": 2.688509225845337 }, { "auxiliary_loss_clip": 0.01376346, "auxiliary_loss_mlp": 0.01069028, "balance_loss_clip": 1.11796081, "balance_loss_mlp": 1.04112124, "epoch": 0.026333193049960922, "flos": 42921402353280.0, "grad_norm": 4.064680761035665, "language_loss": 0.74120462, "learning_rate": 3.904091188897156e-06, "loss": 0.76565832, "num_input_tokens_seen": 4565495, "step": 219, "time_per_iteration": 2.9124033451080322 }, { "auxiliary_loss_clip": 0.01365027, "auxiliary_loss_mlp": 0.01069947, "balance_loss_clip": 1.1113975, "balance_loss_mlp": 1.04419804, "epoch": 0.026453435940600012, "flos": 17963846386560.0, "grad_norm": 2.140406371677079, "language_loss": 0.82072681, "learning_rate": 3.90739163173548e-06, "loss": 0.8450765, "num_input_tokens_seen": 4583330, "step": 220, "time_per_iteration": 2.750692129135132 }, { "auxiliary_loss_clip": 0.01359414, "auxiliary_loss_mlp": 0.01086942, "balance_loss_clip": 1.10893106, "balance_loss_mlp": 1.06097841, "epoch": 0.026573678831239102, "flos": 18984319776000.0, "grad_norm": 2.7208375676501504, "language_loss": 0.88433409, "learning_rate": 3.910677106527646e-06, "loss": 0.90879774, "num_input_tokens_seen": 4600520, "step": 221, "time_per_iteration": 2.8090085983276367 }, { "auxiliary_loss_clip": 0.01363616, "auxiliary_loss_mlp": 0.01077908, "balance_loss_clip": 1.11041307, "balance_loss_mlp": 1.05346966, "epoch": 0.026693921721878195, "flos": 29241448634880.0, "grad_norm": 2.265775324867123, "language_loss": 0.84182537, "learning_rate": 3.913947748426004e-06, "loss": 0.86624056, "num_input_tokens_seen": 4617340, "step": 222, "time_per_iteration": 2.7633893489837646 }, { "auxiliary_loss_clip": 0.01365878, "auxiliary_loss_mlp": 0.01075875, "balance_loss_clip": 1.11373997, "balance_loss_mlp": 1.04980445, "epoch": 0.026814164612517285, "flos": 14128083797760.0, "grad_norm": 3.2272226927045127, "language_loss": 0.76348335, "learning_rate": 3.9172036907606136e-06, "loss": 0.78790087, "num_input_tokens_seen": 4630820, "step": 223, "time_per_iteration": 2.6127636432647705 }, { "auxiliary_loss_clip": 0.01362851, "auxiliary_loss_mlp": 0.0107148, "balance_loss_clip": 1.1085453, "balance_loss_mlp": 1.04537356, "epoch": 0.026934407503156375, "flos": 23511973115520.0, "grad_norm": 1.7491281540616526, "language_loss": 0.94972265, "learning_rate": 3.920445065071855e-06, "loss": 0.9740659, "num_input_tokens_seen": 4651985, "step": 224, "time_per_iteration": 4.5359110832214355 }, { "auxiliary_loss_clip": 0.01366078, "auxiliary_loss_mlp": 0.01076418, "balance_loss_clip": 1.11330843, "balance_loss_mlp": 1.05066872, "epoch": 0.027054650393795468, "flos": 28950356816640.0, "grad_norm": 5.207757123500714, "language_loss": 0.80201417, "learning_rate": 3.923672001142322e-06, "loss": 0.82643914, "num_input_tokens_seen": 4672295, "step": 225, "time_per_iteration": 2.664036512374878 }, { "auxiliary_loss_clip": 0.01360027, "auxiliary_loss_mlp": 0.01069923, "balance_loss_clip": 1.10878325, "balance_loss_mlp": 1.0442692, "epoch": 0.027174893284434558, "flos": 31431568596480.0, "grad_norm": 2.451566289900459, "language_loss": 0.84323239, "learning_rate": 3.926884627027996e-06, "loss": 0.8675319, "num_input_tokens_seen": 4696065, "step": 226, "time_per_iteration": 3.6600916385650635 }, { "auxiliary_loss_clip": 0.01358885, "auxiliary_loss_mlp": 0.01052584, "balance_loss_clip": 1.10742116, "balance_loss_mlp": 1.02708578, "epoch": 0.027295136175073648, "flos": 22054466949120.0, "grad_norm": 1.804036186756018, "language_loss": 0.77367485, "learning_rate": 3.930083069088744e-06, "loss": 0.79778951, "num_input_tokens_seen": 4716065, "step": 227, "time_per_iteration": 3.5305886268615723 }, { "auxiliary_loss_clip": 0.01368282, "auxiliary_loss_mlp": 0.01025818, "balance_loss_clip": 1.17563963, "balance_loss_mlp": 1.00455093, "epoch": 0.02741537906571274, "flos": 60800752972800.0, "grad_norm": 0.9839098014685971, "language_loss": 0.5936591, "learning_rate": 3.933267452018137e-06, "loss": 0.61760014, "num_input_tokens_seen": 4775860, "step": 228, "time_per_iteration": 3.2374987602233887 }, { "auxiliary_loss_clip": 0.01354518, "auxiliary_loss_mlp": 0.01073147, "balance_loss_clip": 1.10865688, "balance_loss_mlp": 1.04837549, "epoch": 0.02753562195635183, "flos": 24606278910720.0, "grad_norm": 5.532826906021258, "language_loss": 0.84155011, "learning_rate": 3.936437898872622e-06, "loss": 0.86582673, "num_input_tokens_seen": 4795835, "step": 229, "time_per_iteration": 2.678624391555786 }, { "auxiliary_loss_clip": 0.01362265, "auxiliary_loss_mlp": 0.01063646, "balance_loss_clip": 1.10895681, "balance_loss_mlp": 1.03799224, "epoch": 0.02765586484699092, "flos": 34094236907520.0, "grad_norm": 2.3806896401619393, "language_loss": 0.79441482, "learning_rate": 3.9395945311000525e-06, "loss": 0.81867391, "num_input_tokens_seen": 4817460, "step": 230, "time_per_iteration": 2.7873032093048096 }, { "auxiliary_loss_clip": 0.01357619, "auxiliary_loss_mlp": 0.01073678, "balance_loss_clip": 1.10708427, "balance_loss_mlp": 1.04778552, "epoch": 0.027776107737630014, "flos": 14829922615680.0, "grad_norm": 2.0867394344878507, "language_loss": 0.91020858, "learning_rate": 3.942737468567608e-06, "loss": 0.93452156, "num_input_tokens_seen": 4835475, "step": 231, "time_per_iteration": 2.604435682296753 }, { "auxiliary_loss_clip": 0.01351526, "auxiliary_loss_mlp": 0.01066325, "balance_loss_clip": 1.1047672, "balance_loss_mlp": 1.04119575, "epoch": 0.027896350628269104, "flos": 47920347066240.0, "grad_norm": 2.4344314166798884, "language_loss": 0.85790837, "learning_rate": 3.9458668295891026e-06, "loss": 0.88208681, "num_input_tokens_seen": 4857760, "step": 232, "time_per_iteration": 2.91850209236145 }, { "auxiliary_loss_clip": 0.01351954, "auxiliary_loss_mlp": 0.01069965, "balance_loss_clip": 1.10326684, "balance_loss_mlp": 1.0442276, "epoch": 0.028016593518908194, "flos": 21684550734720.0, "grad_norm": 3.9452604430966023, "language_loss": 0.86612588, "learning_rate": 3.948982730951712e-06, "loss": 0.89034498, "num_input_tokens_seen": 4875855, "step": 233, "time_per_iteration": 2.63356876373291 }, { "auxiliary_loss_clip": 0.01358248, "auxiliary_loss_mlp": 0.01063194, "balance_loss_clip": 1.10762227, "balance_loss_mlp": 1.0369916, "epoch": 0.028136836409547287, "flos": 18439483305600.0, "grad_norm": 2.224147076802408, "language_loss": 0.82081592, "learning_rate": 3.9520852879421254e-06, "loss": 0.84503037, "num_input_tokens_seen": 4893200, "step": 234, "time_per_iteration": 2.6894471645355225 }, { "auxiliary_loss_clip": 0.01350632, "auxiliary_loss_mlp": 0.01064343, "balance_loss_clip": 1.10436368, "balance_loss_mlp": 1.03966689, "epoch": 0.028257079300186377, "flos": 31576934937600.0, "grad_norm": 2.245108842603276, "language_loss": 0.81436098, "learning_rate": 3.955174614372137e-06, "loss": 0.83851075, "num_input_tokens_seen": 4912965, "step": 235, "time_per_iteration": 2.7125468254089355 }, { "auxiliary_loss_clip": 0.01350727, "auxiliary_loss_mlp": 0.0106752, "balance_loss_clip": 1.10303402, "balance_loss_mlp": 1.04147267, "epoch": 0.028377322190825467, "flos": 23513337832320.0, "grad_norm": 3.0704625904315135, "language_loss": 0.84168935, "learning_rate": 3.9582508226037045e-06, "loss": 0.86587191, "num_input_tokens_seen": 4933105, "step": 236, "time_per_iteration": 2.7349321842193604 }, { "auxiliary_loss_clip": 0.01362934, "auxiliary_loss_mlp": 0.01062573, "balance_loss_clip": 1.1091392, "balance_loss_mlp": 1.03590608, "epoch": 0.02849756508146456, "flos": 20479604071680.0, "grad_norm": 2.271488477327192, "language_loss": 0.9408251, "learning_rate": 3.9613140235734636e-06, "loss": 0.9650802, "num_input_tokens_seen": 4950085, "step": 237, "time_per_iteration": 2.6766061782836914 }, { "auxiliary_loss_clip": 0.01355195, "auxiliary_loss_mlp": 0.01062253, "balance_loss_clip": 1.106475, "balance_loss_mlp": 1.03670621, "epoch": 0.02861780797210365, "flos": 14283362292480.0, "grad_norm": 2.014222724855648, "language_loss": 0.8120653, "learning_rate": 3.96436432681674e-06, "loss": 0.83623987, "num_input_tokens_seen": 4968075, "step": 238, "time_per_iteration": 2.6382367610931396 }, { "auxiliary_loss_clip": 0.01352484, "auxiliary_loss_mlp": 0.01064253, "balance_loss_clip": 1.10529494, "balance_loss_mlp": 1.03973138, "epoch": 0.02873805086274274, "flos": 25808532053760.0, "grad_norm": 2.977224040988339, "language_loss": 0.89071715, "learning_rate": 3.967401840491044e-06, "loss": 0.91488445, "num_input_tokens_seen": 4987355, "step": 239, "time_per_iteration": 2.651592254638672 }, { "auxiliary_loss_clip": 0.01347006, "auxiliary_loss_mlp": 0.01064806, "balance_loss_clip": 1.10363281, "balance_loss_mlp": 1.04102337, "epoch": 0.028858293753381833, "flos": 17304238984320.0, "grad_norm": 2.4884097093256994, "language_loss": 0.87739617, "learning_rate": 3.97042667139909e-06, "loss": 0.90151429, "num_input_tokens_seen": 5004680, "step": 240, "time_per_iteration": 2.656529426574707 }, { "auxiliary_loss_clip": 0.01350282, "auxiliary_loss_mlp": 0.01072081, "balance_loss_clip": 1.10387897, "balance_loss_mlp": 1.04664183, "epoch": 0.028978536644020923, "flos": 23038347358080.0, "grad_norm": 10.93327810463271, "language_loss": 0.87293065, "learning_rate": 3.973438925011327e-06, "loss": 0.89715427, "num_input_tokens_seen": 5022965, "step": 241, "time_per_iteration": 2.671586751937866 }, { "auxiliary_loss_clip": 0.01346375, "auxiliary_loss_mlp": 0.01076165, "balance_loss_clip": 1.09961438, "balance_loss_mlp": 1.0507977, "epoch": 0.029098779534660012, "flos": 28329712692480.0, "grad_norm": 2.774401596441149, "language_loss": 0.91392833, "learning_rate": 3.976438705488002e-06, "loss": 0.93815374, "num_input_tokens_seen": 5042625, "step": 242, "time_per_iteration": 2.680312395095825 }, { "auxiliary_loss_clip": 0.0135046, "auxiliary_loss_mlp": 0.01062409, "balance_loss_clip": 1.10603499, "balance_loss_mlp": 1.03700566, "epoch": 0.029219022425299106, "flos": 13881665520000.0, "grad_norm": 2.9984773504281232, "language_loss": 0.92876202, "learning_rate": 3.9794261157007744e-06, "loss": 0.95289069, "num_input_tokens_seen": 5060380, "step": 243, "time_per_iteration": 2.6626274585723877 }, { "auxiliary_loss_clip": 0.01353625, "auxiliary_loss_mlp": 0.01069815, "balance_loss_clip": 1.1073792, "balance_loss_mlp": 1.04337442, "epoch": 0.029339265315938196, "flos": 19422501788160.0, "grad_norm": 2.4165052391244704, "language_loss": 0.84813082, "learning_rate": 3.982401257253887e-06, "loss": 0.87236524, "num_input_tokens_seen": 5078720, "step": 244, "time_per_iteration": 2.6061222553253174 }, { "auxiliary_loss_clip": 0.01342846, "auxiliary_loss_mlp": 0.01067033, "balance_loss_clip": 1.10093045, "balance_loss_mlp": 1.04239249, "epoch": 0.029459508206577285, "flos": 15669550005120.0, "grad_norm": 2.207186613175604, "language_loss": 0.89919561, "learning_rate": 3.985364230504893e-06, "loss": 0.92329443, "num_input_tokens_seen": 5096605, "step": 245, "time_per_iteration": 2.6599957942962646 }, { "auxiliary_loss_clip": 0.0134773, "auxiliary_loss_mlp": 0.01072719, "balance_loss_clip": 1.10385692, "balance_loss_mlp": 1.04643369, "epoch": 0.02957975109721638, "flos": 28220975245440.0, "grad_norm": 2.4352099614384897, "language_loss": 0.84448504, "learning_rate": 3.988315134584976e-06, "loss": 0.86868954, "num_input_tokens_seen": 5116285, "step": 246, "time_per_iteration": 2.6837217807769775 }, { "auxiliary_loss_clip": 0.01348806, "auxiliary_loss_mlp": 0.01073485, "balance_loss_clip": 1.10264587, "balance_loss_mlp": 1.04892826, "epoch": 0.02969999398785547, "flos": 24315869450880.0, "grad_norm": 1.7596427566980672, "language_loss": 0.80598003, "learning_rate": 3.991254067418851e-06, "loss": 0.83020294, "num_input_tokens_seen": 5136825, "step": 247, "time_per_iteration": 2.753472089767456 }, { "auxiliary_loss_clip": 0.01337297, "auxiliary_loss_mlp": 0.01063621, "balance_loss_clip": 1.09834969, "balance_loss_mlp": 1.04055405, "epoch": 0.02982023687849456, "flos": 35078584193280.0, "grad_norm": 2.0857365202818414, "language_loss": 0.83202302, "learning_rate": 3.994181125744254e-06, "loss": 0.85603225, "num_input_tokens_seen": 5158630, "step": 248, "time_per_iteration": 2.7645978927612305 }, { "auxiliary_loss_clip": 0.01343529, "auxiliary_loss_mlp": 0.01078133, "balance_loss_clip": 1.1011734, "balance_loss_mlp": 1.05324197, "epoch": 0.02994047976913365, "flos": 26177155378560.0, "grad_norm": 1.856904781396436, "language_loss": 0.74215227, "learning_rate": 3.99709640513106e-06, "loss": 0.76636887, "num_input_tokens_seen": 5179510, "step": 249, "time_per_iteration": 2.722590684890747 }, { "auxiliary_loss_clip": 0.01350334, "auxiliary_loss_mlp": 0.01067366, "balance_loss_clip": 1.10383487, "balance_loss_mlp": 1.04212964, "epoch": 0.03006072265977274, "flos": 25625028447360.0, "grad_norm": 2.105953870002864, "language_loss": 0.8549, "learning_rate": 4e-06, "loss": 0.87907696, "num_input_tokens_seen": 5199345, "step": 250, "time_per_iteration": 2.7361209392547607 }, { "auxiliary_loss_clip": 0.01347714, "auxiliary_loss_mlp": 0.01063003, "balance_loss_clip": 1.10668254, "balance_loss_mlp": 1.03944731, "epoch": 0.03018096555041183, "flos": 22127078292480.0, "grad_norm": 2.8829844693647004, "language_loss": 0.8857289, "learning_rate": 3.999999848300794e-06, "loss": 0.90983605, "num_input_tokens_seen": 5218330, "step": 251, "time_per_iteration": 3.4900498390197754 }, { "auxiliary_loss_clip": 0.01337732, "auxiliary_loss_mlp": 0.01079132, "balance_loss_clip": 1.09624577, "balance_loss_mlp": 1.05577874, "epoch": 0.030301208441050925, "flos": 30188197359360.0, "grad_norm": 1.5809425833560082, "language_loss": 0.89047796, "learning_rate": 3.999999393203203e-06, "loss": 0.91464663, "num_input_tokens_seen": 5240740, "step": 252, "time_per_iteration": 4.590440988540649 }, { "auxiliary_loss_clip": 0.01334798, "auxiliary_loss_mlp": 0.01065429, "balance_loss_clip": 1.09545386, "balance_loss_mlp": 1.0411222, "epoch": 0.030421451331690014, "flos": 23621392920960.0, "grad_norm": 2.289893376058141, "language_loss": 0.84868735, "learning_rate": 3.999998634707293e-06, "loss": 0.8726896, "num_input_tokens_seen": 5260290, "step": 253, "time_per_iteration": 2.6818125247955322 }, { "auxiliary_loss_clip": 0.01344703, "auxiliary_loss_mlp": 0.01066053, "balance_loss_clip": 1.1044848, "balance_loss_mlp": 1.04149544, "epoch": 0.030541694222329104, "flos": 27928446883200.0, "grad_norm": 4.91595988392828, "language_loss": 0.96379244, "learning_rate": 3.999997572813182e-06, "loss": 0.98789996, "num_input_tokens_seen": 5278100, "step": 254, "time_per_iteration": 3.52864146232605 }, { "auxiliary_loss_clip": 0.01337639, "auxiliary_loss_mlp": 0.01078044, "balance_loss_clip": 1.09878421, "balance_loss_mlp": 1.0536654, "epoch": 0.030661937112968194, "flos": 18588441006720.0, "grad_norm": 1.8324908657509582, "language_loss": 0.87587774, "learning_rate": 3.999996207521028e-06, "loss": 0.90003455, "num_input_tokens_seen": 5296810, "step": 255, "time_per_iteration": 2.628528118133545 }, { "auxiliary_loss_clip": 0.01339392, "auxiliary_loss_mlp": 0.01064605, "balance_loss_clip": 1.0988338, "balance_loss_mlp": 1.03961909, "epoch": 0.030782180003607287, "flos": 12969139478400.0, "grad_norm": 2.5871490744840164, "language_loss": 0.82177716, "learning_rate": 3.999994538831039e-06, "loss": 0.84581709, "num_input_tokens_seen": 5313395, "step": 256, "time_per_iteration": 2.588918447494507 }, { "auxiliary_loss_clip": 0.01336822, "auxiliary_loss_mlp": 0.01059462, "balance_loss_clip": 1.09770668, "balance_loss_mlp": 1.03559625, "epoch": 0.030902422894246377, "flos": 23335364920320.0, "grad_norm": 2.843031854978887, "language_loss": 0.85769641, "learning_rate": 3.99999256674347e-06, "loss": 0.88165927, "num_input_tokens_seen": 5333545, "step": 257, "time_per_iteration": 2.710085153579712 }, { "auxiliary_loss_clip": 0.01336217, "auxiliary_loss_mlp": 0.01020433, "balance_loss_clip": 1.15786099, "balance_loss_mlp": 1.00174141, "epoch": 0.031022665784885467, "flos": 55094151438720.0, "grad_norm": 1.0090276440086414, "language_loss": 0.53513014, "learning_rate": 3.999990291258618e-06, "loss": 0.55869663, "num_input_tokens_seen": 5392235, "step": 258, "time_per_iteration": 3.1526241302490234 }, { "auxiliary_loss_clip": 0.01339927, "auxiliary_loss_mlp": 0.01064244, "balance_loss_clip": 1.10158336, "balance_loss_mlp": 1.03986526, "epoch": 0.03114290867552456, "flos": 19317786664320.0, "grad_norm": 2.2458592609749464, "language_loss": 0.86540735, "learning_rate": 3.999987712376829e-06, "loss": 0.88944906, "num_input_tokens_seen": 5410555, "step": 259, "time_per_iteration": 2.681306838989258 }, { "auxiliary_loss_clip": 0.01337991, "auxiliary_loss_mlp": 0.010629, "balance_loss_clip": 1.10018408, "balance_loss_mlp": 1.03976202, "epoch": 0.031263151566163654, "flos": 20959442881920.0, "grad_norm": 1.9503577024033176, "language_loss": 0.82219505, "learning_rate": 3.999984830098494e-06, "loss": 0.84620398, "num_input_tokens_seen": 5430135, "step": 260, "time_per_iteration": 2.7045748233795166 }, { "auxiliary_loss_clip": 0.0134, "auxiliary_loss_mlp": 0.01074978, "balance_loss_clip": 1.10087299, "balance_loss_mlp": 1.05034971, "epoch": 0.03138339445680274, "flos": 14793006412800.0, "grad_norm": 3.062506771765727, "language_loss": 0.97906584, "learning_rate": 3.999981644424051e-06, "loss": 1.00321555, "num_input_tokens_seen": 5444935, "step": 261, "time_per_iteration": 2.6952881813049316 }, { "auxiliary_loss_clip": 0.01337409, "auxiliary_loss_mlp": 0.01077973, "balance_loss_clip": 1.09862018, "balance_loss_mlp": 1.05335677, "epoch": 0.03150363734744183, "flos": 11655599022720.0, "grad_norm": 2.3890442892036265, "language_loss": 0.86307955, "learning_rate": 3.999978155353982e-06, "loss": 0.88723338, "num_input_tokens_seen": 5462080, "step": 262, "time_per_iteration": 2.6628551483154297 }, { "auxiliary_loss_clip": 0.01338835, "auxiliary_loss_mlp": 0.01060735, "balance_loss_clip": 1.09929132, "balance_loss_mlp": 1.03603458, "epoch": 0.03162388023808092, "flos": 33727732485120.0, "grad_norm": 2.3403976063385032, "language_loss": 0.80391967, "learning_rate": 3.9999743628888186e-06, "loss": 0.82791543, "num_input_tokens_seen": 5483870, "step": 263, "time_per_iteration": 2.783906936645508 }, { "auxiliary_loss_clip": 0.01335705, "auxiliary_loss_mlp": 0.01067495, "balance_loss_clip": 1.09747553, "balance_loss_mlp": 1.04335511, "epoch": 0.03174412312872001, "flos": 20810952057600.0, "grad_norm": 3.147541751871499, "language_loss": 0.89422321, "learning_rate": 3.999970267029133e-06, "loss": 0.91825521, "num_input_tokens_seen": 5502830, "step": 264, "time_per_iteration": 2.7107770442962646 }, { "auxiliary_loss_clip": 0.01330505, "auxiliary_loss_mlp": 0.01063053, "balance_loss_clip": 1.0946393, "balance_loss_mlp": 1.03922284, "epoch": 0.0318643660193591, "flos": 23727939638400.0, "grad_norm": 1.9508836557499873, "language_loss": 0.80028558, "learning_rate": 3.999965867775548e-06, "loss": 0.82422125, "num_input_tokens_seen": 5523225, "step": 265, "time_per_iteration": 2.631235122680664 }, { "auxiliary_loss_clip": 0.01331241, "auxiliary_loss_mlp": 0.0106737, "balance_loss_clip": 1.09544206, "balance_loss_mlp": 1.04269385, "epoch": 0.0319846089099982, "flos": 13917863450880.0, "grad_norm": 2.881047464181732, "language_loss": 0.87042403, "learning_rate": 3.9999611651287315e-06, "loss": 0.89441013, "num_input_tokens_seen": 5541380, "step": 266, "time_per_iteration": 2.617414951324463 }, { "auxiliary_loss_clip": 0.01332719, "auxiliary_loss_mlp": 0.01076135, "balance_loss_clip": 1.09657466, "balance_loss_mlp": 1.05181599, "epoch": 0.03210485180063729, "flos": 14753253035520.0, "grad_norm": 3.7940687898769054, "language_loss": 0.78691453, "learning_rate": 3.999956159089396e-06, "loss": 0.81100303, "num_input_tokens_seen": 5558830, "step": 267, "time_per_iteration": 2.607097864151001 }, { "auxiliary_loss_clip": 0.01332113, "auxiliary_loss_mlp": 0.01063492, "balance_loss_clip": 1.09792328, "balance_loss_mlp": 1.04018641, "epoch": 0.03222509469127638, "flos": 28913153304960.0, "grad_norm": 2.1877170444378864, "language_loss": 0.79694068, "learning_rate": 3.999950849658302e-06, "loss": 0.82089674, "num_input_tokens_seen": 5577750, "step": 268, "time_per_iteration": 2.7050960063934326 }, { "auxiliary_loss_clip": 0.01337263, "auxiliary_loss_mlp": 0.01072623, "balance_loss_clip": 1.10064626, "balance_loss_mlp": 1.04812574, "epoch": 0.03234533758191547, "flos": 16946389739520.0, "grad_norm": 2.1678837283168324, "language_loss": 0.84009469, "learning_rate": 3.999945236836254e-06, "loss": 0.86419356, "num_input_tokens_seen": 5596715, "step": 269, "time_per_iteration": 2.596172332763672 }, { "auxiliary_loss_clip": 0.0133696, "auxiliary_loss_mlp": 0.01064486, "balance_loss_clip": 1.099208, "balance_loss_mlp": 1.03980982, "epoch": 0.03246558047255456, "flos": 18989096284800.0, "grad_norm": 2.974143678885928, "language_loss": 0.94610828, "learning_rate": 3.999939320624103e-06, "loss": 0.97012275, "num_input_tokens_seen": 5611865, "step": 270, "time_per_iteration": 2.6491568088531494 }, { "auxiliary_loss_clip": 0.01333641, "auxiliary_loss_mlp": 0.01061049, "balance_loss_clip": 1.097296, "balance_loss_mlp": 1.03695679, "epoch": 0.03258582336319365, "flos": 23728334688000.0, "grad_norm": 1.8608006563716155, "language_loss": 0.89680457, "learning_rate": 3.999933101022749e-06, "loss": 0.92075151, "num_input_tokens_seen": 5632270, "step": 271, "time_per_iteration": 2.77817440032959 }, { "auxiliary_loss_clip": 0.01329806, "auxiliary_loss_mlp": 0.01068224, "balance_loss_clip": 1.09666324, "balance_loss_mlp": 1.04382205, "epoch": 0.032706066253832745, "flos": 27670823562240.0, "grad_norm": 1.8986045791647799, "language_loss": 0.86757129, "learning_rate": 3.999926578033132e-06, "loss": 0.89155155, "num_input_tokens_seen": 5652085, "step": 272, "time_per_iteration": 2.678769111633301 }, { "auxiliary_loss_clip": 0.01333718, "auxiliary_loss_mlp": 0.01067255, "balance_loss_clip": 1.09723771, "balance_loss_mlp": 1.04198301, "epoch": 0.032826309144471835, "flos": 45624685968000.0, "grad_norm": 2.0154161326158255, "language_loss": 0.62772679, "learning_rate": 3.999919751656244e-06, "loss": 0.65173656, "num_input_tokens_seen": 5678985, "step": 273, "time_per_iteration": 2.866671562194824 }, { "auxiliary_loss_clip": 0.01332609, "auxiliary_loss_mlp": 0.01067911, "balance_loss_clip": 1.09689939, "balance_loss_mlp": 1.04359257, "epoch": 0.032946552035110925, "flos": 25812374808960.0, "grad_norm": 2.20265593041606, "language_loss": 0.75785077, "learning_rate": 3.9999126218931195e-06, "loss": 0.78185594, "num_input_tokens_seen": 5697020, "step": 274, "time_per_iteration": 2.6483957767486572 }, { "auxiliary_loss_clip": 0.01333927, "auxiliary_loss_mlp": 0.01064108, "balance_loss_clip": 1.10035872, "balance_loss_mlp": 1.04057622, "epoch": 0.033066794925750015, "flos": 15121984101120.0, "grad_norm": 2.262785678532056, "language_loss": 0.89583969, "learning_rate": 3.99990518874484e-06, "loss": 0.91982007, "num_input_tokens_seen": 5713460, "step": 275, "time_per_iteration": 2.6572208404541016 }, { "auxiliary_loss_clip": 0.01334203, "auxiliary_loss_mlp": 0.01061824, "balance_loss_clip": 1.10038221, "balance_loss_mlp": 1.0379467, "epoch": 0.033187037816389105, "flos": 22776593973120.0, "grad_norm": 4.920455262731301, "language_loss": 0.92524385, "learning_rate": 3.999897452212534e-06, "loss": 0.94920415, "num_input_tokens_seen": 5730790, "step": 276, "time_per_iteration": 3.5968387126922607 }, { "auxiliary_loss_clip": 0.01333962, "auxiliary_loss_mlp": 0.01062868, "balance_loss_clip": 1.09754455, "balance_loss_mlp": 1.03871632, "epoch": 0.033307280707028195, "flos": 23331414424320.0, "grad_norm": 3.2448937345578734, "language_loss": 1.00209892, "learning_rate": 3.999889412297374e-06, "loss": 1.02606726, "num_input_tokens_seen": 5750215, "step": 277, "time_per_iteration": 2.7010481357574463 }, { "auxiliary_loss_clip": 0.0132888, "auxiliary_loss_mlp": 0.01071597, "balance_loss_clip": 1.09257936, "balance_loss_mlp": 1.04771924, "epoch": 0.03342752359766729, "flos": 28840290566400.0, "grad_norm": 2.6310469628591844, "language_loss": 0.79070401, "learning_rate": 3.999881069000581e-06, "loss": 0.81470883, "num_input_tokens_seen": 5769945, "step": 278, "time_per_iteration": 3.669483184814453 }, { "auxiliary_loss_clip": 0.01328467, "auxiliary_loss_mlp": 0.0106885, "balance_loss_clip": 1.09585142, "balance_loss_mlp": 1.04556882, "epoch": 0.03354776648830638, "flos": 19384544090880.0, "grad_norm": 2.463350880204464, "language_loss": 0.86669737, "learning_rate": 3.99987242232342e-06, "loss": 0.89067054, "num_input_tokens_seen": 5784950, "step": 279, "time_per_iteration": 4.222239255905151 }, { "auxiliary_loss_clip": 0.01330692, "auxiliary_loss_mlp": 0.01074244, "balance_loss_clip": 1.09788632, "balance_loss_mlp": 1.05091453, "epoch": 0.03366800937894547, "flos": 17858628472320.0, "grad_norm": 2.312771700753107, "language_loss": 0.79688394, "learning_rate": 3.9998634722672026e-06, "loss": 0.82093328, "num_input_tokens_seen": 5805005, "step": 280, "time_per_iteration": 2.6515233516693115 }, { "auxiliary_loss_clip": 0.01330891, "auxiliary_loss_mlp": 0.01062463, "balance_loss_clip": 1.09897113, "balance_loss_mlp": 1.03859735, "epoch": 0.03378825226958456, "flos": 35951033635200.0, "grad_norm": 4.282575236126002, "language_loss": 0.78758496, "learning_rate": 3.999854218833286e-06, "loss": 0.81151855, "num_input_tokens_seen": 5825825, "step": 281, "time_per_iteration": 2.7758874893188477 }, { "auxiliary_loss_clip": 0.0132709, "auxiliary_loss_mlp": 0.01071794, "balance_loss_clip": 1.09572959, "balance_loss_mlp": 1.0482738, "epoch": 0.03390849516022365, "flos": 25702488126720.0, "grad_norm": 1.8844170008100474, "language_loss": 0.8167305, "learning_rate": 3.999844662023075e-06, "loss": 0.84071934, "num_input_tokens_seen": 5845700, "step": 282, "time_per_iteration": 2.655845880508423 }, { "auxiliary_loss_clip": 0.01322445, "auxiliary_loss_mlp": 0.01068145, "balance_loss_clip": 1.09309459, "balance_loss_mlp": 1.04491091, "epoch": 0.03402873805086274, "flos": 21284505987840.0, "grad_norm": 1.8329619374641515, "language_loss": 0.92032635, "learning_rate": 3.999834801838018e-06, "loss": 0.94423223, "num_input_tokens_seen": 5864680, "step": 283, "time_per_iteration": 2.718855142593384 }, { "auxiliary_loss_clip": 0.01324076, "auxiliary_loss_mlp": 0.01062751, "balance_loss_clip": 1.09541416, "balance_loss_mlp": 1.0400064, "epoch": 0.03414898094150183, "flos": 22710913954560.0, "grad_norm": 2.1581639113847735, "language_loss": 0.74159193, "learning_rate": 3.9998246382796115e-06, "loss": 0.76546019, "num_input_tokens_seen": 5884260, "step": 284, "time_per_iteration": 2.661202907562256 }, { "auxiliary_loss_clip": 0.0132992, "auxiliary_loss_mlp": 0.01060091, "balance_loss_clip": 1.09594691, "balance_loss_mlp": 1.03648722, "epoch": 0.03426922383214093, "flos": 18879927874560.0, "grad_norm": 2.1403058092341247, "language_loss": 0.90954286, "learning_rate": 3.999814171349399e-06, "loss": 0.93344295, "num_input_tokens_seen": 5902120, "step": 285, "time_per_iteration": 2.711761236190796 }, { "auxiliary_loss_clip": 0.01319973, "auxiliary_loss_mlp": 0.01073965, "balance_loss_clip": 1.09142542, "balance_loss_mlp": 1.05153012, "epoch": 0.03438946672278002, "flos": 34752012716160.0, "grad_norm": 1.8289800866830341, "language_loss": 0.73354638, "learning_rate": 3.9998034010489655e-06, "loss": 0.75748569, "num_input_tokens_seen": 5925810, "step": 286, "time_per_iteration": 2.745896339416504 }, { "auxiliary_loss_clip": 0.01325843, "auxiliary_loss_mlp": 0.01056187, "balance_loss_clip": 1.09730601, "balance_loss_mlp": 1.03353739, "epoch": 0.03450970961341911, "flos": 22164102236160.0, "grad_norm": 2.1277015914157533, "language_loss": 0.75806522, "learning_rate": 3.999792327379946e-06, "loss": 0.7818855, "num_input_tokens_seen": 5945185, "step": 287, "time_per_iteration": 2.706616163253784 }, { "auxiliary_loss_clip": 0.01326542, "auxiliary_loss_mlp": 0.01080141, "balance_loss_clip": 1.0976814, "balance_loss_mlp": 1.05672824, "epoch": 0.034629952504058197, "flos": 21725740656000.0, "grad_norm": 2.2123345061438853, "language_loss": 0.96306598, "learning_rate": 3.999780950344021e-06, "loss": 0.98713279, "num_input_tokens_seen": 5963375, "step": 288, "time_per_iteration": 2.6613638401031494 }, { "auxiliary_loss_clip": 0.01328482, "auxiliary_loss_mlp": 0.01064672, "balance_loss_clip": 1.0983429, "balance_loss_mlp": 1.04011488, "epoch": 0.034750195394697286, "flos": 20047994248320.0, "grad_norm": 1.9493743511953483, "language_loss": 0.82675958, "learning_rate": 3.999769269942916e-06, "loss": 0.85069108, "num_input_tokens_seen": 5983415, "step": 289, "time_per_iteration": 2.7832796573638916 }, { "auxiliary_loss_clip": 0.01321252, "auxiliary_loss_mlp": 0.01062625, "balance_loss_clip": 1.09573245, "balance_loss_mlp": 1.04029715, "epoch": 0.034870438285336376, "flos": 27965865876480.0, "grad_norm": 2.1313832560471586, "language_loss": 0.8114897, "learning_rate": 3.999757286178402e-06, "loss": 0.83532846, "num_input_tokens_seen": 6005850, "step": 290, "time_per_iteration": 2.7024195194244385 }, { "auxiliary_loss_clip": 0.01322084, "auxiliary_loss_mlp": 0.01068161, "balance_loss_clip": 1.09251893, "balance_loss_mlp": 1.04580903, "epoch": 0.03499068117597547, "flos": 22017514832640.0, "grad_norm": 8.858344390077963, "language_loss": 0.90790844, "learning_rate": 3.999744999052299e-06, "loss": 0.93181086, "num_input_tokens_seen": 6027240, "step": 291, "time_per_iteration": 2.746553897857666 }, { "auxiliary_loss_clip": 0.01307932, "auxiliary_loss_mlp": 0.01022817, "balance_loss_clip": 1.14149332, "balance_loss_mlp": 1.00689054, "epoch": 0.03511092406661456, "flos": 57242147725440.0, "grad_norm": 0.9641109501006598, "language_loss": 0.61186445, "learning_rate": 3.9997324085664675e-06, "loss": 0.63517201, "num_input_tokens_seen": 6087470, "step": 292, "time_per_iteration": 3.217590093612671 }, { "auxiliary_loss_clip": 0.01322287, "auxiliary_loss_mlp": 0.01059968, "balance_loss_clip": 1.09392536, "balance_loss_mlp": 1.03685308, "epoch": 0.03523116695725365, "flos": 22928065626240.0, "grad_norm": 2.083837455310632, "language_loss": 0.91919136, "learning_rate": 3.999719514722821e-06, "loss": 0.94301391, "num_input_tokens_seen": 6107600, "step": 293, "time_per_iteration": 2.6710171699523926 }, { "auxiliary_loss_clip": 0.01317595, "auxiliary_loss_mlp": 0.01065915, "balance_loss_clip": 1.09308791, "balance_loss_mlp": 1.04279995, "epoch": 0.03535140984789274, "flos": 36903241226880.0, "grad_norm": 3.882183031014349, "language_loss": 0.74613702, "learning_rate": 3.999706317523314e-06, "loss": 0.76997209, "num_input_tokens_seen": 6126160, "step": 294, "time_per_iteration": 2.78429913520813 }, { "auxiliary_loss_clip": 0.01319038, "auxiliary_loss_mlp": 0.01069609, "balance_loss_clip": 1.09380555, "balance_loss_mlp": 1.04748392, "epoch": 0.03547165273853183, "flos": 20449152316800.0, "grad_norm": 2.529272320643377, "language_loss": 0.86244059, "learning_rate": 3.999692816969948e-06, "loss": 0.88632703, "num_input_tokens_seen": 6145695, "step": 295, "time_per_iteration": 2.6511523723602295 }, { "auxiliary_loss_clip": 0.01298639, "auxiliary_loss_mlp": 0.01015664, "balance_loss_clip": 1.13473439, "balance_loss_mlp": 1.0005964, "epoch": 0.03559189562917092, "flos": 69850564871040.0, "grad_norm": 1.0036340154870662, "language_loss": 0.69440246, "learning_rate": 3.999679013064772e-06, "loss": 0.71754551, "num_input_tokens_seen": 6212440, "step": 296, "time_per_iteration": 3.2824437618255615 }, { "auxiliary_loss_clip": 0.0131893, "auxiliary_loss_mlp": 0.01049756, "balance_loss_clip": 1.0923562, "balance_loss_mlp": 1.02807236, "epoch": 0.03571213851981002, "flos": 21651944163840.0, "grad_norm": 2.8729601570438947, "language_loss": 0.85835654, "learning_rate": 3.99966490580988e-06, "loss": 0.88204336, "num_input_tokens_seen": 6229800, "step": 297, "time_per_iteration": 2.6465861797332764 }, { "auxiliary_loss_clip": 0.01325655, "auxiliary_loss_mlp": 0.01065597, "balance_loss_clip": 1.09709799, "balance_loss_mlp": 1.0436151, "epoch": 0.03583238141044911, "flos": 43945610757120.0, "grad_norm": 2.7561209482021485, "language_loss": 0.6569066, "learning_rate": 3.999650495207411e-06, "loss": 0.68081909, "num_input_tokens_seen": 6255825, "step": 298, "time_per_iteration": 2.816352128982544 }, { "auxiliary_loss_clip": 0.01321389, "auxiliary_loss_mlp": 0.01066832, "balance_loss_clip": 1.09590948, "balance_loss_mlp": 1.04397976, "epoch": 0.0359526243010882, "flos": 18910810592640.0, "grad_norm": 2.8270227389092226, "language_loss": 0.9043532, "learning_rate": 3.999635781259553e-06, "loss": 0.92823547, "num_input_tokens_seen": 6271090, "step": 299, "time_per_iteration": 2.6051008701324463 }, { "auxiliary_loss_clip": 0.0128579, "auxiliary_loss_mlp": 0.01016456, "balance_loss_clip": 1.12717628, "balance_loss_mlp": 1.0016737, "epoch": 0.03607286719172729, "flos": 61668892782720.0, "grad_norm": 0.9241902326290455, "language_loss": 0.5224461, "learning_rate": 3.999620763968535e-06, "loss": 0.54546857, "num_input_tokens_seen": 6329965, "step": 300, "time_per_iteration": 3.091032028198242 }, { "auxiliary_loss_clip": 0.01316424, "auxiliary_loss_mlp": 0.01069031, "balance_loss_clip": 1.09260499, "balance_loss_mlp": 1.04646504, "epoch": 0.03619311008236638, "flos": 27819062991360.0, "grad_norm": 1.8944632265779322, "language_loss": 0.86559427, "learning_rate": 3.999605443336638e-06, "loss": 0.88944882, "num_input_tokens_seen": 6352095, "step": 301, "time_per_iteration": 2.6671953201293945 }, { "auxiliary_loss_clip": 0.01324735, "auxiliary_loss_mlp": 0.01072919, "balance_loss_clip": 1.09714437, "balance_loss_mlp": 1.05042386, "epoch": 0.03631335297300547, "flos": 13621133197440.0, "grad_norm": 2.5099355463685096, "language_loss": 0.89564037, "learning_rate": 3.999589819366185e-06, "loss": 0.91961688, "num_input_tokens_seen": 6365885, "step": 302, "time_per_iteration": 3.508256673812866 }, { "auxiliary_loss_clip": 0.01322786, "auxiliary_loss_mlp": 0.01065112, "balance_loss_clip": 1.09576201, "balance_loss_mlp": 1.04172313, "epoch": 0.036433595863644565, "flos": 27631788456960.0, "grad_norm": 2.0487463773601977, "language_loss": 0.84804529, "learning_rate": 3.999573892059547e-06, "loss": 0.87192422, "num_input_tokens_seen": 6385015, "step": 303, "time_per_iteration": 2.7009341716766357 }, { "auxiliary_loss_clip": 0.01323369, "auxiliary_loss_mlp": 0.01068739, "balance_loss_clip": 1.09314036, "balance_loss_mlp": 1.04611349, "epoch": 0.036553838754283655, "flos": 24572020314240.0, "grad_norm": 2.3702425465807067, "language_loss": 0.8093276, "learning_rate": 3.999557661419138e-06, "loss": 0.83324873, "num_input_tokens_seen": 6405165, "step": 304, "time_per_iteration": 2.6587002277374268 }, { "auxiliary_loss_clip": 0.01322042, "auxiliary_loss_mlp": 0.01059899, "balance_loss_clip": 1.09654975, "balance_loss_mlp": 1.03877461, "epoch": 0.036674081644922744, "flos": 23404313076480.0, "grad_norm": 1.8807473964830435, "language_loss": 0.81642222, "learning_rate": 3.9995411274474225e-06, "loss": 0.84024161, "num_input_tokens_seen": 6424445, "step": 305, "time_per_iteration": 5.488084077835083 }, { "auxiliary_loss_clip": 0.01320489, "auxiliary_loss_mlp": 0.01064951, "balance_loss_clip": 1.09647799, "balance_loss_mlp": 1.04269493, "epoch": 0.036794324535561834, "flos": 27489690253440.0, "grad_norm": 1.9694442810615287, "language_loss": 0.81386828, "learning_rate": 3.999524290146908e-06, "loss": 0.83772272, "num_input_tokens_seen": 6444650, "step": 306, "time_per_iteration": 2.7376348972320557 }, { "auxiliary_loss_clip": 0.01320385, "auxiliary_loss_mlp": 0.01060098, "balance_loss_clip": 1.09635234, "balance_loss_mlp": 1.0383898, "epoch": 0.036914567426200924, "flos": 19463476227840.0, "grad_norm": 2.232922656420018, "language_loss": 0.92664731, "learning_rate": 3.9995071495201485e-06, "loss": 0.95045209, "num_input_tokens_seen": 6461755, "step": 307, "time_per_iteration": 2.666546583175659 }, { "auxiliary_loss_clip": 0.0131929, "auxiliary_loss_mlp": 0.01057004, "balance_loss_clip": 1.09615588, "balance_loss_mlp": 1.03450894, "epoch": 0.037034810316840014, "flos": 22309324922880.0, "grad_norm": 2.626880693849435, "language_loss": 0.97875428, "learning_rate": 3.999489705569744e-06, "loss": 1.00251722, "num_input_tokens_seen": 6479455, "step": 308, "time_per_iteration": 2.708062171936035 }, { "auxiliary_loss_clip": 0.01317924, "auxiliary_loss_mlp": 0.01055454, "balance_loss_clip": 1.09347725, "balance_loss_mlp": 1.03323293, "epoch": 0.03715505320747911, "flos": 18588333265920.0, "grad_norm": 3.015957725096343, "language_loss": 0.86494613, "learning_rate": 3.999471958298341e-06, "loss": 0.88867986, "num_input_tokens_seen": 6498365, "step": 309, "time_per_iteration": 2.6541125774383545 }, { "auxiliary_loss_clip": 0.01319834, "auxiliary_loss_mlp": 0.01061043, "balance_loss_clip": 1.09489942, "balance_loss_mlp": 1.03819013, "epoch": 0.0372752960981182, "flos": 35955343267200.0, "grad_norm": 2.300209568067235, "language_loss": 0.75948107, "learning_rate": 3.999453907708631e-06, "loss": 0.78328979, "num_input_tokens_seen": 6520770, "step": 310, "time_per_iteration": 2.7758736610412598 }, { "auxiliary_loss_clip": 0.01315631, "auxiliary_loss_mlp": 0.0106154, "balance_loss_clip": 1.09214854, "balance_loss_mlp": 1.04001117, "epoch": 0.03739553898875729, "flos": 20814040627200.0, "grad_norm": 1.9376636854517177, "language_loss": 0.81419098, "learning_rate": 3.999435553803353e-06, "loss": 0.83796263, "num_input_tokens_seen": 6540170, "step": 311, "time_per_iteration": 2.6598050594329834 }, { "auxiliary_loss_clip": 0.01316063, "auxiliary_loss_mlp": 0.01059837, "balance_loss_clip": 1.09105396, "balance_loss_mlp": 1.03779459, "epoch": 0.03751578187939638, "flos": 20264140339200.0, "grad_norm": 3.907402739023888, "language_loss": 0.83593678, "learning_rate": 3.999416896585292e-06, "loss": 0.85969579, "num_input_tokens_seen": 6557200, "step": 312, "time_per_iteration": 2.6337056159973145 }, { "auxiliary_loss_clip": 0.01313694, "auxiliary_loss_mlp": 0.01056751, "balance_loss_clip": 1.08867455, "balance_loss_mlp": 1.03554404, "epoch": 0.03763602477003547, "flos": 20668063754880.0, "grad_norm": 11.53245340968813, "language_loss": 0.86211073, "learning_rate": 3.9993979360572775e-06, "loss": 0.88581514, "num_input_tokens_seen": 6577340, "step": 313, "time_per_iteration": 2.6976168155670166 }, { "auxiliary_loss_clip": 0.01324332, "auxiliary_loss_mlp": 0.01060894, "balance_loss_clip": 1.09581172, "balance_loss_mlp": 1.03831601, "epoch": 0.03775626766067456, "flos": 16691352197760.0, "grad_norm": 2.442914708629604, "language_loss": 0.83059835, "learning_rate": 3.999378672222185e-06, "loss": 0.85445058, "num_input_tokens_seen": 6595125, "step": 314, "time_per_iteration": 2.5903759002685547 }, { "auxiliary_loss_clip": 0.01314834, "auxiliary_loss_mlp": 0.01065583, "balance_loss_clip": 1.09326684, "balance_loss_mlp": 1.04391098, "epoch": 0.03787651055131366, "flos": 21141797253120.0, "grad_norm": 2.1377524479707133, "language_loss": 0.82969165, "learning_rate": 3.9993591050829385e-06, "loss": 0.85349584, "num_input_tokens_seen": 6612990, "step": 315, "time_per_iteration": 2.6229896545410156 }, { "auxiliary_loss_clip": 0.01314691, "auxiliary_loss_mlp": 0.01064976, "balance_loss_clip": 1.09321976, "balance_loss_mlp": 1.04311252, "epoch": 0.037996753441952746, "flos": 22018089450240.0, "grad_norm": 1.857831628251106, "language_loss": 0.79325926, "learning_rate": 3.999339234642506e-06, "loss": 0.81705594, "num_input_tokens_seen": 6632740, "step": 316, "time_per_iteration": 2.620015859603882 }, { "auxiliary_loss_clip": 0.01316913, "auxiliary_loss_mlp": 0.01058172, "balance_loss_clip": 1.09183574, "balance_loss_mlp": 1.03583217, "epoch": 0.038116996332591836, "flos": 27709391790720.0, "grad_norm": 1.9510115322388935, "language_loss": 0.8393665, "learning_rate": 3.9993190609038994e-06, "loss": 0.86311734, "num_input_tokens_seen": 6651505, "step": 317, "time_per_iteration": 2.7454469203948975 }, { "auxiliary_loss_clip": 0.01315167, "auxiliary_loss_mlp": 0.0106507, "balance_loss_clip": 1.09130359, "balance_loss_mlp": 1.04219341, "epoch": 0.038237239223230926, "flos": 21178067011200.0, "grad_norm": 1.9868409700056595, "language_loss": 0.8325839, "learning_rate": 3.999298583870182e-06, "loss": 0.8563863, "num_input_tokens_seen": 6671090, "step": 318, "time_per_iteration": 2.6600868701934814 }, { "auxiliary_loss_clip": 0.01315126, "auxiliary_loss_mlp": 0.01063264, "balance_loss_clip": 1.09247422, "balance_loss_mlp": 1.04107952, "epoch": 0.038357482113870016, "flos": 25556618995200.0, "grad_norm": 2.03203987224393, "language_loss": 0.77463257, "learning_rate": 3.999277803544458e-06, "loss": 0.7984165, "num_input_tokens_seen": 6691245, "step": 319, "time_per_iteration": 2.7370822429656982 }, { "auxiliary_loss_clip": 0.01263811, "auxiliary_loss_mlp": 0.01018639, "balance_loss_clip": 1.11611867, "balance_loss_mlp": 1.00433433, "epoch": 0.038477725004509106, "flos": 59227578034560.0, "grad_norm": 0.9619081896337921, "language_loss": 0.62403023, "learning_rate": 3.999256719929882e-06, "loss": 0.6468547, "num_input_tokens_seen": 6752520, "step": 320, "time_per_iteration": 3.186304807662964 }, { "auxiliary_loss_clip": 0.01261263, "auxiliary_loss_mlp": 0.01017225, "balance_loss_clip": 1.11389077, "balance_loss_mlp": 1.00311089, "epoch": 0.0385979678951482, "flos": 67317676398720.0, "grad_norm": 1.1992828417567836, "language_loss": 0.67133671, "learning_rate": 3.999235333029651e-06, "loss": 0.6941216, "num_input_tokens_seen": 6806460, "step": 321, "time_per_iteration": 3.088900327682495 }, { "auxiliary_loss_clip": 0.01310393, "auxiliary_loss_mlp": 0.01069797, "balance_loss_clip": 1.09156227, "balance_loss_mlp": 1.04791057, "epoch": 0.03871821078578729, "flos": 22746752749440.0, "grad_norm": 2.00903159400857, "language_loss": 0.81924093, "learning_rate": 3.999213642847009e-06, "loss": 0.84304279, "num_input_tokens_seen": 6827045, "step": 322, "time_per_iteration": 2.728578805923462 }, { "auxiliary_loss_clip": 0.013136, "auxiliary_loss_mlp": 0.01071233, "balance_loss_clip": 1.09009743, "balance_loss_mlp": 1.04988325, "epoch": 0.03883845367642638, "flos": 26280613526400.0, "grad_norm": 1.7895815416867382, "language_loss": 0.90939689, "learning_rate": 3.999191649385247e-06, "loss": 0.93324518, "num_input_tokens_seen": 6848220, "step": 323, "time_per_iteration": 2.6723268032073975 }, { "auxiliary_loss_clip": 0.01254745, "auxiliary_loss_mlp": 0.01014403, "balance_loss_clip": 1.10933471, "balance_loss_mlp": 1.00038385, "epoch": 0.03895869656706547, "flos": 56962835568000.0, "grad_norm": 0.9085464173517437, "language_loss": 0.59818619, "learning_rate": 3.999169352647702e-06, "loss": 0.62087768, "num_input_tokens_seen": 6909400, "step": 324, "time_per_iteration": 3.1438770294189453 }, { "auxiliary_loss_clip": 0.01320657, "auxiliary_loss_mlp": 0.01059355, "balance_loss_clip": 1.09558558, "balance_loss_mlp": 1.03746784, "epoch": 0.03907893945770456, "flos": 24863363527680.0, "grad_norm": 1.901315528731458, "language_loss": 0.83109146, "learning_rate": 3.999146752637755e-06, "loss": 0.85489154, "num_input_tokens_seen": 6930445, "step": 325, "time_per_iteration": 2.719205141067505 }, { "auxiliary_loss_clip": 0.01315611, "auxiliary_loss_mlp": 0.01070481, "balance_loss_clip": 1.09346437, "balance_loss_mlp": 1.04778385, "epoch": 0.03919918234834365, "flos": 18368595815040.0, "grad_norm": 2.6629754748730234, "language_loss": 0.90081429, "learning_rate": 3.999123849358836e-06, "loss": 0.92467523, "num_input_tokens_seen": 6948110, "step": 326, "time_per_iteration": 2.6653101444244385 }, { "auxiliary_loss_clip": 0.01310848, "auxiliary_loss_mlp": 0.01059505, "balance_loss_clip": 1.09066415, "balance_loss_mlp": 1.03757024, "epoch": 0.03931942523898275, "flos": 25225414663680.0, "grad_norm": 2.1921074261145255, "language_loss": 0.74753428, "learning_rate": 3.999100642814418e-06, "loss": 0.77123785, "num_input_tokens_seen": 6968550, "step": 327, "time_per_iteration": 2.7855236530303955 }, { "auxiliary_loss_clip": 0.01312632, "auxiliary_loss_mlp": 0.01071626, "balance_loss_clip": 1.09315348, "balance_loss_mlp": 1.05017996, "epoch": 0.03943966812962184, "flos": 23257905240960.0, "grad_norm": 2.3296844797943126, "language_loss": 0.88681638, "learning_rate": 3.999077133008022e-06, "loss": 0.91065896, "num_input_tokens_seen": 6987135, "step": 328, "time_per_iteration": 3.5846190452575684 }, { "auxiliary_loss_clip": 0.01315205, "auxiliary_loss_mlp": 0.01057868, "balance_loss_clip": 1.09391785, "balance_loss_mlp": 1.0360173, "epoch": 0.03955991102026093, "flos": 29168837291520.0, "grad_norm": 1.8517585743056286, "language_loss": 0.90613842, "learning_rate": 3.9990533199432145e-06, "loss": 0.92986917, "num_input_tokens_seen": 7008630, "step": 329, "time_per_iteration": 2.7268755435943604 }, { "auxiliary_loss_clip": 0.01314886, "auxiliary_loss_mlp": 0.01065265, "balance_loss_clip": 1.09335423, "balance_loss_mlp": 1.0437479, "epoch": 0.03968015391090002, "flos": 17602441695360.0, "grad_norm": 2.251860417915879, "language_loss": 0.75436342, "learning_rate": 3.999029203623608e-06, "loss": 0.77816492, "num_input_tokens_seen": 7026350, "step": 330, "time_per_iteration": 3.5149765014648438 }, { "auxiliary_loss_clip": 0.01312589, "auxiliary_loss_mlp": 0.01060137, "balance_loss_clip": 1.09274769, "balance_loss_mlp": 1.03844059, "epoch": 0.03980039680153911, "flos": 21799285752960.0, "grad_norm": 4.23892521322517, "language_loss": 0.86637819, "learning_rate": 3.99900478405286e-06, "loss": 0.89010537, "num_input_tokens_seen": 7045660, "step": 331, "time_per_iteration": 3.611269474029541 }, { "auxiliary_loss_clip": 0.0131454, "auxiliary_loss_mlp": 0.01062653, "balance_loss_clip": 1.09625292, "balance_loss_mlp": 1.04214859, "epoch": 0.0399206396921782, "flos": 15195134148480.0, "grad_norm": 2.780886306906911, "language_loss": 0.82539403, "learning_rate": 3.998980061234676e-06, "loss": 0.84916604, "num_input_tokens_seen": 7063575, "step": 332, "time_per_iteration": 3.637096643447876 }, { "auxiliary_loss_clip": 0.0131261, "auxiliary_loss_mlp": 0.01057491, "balance_loss_clip": 1.09227514, "balance_loss_mlp": 1.03547287, "epoch": 0.040040882582817294, "flos": 14422910630400.0, "grad_norm": 2.372746938253122, "language_loss": 0.75590706, "learning_rate": 3.9989550351728055e-06, "loss": 0.77960807, "num_input_tokens_seen": 7080505, "step": 333, "time_per_iteration": 2.7528932094573975 }, { "auxiliary_loss_clip": 0.01309134, "auxiliary_loss_mlp": 0.01056564, "balance_loss_clip": 1.08971584, "balance_loss_mlp": 1.03578568, "epoch": 0.040161125473456384, "flos": 19280906375040.0, "grad_norm": 4.875387993332005, "language_loss": 0.84670526, "learning_rate": 3.998929705871046e-06, "loss": 0.87036222, "num_input_tokens_seen": 7097860, "step": 334, "time_per_iteration": 2.691333055496216 }, { "auxiliary_loss_clip": 0.0130671, "auxiliary_loss_mlp": 0.010526, "balance_loss_clip": 1.09039187, "balance_loss_mlp": 1.03144026, "epoch": 0.040281368364095474, "flos": 17821101738240.0, "grad_norm": 2.8228989246355756, "language_loss": 0.89157814, "learning_rate": 3.99890407333324e-06, "loss": 0.91517127, "num_input_tokens_seen": 7116390, "step": 335, "time_per_iteration": 2.629331588745117 }, { "auxiliary_loss_clip": 0.01307331, "auxiliary_loss_mlp": 0.01063499, "balance_loss_clip": 1.08723283, "balance_loss_mlp": 1.04075384, "epoch": 0.040401611254734564, "flos": 19573757959680.0, "grad_norm": 1.7554456711952204, "language_loss": 0.87322581, "learning_rate": 3.998878137563275e-06, "loss": 0.89693409, "num_input_tokens_seen": 7135940, "step": 336, "time_per_iteration": 2.559574842453003 }, { "auxiliary_loss_clip": 0.01309603, "auxiliary_loss_mlp": 0.01061623, "balance_loss_clip": 1.09036565, "balance_loss_mlp": 1.0399034, "epoch": 0.040521854145373654, "flos": 22054466949120.0, "grad_norm": 2.7232632554314025, "language_loss": 0.85431623, "learning_rate": 3.998851898565085e-06, "loss": 0.87802851, "num_input_tokens_seen": 7155745, "step": 337, "time_per_iteration": 2.6453206539154053 }, { "auxiliary_loss_clip": 0.01309832, "auxiliary_loss_mlp": 0.01060076, "balance_loss_clip": 1.08929968, "balance_loss_mlp": 1.0387975, "epoch": 0.04064209703601274, "flos": 22674644196480.0, "grad_norm": 1.8781649923436936, "language_loss": 0.83144385, "learning_rate": 3.998825356342653e-06, "loss": 0.85514295, "num_input_tokens_seen": 7175920, "step": 338, "time_per_iteration": 2.5867464542388916 }, { "auxiliary_loss_clip": 0.01310874, "auxiliary_loss_mlp": 0.01063028, "balance_loss_clip": 1.09090257, "balance_loss_mlp": 1.04183221, "epoch": 0.04076233992665183, "flos": 38582172783360.0, "grad_norm": 2.8855204730901285, "language_loss": 0.72965503, "learning_rate": 3.998798510900003e-06, "loss": 0.75339401, "num_input_tokens_seen": 7198720, "step": 339, "time_per_iteration": 2.7967545986175537 }, { "auxiliary_loss_clip": 0.01307348, "auxiliary_loss_mlp": 0.01053648, "balance_loss_clip": 1.08927119, "balance_loss_mlp": 1.03121257, "epoch": 0.04088258281729093, "flos": 25885309374720.0, "grad_norm": 2.4156557076040146, "language_loss": 0.83781892, "learning_rate": 3.998771362241207e-06, "loss": 0.86142886, "num_input_tokens_seen": 7219125, "step": 340, "time_per_iteration": 2.6457574367523193 }, { "auxiliary_loss_clip": 0.01306417, "auxiliary_loss_mlp": 0.01057889, "balance_loss_clip": 1.087479, "balance_loss_mlp": 1.03700376, "epoch": 0.04100282570793002, "flos": 19789832223360.0, "grad_norm": 1.9264585102936957, "language_loss": 0.88123918, "learning_rate": 3.998743910370385e-06, "loss": 0.90488231, "num_input_tokens_seen": 7237985, "step": 341, "time_per_iteration": 2.637000560760498 }, { "auxiliary_loss_clip": 0.01312359, "auxiliary_loss_mlp": 0.01082133, "balance_loss_clip": 1.0945574, "balance_loss_mlp": 1.06146193, "epoch": 0.04112306859856911, "flos": 22565152563840.0, "grad_norm": 2.053611667948584, "language_loss": 0.73461312, "learning_rate": 3.998716155291702e-06, "loss": 0.75855803, "num_input_tokens_seen": 7255825, "step": 342, "time_per_iteration": 2.6150951385498047 }, { "auxiliary_loss_clip": 0.01310419, "auxiliary_loss_mlp": 0.01054409, "balance_loss_clip": 1.09454942, "balance_loss_mlp": 1.03333235, "epoch": 0.0412433114892082, "flos": 25040654081280.0, "grad_norm": 3.3495311026598116, "language_loss": 0.90645063, "learning_rate": 3.998688097009366e-06, "loss": 0.93009889, "num_input_tokens_seen": 7276590, "step": 343, "time_per_iteration": 2.6575710773468018 }, { "auxiliary_loss_clip": 0.01313929, "auxiliary_loss_mlp": 0.01056943, "balance_loss_clip": 1.09438348, "balance_loss_mlp": 1.03610492, "epoch": 0.04136355437984729, "flos": 25191371548800.0, "grad_norm": 2.166910097939507, "language_loss": 0.80061162, "learning_rate": 3.998659735527636e-06, "loss": 0.82432032, "num_input_tokens_seen": 7295680, "step": 344, "time_per_iteration": 2.679595708847046 }, { "auxiliary_loss_clip": 0.01307727, "auxiliary_loss_mlp": 0.01054636, "balance_loss_clip": 1.08920598, "balance_loss_mlp": 1.03391743, "epoch": 0.04148379727048638, "flos": 22966777509120.0, "grad_norm": 1.7588726165902107, "language_loss": 0.77846134, "learning_rate": 3.998631070850813e-06, "loss": 0.80208498, "num_input_tokens_seen": 7316300, "step": 345, "time_per_iteration": 2.736501932144165 }, { "auxiliary_loss_clip": 0.0130523, "auxiliary_loss_mlp": 0.01061977, "balance_loss_clip": 1.08909321, "balance_loss_mlp": 1.04098392, "epoch": 0.041604040161125476, "flos": 14063481187200.0, "grad_norm": 2.139244445199227, "language_loss": 0.83814639, "learning_rate": 3.9986021029832455e-06, "loss": 0.86181849, "num_input_tokens_seen": 7333615, "step": 346, "time_per_iteration": 2.6283817291259766 }, { "auxiliary_loss_clip": 0.01309037, "auxiliary_loss_mlp": 0.01059673, "balance_loss_clip": 1.088642, "balance_loss_mlp": 1.0376792, "epoch": 0.041724283051764566, "flos": 12091877614080.0, "grad_norm": 2.641805107375726, "language_loss": 0.91801465, "learning_rate": 3.9985728319293285e-06, "loss": 0.94170177, "num_input_tokens_seen": 7347590, "step": 347, "time_per_iteration": 2.64424467086792 }, { "auxiliary_loss_clip": 0.01314168, "auxiliary_loss_mlp": 0.01055587, "balance_loss_clip": 1.09112751, "balance_loss_mlp": 1.03280628, "epoch": 0.041844525942403656, "flos": 12385303816320.0, "grad_norm": 3.216113417902212, "language_loss": 0.85309905, "learning_rate": 3.998543257693501e-06, "loss": 0.8767966, "num_input_tokens_seen": 7364345, "step": 348, "time_per_iteration": 2.6274006366729736 }, { "auxiliary_loss_clip": 0.01307283, "auxiliary_loss_mlp": 0.01060465, "balance_loss_clip": 1.09064722, "balance_loss_mlp": 1.03976989, "epoch": 0.041964768833042745, "flos": 23769345041280.0, "grad_norm": 1.8034005985560237, "language_loss": 0.88043869, "learning_rate": 3.998513380280251e-06, "loss": 0.90411615, "num_input_tokens_seen": 7384625, "step": 349, "time_per_iteration": 2.6561925411224365 }, { "auxiliary_loss_clip": 0.01311805, "auxiliary_loss_mlp": 0.01069145, "balance_loss_clip": 1.09273612, "balance_loss_mlp": 1.04662633, "epoch": 0.042085011723681835, "flos": 11875336473600.0, "grad_norm": 6.940880308460208, "language_loss": 0.94892013, "learning_rate": 3.99848319969411e-06, "loss": 0.97272968, "num_input_tokens_seen": 7402225, "step": 350, "time_per_iteration": 2.632089376449585 }, { "auxiliary_loss_clip": 0.01313767, "auxiliary_loss_mlp": 0.01072071, "balance_loss_clip": 1.09313059, "balance_loss_mlp": 1.04951715, "epoch": 0.042205254614320925, "flos": 16873957964160.0, "grad_norm": 2.2312664409777336, "language_loss": 0.79257929, "learning_rate": 3.9984527159396564e-06, "loss": 0.81643766, "num_input_tokens_seen": 7420865, "step": 351, "time_per_iteration": 2.652055025100708 }, { "auxiliary_loss_clip": 0.01307977, "auxiliary_loss_mlp": 0.0105895, "balance_loss_clip": 1.08927917, "balance_loss_mlp": 1.03717017, "epoch": 0.04232549750496002, "flos": 25118508810240.0, "grad_norm": 2.208389317327154, "language_loss": 0.84553295, "learning_rate": 3.9984219290215154e-06, "loss": 0.8692022, "num_input_tokens_seen": 7441040, "step": 352, "time_per_iteration": 2.7248663902282715 }, { "auxiliary_loss_clip": 0.01302748, "auxiliary_loss_mlp": 0.01059723, "balance_loss_clip": 1.08989155, "balance_loss_mlp": 1.03840852, "epoch": 0.04244574039559911, "flos": 26724541714560.0, "grad_norm": 1.7928136142455136, "language_loss": 0.89132035, "learning_rate": 3.998390838944356e-06, "loss": 0.91494513, "num_input_tokens_seen": 7462545, "step": 353, "time_per_iteration": 2.658233642578125 }, { "auxiliary_loss_clip": 0.01309938, "auxiliary_loss_mlp": 0.01059134, "balance_loss_clip": 1.09247661, "balance_loss_mlp": 1.03672302, "epoch": 0.0425659832862382, "flos": 20923244951040.0, "grad_norm": 2.228159977022482, "language_loss": 0.90304059, "learning_rate": 3.998359445712895e-06, "loss": 0.92673129, "num_input_tokens_seen": 7481650, "step": 354, "time_per_iteration": 2.6611008644104004 }, { "auxiliary_loss_clip": 0.01307614, "auxiliary_loss_mlp": 0.01060008, "balance_loss_clip": 1.0873872, "balance_loss_mlp": 1.03832388, "epoch": 0.04268622617687729, "flos": 23331127115520.0, "grad_norm": 2.602402153255236, "language_loss": 0.80933744, "learning_rate": 3.9983277493318955e-06, "loss": 0.83301365, "num_input_tokens_seen": 7500945, "step": 355, "time_per_iteration": 3.589946746826172 }, { "auxiliary_loss_clip": 0.01304428, "auxiliary_loss_mlp": 0.01062502, "balance_loss_clip": 1.0876627, "balance_loss_mlp": 1.04087698, "epoch": 0.04280646906751638, "flos": 25994010908160.0, "grad_norm": 1.6558927516685515, "language_loss": 0.81334871, "learning_rate": 3.998295749806165e-06, "loss": 0.83701801, "num_input_tokens_seen": 7522170, "step": 356, "time_per_iteration": 2.6910057067871094 }, { "auxiliary_loss_clip": 0.0130892, "auxiliary_loss_mlp": 0.01067994, "balance_loss_clip": 1.09042752, "balance_loss_mlp": 1.04684591, "epoch": 0.04292671195815547, "flos": 26906824258560.0, "grad_norm": 1.8757110895280988, "language_loss": 0.83434612, "learning_rate": 3.998263447140558e-06, "loss": 0.85811526, "num_input_tokens_seen": 7542370, "step": 357, "time_per_iteration": 3.596390962600708 }, { "auxiliary_loss_clip": 0.01301892, "auxiliary_loss_mlp": 0.01062872, "balance_loss_clip": 1.08694148, "balance_loss_mlp": 1.04248691, "epoch": 0.04304695484879457, "flos": 39457315745280.0, "grad_norm": 2.251044961267766, "language_loss": 0.81757545, "learning_rate": 3.998230841339976e-06, "loss": 0.84122306, "num_input_tokens_seen": 7564380, "step": 358, "time_per_iteration": 4.6655731201171875 }, { "auxiliary_loss_clip": 0.01299986, "auxiliary_loss_mlp": 0.01071193, "balance_loss_clip": 1.08787537, "balance_loss_mlp": 1.05093956, "epoch": 0.04316719773943366, "flos": 19646297475840.0, "grad_norm": 2.2662431737574122, "language_loss": 0.84754741, "learning_rate": 3.998197932409363e-06, "loss": 0.87125921, "num_input_tokens_seen": 7582390, "step": 359, "time_per_iteration": 2.6470181941986084 }, { "auxiliary_loss_clip": 0.01301252, "auxiliary_loss_mlp": 0.01058012, "balance_loss_clip": 1.08784962, "balance_loss_mlp": 1.03767538, "epoch": 0.04328744063007275, "flos": 22452320966400.0, "grad_norm": 2.190021304727583, "language_loss": 0.86366898, "learning_rate": 3.9981647203537125e-06, "loss": 0.88726169, "num_input_tokens_seen": 7599890, "step": 360, "time_per_iteration": 2.662658929824829 }, { "auxiliary_loss_clip": 0.01305389, "auxiliary_loss_mlp": 0.01059512, "balance_loss_clip": 1.0882926, "balance_loss_mlp": 1.03903222, "epoch": 0.04340768352071184, "flos": 21283033530240.0, "grad_norm": 2.610727390941748, "language_loss": 0.96002996, "learning_rate": 3.998131205178063e-06, "loss": 0.98367894, "num_input_tokens_seen": 7618360, "step": 361, "time_per_iteration": 2.613988161087036 }, { "auxiliary_loss_clip": 0.0130351, "auxiliary_loss_mlp": 0.01067617, "balance_loss_clip": 1.08891571, "balance_loss_mlp": 1.04609942, "epoch": 0.04352792641135093, "flos": 11583705951360.0, "grad_norm": 2.6404713504665414, "language_loss": 0.7670331, "learning_rate": 3.998097386887498e-06, "loss": 0.79074436, "num_input_tokens_seen": 7635435, "step": 362, "time_per_iteration": 2.78865385055542 }, { "auxiliary_loss_clip": 0.01298265, "auxiliary_loss_mlp": 0.01060712, "balance_loss_clip": 1.08678436, "balance_loss_mlp": 1.04020798, "epoch": 0.04364816930199002, "flos": 23623547736960.0, "grad_norm": 1.6681729397743112, "language_loss": 0.84771371, "learning_rate": 3.998063265487148e-06, "loss": 0.87130344, "num_input_tokens_seen": 7656485, "step": 363, "time_per_iteration": 2.7286148071289062 }, { "auxiliary_loss_clip": 0.01301496, "auxiliary_loss_mlp": 0.01052679, "balance_loss_clip": 1.08774316, "balance_loss_mlp": 1.03235352, "epoch": 0.043768412192629114, "flos": 14429734214400.0, "grad_norm": 2.226215798407452, "language_loss": 0.80932528, "learning_rate": 3.99802884098219e-06, "loss": 0.83286709, "num_input_tokens_seen": 7674595, "step": 364, "time_per_iteration": 2.694917678833008 }, { "auxiliary_loss_clip": 0.01305441, "auxiliary_loss_mlp": 0.01058174, "balance_loss_clip": 1.08995593, "balance_loss_mlp": 1.0374676, "epoch": 0.043888655083268203, "flos": 26468893641600.0, "grad_norm": 2.747659244762178, "language_loss": 0.82166564, "learning_rate": 3.997994113377845e-06, "loss": 0.84530175, "num_input_tokens_seen": 7693495, "step": 365, "time_per_iteration": 2.6808526515960693 }, { "auxiliary_loss_clip": 0.01303048, "auxiliary_loss_mlp": 0.0105946, "balance_loss_clip": 1.08870757, "balance_loss_mlp": 1.03832459, "epoch": 0.04400889797390729, "flos": 27235263242880.0, "grad_norm": 2.055535590122189, "language_loss": 0.83361298, "learning_rate": 3.9979590826793815e-06, "loss": 0.85723805, "num_input_tokens_seen": 7714685, "step": 366, "time_per_iteration": 2.7208778858184814 }, { "auxiliary_loss_clip": 0.01306395, "auxiliary_loss_mlp": 0.01063204, "balance_loss_clip": 1.0909723, "balance_loss_mlp": 1.04148448, "epoch": 0.04412914086454638, "flos": 20119528183680.0, "grad_norm": 2.096205425260632, "language_loss": 0.80744421, "learning_rate": 3.997923748892113e-06, "loss": 0.83114016, "num_input_tokens_seen": 7734005, "step": 367, "time_per_iteration": 2.6190004348754883 }, { "auxiliary_loss_clip": 0.01298064, "auxiliary_loss_mlp": 0.01071697, "balance_loss_clip": 1.0876925, "balance_loss_mlp": 1.05174184, "epoch": 0.04424938375518547, "flos": 22604618632320.0, "grad_norm": 2.1257038961050894, "language_loss": 0.88695985, "learning_rate": 3.9978881120214015e-06, "loss": 0.91065747, "num_input_tokens_seen": 7755525, "step": 368, "time_per_iteration": 2.725163221359253 }, { "auxiliary_loss_clip": 0.0130242, "auxiliary_loss_mlp": 0.01067548, "balance_loss_clip": 1.0891248, "balance_loss_mlp": 1.04619741, "epoch": 0.04436962664582456, "flos": 24132365844480.0, "grad_norm": 1.9000818527515715, "language_loss": 0.79327387, "learning_rate": 3.997852172072652e-06, "loss": 0.81697351, "num_input_tokens_seen": 7776740, "step": 369, "time_per_iteration": 2.681840181350708 }, { "auxiliary_loss_clip": 0.01302895, "auxiliary_loss_mlp": 0.01069192, "balance_loss_clip": 1.08715045, "balance_loss_mlp": 1.04849708, "epoch": 0.04448986953646366, "flos": 18222906251520.0, "grad_norm": 2.4492301012797824, "language_loss": 0.89433086, "learning_rate": 3.9978159290513155e-06, "loss": 0.91805172, "num_input_tokens_seen": 7794820, "step": 370, "time_per_iteration": 2.5987110137939453 }, { "auxiliary_loss_clip": 0.01301245, "auxiliary_loss_mlp": 0.01054323, "balance_loss_clip": 1.08679295, "balance_loss_mlp": 1.033831, "epoch": 0.04461011242710275, "flos": 30117920400000.0, "grad_norm": 1.7506978426820528, "language_loss": 0.80045485, "learning_rate": 3.997779382962892e-06, "loss": 0.82401055, "num_input_tokens_seen": 7817705, "step": 371, "time_per_iteration": 2.7662546634674072 }, { "auxiliary_loss_clip": 0.01297905, "auxiliary_loss_mlp": 0.01054225, "balance_loss_clip": 1.08589053, "balance_loss_mlp": 1.0342809, "epoch": 0.04473035531774184, "flos": 29752529299200.0, "grad_norm": 2.254188165843151, "language_loss": 0.73617846, "learning_rate": 3.997742533812924e-06, "loss": 0.75969976, "num_input_tokens_seen": 7840970, "step": 372, "time_per_iteration": 2.6466197967529297 }, { "auxiliary_loss_clip": 0.01302292, "auxiliary_loss_mlp": 0.01060808, "balance_loss_clip": 1.08850551, "balance_loss_mlp": 1.03851557, "epoch": 0.04485059820838093, "flos": 13151565676800.0, "grad_norm": 2.973171804135303, "language_loss": 0.92470729, "learning_rate": 3.997705381607001e-06, "loss": 0.94833833, "num_input_tokens_seen": 7857785, "step": 373, "time_per_iteration": 2.6682817935943604 }, { "auxiliary_loss_clip": 0.01243055, "auxiliary_loss_mlp": 0.01024397, "balance_loss_clip": 1.10176635, "balance_loss_mlp": 1.01152217, "epoch": 0.04497084109902002, "flos": 68094209548800.0, "grad_norm": 1.0576281229169933, "language_loss": 0.6022436, "learning_rate": 3.997667926350761e-06, "loss": 0.6249181, "num_input_tokens_seen": 7916115, "step": 374, "time_per_iteration": 3.148576021194458 }, { "auxiliary_loss_clip": 0.01242325, "auxiliary_loss_mlp": 0.01019633, "balance_loss_clip": 1.10035849, "balance_loss_mlp": 1.00675869, "epoch": 0.04509108398965911, "flos": 64342263346560.0, "grad_norm": 0.9088121781599864, "language_loss": 0.57796377, "learning_rate": 3.997630168049886e-06, "loss": 0.60058337, "num_input_tokens_seen": 7974480, "step": 375, "time_per_iteration": 3.262816905975342 }, { "auxiliary_loss_clip": 0.01305198, "auxiliary_loss_mlp": 0.01064986, "balance_loss_clip": 1.09027219, "balance_loss_mlp": 1.04345679, "epoch": 0.045211326880298205, "flos": 22271115830400.0, "grad_norm": 1.7631382891080742, "language_loss": 0.77335405, "learning_rate": 3.997592106710101e-06, "loss": 0.7970559, "num_input_tokens_seen": 7993940, "step": 376, "time_per_iteration": 2.6832709312438965 }, { "auxiliary_loss_clip": 0.01297076, "auxiliary_loss_mlp": 0.01056458, "balance_loss_clip": 1.08500814, "balance_loss_mlp": 1.03567946, "epoch": 0.045331569770937295, "flos": 32159441796480.0, "grad_norm": 2.9463423002322244, "language_loss": 0.65744388, "learning_rate": 3.997553742337182e-06, "loss": 0.68097913, "num_input_tokens_seen": 8013365, "step": 377, "time_per_iteration": 2.718764305114746 }, { "auxiliary_loss_clip": 0.01301825, "auxiliary_loss_mlp": 0.0106125, "balance_loss_clip": 1.09021652, "balance_loss_mlp": 1.04063845, "epoch": 0.045451812661576385, "flos": 22163455791360.0, "grad_norm": 1.937602946215284, "language_loss": 0.91584682, "learning_rate": 3.997515074936949e-06, "loss": 0.93947756, "num_input_tokens_seen": 8034240, "step": 378, "time_per_iteration": 2.6985363960266113 }, { "auxiliary_loss_clip": 0.01299987, "auxiliary_loss_mlp": 0.01062897, "balance_loss_clip": 1.08867586, "balance_loss_mlp": 1.04201198, "epoch": 0.045572055552215475, "flos": 16581968305920.0, "grad_norm": 23.20721063318923, "language_loss": 0.86732209, "learning_rate": 3.997476104515268e-06, "loss": 0.89095086, "num_input_tokens_seen": 8052430, "step": 379, "time_per_iteration": 2.6107349395751953 }, { "auxiliary_loss_clip": 0.01294602, "auxiliary_loss_mlp": 0.0105676, "balance_loss_clip": 1.08651114, "balance_loss_mlp": 1.03716195, "epoch": 0.045692298442854565, "flos": 17603375448960.0, "grad_norm": 1.9974690203876182, "language_loss": 0.777354, "learning_rate": 3.9974368310780485e-06, "loss": 0.80086768, "num_input_tokens_seen": 8069605, "step": 380, "time_per_iteration": 2.5962109565734863 }, { "auxiliary_loss_clip": 0.01306545, "auxiliary_loss_mlp": 0.01059517, "balance_loss_clip": 1.09112465, "balance_loss_mlp": 1.03810644, "epoch": 0.045812541333493655, "flos": 26761098781440.0, "grad_norm": 2.6666949975502727, "language_loss": 0.74248886, "learning_rate": 3.997397254631251e-06, "loss": 0.76614952, "num_input_tokens_seen": 8090225, "step": 381, "time_per_iteration": 2.6451597213745117 }, { "auxiliary_loss_clip": 0.01230027, "auxiliary_loss_mlp": 0.01023663, "balance_loss_clip": 1.09227252, "balance_loss_mlp": 1.01097953, "epoch": 0.04593278422413275, "flos": 60250349894400.0, "grad_norm": 0.8360765962857247, "language_loss": 0.6009872, "learning_rate": 3.997357375180878e-06, "loss": 0.62352407, "num_input_tokens_seen": 8154505, "step": 382, "time_per_iteration": 4.210501670837402 }, { "auxiliary_loss_clip": 0.01300277, "auxiliary_loss_mlp": 0.01065395, "balance_loss_clip": 1.08786941, "balance_loss_mlp": 1.04449773, "epoch": 0.04605302711477184, "flos": 21799249839360.0, "grad_norm": 1.9599931638547992, "language_loss": 0.75092161, "learning_rate": 3.997317192732979e-06, "loss": 0.77457833, "num_input_tokens_seen": 8173285, "step": 383, "time_per_iteration": 3.5558218955993652 }, { "auxiliary_loss_clip": 0.01297485, "auxiliary_loss_mlp": 0.01066023, "balance_loss_clip": 1.08480191, "balance_loss_mlp": 1.04510224, "epoch": 0.04617327000541093, "flos": 19459705299840.0, "grad_norm": 2.0041792280862674, "language_loss": 0.82669234, "learning_rate": 3.99727670729365e-06, "loss": 0.85032737, "num_input_tokens_seen": 8191845, "step": 384, "time_per_iteration": 3.512895345687866 }, { "auxiliary_loss_clip": 0.01299532, "auxiliary_loss_mlp": 0.01056991, "balance_loss_clip": 1.08983111, "balance_loss_mlp": 1.03827524, "epoch": 0.04629351289605002, "flos": 25411468135680.0, "grad_norm": 2.3589650995691245, "language_loss": 0.77900791, "learning_rate": 3.997235918869033e-06, "loss": 0.80257314, "num_input_tokens_seen": 8212880, "step": 385, "time_per_iteration": 3.50996470451355 }, { "auxiliary_loss_clip": 0.01297367, "auxiliary_loss_mlp": 0.01047748, "balance_loss_clip": 1.08711064, "balance_loss_mlp": 1.02762556, "epoch": 0.04641375578668911, "flos": 20558284813440.0, "grad_norm": 2.0545839832689734, "language_loss": 0.82461339, "learning_rate": 3.997194827465315e-06, "loss": 0.84806454, "num_input_tokens_seen": 8231475, "step": 386, "time_per_iteration": 2.6161656379699707 }, { "auxiliary_loss_clip": 0.01298199, "auxiliary_loss_mlp": 0.01050371, "balance_loss_clip": 1.08555794, "balance_loss_mlp": 1.03060651, "epoch": 0.0465339986773282, "flos": 13188661447680.0, "grad_norm": 4.561850226094231, "language_loss": 0.91415358, "learning_rate": 3.997153433088728e-06, "loss": 0.93763936, "num_input_tokens_seen": 8248600, "step": 387, "time_per_iteration": 2.5587427616119385 }, { "auxiliary_loss_clip": 0.01297319, "auxiliary_loss_mlp": 0.01055015, "balance_loss_clip": 1.08630848, "balance_loss_mlp": 1.03443968, "epoch": 0.0466542415679673, "flos": 25556547168000.0, "grad_norm": 2.137210443878174, "language_loss": 0.81012732, "learning_rate": 3.997111735745554e-06, "loss": 0.83365065, "num_input_tokens_seen": 8271570, "step": 388, "time_per_iteration": 2.6825942993164062 }, { "auxiliary_loss_clip": 0.01298807, "auxiliary_loss_mlp": 0.01055568, "balance_loss_clip": 1.08810771, "balance_loss_mlp": 1.03462279, "epoch": 0.04677448445860639, "flos": 22236749493120.0, "grad_norm": 2.4346638021286218, "language_loss": 0.82790953, "learning_rate": 3.997069735442118e-06, "loss": 0.8514533, "num_input_tokens_seen": 8291265, "step": 389, "time_per_iteration": 2.6192731857299805 }, { "auxiliary_loss_clip": 0.01292042, "auxiliary_loss_mlp": 0.01058056, "balance_loss_clip": 1.08340406, "balance_loss_mlp": 1.03874397, "epoch": 0.04689472734924548, "flos": 28147825198080.0, "grad_norm": 1.46163956587128, "language_loss": 0.8007741, "learning_rate": 3.997027432184792e-06, "loss": 0.82427508, "num_input_tokens_seen": 8315925, "step": 390, "time_per_iteration": 2.741412401199341 }, { "auxiliary_loss_clip": 0.0129864, "auxiliary_loss_mlp": 0.01061088, "balance_loss_clip": 1.08579803, "balance_loss_mlp": 1.041013, "epoch": 0.04701497023988457, "flos": 23148952312320.0, "grad_norm": 2.039972295856756, "language_loss": 0.89499515, "learning_rate": 3.99698482597999e-06, "loss": 0.91859239, "num_input_tokens_seen": 8333605, "step": 391, "time_per_iteration": 2.6054229736328125 }, { "auxiliary_loss_clip": 0.01220577, "auxiliary_loss_mlp": 0.01016574, "balance_loss_clip": 1.08673477, "balance_loss_mlp": 1.00451016, "epoch": 0.04713521313052366, "flos": 64827668764800.0, "grad_norm": 0.8634841123672469, "language_loss": 0.63846809, "learning_rate": 3.99694191683418e-06, "loss": 0.66083962, "num_input_tokens_seen": 8394405, "step": 392, "time_per_iteration": 3.209542989730835 }, { "auxiliary_loss_clip": 0.01302821, "auxiliary_loss_mlp": 0.01059254, "balance_loss_clip": 1.09026694, "balance_loss_mlp": 1.03963256, "epoch": 0.047255456021162746, "flos": 18771585477120.0, "grad_norm": 1.7580792656895714, "language_loss": 0.81864786, "learning_rate": 3.996898704753867e-06, "loss": 0.84226865, "num_input_tokens_seen": 8412355, "step": 393, "time_per_iteration": 2.5606086254119873 }, { "auxiliary_loss_clip": 0.01300175, "auxiliary_loss_mlp": 0.01056747, "balance_loss_clip": 1.0875634, "balance_loss_mlp": 1.03562284, "epoch": 0.04737569891180184, "flos": 22053820504320.0, "grad_norm": 2.29820316786318, "language_loss": 0.87645292, "learning_rate": 3.996855189745609e-06, "loss": 0.90002209, "num_input_tokens_seen": 8431620, "step": 394, "time_per_iteration": 2.643512010574341 }, { "auxiliary_loss_clip": 0.01291607, "auxiliary_loss_mlp": 0.01052935, "balance_loss_clip": 1.08154964, "balance_loss_mlp": 1.03325391, "epoch": 0.04749594180244093, "flos": 29057370410880.0, "grad_norm": 1.9293308431651837, "language_loss": 0.92662674, "learning_rate": 3.996811371816007e-06, "loss": 0.95007217, "num_input_tokens_seen": 8454045, "step": 395, "time_per_iteration": 2.700845718383789 }, { "auxiliary_loss_clip": 0.01290197, "auxiliary_loss_mlp": 0.01057438, "balance_loss_clip": 1.08475578, "balance_loss_mlp": 1.03661239, "epoch": 0.04761618469308002, "flos": 35112268172160.0, "grad_norm": 2.1979535489302156, "language_loss": 0.77679688, "learning_rate": 3.996767250971707e-06, "loss": 0.80027324, "num_input_tokens_seen": 8476785, "step": 396, "time_per_iteration": 2.7203352451324463 }, { "auxiliary_loss_clip": 0.01297995, "auxiliary_loss_mlp": 0.01059673, "balance_loss_clip": 1.08856535, "balance_loss_mlp": 1.03954995, "epoch": 0.04773642758371911, "flos": 25630702796160.0, "grad_norm": 2.1009855491274405, "language_loss": 0.8687489, "learning_rate": 3.996722827219403e-06, "loss": 0.89232558, "num_input_tokens_seen": 8498400, "step": 397, "time_per_iteration": 2.6800246238708496 }, { "auxiliary_loss_clip": 0.01299398, "auxiliary_loss_mlp": 0.01062643, "balance_loss_clip": 1.08886409, "balance_loss_mlp": 1.0431881, "epoch": 0.0478566704743582, "flos": 20631506688000.0, "grad_norm": 3.001127442095122, "language_loss": 0.82355845, "learning_rate": 3.996678100565833e-06, "loss": 0.84717894, "num_input_tokens_seen": 8517455, "step": 398, "time_per_iteration": 2.6074280738830566 }, { "auxiliary_loss_clip": 0.01290225, "auxiliary_loss_mlp": 0.01062164, "balance_loss_clip": 1.083076, "balance_loss_mlp": 1.042256, "epoch": 0.04797691336499729, "flos": 18835721210880.0, "grad_norm": 2.1575685100234976, "language_loss": 0.88447928, "learning_rate": 3.996633071017783e-06, "loss": 0.90800315, "num_input_tokens_seen": 8534085, "step": 399, "time_per_iteration": 2.6081855297088623 }, { "auxiliary_loss_clip": 0.01294471, "auxiliary_loss_mlp": 0.01060122, "balance_loss_clip": 1.08709204, "balance_loss_mlp": 1.03979635, "epoch": 0.04809715625563638, "flos": 21099673578240.0, "grad_norm": 2.3485766502719265, "language_loss": 0.81895274, "learning_rate": 3.996587738582084e-06, "loss": 0.84249866, "num_input_tokens_seen": 8550885, "step": 400, "time_per_iteration": 2.574200391769409 }, { "auxiliary_loss_clip": 0.01292339, "auxiliary_loss_mlp": 0.01053863, "balance_loss_clip": 1.08510554, "balance_loss_mlp": 1.03453887, "epoch": 0.04821739914627548, "flos": 23805650712960.0, "grad_norm": 2.3453215897518143, "language_loss": 0.86216259, "learning_rate": 3.9965421032656115e-06, "loss": 0.88562465, "num_input_tokens_seen": 8570815, "step": 401, "time_per_iteration": 2.773998260498047 }, { "auxiliary_loss_clip": 0.01295777, "auxiliary_loss_mlp": 0.01058372, "balance_loss_clip": 1.0874629, "balance_loss_mlp": 1.03797507, "epoch": 0.04833764203691457, "flos": 22200587475840.0, "grad_norm": 4.50362311646072, "language_loss": 0.94437397, "learning_rate": 3.99649616507529e-06, "loss": 0.96791542, "num_input_tokens_seen": 8589910, "step": 402, "time_per_iteration": 2.5891976356506348 }, { "auxiliary_loss_clip": 0.01213468, "auxiliary_loss_mlp": 0.01012939, "balance_loss_clip": 1.08215213, "balance_loss_mlp": 1.00130451, "epoch": 0.04845788492755366, "flos": 65904376896000.0, "grad_norm": 0.893158689670192, "language_loss": 0.63176483, "learning_rate": 3.996449924018088e-06, "loss": 0.65402883, "num_input_tokens_seen": 8650370, "step": 403, "time_per_iteration": 3.201284885406494 }, { "auxiliary_loss_clip": 0.01287364, "auxiliary_loss_mlp": 0.01055019, "balance_loss_clip": 1.08198667, "balance_loss_mlp": 1.03623152, "epoch": 0.04857812781819275, "flos": 19281301424640.0, "grad_norm": 2.450981897697582, "language_loss": 0.79566956, "learning_rate": 3.99640338010102e-06, "loss": 0.81909341, "num_input_tokens_seen": 8669475, "step": 404, "time_per_iteration": 2.582526683807373 }, { "auxiliary_loss_clip": 0.01292577, "auxiliary_loss_mlp": 0.01061403, "balance_loss_clip": 1.08468366, "balance_loss_mlp": 1.04166174, "epoch": 0.04869837070883184, "flos": 24062376193920.0, "grad_norm": 2.3140397782497746, "language_loss": 0.78743511, "learning_rate": 3.996356533331146e-06, "loss": 0.81097496, "num_input_tokens_seen": 8691345, "step": 405, "time_per_iteration": 2.6958441734313965 }, { "auxiliary_loss_clip": 0.0129815, "auxiliary_loss_mlp": 0.01061868, "balance_loss_clip": 1.08508992, "balance_loss_mlp": 1.03986204, "epoch": 0.04881861359947093, "flos": 25187169657600.0, "grad_norm": 2.9901697128633447, "language_loss": 0.61886692, "learning_rate": 3.996309383715573e-06, "loss": 0.64246714, "num_input_tokens_seen": 8710125, "step": 406, "time_per_iteration": 2.661607027053833 }, { "auxiliary_loss_clip": 0.01293186, "auxiliary_loss_mlp": 0.01056048, "balance_loss_clip": 1.08300459, "balance_loss_mlp": 1.03606868, "epoch": 0.048938856490110025, "flos": 16362913213440.0, "grad_norm": 2.3659019143156153, "language_loss": 0.73954022, "learning_rate": 3.996261931261454e-06, "loss": 0.76303256, "num_input_tokens_seen": 8728705, "step": 407, "time_per_iteration": 2.75179123878479 }, { "auxiliary_loss_clip": 0.01292312, "auxiliary_loss_mlp": 0.01059092, "balance_loss_clip": 1.08456898, "balance_loss_mlp": 1.03975642, "epoch": 0.049059099380749115, "flos": 29895094379520.0, "grad_norm": 1.6309278607938373, "language_loss": 0.86308908, "learning_rate": 3.996214175975987e-06, "loss": 0.88660312, "num_input_tokens_seen": 8749225, "step": 408, "time_per_iteration": 2.75114107131958 }, { "auxiliary_loss_clip": 0.01294051, "auxiliary_loss_mlp": 0.01061124, "balance_loss_clip": 1.08713627, "balance_loss_mlp": 1.04082227, "epoch": 0.049179342271388204, "flos": 35918858027520.0, "grad_norm": 2.569436289646412, "language_loss": 0.78960365, "learning_rate": 3.996166117866417e-06, "loss": 0.81315541, "num_input_tokens_seen": 8771160, "step": 409, "time_per_iteration": 3.646131753921509 }, { "auxiliary_loss_clip": 0.01281446, "auxiliary_loss_mlp": 0.01054541, "balance_loss_clip": 1.07703435, "balance_loss_mlp": 1.03580153, "epoch": 0.049299585162027294, "flos": 14611226659200.0, "grad_norm": 2.054379465591192, "language_loss": 0.8667624, "learning_rate": 3.996117756940035e-06, "loss": 0.89012229, "num_input_tokens_seen": 8787845, "step": 410, "time_per_iteration": 2.585024118423462 }, { "auxiliary_loss_clip": 0.01289565, "auxiliary_loss_mlp": 0.0105419, "balance_loss_clip": 1.08263624, "balance_loss_mlp": 1.03529549, "epoch": 0.049419828052666384, "flos": 19567939956480.0, "grad_norm": 2.0868281023718103, "language_loss": 0.97565877, "learning_rate": 3.996069093204175e-06, "loss": 0.99909627, "num_input_tokens_seen": 8803805, "step": 411, "time_per_iteration": 3.517374277114868 }, { "auxiliary_loss_clip": 0.01298458, "auxiliary_loss_mlp": 0.01066886, "balance_loss_clip": 1.0892539, "balance_loss_mlp": 1.04648888, "epoch": 0.049540070943305474, "flos": 13659916907520.0, "grad_norm": 2.6901450216165843, "language_loss": 0.87792456, "learning_rate": 3.996020126666221e-06, "loss": 0.90157795, "num_input_tokens_seen": 8820785, "step": 412, "time_per_iteration": 3.4282279014587402 }, { "auxiliary_loss_clip": 0.0128971, "auxiliary_loss_mlp": 0.01054016, "balance_loss_clip": 1.08415246, "balance_loss_mlp": 1.03510928, "epoch": 0.04966031383394457, "flos": 21832035978240.0, "grad_norm": 2.7521691056456206, "language_loss": 0.81983507, "learning_rate": 3.995970857333601e-06, "loss": 0.84327233, "num_input_tokens_seen": 8841195, "step": 413, "time_per_iteration": 2.651104688644409 }, { "auxiliary_loss_clip": 0.01292947, "auxiliary_loss_mlp": 0.01053166, "balance_loss_clip": 1.08394551, "balance_loss_mlp": 1.03334105, "epoch": 0.04978055672458366, "flos": 28618793349120.0, "grad_norm": 1.973847337720772, "language_loss": 0.79665315, "learning_rate": 3.995921285213789e-06, "loss": 0.82011431, "num_input_tokens_seen": 8861455, "step": 414, "time_per_iteration": 2.6171352863311768 }, { "auxiliary_loss_clip": 0.01289395, "auxiliary_loss_mlp": 0.01049655, "balance_loss_clip": 1.08518088, "balance_loss_mlp": 1.03049874, "epoch": 0.04990079961522275, "flos": 19828220883840.0, "grad_norm": 2.1673607593591613, "language_loss": 0.80609399, "learning_rate": 3.995871410314305e-06, "loss": 0.82948452, "num_input_tokens_seen": 8880015, "step": 415, "time_per_iteration": 2.6290440559387207 }, { "auxiliary_loss_clip": 0.01192831, "auxiliary_loss_mlp": 0.01012889, "balance_loss_clip": 1.07783937, "balance_loss_mlp": 1.0017314, "epoch": 0.05002104250586184, "flos": 62735045293440.0, "grad_norm": 0.9845054511897754, "language_loss": 0.59655148, "learning_rate": 3.995821232642714e-06, "loss": 0.61860871, "num_input_tokens_seen": 8938420, "step": 416, "time_per_iteration": 3.2588698863983154 }, { "auxiliary_loss_clip": 0.01270171, "auxiliary_loss_mlp": 0.01056729, "balance_loss_clip": 1.0828408, "balance_loss_mlp": 1.03800082, "epoch": 0.05014128539650093, "flos": 27928518710400.0, "grad_norm": 2.251052267556529, "language_loss": 0.82244229, "learning_rate": 3.995770752206629e-06, "loss": 0.84571129, "num_input_tokens_seen": 8959495, "step": 417, "time_per_iteration": 2.6779839992523193 }, { "auxiliary_loss_clip": 0.01288205, "auxiliary_loss_mlp": 0.01055771, "balance_loss_clip": 1.08305311, "balance_loss_mlp": 1.03625655, "epoch": 0.05026152828714002, "flos": 17705576620800.0, "grad_norm": 2.165661418282589, "language_loss": 0.96949559, "learning_rate": 3.995719969013709e-06, "loss": 0.99293542, "num_input_tokens_seen": 8976675, "step": 418, "time_per_iteration": 2.6699533462524414 }, { "auxiliary_loss_clip": 0.01249471, "auxiliary_loss_mlp": 0.01047418, "balance_loss_clip": 1.07987452, "balance_loss_mlp": 1.02810574, "epoch": 0.05038177117777912, "flos": 19133277477120.0, "grad_norm": 3.0399073698885575, "language_loss": 0.858886, "learning_rate": 3.995668883071655e-06, "loss": 0.88185489, "num_input_tokens_seen": 8992900, "step": 419, "time_per_iteration": 2.6304986476898193 }, { "auxiliary_loss_clip": 0.01288608, "auxiliary_loss_mlp": 0.01062994, "balance_loss_clip": 1.08377326, "balance_loss_mlp": 1.04313421, "epoch": 0.050502014068418206, "flos": 20667704618880.0, "grad_norm": 2.2621090017733954, "language_loss": 0.90898025, "learning_rate": 3.995617494388219e-06, "loss": 0.93249631, "num_input_tokens_seen": 9011020, "step": 420, "time_per_iteration": 2.6636345386505127 }, { "auxiliary_loss_clip": 0.01249982, "auxiliary_loss_mlp": 0.01043913, "balance_loss_clip": 1.07649446, "balance_loss_mlp": 1.02438641, "epoch": 0.050622256959057296, "flos": 21361103740800.0, "grad_norm": 2.6935380534028575, "language_loss": 0.80244684, "learning_rate": 3.995565802971196e-06, "loss": 0.82538581, "num_input_tokens_seen": 9030995, "step": 421, "time_per_iteration": 2.7117760181427 }, { "auxiliary_loss_clip": 0.01246969, "auxiliary_loss_mlp": 0.01057293, "balance_loss_clip": 1.07904887, "balance_loss_mlp": 1.03919697, "epoch": 0.050742499849696386, "flos": 27673588909440.0, "grad_norm": 2.5470774688018034, "language_loss": 0.67122674, "learning_rate": 3.995513808828427e-06, "loss": 0.69426942, "num_input_tokens_seen": 9053790, "step": 422, "time_per_iteration": 2.7630224227905273 }, { "auxiliary_loss_clip": 0.01251398, "auxiliary_loss_mlp": 0.01059712, "balance_loss_clip": 1.08020318, "balance_loss_mlp": 1.04055476, "epoch": 0.050862742740335476, "flos": 19865999013120.0, "grad_norm": 5.595467279317176, "language_loss": 0.76550508, "learning_rate": 3.9954615119678e-06, "loss": 0.78861618, "num_input_tokens_seen": 9072345, "step": 423, "time_per_iteration": 2.6815836429595947 }, { "auxiliary_loss_clip": 0.01257329, "auxiliary_loss_mlp": 0.01056927, "balance_loss_clip": 1.07874167, "balance_loss_mlp": 1.03629184, "epoch": 0.050982985630974566, "flos": 22085098272000.0, "grad_norm": 1.9706119004767002, "language_loss": 0.80632961, "learning_rate": 3.995408912397248e-06, "loss": 0.82947218, "num_input_tokens_seen": 9090240, "step": 424, "time_per_iteration": 2.7317774295806885 }, { "auxiliary_loss_clip": 0.0125415, "auxiliary_loss_mlp": 0.01058469, "balance_loss_clip": 1.08002853, "balance_loss_mlp": 1.03883529, "epoch": 0.05110322852161366, "flos": 20740962407040.0, "grad_norm": 2.36888044133777, "language_loss": 0.92856318, "learning_rate": 3.99535601012475e-06, "loss": 0.95168936, "num_input_tokens_seen": 9105570, "step": 425, "time_per_iteration": 2.7280800342559814 }, { "auxiliary_loss_clip": 0.01226113, "auxiliary_loss_mlp": 0.00714839, "balance_loss_clip": 1.07498419, "balance_loss_mlp": 1.00037384, "epoch": 0.05122347141225275, "flos": 28547295327360.0, "grad_norm": 1.925162169272267, "language_loss": 0.75443554, "learning_rate": 3.995302805158333e-06, "loss": 0.77384508, "num_input_tokens_seen": 9128225, "step": 426, "time_per_iteration": 2.876927614212036 }, { "auxiliary_loss_clip": 0.01235375, "auxiliary_loss_mlp": 0.01054191, "balance_loss_clip": 1.07384014, "balance_loss_mlp": 1.03309047, "epoch": 0.05134371430289184, "flos": 19722679747200.0, "grad_norm": 2.282910968333549, "language_loss": 0.8371613, "learning_rate": 3.9952492975060665e-06, "loss": 0.86005694, "num_input_tokens_seen": 9148295, "step": 427, "time_per_iteration": 2.725764036178589 }, { "auxiliary_loss_clip": 0.01261022, "auxiliary_loss_mlp": 0.01050495, "balance_loss_clip": 1.07719922, "balance_loss_mlp": 1.03105223, "epoch": 0.05146395719353093, "flos": 34458945649920.0, "grad_norm": 2.9578422425120277, "language_loss": 0.85486412, "learning_rate": 3.995195487176067e-06, "loss": 0.87797928, "num_input_tokens_seen": 9168525, "step": 428, "time_per_iteration": 2.7591395378112793 }, { "auxiliary_loss_clip": 0.01285099, "auxiliary_loss_mlp": 0.01048157, "balance_loss_clip": 1.08020782, "balance_loss_mlp": 1.02913117, "epoch": 0.05158420008417002, "flos": 21760286561280.0, "grad_norm": 1.8768212018984163, "language_loss": 0.85523927, "learning_rate": 3.995141374176499e-06, "loss": 0.87857175, "num_input_tokens_seen": 9186920, "step": 429, "time_per_iteration": 2.647000551223755 }, { "auxiliary_loss_clip": 0.01160194, "auxiliary_loss_mlp": 0.00705461, "balance_loss_clip": 1.07159066, "balance_loss_mlp": 1.00044239, "epoch": 0.05170444297480911, "flos": 72553956226560.0, "grad_norm": 0.8723643910140885, "language_loss": 0.63126481, "learning_rate": 3.995086958515572e-06, "loss": 0.64992136, "num_input_tokens_seen": 9244940, "step": 430, "time_per_iteration": 3.298923969268799 }, { "auxiliary_loss_clip": 0.01204078, "auxiliary_loss_mlp": 0.00705246, "balance_loss_clip": 1.07518172, "balance_loss_mlp": 1.00030041, "epoch": 0.05182468586544821, "flos": 62416159326720.0, "grad_norm": 0.8590264666346829, "language_loss": 0.59914434, "learning_rate": 3.995032240201538e-06, "loss": 0.61823761, "num_input_tokens_seen": 9307335, "step": 431, "time_per_iteration": 3.1488397121429443 }, { "auxiliary_loss_clip": 0.01176461, "auxiliary_loss_mlp": 0.01013569, "balance_loss_clip": 1.07345486, "balance_loss_mlp": 1.00250649, "epoch": 0.0519449287560873, "flos": 41225989432320.0, "grad_norm": 0.9941561327972286, "language_loss": 0.63106918, "learning_rate": 3.9949772192427e-06, "loss": 0.65296948, "num_input_tokens_seen": 9353960, "step": 432, "time_per_iteration": 2.926957607269287 }, { "auxiliary_loss_clip": 0.01248537, "auxiliary_loss_mlp": 0.01051768, "balance_loss_clip": 1.07732129, "balance_loss_mlp": 1.03289676, "epoch": 0.05206517164672639, "flos": 17494530261120.0, "grad_norm": 2.0151850057493528, "language_loss": 0.79579735, "learning_rate": 3.994921895647405e-06, "loss": 0.81880045, "num_input_tokens_seen": 9372130, "step": 433, "time_per_iteration": 2.754389762878418 }, { "auxiliary_loss_clip": 0.01200255, "auxiliary_loss_mlp": 0.01011831, "balance_loss_clip": 1.07267177, "balance_loss_mlp": 1.00105464, "epoch": 0.05218541453736548, "flos": 64002762973440.0, "grad_norm": 0.8871756002118969, "language_loss": 0.55310625, "learning_rate": 3.994866269424043e-06, "loss": 0.57522708, "num_input_tokens_seen": 9428500, "step": 434, "time_per_iteration": 3.148367166519165 }, { "auxiliary_loss_clip": 0.0119066, "auxiliary_loss_mlp": 0.01047995, "balance_loss_clip": 1.0644927, "balance_loss_mlp": 1.02873039, "epoch": 0.05230565742800457, "flos": 19317319787520.0, "grad_norm": 2.2557058697104178, "language_loss": 0.78143758, "learning_rate": 3.9948103405810545e-06, "loss": 0.80382413, "num_input_tokens_seen": 9447450, "step": 435, "time_per_iteration": 3.6494836807250977 }, { "auxiliary_loss_clip": 0.0120631, "auxiliary_loss_mlp": 0.01053314, "balance_loss_clip": 1.06746221, "balance_loss_mlp": 1.03396678, "epoch": 0.05242590031864366, "flos": 25298636538240.0, "grad_norm": 2.1581231273781283, "language_loss": 0.86139274, "learning_rate": 3.994754109126923e-06, "loss": 0.88398898, "num_input_tokens_seen": 9468945, "step": 436, "time_per_iteration": 3.755035877227783 }, { "auxiliary_loss_clip": 0.01185335, "auxiliary_loss_mlp": 0.01051512, "balance_loss_clip": 1.07216549, "balance_loss_mlp": 1.03304625, "epoch": 0.052546143209282754, "flos": 26211629456640.0, "grad_norm": 1.9242313861722462, "language_loss": 0.93307149, "learning_rate": 3.994697575070181e-06, "loss": 0.95543998, "num_input_tokens_seen": 9488405, "step": 437, "time_per_iteration": 3.855872869491577 }, { "auxiliary_loss_clip": 0.01247439, "auxiliary_loss_mlp": 0.01057386, "balance_loss_clip": 1.08122206, "balance_loss_mlp": 1.03751373, "epoch": 0.052666386099921844, "flos": 22158140578560.0, "grad_norm": 1.9579043074981142, "language_loss": 0.90956473, "learning_rate": 3.994640738419402e-06, "loss": 0.93261302, "num_input_tokens_seen": 9507780, "step": 438, "time_per_iteration": 3.7126362323760986 }, { "auxiliary_loss_clip": 0.01265793, "auxiliary_loss_mlp": 0.01055211, "balance_loss_clip": 1.07921088, "balance_loss_mlp": 1.03693664, "epoch": 0.052786628990560934, "flos": 23881817502720.0, "grad_norm": 1.8466199086942845, "language_loss": 0.80849785, "learning_rate": 3.9945835991832075e-06, "loss": 0.83170784, "num_input_tokens_seen": 9529665, "step": 439, "time_per_iteration": 2.7639262676239014 }, { "auxiliary_loss_clip": 0.01287815, "auxiliary_loss_mlp": 0.01053662, "balance_loss_clip": 1.08594275, "balance_loss_mlp": 1.03502941, "epoch": 0.052906871881200024, "flos": 24605021934720.0, "grad_norm": 2.057744328519696, "language_loss": 0.92615628, "learning_rate": 3.994526157370268e-06, "loss": 0.94957101, "num_input_tokens_seen": 9548280, "step": 440, "time_per_iteration": 2.707263469696045 }, { "auxiliary_loss_clip": 0.01172297, "auxiliary_loss_mlp": 0.01011856, "balance_loss_clip": 1.06780016, "balance_loss_mlp": 1.00088871, "epoch": 0.053027114771839114, "flos": 56461631143680.0, "grad_norm": 0.89493707686831, "language_loss": 0.5926491, "learning_rate": 3.994468412989296e-06, "loss": 0.61449063, "num_input_tokens_seen": 9609690, "step": 441, "time_per_iteration": 3.3429951667785645 }, { "auxiliary_loss_clip": 0.01222635, "auxiliary_loss_mlp": 0.01065374, "balance_loss_clip": 1.07100093, "balance_loss_mlp": 1.04690862, "epoch": 0.053147357662478203, "flos": 17311098481920.0, "grad_norm": 2.4882462217947747, "language_loss": 0.92401183, "learning_rate": 3.994410366049052e-06, "loss": 0.9468919, "num_input_tokens_seen": 9627550, "step": 442, "time_per_iteration": 2.7052268981933594 }, { "auxiliary_loss_clip": 0.01266185, "auxiliary_loss_mlp": 0.01053063, "balance_loss_clip": 1.07926631, "balance_loss_mlp": 1.03446627, "epoch": 0.0532676005531173, "flos": 17164977955200.0, "grad_norm": 3.0451690215071197, "language_loss": 0.83252692, "learning_rate": 3.994352016558341e-06, "loss": 0.85571945, "num_input_tokens_seen": 9644855, "step": 443, "time_per_iteration": 2.6277968883514404 }, { "auxiliary_loss_clip": 0.01263471, "auxiliary_loss_mlp": 0.0106986, "balance_loss_clip": 1.08064318, "balance_loss_mlp": 1.05054843, "epoch": 0.05338784344375639, "flos": 27819960831360.0, "grad_norm": 1.9117082930868867, "language_loss": 0.73789334, "learning_rate": 3.994293364526014e-06, "loss": 0.76122665, "num_input_tokens_seen": 9665740, "step": 444, "time_per_iteration": 2.7233288288116455 }, { "auxiliary_loss_clip": 0.01238386, "auxiliary_loss_mlp": 0.01052397, "balance_loss_clip": 1.07787955, "balance_loss_mlp": 1.03265595, "epoch": 0.05350808633439548, "flos": 21507691144320.0, "grad_norm": 12.4142685529688, "language_loss": 0.84813488, "learning_rate": 3.99423440996097e-06, "loss": 0.87104267, "num_input_tokens_seen": 9685280, "step": 445, "time_per_iteration": 2.6628963947296143 }, { "auxiliary_loss_clip": 0.01257518, "auxiliary_loss_mlp": 0.01048106, "balance_loss_clip": 1.08385634, "balance_loss_mlp": 1.03054643, "epoch": 0.05362832922503457, "flos": 20084299920000.0, "grad_norm": 2.3167641693534504, "language_loss": 0.81444108, "learning_rate": 3.994175152872152e-06, "loss": 0.83749735, "num_input_tokens_seen": 9704365, "step": 446, "time_per_iteration": 2.8092644214630127 }, { "auxiliary_loss_clip": 0.01265541, "auxiliary_loss_mlp": 0.01058694, "balance_loss_clip": 1.0776515, "balance_loss_mlp": 1.0407058, "epoch": 0.05374857211567366, "flos": 26137222433280.0, "grad_norm": 1.9606845448826007, "language_loss": 0.78496552, "learning_rate": 3.994115593268548e-06, "loss": 0.80820787, "num_input_tokens_seen": 9724145, "step": 447, "time_per_iteration": 2.743617296218872 }, { "auxiliary_loss_clip": 0.01285863, "auxiliary_loss_mlp": 0.01058534, "balance_loss_clip": 1.08512485, "balance_loss_mlp": 1.03923404, "epoch": 0.05386881500631275, "flos": 27486817165440.0, "grad_norm": 2.253748039374617, "language_loss": 0.82002723, "learning_rate": 3.994055731159195e-06, "loss": 0.84347117, "num_input_tokens_seen": 9741615, "step": 448, "time_per_iteration": 2.703089475631714 }, { "auxiliary_loss_clip": 0.01266512, "auxiliary_loss_mlp": 0.01055793, "balance_loss_clip": 1.08270764, "balance_loss_mlp": 1.03768492, "epoch": 0.053989057896951846, "flos": 23585087249280.0, "grad_norm": 3.480242457705821, "language_loss": 0.87211144, "learning_rate": 3.993995566553172e-06, "loss": 0.89533448, "num_input_tokens_seen": 9760580, "step": 449, "time_per_iteration": 2.739323139190674 }, { "auxiliary_loss_clip": 0.01234998, "auxiliary_loss_mlp": 0.01056902, "balance_loss_clip": 1.07153571, "balance_loss_mlp": 1.03936672, "epoch": 0.054109300787590936, "flos": 25228862369280.0, "grad_norm": 1.599705453165824, "language_loss": 0.77083528, "learning_rate": 3.993935099459607e-06, "loss": 0.79375434, "num_input_tokens_seen": 9782195, "step": 450, "time_per_iteration": 2.7947938442230225 }, { "auxiliary_loss_clip": 0.01275967, "auxiliary_loss_mlp": 0.01056144, "balance_loss_clip": 1.07712734, "balance_loss_mlp": 1.03909671, "epoch": 0.054229543678230026, "flos": 23841525421440.0, "grad_norm": 2.196406808286906, "language_loss": 0.73636019, "learning_rate": 3.993874329887673e-06, "loss": 0.75968128, "num_input_tokens_seen": 9800850, "step": 451, "time_per_iteration": 2.6838271617889404 }, { "auxiliary_loss_clip": 0.01267847, "auxiliary_loss_mlp": 0.01049987, "balance_loss_clip": 1.08117509, "balance_loss_mlp": 1.03135455, "epoch": 0.054349786568869116, "flos": 16320933192960.0, "grad_norm": 2.712025513611689, "language_loss": 0.86756957, "learning_rate": 3.993813257846589e-06, "loss": 0.8907479, "num_input_tokens_seen": 9817605, "step": 452, "time_per_iteration": 2.636164426803589 }, { "auxiliary_loss_clip": 0.01268891, "auxiliary_loss_mlp": 0.01055944, "balance_loss_clip": 1.08142734, "balance_loss_mlp": 1.03613162, "epoch": 0.054470029459508205, "flos": 18660729127680.0, "grad_norm": 2.7343558562073063, "language_loss": 0.92683113, "learning_rate": 3.993751883345619e-06, "loss": 0.9500795, "num_input_tokens_seen": 9835965, "step": 453, "time_per_iteration": 2.7163631916046143 }, { "auxiliary_loss_clip": 0.01240576, "auxiliary_loss_mlp": 0.01068175, "balance_loss_clip": 1.07702756, "balance_loss_mlp": 1.04948306, "epoch": 0.054590272350147295, "flos": 17785298856960.0, "grad_norm": 2.8428869735654847, "language_loss": 0.87328136, "learning_rate": 3.993690206394073e-06, "loss": 0.89636892, "num_input_tokens_seen": 9852265, "step": 454, "time_per_iteration": 2.691941738128662 }, { "auxiliary_loss_clip": 0.01256527, "auxiliary_loss_mlp": 0.01053877, "balance_loss_clip": 1.08020055, "balance_loss_mlp": 1.03526843, "epoch": 0.054710515240786385, "flos": 17785945301760.0, "grad_norm": 2.717089767158013, "language_loss": 0.87670195, "learning_rate": 3.993628227001307e-06, "loss": 0.89980602, "num_input_tokens_seen": 9870465, "step": 455, "time_per_iteration": 2.696645975112915 }, { "auxiliary_loss_clip": 0.01239726, "auxiliary_loss_mlp": 0.01055546, "balance_loss_clip": 1.07418621, "balance_loss_mlp": 1.03811777, "epoch": 0.05483075813142548, "flos": 48210900180480.0, "grad_norm": 1.9613782952318735, "language_loss": 0.71486139, "learning_rate": 3.993565945176726e-06, "loss": 0.73781407, "num_input_tokens_seen": 9891490, "step": 456, "time_per_iteration": 2.9047014713287354 }, { "auxiliary_loss_clip": 0.0124308, "auxiliary_loss_mlp": 0.01050687, "balance_loss_clip": 1.07566965, "balance_loss_mlp": 1.03391445, "epoch": 0.05495100102206457, "flos": 19682244011520.0, "grad_norm": 1.8910104061285302, "language_loss": 0.84362221, "learning_rate": 3.993503360929776e-06, "loss": 0.8665598, "num_input_tokens_seen": 9910375, "step": 457, "time_per_iteration": 2.651975154876709 }, { "auxiliary_loss_clip": 0.01177198, "auxiliary_loss_mlp": 0.01051281, "balance_loss_clip": 1.06673515, "balance_loss_mlp": 1.03189719, "epoch": 0.05507124391270366, "flos": 26360048453760.0, "grad_norm": 1.6531686127720069, "language_loss": 0.81181097, "learning_rate": 3.99344047426995e-06, "loss": 0.83409572, "num_input_tokens_seen": 9931635, "step": 458, "time_per_iteration": 3.0561583042144775 }, { "auxiliary_loss_clip": 0.01217151, "auxiliary_loss_mlp": 0.01061232, "balance_loss_clip": 1.07285953, "balance_loss_mlp": 1.04230142, "epoch": 0.05519148680334275, "flos": 22601314581120.0, "grad_norm": 2.421420700891032, "language_loss": 0.93374777, "learning_rate": 3.993377285206789e-06, "loss": 0.95653164, "num_input_tokens_seen": 9951420, "step": 459, "time_per_iteration": 3.105116367340088 }, { "auxiliary_loss_clip": 0.01201643, "auxiliary_loss_mlp": 0.01049902, "balance_loss_clip": 1.06935132, "balance_loss_mlp": 1.03105533, "epoch": 0.05531172969398184, "flos": 40552519380480.0, "grad_norm": 1.720901831665902, "language_loss": 0.86603761, "learning_rate": 3.99331379374988e-06, "loss": 0.88855302, "num_input_tokens_seen": 9975025, "step": 460, "time_per_iteration": 2.8778183460235596 }, { "auxiliary_loss_clip": 0.01244255, "auxiliary_loss_mlp": 0.0105413, "balance_loss_clip": 1.07056749, "balance_loss_mlp": 1.03612947, "epoch": 0.05543197258462093, "flos": 23477894087040.0, "grad_norm": 1.9612144529255848, "language_loss": 0.80050725, "learning_rate": 3.993249999908852e-06, "loss": 0.8234911, "num_input_tokens_seen": 9995175, "step": 461, "time_per_iteration": 4.645914793014526 }, { "auxiliary_loss_clip": 0.01282521, "auxiliary_loss_mlp": 0.0105275, "balance_loss_clip": 1.08043623, "balance_loss_mlp": 1.03407049, "epoch": 0.05555221547526003, "flos": 18624603024000.0, "grad_norm": 2.4541363850903095, "language_loss": 0.87201303, "learning_rate": 3.993185903693384e-06, "loss": 0.89536572, "num_input_tokens_seen": 10011975, "step": 462, "time_per_iteration": 2.6488490104675293 }, { "auxiliary_loss_clip": 0.01247416, "auxiliary_loss_mlp": 0.01054342, "balance_loss_clip": 1.07814538, "balance_loss_mlp": 1.03600788, "epoch": 0.05567245836589912, "flos": 23587098410880.0, "grad_norm": 1.9021324987226542, "language_loss": 0.82439733, "learning_rate": 3.9931215051131995e-06, "loss": 0.84741497, "num_input_tokens_seen": 10032620, "step": 463, "time_per_iteration": 3.6629550457000732 }, { "auxiliary_loss_clip": 0.01249048, "auxiliary_loss_mlp": 0.01058845, "balance_loss_clip": 1.07489848, "balance_loss_mlp": 1.04039145, "epoch": 0.05579270125653821, "flos": 27746667129600.0, "grad_norm": 1.602759231962001, "language_loss": 0.79825938, "learning_rate": 3.993056804178068e-06, "loss": 0.8213383, "num_input_tokens_seen": 10054165, "step": 464, "time_per_iteration": 3.706672430038452 }, { "auxiliary_loss_clip": 0.01210159, "auxiliary_loss_mlp": 0.01053378, "balance_loss_clip": 1.07513928, "balance_loss_mlp": 1.03451943, "epoch": 0.0559129441471773, "flos": 27014161075200.0, "grad_norm": 2.260068322023189, "language_loss": 0.84179342, "learning_rate": 3.992991800897803e-06, "loss": 0.86442882, "num_input_tokens_seen": 10073970, "step": 465, "time_per_iteration": 2.8673670291900635 }, { "auxiliary_loss_clip": 0.01280026, "auxiliary_loss_mlp": 0.01064998, "balance_loss_clip": 1.07997608, "balance_loss_mlp": 1.04635406, "epoch": 0.05603318703781639, "flos": 15229787794560.0, "grad_norm": 3.2915590657821716, "language_loss": 0.89874768, "learning_rate": 3.9929264952822665e-06, "loss": 0.92219788, "num_input_tokens_seen": 10091505, "step": 466, "time_per_iteration": 2.6330461502075195 }, { "auxiliary_loss_clip": 0.01270425, "auxiliary_loss_mlp": 0.01056955, "balance_loss_clip": 1.07948041, "balance_loss_mlp": 1.03659391, "epoch": 0.05615342992845548, "flos": 22266482976000.0, "grad_norm": 1.919677971974217, "language_loss": 0.88371789, "learning_rate": 3.992860887341366e-06, "loss": 0.90699172, "num_input_tokens_seen": 10109675, "step": 467, "time_per_iteration": 2.647573232650757 }, { "auxiliary_loss_clip": 0.01218335, "auxiliary_loss_mlp": 0.01062929, "balance_loss_clip": 1.07194495, "balance_loss_mlp": 1.04509521, "epoch": 0.056273672819094574, "flos": 23584979508480.0, "grad_norm": 2.36008971818092, "language_loss": 0.81131303, "learning_rate": 3.992794977085052e-06, "loss": 0.8341257, "num_input_tokens_seen": 10127675, "step": 468, "time_per_iteration": 2.7578866481781006 }, { "auxiliary_loss_clip": 0.01229077, "auxiliary_loss_mlp": 0.01059724, "balance_loss_clip": 1.07633042, "balance_loss_mlp": 1.04081798, "epoch": 0.056393915709733664, "flos": 19858708552320.0, "grad_norm": 2.175932797109867, "language_loss": 0.85099816, "learning_rate": 3.992728764523326e-06, "loss": 0.87388617, "num_input_tokens_seen": 10146620, "step": 469, "time_per_iteration": 2.6737704277038574 }, { "auxiliary_loss_clip": 0.01246715, "auxiliary_loss_mlp": 0.01044739, "balance_loss_clip": 1.0790844, "balance_loss_mlp": 1.0252248, "epoch": 0.05651415860037275, "flos": 22163779013760.0, "grad_norm": 1.8434410156302328, "language_loss": 0.80936259, "learning_rate": 3.99266224966623e-06, "loss": 0.83227718, "num_input_tokens_seen": 10167535, "step": 470, "time_per_iteration": 2.705970048904419 }, { "auxiliary_loss_clip": 0.01232169, "auxiliary_loss_mlp": 0.01053526, "balance_loss_clip": 1.07357383, "balance_loss_mlp": 1.03571653, "epoch": 0.05663440149101184, "flos": 19463548055040.0, "grad_norm": 1.9800684752244884, "language_loss": 0.87790924, "learning_rate": 3.992595432523855e-06, "loss": 0.90076625, "num_input_tokens_seen": 10184825, "step": 471, "time_per_iteration": 2.633969783782959 }, { "auxiliary_loss_clip": 0.01217314, "auxiliary_loss_mlp": 0.01051631, "balance_loss_clip": 1.07504058, "balance_loss_mlp": 1.03372598, "epoch": 0.05675464438165093, "flos": 22670226823680.0, "grad_norm": 1.9852706753698988, "language_loss": 0.86269736, "learning_rate": 3.992528313106338e-06, "loss": 0.88538677, "num_input_tokens_seen": 10203025, "step": 472, "time_per_iteration": 2.735612630844116 }, { "auxiliary_loss_clip": 0.01284674, "auxiliary_loss_mlp": 0.00715137, "balance_loss_clip": 1.08462548, "balance_loss_mlp": 1.00047064, "epoch": 0.05687488727229002, "flos": 16901177495040.0, "grad_norm": 2.3289912007072453, "language_loss": 0.8229, "learning_rate": 3.9924608914238595e-06, "loss": 0.84289813, "num_input_tokens_seen": 10218020, "step": 473, "time_per_iteration": 2.589564323425293 }, { "auxiliary_loss_clip": 0.01265028, "auxiliary_loss_mlp": 0.01045431, "balance_loss_clip": 1.078915, "balance_loss_mlp": 1.02737093, "epoch": 0.05699513016292912, "flos": 29168980945920.0, "grad_norm": 3.223062206714865, "language_loss": 0.84440148, "learning_rate": 3.992393167486648e-06, "loss": 0.86750603, "num_input_tokens_seen": 10237170, "step": 474, "time_per_iteration": 2.7230353355407715 }, { "auxiliary_loss_clip": 0.01286719, "auxiliary_loss_mlp": 0.01055477, "balance_loss_clip": 1.08503628, "balance_loss_mlp": 1.03578341, "epoch": 0.05711537305356821, "flos": 18916197632640.0, "grad_norm": 2.4166426933005485, "language_loss": 0.80623066, "learning_rate": 3.992325141304977e-06, "loss": 0.82965261, "num_input_tokens_seen": 10255125, "step": 475, "time_per_iteration": 2.6589815616607666 }, { "auxiliary_loss_clip": 0.01210095, "auxiliary_loss_mlp": 0.01054306, "balance_loss_clip": 1.07291842, "balance_loss_mlp": 1.03578115, "epoch": 0.0572356159442073, "flos": 26758979879040.0, "grad_norm": 2.8752922518061994, "language_loss": 0.86494339, "learning_rate": 3.992256812889166e-06, "loss": 0.88758743, "num_input_tokens_seen": 10271230, "step": 476, "time_per_iteration": 2.711385726928711 }, { "auxiliary_loss_clip": 0.01284759, "auxiliary_loss_mlp": 0.01056024, "balance_loss_clip": 1.08474684, "balance_loss_mlp": 1.03814268, "epoch": 0.05735585883484639, "flos": 35116146840960.0, "grad_norm": 2.2593815038087723, "language_loss": 0.76810497, "learning_rate": 3.992188182249582e-06, "loss": 0.79151285, "num_input_tokens_seen": 10293125, "step": 477, "time_per_iteration": 2.7673096656799316 }, { "auxiliary_loss_clip": 0.01246506, "auxiliary_loss_mlp": 0.01054011, "balance_loss_clip": 1.07867956, "balance_loss_mlp": 1.03474689, "epoch": 0.05747610172548548, "flos": 18734381965440.0, "grad_norm": 2.654008945198202, "language_loss": 0.90416348, "learning_rate": 3.992119249396633e-06, "loss": 0.92716873, "num_input_tokens_seen": 10311810, "step": 478, "time_per_iteration": 2.627385139465332 }, { "auxiliary_loss_clip": 0.01241987, "auxiliary_loss_mlp": 0.00715405, "balance_loss_clip": 1.07619512, "balance_loss_mlp": 1.00041509, "epoch": 0.05759634461612457, "flos": 27964752554880.0, "grad_norm": 2.0531676814563067, "language_loss": 0.82251984, "learning_rate": 3.992050014340778e-06, "loss": 0.84209377, "num_input_tokens_seen": 10332165, "step": 479, "time_per_iteration": 2.7567873001098633 }, { "auxiliary_loss_clip": 0.01173678, "auxiliary_loss_mlp": 0.01012217, "balance_loss_clip": 1.06572652, "balance_loss_mlp": 1.00077248, "epoch": 0.057716587506763666, "flos": 69292009405440.0, "grad_norm": 0.8660955781874833, "language_loss": 0.55043375, "learning_rate": 3.99198047709252e-06, "loss": 0.57229269, "num_input_tokens_seen": 10393685, "step": 480, "time_per_iteration": 3.2243430614471436 }, { "auxiliary_loss_clip": 0.01220863, "auxiliary_loss_mlp": 0.01056403, "balance_loss_clip": 1.06968403, "balance_loss_mlp": 1.0365665, "epoch": 0.057836830397402755, "flos": 25009196745600.0, "grad_norm": 2.203221977087516, "language_loss": 0.78668982, "learning_rate": 3.991910637662408e-06, "loss": 0.80946249, "num_input_tokens_seen": 10413975, "step": 481, "time_per_iteration": 2.646590232849121 }, { "auxiliary_loss_clip": 0.01277058, "auxiliary_loss_mlp": 0.01054361, "balance_loss_clip": 1.0793767, "balance_loss_mlp": 1.0368135, "epoch": 0.057957073288041845, "flos": 25593894334080.0, "grad_norm": 2.066836584882042, "language_loss": 0.80578202, "learning_rate": 3.9918404960610355e-06, "loss": 0.8290962, "num_input_tokens_seen": 10433005, "step": 482, "time_per_iteration": 2.5517659187316895 }, { "auxiliary_loss_clip": 0.01265565, "auxiliary_loss_mlp": 0.0105636, "balance_loss_clip": 1.07706296, "balance_loss_mlp": 1.03806174, "epoch": 0.058077316178680935, "flos": 20777411733120.0, "grad_norm": 2.4699672494508884, "language_loss": 0.77653348, "learning_rate": 3.991770052299043e-06, "loss": 0.79975271, "num_input_tokens_seen": 10451235, "step": 483, "time_per_iteration": 2.739736318588257 }, { "auxiliary_loss_clip": 0.01244391, "auxiliary_loss_mlp": 0.0105982, "balance_loss_clip": 1.07449341, "balance_loss_mlp": 1.04227209, "epoch": 0.058197559069320025, "flos": 18916484941440.0, "grad_norm": 2.492926512328417, "language_loss": 0.87713826, "learning_rate": 3.991699306387118e-06, "loss": 0.9001804, "num_input_tokens_seen": 10469705, "step": 484, "time_per_iteration": 2.7041826248168945 }, { "auxiliary_loss_clip": 0.01263643, "auxiliary_loss_mlp": 0.01047543, "balance_loss_clip": 1.07787597, "balance_loss_mlp": 1.02968585, "epoch": 0.058317801959959115, "flos": 24863327614080.0, "grad_norm": 1.89193662631201, "language_loss": 0.78201067, "learning_rate": 3.991628258335991e-06, "loss": 0.80512249, "num_input_tokens_seen": 10491910, "step": 485, "time_per_iteration": 2.822883129119873 }, { "auxiliary_loss_clip": 0.01217557, "auxiliary_loss_mlp": 0.01047732, "balance_loss_clip": 1.0718503, "balance_loss_mlp": 1.02973115, "epoch": 0.05843804485059821, "flos": 23257977068160.0, "grad_norm": 4.5219820273577875, "language_loss": 0.87655771, "learning_rate": 3.991556908156442e-06, "loss": 0.89921057, "num_input_tokens_seen": 10508435, "step": 486, "time_per_iteration": 2.8831584453582764 }, { "auxiliary_loss_clip": 0.01244211, "auxiliary_loss_mlp": 0.0105543, "balance_loss_clip": 1.07297742, "balance_loss_mlp": 1.03669071, "epoch": 0.0585582877412373, "flos": 23150532510720.0, "grad_norm": 1.9119118064797553, "language_loss": 0.87749255, "learning_rate": 3.9914852558592914e-06, "loss": 0.90048897, "num_input_tokens_seen": 10529485, "step": 487, "time_per_iteration": 4.604870796203613 }, { "auxiliary_loss_clip": 0.01262018, "auxiliary_loss_mlp": 0.01046177, "balance_loss_clip": 1.07950091, "balance_loss_mlp": 1.02789068, "epoch": 0.05867853063187639, "flos": 23506406507520.0, "grad_norm": 3.625419784017889, "language_loss": 0.8041895, "learning_rate": 3.991413301455413e-06, "loss": 0.8272714, "num_input_tokens_seen": 10545935, "step": 488, "time_per_iteration": 2.664207935333252 }, { "auxiliary_loss_clip": 0.01228525, "auxiliary_loss_mlp": 0.01055844, "balance_loss_clip": 1.0718621, "balance_loss_mlp": 1.03842711, "epoch": 0.05879877352251548, "flos": 29495803818240.0, "grad_norm": 4.201760989195712, "language_loss": 0.77637988, "learning_rate": 3.991341044955719e-06, "loss": 0.79922354, "num_input_tokens_seen": 10565690, "step": 489, "time_per_iteration": 3.5753560066223145 }, { "auxiliary_loss_clip": 0.01259833, "auxiliary_loss_mlp": 0.0071539, "balance_loss_clip": 1.07452679, "balance_loss_mlp": 1.00040817, "epoch": 0.05891901641315457, "flos": 20157485880960.0, "grad_norm": 2.1080373887232384, "language_loss": 0.81543791, "learning_rate": 3.991268486371172e-06, "loss": 0.83519018, "num_input_tokens_seen": 10584245, "step": 490, "time_per_iteration": 3.5914063453674316 }, { "auxiliary_loss_clip": 0.0124422, "auxiliary_loss_mlp": 0.01048802, "balance_loss_clip": 1.07499921, "balance_loss_mlp": 1.02956212, "epoch": 0.05903925930379366, "flos": 24644200694400.0, "grad_norm": 2.577514051764543, "language_loss": 0.87884378, "learning_rate": 3.991195625712779e-06, "loss": 0.90177399, "num_input_tokens_seen": 10601210, "step": 491, "time_per_iteration": 2.7429518699645996 }, { "auxiliary_loss_clip": 0.01278736, "auxiliary_loss_mlp": 0.0105394, "balance_loss_clip": 1.08014965, "balance_loss_mlp": 1.03636825, "epoch": 0.05915950219443276, "flos": 21250391045760.0, "grad_norm": 2.0594212163501533, "language_loss": 0.81567532, "learning_rate": 3.991122462991592e-06, "loss": 0.83900207, "num_input_tokens_seen": 10620730, "step": 492, "time_per_iteration": 2.7024953365325928 }, { "auxiliary_loss_clip": 0.01285205, "auxiliary_loss_mlp": 0.01057585, "balance_loss_clip": 1.0804472, "balance_loss_mlp": 1.03814149, "epoch": 0.05927974508507185, "flos": 9902727319680.0, "grad_norm": 3.3014161802116657, "language_loss": 0.81491601, "learning_rate": 3.991048998218712e-06, "loss": 0.83834386, "num_input_tokens_seen": 10634035, "step": 493, "time_per_iteration": 2.600637912750244 }, { "auxiliary_loss_clip": 0.01260471, "auxiliary_loss_mlp": 0.01052664, "balance_loss_clip": 1.07570159, "balance_loss_mlp": 1.03469872, "epoch": 0.05939998797571094, "flos": 18259499232000.0, "grad_norm": 2.0876771730439363, "language_loss": 0.76266289, "learning_rate": 3.990975231405281e-06, "loss": 0.78579414, "num_input_tokens_seen": 10652485, "step": 494, "time_per_iteration": 2.6772079467773438 }, { "auxiliary_loss_clip": 0.01259144, "auxiliary_loss_mlp": 0.01056712, "balance_loss_clip": 1.07514238, "balance_loss_mlp": 1.03838944, "epoch": 0.05952023086635003, "flos": 28256598558720.0, "grad_norm": 1.8520140742331241, "language_loss": 0.7870059, "learning_rate": 3.990901162562491e-06, "loss": 0.81016445, "num_input_tokens_seen": 10673175, "step": 495, "time_per_iteration": 2.669926643371582 }, { "auxiliary_loss_clip": 0.0121936, "auxiliary_loss_mlp": 0.0071552, "balance_loss_clip": 1.06461811, "balance_loss_mlp": 1.00026441, "epoch": 0.05964047375698912, "flos": 14902498045440.0, "grad_norm": 1.8687317390373224, "language_loss": 0.90816104, "learning_rate": 3.9908267917015765e-06, "loss": 0.92750978, "num_input_tokens_seen": 10691235, "step": 496, "time_per_iteration": 2.7369210720062256 }, { "auxiliary_loss_clip": 0.01250225, "auxiliary_loss_mlp": 0.01056613, "balance_loss_clip": 1.07398975, "balance_loss_mlp": 1.03857708, "epoch": 0.059760716647628206, "flos": 23185581206400.0, "grad_norm": 2.135768884126698, "language_loss": 0.93054533, "learning_rate": 3.990752118833821e-06, "loss": 0.95361364, "num_input_tokens_seen": 10708675, "step": 497, "time_per_iteration": 2.6451141834259033 }, { "auxiliary_loss_clip": 0.01278817, "auxiliary_loss_mlp": 0.01063081, "balance_loss_clip": 1.07935452, "balance_loss_mlp": 1.04504466, "epoch": 0.0598809595382673, "flos": 22746968231040.0, "grad_norm": 2.1397804460936336, "language_loss": 0.77885342, "learning_rate": 3.990677143970553e-06, "loss": 0.80227232, "num_input_tokens_seen": 10729485, "step": 498, "time_per_iteration": 2.708834171295166 }, { "auxiliary_loss_clip": 0.01221901, "auxiliary_loss_mlp": 0.01058484, "balance_loss_clip": 1.07448268, "balance_loss_mlp": 1.0398277, "epoch": 0.06000120242890639, "flos": 22127221946880.0, "grad_norm": 1.8700484443629015, "language_loss": 0.81116486, "learning_rate": 3.990601867123144e-06, "loss": 0.83396876, "num_input_tokens_seen": 10749210, "step": 499, "time_per_iteration": 2.7078185081481934 }, { "auxiliary_loss_clip": 0.01207746, "auxiliary_loss_mlp": 0.01044295, "balance_loss_clip": 1.07138264, "balance_loss_mlp": 1.02627063, "epoch": 0.06012144531954548, "flos": 19171773878400.0, "grad_norm": 2.376883466397464, "language_loss": 0.84923697, "learning_rate": 3.990526288303014e-06, "loss": 0.87175739, "num_input_tokens_seen": 10768000, "step": 500, "time_per_iteration": 2.758124828338623 }, { "auxiliary_loss_clip": 0.01243259, "auxiliary_loss_mlp": 0.00714208, "balance_loss_clip": 1.07530832, "balance_loss_mlp": 1.00016773, "epoch": 0.06024168821018457, "flos": 22783345729920.0, "grad_norm": 1.9520870138706334, "language_loss": 0.90725565, "learning_rate": 3.9904504075216295e-06, "loss": 0.92683029, "num_input_tokens_seen": 10788760, "step": 501, "time_per_iteration": 2.7273459434509277 }, { "auxiliary_loss_clip": 0.01222708, "auxiliary_loss_mlp": 0.01061183, "balance_loss_clip": 1.06939018, "balance_loss_mlp": 1.04244328, "epoch": 0.06036193110082366, "flos": 18770687637120.0, "grad_norm": 2.377851943734658, "language_loss": 0.94002676, "learning_rate": 3.990374224790501e-06, "loss": 0.96286559, "num_input_tokens_seen": 10806965, "step": 502, "time_per_iteration": 2.731008291244507 }, { "auxiliary_loss_clip": 0.01247405, "auxiliary_loss_mlp": 0.01055884, "balance_loss_clip": 1.07877362, "balance_loss_mlp": 1.03812194, "epoch": 0.06048217399146275, "flos": 17201570935680.0, "grad_norm": 1.9083661077467644, "language_loss": 0.70645821, "learning_rate": 3.990297740121185e-06, "loss": 0.72949111, "num_input_tokens_seen": 10824900, "step": 503, "time_per_iteration": 2.6919174194335938 }, { "auxiliary_loss_clip": 0.01262512, "auxiliary_loss_mlp": 0.0071509, "balance_loss_clip": 1.07611048, "balance_loss_mlp": 1.0001297, "epoch": 0.06060241688210185, "flos": 24024131187840.0, "grad_norm": 1.7506038103388635, "language_loss": 0.7826618, "learning_rate": 3.990220953525284e-06, "loss": 0.80243778, "num_input_tokens_seen": 10842010, "step": 504, "time_per_iteration": 2.7195844650268555 }, { "auxiliary_loss_clip": 0.01226742, "auxiliary_loss_mlp": 0.01050432, "balance_loss_clip": 1.06825984, "balance_loss_mlp": 1.03145361, "epoch": 0.06072265977274094, "flos": 14611190745600.0, "grad_norm": 2.6378220459195023, "language_loss": 0.7382071, "learning_rate": 3.9901438650144465e-06, "loss": 0.76097882, "num_input_tokens_seen": 10858260, "step": 505, "time_per_iteration": 2.691354513168335 }, { "auxiliary_loss_clip": 0.01247506, "auxiliary_loss_mlp": 0.01050882, "balance_loss_clip": 1.07272196, "balance_loss_mlp": 1.03244066, "epoch": 0.06084290266338003, "flos": 20558284813440.0, "grad_norm": 2.38902160774315, "language_loss": 0.91329455, "learning_rate": 3.990066474600367e-06, "loss": 0.93627846, "num_input_tokens_seen": 10876230, "step": 506, "time_per_iteration": 2.620168685913086 }, { "auxiliary_loss_clip": 0.01239415, "auxiliary_loss_mlp": 0.01044102, "balance_loss_clip": 1.06821084, "balance_loss_mlp": 1.02599418, "epoch": 0.06096314555401912, "flos": 22309217182080.0, "grad_norm": 2.4377351539300687, "language_loss": 0.67860603, "learning_rate": 3.989988782294786e-06, "loss": 0.70144129, "num_input_tokens_seen": 10896320, "step": 507, "time_per_iteration": 2.7379374504089355 }, { "auxiliary_loss_clip": 0.01201592, "auxiliary_loss_mlp": 0.01054099, "balance_loss_clip": 1.06647205, "balance_loss_mlp": 1.03559732, "epoch": 0.06108338844465821, "flos": 19131374056320.0, "grad_norm": 1.720037759276805, "language_loss": 0.94856054, "learning_rate": 3.989910788109489e-06, "loss": 0.97111744, "num_input_tokens_seen": 10912970, "step": 508, "time_per_iteration": 2.685523271560669 }, { "auxiliary_loss_clip": 0.01222171, "auxiliary_loss_mlp": 0.01051627, "balance_loss_clip": 1.07060492, "balance_loss_mlp": 1.03399658, "epoch": 0.0612036313352973, "flos": 33584018169600.0, "grad_norm": 2.3238256824484016, "language_loss": 0.75141585, "learning_rate": 3.989832492056307e-06, "loss": 0.77415383, "num_input_tokens_seen": 10933995, "step": 509, "time_per_iteration": 2.8122622966766357 }, { "auxiliary_loss_clip": 0.01257077, "auxiliary_loss_mlp": 0.01051377, "balance_loss_clip": 1.07538497, "balance_loss_mlp": 1.0328877, "epoch": 0.06132387422593639, "flos": 27490552179840.0, "grad_norm": 2.501845734891576, "language_loss": 0.80834699, "learning_rate": 3.989753894147119e-06, "loss": 0.83143151, "num_input_tokens_seen": 10954120, "step": 510, "time_per_iteration": 2.699174165725708 }, { "auxiliary_loss_clip": 0.01251704, "auxiliary_loss_mlp": 0.01053049, "balance_loss_clip": 1.07740712, "balance_loss_mlp": 1.0362289, "epoch": 0.061444117116575485, "flos": 25885057979520.0, "grad_norm": 1.7037118668162317, "language_loss": 0.80195975, "learning_rate": 3.989674994393846e-06, "loss": 0.82500732, "num_input_tokens_seen": 10973595, "step": 511, "time_per_iteration": 2.7327544689178467 }, { "auxiliary_loss_clip": 0.01253457, "auxiliary_loss_mlp": 0.01053785, "balance_loss_clip": 1.07475865, "balance_loss_mlp": 1.03500962, "epoch": 0.061564360007214575, "flos": 28512031150080.0, "grad_norm": 2.6561317373286535, "language_loss": 0.93780154, "learning_rate": 3.98959579280846e-06, "loss": 0.96087396, "num_input_tokens_seen": 10991995, "step": 512, "time_per_iteration": 2.7479655742645264 }, { "auxiliary_loss_clip": 0.01188904, "auxiliary_loss_mlp": 0.01051792, "balance_loss_clip": 1.06834674, "balance_loss_mlp": 1.03491187, "epoch": 0.061684602897853665, "flos": 12094355652480.0, "grad_norm": 2.2717723325320938, "language_loss": 0.82824343, "learning_rate": 3.989516289402973e-06, "loss": 0.85065043, "num_input_tokens_seen": 11007625, "step": 513, "time_per_iteration": 3.6274962425231934 }, { "auxiliary_loss_clip": 0.01161501, "auxiliary_loss_mlp": 0.01049566, "balance_loss_clip": 1.05664194, "balance_loss_mlp": 1.03096914, "epoch": 0.061804845788492754, "flos": 19532639865600.0, "grad_norm": 3.4401013162234335, "language_loss": 0.80490708, "learning_rate": 3.989436484189447e-06, "loss": 0.82701778, "num_input_tokens_seen": 11025570, "step": 514, "time_per_iteration": 3.7154335975646973 }, { "auxiliary_loss_clip": 0.01258457, "auxiliary_loss_mlp": 0.0105424, "balance_loss_clip": 1.07395315, "balance_loss_mlp": 1.03612065, "epoch": 0.061925088679131844, "flos": 15341111020800.0, "grad_norm": 2.5387610357364303, "language_loss": 0.81313717, "learning_rate": 3.9893563771799885e-06, "loss": 0.83626413, "num_input_tokens_seen": 11042045, "step": 515, "time_per_iteration": 3.6318576335906982 }, { "auxiliary_loss_clip": 0.01277831, "auxiliary_loss_mlp": 0.01046508, "balance_loss_clip": 1.07816005, "balance_loss_mlp": 1.02780449, "epoch": 0.062045331569770934, "flos": 25919927107200.0, "grad_norm": 2.3015073882499686, "language_loss": 0.86290318, "learning_rate": 3.989275968386749e-06, "loss": 0.88614655, "num_input_tokens_seen": 11059955, "step": 516, "time_per_iteration": 3.6264593601226807 }, { "auxiliary_loss_clip": 0.01225687, "auxiliary_loss_mlp": 0.0105456, "balance_loss_clip": 1.06754041, "balance_loss_mlp": 1.03603435, "epoch": 0.06216557446041003, "flos": 28110621686400.0, "grad_norm": 2.0663997889066685, "language_loss": 0.76926935, "learning_rate": 3.989195257821926e-06, "loss": 0.79207182, "num_input_tokens_seen": 11078440, "step": 517, "time_per_iteration": 2.7407803535461426 }, { "auxiliary_loss_clip": 0.01238235, "auxiliary_loss_mlp": 0.01064597, "balance_loss_clip": 1.07582688, "balance_loss_mlp": 1.04650092, "epoch": 0.06228581735104912, "flos": 23478181395840.0, "grad_norm": 2.162712785505868, "language_loss": 0.84136301, "learning_rate": 3.989114245497765e-06, "loss": 0.86439133, "num_input_tokens_seen": 11098240, "step": 518, "time_per_iteration": 2.6641149520874023 }, { "auxiliary_loss_clip": 0.01257596, "auxiliary_loss_mlp": 0.01051685, "balance_loss_clip": 1.07111776, "balance_loss_mlp": 1.03392243, "epoch": 0.06240606024168821, "flos": 15195205975680.0, "grad_norm": 2.163711881979113, "language_loss": 0.94606602, "learning_rate": 3.989032931426554e-06, "loss": 0.96915889, "num_input_tokens_seen": 11115395, "step": 519, "time_per_iteration": 2.5872089862823486 }, { "auxiliary_loss_clip": 0.01238556, "auxiliary_loss_mlp": 0.01047111, "balance_loss_clip": 1.07170379, "balance_loss_mlp": 1.02908647, "epoch": 0.06252630313232731, "flos": 20631829910400.0, "grad_norm": 2.334230300296425, "language_loss": 0.86605507, "learning_rate": 3.9889513156206295e-06, "loss": 0.88891178, "num_input_tokens_seen": 11134835, "step": 520, "time_per_iteration": 2.715182304382324 }, { "auxiliary_loss_clip": 0.01220445, "auxiliary_loss_mlp": 0.01050773, "balance_loss_clip": 1.07037973, "balance_loss_mlp": 1.03251064, "epoch": 0.06264654602296639, "flos": 20778058177920.0, "grad_norm": 3.3132509564977783, "language_loss": 0.73886025, "learning_rate": 3.988869398092371e-06, "loss": 0.76157242, "num_input_tokens_seen": 11154745, "step": 521, "time_per_iteration": 2.6820688247680664 }, { "auxiliary_loss_clip": 0.012395, "auxiliary_loss_mlp": 0.01051131, "balance_loss_clip": 1.07333899, "balance_loss_mlp": 1.03239131, "epoch": 0.06276678891360549, "flos": 29605798241280.0, "grad_norm": 2.4352755837182936, "language_loss": 0.7874732, "learning_rate": 3.988787178854206e-06, "loss": 0.81037945, "num_input_tokens_seen": 11174280, "step": 522, "time_per_iteration": 2.7465972900390625 }, { "auxiliary_loss_clip": 0.01278604, "auxiliary_loss_mlp": 0.0104783, "balance_loss_clip": 1.08056819, "balance_loss_mlp": 1.03005576, "epoch": 0.06288703180424457, "flos": 22126288193280.0, "grad_norm": 2.3463116557900614, "language_loss": 0.87255573, "learning_rate": 3.988704657918608e-06, "loss": 0.89582008, "num_input_tokens_seen": 11193340, "step": 523, "time_per_iteration": 2.5997776985168457 }, { "auxiliary_loss_clip": 0.01260001, "auxiliary_loss_mlp": 0.01045652, "balance_loss_clip": 1.07775676, "balance_loss_mlp": 1.02851009, "epoch": 0.06300727469488367, "flos": 14976689587200.0, "grad_norm": 2.3689194830520193, "language_loss": 0.79742539, "learning_rate": 3.988621835298094e-06, "loss": 0.8204819, "num_input_tokens_seen": 11210555, "step": 524, "time_per_iteration": 2.6202402114868164 }, { "auxiliary_loss_clip": 0.01271956, "auxiliary_loss_mlp": 0.01056606, "balance_loss_clip": 1.07731247, "balance_loss_mlp": 1.03927255, "epoch": 0.06312751758552275, "flos": 24535391420160.0, "grad_norm": 1.9810608414536761, "language_loss": 0.91752368, "learning_rate": 3.988538711005229e-06, "loss": 0.94080931, "num_input_tokens_seen": 11230010, "step": 525, "time_per_iteration": 2.645916700363159 }, { "auxiliary_loss_clip": 0.01249549, "auxiliary_loss_mlp": 0.01055081, "balance_loss_clip": 1.07350302, "balance_loss_mlp": 1.0382961, "epoch": 0.06324776047616185, "flos": 21507008785920.0, "grad_norm": 2.040129619713735, "language_loss": 0.88052374, "learning_rate": 3.988455285052622e-06, "loss": 0.90357006, "num_input_tokens_seen": 11246190, "step": 526, "time_per_iteration": 2.6968915462493896 }, { "auxiliary_loss_clip": 0.01250542, "auxiliary_loss_mlp": 0.01047212, "balance_loss_clip": 1.07477212, "balance_loss_mlp": 1.02850819, "epoch": 0.06336800336680094, "flos": 21688034353920.0, "grad_norm": 2.107375228767634, "language_loss": 0.83779883, "learning_rate": 3.98837155745293e-06, "loss": 0.86077642, "num_input_tokens_seen": 11264230, "step": 527, "time_per_iteration": 2.659773826599121 }, { "auxiliary_loss_clip": 0.01254818, "auxiliary_loss_mlp": 0.01051167, "balance_loss_clip": 1.07589495, "balance_loss_mlp": 1.03320181, "epoch": 0.06348824625744003, "flos": 19500895221120.0, "grad_norm": 2.082339784944378, "language_loss": 0.76243734, "learning_rate": 3.988287528218854e-06, "loss": 0.78549719, "num_input_tokens_seen": 11283015, "step": 528, "time_per_iteration": 2.6812193393707275 }, { "auxiliary_loss_clip": 0.01254809, "auxiliary_loss_mlp": 0.01054666, "balance_loss_clip": 1.08008265, "balance_loss_mlp": 1.03754747, "epoch": 0.06360848914807912, "flos": 15481233976320.0, "grad_norm": 1.9579831383436737, "language_loss": 0.90375733, "learning_rate": 3.98820319736314e-06, "loss": 0.92685205, "num_input_tokens_seen": 11299630, "step": 529, "time_per_iteration": 2.6310524940490723 }, { "auxiliary_loss_clip": 0.01219846, "auxiliary_loss_mlp": 0.01053614, "balance_loss_clip": 1.06838298, "balance_loss_mlp": 1.03531504, "epoch": 0.0637287320387182, "flos": 20593369422720.0, "grad_norm": 3.131778478630931, "language_loss": 0.85439348, "learning_rate": 3.988118564898582e-06, "loss": 0.87712806, "num_input_tokens_seen": 11319170, "step": 530, "time_per_iteration": 2.749556541442871 }, { "auxiliary_loss_clip": 0.01208291, "auxiliary_loss_mlp": 0.00714963, "balance_loss_clip": 1.07015014, "balance_loss_mlp": 1.00005221, "epoch": 0.0638489749293573, "flos": 17412222245760.0, "grad_norm": 2.6167059790363503, "language_loss": 0.89193308, "learning_rate": 3.988033630838019e-06, "loss": 0.91116565, "num_input_tokens_seen": 11333210, "step": 531, "time_per_iteration": 2.7117583751678467 }, { "auxiliary_loss_clip": 0.01258465, "auxiliary_loss_mlp": 0.01055444, "balance_loss_clip": 1.07760322, "balance_loss_mlp": 1.03691888, "epoch": 0.0639692178199964, "flos": 23807661874560.0, "grad_norm": 2.405821872130984, "language_loss": 0.88105094, "learning_rate": 3.987948395194334e-06, "loss": 0.90419, "num_input_tokens_seen": 11355590, "step": 532, "time_per_iteration": 2.743867874145508 }, { "auxiliary_loss_clip": 0.01245781, "auxiliary_loss_mlp": 0.01048956, "balance_loss_clip": 1.07125807, "balance_loss_mlp": 1.03108644, "epoch": 0.06408946071063548, "flos": 18477225521280.0, "grad_norm": 2.366010366082179, "language_loss": 0.76696849, "learning_rate": 3.987862857980458e-06, "loss": 0.78991592, "num_input_tokens_seen": 11371535, "step": 533, "time_per_iteration": 2.6707003116607666 }, { "auxiliary_loss_clip": 0.01224829, "auxiliary_loss_mlp": 0.01055652, "balance_loss_clip": 1.07216167, "balance_loss_mlp": 1.03718615, "epoch": 0.06420970360127458, "flos": 27162220936320.0, "grad_norm": 2.6219625693269704, "language_loss": 0.76633513, "learning_rate": 3.987777019209368e-06, "loss": 0.78914005, "num_input_tokens_seen": 11392050, "step": 534, "time_per_iteration": 2.8582372665405273 }, { "auxiliary_loss_clip": 0.01271055, "auxiliary_loss_mlp": 0.01051521, "balance_loss_clip": 1.07543242, "balance_loss_mlp": 1.03466439, "epoch": 0.06432994649191366, "flos": 23659673840640.0, "grad_norm": 1.8897715420710488, "language_loss": 0.81060123, "learning_rate": 3.987690878894084e-06, "loss": 0.83382696, "num_input_tokens_seen": 11411765, "step": 535, "time_per_iteration": 2.6892237663269043 }, { "auxiliary_loss_clip": 0.01242187, "auxiliary_loss_mlp": 0.01052627, "balance_loss_clip": 1.07356834, "balance_loss_mlp": 1.03411412, "epoch": 0.06445018938255276, "flos": 23403953940480.0, "grad_norm": 3.306513016360243, "language_loss": 0.84914815, "learning_rate": 3.987604437047673e-06, "loss": 0.8720963, "num_input_tokens_seen": 11431565, "step": 536, "time_per_iteration": 2.71467924118042 }, { "auxiliary_loss_clip": 0.01252375, "auxiliary_loss_mlp": 0.01058199, "balance_loss_clip": 1.07391524, "balance_loss_mlp": 1.04059219, "epoch": 0.06457043227319184, "flos": 19646692525440.0, "grad_norm": 2.3150695736676603, "language_loss": 0.77523297, "learning_rate": 3.987517693683251e-06, "loss": 0.79833865, "num_input_tokens_seen": 11450140, "step": 537, "time_per_iteration": 2.7136595249176025 }, { "auxiliary_loss_clip": 0.01232595, "auxiliary_loss_mlp": 0.01050152, "balance_loss_clip": 1.07318556, "balance_loss_mlp": 1.03260422, "epoch": 0.06469067516383094, "flos": 16978744915200.0, "grad_norm": 3.0160390987113446, "language_loss": 0.96289569, "learning_rate": 3.9874306488139745e-06, "loss": 0.98572314, "num_input_tokens_seen": 11465400, "step": 538, "time_per_iteration": 2.7577383518218994 }, { "auxiliary_loss_clip": 0.01215017, "auxiliary_loss_mlp": 0.01054358, "balance_loss_clip": 1.06954896, "balance_loss_mlp": 1.0365479, "epoch": 0.06481091805447003, "flos": 23296401642240.0, "grad_norm": 2.495564148983867, "language_loss": 0.87794077, "learning_rate": 3.987343302453049e-06, "loss": 0.90063453, "num_input_tokens_seen": 11486675, "step": 539, "time_per_iteration": 3.756253242492676 }, { "auxiliary_loss_clip": 0.01236194, "auxiliary_loss_mlp": 0.01053858, "balance_loss_clip": 1.07422233, "balance_loss_mlp": 1.03602386, "epoch": 0.06493116094510912, "flos": 29172356824320.0, "grad_norm": 1.7034163027039197, "language_loss": 0.82602149, "learning_rate": 3.987255654613724e-06, "loss": 0.84892201, "num_input_tokens_seen": 11510440, "step": 540, "time_per_iteration": 3.715973138809204 }, { "auxiliary_loss_clip": 0.01216424, "auxiliary_loss_mlp": 0.01057033, "balance_loss_clip": 1.06851363, "balance_loss_mlp": 1.03888929, "epoch": 0.06505140383574821, "flos": 19865065259520.0, "grad_norm": 3.4810889297948258, "language_loss": 0.7053895, "learning_rate": 3.987167705309296e-06, "loss": 0.72812414, "num_input_tokens_seen": 11529715, "step": 541, "time_per_iteration": 3.6554455757141113 }, { "auxiliary_loss_clip": 0.01260003, "auxiliary_loss_mlp": 0.00714544, "balance_loss_clip": 1.07603717, "balance_loss_mlp": 0.99991751, "epoch": 0.0651716467263873, "flos": 17924703540480.0, "grad_norm": 2.9153925358868853, "language_loss": 0.95213842, "learning_rate": 3.987079454553108e-06, "loss": 0.97188395, "num_input_tokens_seen": 11547665, "step": 542, "time_per_iteration": 3.576323986053467 }, { "auxiliary_loss_clip": 0.01213191, "auxiliary_loss_mlp": 0.0105351, "balance_loss_clip": 1.07148755, "balance_loss_mlp": 1.03565216, "epoch": 0.0652918896170264, "flos": 20842840356480.0, "grad_norm": 2.616837345895366, "language_loss": 0.91009337, "learning_rate": 3.986990902358546e-06, "loss": 0.93276036, "num_input_tokens_seen": 11564605, "step": 543, "time_per_iteration": 2.7515125274658203 }, { "auxiliary_loss_clip": 0.01253993, "auxiliary_loss_mlp": 0.01048662, "balance_loss_clip": 1.07112706, "balance_loss_mlp": 1.03033996, "epoch": 0.06541213250766549, "flos": 21872507627520.0, "grad_norm": 2.008659500823308, "language_loss": 0.93542892, "learning_rate": 3.986902048739045e-06, "loss": 0.9584555, "num_input_tokens_seen": 11584550, "step": 544, "time_per_iteration": 2.6880974769592285 }, { "auxiliary_loss_clip": 0.01239202, "auxiliary_loss_mlp": 0.01049999, "balance_loss_clip": 1.07255995, "balance_loss_mlp": 1.03087807, "epoch": 0.06553237539830457, "flos": 23110743219840.0, "grad_norm": 2.847349982410298, "language_loss": 0.79958576, "learning_rate": 3.986812893708082e-06, "loss": 0.82247782, "num_input_tokens_seen": 11600740, "step": 545, "time_per_iteration": 2.792034864425659 }, { "auxiliary_loss_clip": 0.01240674, "auxiliary_loss_mlp": 0.01056613, "balance_loss_clip": 1.06928504, "balance_loss_mlp": 1.03690743, "epoch": 0.06565261828894367, "flos": 17923769786880.0, "grad_norm": 3.521308614406233, "language_loss": 0.81380081, "learning_rate": 3.9867234372791826e-06, "loss": 0.83677363, "num_input_tokens_seen": 11618695, "step": 546, "time_per_iteration": 2.6918418407440186 }, { "auxiliary_loss_clip": 0.0125557, "auxiliary_loss_mlp": 0.01043964, "balance_loss_clip": 1.07348537, "balance_loss_mlp": 1.02701306, "epoch": 0.06577286117958275, "flos": 22783058421120.0, "grad_norm": 1.6089987801075099, "language_loss": 0.87238717, "learning_rate": 3.986633679465918e-06, "loss": 0.89538252, "num_input_tokens_seen": 11638850, "step": 547, "time_per_iteration": 2.670776605606079 }, { "auxiliary_loss_clip": 0.01202243, "auxiliary_loss_mlp": 0.01053439, "balance_loss_clip": 1.06921935, "balance_loss_mlp": 1.03558147, "epoch": 0.06589310407022185, "flos": 23696194993920.0, "grad_norm": 2.1890166747829554, "language_loss": 0.80702287, "learning_rate": 3.986543620281904e-06, "loss": 0.82957971, "num_input_tokens_seen": 11658500, "step": 548, "time_per_iteration": 2.7367374897003174 }, { "auxiliary_loss_clip": 0.01222163, "auxiliary_loss_mlp": 0.0104934, "balance_loss_clip": 1.06909466, "balance_loss_mlp": 1.03210211, "epoch": 0.06601334696086093, "flos": 26864772410880.0, "grad_norm": 1.7428066778367532, "language_loss": 0.91182458, "learning_rate": 3.986453259740802e-06, "loss": 0.93453956, "num_input_tokens_seen": 11676670, "step": 549, "time_per_iteration": 2.779442071914673 }, { "auxiliary_loss_clip": 0.01236441, "auxiliary_loss_mlp": 0.01050988, "balance_loss_clip": 1.07559228, "balance_loss_mlp": 1.03247523, "epoch": 0.06613358985150003, "flos": 12567694101120.0, "grad_norm": 3.406485462345537, "language_loss": 0.79246318, "learning_rate": 3.986362597856319e-06, "loss": 0.81533748, "num_input_tokens_seen": 11693170, "step": 550, "time_per_iteration": 2.6954545974731445 }, { "auxiliary_loss_clip": 0.01230429, "auxiliary_loss_mlp": 0.00715571, "balance_loss_clip": 1.07041597, "balance_loss_mlp": 0.99994665, "epoch": 0.06625383274213913, "flos": 18332505624960.0, "grad_norm": 3.2830430527582055, "language_loss": 0.8164413, "learning_rate": 3.986271634642211e-06, "loss": 0.83590126, "num_input_tokens_seen": 11710150, "step": 551, "time_per_iteration": 2.6322500705718994 }, { "auxiliary_loss_clip": 0.01269104, "auxiliary_loss_mlp": 0.01064397, "balance_loss_clip": 1.07459545, "balance_loss_mlp": 1.04637253, "epoch": 0.06637407563277821, "flos": 15375585098880.0, "grad_norm": 2.1880052228548217, "language_loss": 0.82047939, "learning_rate": 3.986180370112274e-06, "loss": 0.84381437, "num_input_tokens_seen": 11726670, "step": 552, "time_per_iteration": 2.640148162841797 }, { "auxiliary_loss_clip": 0.01256899, "auxiliary_loss_mlp": 0.00714656, "balance_loss_clip": 1.07508361, "balance_loss_mlp": 0.9999283, "epoch": 0.0664943185234173, "flos": 24025244509440.0, "grad_norm": 1.8056855370356018, "language_loss": 0.74536735, "learning_rate": 3.986088804280354e-06, "loss": 0.7650829, "num_input_tokens_seen": 11746400, "step": 553, "time_per_iteration": 2.6581547260284424 }, { "auxiliary_loss_clip": 0.01237615, "auxiliary_loss_mlp": 0.01050843, "balance_loss_clip": 1.0724647, "balance_loss_mlp": 1.03331876, "epoch": 0.06661456141405639, "flos": 20957503547520.0, "grad_norm": 2.5076746188009893, "language_loss": 0.93651551, "learning_rate": 3.985996937160342e-06, "loss": 0.95940006, "num_input_tokens_seen": 11765590, "step": 554, "time_per_iteration": 2.7199862003326416 }, { "auxiliary_loss_clip": 0.01250924, "auxiliary_loss_mlp": 0.01051223, "balance_loss_clip": 1.07661366, "balance_loss_mlp": 1.03425932, "epoch": 0.06673480430469549, "flos": 52223953322880.0, "grad_norm": 2.0741483182741134, "language_loss": 0.69007272, "learning_rate": 3.985904768766173e-06, "loss": 0.71309423, "num_input_tokens_seen": 11788365, "step": 555, "time_per_iteration": 2.9300825595855713 }, { "auxiliary_loss_clip": 0.01222364, "auxiliary_loss_mlp": 0.01051155, "balance_loss_clip": 1.07224059, "balance_loss_mlp": 1.03249884, "epoch": 0.06685504719533458, "flos": 16217079995520.0, "grad_norm": 2.6062667576787573, "language_loss": 0.7620061, "learning_rate": 3.98581229911183e-06, "loss": 0.78474128, "num_input_tokens_seen": 11807285, "step": 556, "time_per_iteration": 2.706096887588501 }, { "auxiliary_loss_clip": 0.01251532, "auxiliary_loss_mlp": 0.01053712, "balance_loss_clip": 1.07067382, "balance_loss_mlp": 1.0367012, "epoch": 0.06697529008597367, "flos": 22491535639680.0, "grad_norm": 2.258622454550287, "language_loss": 0.92391598, "learning_rate": 3.985719528211341e-06, "loss": 0.94696844, "num_input_tokens_seen": 11826655, "step": 557, "time_per_iteration": 2.648620367050171 }, { "auxiliary_loss_clip": 0.01173125, "auxiliary_loss_mlp": 0.01030353, "balance_loss_clip": 1.0756042, "balance_loss_mlp": 1.02086377, "epoch": 0.06709553297661276, "flos": 62688216936960.0, "grad_norm": 0.8454038288459034, "language_loss": 0.6300956, "learning_rate": 3.985626456078777e-06, "loss": 0.65213037, "num_input_tokens_seen": 11891310, "step": 558, "time_per_iteration": 3.373274803161621 }, { "auxiliary_loss_clip": 0.01219894, "auxiliary_loss_mlp": 0.01052093, "balance_loss_clip": 1.07109737, "balance_loss_mlp": 1.034235, "epoch": 0.06721577586725185, "flos": 11216590997760.0, "grad_norm": 2.815356906488187, "language_loss": 0.86157006, "learning_rate": 3.985533082728259e-06, "loss": 0.88428992, "num_input_tokens_seen": 11906965, "step": 559, "time_per_iteration": 2.700531244277954 }, { "auxiliary_loss_clip": 0.01273638, "auxiliary_loss_mlp": 0.01055034, "balance_loss_clip": 1.07645869, "balance_loss_mlp": 1.03695011, "epoch": 0.06733601875789094, "flos": 25922189664000.0, "grad_norm": 1.8748398200267495, "language_loss": 0.74551725, "learning_rate": 3.985439408173951e-06, "loss": 0.76880401, "num_input_tokens_seen": 11927190, "step": 560, "time_per_iteration": 2.636268377304077 }, { "auxiliary_loss_clip": 0.01273956, "auxiliary_loss_mlp": 0.01044669, "balance_loss_clip": 1.07628763, "balance_loss_mlp": 1.02653706, "epoch": 0.06745626164853002, "flos": 20813645577600.0, "grad_norm": 2.9959703111195717, "language_loss": 0.70556992, "learning_rate": 3.9853454324300634e-06, "loss": 0.72875619, "num_input_tokens_seen": 11946400, "step": 561, "time_per_iteration": 2.725990056991577 }, { "auxiliary_loss_clip": 0.01191777, "auxiliary_loss_mlp": 0.0104969, "balance_loss_clip": 1.06780231, "balance_loss_mlp": 1.03171325, "epoch": 0.06757650453916912, "flos": 19829262378240.0, "grad_norm": 2.294758237075575, "language_loss": 0.77906501, "learning_rate": 3.985251155510852e-06, "loss": 0.8014797, "num_input_tokens_seen": 11965430, "step": 562, "time_per_iteration": 2.7399208545684814 }, { "auxiliary_loss_clip": 0.01200335, "auxiliary_loss_mlp": 0.01043607, "balance_loss_clip": 1.07016325, "balance_loss_mlp": 1.02588022, "epoch": 0.06769674742980822, "flos": 25739224761600.0, "grad_norm": 1.8321573529318265, "language_loss": 0.80146587, "learning_rate": 3.98515657743062e-06, "loss": 0.82390523, "num_input_tokens_seen": 11984895, "step": 563, "time_per_iteration": 2.776761770248413 }, { "auxiliary_loss_clip": 0.01232853, "auxiliary_loss_mlp": 0.01051765, "balance_loss_clip": 1.06771159, "balance_loss_mlp": 1.03479004, "epoch": 0.0678169903204473, "flos": 13074788355840.0, "grad_norm": 2.0041410917594615, "language_loss": 0.77865124, "learning_rate": 3.985061698203711e-06, "loss": 0.80149734, "num_input_tokens_seen": 12002010, "step": 564, "time_per_iteration": 2.621670961380005 }, { "auxiliary_loss_clip": 0.01193422, "auxiliary_loss_mlp": 0.01014688, "balance_loss_clip": 1.07495284, "balance_loss_mlp": 1.00572324, "epoch": 0.0679372332110864, "flos": 70865830788480.0, "grad_norm": 1.1295558992260986, "language_loss": 0.63877678, "learning_rate": 3.984966517844523e-06, "loss": 0.66085792, "num_input_tokens_seen": 12057255, "step": 565, "time_per_iteration": 4.039193153381348 }, { "auxiliary_loss_clip": 0.01276079, "auxiliary_loss_mlp": 0.0105945, "balance_loss_clip": 1.07843328, "balance_loss_mlp": 1.04079425, "epoch": 0.06805747610172548, "flos": 28256418990720.0, "grad_norm": 2.8276058406063354, "language_loss": 0.80422848, "learning_rate": 3.984871036367492e-06, "loss": 0.82758379, "num_input_tokens_seen": 12077280, "step": 566, "time_per_iteration": 2.717585802078247 }, { "auxiliary_loss_clip": 0.01252376, "auxiliary_loss_mlp": 0.00715131, "balance_loss_clip": 1.07528913, "balance_loss_mlp": 0.9999547, "epoch": 0.06817771899236458, "flos": 20120533764480.0, "grad_norm": 1.9017975106044864, "language_loss": 0.82866448, "learning_rate": 3.984775253787102e-06, "loss": 0.84833956, "num_input_tokens_seen": 12095570, "step": 567, "time_per_iteration": 3.5732264518737793 }, { "auxiliary_loss_clip": 0.01256299, "auxiliary_loss_mlp": 0.01047688, "balance_loss_clip": 1.07286739, "balance_loss_mlp": 1.03067684, "epoch": 0.06829796188300366, "flos": 17930629284480.0, "grad_norm": 3.2132470692431596, "language_loss": 0.87708759, "learning_rate": 3.984679170117885e-06, "loss": 0.90012747, "num_input_tokens_seen": 12111775, "step": 568, "time_per_iteration": 3.572661876678467 }, { "auxiliary_loss_clip": 0.01249255, "auxiliary_loss_mlp": 0.01048432, "balance_loss_clip": 1.07187855, "balance_loss_mlp": 1.03026474, "epoch": 0.06841820477364276, "flos": 14501627285760.0, "grad_norm": 2.575512248766268, "language_loss": 0.78337938, "learning_rate": 3.984582785374415e-06, "loss": 0.80635619, "num_input_tokens_seen": 12129215, "step": 569, "time_per_iteration": 2.6956608295440674 }, { "auxiliary_loss_clip": 0.01235096, "auxiliary_loss_mlp": 0.00714708, "balance_loss_clip": 1.07142353, "balance_loss_mlp": 0.99994123, "epoch": 0.06853844766428185, "flos": 21938474954880.0, "grad_norm": 2.3488115528671663, "language_loss": 0.80407953, "learning_rate": 3.9844860995713155e-06, "loss": 0.82357758, "num_input_tokens_seen": 12148755, "step": 570, "time_per_iteration": 2.7249338626861572 }, { "auxiliary_loss_clip": 0.01255535, "auxiliary_loss_mlp": 0.01054165, "balance_loss_clip": 1.07975507, "balance_loss_mlp": 1.03712976, "epoch": 0.06865869055492094, "flos": 16800628348800.0, "grad_norm": 5.039861508862726, "language_loss": 0.82784605, "learning_rate": 3.9843891127232524e-06, "loss": 0.85094309, "num_input_tokens_seen": 12166290, "step": 571, "time_per_iteration": 2.635885715484619 }, { "auxiliary_loss_clip": 0.01186891, "auxiliary_loss_mlp": 0.01052549, "balance_loss_clip": 1.06457627, "balance_loss_mlp": 1.0350368, "epoch": 0.06877893344556003, "flos": 19937281553280.0, "grad_norm": 2.561500848293181, "language_loss": 0.67131793, "learning_rate": 3.984291824844938e-06, "loss": 0.69371235, "num_input_tokens_seen": 12181385, "step": 572, "time_per_iteration": 2.7747137546539307 }, { "auxiliary_loss_clip": 0.01273971, "auxiliary_loss_mlp": 0.01046873, "balance_loss_clip": 1.07752645, "balance_loss_mlp": 1.02996945, "epoch": 0.06889917633619912, "flos": 23039388852480.0, "grad_norm": 4.4896090902335715, "language_loss": 0.84915996, "learning_rate": 3.984194235951132e-06, "loss": 0.8723684, "num_input_tokens_seen": 12197530, "step": 573, "time_per_iteration": 2.650352954864502 }, { "auxiliary_loss_clip": 0.0127653, "auxiliary_loss_mlp": 0.01062066, "balance_loss_clip": 1.08136344, "balance_loss_mlp": 1.04480457, "epoch": 0.06901941922683821, "flos": 20960556203520.0, "grad_norm": 2.5402010831864423, "language_loss": 0.84635252, "learning_rate": 3.9840963460566375e-06, "loss": 0.86973846, "num_input_tokens_seen": 12216310, "step": 574, "time_per_iteration": 2.579244375228882 }, { "auxiliary_loss_clip": 0.01164058, "auxiliary_loss_mlp": 0.01046657, "balance_loss_clip": 1.06298292, "balance_loss_mlp": 1.02913308, "epoch": 0.06913966211747731, "flos": 24821850384000.0, "grad_norm": 1.6953075508922635, "language_loss": 0.89666414, "learning_rate": 3.983998155176305e-06, "loss": 0.91877127, "num_input_tokens_seen": 12236670, "step": 575, "time_per_iteration": 2.838365316390991 }, { "auxiliary_loss_clip": 0.01190443, "auxiliary_loss_mlp": 0.01009258, "balance_loss_clip": 1.07299685, "balance_loss_mlp": 1.00048375, "epoch": 0.06925990500811639, "flos": 58367446957440.0, "grad_norm": 0.8700462065640411, "language_loss": 0.57035381, "learning_rate": 3.9838996633250305e-06, "loss": 0.59235078, "num_input_tokens_seen": 12297185, "step": 576, "time_per_iteration": 3.1418185234069824 }, { "auxiliary_loss_clip": 0.01250358, "auxiliary_loss_mlp": 0.01051772, "balance_loss_clip": 1.07220399, "balance_loss_mlp": 1.03538084, "epoch": 0.06938014789875549, "flos": 12749940731520.0, "grad_norm": 2.760631897131552, "language_loss": 0.88194972, "learning_rate": 3.983800870517753e-06, "loss": 0.904971, "num_input_tokens_seen": 12313975, "step": 577, "time_per_iteration": 2.654129981994629 }, { "auxiliary_loss_clip": 0.01252095, "auxiliary_loss_mlp": 0.01054297, "balance_loss_clip": 1.0789597, "balance_loss_mlp": 1.03790522, "epoch": 0.06950039078939457, "flos": 22820226019200.0, "grad_norm": 2.964147694036408, "language_loss": 0.78562474, "learning_rate": 3.983701776769463e-06, "loss": 0.80868864, "num_input_tokens_seen": 12331385, "step": 578, "time_per_iteration": 2.688281536102295 }, { "auxiliary_loss_clip": 0.01237111, "auxiliary_loss_mlp": 0.01047826, "balance_loss_clip": 1.07204747, "balance_loss_mlp": 1.03040957, "epoch": 0.06962063368003367, "flos": 21941348042880.0, "grad_norm": 3.1086617787882362, "language_loss": 0.85576952, "learning_rate": 3.9836023820951885e-06, "loss": 0.8786189, "num_input_tokens_seen": 12350600, "step": 579, "time_per_iteration": 2.642648935317993 }, { "auxiliary_loss_clip": 0.01211043, "auxiliary_loss_mlp": 0.01046264, "balance_loss_clip": 1.06692731, "balance_loss_mlp": 1.02971745, "epoch": 0.06974087657067275, "flos": 20706021452160.0, "grad_norm": 2.3187187147170802, "language_loss": 0.68518639, "learning_rate": 3.983502686510011e-06, "loss": 0.70775944, "num_input_tokens_seen": 12371430, "step": 580, "time_per_iteration": 2.758848190307617 }, { "auxiliary_loss_clip": 0.01254549, "auxiliary_loss_mlp": 0.00714481, "balance_loss_clip": 1.07310247, "balance_loss_mlp": 0.99990195, "epoch": 0.06986111946131185, "flos": 22638230784000.0, "grad_norm": 1.8654006985442437, "language_loss": 0.73240471, "learning_rate": 3.9834026900290525e-06, "loss": 0.7520951, "num_input_tokens_seen": 12390825, "step": 581, "time_per_iteration": 2.681420087814331 }, { "auxiliary_loss_clip": 0.01270044, "auxiliary_loss_mlp": 0.01050189, "balance_loss_clip": 1.0774231, "balance_loss_mlp": 1.03289175, "epoch": 0.06998136235195095, "flos": 26943453152640.0, "grad_norm": 1.9128762652928324, "language_loss": 0.99890208, "learning_rate": 3.983302392667482e-06, "loss": 1.02210438, "num_input_tokens_seen": 12411670, "step": 582, "time_per_iteration": 2.689894676208496 }, { "auxiliary_loss_clip": 0.0125008, "auxiliary_loss_mlp": 0.0104676, "balance_loss_clip": 1.0768733, "balance_loss_mlp": 1.03070271, "epoch": 0.07010160524259003, "flos": 22492505306880.0, "grad_norm": 1.832987218796697, "language_loss": 0.93621892, "learning_rate": 3.983201794440517e-06, "loss": 0.95918739, "num_input_tokens_seen": 12431245, "step": 583, "time_per_iteration": 2.6742899417877197 }, { "auxiliary_loss_clip": 0.01220318, "auxiliary_loss_mlp": 0.01050469, "balance_loss_clip": 1.06851435, "balance_loss_mlp": 1.03387523, "epoch": 0.07022184813322913, "flos": 18332541538560.0, "grad_norm": 2.6643153721225743, "language_loss": 0.67325425, "learning_rate": 3.9831008953634165e-06, "loss": 0.69596207, "num_input_tokens_seen": 12450535, "step": 584, "time_per_iteration": 2.804217576980591 }, { "auxiliary_loss_clip": 0.01180095, "auxiliary_loss_mlp": 0.01053379, "balance_loss_clip": 1.06299782, "balance_loss_mlp": 1.03606939, "epoch": 0.07034209102386821, "flos": 24675550289280.0, "grad_norm": 2.0959577498633224, "language_loss": 0.81231624, "learning_rate": 3.9829996954514864e-06, "loss": 0.83465099, "num_input_tokens_seen": 12469675, "step": 585, "time_per_iteration": 2.76171875 }, { "auxiliary_loss_clip": 0.01239716, "auxiliary_loss_mlp": 0.0104481, "balance_loss_clip": 1.07201624, "balance_loss_mlp": 1.02777457, "epoch": 0.0704623339145073, "flos": 25995878415360.0, "grad_norm": 1.9640172895441583, "language_loss": 0.84129786, "learning_rate": 3.982898194720079e-06, "loss": 0.86414313, "num_input_tokens_seen": 12490405, "step": 586, "time_per_iteration": 2.7000339031219482 }, { "auxiliary_loss_clip": 0.01227383, "auxiliary_loss_mlp": 0.00714892, "balance_loss_clip": 1.07195508, "balance_loss_mlp": 0.99989098, "epoch": 0.0705825768051464, "flos": 25338318088320.0, "grad_norm": 2.034554658657392, "language_loss": 0.82056296, "learning_rate": 3.982796393184592e-06, "loss": 0.83998573, "num_input_tokens_seen": 12509485, "step": 587, "time_per_iteration": 2.7240779399871826 }, { "auxiliary_loss_clip": 0.01167807, "auxiliary_loss_mlp": 0.01013867, "balance_loss_clip": 1.06603765, "balance_loss_mlp": 1.00514066, "epoch": 0.07070281969578548, "flos": 66047552507520.0, "grad_norm": 0.789531110255125, "language_loss": 0.62648475, "learning_rate": 3.98269429086047e-06, "loss": 0.6483016, "num_input_tokens_seen": 12567325, "step": 588, "time_per_iteration": 3.130704402923584 }, { "auxiliary_loss_clip": 0.01223133, "auxiliary_loss_mlp": 0.01046841, "balance_loss_clip": 1.06890559, "balance_loss_mlp": 1.02918601, "epoch": 0.07082306258642458, "flos": 23653568528640.0, "grad_norm": 2.3676663336371924, "language_loss": 0.86158901, "learning_rate": 3.982591887763199e-06, "loss": 0.88428873, "num_input_tokens_seen": 12584785, "step": 589, "time_per_iteration": 2.7088325023651123 }, { "auxiliary_loss_clip": 0.01188135, "auxiliary_loss_mlp": 0.01044174, "balance_loss_clip": 1.05839729, "balance_loss_mlp": 1.02715063, "epoch": 0.07094330547706366, "flos": 13880049408000.0, "grad_norm": 2.1194791324701803, "language_loss": 0.8168329, "learning_rate": 3.982489183908316e-06, "loss": 0.83915603, "num_input_tokens_seen": 12601205, "step": 590, "time_per_iteration": 2.7047853469848633 }, { "auxiliary_loss_clip": 0.01152858, "auxiliary_loss_mlp": 0.01052214, "balance_loss_clip": 1.05525637, "balance_loss_mlp": 1.03606093, "epoch": 0.07106354836770276, "flos": 24645098534400.0, "grad_norm": 2.386202638960888, "language_loss": 0.84509361, "learning_rate": 3.982386179311399e-06, "loss": 0.86714429, "num_input_tokens_seen": 12621725, "step": 591, "time_per_iteration": 4.63289737701416 }, { "auxiliary_loss_clip": 0.01259291, "auxiliary_loss_mlp": 0.01048495, "balance_loss_clip": 1.07562363, "balance_loss_mlp": 1.03064919, "epoch": 0.07118379125834184, "flos": 16217223649920.0, "grad_norm": 2.766286935001125, "language_loss": 0.87643266, "learning_rate": 3.982282873988075e-06, "loss": 0.8995105, "num_input_tokens_seen": 12639600, "step": 592, "time_per_iteration": 2.6648709774017334 }, { "auxiliary_loss_clip": 0.01229487, "auxiliary_loss_mlp": 0.01049087, "balance_loss_clip": 1.07032561, "balance_loss_mlp": 1.03209949, "epoch": 0.07130403414898094, "flos": 19719986227200.0, "grad_norm": 1.7541184034815682, "language_loss": 0.87345243, "learning_rate": 3.982179267954016e-06, "loss": 0.89623821, "num_input_tokens_seen": 12660030, "step": 593, "time_per_iteration": 3.613034248352051 }, { "auxiliary_loss_clip": 0.01268503, "auxiliary_loss_mlp": 0.01055507, "balance_loss_clip": 1.07668316, "balance_loss_mlp": 1.03787613, "epoch": 0.07142427703962004, "flos": 21871933009920.0, "grad_norm": 2.2794319890269987, "language_loss": 0.96231711, "learning_rate": 3.982075361224937e-06, "loss": 0.98555726, "num_input_tokens_seen": 12678395, "step": 594, "time_per_iteration": 3.575178384780884 }, { "auxiliary_loss_clip": 0.01246847, "auxiliary_loss_mlp": 0.00714318, "balance_loss_clip": 1.07367039, "balance_loss_mlp": 0.99987793, "epoch": 0.07154451993025912, "flos": 18296595002880.0, "grad_norm": 1.8978080729543805, "language_loss": 0.88237572, "learning_rate": 3.981971153816602e-06, "loss": 0.90198731, "num_input_tokens_seen": 12696000, "step": 595, "time_per_iteration": 2.65859055519104 }, { "auxiliary_loss_clip": 0.01265447, "auxiliary_loss_mlp": 0.01055349, "balance_loss_clip": 1.07694864, "balance_loss_mlp": 1.03862357, "epoch": 0.07166476282089822, "flos": 22160690444160.0, "grad_norm": 1.5561764648918748, "language_loss": 0.96328902, "learning_rate": 3.981866645744819e-06, "loss": 0.98649704, "num_input_tokens_seen": 12716715, "step": 596, "time_per_iteration": 2.6570844650268555 }, { "auxiliary_loss_clip": 0.01268796, "auxiliary_loss_mlp": 0.00714801, "balance_loss_clip": 1.07541144, "balance_loss_mlp": 0.999865, "epoch": 0.0717850057115373, "flos": 14136343925760.0, "grad_norm": 2.6737013697252157, "language_loss": 0.81873095, "learning_rate": 3.9817618370254416e-06, "loss": 0.8385669, "num_input_tokens_seen": 12733370, "step": 597, "time_per_iteration": 2.641817331314087 }, { "auxiliary_loss_clip": 0.01270011, "auxiliary_loss_mlp": 0.01049337, "balance_loss_clip": 1.07644892, "balance_loss_mlp": 1.03262424, "epoch": 0.0719052486021764, "flos": 30917794412160.0, "grad_norm": 2.149012460564434, "language_loss": 0.87245172, "learning_rate": 3.9816567276743684e-06, "loss": 0.89564526, "num_input_tokens_seen": 12753235, "step": 598, "time_per_iteration": 2.705622911453247 }, { "auxiliary_loss_clip": 0.01229327, "auxiliary_loss_mlp": 0.01053566, "balance_loss_clip": 1.07071161, "balance_loss_mlp": 1.03699613, "epoch": 0.0720254914928155, "flos": 21287019939840.0, "grad_norm": 1.9807774949635832, "language_loss": 0.77503073, "learning_rate": 3.9815513177075466e-06, "loss": 0.79785961, "num_input_tokens_seen": 12772020, "step": 599, "time_per_iteration": 2.711258888244629 }, { "auxiliary_loss_clip": 0.01245503, "auxiliary_loss_mlp": 0.01041629, "balance_loss_clip": 1.07404041, "balance_loss_mlp": 1.02472544, "epoch": 0.07214573438345458, "flos": 27819170732160.0, "grad_norm": 1.8779818936660921, "language_loss": 0.70327586, "learning_rate": 3.9814456071409646e-06, "loss": 0.72614717, "num_input_tokens_seen": 12792555, "step": 600, "time_per_iteration": 2.713660478591919 }, { "auxiliary_loss_clip": 0.01202404, "auxiliary_loss_mlp": 0.01059213, "balance_loss_clip": 1.06615674, "balance_loss_mlp": 1.04056859, "epoch": 0.07226597727409367, "flos": 25483576688640.0, "grad_norm": 3.1499463821814873, "language_loss": 0.85216856, "learning_rate": 3.981339595990659e-06, "loss": 0.87478471, "num_input_tokens_seen": 12811085, "step": 601, "time_per_iteration": 2.8162481784820557 }, { "auxiliary_loss_clip": 0.01252057, "auxiliary_loss_mlp": 0.01046919, "balance_loss_clip": 1.07480049, "balance_loss_mlp": 1.02859628, "epoch": 0.07238622016473276, "flos": 23513840622720.0, "grad_norm": 2.0102466706698596, "language_loss": 0.80831969, "learning_rate": 3.981233284272713e-06, "loss": 0.8313095, "num_input_tokens_seen": 12830830, "step": 602, "time_per_iteration": 2.6817848682403564 }, { "auxiliary_loss_clip": 0.01214657, "auxiliary_loss_mlp": 0.01062468, "balance_loss_clip": 1.06877089, "balance_loss_mlp": 1.04612482, "epoch": 0.07250646305537185, "flos": 25453519983360.0, "grad_norm": 2.807826025087308, "language_loss": 0.90133131, "learning_rate": 3.981126672003253e-06, "loss": 0.92410254, "num_input_tokens_seen": 12853505, "step": 603, "time_per_iteration": 2.807669162750244 }, { "auxiliary_loss_clip": 0.01235549, "auxiliary_loss_mlp": 0.01051563, "balance_loss_clip": 1.06850219, "balance_loss_mlp": 1.03493297, "epoch": 0.07262670594601094, "flos": 27155038216320.0, "grad_norm": 2.9593775870541874, "language_loss": 0.78308666, "learning_rate": 3.981019759198451e-06, "loss": 0.80595779, "num_input_tokens_seen": 12872455, "step": 604, "time_per_iteration": 2.676867723464966 }, { "auxiliary_loss_clip": 0.01231411, "auxiliary_loss_mlp": 0.01041848, "balance_loss_clip": 1.06763506, "balance_loss_mlp": 1.02525425, "epoch": 0.07274694883665003, "flos": 26651607148800.0, "grad_norm": 2.073143046238568, "language_loss": 0.8386004, "learning_rate": 3.980912545874528e-06, "loss": 0.86133301, "num_input_tokens_seen": 12892620, "step": 605, "time_per_iteration": 2.7437071800231934 }, { "auxiliary_loss_clip": 0.01243427, "auxiliary_loss_mlp": 0.00715256, "balance_loss_clip": 1.07325804, "balance_loss_mlp": 0.99989045, "epoch": 0.07286719172728913, "flos": 29862344154240.0, "grad_norm": 2.5706397921280795, "language_loss": 0.85653389, "learning_rate": 3.980805032047746e-06, "loss": 0.87612069, "num_input_tokens_seen": 12914090, "step": 606, "time_per_iteration": 2.721827507019043 }, { "auxiliary_loss_clip": 0.01230157, "auxiliary_loss_mlp": 0.01050941, "balance_loss_clip": 1.07106519, "balance_loss_mlp": 1.03317881, "epoch": 0.07298743461792821, "flos": 17382057799680.0, "grad_norm": 2.0899971857601987, "language_loss": 0.81243432, "learning_rate": 3.980697217734415e-06, "loss": 0.83524531, "num_input_tokens_seen": 12931830, "step": 607, "time_per_iteration": 2.5939531326293945 }, { "auxiliary_loss_clip": 0.01204594, "auxiliary_loss_mlp": 0.00714624, "balance_loss_clip": 1.06856918, "balance_loss_mlp": 0.99990773, "epoch": 0.07310767750856731, "flos": 19498201701120.0, "grad_norm": 1.7587678886973497, "language_loss": 0.9182139, "learning_rate": 3.980589102950891e-06, "loss": 0.93740606, "num_input_tokens_seen": 12949995, "step": 608, "time_per_iteration": 2.7459568977355957 }, { "auxiliary_loss_clip": 0.0123272, "auxiliary_loss_mlp": 0.01053445, "balance_loss_clip": 1.07378554, "balance_loss_mlp": 1.03637457, "epoch": 0.07322792039920639, "flos": 29168693637120.0, "grad_norm": 2.6286547438418855, "language_loss": 0.75851196, "learning_rate": 3.9804806877135755e-06, "loss": 0.78137362, "num_input_tokens_seen": 12968040, "step": 609, "time_per_iteration": 2.712810516357422 }, { "auxiliary_loss_clip": 0.01252566, "auxiliary_loss_mlp": 0.00715539, "balance_loss_clip": 1.07184494, "balance_loss_mlp": 0.99990547, "epoch": 0.07334816328984549, "flos": 23477822259840.0, "grad_norm": 2.2322591605879114, "language_loss": 0.85857016, "learning_rate": 3.980371972038915e-06, "loss": 0.87825119, "num_input_tokens_seen": 12988530, "step": 610, "time_per_iteration": 2.715656042098999 }, { "auxiliary_loss_clip": 0.01269167, "auxiliary_loss_mlp": 0.01052625, "balance_loss_clip": 1.07753682, "balance_loss_mlp": 1.03469563, "epoch": 0.07346840618048459, "flos": 22962467877120.0, "grad_norm": 1.7856814794859912, "language_loss": 0.84228861, "learning_rate": 3.980262955943399e-06, "loss": 0.86550653, "num_input_tokens_seen": 13008195, "step": 611, "time_per_iteration": 2.811112403869629 }, { "auxiliary_loss_clip": 0.01228491, "auxiliary_loss_mlp": 0.01049441, "balance_loss_clip": 1.07419944, "balance_loss_mlp": 1.03315747, "epoch": 0.07358864907112367, "flos": 17673903803520.0, "grad_norm": 2.39480968877589, "language_loss": 0.86774451, "learning_rate": 3.980153639443569e-06, "loss": 0.89052391, "num_input_tokens_seen": 13024180, "step": 612, "time_per_iteration": 2.6555135250091553 }, { "auxiliary_loss_clip": 0.01243553, "auxiliary_loss_mlp": 0.01053033, "balance_loss_clip": 1.07574248, "balance_loss_mlp": 1.03510368, "epoch": 0.07370889196176277, "flos": 24097029840000.0, "grad_norm": 2.1493993300279164, "language_loss": 0.80211163, "learning_rate": 3.980044022556005e-06, "loss": 0.82507741, "num_input_tokens_seen": 13043865, "step": 613, "time_per_iteration": 2.6659135818481445 }, { "auxiliary_loss_clip": 0.01255766, "auxiliary_loss_mlp": 0.01062291, "balance_loss_clip": 1.07735538, "balance_loss_mlp": 1.04432642, "epoch": 0.07382913485240185, "flos": 25885919905920.0, "grad_norm": 2.053392850515658, "language_loss": 0.72875738, "learning_rate": 3.9799341052973375e-06, "loss": 0.75193799, "num_input_tokens_seen": 13063700, "step": 614, "time_per_iteration": 2.7103123664855957 }, { "auxiliary_loss_clip": 0.01233309, "auxiliary_loss_mlp": 0.01047468, "balance_loss_clip": 1.07695556, "balance_loss_mlp": 1.03074288, "epoch": 0.07394937774304094, "flos": 16873850223360.0, "grad_norm": 2.6662497103284517, "language_loss": 0.75183606, "learning_rate": 3.979823887684241e-06, "loss": 0.77464384, "num_input_tokens_seen": 13082640, "step": 615, "time_per_iteration": 2.624419689178467 }, { "auxiliary_loss_clip": 0.01268459, "auxiliary_loss_mlp": 0.0104587, "balance_loss_clip": 1.07613063, "balance_loss_mlp": 1.02835798, "epoch": 0.07406962063368003, "flos": 20703471586560.0, "grad_norm": 2.4976607729436995, "language_loss": 0.84688807, "learning_rate": 3.979713369733434e-06, "loss": 0.87003136, "num_input_tokens_seen": 13100505, "step": 616, "time_per_iteration": 2.5932111740112305 }, { "auxiliary_loss_clip": 0.01240764, "auxiliary_loss_mlp": 0.0104988, "balance_loss_clip": 1.07101679, "balance_loss_mlp": 1.03228474, "epoch": 0.07418986352431912, "flos": 21430985650560.0, "grad_norm": 2.0482950957138826, "language_loss": 0.85155332, "learning_rate": 3.979602551461683e-06, "loss": 0.87445974, "num_input_tokens_seen": 13121285, "step": 617, "time_per_iteration": 3.622286796569824 }, { "auxiliary_loss_clip": 0.01233259, "auxiliary_loss_mlp": 0.01047926, "balance_loss_clip": 1.07436287, "balance_loss_mlp": 1.03149891, "epoch": 0.07431010641495822, "flos": 12021133777920.0, "grad_norm": 2.4192698059809876, "language_loss": 0.91325653, "learning_rate": 3.979491432885799e-06, "loss": 0.9360683, "num_input_tokens_seen": 13137550, "step": 618, "time_per_iteration": 2.7320876121520996 }, { "auxiliary_loss_clip": 0.01196985, "auxiliary_loss_mlp": 0.00714722, "balance_loss_clip": 1.06581378, "balance_loss_mlp": 0.99990308, "epoch": 0.0744303493055973, "flos": 20957575374720.0, "grad_norm": 1.9211102607932715, "language_loss": 0.82953024, "learning_rate": 3.97938001402264e-06, "loss": 0.84864736, "num_input_tokens_seen": 13156675, "step": 619, "time_per_iteration": 3.672117233276367 }, { "auxiliary_loss_clip": 0.01212138, "auxiliary_loss_mlp": 0.01046395, "balance_loss_clip": 1.06983709, "balance_loss_mlp": 1.02957463, "epoch": 0.0745505921962364, "flos": 16253134272000.0, "grad_norm": 3.537747114124009, "language_loss": 0.79752409, "learning_rate": 3.979268294889105e-06, "loss": 0.82010943, "num_input_tokens_seen": 13172225, "step": 620, "time_per_iteration": 3.6478452682495117 }, { "auxiliary_loss_clip": 0.01268014, "auxiliary_loss_mlp": 0.01053274, "balance_loss_clip": 1.0778383, "balance_loss_mlp": 1.03708553, "epoch": 0.07467083508687548, "flos": 50944635550080.0, "grad_norm": 1.7323449509369953, "language_loss": 0.73682499, "learning_rate": 3.979156275502143e-06, "loss": 0.7600379, "num_input_tokens_seen": 13195885, "step": 621, "time_per_iteration": 2.84889817237854 }, { "auxiliary_loss_clip": 0.01217475, "auxiliary_loss_mlp": 0.01048704, "balance_loss_clip": 1.07056689, "balance_loss_mlp": 1.03163338, "epoch": 0.07479107797751458, "flos": 17529686697600.0, "grad_norm": 2.370180640337387, "language_loss": 0.91582221, "learning_rate": 3.979043955878749e-06, "loss": 0.93848395, "num_input_tokens_seen": 13213730, "step": 622, "time_per_iteration": 2.707540988922119 }, { "auxiliary_loss_clip": 0.01233182, "auxiliary_loss_mlp": 0.01056943, "balance_loss_clip": 1.07318449, "balance_loss_mlp": 1.03999162, "epoch": 0.07491132086815366, "flos": 23473943591040.0, "grad_norm": 4.398057011250551, "language_loss": 0.83029479, "learning_rate": 3.978931336035959e-06, "loss": 0.85319602, "num_input_tokens_seen": 13232540, "step": 623, "time_per_iteration": 2.663700819015503 }, { "auxiliary_loss_clip": 0.01247599, "auxiliary_loss_mlp": 0.01055252, "balance_loss_clip": 1.07603359, "balance_loss_mlp": 1.03805053, "epoch": 0.07503156375879276, "flos": 20157557708160.0, "grad_norm": 3.521280609692033, "language_loss": 0.82127094, "learning_rate": 3.9788184159908595e-06, "loss": 0.84429944, "num_input_tokens_seen": 13249670, "step": 624, "time_per_iteration": 2.683063507080078 }, { "auxiliary_loss_clip": 0.01219965, "auxiliary_loss_mlp": 0.01055229, "balance_loss_clip": 1.06866598, "balance_loss_mlp": 1.03938627, "epoch": 0.07515180664943186, "flos": 15115519653120.0, "grad_norm": 2.6580361310405407, "language_loss": 0.82753229, "learning_rate": 3.97870519576058e-06, "loss": 0.85028434, "num_input_tokens_seen": 13266095, "step": 625, "time_per_iteration": 2.6477959156036377 }, { "auxiliary_loss_clip": 0.01206994, "auxiliary_loss_mlp": 0.00714762, "balance_loss_clip": 1.06572652, "balance_loss_mlp": 0.99987197, "epoch": 0.07527204954007094, "flos": 21287702298240.0, "grad_norm": 4.993691948756567, "language_loss": 0.80948079, "learning_rate": 3.978591675362295e-06, "loss": 0.8286984, "num_input_tokens_seen": 13284810, "step": 626, "time_per_iteration": 2.7594616413116455 }, { "auxiliary_loss_clip": 0.01194832, "auxiliary_loss_mlp": 0.01062559, "balance_loss_clip": 1.07051539, "balance_loss_mlp": 1.04597664, "epoch": 0.07539229243071004, "flos": 21324187537920.0, "grad_norm": 1.7787471393972045, "language_loss": 0.87789816, "learning_rate": 3.978477854813226e-06, "loss": 0.90047204, "num_input_tokens_seen": 13304150, "step": 627, "time_per_iteration": 2.6982996463775635 }, { "auxiliary_loss_clip": 0.0125116, "auxiliary_loss_mlp": 0.01051652, "balance_loss_clip": 1.07264972, "balance_loss_mlp": 1.03517675, "epoch": 0.07551253532134912, "flos": 13042540920960.0, "grad_norm": 1.8557307468285689, "language_loss": 0.82526731, "learning_rate": 3.97836373413064e-06, "loss": 0.84829545, "num_input_tokens_seen": 13322205, "step": 628, "time_per_iteration": 2.7785568237304688 }, { "auxiliary_loss_clip": 0.01265769, "auxiliary_loss_mlp": 0.01048433, "balance_loss_clip": 1.07342291, "balance_loss_mlp": 1.03162456, "epoch": 0.07563277821198822, "flos": 19208761908480.0, "grad_norm": 2.226172749583657, "language_loss": 0.74584293, "learning_rate": 3.978249313331848e-06, "loss": 0.76898491, "num_input_tokens_seen": 13340435, "step": 629, "time_per_iteration": 2.650642156600952 }, { "auxiliary_loss_clip": 0.01254005, "auxiliary_loss_mlp": 0.0071519, "balance_loss_clip": 1.07264709, "balance_loss_mlp": 0.99987561, "epoch": 0.07575302110262731, "flos": 19537200892800.0, "grad_norm": 3.1528781533910917, "language_loss": 0.62570065, "learning_rate": 3.978134592434208e-06, "loss": 0.6453926, "num_input_tokens_seen": 13358185, "step": 630, "time_per_iteration": 2.6905219554901123 }, { "auxiliary_loss_clip": 0.01110236, "auxiliary_loss_mlp": 0.01014299, "balance_loss_clip": 1.05888295, "balance_loss_mlp": 1.00581157, "epoch": 0.0758732639932664, "flos": 67961808017280.0, "grad_norm": 1.0069277267309495, "language_loss": 0.59390718, "learning_rate": 3.978019571455123e-06, "loss": 0.61515254, "num_input_tokens_seen": 13410130, "step": 631, "time_per_iteration": 3.322690486907959 }, { "auxiliary_loss_clip": 0.01265099, "auxiliary_loss_mlp": 0.01052657, "balance_loss_clip": 1.0760181, "balance_loss_mlp": 1.0371362, "epoch": 0.07599350688390549, "flos": 18989204025600.0, "grad_norm": 2.158954723129447, "language_loss": 0.84110188, "learning_rate": 3.977904250412042e-06, "loss": 0.86427951, "num_input_tokens_seen": 13429085, "step": 632, "time_per_iteration": 2.6299455165863037 }, { "auxiliary_loss_clip": 0.01233959, "auxiliary_loss_mlp": 0.01056597, "balance_loss_clip": 1.07034564, "balance_loss_mlp": 1.04013443, "epoch": 0.07611374977454458, "flos": 21069006341760.0, "grad_norm": 2.5727884293161205, "language_loss": 0.85389566, "learning_rate": 3.97778862932246e-06, "loss": 0.87680125, "num_input_tokens_seen": 13446250, "step": 633, "time_per_iteration": 2.7696094512939453 }, { "auxiliary_loss_clip": 0.01116579, "auxiliary_loss_mlp": 0.0105153, "balance_loss_clip": 1.04752398, "balance_loss_mlp": 1.03540087, "epoch": 0.07623399266518367, "flos": 18514536773760.0, "grad_norm": 2.372190118168669, "language_loss": 0.93702608, "learning_rate": 3.9776727082039144e-06, "loss": 0.95870715, "num_input_tokens_seen": 13463220, "step": 634, "time_per_iteration": 2.8639159202575684 }, { "auxiliary_loss_clip": 0.01179654, "auxiliary_loss_mlp": 0.0100997, "balance_loss_clip": 1.06691682, "balance_loss_mlp": 1.00162566, "epoch": 0.07635423555582276, "flos": 44663036077440.0, "grad_norm": 0.808701759182082, "language_loss": 0.5549829, "learning_rate": 3.977556487073991e-06, "loss": 0.57687914, "num_input_tokens_seen": 13517775, "step": 635, "time_per_iteration": 3.3662068843841553 }, { "auxiliary_loss_clip": 0.01214835, "auxiliary_loss_mlp": 0.01053137, "balance_loss_clip": 1.06155503, "balance_loss_mlp": 1.03736591, "epoch": 0.07647447844646185, "flos": 21761148487680.0, "grad_norm": 2.1534623849324572, "language_loss": 0.8152988, "learning_rate": 3.97743996595032e-06, "loss": 0.83797848, "num_input_tokens_seen": 13537815, "step": 636, "time_per_iteration": 2.7417445182800293 }, { "auxiliary_loss_clip": 0.01265975, "auxiliary_loss_mlp": 0.01056613, "balance_loss_clip": 1.07564569, "balance_loss_mlp": 1.03978109, "epoch": 0.07659472133710095, "flos": 23806799948160.0, "grad_norm": 1.5552530520969272, "language_loss": 0.81669724, "learning_rate": 3.9773231448505804e-06, "loss": 0.83992314, "num_input_tokens_seen": 13559605, "step": 637, "time_per_iteration": 2.640944242477417 }, { "auxiliary_loss_clip": 0.01224931, "auxiliary_loss_mlp": 0.00714674, "balance_loss_clip": 1.07148409, "balance_loss_mlp": 0.99982607, "epoch": 0.07671496422774003, "flos": 21469984842240.0, "grad_norm": 2.1243525777343293, "language_loss": 0.77952307, "learning_rate": 3.977206023792491e-06, "loss": 0.79891914, "num_input_tokens_seen": 13579495, "step": 638, "time_per_iteration": 2.696671962738037 }, { "auxiliary_loss_clip": 0.01244923, "auxiliary_loss_mlp": 0.01048216, "balance_loss_clip": 1.07369471, "balance_loss_mlp": 1.03109729, "epoch": 0.07683520711837913, "flos": 16980971558400.0, "grad_norm": 2.768050716473268, "language_loss": 0.81202257, "learning_rate": 3.97708860279382e-06, "loss": 0.83495396, "num_input_tokens_seen": 13597605, "step": 639, "time_per_iteration": 2.630913019180298 }, { "auxiliary_loss_clip": 0.01202692, "auxiliary_loss_mlp": 0.01063835, "balance_loss_clip": 1.06382573, "balance_loss_mlp": 1.04684758, "epoch": 0.07695545000901821, "flos": 23476744851840.0, "grad_norm": 1.7952782827683342, "language_loss": 0.78132868, "learning_rate": 3.97697088187238e-06, "loss": 0.80399394, "num_input_tokens_seen": 13618120, "step": 640, "time_per_iteration": 2.7409825325012207 }, { "auxiliary_loss_clip": 0.01224653, "auxiliary_loss_mlp": 0.01053797, "balance_loss_clip": 1.07044196, "balance_loss_mlp": 1.03750074, "epoch": 0.07707569289965731, "flos": 17634258167040.0, "grad_norm": 2.2418127012765097, "language_loss": 0.9222123, "learning_rate": 3.976852861046029e-06, "loss": 0.94499683, "num_input_tokens_seen": 13634735, "step": 641, "time_per_iteration": 2.6530394554138184 }, { "auxiliary_loss_clip": 0.01188244, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.06456828, "balance_loss_mlp": 1.02855933, "epoch": 0.0771959357902964, "flos": 25775674087680.0, "grad_norm": 1.6410904839435445, "language_loss": 0.80159974, "learning_rate": 3.97673454033267e-06, "loss": 0.82392406, "num_input_tokens_seen": 13656835, "step": 642, "time_per_iteration": 2.9021050930023193 }, { "auxiliary_loss_clip": 0.01226544, "auxiliary_loss_mlp": 0.01042559, "balance_loss_clip": 1.06705785, "balance_loss_mlp": 1.02647781, "epoch": 0.07731617868093549, "flos": 19828651847040.0, "grad_norm": 1.9964073827276974, "language_loss": 0.82463312, "learning_rate": 3.976615919750254e-06, "loss": 0.84732419, "num_input_tokens_seen": 13674535, "step": 643, "time_per_iteration": 3.572415828704834 }, { "auxiliary_loss_clip": 0.01246485, "auxiliary_loss_mlp": 0.01051938, "balance_loss_clip": 1.07395864, "balance_loss_mlp": 1.03539217, "epoch": 0.07743642157157458, "flos": 21324654414720.0, "grad_norm": 2.044870064157728, "language_loss": 0.86869937, "learning_rate": 3.976496999316775e-06, "loss": 0.89168364, "num_input_tokens_seen": 13693290, "step": 644, "time_per_iteration": 3.5783252716064453 }, { "auxiliary_loss_clip": 0.01230186, "auxiliary_loss_mlp": 0.01053919, "balance_loss_clip": 1.07696962, "balance_loss_mlp": 1.0367049, "epoch": 0.07755666446221367, "flos": 19969133938560.0, "grad_norm": 1.9176913481314422, "language_loss": 0.84023494, "learning_rate": 3.976377779050271e-06, "loss": 0.86307597, "num_input_tokens_seen": 13711420, "step": 645, "time_per_iteration": 3.6095848083496094 }, { "auxiliary_loss_clip": 0.01235039, "auxiliary_loss_mlp": 0.01048984, "balance_loss_clip": 1.06777072, "balance_loss_mlp": 1.03107882, "epoch": 0.07767690735285276, "flos": 23623224514560.0, "grad_norm": 2.366003213154095, "language_loss": 0.84218752, "learning_rate": 3.976258258968831e-06, "loss": 0.86502779, "num_input_tokens_seen": 13729965, "step": 646, "time_per_iteration": 3.6422946453094482 }, { "auxiliary_loss_clip": 0.01212329, "auxiliary_loss_mlp": 0.0105268, "balance_loss_clip": 1.07172799, "balance_loss_mlp": 1.03613424, "epoch": 0.07779715024349185, "flos": 22236246702720.0, "grad_norm": 2.8926195463457547, "language_loss": 0.74470937, "learning_rate": 3.976138439090583e-06, "loss": 0.7673595, "num_input_tokens_seen": 13748045, "step": 647, "time_per_iteration": 2.7141916751861572 }, { "auxiliary_loss_clip": 0.01215957, "auxiliary_loss_mlp": 0.01059023, "balance_loss_clip": 1.07328725, "balance_loss_mlp": 1.04202366, "epoch": 0.07791739313413094, "flos": 20955097336320.0, "grad_norm": 3.853588402655796, "language_loss": 0.85012376, "learning_rate": 3.976018319433706e-06, "loss": 0.87287354, "num_input_tokens_seen": 13765590, "step": 648, "time_per_iteration": 2.7544445991516113 }, { "auxiliary_loss_clip": 0.01244312, "auxiliary_loss_mlp": 0.01053165, "balance_loss_clip": 1.07274389, "balance_loss_mlp": 1.03577185, "epoch": 0.07803763602477004, "flos": 19312327797120.0, "grad_norm": 2.2301114526419314, "language_loss": 0.91087925, "learning_rate": 3.9758979000164205e-06, "loss": 0.93385398, "num_input_tokens_seen": 13782410, "step": 649, "time_per_iteration": 2.7175023555755615 }, { "auxiliary_loss_clip": 0.01215061, "auxiliary_loss_mlp": 0.01049426, "balance_loss_clip": 1.06895137, "balance_loss_mlp": 1.0319258, "epoch": 0.07815787891540912, "flos": 22710806213760.0, "grad_norm": 1.9206566138737509, "language_loss": 0.72113156, "learning_rate": 3.975777180856995e-06, "loss": 0.74377644, "num_input_tokens_seen": 13801530, "step": 650, "time_per_iteration": 2.739147901535034 }, { "auxiliary_loss_clip": 0.01267953, "auxiliary_loss_mlp": 0.0105169, "balance_loss_clip": 1.0754236, "balance_loss_mlp": 1.03533471, "epoch": 0.07827812180604822, "flos": 22711129436160.0, "grad_norm": 2.0374531533642233, "language_loss": 0.86212707, "learning_rate": 3.975656161973742e-06, "loss": 0.88532352, "num_input_tokens_seen": 13820615, "step": 651, "time_per_iteration": 2.6122539043426514 }, { "auxiliary_loss_clip": 0.0126432, "auxiliary_loss_mlp": 0.0105016, "balance_loss_clip": 1.0732795, "balance_loss_mlp": 1.03302956, "epoch": 0.0783983646966873, "flos": 21725597001600.0, "grad_norm": 2.5735440121133517, "language_loss": 0.88898808, "learning_rate": 3.9755348433850194e-06, "loss": 0.91213286, "num_input_tokens_seen": 13835955, "step": 652, "time_per_iteration": 2.6172127723693848 }, { "auxiliary_loss_clip": 0.01141996, "auxiliary_loss_mlp": 0.01009711, "balance_loss_clip": 1.05704546, "balance_loss_mlp": 1.00155687, "epoch": 0.0785186075873264, "flos": 60640877537280.0, "grad_norm": 0.9648554952500676, "language_loss": 0.63625515, "learning_rate": 3.975413225109232e-06, "loss": 0.65777218, "num_input_tokens_seen": 13896505, "step": 653, "time_per_iteration": 3.2609641551971436 }, { "auxiliary_loss_clip": 0.01248073, "auxiliary_loss_mlp": 0.01041793, "balance_loss_clip": 1.07390356, "balance_loss_mlp": 1.02486551, "epoch": 0.0786388504779655, "flos": 23877902920320.0, "grad_norm": 3.5831542351533807, "language_loss": 0.93314338, "learning_rate": 3.975291307164829e-06, "loss": 0.95604205, "num_input_tokens_seen": 13915150, "step": 654, "time_per_iteration": 2.6438934803009033 }, { "auxiliary_loss_clip": 0.01198141, "auxiliary_loss_mlp": 0.01053712, "balance_loss_clip": 1.0639782, "balance_loss_mlp": 1.03822672, "epoch": 0.07875909336860458, "flos": 15158684822400.0, "grad_norm": 1.8782228761594424, "language_loss": 0.84987092, "learning_rate": 3.975169089570306e-06, "loss": 0.87238938, "num_input_tokens_seen": 13933525, "step": 655, "time_per_iteration": 2.729705333709717 }, { "auxiliary_loss_clip": 0.01234578, "auxiliary_loss_mlp": 0.01037484, "balance_loss_clip": 1.0697844, "balance_loss_mlp": 1.02174807, "epoch": 0.07887933625924368, "flos": 22236857233920.0, "grad_norm": 2.1238418821728513, "language_loss": 0.91746116, "learning_rate": 3.975046572344202e-06, "loss": 0.94018185, "num_input_tokens_seen": 13949985, "step": 656, "time_per_iteration": 2.5955705642700195 }, { "auxiliary_loss_clip": 0.0120652, "auxiliary_loss_mlp": 0.01045373, "balance_loss_clip": 1.06391931, "balance_loss_mlp": 1.02852869, "epoch": 0.07899957914988276, "flos": 20777734955520.0, "grad_norm": 2.08321209994197, "language_loss": 0.70923746, "learning_rate": 3.974923755505103e-06, "loss": 0.73175633, "num_input_tokens_seen": 13969215, "step": 657, "time_per_iteration": 2.746385097503662 }, { "auxiliary_loss_clip": 0.0119701, "auxiliary_loss_mlp": 0.01052957, "balance_loss_clip": 1.06558692, "balance_loss_mlp": 1.03637481, "epoch": 0.07911982204052186, "flos": 23003047267200.0, "grad_norm": 1.6855601790021717, "language_loss": 0.91118282, "learning_rate": 3.974800639071641e-06, "loss": 0.93368244, "num_input_tokens_seen": 13989935, "step": 658, "time_per_iteration": 2.721496820449829 }, { "auxiliary_loss_clip": 0.01160318, "auxiliary_loss_mlp": 0.00715245, "balance_loss_clip": 1.05942237, "balance_loss_mlp": 0.9998647, "epoch": 0.07924006493116094, "flos": 23111389664640.0, "grad_norm": 2.256182934958743, "language_loss": 1.00933814, "learning_rate": 3.974677223062492e-06, "loss": 1.02809381, "num_input_tokens_seen": 14007150, "step": 659, "time_per_iteration": 2.7883834838867188 }, { "auxiliary_loss_clip": 0.01226753, "auxiliary_loss_mlp": 0.0104674, "balance_loss_clip": 1.07111514, "balance_loss_mlp": 1.03045571, "epoch": 0.07936030782180004, "flos": 16472153450880.0, "grad_norm": 3.702460878613956, "language_loss": 0.74402034, "learning_rate": 3.974553507496378e-06, "loss": 0.76675534, "num_input_tokens_seen": 14025725, "step": 660, "time_per_iteration": 2.639979124069214 }, { "auxiliary_loss_clip": 0.01220668, "auxiliary_loss_mlp": 0.01050751, "balance_loss_clip": 1.07066059, "balance_loss_mlp": 1.03383529, "epoch": 0.07948055071243913, "flos": 23733290764800.0, "grad_norm": 2.2331162829365603, "language_loss": 0.89105833, "learning_rate": 3.974429492392068e-06, "loss": 0.91377246, "num_input_tokens_seen": 14045750, "step": 661, "time_per_iteration": 2.6933038234710693 }, { "auxiliary_loss_clip": 0.01264417, "auxiliary_loss_mlp": 0.00713732, "balance_loss_clip": 1.07728314, "balance_loss_mlp": 0.99986237, "epoch": 0.07960079360307822, "flos": 19573326996480.0, "grad_norm": 2.090909609286468, "language_loss": 0.90996182, "learning_rate": 3.974305177768373e-06, "loss": 0.92974329, "num_input_tokens_seen": 14063960, "step": 662, "time_per_iteration": 2.578676223754883 }, { "auxiliary_loss_clip": 0.01201562, "auxiliary_loss_mlp": 0.01047047, "balance_loss_clip": 1.07031739, "balance_loss_mlp": 1.03014338, "epoch": 0.07972103649371731, "flos": 23513409659520.0, "grad_norm": 6.276382082504064, "language_loss": 0.86780047, "learning_rate": 3.974180563644152e-06, "loss": 0.89028651, "num_input_tokens_seen": 14082525, "step": 663, "time_per_iteration": 2.7415149211883545 }, { "auxiliary_loss_clip": 0.01228809, "auxiliary_loss_mlp": 0.01057577, "balance_loss_clip": 1.07107067, "balance_loss_mlp": 1.04063773, "epoch": 0.0798412793843564, "flos": 16726867770240.0, "grad_norm": 5.7148746992149775, "language_loss": 0.89112115, "learning_rate": 3.97405565003831e-06, "loss": 0.91398501, "num_input_tokens_seen": 14098610, "step": 664, "time_per_iteration": 2.6238465309143066 }, { "auxiliary_loss_clip": 0.01217088, "auxiliary_loss_mlp": 0.01048766, "balance_loss_clip": 1.069152, "balance_loss_mlp": 1.03280401, "epoch": 0.07996152227499549, "flos": 18223337214720.0, "grad_norm": 2.006633313136334, "language_loss": 0.78074324, "learning_rate": 3.973930436969794e-06, "loss": 0.80340183, "num_input_tokens_seen": 14117065, "step": 665, "time_per_iteration": 2.6555590629577637 }, { "auxiliary_loss_clip": 0.01218059, "auxiliary_loss_mlp": 0.01050038, "balance_loss_clip": 1.06658494, "balance_loss_mlp": 1.03277683, "epoch": 0.08008176516563459, "flos": 20594877793920.0, "grad_norm": 1.8501728125966126, "language_loss": 0.8576135, "learning_rate": 3.973804924457602e-06, "loss": 0.88029444, "num_input_tokens_seen": 14135145, "step": 666, "time_per_iteration": 2.6747195720672607 }, { "auxiliary_loss_clip": 0.01222256, "auxiliary_loss_mlp": 0.01057507, "balance_loss_clip": 1.06919289, "balance_loss_mlp": 1.04149699, "epoch": 0.08020200805627367, "flos": 31834306863360.0, "grad_norm": 1.7501342662260306, "language_loss": 0.8579067, "learning_rate": 3.973679112520771e-06, "loss": 0.88070428, "num_input_tokens_seen": 14156860, "step": 667, "time_per_iteration": 2.7237730026245117 }, { "auxiliary_loss_clip": 0.01200523, "auxiliary_loss_mlp": 0.01053012, "balance_loss_clip": 1.06298399, "balance_loss_mlp": 1.03662086, "epoch": 0.08032225094691277, "flos": 17783503176960.0, "grad_norm": 1.8384687455557926, "language_loss": 0.98624164, "learning_rate": 3.973553001178389e-06, "loss": 1.0087769, "num_input_tokens_seen": 14174365, "step": 668, "time_per_iteration": 2.7007884979248047 }, { "auxiliary_loss_clip": 0.01209227, "auxiliary_loss_mlp": 0.01048217, "balance_loss_clip": 1.07120705, "balance_loss_mlp": 1.03117001, "epoch": 0.08044249383755185, "flos": 24061693835520.0, "grad_norm": 2.0913868578986508, "language_loss": 0.75536895, "learning_rate": 3.973426590449585e-06, "loss": 0.77794337, "num_input_tokens_seen": 14192320, "step": 669, "time_per_iteration": 3.6037092208862305 }, { "auxiliary_loss_clip": 0.01192107, "auxiliary_loss_mlp": 0.01056613, "balance_loss_clip": 1.06639087, "balance_loss_mlp": 1.04030514, "epoch": 0.08056273672819095, "flos": 18223624523520.0, "grad_norm": 2.163091721272781, "language_loss": 0.75414044, "learning_rate": 3.9732998803535364e-06, "loss": 0.77662766, "num_input_tokens_seen": 14210380, "step": 670, "time_per_iteration": 3.547450065612793 }, { "auxiliary_loss_clip": 0.01264126, "auxiliary_loss_mlp": 0.01046645, "balance_loss_clip": 1.07562304, "balance_loss_mlp": 1.03030097, "epoch": 0.08068297961883003, "flos": 19676856971520.0, "grad_norm": 2.3853456954561922, "language_loss": 0.85249698, "learning_rate": 3.973172870909465e-06, "loss": 0.87560475, "num_input_tokens_seen": 14225145, "step": 671, "time_per_iteration": 4.39754319190979 }, { "auxiliary_loss_clip": 0.01232349, "auxiliary_loss_mlp": 0.01047138, "balance_loss_clip": 1.06763113, "balance_loss_mlp": 1.03004372, "epoch": 0.08080322250946913, "flos": 23148736830720.0, "grad_norm": 2.924594823602943, "language_loss": 0.80730283, "learning_rate": 3.973045562136638e-06, "loss": 0.83009768, "num_input_tokens_seen": 14241960, "step": 672, "time_per_iteration": 2.7104828357696533 }, { "auxiliary_loss_clip": 0.01245274, "auxiliary_loss_mlp": 0.01047688, "balance_loss_clip": 1.07120407, "balance_loss_mlp": 1.03132081, "epoch": 0.08092346540010822, "flos": 21763626526080.0, "grad_norm": 2.0154764190446652, "language_loss": 0.91733241, "learning_rate": 3.972917954054368e-06, "loss": 0.94026196, "num_input_tokens_seen": 14260515, "step": 673, "time_per_iteration": 2.654897451400757 }, { "auxiliary_loss_clip": 0.01225889, "auxiliary_loss_mlp": 0.01058334, "balance_loss_clip": 1.07452488, "balance_loss_mlp": 1.04118013, "epoch": 0.08104370829074731, "flos": 21032485188480.0, "grad_norm": 2.436346691484905, "language_loss": 0.8213371, "learning_rate": 3.972790046682013e-06, "loss": 0.84417933, "num_input_tokens_seen": 14279190, "step": 674, "time_per_iteration": 2.693506956100464 }, { "auxiliary_loss_clip": 0.01207198, "auxiliary_loss_mlp": 0.01051988, "balance_loss_clip": 1.06259191, "balance_loss_mlp": 1.03547716, "epoch": 0.0811639511813864, "flos": 20083186598400.0, "grad_norm": 1.8071724413738803, "language_loss": 0.79044867, "learning_rate": 3.972661840038977e-06, "loss": 0.81304049, "num_input_tokens_seen": 14299480, "step": 675, "time_per_iteration": 2.796018123626709 }, { "auxiliary_loss_clip": 0.0124782, "auxiliary_loss_mlp": 0.01056522, "balance_loss_clip": 1.0756253, "balance_loss_mlp": 1.04007089, "epoch": 0.08128419407202549, "flos": 16836718538880.0, "grad_norm": 2.535457963147819, "language_loss": 0.83318579, "learning_rate": 3.972533334144707e-06, "loss": 0.85622919, "num_input_tokens_seen": 14316405, "step": 676, "time_per_iteration": 2.6544883251190186 }, { "auxiliary_loss_clip": 0.01245919, "auxiliary_loss_mlp": 0.01048643, "balance_loss_clip": 1.0708009, "balance_loss_mlp": 1.03204858, "epoch": 0.08140443696266458, "flos": 23769273214080.0, "grad_norm": 2.391143487801178, "language_loss": 0.78218377, "learning_rate": 3.972404529018699e-06, "loss": 0.80512941, "num_input_tokens_seen": 14336265, "step": 677, "time_per_iteration": 2.6570043563842773 }, { "auxiliary_loss_clip": 0.01216079, "auxiliary_loss_mlp": 0.01054692, "balance_loss_clip": 1.06128979, "balance_loss_mlp": 1.03781247, "epoch": 0.08152467985330367, "flos": 24390132819840.0, "grad_norm": 2.1412723963772313, "language_loss": 0.85266638, "learning_rate": 3.972275424680493e-06, "loss": 0.87537408, "num_input_tokens_seen": 14356375, "step": 678, "time_per_iteration": 2.7333953380584717 }, { "auxiliary_loss_clip": 0.01263896, "auxiliary_loss_mlp": 0.01046853, "balance_loss_clip": 1.07470381, "balance_loss_mlp": 1.03009224, "epoch": 0.08164492274394276, "flos": 19317750750720.0, "grad_norm": 2.2831784306579923, "language_loss": 0.9179101, "learning_rate": 3.972146021149673e-06, "loss": 0.94101757, "num_input_tokens_seen": 14374650, "step": 679, "time_per_iteration": 2.583218812942505 }, { "auxiliary_loss_clip": 0.01207387, "auxiliary_loss_mlp": 0.01039992, "balance_loss_clip": 1.06636238, "balance_loss_mlp": 1.02488792, "epoch": 0.08176516563458186, "flos": 14830461319680.0, "grad_norm": 2.202038195832464, "language_loss": 0.78313553, "learning_rate": 3.972016318445868e-06, "loss": 0.80560935, "num_input_tokens_seen": 14392650, "step": 680, "time_per_iteration": 2.708268880844116 }, { "auxiliary_loss_clip": 0.01242284, "auxiliary_loss_mlp": 0.01045473, "balance_loss_clip": 1.06937814, "balance_loss_mlp": 1.02939177, "epoch": 0.08188540852522094, "flos": 22602320161920.0, "grad_norm": 1.9132208100680745, "language_loss": 0.92064404, "learning_rate": 3.971886316588757e-06, "loss": 0.94352162, "num_input_tokens_seen": 14413155, "step": 681, "time_per_iteration": 2.6842217445373535 }, { "auxiliary_loss_clip": 0.01200287, "auxiliary_loss_mlp": 0.01054207, "balance_loss_clip": 1.06682038, "balance_loss_mlp": 1.03668356, "epoch": 0.08200565141586004, "flos": 19463727623040.0, "grad_norm": 2.382998244137404, "language_loss": 0.73368037, "learning_rate": 3.9717560155980595e-06, "loss": 0.75622535, "num_input_tokens_seen": 14428805, "step": 682, "time_per_iteration": 2.704754114151001 }, { "auxiliary_loss_clip": 0.01241914, "auxiliary_loss_mlp": 0.01044631, "balance_loss_clip": 1.06950164, "balance_loss_mlp": 1.0284307, "epoch": 0.08212589430649912, "flos": 20594662312320.0, "grad_norm": 2.1412529161496168, "language_loss": 0.92226136, "learning_rate": 3.971625415493542e-06, "loss": 0.94512677, "num_input_tokens_seen": 14447125, "step": 683, "time_per_iteration": 2.63287353515625 }, { "auxiliary_loss_clip": 0.01204733, "auxiliary_loss_mlp": 0.01049771, "balance_loss_clip": 1.06534088, "balance_loss_mlp": 1.03383279, "epoch": 0.08224613719713822, "flos": 25953611086080.0, "grad_norm": 1.8549500099728176, "language_loss": 0.87337953, "learning_rate": 3.971494516295017e-06, "loss": 0.89592463, "num_input_tokens_seen": 14466575, "step": 684, "time_per_iteration": 2.7637996673583984 }, { "auxiliary_loss_clip": 0.01207783, "auxiliary_loss_mlp": 0.01055649, "balance_loss_clip": 1.06327295, "balance_loss_mlp": 1.03975809, "epoch": 0.08236638008777732, "flos": 23768734510080.0, "grad_norm": 2.2660212198585534, "language_loss": 0.85290515, "learning_rate": 3.971363318022341e-06, "loss": 0.87553948, "num_input_tokens_seen": 14487915, "step": 685, "time_per_iteration": 2.7566356658935547 }, { "auxiliary_loss_clip": 0.0122383, "auxiliary_loss_mlp": 0.01050862, "balance_loss_clip": 1.06355369, "balance_loss_mlp": 1.03423214, "epoch": 0.0824866229784164, "flos": 38799144887040.0, "grad_norm": 1.7593767995059655, "language_loss": 0.68213356, "learning_rate": 3.971231820695417e-06, "loss": 0.70488042, "num_input_tokens_seen": 14511530, "step": 686, "time_per_iteration": 2.8136510848999023 }, { "auxiliary_loss_clip": 0.01232699, "auxiliary_loss_mlp": 0.0105444, "balance_loss_clip": 1.07015038, "balance_loss_mlp": 1.03798914, "epoch": 0.0826068658690555, "flos": 23107762391040.0, "grad_norm": 3.317842692299832, "language_loss": 0.81414056, "learning_rate": 3.971100024334193e-06, "loss": 0.83701193, "num_input_tokens_seen": 14529050, "step": 687, "time_per_iteration": 2.7093007564544678 }, { "auxiliary_loss_clip": 0.01192366, "auxiliary_loss_mlp": 0.01045074, "balance_loss_clip": 1.06197262, "balance_loss_mlp": 1.02963662, "epoch": 0.08272710875969458, "flos": 21136374299520.0, "grad_norm": 2.375544003630478, "language_loss": 0.86455321, "learning_rate": 3.970967928958663e-06, "loss": 0.88692766, "num_input_tokens_seen": 14546165, "step": 688, "time_per_iteration": 2.724705457687378 }, { "auxiliary_loss_clip": 0.01194826, "auxiliary_loss_mlp": 0.01046395, "balance_loss_clip": 1.06331909, "balance_loss_mlp": 1.03094566, "epoch": 0.08284735165033368, "flos": 19063000517760.0, "grad_norm": 1.9816071740443046, "language_loss": 0.83471656, "learning_rate": 3.970835534588865e-06, "loss": 0.85712874, "num_input_tokens_seen": 14563660, "step": 689, "time_per_iteration": 2.870885133743286 }, { "auxiliary_loss_clip": 0.01229273, "auxiliary_loss_mlp": 0.01056919, "balance_loss_clip": 1.07206917, "balance_loss_mlp": 1.04089737, "epoch": 0.08296759454097276, "flos": 16727442387840.0, "grad_norm": 1.8563064947222667, "language_loss": 0.85415769, "learning_rate": 3.970702841244883e-06, "loss": 0.87701964, "num_input_tokens_seen": 14581980, "step": 690, "time_per_iteration": 2.6562461853027344 }, { "auxiliary_loss_clip": 0.01247865, "auxiliary_loss_mlp": 0.01050914, "balance_loss_clip": 1.07393205, "balance_loss_mlp": 1.0343082, "epoch": 0.08308783743161186, "flos": 18004928567040.0, "grad_norm": 1.8358671440294425, "language_loss": 0.82587421, "learning_rate": 3.970569848946847e-06, "loss": 0.84886205, "num_input_tokens_seen": 14601795, "step": 691, "time_per_iteration": 2.6774282455444336 }, { "auxiliary_loss_clip": 0.01228079, "auxiliary_loss_mlp": 0.01045364, "balance_loss_clip": 1.06812501, "balance_loss_mlp": 1.02880621, "epoch": 0.08320808032225095, "flos": 15079788599040.0, "grad_norm": 3.7818226959810763, "language_loss": 0.83271378, "learning_rate": 3.970436557714932e-06, "loss": 0.85544825, "num_input_tokens_seen": 14618315, "step": 692, "time_per_iteration": 2.6305885314941406 }, { "auxiliary_loss_clip": 0.01221805, "auxiliary_loss_mlp": 0.01059411, "balance_loss_clip": 1.06806016, "balance_loss_mlp": 1.04255497, "epoch": 0.08332832321289003, "flos": 22383085501440.0, "grad_norm": 2.2059588078887296, "language_loss": 0.86702937, "learning_rate": 3.970302967569358e-06, "loss": 0.8898415, "num_input_tokens_seen": 14636905, "step": 693, "time_per_iteration": 2.698521137237549 }, { "auxiliary_loss_clip": 0.01243001, "auxiliary_loss_mlp": 0.01040804, "balance_loss_clip": 1.07293129, "balance_loss_mlp": 1.02354252, "epoch": 0.08344856610352913, "flos": 24717386655360.0, "grad_norm": 2.432240161492277, "language_loss": 0.68346024, "learning_rate": 3.9701690785303896e-06, "loss": 0.70629823, "num_input_tokens_seen": 14656100, "step": 694, "time_per_iteration": 2.679431676864624 }, { "auxiliary_loss_clip": 0.01242926, "auxiliary_loss_mlp": 0.01051511, "balance_loss_clip": 1.06895387, "balance_loss_mlp": 1.03453565, "epoch": 0.08356880899416821, "flos": 25370206387200.0, "grad_norm": 2.086242561733085, "language_loss": 0.88279814, "learning_rate": 3.970034890618339e-06, "loss": 0.90574253, "num_input_tokens_seen": 14675790, "step": 695, "time_per_iteration": 3.646544933319092 }, { "auxiliary_loss_clip": 0.01226624, "auxiliary_loss_mlp": 0.01047315, "balance_loss_clip": 1.06581211, "balance_loss_mlp": 1.03142464, "epoch": 0.08368905188480731, "flos": 24353072962560.0, "grad_norm": 2.109097736172267, "language_loss": 0.88393211, "learning_rate": 3.969900403853562e-06, "loss": 0.90667146, "num_input_tokens_seen": 14694830, "step": 696, "time_per_iteration": 2.8122472763061523 }, { "auxiliary_loss_clip": 0.01259152, "auxiliary_loss_mlp": 0.01054496, "balance_loss_clip": 1.07290184, "balance_loss_mlp": 1.03825951, "epoch": 0.08380929477544641, "flos": 18037319656320.0, "grad_norm": 1.6926859172802118, "language_loss": 0.78183675, "learning_rate": 3.96976561825646e-06, "loss": 0.80497319, "num_input_tokens_seen": 14711920, "step": 697, "time_per_iteration": 3.5174002647399902 }, { "auxiliary_loss_clip": 0.01197146, "auxiliary_loss_mlp": 0.0105132, "balance_loss_clip": 1.06792915, "balance_loss_mlp": 1.03505945, "epoch": 0.08392953766608549, "flos": 26286287875200.0, "grad_norm": 1.8261253828370636, "language_loss": 0.87132978, "learning_rate": 3.969630533847479e-06, "loss": 0.89381444, "num_input_tokens_seen": 14730880, "step": 698, "time_per_iteration": 2.835599899291992 }, { "auxiliary_loss_clip": 0.01241254, "auxiliary_loss_mlp": 0.01042853, "balance_loss_clip": 1.06904221, "balance_loss_mlp": 1.02705789, "epoch": 0.08404978055672459, "flos": 22492146170880.0, "grad_norm": 2.1600484177122445, "language_loss": 0.84329998, "learning_rate": 3.969495150647113e-06, "loss": 0.86614108, "num_input_tokens_seen": 14749050, "step": 699, "time_per_iteration": 2.7862651348114014 }, { "auxiliary_loss_clip": 0.01207678, "auxiliary_loss_mlp": 0.01045354, "balance_loss_clip": 1.07114875, "balance_loss_mlp": 1.02933216, "epoch": 0.08417002344736367, "flos": 24826878288000.0, "grad_norm": 1.852851370379316, "language_loss": 0.76572496, "learning_rate": 3.969359468675899e-06, "loss": 0.78825521, "num_input_tokens_seen": 14769180, "step": 700, "time_per_iteration": 2.778244733810425 }, { "auxiliary_loss_clip": 0.01242732, "auxiliary_loss_mlp": 0.01055488, "balance_loss_clip": 1.07431781, "balance_loss_mlp": 1.04013407, "epoch": 0.08429026633800277, "flos": 16945922862720.0, "grad_norm": 2.507031606890189, "language_loss": 0.89308459, "learning_rate": 3.969223487954418e-06, "loss": 0.91606677, "num_input_tokens_seen": 14786640, "step": 701, "time_per_iteration": 2.679119110107422 }, { "auxiliary_loss_clip": 0.01190672, "auxiliary_loss_mlp": 0.01060961, "balance_loss_clip": 1.06804085, "balance_loss_mlp": 1.04505873, "epoch": 0.08441050922864185, "flos": 23841920471040.0, "grad_norm": 5.073883234211682, "language_loss": 0.82214391, "learning_rate": 3.969087208503301e-06, "loss": 0.84466028, "num_input_tokens_seen": 14806720, "step": 702, "time_per_iteration": 2.7128443717956543 }, { "auxiliary_loss_clip": 0.01191872, "auxiliary_loss_mlp": 0.01054714, "balance_loss_clip": 1.06503105, "balance_loss_mlp": 1.03721404, "epoch": 0.08453075211928095, "flos": 25520205582720.0, "grad_norm": 3.9525024849222707, "language_loss": 0.84506178, "learning_rate": 3.968950630343219e-06, "loss": 0.8675276, "num_input_tokens_seen": 14823705, "step": 703, "time_per_iteration": 2.729703187942505 }, { "auxiliary_loss_clip": 0.01216826, "auxiliary_loss_mlp": 0.01041832, "balance_loss_clip": 1.06340849, "balance_loss_mlp": 1.02601254, "epoch": 0.08465099500992004, "flos": 19532496211200.0, "grad_norm": 2.002200142842333, "language_loss": 0.93552649, "learning_rate": 3.968813753494892e-06, "loss": 0.95811307, "num_input_tokens_seen": 14841865, "step": 704, "time_per_iteration": 2.7150304317474365 }, { "auxiliary_loss_clip": 0.01196213, "auxiliary_loss_mlp": 0.00714518, "balance_loss_clip": 1.06338787, "balance_loss_mlp": 0.9998765, "epoch": 0.08477123790055913, "flos": 29351299403520.0, "grad_norm": 2.431950052755458, "language_loss": 0.75555021, "learning_rate": 3.968676577979084e-06, "loss": 0.77465749, "num_input_tokens_seen": 14861415, "step": 705, "time_per_iteration": 2.7708098888397217 }, { "auxiliary_loss_clip": 0.01189358, "auxiliary_loss_mlp": 0.01053917, "balance_loss_clip": 1.06376171, "balance_loss_mlp": 1.03703737, "epoch": 0.08489148079119822, "flos": 18624495283200.0, "grad_norm": 2.0971832394819527, "language_loss": 0.77842605, "learning_rate": 3.968539103816605e-06, "loss": 0.80085886, "num_input_tokens_seen": 14879215, "step": 706, "time_per_iteration": 2.743467330932617 }, { "auxiliary_loss_clip": 0.01223585, "auxiliary_loss_mlp": 0.00714278, "balance_loss_clip": 1.06972706, "balance_loss_mlp": 0.9998883, "epoch": 0.0850117236818373, "flos": 23471393725440.0, "grad_norm": 2.070414006973879, "language_loss": 0.89144278, "learning_rate": 3.9684013310283085e-06, "loss": 0.91082138, "num_input_tokens_seen": 14897900, "step": 707, "time_per_iteration": 2.7231340408325195 }, { "auxiliary_loss_clip": 0.01226167, "auxiliary_loss_mlp": 0.010473, "balance_loss_clip": 1.07449245, "balance_loss_mlp": 1.03202951, "epoch": 0.0851319665724764, "flos": 40625058896640.0, "grad_norm": 1.897379077213272, "language_loss": 0.64111596, "learning_rate": 3.9682632596350956e-06, "loss": 0.66385067, "num_input_tokens_seen": 14919065, "step": 708, "time_per_iteration": 2.79115629196167 }, { "auxiliary_loss_clip": 0.01240966, "auxiliary_loss_mlp": 0.01040785, "balance_loss_clip": 1.07243156, "balance_loss_mlp": 1.024966, "epoch": 0.0852522094631155, "flos": 15879554870400.0, "grad_norm": 2.011177225818738, "language_loss": 0.78514379, "learning_rate": 3.968124889657911e-06, "loss": 0.80796134, "num_input_tokens_seen": 14934165, "step": 709, "time_per_iteration": 2.6500754356384277 }, { "auxiliary_loss_clip": 0.01186576, "auxiliary_loss_mlp": 0.01047112, "balance_loss_clip": 1.06128335, "balance_loss_mlp": 1.03144836, "epoch": 0.08537245235375458, "flos": 14567091822720.0, "grad_norm": 2.293816344309342, "language_loss": 0.90515953, "learning_rate": 3.967986221117746e-06, "loss": 0.92749643, "num_input_tokens_seen": 14950105, "step": 710, "time_per_iteration": 2.7201101779937744 }, { "auxiliary_loss_clip": 0.01162688, "auxiliary_loss_mlp": 0.01051428, "balance_loss_clip": 1.06067729, "balance_loss_mlp": 1.0363245, "epoch": 0.08549269524439368, "flos": 26468929555200.0, "grad_norm": 2.4263018673297005, "language_loss": 0.86392421, "learning_rate": 3.967847254035635e-06, "loss": 0.88606536, "num_input_tokens_seen": 14969490, "step": 711, "time_per_iteration": 2.9515247344970703 }, { "auxiliary_loss_clip": 0.01211336, "auxiliary_loss_mlp": 0.01045117, "balance_loss_clip": 1.06735063, "balance_loss_mlp": 1.02832007, "epoch": 0.08561293813503276, "flos": 13590214565760.0, "grad_norm": 3.992493932531268, "language_loss": 0.86710602, "learning_rate": 3.967707988432661e-06, "loss": 0.88967061, "num_input_tokens_seen": 14987195, "step": 712, "time_per_iteration": 2.985285997390747 }, { "auxiliary_loss_clip": 0.01257565, "auxiliary_loss_mlp": 0.01052296, "balance_loss_clip": 1.07045221, "balance_loss_mlp": 1.03546333, "epoch": 0.08573318102567186, "flos": 26943524979840.0, "grad_norm": 2.3330592479771184, "language_loss": 0.87955618, "learning_rate": 3.967568424329949e-06, "loss": 0.90265477, "num_input_tokens_seen": 15007620, "step": 713, "time_per_iteration": 2.652465581893921 }, { "auxiliary_loss_clip": 0.01141897, "auxiliary_loss_mlp": 0.01011015, "balance_loss_clip": 1.05077612, "balance_loss_mlp": 1.00267017, "epoch": 0.08585342391631094, "flos": 67302739319040.0, "grad_norm": 0.8164374358260755, "language_loss": 0.55500352, "learning_rate": 3.967428561748671e-06, "loss": 0.57653272, "num_input_tokens_seen": 15075590, "step": 714, "time_per_iteration": 3.382974147796631 }, { "auxiliary_loss_clip": 0.01181068, "auxiliary_loss_mlp": 0.01058269, "balance_loss_clip": 1.060534, "balance_loss_mlp": 1.04051852, "epoch": 0.08597366680695004, "flos": 22456594684800.0, "grad_norm": 2.14630799360798, "language_loss": 0.87496126, "learning_rate": 3.967288400710045e-06, "loss": 0.8973546, "num_input_tokens_seen": 15095055, "step": 715, "time_per_iteration": 2.733536720275879 }, { "auxiliary_loss_clip": 0.01206236, "auxiliary_loss_mlp": 0.01058038, "balance_loss_clip": 1.07081938, "balance_loss_mlp": 1.04195678, "epoch": 0.08609390969758914, "flos": 23550505430400.0, "grad_norm": 1.8956864780967915, "language_loss": 0.88632977, "learning_rate": 3.9671479412353335e-06, "loss": 0.9089725, "num_input_tokens_seen": 15113520, "step": 716, "time_per_iteration": 2.7371950149536133 }, { "auxiliary_loss_clip": 0.0123725, "auxiliary_loss_mlp": 0.01048814, "balance_loss_clip": 1.06977963, "balance_loss_mlp": 1.03182721, "epoch": 0.08621415258822822, "flos": 25885848078720.0, "grad_norm": 2.1635723983291895, "language_loss": 0.73898244, "learning_rate": 3.967007183345843e-06, "loss": 0.76184309, "num_input_tokens_seen": 15133375, "step": 717, "time_per_iteration": 2.736846446990967 }, { "auxiliary_loss_clip": 0.01238859, "auxiliary_loss_mlp": 0.0105428, "balance_loss_clip": 1.0716846, "balance_loss_mlp": 1.03766274, "epoch": 0.08633439547886732, "flos": 13589568120960.0, "grad_norm": 4.6358822015667975, "language_loss": 0.89362526, "learning_rate": 3.966866127062927e-06, "loss": 0.91655672, "num_input_tokens_seen": 15150500, "step": 718, "time_per_iteration": 2.6249403953552246 }, { "auxiliary_loss_clip": 0.0115843, "auxiliary_loss_mlp": 0.01008418, "balance_loss_clip": 1.06392086, "balance_loss_mlp": 1.00040746, "epoch": 0.0864546383695064, "flos": 57767342434560.0, "grad_norm": 0.9022124067565789, "language_loss": 0.62683237, "learning_rate": 3.966724772407982e-06, "loss": 0.6485008, "num_input_tokens_seen": 15208015, "step": 719, "time_per_iteration": 3.074467182159424 }, { "auxiliary_loss_clip": 0.01195578, "auxiliary_loss_mlp": 0.01050961, "balance_loss_clip": 1.06615567, "balance_loss_mlp": 1.03522587, "epoch": 0.0865748812601455, "flos": 20046952753920.0, "grad_norm": 2.4020271974177323, "language_loss": 0.88745862, "learning_rate": 3.966583119402454e-06, "loss": 0.90992403, "num_input_tokens_seen": 15224780, "step": 720, "time_per_iteration": 2.6679155826568604 }, { "auxiliary_loss_clip": 0.01238389, "auxiliary_loss_mlp": 0.007146, "balance_loss_clip": 1.07055163, "balance_loss_mlp": 0.99984342, "epoch": 0.08669512415078459, "flos": 35262446935680.0, "grad_norm": 1.7244737900178941, "language_loss": 0.82165009, "learning_rate": 3.9664411680678305e-06, "loss": 0.84117997, "num_input_tokens_seen": 15246535, "step": 721, "time_per_iteration": 3.8212661743164062 }, { "auxiliary_loss_clip": 0.01125603, "auxiliary_loss_mlp": 0.0101048, "balance_loss_clip": 1.05783606, "balance_loss_mlp": 1.00223064, "epoch": 0.08681536704142367, "flos": 65654870048640.0, "grad_norm": 0.8441980831159953, "language_loss": 0.61486912, "learning_rate": 3.966298918425644e-06, "loss": 0.63622993, "num_input_tokens_seen": 15304025, "step": 722, "time_per_iteration": 3.152812957763672 }, { "auxiliary_loss_clip": 0.01247145, "auxiliary_loss_mlp": 0.0105148, "balance_loss_clip": 1.07236981, "balance_loss_mlp": 1.03473115, "epoch": 0.08693560993206277, "flos": 34529940881280.0, "grad_norm": 1.9062627175991322, "language_loss": 0.82625896, "learning_rate": 3.966156370497476e-06, "loss": 0.84924519, "num_input_tokens_seen": 15327635, "step": 723, "time_per_iteration": 4.5646891593933105 }, { "auxiliary_loss_clip": 0.01247268, "auxiliary_loss_mlp": 0.01051548, "balance_loss_clip": 1.07241654, "balance_loss_mlp": 1.0350014, "epoch": 0.08705585282270185, "flos": 23149419189120.0, "grad_norm": 1.963716598189467, "language_loss": 0.88679743, "learning_rate": 3.96601352430495e-06, "loss": 0.90978563, "num_input_tokens_seen": 15347405, "step": 724, "time_per_iteration": 2.68388295173645 }, { "auxiliary_loss_clip": 0.01224819, "auxiliary_loss_mlp": 0.01050582, "balance_loss_clip": 1.07167816, "balance_loss_mlp": 1.03475094, "epoch": 0.08717609571334095, "flos": 29497599498240.0, "grad_norm": 1.5072667557484887, "language_loss": 0.83242881, "learning_rate": 3.965870379869735e-06, "loss": 0.85518277, "num_input_tokens_seen": 15369450, "step": 725, "time_per_iteration": 2.747403144836426 }, { "auxiliary_loss_clip": 0.01237446, "auxiliary_loss_mlp": 0.01056544, "balance_loss_clip": 1.06582713, "balance_loss_mlp": 1.04087961, "epoch": 0.08729633860398003, "flos": 20667489137280.0, "grad_norm": 2.2501685199182875, "language_loss": 0.87006223, "learning_rate": 3.965726937213547e-06, "loss": 0.89300215, "num_input_tokens_seen": 15388085, "step": 726, "time_per_iteration": 2.6077756881713867 }, { "auxiliary_loss_clip": 0.01236462, "auxiliary_loss_mlp": 0.01045889, "balance_loss_clip": 1.06682849, "balance_loss_mlp": 1.02894938, "epoch": 0.08741658149461913, "flos": 18369493655040.0, "grad_norm": 2.925847768342367, "language_loss": 0.81145132, "learning_rate": 3.965583196358144e-06, "loss": 0.83427483, "num_input_tokens_seen": 15407120, "step": 727, "time_per_iteration": 2.6763761043548584 }, { "auxiliary_loss_clip": 0.01261698, "auxiliary_loss_mlp": 0.01046343, "balance_loss_clip": 1.07247877, "balance_loss_mlp": 1.0300231, "epoch": 0.08753682438525823, "flos": 18729677283840.0, "grad_norm": 2.3433054521020016, "language_loss": 0.74139899, "learning_rate": 3.965439157325335e-06, "loss": 0.76447934, "num_input_tokens_seen": 15424485, "step": 728, "time_per_iteration": 2.5851361751556396 }, { "auxiliary_loss_clip": 0.01216688, "auxiliary_loss_mlp": 0.01053164, "balance_loss_clip": 1.06403577, "balance_loss_mlp": 1.03637922, "epoch": 0.08765706727589731, "flos": 27776113303680.0, "grad_norm": 1.8646233964210999, "language_loss": 0.75608087, "learning_rate": 3.965294820136968e-06, "loss": 0.77877939, "num_input_tokens_seen": 15446285, "step": 729, "time_per_iteration": 2.8003809452056885 }, { "auxiliary_loss_clip": 0.01224817, "auxiliary_loss_mlp": 0.0105273, "balance_loss_clip": 1.07075703, "balance_loss_mlp": 1.03425217, "epoch": 0.08777731016653641, "flos": 24389127239040.0, "grad_norm": 2.1848709964214974, "language_loss": 0.86896658, "learning_rate": 3.965150184814938e-06, "loss": 0.89174199, "num_input_tokens_seen": 15465770, "step": 730, "time_per_iteration": 2.68776535987854 }, { "auxiliary_loss_clip": 0.01208518, "auxiliary_loss_mlp": 0.01052611, "balance_loss_clip": 1.06482553, "balance_loss_mlp": 1.03706563, "epoch": 0.08789755305717549, "flos": 21981855605760.0, "grad_norm": 2.0288284603247875, "language_loss": 0.76492453, "learning_rate": 3.965005251381189e-06, "loss": 0.78753585, "num_input_tokens_seen": 15483705, "step": 731, "time_per_iteration": 2.6981189250946045 }, { "auxiliary_loss_clip": 0.01157644, "auxiliary_loss_mlp": 0.01010423, "balance_loss_clip": 1.05934048, "balance_loss_mlp": 1.00260246, "epoch": 0.08801779594781459, "flos": 58360120583040.0, "grad_norm": 0.8990286029061032, "language_loss": 0.64635915, "learning_rate": 3.964860019857705e-06, "loss": 0.6680398, "num_input_tokens_seen": 15548620, "step": 732, "time_per_iteration": 3.242870569229126 }, { "auxiliary_loss_clip": 0.01261246, "auxiliary_loss_mlp": 0.01055656, "balance_loss_clip": 1.07510376, "balance_loss_mlp": 1.03982472, "epoch": 0.08813803883845367, "flos": 23294785530240.0, "grad_norm": 1.7963958165089806, "language_loss": 0.84102923, "learning_rate": 3.964714490266518e-06, "loss": 0.86419821, "num_input_tokens_seen": 15569265, "step": 733, "time_per_iteration": 2.646322250366211 }, { "auxiliary_loss_clip": 0.01145058, "auxiliary_loss_mlp": 0.01008891, "balance_loss_clip": 1.05424833, "balance_loss_mlp": 1.0011189, "epoch": 0.08825828172909277, "flos": 63424924882560.0, "grad_norm": 0.8834340494193248, "language_loss": 0.64646375, "learning_rate": 3.964568662629706e-06, "loss": 0.66800332, "num_input_tokens_seen": 15630570, "step": 734, "time_per_iteration": 3.1093809604644775 }, { "auxiliary_loss_clip": 0.01235672, "auxiliary_loss_mlp": 0.01044483, "balance_loss_clip": 1.06492972, "balance_loss_mlp": 1.02791321, "epoch": 0.08837852461973186, "flos": 26720986268160.0, "grad_norm": 2.623061424215799, "language_loss": 0.84601188, "learning_rate": 3.9644225369693895e-06, "loss": 0.86881346, "num_input_tokens_seen": 15650870, "step": 735, "time_per_iteration": 2.6396267414093018 }, { "auxiliary_loss_clip": 0.01256181, "auxiliary_loss_mlp": 0.01049614, "balance_loss_clip": 1.07169306, "balance_loss_mlp": 1.03362775, "epoch": 0.08849876751037095, "flos": 27265427688960.0, "grad_norm": 2.0881861833175193, "language_loss": 0.86737275, "learning_rate": 3.964276113307735e-06, "loss": 0.89043069, "num_input_tokens_seen": 15670835, "step": 736, "time_per_iteration": 2.6414082050323486 }, { "auxiliary_loss_clip": 0.01200679, "auxiliary_loss_mlp": 0.01046393, "balance_loss_clip": 1.06663251, "balance_loss_mlp": 1.02938199, "epoch": 0.08861901040101004, "flos": 19828759587840.0, "grad_norm": 1.9654314518570204, "language_loss": 0.80554104, "learning_rate": 3.9641293916669574e-06, "loss": 0.82801175, "num_input_tokens_seen": 15689795, "step": 737, "time_per_iteration": 2.6837844848632812 }, { "auxiliary_loss_clip": 0.01198469, "auxiliary_loss_mlp": 0.01051091, "balance_loss_clip": 1.06463873, "balance_loss_mlp": 1.03428292, "epoch": 0.08873925329164913, "flos": 23658704173440.0, "grad_norm": 1.721249805665902, "language_loss": 0.82509953, "learning_rate": 3.9639823720693115e-06, "loss": 0.84759521, "num_input_tokens_seen": 15711650, "step": 738, "time_per_iteration": 2.757049798965454 }, { "auxiliary_loss_clip": 0.01118941, "auxiliary_loss_mlp": 0.01017321, "balance_loss_clip": 1.05458474, "balance_loss_mlp": 1.00959671, "epoch": 0.08885949618228822, "flos": 71831541893760.0, "grad_norm": 0.8450202383348147, "language_loss": 0.60017407, "learning_rate": 3.963835054537102e-06, "loss": 0.62153667, "num_input_tokens_seen": 15780615, "step": 739, "time_per_iteration": 3.3126132488250732 }, { "auxiliary_loss_clip": 0.01219285, "auxiliary_loss_mlp": 0.01043316, "balance_loss_clip": 1.06448829, "balance_loss_mlp": 1.02723515, "epoch": 0.08897973907292732, "flos": 22346169298560.0, "grad_norm": 2.188910968804085, "language_loss": 0.60703546, "learning_rate": 3.963687439092676e-06, "loss": 0.62966144, "num_input_tokens_seen": 15801300, "step": 740, "time_per_iteration": 2.698420524597168 }, { "auxiliary_loss_clip": 0.01240089, "auxiliary_loss_mlp": 0.01051929, "balance_loss_clip": 1.07086325, "balance_loss_mlp": 1.0350852, "epoch": 0.0890999819635664, "flos": 21251827589760.0, "grad_norm": 2.1517025596601314, "language_loss": 0.80801451, "learning_rate": 3.963539525758427e-06, "loss": 0.83093464, "num_input_tokens_seen": 15820860, "step": 741, "time_per_iteration": 2.6962087154388428 }, { "auxiliary_loss_clip": 0.01225886, "auxiliary_loss_mlp": 0.01043692, "balance_loss_clip": 1.06729054, "balance_loss_mlp": 1.02755141, "epoch": 0.0892202248542055, "flos": 25370888745600.0, "grad_norm": 1.9378502777935123, "language_loss": 0.67338431, "learning_rate": 3.9633913145567925e-06, "loss": 0.69608009, "num_input_tokens_seen": 15841350, "step": 742, "time_per_iteration": 2.696507215499878 }, { "auxiliary_loss_clip": 0.01224134, "auxiliary_loss_mlp": 0.01047377, "balance_loss_clip": 1.06819105, "balance_loss_mlp": 1.0318439, "epoch": 0.08934046774484458, "flos": 24457895827200.0, "grad_norm": 1.8648132952334215, "language_loss": 0.81466615, "learning_rate": 3.9632428055102575e-06, "loss": 0.83738124, "num_input_tokens_seen": 15861360, "step": 743, "time_per_iteration": 2.709775686264038 }, { "auxiliary_loss_clip": 0.01243713, "auxiliary_loss_mlp": 0.01049128, "balance_loss_clip": 1.07274151, "balance_loss_mlp": 1.03261721, "epoch": 0.08946071063548368, "flos": 35772773414400.0, "grad_norm": 8.447286707417645, "language_loss": 0.67129862, "learning_rate": 3.9630939986413495e-06, "loss": 0.69422698, "num_input_tokens_seen": 15883160, "step": 744, "time_per_iteration": 2.7282426357269287 }, { "auxiliary_loss_clip": 0.01187166, "auxiliary_loss_mlp": 0.01055602, "balance_loss_clip": 1.06173563, "balance_loss_mlp": 1.03800702, "epoch": 0.08958095352612276, "flos": 14356584167040.0, "grad_norm": 2.0160397290947816, "language_loss": 0.7815969, "learning_rate": 3.962944893972643e-06, "loss": 0.80402458, "num_input_tokens_seen": 15901610, "step": 745, "time_per_iteration": 2.7062184810638428 }, { "auxiliary_loss_clip": 0.0122065, "auxiliary_loss_mlp": 0.01047543, "balance_loss_clip": 1.06866932, "balance_loss_mlp": 1.03091323, "epoch": 0.08970119641676186, "flos": 17853277345920.0, "grad_norm": 2.6141050776586616, "language_loss": 0.90956259, "learning_rate": 3.962795491526756e-06, "loss": 0.93224454, "num_input_tokens_seen": 15918770, "step": 746, "time_per_iteration": 3.5585176944732666 }, { "auxiliary_loss_clip": 0.01264088, "auxiliary_loss_mlp": 0.01056539, "balance_loss_clip": 1.07449818, "balance_loss_mlp": 1.03984988, "epoch": 0.08982143930740095, "flos": 20811670329600.0, "grad_norm": 2.4092990969832697, "language_loss": 0.89259058, "learning_rate": 3.962645791326354e-06, "loss": 0.91579688, "num_input_tokens_seen": 15938025, "step": 747, "time_per_iteration": 2.616835117340088 }, { "auxiliary_loss_clip": 0.01239692, "auxiliary_loss_mlp": 0.01043324, "balance_loss_clip": 1.07231939, "balance_loss_mlp": 1.02839947, "epoch": 0.08994168219804004, "flos": 24097712198400.0, "grad_norm": 2.30734744213115, "language_loss": 0.83472764, "learning_rate": 3.962495793394146e-06, "loss": 0.85755783, "num_input_tokens_seen": 15957215, "step": 748, "time_per_iteration": 2.6061198711395264 }, { "auxiliary_loss_clip": 0.01160919, "auxiliary_loss_mlp": 0.01017117, "balance_loss_clip": 1.05333221, "balance_loss_mlp": 1.00972641, "epoch": 0.09006192508867913, "flos": 57188893812480.0, "grad_norm": 0.7451311250540584, "language_loss": 0.6122818, "learning_rate": 3.9623454977528864e-06, "loss": 0.63406217, "num_input_tokens_seen": 16015870, "step": 749, "time_per_iteration": 4.784292221069336 }, { "auxiliary_loss_clip": 0.01206809, "auxiliary_loss_mlp": 0.01048199, "balance_loss_clip": 1.06584096, "balance_loss_mlp": 1.03096104, "epoch": 0.09018216797931822, "flos": 20487505063680.0, "grad_norm": 1.744213175336853, "language_loss": 0.85056508, "learning_rate": 3.962194904425375e-06, "loss": 0.87311518, "num_input_tokens_seen": 16036500, "step": 750, "time_per_iteration": 2.727876901626587 }, { "auxiliary_loss_clip": 0.01236024, "auxiliary_loss_mlp": 0.01059387, "balance_loss_clip": 1.068892, "balance_loss_mlp": 1.04276907, "epoch": 0.09030241086995731, "flos": 22638123043200.0, "grad_norm": 2.282609589995523, "language_loss": 0.68014801, "learning_rate": 3.9620440134344566e-06, "loss": 0.70310211, "num_input_tokens_seen": 16054655, "step": 751, "time_per_iteration": 2.6190149784088135 }, { "auxiliary_loss_clip": 0.01202508, "auxiliary_loss_mlp": 0.0105332, "balance_loss_clip": 1.06531894, "balance_loss_mlp": 1.03649962, "epoch": 0.09042265376059641, "flos": 21871502046720.0, "grad_norm": 2.1344272873352184, "language_loss": 0.82219267, "learning_rate": 3.9618928248030215e-06, "loss": 0.84475088, "num_input_tokens_seen": 16074165, "step": 752, "time_per_iteration": 2.6911158561706543 }, { "auxiliary_loss_clip": 0.01238376, "auxiliary_loss_mlp": 0.01052342, "balance_loss_clip": 1.07103753, "balance_loss_mlp": 1.03624856, "epoch": 0.0905428966512355, "flos": 24316192673280.0, "grad_norm": 2.510134266709899, "language_loss": 0.82931012, "learning_rate": 3.961741338554005e-06, "loss": 0.85221732, "num_input_tokens_seen": 16092505, "step": 753, "time_per_iteration": 2.6627390384674072 }, { "auxiliary_loss_clip": 0.01228007, "auxiliary_loss_mlp": 0.01039602, "balance_loss_clip": 1.06853116, "balance_loss_mlp": 1.0236038, "epoch": 0.09066313954187459, "flos": 35845061535360.0, "grad_norm": 1.8793185989552534, "language_loss": 0.7584039, "learning_rate": 3.9615895547103865e-06, "loss": 0.78108001, "num_input_tokens_seen": 16116150, "step": 754, "time_per_iteration": 2.7629592418670654 }, { "auxiliary_loss_clip": 0.01219636, "auxiliary_loss_mlp": 0.0105886, "balance_loss_clip": 1.06601381, "balance_loss_mlp": 1.04293406, "epoch": 0.09078338243251367, "flos": 29168729550720.0, "grad_norm": 3.43065678574105, "language_loss": 0.77603817, "learning_rate": 3.961437473295193e-06, "loss": 0.79882318, "num_input_tokens_seen": 16136295, "step": 755, "time_per_iteration": 2.7564713954925537 }, { "auxiliary_loss_clip": 0.01171073, "auxiliary_loss_mlp": 0.0105248, "balance_loss_clip": 1.05738235, "balance_loss_mlp": 1.03682745, "epoch": 0.09090362532315277, "flos": 21907699977600.0, "grad_norm": 2.8728005728843757, "language_loss": 0.72167516, "learning_rate": 3.961285094331495e-06, "loss": 0.74391067, "num_input_tokens_seen": 16154210, "step": 756, "time_per_iteration": 2.716503143310547 }, { "auxiliary_loss_clip": 0.01253758, "auxiliary_loss_mlp": 0.01056317, "balance_loss_clip": 1.07019627, "balance_loss_mlp": 1.04139245, "epoch": 0.09102386821379185, "flos": 27344503480320.0, "grad_norm": 1.7435399034382657, "language_loss": 0.85805303, "learning_rate": 3.961132417842406e-06, "loss": 0.88115382, "num_input_tokens_seen": 16173995, "step": 757, "time_per_iteration": 2.6810572147369385 }, { "auxiliary_loss_clip": 0.01225514, "auxiliary_loss_mlp": 0.01048205, "balance_loss_clip": 1.06448317, "balance_loss_mlp": 1.032601, "epoch": 0.09114411110443095, "flos": 20813501923200.0, "grad_norm": 18.73957179446939, "language_loss": 0.75184262, "learning_rate": 3.960979443851089e-06, "loss": 0.77457976, "num_input_tokens_seen": 16191020, "step": 758, "time_per_iteration": 2.627135753631592 }, { "auxiliary_loss_clip": 0.01218544, "auxiliary_loss_mlp": 0.01057725, "balance_loss_clip": 1.0682863, "balance_loss_mlp": 1.040869, "epoch": 0.09126435399507005, "flos": 26145949438080.0, "grad_norm": 1.9519909607734471, "language_loss": 0.79106998, "learning_rate": 3.96082617238075e-06, "loss": 0.8138327, "num_input_tokens_seen": 16213645, "step": 759, "time_per_iteration": 2.769521474838257 }, { "auxiliary_loss_clip": 0.01223034, "auxiliary_loss_mlp": 0.0104244, "balance_loss_clip": 1.06856787, "balance_loss_mlp": 1.02693081, "epoch": 0.09138459688570913, "flos": 24388911757440.0, "grad_norm": 2.348040170932174, "language_loss": 0.79784989, "learning_rate": 3.960672603454639e-06, "loss": 0.82050467, "num_input_tokens_seen": 16233625, "step": 760, "time_per_iteration": 2.641439199447632 }, { "auxiliary_loss_clip": 0.0122782, "auxiliary_loss_mlp": 0.0105314, "balance_loss_clip": 1.0652765, "balance_loss_mlp": 1.03733277, "epoch": 0.09150483977634823, "flos": 21032664756480.0, "grad_norm": 2.765742696353934, "language_loss": 0.76631743, "learning_rate": 3.960518737096054e-06, "loss": 0.78912705, "num_input_tokens_seen": 16253255, "step": 761, "time_per_iteration": 2.6601412296295166 }, { "auxiliary_loss_clip": 0.01240479, "auxiliary_loss_mlp": 0.01055236, "balance_loss_clip": 1.07014358, "balance_loss_mlp": 1.03914237, "epoch": 0.09162508266698731, "flos": 22856998567680.0, "grad_norm": 2.6685539334552555, "language_loss": 0.72502738, "learning_rate": 3.960364573328334e-06, "loss": 0.74798453, "num_input_tokens_seen": 16272580, "step": 762, "time_per_iteration": 2.6459479331970215 }, { "auxiliary_loss_clip": 0.01207459, "auxiliary_loss_mlp": 0.01059318, "balance_loss_clip": 1.06631768, "balance_loss_mlp": 1.04243827, "epoch": 0.0917453255576264, "flos": 21724411852800.0, "grad_norm": 1.9231517834988916, "language_loss": 0.88891685, "learning_rate": 3.9602101121748675e-06, "loss": 0.91158462, "num_input_tokens_seen": 16293075, "step": 763, "time_per_iteration": 2.712756395339966 }, { "auxiliary_loss_clip": 0.01222795, "auxiliary_loss_mlp": 0.01051775, "balance_loss_clip": 1.06964028, "balance_loss_mlp": 1.03602743, "epoch": 0.0918655684482655, "flos": 14609215497600.0, "grad_norm": 1.989686699165389, "language_loss": 0.72598332, "learning_rate": 3.960055353659085e-06, "loss": 0.74872905, "num_input_tokens_seen": 16310185, "step": 764, "time_per_iteration": 2.613924980163574 }, { "auxiliary_loss_clip": 0.01206655, "auxiliary_loss_mlp": 0.01052936, "balance_loss_clip": 1.06597662, "balance_loss_mlp": 1.0380702, "epoch": 0.09198581133890459, "flos": 23435016226560.0, "grad_norm": 5.384462390614872, "language_loss": 0.83596444, "learning_rate": 3.959900297804465e-06, "loss": 0.85856026, "num_input_tokens_seen": 16330355, "step": 765, "time_per_iteration": 2.722010612487793 }, { "auxiliary_loss_clip": 0.0120913, "auxiliary_loss_mlp": 0.01052666, "balance_loss_clip": 1.06726921, "balance_loss_mlp": 1.03715694, "epoch": 0.09210605422954368, "flos": 16795887753600.0, "grad_norm": 1.8741220233079774, "language_loss": 0.77773404, "learning_rate": 3.9597449446345276e-06, "loss": 0.80035198, "num_input_tokens_seen": 16347600, "step": 766, "time_per_iteration": 2.6653244495391846 }, { "auxiliary_loss_clip": 0.01205877, "auxiliary_loss_mlp": 0.01055153, "balance_loss_clip": 1.06404066, "balance_loss_mlp": 1.03969145, "epoch": 0.09222629712018277, "flos": 22674249146880.0, "grad_norm": 3.146830249704327, "language_loss": 0.83374357, "learning_rate": 3.95958929417284e-06, "loss": 0.85635382, "num_input_tokens_seen": 16365755, "step": 767, "time_per_iteration": 2.767510175704956 }, { "auxiliary_loss_clip": 0.01154596, "auxiliary_loss_mlp": 0.01026256, "balance_loss_clip": 1.0596621, "balance_loss_mlp": 1.01929414, "epoch": 0.09234654001082186, "flos": 69976756327680.0, "grad_norm": 0.7354770309032078, "language_loss": 0.58814979, "learning_rate": 3.9594333464430145e-06, "loss": 0.60995829, "num_input_tokens_seen": 16435245, "step": 768, "time_per_iteration": 3.367357015609741 }, { "auxiliary_loss_clip": 0.01137448, "auxiliary_loss_mlp": 0.01050821, "balance_loss_clip": 1.05422056, "balance_loss_mlp": 1.03541899, "epoch": 0.09246678290146094, "flos": 20011437181440.0, "grad_norm": 1.881512305928972, "language_loss": 0.88015723, "learning_rate": 3.959277101468709e-06, "loss": 0.90203995, "num_input_tokens_seen": 16454795, "step": 769, "time_per_iteration": 2.740063190460205 }, { "auxiliary_loss_clip": 0.01202815, "auxiliary_loss_mlp": 0.01053372, "balance_loss_clip": 1.06109047, "balance_loss_mlp": 1.03738594, "epoch": 0.09258702579210004, "flos": 17747448900480.0, "grad_norm": 2.485637213392057, "language_loss": 0.7873807, "learning_rate": 3.959120559273624e-06, "loss": 0.8099426, "num_input_tokens_seen": 16472580, "step": 770, "time_per_iteration": 2.7098166942596436 }, { "auxiliary_loss_clip": 0.01208616, "auxiliary_loss_mlp": 0.01050351, "balance_loss_clip": 1.06809783, "balance_loss_mlp": 1.03552127, "epoch": 0.09270726868273914, "flos": 20886544229760.0, "grad_norm": 1.7640075284517658, "language_loss": 0.83722389, "learning_rate": 3.958963719881509e-06, "loss": 0.85981351, "num_input_tokens_seen": 16490670, "step": 771, "time_per_iteration": 2.7106785774230957 }, { "auxiliary_loss_clip": 0.01239782, "auxiliary_loss_mlp": 0.01044117, "balance_loss_clip": 1.07285738, "balance_loss_mlp": 1.02921605, "epoch": 0.09282751157337822, "flos": 17015697031680.0, "grad_norm": 1.9137543535961141, "language_loss": 0.93888974, "learning_rate": 3.958806583316154e-06, "loss": 0.96172875, "num_input_tokens_seen": 16508640, "step": 772, "time_per_iteration": 3.593646287918091 }, { "auxiliary_loss_clip": 0.01257646, "auxiliary_loss_mlp": 0.01044509, "balance_loss_clip": 1.07337832, "balance_loss_mlp": 1.02854645, "epoch": 0.09294775446401732, "flos": 32523647748480.0, "grad_norm": 1.97223752312442, "language_loss": 0.78759146, "learning_rate": 3.9586491496013985e-06, "loss": 0.81061298, "num_input_tokens_seen": 16531035, "step": 773, "time_per_iteration": 2.7700631618499756 }, { "auxiliary_loss_clip": 0.01243657, "auxiliary_loss_mlp": 0.01049977, "balance_loss_clip": 1.07257271, "balance_loss_mlp": 1.03397918, "epoch": 0.0930679973546564, "flos": 18259750627200.0, "grad_norm": 9.110094607455565, "language_loss": 0.83454776, "learning_rate": 3.958491418761124e-06, "loss": 0.8574841, "num_input_tokens_seen": 16548605, "step": 774, "time_per_iteration": 2.604419469833374 }, { "auxiliary_loss_clip": 0.01220316, "auxiliary_loss_mlp": 0.01047699, "balance_loss_clip": 1.06437063, "balance_loss_mlp": 1.032583, "epoch": 0.0931882402452955, "flos": 21099745405440.0, "grad_norm": 2.364945243195901, "language_loss": 0.72546685, "learning_rate": 3.958333390819258e-06, "loss": 0.74814701, "num_input_tokens_seen": 16565535, "step": 775, "time_per_iteration": 3.5847253799438477 }, { "auxiliary_loss_clip": 0.01255756, "auxiliary_loss_mlp": 0.01048263, "balance_loss_clip": 1.07242703, "balance_loss_mlp": 1.03288507, "epoch": 0.0933084831359346, "flos": 24207275658240.0, "grad_norm": 1.9829356157651719, "language_loss": 0.79933429, "learning_rate": 3.9581750657997754e-06, "loss": 0.82237446, "num_input_tokens_seen": 16584900, "step": 776, "time_per_iteration": 2.6343836784362793 }, { "auxiliary_loss_clip": 0.0122101, "auxiliary_loss_mlp": 0.01042422, "balance_loss_clip": 1.06831074, "balance_loss_mlp": 1.02691245, "epoch": 0.09342872602657368, "flos": 25480272637440.0, "grad_norm": 1.6826425019993039, "language_loss": 0.89532399, "learning_rate": 3.95801644372669e-06, "loss": 0.91795838, "num_input_tokens_seen": 16604805, "step": 777, "time_per_iteration": 2.681795835494995 }, { "auxiliary_loss_clip": 0.01226743, "auxiliary_loss_mlp": 0.01038168, "balance_loss_clip": 1.0654819, "balance_loss_mlp": 1.02323079, "epoch": 0.09354896891721277, "flos": 23149060053120.0, "grad_norm": 1.7891620370570331, "language_loss": 0.84446961, "learning_rate": 3.957857524624068e-06, "loss": 0.86711872, "num_input_tokens_seen": 16623685, "step": 778, "time_per_iteration": 2.666395425796509 }, { "auxiliary_loss_clip": 0.01220721, "auxiliary_loss_mlp": 0.01041936, "balance_loss_clip": 1.06727242, "balance_loss_mlp": 1.02728474, "epoch": 0.09366921180785186, "flos": 24279563779200.0, "grad_norm": 1.596543313233079, "language_loss": 0.89802063, "learning_rate": 3.957698308516016e-06, "loss": 0.92064714, "num_input_tokens_seen": 16644985, "step": 779, "time_per_iteration": 2.687624216079712 }, { "auxiliary_loss_clip": 0.01240067, "auxiliary_loss_mlp": 0.00713578, "balance_loss_clip": 1.07597661, "balance_loss_mlp": 0.99989796, "epoch": 0.09378945469849095, "flos": 18730036419840.0, "grad_norm": 1.9227713296565891, "language_loss": 0.82284969, "learning_rate": 3.957538795426688e-06, "loss": 0.84238613, "num_input_tokens_seen": 16662410, "step": 780, "time_per_iteration": 2.6278574466705322 }, { "auxiliary_loss_clip": 0.01223604, "auxiliary_loss_mlp": 0.01055981, "balance_loss_clip": 1.06834543, "balance_loss_mlp": 1.03976834, "epoch": 0.09390969758913004, "flos": 23218834222080.0, "grad_norm": 4.989759451316171, "language_loss": 0.7722528, "learning_rate": 3.9573789853802804e-06, "loss": 0.79504865, "num_input_tokens_seen": 16680885, "step": 781, "time_per_iteration": 2.651291847229004 }, { "auxiliary_loss_clip": 0.01220714, "auxiliary_loss_mlp": 0.00714433, "balance_loss_clip": 1.07039773, "balance_loss_mlp": 0.99989146, "epoch": 0.09402994047976913, "flos": 19646728439040.0, "grad_norm": 2.1118575413185874, "language_loss": 0.74649251, "learning_rate": 3.957218878401037e-06, "loss": 0.76584399, "num_input_tokens_seen": 16699375, "step": 782, "time_per_iteration": 2.680172920227051 }, { "auxiliary_loss_clip": 0.01260612, "auxiliary_loss_mlp": 0.01060383, "balance_loss_clip": 1.0769906, "balance_loss_mlp": 1.04402733, "epoch": 0.09415018337040823, "flos": 29420463041280.0, "grad_norm": 1.7416466329416596, "language_loss": 0.89294314, "learning_rate": 3.957058474513246e-06, "loss": 0.91615307, "num_input_tokens_seen": 16719230, "step": 783, "time_per_iteration": 2.643765687942505 }, { "auxiliary_loss_clip": 0.01241982, "auxiliary_loss_mlp": 0.01047173, "balance_loss_clip": 1.07669663, "balance_loss_mlp": 1.03209305, "epoch": 0.09427042626104731, "flos": 24572092141440.0, "grad_norm": 1.7437520112740101, "language_loss": 0.7851519, "learning_rate": 3.956897773741241e-06, "loss": 0.80804348, "num_input_tokens_seen": 16738220, "step": 784, "time_per_iteration": 2.700108289718628 }, { "auxiliary_loss_clip": 0.01208325, "auxiliary_loss_mlp": 0.01046879, "balance_loss_clip": 1.06589866, "balance_loss_mlp": 1.03173923, "epoch": 0.09439066915168641, "flos": 26359581576960.0, "grad_norm": 2.2477054718665506, "language_loss": 0.71762186, "learning_rate": 3.956736776109398e-06, "loss": 0.74017388, "num_input_tokens_seen": 16759395, "step": 785, "time_per_iteration": 2.6773362159729004 }, { "auxiliary_loss_clip": 0.01226452, "auxiliary_loss_mlp": 0.00714634, "balance_loss_clip": 1.06567311, "balance_loss_mlp": 0.99996436, "epoch": 0.09451091204232549, "flos": 19427278296960.0, "grad_norm": 2.1286266804944693, "language_loss": 0.83799106, "learning_rate": 3.956575481642143e-06, "loss": 0.85740197, "num_input_tokens_seen": 16778285, "step": 786, "time_per_iteration": 2.6234402656555176 }, { "auxiliary_loss_clip": 0.01179785, "auxiliary_loss_mlp": 0.01040835, "balance_loss_clip": 1.058882, "balance_loss_mlp": 1.02519429, "epoch": 0.09463115493296459, "flos": 25368051571200.0, "grad_norm": 2.656165057641689, "language_loss": 0.75306594, "learning_rate": 3.956413890363943e-06, "loss": 0.77527219, "num_input_tokens_seen": 16795265, "step": 787, "time_per_iteration": 2.746894359588623 }, { "auxiliary_loss_clip": 0.01237638, "auxiliary_loss_mlp": 0.01049642, "balance_loss_clip": 1.07234311, "balance_loss_mlp": 1.03402591, "epoch": 0.09475139782360369, "flos": 10123254869760.0, "grad_norm": 2.077899316146802, "language_loss": 0.81746739, "learning_rate": 3.956252002299312e-06, "loss": 0.8403402, "num_input_tokens_seen": 16811165, "step": 788, "time_per_iteration": 2.6954190731048584 }, { "auxiliary_loss_clip": 0.01252481, "auxiliary_loss_mlp": 0.01052914, "balance_loss_clip": 1.07128, "balance_loss_mlp": 1.03790605, "epoch": 0.09487164071424277, "flos": 17231088936960.0, "grad_norm": 1.9387798082490797, "language_loss": 0.90715396, "learning_rate": 3.956089817472807e-06, "loss": 0.93020791, "num_input_tokens_seen": 16828470, "step": 789, "time_per_iteration": 2.5767886638641357 }, { "auxiliary_loss_clip": 0.01218131, "auxiliary_loss_mlp": 0.01048469, "balance_loss_clip": 1.06988347, "balance_loss_mlp": 1.03223276, "epoch": 0.09499188360488187, "flos": 30849564528000.0, "grad_norm": 2.1670887278386237, "language_loss": 0.85735708, "learning_rate": 3.955927335909032e-06, "loss": 0.88002312, "num_input_tokens_seen": 16851680, "step": 790, "time_per_iteration": 2.7275969982147217 }, { "auxiliary_loss_clip": 0.01182648, "auxiliary_loss_mlp": 0.01038795, "balance_loss_clip": 1.06628203, "balance_loss_mlp": 1.02450156, "epoch": 0.09511212649552095, "flos": 29351694453120.0, "grad_norm": 3.4427253693290387, "language_loss": 0.76102006, "learning_rate": 3.955764557632634e-06, "loss": 0.78323454, "num_input_tokens_seen": 16871490, "step": 791, "time_per_iteration": 2.7520382404327393 }, { "auxiliary_loss_clip": 0.01213855, "auxiliary_loss_mlp": 0.01047663, "balance_loss_clip": 1.06682944, "balance_loss_mlp": 1.03223753, "epoch": 0.09523236938616005, "flos": 10378687461120.0, "grad_norm": 2.230463551890388, "language_loss": 0.94709605, "learning_rate": 3.955601482668309e-06, "loss": 0.96971124, "num_input_tokens_seen": 16889350, "step": 792, "time_per_iteration": 2.6514668464660645 }, { "auxiliary_loss_clip": 0.01176104, "auxiliary_loss_mlp": 0.01051538, "balance_loss_clip": 1.05789781, "balance_loss_mlp": 1.03583765, "epoch": 0.09535261227679913, "flos": 19061815368960.0, "grad_norm": 1.8840723006037576, "language_loss": 0.8867358, "learning_rate": 3.955438111040794e-06, "loss": 0.9090122, "num_input_tokens_seen": 16907625, "step": 793, "time_per_iteration": 2.7496819496154785 }, { "auxiliary_loss_clip": 0.0117749, "auxiliary_loss_mlp": 0.01043634, "balance_loss_clip": 1.0602771, "balance_loss_mlp": 1.02777898, "epoch": 0.09547285516743823, "flos": 20922993555840.0, "grad_norm": 1.8254335992793953, "language_loss": 0.80463535, "learning_rate": 3.955274442774873e-06, "loss": 0.8268466, "num_input_tokens_seen": 16926205, "step": 794, "time_per_iteration": 2.7239773273468018 }, { "auxiliary_loss_clip": 0.01235617, "auxiliary_loss_mlp": 0.01048109, "balance_loss_clip": 1.06749916, "balance_loss_mlp": 1.0324688, "epoch": 0.09559309805807732, "flos": 30154405639680.0, "grad_norm": 2.320889460954286, "language_loss": 0.70729816, "learning_rate": 3.9551104778953725e-06, "loss": 0.73013544, "num_input_tokens_seen": 16946500, "step": 795, "time_per_iteration": 2.7746353149414062 }, { "auxiliary_loss_clip": 0.01202356, "auxiliary_loss_mlp": 0.01046975, "balance_loss_clip": 1.06487536, "balance_loss_mlp": 1.03173959, "epoch": 0.0957133409487164, "flos": 21066743784960.0, "grad_norm": 1.8859587583426054, "language_loss": 0.85454822, "learning_rate": 3.954946216427167e-06, "loss": 0.87704146, "num_input_tokens_seen": 16966960, "step": 796, "time_per_iteration": 2.7069759368896484 }, { "auxiliary_loss_clip": 0.01103772, "auxiliary_loss_mlp": 0.01013649, "balance_loss_clip": 1.03922439, "balance_loss_mlp": 1.00644875, "epoch": 0.0958335838393555, "flos": 71297979315840.0, "grad_norm": 0.8743006321602907, "language_loss": 0.61611378, "learning_rate": 3.954781658395176e-06, "loss": 0.63728797, "num_input_tokens_seen": 17023215, "step": 797, "time_per_iteration": 3.263685941696167 }, { "auxiliary_loss_clip": 0.01219813, "auxiliary_loss_mlp": 0.01049278, "balance_loss_clip": 1.06333852, "balance_loss_mlp": 1.03251708, "epoch": 0.09595382672999458, "flos": 21872974504320.0, "grad_norm": 2.620010355649088, "language_loss": 0.92112041, "learning_rate": 3.95461680382436e-06, "loss": 0.9438113, "num_input_tokens_seen": 17042140, "step": 798, "time_per_iteration": 3.6520912647247314 }, { "auxiliary_loss_clip": 0.01241441, "auxiliary_loss_mlp": 0.01051242, "balance_loss_clip": 1.07325518, "balance_loss_mlp": 1.03499365, "epoch": 0.09607406962063368, "flos": 18695562341760.0, "grad_norm": 4.60121741504678, "language_loss": 0.86445665, "learning_rate": 3.9544516527397295e-06, "loss": 0.88738346, "num_input_tokens_seen": 17058490, "step": 799, "time_per_iteration": 2.608166456222534 }, { "auxiliary_loss_clip": 0.01206648, "auxiliary_loss_mlp": 0.01048614, "balance_loss_clip": 1.06570435, "balance_loss_mlp": 1.03278255, "epoch": 0.09619431251127276, "flos": 22568456615040.0, "grad_norm": 1.603044248527209, "language_loss": 0.80480814, "learning_rate": 3.954286205166338e-06, "loss": 0.82736075, "num_input_tokens_seen": 17079655, "step": 800, "time_per_iteration": 3.645085573196411 }, { "auxiliary_loss_clip": 0.01243482, "auxiliary_loss_mlp": 0.01048844, "balance_loss_clip": 1.07694554, "balance_loss_mlp": 1.03297758, "epoch": 0.09631455540191186, "flos": 14246230608000.0, "grad_norm": 2.6484456563589776, "language_loss": 0.84208143, "learning_rate": 3.954120461129282e-06, "loss": 0.86500466, "num_input_tokens_seen": 17097065, "step": 801, "time_per_iteration": 3.6022934913635254 }, { "auxiliary_loss_clip": 0.01259214, "auxiliary_loss_mlp": 0.01060902, "balance_loss_clip": 1.07777321, "balance_loss_mlp": 1.04544008, "epoch": 0.09643479829255096, "flos": 20740387789440.0, "grad_norm": 2.1190742774583264, "language_loss": 0.83672464, "learning_rate": 3.953954420653706e-06, "loss": 0.85992575, "num_input_tokens_seen": 17114090, "step": 802, "time_per_iteration": 2.604874849319458 }, { "auxiliary_loss_clip": 0.01235921, "auxiliary_loss_mlp": 0.01048365, "balance_loss_clip": 1.07083011, "balance_loss_mlp": 1.03386879, "epoch": 0.09655504118319004, "flos": 24420476833920.0, "grad_norm": 1.8477871672623514, "language_loss": 0.88069594, "learning_rate": 3.953788083764798e-06, "loss": 0.90353876, "num_input_tokens_seen": 17133325, "step": 803, "time_per_iteration": 2.6868560314178467 }, { "auxiliary_loss_clip": 0.01183199, "auxiliary_loss_mlp": 0.01050917, "balance_loss_clip": 1.06443858, "balance_loss_mlp": 1.03552663, "epoch": 0.09667528407382914, "flos": 18441961344000.0, "grad_norm": 2.8025641258823377, "language_loss": 0.92212403, "learning_rate": 3.953621450487792e-06, "loss": 0.94446516, "num_input_tokens_seen": 17151945, "step": 804, "time_per_iteration": 2.6580753326416016 }, { "auxiliary_loss_clip": 0.0115799, "auxiliary_loss_mlp": 0.01007091, "balance_loss_clip": 1.05252612, "balance_loss_mlp": 1.00012934, "epoch": 0.09679552696446822, "flos": 70816455544320.0, "grad_norm": 0.8380843277977769, "language_loss": 0.61179125, "learning_rate": 3.953454520847964e-06, "loss": 0.63344204, "num_input_tokens_seen": 17216790, "step": 805, "time_per_iteration": 3.3399720191955566 }, { "auxiliary_loss_clip": 0.01214412, "auxiliary_loss_mlp": 0.01046883, "balance_loss_clip": 1.06686318, "balance_loss_mlp": 1.02945411, "epoch": 0.09691576985510732, "flos": 21945514020480.0, "grad_norm": 2.1468857539801434, "language_loss": 0.73975104, "learning_rate": 3.9532872948706395e-06, "loss": 0.76236403, "num_input_tokens_seen": 17236285, "step": 806, "time_per_iteration": 2.6941885948181152 }, { "auxiliary_loss_clip": 0.0122357, "auxiliary_loss_mlp": 0.01046857, "balance_loss_clip": 1.07006943, "balance_loss_mlp": 1.03165758, "epoch": 0.09703601274574641, "flos": 17965211103360.0, "grad_norm": 2.219136406938871, "language_loss": 0.82805777, "learning_rate": 3.9531197725811845e-06, "loss": 0.85076201, "num_input_tokens_seen": 17251670, "step": 807, "time_per_iteration": 2.6154627799987793 }, { "auxiliary_loss_clip": 0.01256109, "auxiliary_loss_mlp": 0.01048525, "balance_loss_clip": 1.07576132, "balance_loss_mlp": 1.03364789, "epoch": 0.0971562556363855, "flos": 22162162901760.0, "grad_norm": 2.211683666871189, "language_loss": 0.8797158, "learning_rate": 3.952951954005013e-06, "loss": 0.90276217, "num_input_tokens_seen": 17271355, "step": 808, "time_per_iteration": 2.6659467220306396 }, { "auxiliary_loss_clip": 0.01214347, "auxiliary_loss_mlp": 0.01045285, "balance_loss_clip": 1.06303859, "balance_loss_mlp": 1.03024101, "epoch": 0.0972764985270246, "flos": 25848716394240.0, "grad_norm": 2.5604542712871794, "language_loss": 0.84753883, "learning_rate": 3.952783839167584e-06, "loss": 0.87013507, "num_input_tokens_seen": 17291400, "step": 809, "time_per_iteration": 2.701968193054199 }, { "auxiliary_loss_clip": 0.012336, "auxiliary_loss_mlp": 0.01054512, "balance_loss_clip": 1.06970024, "balance_loss_mlp": 1.03938472, "epoch": 0.09739674141766368, "flos": 20339373375360.0, "grad_norm": 2.8082727978288005, "language_loss": 0.74214208, "learning_rate": 3.952615428094398e-06, "loss": 0.76502323, "num_input_tokens_seen": 17310920, "step": 810, "time_per_iteration": 2.6117026805877686 }, { "auxiliary_loss_clip": 0.01172831, "auxiliary_loss_mlp": 0.01055691, "balance_loss_clip": 1.05816782, "balance_loss_mlp": 1.04059935, "epoch": 0.09751698430830277, "flos": 15743059188480.0, "grad_norm": 1.7354517483053213, "language_loss": 0.7326737, "learning_rate": 3.952446720811004e-06, "loss": 0.75495893, "num_input_tokens_seen": 17329245, "step": 811, "time_per_iteration": 2.7000038623809814 }, { "auxiliary_loss_clip": 0.01115566, "auxiliary_loss_mlp": 0.01012281, "balance_loss_clip": 1.05155468, "balance_loss_mlp": 1.00531888, "epoch": 0.09763722719894186, "flos": 63716806800000.0, "grad_norm": 0.8380865767952288, "language_loss": 0.63615447, "learning_rate": 3.952277717342995e-06, "loss": 0.65743297, "num_input_tokens_seen": 17395680, "step": 812, "time_per_iteration": 3.3355698585510254 }, { "auxiliary_loss_clip": 0.01224168, "auxiliary_loss_mlp": 0.01046457, "balance_loss_clip": 1.06740808, "balance_loss_mlp": 1.03076935, "epoch": 0.09775747008958095, "flos": 22090916275200.0, "grad_norm": 2.3837496097914004, "language_loss": 0.85790586, "learning_rate": 3.952108417716009e-06, "loss": 0.88061208, "num_input_tokens_seen": 17415135, "step": 813, "time_per_iteration": 2.667525291442871 }, { "auxiliary_loss_clip": 0.01236514, "auxiliary_loss_mlp": 0.01055433, "balance_loss_clip": 1.0730958, "balance_loss_mlp": 1.04020977, "epoch": 0.09787771298022005, "flos": 21286050272640.0, "grad_norm": 1.7492267055752253, "language_loss": 0.85143042, "learning_rate": 3.951938821955727e-06, "loss": 0.87434995, "num_input_tokens_seen": 17434535, "step": 814, "time_per_iteration": 2.7082948684692383 }, { "auxiliary_loss_clip": 0.01215254, "auxiliary_loss_mlp": 0.01055407, "balance_loss_clip": 1.06799722, "balance_loss_mlp": 1.04018426, "epoch": 0.09799795587085913, "flos": 22054574689920.0, "grad_norm": 1.5694368214865704, "language_loss": 0.76511538, "learning_rate": 3.9517689300878786e-06, "loss": 0.78782201, "num_input_tokens_seen": 17454270, "step": 815, "time_per_iteration": 2.6352922916412354 }, { "auxiliary_loss_clip": 0.01251562, "auxiliary_loss_mlp": 0.01050006, "balance_loss_clip": 1.07074177, "balance_loss_mlp": 1.03496182, "epoch": 0.09811819876149823, "flos": 22163743100160.0, "grad_norm": 1.7963582403407847, "language_loss": 0.78797817, "learning_rate": 3.951598742138236e-06, "loss": 0.81099379, "num_input_tokens_seen": 17472995, "step": 816, "time_per_iteration": 2.6582682132720947 }, { "auxiliary_loss_clip": 0.01219769, "auxiliary_loss_mlp": 0.0104672, "balance_loss_clip": 1.06367266, "balance_loss_mlp": 1.03138995, "epoch": 0.09823844165213731, "flos": 22231111057920.0, "grad_norm": 2.1347654690900035, "language_loss": 0.79996347, "learning_rate": 3.951428258132615e-06, "loss": 0.82262838, "num_input_tokens_seen": 17491115, "step": 817, "time_per_iteration": 2.7161386013031006 }, { "auxiliary_loss_clip": 0.01221477, "auxiliary_loss_mlp": 0.01046179, "balance_loss_clip": 1.06971478, "balance_loss_mlp": 1.03120637, "epoch": 0.09835868454277641, "flos": 22487728798080.0, "grad_norm": 2.1834105183293295, "language_loss": 0.84310633, "learning_rate": 3.951257478096879e-06, "loss": 0.86578286, "num_input_tokens_seen": 17509480, "step": 818, "time_per_iteration": 2.715383768081665 }, { "auxiliary_loss_clip": 0.01218573, "auxiliary_loss_mlp": 0.00714672, "balance_loss_clip": 1.06920373, "balance_loss_mlp": 1.00001347, "epoch": 0.0984789274334155, "flos": 16362554077440.0, "grad_norm": 2.5458347469185214, "language_loss": 0.68629575, "learning_rate": 3.951086402056936e-06, "loss": 0.70562822, "num_input_tokens_seen": 17524080, "step": 819, "time_per_iteration": 2.6467444896698 }, { "auxiliary_loss_clip": 0.01146084, "auxiliary_loss_mlp": 0.00714678, "balance_loss_clip": 1.0646323, "balance_loss_mlp": 1.00009108, "epoch": 0.09859917032405459, "flos": 24243545416320.0, "grad_norm": 1.8570704850808313, "language_loss": 0.83675611, "learning_rate": 3.950915030038735e-06, "loss": 0.85536373, "num_input_tokens_seen": 17543875, "step": 820, "time_per_iteration": 3.0671427249908447 }, { "auxiliary_loss_clip": 0.01235101, "auxiliary_loss_mlp": 0.01039656, "balance_loss_clip": 1.07372594, "balance_loss_mlp": 1.02573204, "epoch": 0.09871941321469369, "flos": 17420195064960.0, "grad_norm": 2.106370749413999, "language_loss": 0.83851987, "learning_rate": 3.9507433620682765e-06, "loss": 0.86126745, "num_input_tokens_seen": 17560810, "step": 821, "time_per_iteration": 2.7968015670776367 }, { "auxiliary_loss_clip": 0.01193313, "auxiliary_loss_mlp": 0.01047155, "balance_loss_clip": 1.06063795, "balance_loss_mlp": 1.03192031, "epoch": 0.09883965610533277, "flos": 28477341590400.0, "grad_norm": 1.7110833667222165, "language_loss": 0.88120973, "learning_rate": 3.9505713981716e-06, "loss": 0.9036144, "num_input_tokens_seen": 17583640, "step": 822, "time_per_iteration": 2.776087999343872 }, { "auxiliary_loss_clip": 0.01221091, "auxiliary_loss_mlp": 0.01037885, "balance_loss_clip": 1.07195747, "balance_loss_mlp": 1.0233655, "epoch": 0.09895989899597187, "flos": 23693932437120.0, "grad_norm": 1.8941624561056394, "language_loss": 0.81174254, "learning_rate": 3.950399138374795e-06, "loss": 0.83433229, "num_input_tokens_seen": 17602720, "step": 823, "time_per_iteration": 2.778473377227783 }, { "auxiliary_loss_clip": 0.01231449, "auxiliary_loss_mlp": 0.01046534, "balance_loss_clip": 1.07008767, "balance_loss_mlp": 1.03005886, "epoch": 0.09908014188661095, "flos": 24679608526080.0, "grad_norm": 1.8753989909260596, "language_loss": 0.74322909, "learning_rate": 3.95022658270399e-06, "loss": 0.76600897, "num_input_tokens_seen": 17623085, "step": 824, "time_per_iteration": 4.397713661193848 }, { "auxiliary_loss_clip": 0.01213571, "auxiliary_loss_mlp": 0.01046895, "balance_loss_clip": 1.0698632, "balance_loss_mlp": 1.03118336, "epoch": 0.09920038477725004, "flos": 14064307200000.0, "grad_norm": 1.8114554421390745, "language_loss": 0.78328037, "learning_rate": 3.9500537311853635e-06, "loss": 0.80588508, "num_input_tokens_seen": 17641040, "step": 825, "time_per_iteration": 2.607325553894043 }, { "auxiliary_loss_clip": 0.01231656, "auxiliary_loss_mlp": 0.01039744, "balance_loss_clip": 1.06518292, "balance_loss_mlp": 1.02374578, "epoch": 0.09932062766788914, "flos": 13407070095360.0, "grad_norm": 2.1857477033341692, "language_loss": 0.83258092, "learning_rate": 3.949880583845136e-06, "loss": 0.85529494, "num_input_tokens_seen": 17659115, "step": 826, "time_per_iteration": 3.569153308868408 }, { "auxiliary_loss_clip": 0.01216035, "auxiliary_loss_mlp": 0.01042983, "balance_loss_clip": 1.06840324, "balance_loss_mlp": 1.02804613, "epoch": 0.09944087055852822, "flos": 19500751566720.0, "grad_norm": 2.0448762586567977, "language_loss": 0.8126148, "learning_rate": 3.949707140709575e-06, "loss": 0.83520496, "num_input_tokens_seen": 17678845, "step": 827, "time_per_iteration": 3.6285672187805176 }, { "auxiliary_loss_clip": 0.01236389, "auxiliary_loss_mlp": 0.01045689, "balance_loss_clip": 1.06792378, "balance_loss_mlp": 1.03079939, "epoch": 0.09956111344916732, "flos": 17749100926080.0, "grad_norm": 2.25891427709759, "language_loss": 0.83109617, "learning_rate": 3.949533401804991e-06, "loss": 0.853917, "num_input_tokens_seen": 17695750, "step": 828, "time_per_iteration": 2.6186683177948 }, { "auxiliary_loss_clip": 0.01229579, "auxiliary_loss_mlp": 0.00714456, "balance_loss_clip": 1.069242, "balance_loss_mlp": 1.00001109, "epoch": 0.0996813563398064, "flos": 17967581400960.0, "grad_norm": 2.1476205724422317, "language_loss": 0.90382177, "learning_rate": 3.949359367157739e-06, "loss": 0.92326212, "num_input_tokens_seen": 17714445, "step": 829, "time_per_iteration": 2.6758077144622803 }, { "auxiliary_loss_clip": 0.01237753, "auxiliary_loss_mlp": 0.01046258, "balance_loss_clip": 1.0698266, "balance_loss_mlp": 1.03134453, "epoch": 0.0998015992304455, "flos": 17457039440640.0, "grad_norm": 1.9766231931931826, "language_loss": 0.75679791, "learning_rate": 3.949185036794222e-06, "loss": 0.77963793, "num_input_tokens_seen": 17732455, "step": 830, "time_per_iteration": 2.6230390071868896 }, { "auxiliary_loss_clip": 0.0125105, "auxiliary_loss_mlp": 0.01048191, "balance_loss_clip": 1.07269192, "balance_loss_mlp": 1.03311086, "epoch": 0.0999218421210846, "flos": 25888757080320.0, "grad_norm": 1.797726401155187, "language_loss": 0.78762841, "learning_rate": 3.949010410740884e-06, "loss": 0.81062078, "num_input_tokens_seen": 17755280, "step": 831, "time_per_iteration": 2.7024619579315186 }, { "auxiliary_loss_clip": 0.01205182, "auxiliary_loss_mlp": 0.00715287, "balance_loss_clip": 1.06368935, "balance_loss_mlp": 1.00000024, "epoch": 0.10004208501172368, "flos": 21215916967680.0, "grad_norm": 1.683914062932591, "language_loss": 0.8639847, "learning_rate": 3.948835489024216e-06, "loss": 0.88318932, "num_input_tokens_seen": 17775015, "step": 832, "time_per_iteration": 2.6595685482025146 }, { "auxiliary_loss_clip": 0.01236218, "auxiliary_loss_mlp": 0.01036857, "balance_loss_clip": 1.06841278, "balance_loss_mlp": 1.0223608, "epoch": 0.10016232790236278, "flos": 17348409734400.0, "grad_norm": 2.026561971187501, "language_loss": 0.90557557, "learning_rate": 3.948660271670755e-06, "loss": 0.92830634, "num_input_tokens_seen": 17792165, "step": 833, "time_per_iteration": 2.643115520477295 }, { "auxiliary_loss_clip": 0.01212685, "auxiliary_loss_mlp": 0.01042803, "balance_loss_clip": 1.06713772, "balance_loss_mlp": 1.0270673, "epoch": 0.10028257079300186, "flos": 25666541591040.0, "grad_norm": 2.1931839127343515, "language_loss": 0.84265578, "learning_rate": 3.948484758707079e-06, "loss": 0.86521059, "num_input_tokens_seen": 17811765, "step": 834, "time_per_iteration": 2.6845390796661377 }, { "auxiliary_loss_clip": 0.01188678, "auxiliary_loss_mlp": 0.01047102, "balance_loss_clip": 1.06131315, "balance_loss_mlp": 1.03198624, "epoch": 0.10040281368364096, "flos": 25156035544320.0, "grad_norm": 2.5695264971686256, "language_loss": 0.83429158, "learning_rate": 3.948308950159815e-06, "loss": 0.85664934, "num_input_tokens_seen": 17830445, "step": 835, "time_per_iteration": 2.7535552978515625 }, { "auxiliary_loss_clip": 0.0119188, "auxiliary_loss_mlp": 0.01052628, "balance_loss_clip": 1.05995584, "balance_loss_mlp": 1.03741717, "epoch": 0.10052305657428004, "flos": 17603303621760.0, "grad_norm": 2.205548212000605, "language_loss": 0.76044327, "learning_rate": 3.9481328460556326e-06, "loss": 0.78288829, "num_input_tokens_seen": 17847665, "step": 836, "time_per_iteration": 2.6644554138183594 }, { "auxiliary_loss_clip": 0.01204739, "auxiliary_loss_mlp": 0.01053363, "balance_loss_clip": 1.06235003, "balance_loss_mlp": 1.03779411, "epoch": 0.10064329946491914, "flos": 18660154510080.0, "grad_norm": 2.0920612721188125, "language_loss": 0.89878857, "learning_rate": 3.9479564464212455e-06, "loss": 0.92136955, "num_input_tokens_seen": 17866825, "step": 837, "time_per_iteration": 2.6668217182159424 }, { "auxiliary_loss_clip": 0.01252871, "auxiliary_loss_mlp": 0.01041953, "balance_loss_clip": 1.06769454, "balance_loss_mlp": 1.02621794, "epoch": 0.10076354235555823, "flos": 17199056983680.0, "grad_norm": 2.1432355474868743, "language_loss": 0.76056606, "learning_rate": 3.947779751283414e-06, "loss": 0.78351426, "num_input_tokens_seen": 17883995, "step": 838, "time_per_iteration": 2.558335304260254 }, { "auxiliary_loss_clip": 0.01237031, "auxiliary_loss_mlp": 0.00714294, "balance_loss_clip": 1.07489467, "balance_loss_mlp": 1.00004888, "epoch": 0.10088378524619732, "flos": 22962252395520.0, "grad_norm": 1.744967399415046, "language_loss": 0.76036894, "learning_rate": 3.947602760668944e-06, "loss": 0.77988219, "num_input_tokens_seen": 17903785, "step": 839, "time_per_iteration": 2.6533026695251465 }, { "auxiliary_loss_clip": 0.01235863, "auxiliary_loss_mlp": 0.01048767, "balance_loss_clip": 1.07151496, "balance_loss_mlp": 1.03406835, "epoch": 0.10100402813683641, "flos": 37885828746240.0, "grad_norm": 1.9522385177313666, "language_loss": 0.71520734, "learning_rate": 3.947425474604684e-06, "loss": 0.73805362, "num_input_tokens_seen": 17927720, "step": 840, "time_per_iteration": 2.8311684131622314 }, { "auxiliary_loss_clip": 0.01216346, "auxiliary_loss_mlp": 0.01044508, "balance_loss_clip": 1.06675565, "balance_loss_mlp": 1.02994084, "epoch": 0.1011242710274755, "flos": 21543458112000.0, "grad_norm": 2.1104625640928663, "language_loss": 0.92479241, "learning_rate": 3.947247893117528e-06, "loss": 0.94740093, "num_input_tokens_seen": 17946225, "step": 841, "time_per_iteration": 2.6943519115448 }, { "auxiliary_loss_clip": 0.0122407, "auxiliary_loss_mlp": 0.01052986, "balance_loss_clip": 1.06493723, "balance_loss_mlp": 1.03752482, "epoch": 0.10124451391811459, "flos": 13621456419840.0, "grad_norm": 2.9891052334658172, "language_loss": 0.6900574, "learning_rate": 3.947070016234413e-06, "loss": 0.71282792, "num_input_tokens_seen": 17962015, "step": 842, "time_per_iteration": 2.6431915760040283 }, { "auxiliary_loss_clip": 0.01226847, "auxiliary_loss_mlp": 0.01043813, "balance_loss_clip": 1.06996393, "balance_loss_mlp": 1.0282805, "epoch": 0.10136475680875369, "flos": 16649228522880.0, "grad_norm": 2.409078033041965, "language_loss": 0.74917269, "learning_rate": 3.946891843982326e-06, "loss": 0.77187937, "num_input_tokens_seen": 17979680, "step": 843, "time_per_iteration": 2.704580068588257 }, { "auxiliary_loss_clip": 0.01236066, "auxiliary_loss_mlp": 0.0105284, "balance_loss_clip": 1.07200086, "balance_loss_mlp": 1.03598368, "epoch": 0.10148499969939277, "flos": 19461034103040.0, "grad_norm": 2.18516682173834, "language_loss": 0.74336189, "learning_rate": 3.9467133763882935e-06, "loss": 0.76625097, "num_input_tokens_seen": 17998145, "step": 844, "time_per_iteration": 2.6930670738220215 }, { "auxiliary_loss_clip": 0.01224406, "auxiliary_loss_mlp": 0.01052642, "balance_loss_clip": 1.06783271, "balance_loss_mlp": 1.03728831, "epoch": 0.10160524259003187, "flos": 21104988791040.0, "grad_norm": 2.02124388956038, "language_loss": 0.86214912, "learning_rate": 3.9465346134793905e-06, "loss": 0.88491964, "num_input_tokens_seen": 18017955, "step": 845, "time_per_iteration": 2.710181951522827 }, { "auxiliary_loss_clip": 0.0120169, "auxiliary_loss_mlp": 0.01045534, "balance_loss_clip": 1.06757665, "balance_loss_mlp": 1.03110933, "epoch": 0.10172548548067095, "flos": 17712687513600.0, "grad_norm": 1.9889867812531556, "language_loss": 0.79463387, "learning_rate": 3.9463555552827335e-06, "loss": 0.81710607, "num_input_tokens_seen": 18035125, "step": 846, "time_per_iteration": 2.7755086421966553 }, { "auxiliary_loss_clip": 0.01219929, "auxiliary_loss_mlp": 0.01059191, "balance_loss_clip": 1.06508458, "balance_loss_mlp": 1.04343152, "epoch": 0.10184572837131005, "flos": 21104845136640.0, "grad_norm": 2.866567167905748, "language_loss": 0.86381817, "learning_rate": 3.946176201825487e-06, "loss": 0.88660938, "num_input_tokens_seen": 18053160, "step": 847, "time_per_iteration": 2.636404514312744 }, { "auxiliary_loss_clip": 0.0121562, "auxiliary_loss_mlp": 0.01046111, "balance_loss_clip": 1.06887388, "balance_loss_mlp": 1.03053069, "epoch": 0.10196597126194913, "flos": 26067591918720.0, "grad_norm": 2.3922133209067473, "language_loss": 0.83874983, "learning_rate": 3.9459965531348575e-06, "loss": 0.86136711, "num_input_tokens_seen": 18072815, "step": 848, "time_per_iteration": 2.7974143028259277 }, { "auxiliary_loss_clip": 0.01217838, "auxiliary_loss_mlp": 0.00715102, "balance_loss_clip": 1.06845772, "balance_loss_mlp": 1.00005698, "epoch": 0.10208621415258823, "flos": 29314634595840.0, "grad_norm": 2.066849394719345, "language_loss": 0.8566457, "learning_rate": 3.945816609238098e-06, "loss": 0.87597513, "num_input_tokens_seen": 18092225, "step": 849, "time_per_iteration": 2.7683072090148926 }, { "auxiliary_loss_clip": 0.01176951, "auxiliary_loss_mlp": 0.01044052, "balance_loss_clip": 1.06305432, "balance_loss_mlp": 1.02851927, "epoch": 0.10220645704322733, "flos": 23805794367360.0, "grad_norm": 2.8534445625949827, "language_loss": 0.85249537, "learning_rate": 3.945636370162507e-06, "loss": 0.87470543, "num_input_tokens_seen": 18112335, "step": 850, "time_per_iteration": 4.580850839614868 }, { "auxiliary_loss_clip": 0.01232147, "auxiliary_loss_mlp": 0.01050924, "balance_loss_clip": 1.06877732, "balance_loss_mlp": 1.03664267, "epoch": 0.10232669993386641, "flos": 23218546913280.0, "grad_norm": 1.8243233856840773, "language_loss": 0.79284322, "learning_rate": 3.945455835935425e-06, "loss": 0.81567395, "num_input_tokens_seen": 18131520, "step": 851, "time_per_iteration": 2.7116763591766357 }, { "auxiliary_loss_clip": 0.01216017, "auxiliary_loss_mlp": 0.01052344, "balance_loss_clip": 1.06616926, "balance_loss_mlp": 1.03656101, "epoch": 0.1024469428245055, "flos": 22922929981440.0, "grad_norm": 2.607578358352996, "language_loss": 0.75553161, "learning_rate": 3.94527500658424e-06, "loss": 0.77821517, "num_input_tokens_seen": 18149185, "step": 852, "time_per_iteration": 2.7379679679870605 }, { "auxiliary_loss_clip": 0.01177747, "auxiliary_loss_mlp": 0.01047702, "balance_loss_clip": 1.06302106, "balance_loss_mlp": 1.03258657, "epoch": 0.10256718571514459, "flos": 31359495957120.0, "grad_norm": 4.382325697172137, "language_loss": 0.81087363, "learning_rate": 3.945093882136382e-06, "loss": 0.83312815, "num_input_tokens_seen": 18172960, "step": 853, "time_per_iteration": 4.6278605461120605 }, { "auxiliary_loss_clip": 0.01217997, "auxiliary_loss_mlp": 0.0071497, "balance_loss_clip": 1.06918836, "balance_loss_mlp": 1.000072, "epoch": 0.10268742860578368, "flos": 23474877344640.0, "grad_norm": 1.9000910216011027, "language_loss": 0.84546202, "learning_rate": 3.944912462619329e-06, "loss": 0.86479169, "num_input_tokens_seen": 18191925, "step": 854, "time_per_iteration": 2.7553093433380127 }, { "auxiliary_loss_clip": 0.01221225, "auxiliary_loss_mlp": 0.01054205, "balance_loss_clip": 1.0677284, "balance_loss_mlp": 1.03743207, "epoch": 0.10280767149642277, "flos": 25520313323520.0, "grad_norm": 1.9850913416033285, "language_loss": 0.80820256, "learning_rate": 3.9447307480606025e-06, "loss": 0.83095682, "num_input_tokens_seen": 18212010, "step": 855, "time_per_iteration": 2.698802947998047 }, { "auxiliary_loss_clip": 0.01208012, "auxiliary_loss_mlp": 0.01049076, "balance_loss_clip": 1.06672668, "balance_loss_mlp": 1.03308964, "epoch": 0.10292791438706186, "flos": 17347691462400.0, "grad_norm": 2.015802940537912, "language_loss": 0.89906037, "learning_rate": 3.944548738487767e-06, "loss": 0.92163128, "num_input_tokens_seen": 18229525, "step": 856, "time_per_iteration": 2.634765148162842 }, { "auxiliary_loss_clip": 0.01257762, "auxiliary_loss_mlp": 0.01052483, "balance_loss_clip": 1.0754509, "balance_loss_mlp": 1.03700948, "epoch": 0.10304815727770096, "flos": 27052693390080.0, "grad_norm": 2.447192050179895, "language_loss": 0.90960288, "learning_rate": 3.944366433928434e-06, "loss": 0.93270528, "num_input_tokens_seen": 18249505, "step": 857, "time_per_iteration": 2.6488375663757324 }, { "auxiliary_loss_clip": 0.01210226, "auxiliary_loss_mlp": 0.01049524, "balance_loss_clip": 1.06465912, "balance_loss_mlp": 1.03518343, "epoch": 0.10316840016834004, "flos": 22782591544320.0, "grad_norm": 1.5134534517497553, "language_loss": 0.83668357, "learning_rate": 3.9441838344102594e-06, "loss": 0.85928106, "num_input_tokens_seen": 18269230, "step": 858, "time_per_iteration": 2.7230775356292725 }, { "auxiliary_loss_clip": 0.01223301, "auxiliary_loss_mlp": 0.01049982, "balance_loss_clip": 1.07006156, "balance_loss_mlp": 1.03560543, "epoch": 0.10328864305897914, "flos": 20704584908160.0, "grad_norm": 2.151808564403494, "language_loss": 0.66478872, "learning_rate": 3.944000939960943e-06, "loss": 0.68752158, "num_input_tokens_seen": 18287955, "step": 859, "time_per_iteration": 2.801060199737549 }, { "auxiliary_loss_clip": 0.01237092, "auxiliary_loss_mlp": 0.0104499, "balance_loss_clip": 1.06892729, "balance_loss_mlp": 1.03089976, "epoch": 0.10340888594961822, "flos": 28478814048000.0, "grad_norm": 1.557954963986581, "language_loss": 0.79872984, "learning_rate": 3.943817750608229e-06, "loss": 0.82155061, "num_input_tokens_seen": 18310505, "step": 860, "time_per_iteration": 2.689971923828125 }, { "auxiliary_loss_clip": 0.01235308, "auxiliary_loss_mlp": 0.01056417, "balance_loss_clip": 1.06957936, "balance_loss_mlp": 1.04101562, "epoch": 0.10352912884025732, "flos": 13370333460480.0, "grad_norm": 2.5152702726033063, "language_loss": 0.81625307, "learning_rate": 3.943634266379908e-06, "loss": 0.83917034, "num_input_tokens_seen": 18327400, "step": 861, "time_per_iteration": 2.650143623352051 }, { "auxiliary_loss_clip": 0.0123531, "auxiliary_loss_mlp": 0.01057373, "balance_loss_clip": 1.06868184, "balance_loss_mlp": 1.04294837, "epoch": 0.10364937173089642, "flos": 25558558329600.0, "grad_norm": 1.658261168503249, "language_loss": 0.84702766, "learning_rate": 3.943450487303815e-06, "loss": 0.86995447, "num_input_tokens_seen": 18347895, "step": 862, "time_per_iteration": 2.6433939933776855 }, { "auxiliary_loss_clip": 0.01232761, "auxiliary_loss_mlp": 0.0104948, "balance_loss_clip": 1.06989503, "balance_loss_mlp": 1.03376806, "epoch": 0.1037696146215355, "flos": 21215486004480.0, "grad_norm": 2.811582832055587, "language_loss": 0.85623074, "learning_rate": 3.943266413407827e-06, "loss": 0.87905318, "num_input_tokens_seen": 18367170, "step": 863, "time_per_iteration": 2.649777412414551 }, { "auxiliary_loss_clip": 0.01236378, "auxiliary_loss_mlp": 0.01045894, "balance_loss_clip": 1.07007706, "balance_loss_mlp": 1.03149307, "epoch": 0.1038898575121746, "flos": 25807382818560.0, "grad_norm": 1.8810488584382121, "language_loss": 0.8517819, "learning_rate": 3.94308204471987e-06, "loss": 0.87460458, "num_input_tokens_seen": 18386185, "step": 864, "time_per_iteration": 2.635000228881836 }, { "auxiliary_loss_clip": 0.01199375, "auxiliary_loss_mlp": 0.01052832, "balance_loss_clip": 1.06434846, "balance_loss_mlp": 1.03814507, "epoch": 0.10401010040281368, "flos": 19062425900160.0, "grad_norm": 2.326037594286324, "language_loss": 0.74727893, "learning_rate": 3.942897381267912e-06, "loss": 0.76980102, "num_input_tokens_seen": 18402550, "step": 865, "time_per_iteration": 2.708362579345703 }, { "auxiliary_loss_clip": 0.01235538, "auxiliary_loss_mlp": 0.01050178, "balance_loss_clip": 1.06942463, "balance_loss_mlp": 1.0343827, "epoch": 0.10413034329345278, "flos": 16355119962240.0, "grad_norm": 3.0201048890116713, "language_loss": 0.65935218, "learning_rate": 3.942712423079965e-06, "loss": 0.68220931, "num_input_tokens_seen": 18418940, "step": 866, "time_per_iteration": 2.5768582820892334 }, { "auxiliary_loss_clip": 0.01181007, "auxiliary_loss_mlp": 0.01046077, "balance_loss_clip": 1.05593491, "balance_loss_mlp": 1.03119969, "epoch": 0.10425058618409186, "flos": 17236511890560.0, "grad_norm": 2.1022482740004604, "language_loss": 0.90024155, "learning_rate": 3.942527170184088e-06, "loss": 0.92251241, "num_input_tokens_seen": 18435560, "step": 867, "time_per_iteration": 2.7236037254333496 }, { "auxiliary_loss_clip": 0.01252969, "auxiliary_loss_mlp": 0.01045722, "balance_loss_clip": 1.0729214, "balance_loss_mlp": 1.02986717, "epoch": 0.10437082907473096, "flos": 17967365919360.0, "grad_norm": 2.496329238141998, "language_loss": 0.77213329, "learning_rate": 3.942341622608385e-06, "loss": 0.79512018, "num_input_tokens_seen": 18452590, "step": 868, "time_per_iteration": 2.5672717094421387 }, { "auxiliary_loss_clip": 0.01218973, "auxiliary_loss_mlp": 0.01051369, "balance_loss_clip": 1.07178414, "balance_loss_mlp": 1.03706443, "epoch": 0.10449107196537005, "flos": 36283315374720.0, "grad_norm": 1.5077931141117713, "language_loss": 0.77906233, "learning_rate": 3.942155780381001e-06, "loss": 0.8017658, "num_input_tokens_seen": 18476325, "step": 869, "time_per_iteration": 2.810370445251465 }, { "auxiliary_loss_clip": 0.01218785, "auxiliary_loss_mlp": 0.0104157, "balance_loss_clip": 1.06779778, "balance_loss_mlp": 1.02687132, "epoch": 0.10461131485600914, "flos": 23802095266560.0, "grad_norm": 1.817243852035361, "language_loss": 0.75650656, "learning_rate": 3.94196964353013e-06, "loss": 0.77911007, "num_input_tokens_seen": 18495775, "step": 870, "time_per_iteration": 2.6333000659942627 }, { "auxiliary_loss_clip": 0.01211243, "auxiliary_loss_mlp": 0.0071463, "balance_loss_clip": 1.06237662, "balance_loss_mlp": 1.00003135, "epoch": 0.10473155774664823, "flos": 18405476104320.0, "grad_norm": 2.5041598084057637, "language_loss": 0.81149995, "learning_rate": 3.941783212084008e-06, "loss": 0.83075863, "num_input_tokens_seen": 18513530, "step": 871, "time_per_iteration": 2.676970958709717 }, { "auxiliary_loss_clip": 0.01202458, "auxiliary_loss_mlp": 0.01058181, "balance_loss_clip": 1.06433392, "balance_loss_mlp": 1.04375625, "epoch": 0.10485180063728732, "flos": 25592637358080.0, "grad_norm": 2.8729359722367867, "language_loss": 0.79335296, "learning_rate": 3.941596486070916e-06, "loss": 0.81595939, "num_input_tokens_seen": 18531575, "step": 872, "time_per_iteration": 2.6669223308563232 }, { "auxiliary_loss_clip": 0.01179451, "auxiliary_loss_mlp": 0.01051472, "balance_loss_clip": 1.06772518, "balance_loss_mlp": 1.03564143, "epoch": 0.10497204352792641, "flos": 27088747666560.0, "grad_norm": 2.1102734695518697, "language_loss": 0.58594131, "learning_rate": 3.941409465519182e-06, "loss": 0.6082505, "num_input_tokens_seen": 18552100, "step": 873, "time_per_iteration": 2.862177610397339 }, { "auxiliary_loss_clip": 0.0122191, "auxiliary_loss_mlp": 0.01037031, "balance_loss_clip": 1.06436992, "balance_loss_mlp": 1.02196288, "epoch": 0.10509228641856551, "flos": 32858479353600.0, "grad_norm": 1.6154641464358275, "language_loss": 0.85201329, "learning_rate": 3.941222150457176e-06, "loss": 0.87460268, "num_input_tokens_seen": 18575355, "step": 874, "time_per_iteration": 2.7903549671173096 }, { "auxiliary_loss_clip": 0.01238306, "auxiliary_loss_mlp": 0.01046593, "balance_loss_clip": 1.06869292, "balance_loss_mlp": 1.03198934, "epoch": 0.10521252930920459, "flos": 14319165173760.0, "grad_norm": 2.466810473504566, "language_loss": 0.71222264, "learning_rate": 3.941034540913311e-06, "loss": 0.73507166, "num_input_tokens_seen": 18592885, "step": 875, "time_per_iteration": 2.5762369632720947 }, { "auxiliary_loss_clip": 0.01233063, "auxiliary_loss_mlp": 0.00714808, "balance_loss_clip": 1.06800067, "balance_loss_mlp": 1.00002265, "epoch": 0.10533277219984369, "flos": 21687028773120.0, "grad_norm": 1.6429795325055816, "language_loss": 0.8238917, "learning_rate": 3.940846636916051e-06, "loss": 0.84337044, "num_input_tokens_seen": 18612920, "step": 876, "time_per_iteration": 4.368937253952026 }, { "auxiliary_loss_clip": 0.01212775, "auxiliary_loss_mlp": 0.01055728, "balance_loss_clip": 1.06977248, "balance_loss_mlp": 1.04152989, "epoch": 0.10545301509048277, "flos": 22269787027200.0, "grad_norm": 2.2159517059037985, "language_loss": 0.86622912, "learning_rate": 3.940658438493899e-06, "loss": 0.88891411, "num_input_tokens_seen": 18630765, "step": 877, "time_per_iteration": 2.796635150909424 }, { "auxiliary_loss_clip": 0.01251926, "auxiliary_loss_mlp": 0.01044427, "balance_loss_clip": 1.06885171, "balance_loss_mlp": 1.02904892, "epoch": 0.10557325798112187, "flos": 22199725549440.0, "grad_norm": 3.4846912445622404, "language_loss": 0.76197016, "learning_rate": 3.940469945675405e-06, "loss": 0.78493369, "num_input_tokens_seen": 18649150, "step": 878, "time_per_iteration": 2.5673203468322754 }, { "auxiliary_loss_clip": 0.01156891, "auxiliary_loss_mlp": 0.01054962, "balance_loss_clip": 1.05852151, "balance_loss_mlp": 1.04052603, "epoch": 0.10569350087176095, "flos": 25775889569280.0, "grad_norm": 1.8820784980801415, "language_loss": 0.91368389, "learning_rate": 3.940281158489163e-06, "loss": 0.93580246, "num_input_tokens_seen": 18668380, "step": 879, "time_per_iteration": 4.6392433643341064 }, { "auxiliary_loss_clip": 0.01157766, "auxiliary_loss_mlp": 0.01048494, "balance_loss_clip": 1.05550241, "balance_loss_mlp": 1.03388524, "epoch": 0.10581374376240005, "flos": 17311385790720.0, "grad_norm": 1.8173617173717294, "language_loss": 0.82806277, "learning_rate": 3.940092076963812e-06, "loss": 0.85012543, "num_input_tokens_seen": 18685875, "step": 880, "time_per_iteration": 2.771638870239258 }, { "auxiliary_loss_clip": 0.0121539, "auxiliary_loss_mlp": 0.01054224, "balance_loss_clip": 1.06668162, "balance_loss_mlp": 1.0388813, "epoch": 0.10593398665303914, "flos": 34349454017280.0, "grad_norm": 2.0421815103196104, "language_loss": 0.79089761, "learning_rate": 3.9399027011280355e-06, "loss": 0.81359375, "num_input_tokens_seen": 18707970, "step": 881, "time_per_iteration": 2.7889726161956787 }, { "auxiliary_loss_clip": 0.01217514, "auxiliary_loss_mlp": 0.01046981, "balance_loss_clip": 1.07079816, "balance_loss_mlp": 1.03047037, "epoch": 0.10605422954367823, "flos": 23257977068160.0, "grad_norm": 1.8942202682320874, "language_loss": 0.76814032, "learning_rate": 3.939713031010561e-06, "loss": 0.79078525, "num_input_tokens_seen": 18726335, "step": 882, "time_per_iteration": 2.659838914871216 }, { "auxiliary_loss_clip": 0.01199072, "auxiliary_loss_mlp": 0.01041853, "balance_loss_clip": 1.06881213, "balance_loss_mlp": 1.02784586, "epoch": 0.10617447243431732, "flos": 22820118278400.0, "grad_norm": 2.111813390617528, "language_loss": 0.77398765, "learning_rate": 3.939523066640163e-06, "loss": 0.79639691, "num_input_tokens_seen": 18745230, "step": 883, "time_per_iteration": 2.69679856300354 }, { "auxiliary_loss_clip": 0.01234664, "auxiliary_loss_mlp": 0.01040712, "balance_loss_clip": 1.07002485, "balance_loss_mlp": 1.02613235, "epoch": 0.10629471532495641, "flos": 24386577373440.0, "grad_norm": 2.0787775041933236, "language_loss": 0.80926538, "learning_rate": 3.939332808045657e-06, "loss": 0.83201909, "num_input_tokens_seen": 18764880, "step": 884, "time_per_iteration": 2.6740200519561768 }, { "auxiliary_loss_clip": 0.01197894, "auxiliary_loss_mlp": 0.01049676, "balance_loss_clip": 1.06401932, "balance_loss_mlp": 1.03479838, "epoch": 0.1064149582155955, "flos": 21105491581440.0, "grad_norm": 1.5943682718579058, "language_loss": 0.84653449, "learning_rate": 3.939142255255906e-06, "loss": 0.86901021, "num_input_tokens_seen": 18785765, "step": 885, "time_per_iteration": 2.70331072807312 }, { "auxiliary_loss_clip": 0.01235653, "auxiliary_loss_mlp": 0.01054319, "balance_loss_clip": 1.07166266, "balance_loss_mlp": 1.03941774, "epoch": 0.1065352011062346, "flos": 20702035042560.0, "grad_norm": 2.149555013339614, "language_loss": 0.86884928, "learning_rate": 3.938951408299817e-06, "loss": 0.89174902, "num_input_tokens_seen": 18804605, "step": 886, "time_per_iteration": 2.5737502574920654 }, { "auxiliary_loss_clip": 0.01085487, "auxiliary_loss_mlp": 0.01013594, "balance_loss_clip": 1.04447365, "balance_loss_mlp": 1.00720453, "epoch": 0.10665544399687368, "flos": 62659632689280.0, "grad_norm": 0.8003036445575556, "language_loss": 0.5437808, "learning_rate": 3.938760267206342e-06, "loss": 0.56477165, "num_input_tokens_seen": 18866425, "step": 887, "time_per_iteration": 3.1919023990631104 }, { "auxiliary_loss_clip": 0.01247992, "auxiliary_loss_mlp": 0.010432, "balance_loss_clip": 1.07189369, "balance_loss_mlp": 1.02852559, "epoch": 0.10677568688751278, "flos": 26140382830080.0, "grad_norm": 2.2569466086828056, "language_loss": 0.78425956, "learning_rate": 3.938568832004475e-06, "loss": 0.80717152, "num_input_tokens_seen": 18885130, "step": 888, "time_per_iteration": 2.6687726974487305 }, { "auxiliary_loss_clip": 0.01201263, "auxiliary_loss_mlp": 0.01045984, "balance_loss_clip": 1.06275082, "balance_loss_mlp": 1.03102338, "epoch": 0.10689592977815186, "flos": 12786533712000.0, "grad_norm": 2.0772164518098566, "language_loss": 0.74853861, "learning_rate": 3.938377102723257e-06, "loss": 0.77101099, "num_input_tokens_seen": 18902265, "step": 889, "time_per_iteration": 2.6814048290252686 }, { "auxiliary_loss_clip": 0.01164587, "auxiliary_loss_mlp": 0.01053305, "balance_loss_clip": 1.06019998, "balance_loss_mlp": 1.03772473, "epoch": 0.10701617266879096, "flos": 22126683242880.0, "grad_norm": 2.2178476865076906, "language_loss": 0.83167636, "learning_rate": 3.938185079391774e-06, "loss": 0.85385537, "num_input_tokens_seen": 18919310, "step": 890, "time_per_iteration": 2.77333664894104 }, { "auxiliary_loss_clip": 0.01249593, "auxiliary_loss_mlp": 0.01050636, "balance_loss_clip": 1.07091498, "balance_loss_mlp": 1.0358125, "epoch": 0.10713641555943004, "flos": 19745625559680.0, "grad_norm": 2.746937110429947, "language_loss": 1.05950677, "learning_rate": 3.937992762039157e-06, "loss": 1.08250904, "num_input_tokens_seen": 18932635, "step": 891, "time_per_iteration": 2.6104326248168945 }, { "auxiliary_loss_clip": 0.01231226, "auxiliary_loss_mlp": 0.01046931, "balance_loss_clip": 1.07019687, "balance_loss_mlp": 1.03217292, "epoch": 0.10725665845006914, "flos": 23952992302080.0, "grad_norm": 1.69642281664245, "language_loss": 0.80454385, "learning_rate": 3.937800150694577e-06, "loss": 0.82732546, "num_input_tokens_seen": 18953810, "step": 892, "time_per_iteration": 2.630486011505127 }, { "auxiliary_loss_clip": 0.01178611, "auxiliary_loss_mlp": 0.0104623, "balance_loss_clip": 1.06294894, "balance_loss_mlp": 1.03154337, "epoch": 0.10737690134070824, "flos": 18551704371840.0, "grad_norm": 2.4887545613995914, "language_loss": 0.76022148, "learning_rate": 3.937607245387255e-06, "loss": 0.78246993, "num_input_tokens_seen": 18973175, "step": 893, "time_per_iteration": 2.7395951747894287 }, { "auxiliary_loss_clip": 0.01216338, "auxiliary_loss_mlp": 0.0104994, "balance_loss_clip": 1.06325364, "balance_loss_mlp": 1.03623128, "epoch": 0.10749714423134732, "flos": 22707609903360.0, "grad_norm": 1.947826730620342, "language_loss": 0.72325134, "learning_rate": 3.937414046146455e-06, "loss": 0.7459141, "num_input_tokens_seen": 18991130, "step": 894, "time_per_iteration": 2.677072763442993 }, { "auxiliary_loss_clip": 0.0125369, "auxiliary_loss_mlp": 0.01045815, "balance_loss_clip": 1.07563043, "balance_loss_mlp": 1.03091431, "epoch": 0.10761738712198642, "flos": 21106066199040.0, "grad_norm": 2.588864417489263, "language_loss": 0.74991316, "learning_rate": 3.9372205530014845e-06, "loss": 0.77290821, "num_input_tokens_seen": 19009610, "step": 895, "time_per_iteration": 2.653181791305542 }, { "auxiliary_loss_clip": 0.01249848, "auxiliary_loss_mlp": 0.01051378, "balance_loss_clip": 1.06928134, "balance_loss_mlp": 1.03695989, "epoch": 0.1077376300126255, "flos": 23766723348480.0, "grad_norm": 2.0852428387188966, "language_loss": 0.71591008, "learning_rate": 3.937026765981696e-06, "loss": 0.73892236, "num_input_tokens_seen": 19029680, "step": 896, "time_per_iteration": 2.5989725589752197 }, { "auxiliary_loss_clip": 0.01202274, "auxiliary_loss_mlp": 0.01049123, "balance_loss_clip": 1.06934643, "balance_loss_mlp": 1.03446031, "epoch": 0.1078578729032646, "flos": 20919581763840.0, "grad_norm": 1.906200728823018, "language_loss": 0.79556108, "learning_rate": 3.936832685116488e-06, "loss": 0.81807506, "num_input_tokens_seen": 19047775, "step": 897, "time_per_iteration": 2.7346396446228027 }, { "auxiliary_loss_clip": 0.0124782, "auxiliary_loss_mlp": 0.01052912, "balance_loss_clip": 1.06998217, "balance_loss_mlp": 1.03884578, "epoch": 0.10797811579390369, "flos": 14829886702080.0, "grad_norm": 2.1506712871601783, "language_loss": 0.90381241, "learning_rate": 3.936638310435301e-06, "loss": 0.9268198, "num_input_tokens_seen": 19065640, "step": 898, "time_per_iteration": 2.5491585731506348 }, { "auxiliary_loss_clip": 0.01238443, "auxiliary_loss_mlp": 0.01051986, "balance_loss_clip": 1.07173133, "balance_loss_mlp": 1.03676319, "epoch": 0.10809835868454278, "flos": 19536985411200.0, "grad_norm": 1.9100737084952886, "language_loss": 0.81621844, "learning_rate": 3.936443641967623e-06, "loss": 0.83912271, "num_input_tokens_seen": 19084470, "step": 899, "time_per_iteration": 2.6444225311279297 }, { "auxiliary_loss_clip": 0.01220135, "auxiliary_loss_mlp": 0.0104702, "balance_loss_clip": 1.06839967, "balance_loss_mlp": 1.0329175, "epoch": 0.10821860157518187, "flos": 18442320480000.0, "grad_norm": 2.067137608086523, "language_loss": 0.83138776, "learning_rate": 3.936248679742983e-06, "loss": 0.85405928, "num_input_tokens_seen": 19102965, "step": 900, "time_per_iteration": 2.663072109222412 }, { "auxiliary_loss_clip": 0.01106116, "auxiliary_loss_mlp": 0.01011096, "balance_loss_clip": 1.04706979, "balance_loss_mlp": 1.0042299, "epoch": 0.10833884446582095, "flos": 49359468447360.0, "grad_norm": 1.0493108961248192, "language_loss": 0.70196539, "learning_rate": 3.936053423790959e-06, "loss": 0.7231375, "num_input_tokens_seen": 19151285, "step": 901, "time_per_iteration": 3.045994758605957 }, { "auxiliary_loss_clip": 0.01251899, "auxiliary_loss_mlp": 0.01050226, "balance_loss_clip": 1.07184291, "balance_loss_mlp": 1.03619528, "epoch": 0.10845908735646005, "flos": 20411912891520.0, "grad_norm": 1.997400494804864, "language_loss": 0.77437377, "learning_rate": 3.935857874141168e-06, "loss": 0.79739505, "num_input_tokens_seen": 19170120, "step": 902, "time_per_iteration": 3.557835578918457 }, { "auxiliary_loss_clip": 0.01211607, "auxiliary_loss_mlp": 0.01042666, "balance_loss_clip": 1.06782365, "balance_loss_mlp": 1.02727604, "epoch": 0.10857933024709913, "flos": 14027750133120.0, "grad_norm": 3.412558621249361, "language_loss": 0.83397728, "learning_rate": 3.935662030823279e-06, "loss": 0.85652, "num_input_tokens_seen": 19186305, "step": 903, "time_per_iteration": 2.6170198917388916 }, { "auxiliary_loss_clip": 0.01235801, "auxiliary_loss_mlp": 0.01046736, "balance_loss_clip": 1.06850302, "balance_loss_mlp": 1.03176332, "epoch": 0.10869957313773823, "flos": 13369004657280.0, "grad_norm": 2.1616132333118205, "language_loss": 0.72345221, "learning_rate": 3.935465893866998e-06, "loss": 0.74627757, "num_input_tokens_seen": 19204530, "step": 904, "time_per_iteration": 2.6304898262023926 }, { "auxiliary_loss_clip": 0.01215938, "auxiliary_loss_mlp": 0.01051033, "balance_loss_clip": 1.06910372, "balance_loss_mlp": 1.03522551, "epoch": 0.10881981602837733, "flos": 25807095509760.0, "grad_norm": 2.0193091623495754, "language_loss": 0.80087495, "learning_rate": 3.935269463302079e-06, "loss": 0.82354462, "num_input_tokens_seen": 19222735, "step": 905, "time_per_iteration": 3.630563497543335 }, { "auxiliary_loss_clip": 0.01239486, "auxiliary_loss_mlp": 0.01052558, "balance_loss_clip": 1.07039523, "balance_loss_mlp": 1.03729916, "epoch": 0.10894005891901641, "flos": 20777555387520.0, "grad_norm": 2.0396082071574244, "language_loss": 0.76460981, "learning_rate": 3.935072739158322e-06, "loss": 0.78753024, "num_input_tokens_seen": 19242445, "step": 906, "time_per_iteration": 3.608919143676758 }, { "auxiliary_loss_clip": 0.01215644, "auxiliary_loss_mlp": 0.01047975, "balance_loss_clip": 1.06754208, "balance_loss_mlp": 1.03314519, "epoch": 0.10906030180965551, "flos": 26649883296000.0, "grad_norm": 1.6225020969523345, "language_loss": 0.79758322, "learning_rate": 3.934875721465569e-06, "loss": 0.8202194, "num_input_tokens_seen": 19262865, "step": 907, "time_per_iteration": 2.7417638301849365 }, { "auxiliary_loss_clip": 0.01214642, "auxiliary_loss_mlp": 0.010397, "balance_loss_clip": 1.06430507, "balance_loss_mlp": 1.02346325, "epoch": 0.10918054470029459, "flos": 36534402420480.0, "grad_norm": 3.730953252879778, "language_loss": 0.71811724, "learning_rate": 3.9346784102537076e-06, "loss": 0.74066061, "num_input_tokens_seen": 19285000, "step": 908, "time_per_iteration": 2.7723915576934814 }, { "auxiliary_loss_clip": 0.0124862, "auxiliary_loss_mlp": 0.01043766, "balance_loss_clip": 1.06859756, "balance_loss_mlp": 1.02848387, "epoch": 0.10930078759093369, "flos": 21762549118080.0, "grad_norm": 1.8209587169937962, "language_loss": 0.78364623, "learning_rate": 3.934480805552669e-06, "loss": 0.80657005, "num_input_tokens_seen": 19306010, "step": 909, "time_per_iteration": 2.654754877090454 }, { "auxiliary_loss_clip": 0.01251501, "auxiliary_loss_mlp": 0.00714953, "balance_loss_clip": 1.07122958, "balance_loss_mlp": 1.00000954, "epoch": 0.10942103048157277, "flos": 22601781457920.0, "grad_norm": 1.9450233769466154, "language_loss": 0.87861943, "learning_rate": 3.93428290739243e-06, "loss": 0.89828402, "num_input_tokens_seen": 19325380, "step": 910, "time_per_iteration": 2.6386780738830566 }, { "auxiliary_loss_clip": 0.01220056, "auxiliary_loss_mlp": 0.01057486, "balance_loss_clip": 1.06976676, "balance_loss_mlp": 1.04285264, "epoch": 0.10954127337221187, "flos": 15045781397760.0, "grad_norm": 2.46469515916895, "language_loss": 0.79855412, "learning_rate": 3.9340847158030125e-06, "loss": 0.82132947, "num_input_tokens_seen": 19338960, "step": 911, "time_per_iteration": 2.6604156494140625 }, { "auxiliary_loss_clip": 0.01236217, "auxiliary_loss_mlp": 0.01051166, "balance_loss_clip": 1.06995106, "balance_loss_mlp": 1.036551, "epoch": 0.10966151626285096, "flos": 21650974496640.0, "grad_norm": 3.6983494756514625, "language_loss": 0.75281811, "learning_rate": 3.9338862308144814e-06, "loss": 0.77569199, "num_input_tokens_seen": 19357780, "step": 912, "time_per_iteration": 2.6365230083465576 }, { "auxiliary_loss_clip": 0.01252521, "auxiliary_loss_mlp": 0.01041899, "balance_loss_clip": 1.07278812, "balance_loss_mlp": 1.02737975, "epoch": 0.10978175915349005, "flos": 20121359777280.0, "grad_norm": 1.6567977971621166, "language_loss": 0.84758973, "learning_rate": 3.933687452456946e-06, "loss": 0.87053388, "num_input_tokens_seen": 19377680, "step": 913, "time_per_iteration": 2.6766622066497803 }, { "auxiliary_loss_clip": 0.01195662, "auxiliary_loss_mlp": 0.0105575, "balance_loss_clip": 1.06269407, "balance_loss_mlp": 1.03929949, "epoch": 0.10990200204412914, "flos": 20412667077120.0, "grad_norm": 2.216833526413356, "language_loss": 0.85948968, "learning_rate": 3.933488380760562e-06, "loss": 0.88200378, "num_input_tokens_seen": 19397040, "step": 914, "time_per_iteration": 2.663662910461426 }, { "auxiliary_loss_clip": 0.01249702, "auxiliary_loss_mlp": 0.00715027, "balance_loss_clip": 1.06950998, "balance_loss_mlp": 1.00000811, "epoch": 0.11002224493476823, "flos": 17530117660800.0, "grad_norm": 2.550344654619146, "language_loss": 0.87229741, "learning_rate": 3.9332890157555286e-06, "loss": 0.89194465, "num_input_tokens_seen": 19413975, "step": 915, "time_per_iteration": 2.675976276397705 }, { "auxiliary_loss_clip": 0.01222759, "auxiliary_loss_mlp": 0.01050363, "balance_loss_clip": 1.06902075, "balance_loss_mlp": 1.03448439, "epoch": 0.11014248782540732, "flos": 12203093099520.0, "grad_norm": 7.975512201085821, "language_loss": 0.76465058, "learning_rate": 3.933089357472088e-06, "loss": 0.78738183, "num_input_tokens_seen": 19432005, "step": 916, "time_per_iteration": 2.718317985534668 }, { "auxiliary_loss_clip": 0.01251099, "auxiliary_loss_mlp": 0.01054648, "balance_loss_clip": 1.07187009, "balance_loss_mlp": 1.03957963, "epoch": 0.11026273071604642, "flos": 22382977760640.0, "grad_norm": 1.8974773403243888, "language_loss": 0.85829258, "learning_rate": 3.932889405940529e-06, "loss": 0.88135004, "num_input_tokens_seen": 19450100, "step": 917, "time_per_iteration": 2.6255900859832764 }, { "auxiliary_loss_clip": 0.012176, "auxiliary_loss_mlp": 0.01064225, "balance_loss_clip": 1.06895292, "balance_loss_mlp": 1.04897809, "epoch": 0.1103829736066855, "flos": 19829046896640.0, "grad_norm": 2.1661578390931067, "language_loss": 0.79826897, "learning_rate": 3.932689161191184e-06, "loss": 0.82108724, "num_input_tokens_seen": 19467805, "step": 918, "time_per_iteration": 2.687700033187866 }, { "auxiliary_loss_clip": 0.01232272, "auxiliary_loss_mlp": 0.01046903, "balance_loss_clip": 1.06638408, "balance_loss_mlp": 1.03268147, "epoch": 0.1105032164973246, "flos": 22669616292480.0, "grad_norm": 3.5622103559101896, "language_loss": 0.88062239, "learning_rate": 3.93248862325443e-06, "loss": 0.90341413, "num_input_tokens_seen": 19486710, "step": 919, "time_per_iteration": 2.6218721866607666 }, { "auxiliary_loss_clip": 0.01139701, "auxiliary_loss_mlp": 0.01011072, "balance_loss_clip": 1.04956448, "balance_loss_mlp": 1.00415826, "epoch": 0.11062345938796368, "flos": 66483507876480.0, "grad_norm": 0.9742254062077245, "language_loss": 0.64422899, "learning_rate": 3.932287792160688e-06, "loss": 0.66573673, "num_input_tokens_seen": 19545170, "step": 920, "time_per_iteration": 3.0902910232543945 }, { "auxiliary_loss_clip": 0.01232775, "auxiliary_loss_mlp": 0.01052895, "balance_loss_clip": 1.0660007, "balance_loss_mlp": 1.03674269, "epoch": 0.11074370227860278, "flos": 21907771804800.0, "grad_norm": 2.1401063573593295, "language_loss": 0.80463898, "learning_rate": 3.932086667940424e-06, "loss": 0.82749569, "num_input_tokens_seen": 19561875, "step": 921, "time_per_iteration": 2.5925378799438477 }, { "auxiliary_loss_clip": 0.01231337, "auxiliary_loss_mlp": 0.00714475, "balance_loss_clip": 1.06932831, "balance_loss_mlp": 1.00002789, "epoch": 0.11086394516924186, "flos": 28658115763200.0, "grad_norm": 2.1149053826734634, "language_loss": 0.81663716, "learning_rate": 3.93188525062415e-06, "loss": 0.83609533, "num_input_tokens_seen": 19582340, "step": 922, "time_per_iteration": 2.746716022491455 }, { "auxiliary_loss_clip": 0.01231288, "auxiliary_loss_mlp": 0.01048537, "balance_loss_clip": 1.06780767, "balance_loss_mlp": 1.03312278, "epoch": 0.11098418805988096, "flos": 24535247765760.0, "grad_norm": 1.8293899685520052, "language_loss": 0.85958362, "learning_rate": 3.931683540242418e-06, "loss": 0.88238186, "num_input_tokens_seen": 19603405, "step": 923, "time_per_iteration": 2.642232894897461 }, { "auxiliary_loss_clip": 0.01225382, "auxiliary_loss_mlp": 0.0104697, "balance_loss_clip": 1.0657934, "balance_loss_mlp": 1.03191447, "epoch": 0.11110443095052006, "flos": 22960384888320.0, "grad_norm": 2.8629485023824683, "language_loss": 0.90898764, "learning_rate": 3.9314815368258295e-06, "loss": 0.93171108, "num_input_tokens_seen": 19619885, "step": 924, "time_per_iteration": 2.6481151580810547 }, { "auxiliary_loss_clip": 0.01238556, "auxiliary_loss_mlp": 0.01053831, "balance_loss_clip": 1.07249665, "balance_loss_mlp": 1.03929925, "epoch": 0.11122467384115914, "flos": 18950025265920.0, "grad_norm": 1.6686553893814362, "language_loss": 0.79166555, "learning_rate": 3.9312792404050275e-06, "loss": 0.8145895, "num_input_tokens_seen": 19637940, "step": 925, "time_per_iteration": 2.599973678588867 }, { "auxiliary_loss_clip": 0.01248999, "auxiliary_loss_mlp": 0.01046423, "balance_loss_clip": 1.07174957, "balance_loss_mlp": 1.03307176, "epoch": 0.11134491673179824, "flos": 25082957324160.0, "grad_norm": 1.7726609937406788, "language_loss": 0.77267808, "learning_rate": 3.9310766510107e-06, "loss": 0.79563224, "num_input_tokens_seen": 19657115, "step": 926, "time_per_iteration": 2.591118812561035 }, { "auxiliary_loss_clip": 0.01198477, "auxiliary_loss_mlp": 0.01041937, "balance_loss_clip": 1.06072497, "balance_loss_mlp": 1.02624273, "epoch": 0.11146515962243732, "flos": 24499121662080.0, "grad_norm": 1.965330171199504, "language_loss": 0.92011881, "learning_rate": 3.9308737686735806e-06, "loss": 0.94252288, "num_input_tokens_seen": 19677075, "step": 927, "time_per_iteration": 2.729418992996216 }, { "auxiliary_loss_clip": 0.0125113, "auxiliary_loss_mlp": 0.01058266, "balance_loss_clip": 1.071455, "balance_loss_mlp": 1.04388952, "epoch": 0.11158540251307641, "flos": 22343763087360.0, "grad_norm": 2.3276436995053724, "language_loss": 0.83030242, "learning_rate": 3.9306705934244455e-06, "loss": 0.85339642, "num_input_tokens_seen": 19697155, "step": 928, "time_per_iteration": 4.341211318969727 }, { "auxiliary_loss_clip": 0.01200376, "auxiliary_loss_mlp": 0.01050386, "balance_loss_clip": 1.06387365, "balance_loss_mlp": 1.03564036, "epoch": 0.11170564540371551, "flos": 19902304684800.0, "grad_norm": 1.6401378503794946, "language_loss": 0.88135636, "learning_rate": 3.930467125294116e-06, "loss": 0.90386403, "num_input_tokens_seen": 19716705, "step": 929, "time_per_iteration": 2.695507049560547 }, { "auxiliary_loss_clip": 0.010546, "auxiliary_loss_mlp": 0.01008619, "balance_loss_clip": 1.0251267, "balance_loss_mlp": 1.00170445, "epoch": 0.1118258882943546, "flos": 64586239499520.0, "grad_norm": 0.9182738314761408, "language_loss": 0.60435593, "learning_rate": 3.930263364313458e-06, "loss": 0.6249882, "num_input_tokens_seen": 19767275, "step": 930, "time_per_iteration": 3.115072011947632 }, { "auxiliary_loss_clip": 0.01199123, "auxiliary_loss_mlp": 0.01056232, "balance_loss_clip": 1.06538486, "balance_loss_mlp": 1.04104483, "epoch": 0.11194613118499369, "flos": 17201965985280.0, "grad_norm": 1.958760410273881, "language_loss": 0.83196372, "learning_rate": 3.930059310513384e-06, "loss": 0.85451734, "num_input_tokens_seen": 19786315, "step": 931, "time_per_iteration": 3.614792585372925 }, { "auxiliary_loss_clip": 0.01184918, "auxiliary_loss_mlp": 0.00715529, "balance_loss_clip": 1.06081903, "balance_loss_mlp": 1.00002503, "epoch": 0.11206637407563277, "flos": 31863465728640.0, "grad_norm": 1.8045392876344426, "language_loss": 0.84165633, "learning_rate": 3.929854963924846e-06, "loss": 0.86066073, "num_input_tokens_seen": 19806580, "step": 932, "time_per_iteration": 3.6633477210998535 }, { "auxiliary_loss_clip": 0.01192788, "auxiliary_loss_mlp": 0.01049204, "balance_loss_clip": 1.06100154, "balance_loss_mlp": 1.03507805, "epoch": 0.11218661696627187, "flos": 21945621761280.0, "grad_norm": 1.8287836290069766, "language_loss": 0.77368814, "learning_rate": 3.929650324578845e-06, "loss": 0.79610813, "num_input_tokens_seen": 19826045, "step": 933, "time_per_iteration": 2.693406820297241 }, { "auxiliary_loss_clip": 0.0121664, "auxiliary_loss_mlp": 0.01046685, "balance_loss_clip": 1.06648421, "balance_loss_mlp": 1.03047299, "epoch": 0.11230685985691095, "flos": 25878198481920.0, "grad_norm": 2.461241898677026, "language_loss": 0.81891924, "learning_rate": 3.929445392506423e-06, "loss": 0.8415525, "num_input_tokens_seen": 19843985, "step": 934, "time_per_iteration": 2.6980912685394287 }, { "auxiliary_loss_clip": 0.01230664, "auxiliary_loss_mlp": 0.01042904, "balance_loss_clip": 1.07010579, "balance_loss_mlp": 1.02865887, "epoch": 0.11242710274755005, "flos": 22231506107520.0, "grad_norm": 1.8969418486456877, "language_loss": 0.76046926, "learning_rate": 3.92924016773867e-06, "loss": 0.78320491, "num_input_tokens_seen": 19860480, "step": 935, "time_per_iteration": 2.6630172729492188 }, { "auxiliary_loss_clip": 0.01213684, "auxiliary_loss_mlp": 0.00714884, "balance_loss_clip": 1.06374454, "balance_loss_mlp": 1.00003123, "epoch": 0.11254734563818915, "flos": 17712184723200.0, "grad_norm": 2.23908021933469, "language_loss": 0.73557639, "learning_rate": 3.9290346503067175e-06, "loss": 0.75486213, "num_input_tokens_seen": 19877145, "step": 936, "time_per_iteration": 2.656214952468872 }, { "auxiliary_loss_clip": 0.01232266, "auxiliary_loss_mlp": 0.01043402, "balance_loss_clip": 1.06565297, "balance_loss_mlp": 1.02853703, "epoch": 0.11266758852882823, "flos": 54930397334400.0, "grad_norm": 1.636887580827675, "language_loss": 0.79003555, "learning_rate": 3.9288288402417415e-06, "loss": 0.81279218, "num_input_tokens_seen": 19903405, "step": 937, "time_per_iteration": 2.9350743293762207 }, { "auxiliary_loss_clip": 0.01233085, "auxiliary_loss_mlp": 0.01055012, "balance_loss_clip": 1.06989312, "balance_loss_mlp": 1.03910935, "epoch": 0.11278783141946733, "flos": 18878132194560.0, "grad_norm": 3.9180769607795614, "language_loss": 0.70552933, "learning_rate": 3.928622737574964e-06, "loss": 0.72841024, "num_input_tokens_seen": 19918740, "step": 938, "time_per_iteration": 2.604537010192871 }, { "auxiliary_loss_clip": 0.01211773, "auxiliary_loss_mlp": 0.0104669, "balance_loss_clip": 1.06348085, "balance_loss_mlp": 1.03162169, "epoch": 0.11290807431010641, "flos": 26469252777600.0, "grad_norm": 1.8740964031841545, "language_loss": 0.90911126, "learning_rate": 3.928416342337652e-06, "loss": 0.93169594, "num_input_tokens_seen": 19938475, "step": 939, "time_per_iteration": 2.716322422027588 }, { "auxiliary_loss_clip": 0.01215092, "auxiliary_loss_mlp": 0.01053704, "balance_loss_clip": 1.06624413, "balance_loss_mlp": 1.03908873, "epoch": 0.1130283172007455, "flos": 22710590732160.0, "grad_norm": 7.706570801277255, "language_loss": 0.82537645, "learning_rate": 3.928209654561113e-06, "loss": 0.84806436, "num_input_tokens_seen": 19959310, "step": 940, "time_per_iteration": 2.8530402183532715 }, { "auxiliary_loss_clip": 0.01209461, "auxiliary_loss_mlp": 0.01054197, "balance_loss_clip": 1.06705403, "balance_loss_mlp": 1.03966498, "epoch": 0.1131485600913846, "flos": 23219911630080.0, "grad_norm": 1.910706815519834, "language_loss": 0.81313318, "learning_rate": 3.928002674276703e-06, "loss": 0.83576977, "num_input_tokens_seen": 19978700, "step": 941, "time_per_iteration": 2.7120187282562256 }, { "auxiliary_loss_clip": 0.01156059, "auxiliary_loss_mlp": 0.01047409, "balance_loss_clip": 1.05329633, "balance_loss_mlp": 1.03222179, "epoch": 0.11326880298202369, "flos": 14064271286400.0, "grad_norm": 2.3486187192726824, "language_loss": 0.75331569, "learning_rate": 3.92779540151582e-06, "loss": 0.77535033, "num_input_tokens_seen": 19995785, "step": 942, "time_per_iteration": 2.676780939102173 }, { "auxiliary_loss_clip": 0.0121329, "auxiliary_loss_mlp": 0.01047486, "balance_loss_clip": 1.06471992, "balance_loss_mlp": 1.03263247, "epoch": 0.11338904587266278, "flos": 16325386479360.0, "grad_norm": 1.8918368618015227, "language_loss": 0.85658193, "learning_rate": 3.927587836309907e-06, "loss": 0.87918973, "num_input_tokens_seen": 20013615, "step": 943, "time_per_iteration": 2.620441436767578 }, { "auxiliary_loss_clip": 0.0120503, "auxiliary_loss_mlp": 0.01049553, "balance_loss_clip": 1.06203079, "balance_loss_mlp": 1.0341512, "epoch": 0.11350928876330187, "flos": 24426258923520.0, "grad_norm": 1.7814895615675355, "language_loss": 0.78190124, "learning_rate": 3.927379978690452e-06, "loss": 0.80444705, "num_input_tokens_seen": 20032880, "step": 944, "time_per_iteration": 2.6805810928344727 }, { "auxiliary_loss_clip": 0.0117865, "auxiliary_loss_mlp": 0.01055525, "balance_loss_clip": 1.05413163, "balance_loss_mlp": 1.04087448, "epoch": 0.11362953165394096, "flos": 24497074586880.0, "grad_norm": 2.1581271440114, "language_loss": 0.87224996, "learning_rate": 3.927171828688987e-06, "loss": 0.89459169, "num_input_tokens_seen": 20052405, "step": 945, "time_per_iteration": 2.744413137435913 }, { "auxiliary_loss_clip": 0.01248143, "auxiliary_loss_mlp": 0.01058496, "balance_loss_clip": 1.07098842, "balance_loss_mlp": 1.04330885, "epoch": 0.11374977454458005, "flos": 24060831909120.0, "grad_norm": 2.4339672056561237, "language_loss": 0.82052505, "learning_rate": 3.926963386337088e-06, "loss": 0.84359145, "num_input_tokens_seen": 20070635, "step": 946, "time_per_iteration": 2.579617738723755 }, { "auxiliary_loss_clip": 0.01248589, "auxiliary_loss_mlp": 0.01043555, "balance_loss_clip": 1.06896996, "balance_loss_mlp": 1.0279026, "epoch": 0.11387001743521914, "flos": 39457638967680.0, "grad_norm": 2.128466457532593, "language_loss": 0.70096624, "learning_rate": 3.926754651666375e-06, "loss": 0.72388768, "num_input_tokens_seen": 20091195, "step": 947, "time_per_iteration": 2.7846486568450928 }, { "auxiliary_loss_clip": 0.0119765, "auxiliary_loss_mlp": 0.01039953, "balance_loss_clip": 1.0649904, "balance_loss_mlp": 1.02545691, "epoch": 0.11399026032585824, "flos": 25082454533760.0, "grad_norm": 2.5370462199478965, "language_loss": 0.77753091, "learning_rate": 3.926545624708513e-06, "loss": 0.79990697, "num_input_tokens_seen": 20110435, "step": 948, "time_per_iteration": 2.729325294494629 }, { "auxiliary_loss_clip": 0.01191469, "auxiliary_loss_mlp": 0.01051839, "balance_loss_clip": 1.06105018, "balance_loss_mlp": 1.03728318, "epoch": 0.11411050321649732, "flos": 17961835224960.0, "grad_norm": 1.8290425292614787, "language_loss": 0.8582232, "learning_rate": 3.926336305495213e-06, "loss": 0.88065624, "num_input_tokens_seen": 20128995, "step": 949, "time_per_iteration": 2.6501269340515137 }, { "auxiliary_loss_clip": 0.01183569, "auxiliary_loss_mlp": 0.01051512, "balance_loss_clip": 1.06125796, "balance_loss_mlp": 1.03564477, "epoch": 0.11423074610713642, "flos": 22455409536000.0, "grad_norm": 2.242399964448005, "language_loss": 0.89168102, "learning_rate": 3.926126694058226e-06, "loss": 0.91403186, "num_input_tokens_seen": 20148145, "step": 950, "time_per_iteration": 2.6980154514312744 }, { "auxiliary_loss_clip": 0.01179567, "auxiliary_loss_mlp": 0.01047612, "balance_loss_clip": 1.06547141, "balance_loss_mlp": 1.03383088, "epoch": 0.1143509889977755, "flos": 19717687756800.0, "grad_norm": 1.4951895813456104, "language_loss": 0.82462156, "learning_rate": 3.92591679042935e-06, "loss": 0.84689337, "num_input_tokens_seen": 20168035, "step": 951, "time_per_iteration": 2.7088329792022705 }, { "auxiliary_loss_clip": 0.01228829, "auxiliary_loss_mlp": 0.01048554, "balance_loss_clip": 1.0685302, "balance_loss_mlp": 1.03304482, "epoch": 0.1144712318884146, "flos": 19822869757440.0, "grad_norm": 1.6003447047136832, "language_loss": 0.8240943, "learning_rate": 3.92570659464043e-06, "loss": 0.84686816, "num_input_tokens_seen": 20186095, "step": 952, "time_per_iteration": 2.6690688133239746 }, { "auxiliary_loss_clip": 0.01228826, "auxiliary_loss_mlp": 0.0071429, "balance_loss_clip": 1.06949794, "balance_loss_mlp": 0.99998164, "epoch": 0.1145914747790537, "flos": 14939198766720.0, "grad_norm": 1.8850154309710432, "language_loss": 0.79447246, "learning_rate": 3.925496106723349e-06, "loss": 0.81390357, "num_input_tokens_seen": 20203535, "step": 953, "time_per_iteration": 2.6539788246154785 }, { "auxiliary_loss_clip": 0.01231692, "auxiliary_loss_mlp": 0.01052456, "balance_loss_clip": 1.06818724, "balance_loss_mlp": 1.03796053, "epoch": 0.11471171766969278, "flos": 19865029345920.0, "grad_norm": 1.8604414687462363, "language_loss": 0.83669734, "learning_rate": 3.9252853267100405e-06, "loss": 0.85953879, "num_input_tokens_seen": 20222780, "step": 954, "time_per_iteration": 3.5255417823791504 }, { "auxiliary_loss_clip": 0.01185854, "auxiliary_loss_mlp": 0.01051663, "balance_loss_clip": 1.06196976, "balance_loss_mlp": 1.03658342, "epoch": 0.11483196056033187, "flos": 22526476594560.0, "grad_norm": 1.817758560303797, "language_loss": 0.83654964, "learning_rate": 3.9250742546324786e-06, "loss": 0.85892487, "num_input_tokens_seen": 20243015, "step": 955, "time_per_iteration": 2.704704999923706 }, { "auxiliary_loss_clip": 0.01213025, "auxiliary_loss_mlp": 0.01045011, "balance_loss_clip": 1.06387973, "balance_loss_mlp": 1.03082538, "epoch": 0.11495220345097096, "flos": 28220292887040.0, "grad_norm": 1.7423213376670872, "language_loss": 0.86585116, "learning_rate": 3.924862890522683e-06, "loss": 0.88843149, "num_input_tokens_seen": 20263025, "step": 956, "time_per_iteration": 2.6953585147857666 }, { "auxiliary_loss_clip": 0.01227828, "auxiliary_loss_mlp": 0.01045441, "balance_loss_clip": 1.0640955, "balance_loss_mlp": 1.03022993, "epoch": 0.11507244634161005, "flos": 17492267704320.0, "grad_norm": 2.281324347258038, "language_loss": 0.86301947, "learning_rate": 3.9246512344127174e-06, "loss": 0.88575214, "num_input_tokens_seen": 20280685, "step": 957, "time_per_iteration": 3.499061107635498 }, { "auxiliary_loss_clip": 0.01141442, "auxiliary_loss_mlp": 0.01049296, "balance_loss_clip": 1.05461431, "balance_loss_mlp": 1.03558731, "epoch": 0.11519268923224914, "flos": 22564937082240.0, "grad_norm": 2.0059963468939266, "language_loss": 0.82006812, "learning_rate": 3.9244392863346895e-06, "loss": 0.84197551, "num_input_tokens_seen": 20300090, "step": 958, "time_per_iteration": 3.679316759109497 }, { "auxiliary_loss_clip": 0.0121364, "auxiliary_loss_mlp": 0.0106001, "balance_loss_clip": 1.0672071, "balance_loss_mlp": 1.04444122, "epoch": 0.11531293212288823, "flos": 16982839065600.0, "grad_norm": 1.901254364522041, "language_loss": 0.9246493, "learning_rate": 3.9242270463207524e-06, "loss": 0.94738579, "num_input_tokens_seen": 20318480, "step": 959, "time_per_iteration": 2.6667709350585938 }, { "auxiliary_loss_clip": 0.01166623, "auxiliary_loss_mlp": 0.0104494, "balance_loss_clip": 1.05978131, "balance_loss_mlp": 1.02976465, "epoch": 0.11543317501352733, "flos": 12422004537600.0, "grad_norm": 3.565099777418526, "language_loss": 0.85116029, "learning_rate": 3.924014514403102e-06, "loss": 0.87327588, "num_input_tokens_seen": 20334635, "step": 960, "time_per_iteration": 2.712925672531128 }, { "auxiliary_loss_clip": 0.01173186, "auxiliary_loss_mlp": 0.01047126, "balance_loss_clip": 1.06015754, "balance_loss_mlp": 1.03062749, "epoch": 0.11555341790416641, "flos": 19821648695040.0, "grad_norm": 1.979908336404065, "language_loss": 0.91394043, "learning_rate": 3.92380169061398e-06, "loss": 0.93614352, "num_input_tokens_seen": 20352415, "step": 961, "time_per_iteration": 2.6706056594848633 }, { "auxiliary_loss_clip": 0.01184278, "auxiliary_loss_mlp": 0.00715292, "balance_loss_clip": 1.05709827, "balance_loss_mlp": 0.99997365, "epoch": 0.11567366079480551, "flos": 25738865625600.0, "grad_norm": 2.0627200131860124, "language_loss": 0.83737087, "learning_rate": 3.9235885749856705e-06, "loss": 0.85636657, "num_input_tokens_seen": 20371095, "step": 962, "time_per_iteration": 2.781421422958374 }, { "auxiliary_loss_clip": 0.01209591, "auxiliary_loss_mlp": 0.01043432, "balance_loss_clip": 1.06698287, "balance_loss_mlp": 1.02806568, "epoch": 0.1157939036854446, "flos": 18223301301120.0, "grad_norm": 1.7519148289041606, "language_loss": 0.82580352, "learning_rate": 3.9233751675505035e-06, "loss": 0.84833378, "num_input_tokens_seen": 20389805, "step": 963, "time_per_iteration": 2.6289117336273193 }, { "auxiliary_loss_clip": 0.012059, "auxiliary_loss_mlp": 0.01053777, "balance_loss_clip": 1.06494045, "balance_loss_mlp": 1.03817272, "epoch": 0.11591414657608369, "flos": 23073755189760.0, "grad_norm": 2.2616593566199166, "language_loss": 0.84872854, "learning_rate": 3.923161468340853e-06, "loss": 0.87132537, "num_input_tokens_seen": 20409640, "step": 964, "time_per_iteration": 2.678039312362671 }, { "auxiliary_loss_clip": 0.01161791, "auxiliary_loss_mlp": 0.01056289, "balance_loss_clip": 1.05462384, "balance_loss_mlp": 1.04126883, "epoch": 0.11603438946672277, "flos": 19461716461440.0, "grad_norm": 5.195468101328973, "language_loss": 0.81385034, "learning_rate": 3.9229474773891374e-06, "loss": 0.83603114, "num_input_tokens_seen": 20428180, "step": 965, "time_per_iteration": 2.700010299682617 }, { "auxiliary_loss_clip": 0.01199342, "auxiliary_loss_mlp": 0.01040722, "balance_loss_clip": 1.05889654, "balance_loss_mlp": 1.02528429, "epoch": 0.11615463235736187, "flos": 26831986272000.0, "grad_norm": 1.9120824535280299, "language_loss": 0.8356868, "learning_rate": 3.922733194727818e-06, "loss": 0.85808742, "num_input_tokens_seen": 20447975, "step": 966, "time_per_iteration": 2.7750091552734375 }, { "auxiliary_loss_clip": 0.01233374, "auxiliary_loss_mlp": 0.01040649, "balance_loss_clip": 1.06906319, "balance_loss_mlp": 1.02560449, "epoch": 0.11627487524800097, "flos": 18580324533120.0, "grad_norm": 1.9956357505286788, "language_loss": 0.87419462, "learning_rate": 3.922518620389402e-06, "loss": 0.89693493, "num_input_tokens_seen": 20464840, "step": 967, "time_per_iteration": 2.568937301635742 }, { "auxiliary_loss_clip": 0.01116089, "auxiliary_loss_mlp": 0.01057164, "balance_loss_clip": 1.05351353, "balance_loss_mlp": 1.04234672, "epoch": 0.11639511813864005, "flos": 18150474476160.0, "grad_norm": 1.7751751986808093, "language_loss": 0.89895833, "learning_rate": 3.922303754406439e-06, "loss": 0.92069083, "num_input_tokens_seen": 20482680, "step": 968, "time_per_iteration": 2.7865209579467773 }, { "auxiliary_loss_clip": 0.01171553, "auxiliary_loss_mlp": 0.01046027, "balance_loss_clip": 1.05580711, "balance_loss_mlp": 1.03131652, "epoch": 0.11651536102927915, "flos": 20922023888640.0, "grad_norm": 7.689352529579397, "language_loss": 0.78993273, "learning_rate": 3.922088596811526e-06, "loss": 0.81210852, "num_input_tokens_seen": 20501810, "step": 969, "time_per_iteration": 2.667557716369629 }, { "auxiliary_loss_clip": 0.01218071, "auxiliary_loss_mlp": 0.01047844, "balance_loss_clip": 1.06571937, "balance_loss_mlp": 1.03277588, "epoch": 0.11663560391991823, "flos": 16508602776960.0, "grad_norm": 2.1043493669182225, "language_loss": 0.86406672, "learning_rate": 3.9218731476373e-06, "loss": 0.8867259, "num_input_tokens_seen": 20517995, "step": 970, "time_per_iteration": 2.6032581329345703 }, { "auxiliary_loss_clip": 0.01232881, "auxiliary_loss_mlp": 0.01042852, "balance_loss_clip": 1.07060587, "balance_loss_mlp": 1.02735436, "epoch": 0.11675584681055733, "flos": 19865029345920.0, "grad_norm": 2.184660103143737, "language_loss": 0.84737945, "learning_rate": 3.9216574069164455e-06, "loss": 0.87013674, "num_input_tokens_seen": 20536970, "step": 971, "time_per_iteration": 2.6326780319213867 }, { "auxiliary_loss_clip": 0.01243975, "auxiliary_loss_mlp": 0.01049372, "balance_loss_clip": 1.06907439, "balance_loss_mlp": 1.0355494, "epoch": 0.11687608970119642, "flos": 21944364785280.0, "grad_norm": 1.5536436089737597, "language_loss": 0.80043429, "learning_rate": 3.921441374681691e-06, "loss": 0.82336783, "num_input_tokens_seen": 20557030, "step": 972, "time_per_iteration": 2.67228627204895 }, { "auxiliary_loss_clip": 0.01209102, "auxiliary_loss_mlp": 0.01043113, "balance_loss_clip": 1.06743836, "balance_loss_mlp": 1.0287838, "epoch": 0.1169963325918355, "flos": 24061155131520.0, "grad_norm": 1.885597258291384, "language_loss": 0.65058607, "learning_rate": 3.921225050965808e-06, "loss": 0.67310822, "num_input_tokens_seen": 20576915, "step": 973, "time_per_iteration": 2.6910390853881836 }, { "auxiliary_loss_clip": 0.01191005, "auxiliary_loss_mlp": 0.0103976, "balance_loss_clip": 1.06189787, "balance_loss_mlp": 1.02478147, "epoch": 0.1171165754824746, "flos": 23368151059200.0, "grad_norm": 2.1170672429228, "language_loss": 0.75074762, "learning_rate": 3.921008435801612e-06, "loss": 0.77305526, "num_input_tokens_seen": 20596000, "step": 974, "time_per_iteration": 2.7110772132873535 }, { "auxiliary_loss_clip": 0.01217715, "auxiliary_loss_mlp": 0.01044304, "balance_loss_clip": 1.06462967, "balance_loss_mlp": 1.02882493, "epoch": 0.11723681837311369, "flos": 18552243075840.0, "grad_norm": 2.2880034581583963, "language_loss": 0.75954688, "learning_rate": 3.920791529221963e-06, "loss": 0.78216714, "num_input_tokens_seen": 20614675, "step": 975, "time_per_iteration": 2.6371116638183594 }, { "auxiliary_loss_clip": 0.01214462, "auxiliary_loss_mlp": 0.00714627, "balance_loss_clip": 1.06564045, "balance_loss_mlp": 1.00006461, "epoch": 0.11735706126375278, "flos": 23550541344000.0, "grad_norm": 1.796147567845045, "language_loss": 0.76638037, "learning_rate": 3.920574331259768e-06, "loss": 0.78567123, "num_input_tokens_seen": 20635875, "step": 976, "time_per_iteration": 2.7163937091827393 }, { "auxiliary_loss_clip": 0.01198942, "auxiliary_loss_mlp": 0.01052274, "balance_loss_clip": 1.062711, "balance_loss_mlp": 1.03756356, "epoch": 0.11747730415439187, "flos": 22381541216640.0, "grad_norm": 2.599843215673764, "language_loss": 0.79643196, "learning_rate": 3.9203568419479716e-06, "loss": 0.8189441, "num_input_tokens_seen": 20656430, "step": 977, "time_per_iteration": 2.684084892272949 }, { "auxiliary_loss_clip": 0.01207941, "auxiliary_loss_mlp": 0.01044291, "balance_loss_clip": 1.06424427, "balance_loss_mlp": 1.0298667, "epoch": 0.11759754704503096, "flos": 22200731130240.0, "grad_norm": 1.9021553584343613, "language_loss": 0.7531606, "learning_rate": 3.92013906131957e-06, "loss": 0.77568293, "num_input_tokens_seen": 20675360, "step": 978, "time_per_iteration": 2.670773506164551 }, { "auxiliary_loss_clip": 0.01189947, "auxiliary_loss_mlp": 0.01047619, "balance_loss_clip": 1.06242537, "balance_loss_mlp": 1.03334951, "epoch": 0.11771778993567006, "flos": 22309755886080.0, "grad_norm": 1.752740810442128, "language_loss": 0.82638562, "learning_rate": 3.9199209894076e-06, "loss": 0.84876126, "num_input_tokens_seen": 20695675, "step": 979, "time_per_iteration": 3.6093435287475586 }, { "auxiliary_loss_clip": 0.01247873, "auxiliary_loss_mlp": 0.01040846, "balance_loss_clip": 1.06799197, "balance_loss_mlp": 1.02624345, "epoch": 0.11783803282630914, "flos": 21288169175040.0, "grad_norm": 1.8356956332533978, "language_loss": 0.89960086, "learning_rate": 3.919702626245142e-06, "loss": 0.92248809, "num_input_tokens_seen": 20715330, "step": 980, "time_per_iteration": 3.712007522583008 }, { "auxiliary_loss_clip": 0.01200571, "auxiliary_loss_mlp": 0.01048654, "balance_loss_clip": 1.06286657, "balance_loss_mlp": 1.03396749, "epoch": 0.11795827571694824, "flos": 25371535190400.0, "grad_norm": 2.2127375869059276, "language_loss": 0.66080832, "learning_rate": 3.919483971865322e-06, "loss": 0.68330061, "num_input_tokens_seen": 20735325, "step": 981, "time_per_iteration": 2.6722028255462646 }, { "auxiliary_loss_clip": 0.01207804, "auxiliary_loss_mlp": 0.0104397, "balance_loss_clip": 1.06629813, "balance_loss_mlp": 1.02942657, "epoch": 0.11807851860758732, "flos": 23622218933760.0, "grad_norm": 2.136492711023447, "language_loss": 0.88065004, "learning_rate": 3.91926502630131e-06, "loss": 0.90316772, "num_input_tokens_seen": 20755940, "step": 982, "time_per_iteration": 2.6810405254364014 }, { "auxiliary_loss_clip": 0.01231012, "auxiliary_loss_mlp": 0.01046431, "balance_loss_clip": 1.07050478, "balance_loss_mlp": 1.03250694, "epoch": 0.11819876149822642, "flos": 24972496024320.0, "grad_norm": 1.7825337535278205, "language_loss": 0.71921998, "learning_rate": 3.91904578958632e-06, "loss": 0.74199438, "num_input_tokens_seen": 20775355, "step": 983, "time_per_iteration": 3.5797431468963623 }, { "auxiliary_loss_clip": 0.01245199, "auxiliary_loss_mlp": 0.01037021, "balance_loss_clip": 1.06805778, "balance_loss_mlp": 1.02258444, "epoch": 0.11831900438886551, "flos": 23003226835200.0, "grad_norm": 2.091759901174985, "language_loss": 0.84405959, "learning_rate": 3.918826261753608e-06, "loss": 0.86688185, "num_input_tokens_seen": 20794935, "step": 984, "time_per_iteration": 3.5498013496398926 }, { "auxiliary_loss_clip": 0.01211144, "auxiliary_loss_mlp": 0.01048488, "balance_loss_clip": 1.06544018, "balance_loss_mlp": 1.03404593, "epoch": 0.1184392472795046, "flos": 27965147604480.0, "grad_norm": 3.004176516898627, "language_loss": 0.71152395, "learning_rate": 3.918606442836478e-06, "loss": 0.73412025, "num_input_tokens_seen": 20817155, "step": 985, "time_per_iteration": 2.791205644607544 }, { "auxiliary_loss_clip": 0.01229028, "auxiliary_loss_mlp": 0.01038798, "balance_loss_clip": 1.07085824, "balance_loss_mlp": 1.02541709, "epoch": 0.1185594901701437, "flos": 19898497843200.0, "grad_norm": 1.8792526604498876, "language_loss": 0.77756774, "learning_rate": 3.918386332868277e-06, "loss": 0.800246, "num_input_tokens_seen": 20835125, "step": 986, "time_per_iteration": 2.654364585876465 }, { "auxiliary_loss_clip": 0.01216824, "auxiliary_loss_mlp": 0.01050723, "balance_loss_clip": 1.06329989, "balance_loss_mlp": 1.0364778, "epoch": 0.11867973306078278, "flos": 18912354877440.0, "grad_norm": 1.6630117221653795, "language_loss": 0.94440329, "learning_rate": 3.918165931882394e-06, "loss": 0.9670788, "num_input_tokens_seen": 20853525, "step": 987, "time_per_iteration": 2.646224021911621 }, { "auxiliary_loss_clip": 0.01147806, "auxiliary_loss_mlp": 0.0105098, "balance_loss_clip": 1.05473626, "balance_loss_mlp": 1.03715181, "epoch": 0.11879997595142187, "flos": 16982803152000.0, "grad_norm": 2.6014641470906787, "language_loss": 0.75531614, "learning_rate": 3.917945239912264e-06, "loss": 0.77730399, "num_input_tokens_seen": 20871000, "step": 988, "time_per_iteration": 2.7416794300079346 }, { "auxiliary_loss_clip": 0.01178464, "auxiliary_loss_mlp": 0.01039507, "balance_loss_clip": 1.06196606, "balance_loss_mlp": 1.0257442, "epoch": 0.11892021884206096, "flos": 17530369056000.0, "grad_norm": 1.9948928835136253, "language_loss": 0.75765401, "learning_rate": 3.917724256991367e-06, "loss": 0.77983373, "num_input_tokens_seen": 20889745, "step": 989, "time_per_iteration": 2.6480040550231934 }, { "auxiliary_loss_clip": 0.012016, "auxiliary_loss_mlp": 0.01039275, "balance_loss_clip": 1.06526542, "balance_loss_mlp": 1.0245285, "epoch": 0.11904046173270005, "flos": 30955895763840.0, "grad_norm": 1.9823904349116779, "language_loss": 0.81541371, "learning_rate": 3.9175029831532245e-06, "loss": 0.83782244, "num_input_tokens_seen": 20909260, "step": 990, "time_per_iteration": 2.7240023612976074 }, { "auxiliary_loss_clip": 0.01197777, "auxiliary_loss_mlp": 0.0105349, "balance_loss_clip": 1.06656361, "balance_loss_mlp": 1.03907704, "epoch": 0.11916070462333915, "flos": 20157234485760.0, "grad_norm": 2.171560391634001, "language_loss": 0.88444203, "learning_rate": 3.917281418431404e-06, "loss": 0.90695465, "num_input_tokens_seen": 20928305, "step": 991, "time_per_iteration": 2.678678035736084 }, { "auxiliary_loss_clip": 0.01213052, "auxiliary_loss_mlp": 0.01052259, "balance_loss_clip": 1.06811762, "balance_loss_mlp": 1.03818643, "epoch": 0.11928094751397823, "flos": 23551115961600.0, "grad_norm": 2.353395418646498, "language_loss": 0.76716584, "learning_rate": 3.917059562859516e-06, "loss": 0.789819, "num_input_tokens_seen": 20947630, "step": 992, "time_per_iteration": 2.651879072189331 }, { "auxiliary_loss_clip": 0.01199795, "auxiliary_loss_mlp": 0.01038939, "balance_loss_clip": 1.06615162, "balance_loss_mlp": 1.02484274, "epoch": 0.11940119040461733, "flos": 23908426502400.0, "grad_norm": 2.0458969675922236, "language_loss": 0.88863575, "learning_rate": 3.916837416471218e-06, "loss": 0.91102308, "num_input_tokens_seen": 20964250, "step": 993, "time_per_iteration": 2.708617687225342 }, { "auxiliary_loss_clip": 0.01222296, "auxiliary_loss_mlp": 0.01050881, "balance_loss_clip": 1.06511569, "balance_loss_mlp": 1.03583658, "epoch": 0.11952143329525641, "flos": 13844533835520.0, "grad_norm": 2.5044452898938525, "language_loss": 0.72249758, "learning_rate": 3.916614979300207e-06, "loss": 0.7452293, "num_input_tokens_seen": 20979095, "step": 994, "time_per_iteration": 2.632899761199951 }, { "auxiliary_loss_clip": 0.01167282, "auxiliary_loss_mlp": 0.01046065, "balance_loss_clip": 1.05990744, "balance_loss_mlp": 1.0320524, "epoch": 0.11964167618589551, "flos": 27015525792000.0, "grad_norm": 1.599542815148772, "language_loss": 0.78659213, "learning_rate": 3.9163922513802274e-06, "loss": 0.8087256, "num_input_tokens_seen": 21001430, "step": 995, "time_per_iteration": 2.764341115951538 }, { "auxiliary_loss_clip": 0.01249171, "auxiliary_loss_mlp": 0.01046325, "balance_loss_clip": 1.06895697, "balance_loss_mlp": 1.03114986, "epoch": 0.1197619190765346, "flos": 12567622273920.0, "grad_norm": 2.2244650253654785, "language_loss": 0.83065426, "learning_rate": 3.916169232745067e-06, "loss": 0.8536092, "num_input_tokens_seen": 21019105, "step": 996, "time_per_iteration": 2.6370620727539062 }, { "auxiliary_loss_clip": 0.01201566, "auxiliary_loss_mlp": 0.01053896, "balance_loss_clip": 1.06393325, "balance_loss_mlp": 1.0379101, "epoch": 0.11988216196717369, "flos": 16909437623040.0, "grad_norm": 2.5976381888606324, "language_loss": 0.92004055, "learning_rate": 3.915945923428559e-06, "loss": 0.94259512, "num_input_tokens_seen": 21035630, "step": 997, "time_per_iteration": 2.6272642612457275 }, { "auxiliary_loss_clip": 0.01223916, "auxiliary_loss_mlp": 0.0103699, "balance_loss_clip": 1.06659508, "balance_loss_mlp": 1.02268457, "epoch": 0.12000240485781279, "flos": 16216577205120.0, "grad_norm": 2.281961189343109, "language_loss": 0.8318404, "learning_rate": 3.915722323464577e-06, "loss": 0.85444945, "num_input_tokens_seen": 21054235, "step": 998, "time_per_iteration": 2.611659526824951 }, { "auxiliary_loss_clip": 0.0122676, "auxiliary_loss_mlp": 0.01045791, "balance_loss_clip": 1.06854367, "balance_loss_mlp": 1.03081226, "epoch": 0.12012264774845187, "flos": 49344887525760.0, "grad_norm": 3.1789906640614833, "language_loss": 0.70413607, "learning_rate": 3.91549843288704e-06, "loss": 0.72686148, "num_input_tokens_seen": 21077915, "step": 999, "time_per_iteration": 2.933468818664551 }, { "auxiliary_loss_clip": 0.01192287, "auxiliary_loss_mlp": 0.00714732, "balance_loss_clip": 1.06136513, "balance_loss_mlp": 1.00004983, "epoch": 0.12024289063909097, "flos": 26979435601920.0, "grad_norm": 1.9101235268724763, "language_loss": 0.79048371, "learning_rate": 3.915274251729916e-06, "loss": 0.80955386, "num_input_tokens_seen": 21099205, "step": 1000, "time_per_iteration": 2.7830140590667725 }, { "auxiliary_loss_clip": 0.01200819, "auxiliary_loss_mlp": 0.01052677, "balance_loss_clip": 1.06803477, "balance_loss_mlp": 1.03821719, "epoch": 0.12036313352973005, "flos": 19537308633600.0, "grad_norm": 2.059700374056191, "language_loss": 0.9009515, "learning_rate": 3.91504978002721e-06, "loss": 0.92348641, "num_input_tokens_seen": 21118260, "step": 1001, "time_per_iteration": 2.671412229537964 }, { "auxiliary_loss_clip": 0.01211688, "auxiliary_loss_mlp": 0.0071478, "balance_loss_clip": 1.06364846, "balance_loss_mlp": 1.00005341, "epoch": 0.12048337642036915, "flos": 17268256535040.0, "grad_norm": 2.6477233584652033, "language_loss": 0.76309812, "learning_rate": 3.914825017812974e-06, "loss": 0.78236282, "num_input_tokens_seen": 21134910, "step": 1002, "time_per_iteration": 2.6638171672821045 }, { "auxiliary_loss_clip": 0.01214405, "auxiliary_loss_mlp": 0.01045936, "balance_loss_clip": 1.07121766, "balance_loss_mlp": 1.03155911, "epoch": 0.12060361931100824, "flos": 22856962654080.0, "grad_norm": 2.1654157251758956, "language_loss": 0.72756088, "learning_rate": 3.9145999651213065e-06, "loss": 0.75016427, "num_input_tokens_seen": 21154150, "step": 1003, "time_per_iteration": 2.6794509887695312 }, { "auxiliary_loss_clip": 0.01231019, "auxiliary_loss_mlp": 0.01049551, "balance_loss_clip": 1.07027531, "balance_loss_mlp": 1.03384507, "epoch": 0.12072386220164733, "flos": 16726795943040.0, "grad_norm": 2.31041563242056, "language_loss": 0.88450718, "learning_rate": 3.9143746219863465e-06, "loss": 0.90731287, "num_input_tokens_seen": 21171255, "step": 1004, "time_per_iteration": 2.6293530464172363 }, { "auxiliary_loss_clip": 0.01132839, "auxiliary_loss_mlp": 0.01010052, "balance_loss_clip": 1.05303693, "balance_loss_mlp": 1.00390041, "epoch": 0.12084410509228642, "flos": 55144176105600.0, "grad_norm": 0.9378548306225524, "language_loss": 0.64760095, "learning_rate": 3.914148988442278e-06, "loss": 0.66902989, "num_input_tokens_seen": 21227045, "step": 1005, "time_per_iteration": 4.090515613555908 }, { "auxiliary_loss_clip": 0.01203706, "auxiliary_loss_mlp": 0.01049625, "balance_loss_clip": 1.06783152, "balance_loss_mlp": 1.03355575, "epoch": 0.1209643479829255, "flos": 26760236855040.0, "grad_norm": 2.4976766988714574, "language_loss": 0.95440948, "learning_rate": 3.91392306452333e-06, "loss": 0.97694278, "num_input_tokens_seen": 21244120, "step": 1006, "time_per_iteration": 3.5444796085357666 }, { "auxiliary_loss_clip": 0.0124891, "auxiliary_loss_mlp": 0.01048934, "balance_loss_clip": 1.07003736, "balance_loss_mlp": 1.03311491, "epoch": 0.1210845908735646, "flos": 11035026725760.0, "grad_norm": 2.9720103032388874, "language_loss": 0.66408956, "learning_rate": 3.913696850263774e-06, "loss": 0.68706799, "num_input_tokens_seen": 21258485, "step": 1007, "time_per_iteration": 2.6182918548583984 }, { "auxiliary_loss_clip": 0.01227037, "auxiliary_loss_mlp": 0.01038586, "balance_loss_clip": 1.06762338, "balance_loss_mlp": 1.02343416, "epoch": 0.1212048337642037, "flos": 20484631975680.0, "grad_norm": 2.310199143045025, "language_loss": 0.79307044, "learning_rate": 3.913470345697929e-06, "loss": 0.81572664, "num_input_tokens_seen": 21277115, "step": 1008, "time_per_iteration": 2.618501901626587 }, { "auxiliary_loss_clip": 0.0118014, "auxiliary_loss_mlp": 0.01042798, "balance_loss_clip": 1.06430483, "balance_loss_mlp": 1.0281117, "epoch": 0.12132507665484278, "flos": 22346061557760.0, "grad_norm": 2.0244467475792876, "language_loss": 0.85353899, "learning_rate": 3.913243550860153e-06, "loss": 0.87576842, "num_input_tokens_seen": 21294880, "step": 1009, "time_per_iteration": 3.676604986190796 }, { "auxiliary_loss_clip": 0.01231906, "auxiliary_loss_mlp": 0.01044814, "balance_loss_clip": 1.07244647, "balance_loss_mlp": 1.02968645, "epoch": 0.12144531954548188, "flos": 29314957818240.0, "grad_norm": 1.8448159230751608, "language_loss": 0.76384366, "learning_rate": 3.913016465784852e-06, "loss": 0.78661084, "num_input_tokens_seen": 21315555, "step": 1010, "time_per_iteration": 3.613085985183716 }, { "auxiliary_loss_clip": 0.01178033, "auxiliary_loss_mlp": 0.01038347, "balance_loss_clip": 1.06197262, "balance_loss_mlp": 1.02337456, "epoch": 0.12156556243612096, "flos": 20485242506880.0, "grad_norm": 3.659404764483598, "language_loss": 0.71989381, "learning_rate": 3.912789090506474e-06, "loss": 0.74205768, "num_input_tokens_seen": 21334815, "step": 1011, "time_per_iteration": 2.702991485595703 }, { "auxiliary_loss_clip": 0.01199355, "auxiliary_loss_mlp": 0.01040287, "balance_loss_clip": 1.06262004, "balance_loss_mlp": 1.02543354, "epoch": 0.12168580532676006, "flos": 16472009796480.0, "grad_norm": 2.6810425534599758, "language_loss": 0.71964836, "learning_rate": 3.9125614250595114e-06, "loss": 0.74204481, "num_input_tokens_seen": 21351025, "step": 1012, "time_per_iteration": 2.6349897384643555 }, { "auxiliary_loss_clip": 0.01231704, "auxiliary_loss_mlp": 0.01048521, "balance_loss_clip": 1.06813836, "balance_loss_mlp": 1.03249907, "epoch": 0.12180604821739914, "flos": 15341290588800.0, "grad_norm": 2.911247113084427, "language_loss": 0.88923818, "learning_rate": 3.912333469478502e-06, "loss": 0.91204047, "num_input_tokens_seen": 21368990, "step": 1013, "time_per_iteration": 2.611292600631714 }, { "auxiliary_loss_clip": 0.01210334, "auxiliary_loss_mlp": 0.0104676, "balance_loss_clip": 1.06547511, "balance_loss_mlp": 1.03153706, "epoch": 0.12192629110803824, "flos": 19318038059520.0, "grad_norm": 1.9313176791647813, "language_loss": 0.77919674, "learning_rate": 3.912105223798025e-06, "loss": 0.80176771, "num_input_tokens_seen": 21388410, "step": 1014, "time_per_iteration": 2.6499404907226562 }, { "auxiliary_loss_clip": 0.01093706, "auxiliary_loss_mlp": 0.01007868, "balance_loss_clip": 1.02742517, "balance_loss_mlp": 1.00162184, "epoch": 0.12204653399867733, "flos": 47725354085760.0, "grad_norm": 0.9923216113556143, "language_loss": 0.67649812, "learning_rate": 3.9118766880527065e-06, "loss": 0.69751382, "num_input_tokens_seen": 21442845, "step": 1015, "time_per_iteration": 3.138136863708496 }, { "auxiliary_loss_clip": 0.01166268, "auxiliary_loss_mlp": 0.01043695, "balance_loss_clip": 1.05939376, "balance_loss_mlp": 1.02946782, "epoch": 0.12216677688931642, "flos": 18221936584320.0, "grad_norm": 1.7918278546343807, "language_loss": 0.7370795, "learning_rate": 3.9116478622772145e-06, "loss": 0.75917912, "num_input_tokens_seen": 21461420, "step": 1016, "time_per_iteration": 2.7293412685394287 }, { "auxiliary_loss_clip": 0.01228098, "auxiliary_loss_mlp": 0.01043899, "balance_loss_clip": 1.0702132, "balance_loss_mlp": 1.02893853, "epoch": 0.12228701977995551, "flos": 27525636789120.0, "grad_norm": 1.7013115395239224, "language_loss": 0.87852401, "learning_rate": 3.911418746506261e-06, "loss": 0.90124404, "num_input_tokens_seen": 21481550, "step": 1017, "time_per_iteration": 2.7354395389556885 }, { "auxiliary_loss_clip": 0.01234546, "auxiliary_loss_mlp": 0.01048589, "balance_loss_clip": 1.0744884, "balance_loss_mlp": 1.03385425, "epoch": 0.1224072626705946, "flos": 21798136517760.0, "grad_norm": 1.6923754685497663, "language_loss": 0.78217566, "learning_rate": 3.911189340774604e-06, "loss": 0.80500698, "num_input_tokens_seen": 21501680, "step": 1018, "time_per_iteration": 2.6579792499542236 }, { "auxiliary_loss_clip": 0.01215848, "auxiliary_loss_mlp": 0.01047958, "balance_loss_clip": 1.0635941, "balance_loss_mlp": 1.03330719, "epoch": 0.1225275055612337, "flos": 20703758895360.0, "grad_norm": 1.9618904827585273, "language_loss": 0.79351312, "learning_rate": 3.910959645117043e-06, "loss": 0.81615114, "num_input_tokens_seen": 21521015, "step": 1019, "time_per_iteration": 2.6594605445861816 }, { "auxiliary_loss_clip": 0.01122445, "auxiliary_loss_mlp": 0.0070455, "balance_loss_clip": 1.04896903, "balance_loss_mlp": 0.99990219, "epoch": 0.12264774845187278, "flos": 57745294462080.0, "grad_norm": 0.8237002222890593, "language_loss": 0.56711632, "learning_rate": 3.910729659568423e-06, "loss": 0.58538634, "num_input_tokens_seen": 21578200, "step": 1020, "time_per_iteration": 3.2561936378479004 }, { "auxiliary_loss_clip": 0.01216402, "auxiliary_loss_mlp": 0.01050089, "balance_loss_clip": 1.07095933, "balance_loss_mlp": 1.03580213, "epoch": 0.12276799134251187, "flos": 26396282298240.0, "grad_norm": 1.8916112745861497, "language_loss": 0.8230167, "learning_rate": 3.9104993841636344e-06, "loss": 0.84568167, "num_input_tokens_seen": 21598770, "step": 1021, "time_per_iteration": 2.720400810241699 }, { "auxiliary_loss_clip": 0.01212765, "auxiliary_loss_mlp": 0.0071463, "balance_loss_clip": 1.07342148, "balance_loss_mlp": 1.00015473, "epoch": 0.12288823423315097, "flos": 21064193919360.0, "grad_norm": 1.7880167924225863, "language_loss": 0.80875027, "learning_rate": 3.910268818937608e-06, "loss": 0.82802415, "num_input_tokens_seen": 21616925, "step": 1022, "time_per_iteration": 2.701918601989746 }, { "auxiliary_loss_clip": 0.01176899, "auxiliary_loss_mlp": 0.0104787, "balance_loss_clip": 1.0650239, "balance_loss_mlp": 1.03405356, "epoch": 0.12300847712379005, "flos": 12312441077760.0, "grad_norm": 3.32980421089156, "language_loss": 0.87521648, "learning_rate": 3.9100379639253196e-06, "loss": 0.89746416, "num_input_tokens_seen": 21633645, "step": 1023, "time_per_iteration": 2.681755304336548 }, { "auxiliary_loss_clip": 0.0121331, "auxiliary_loss_mlp": 0.01045528, "balance_loss_clip": 1.06638956, "balance_loss_mlp": 1.03086567, "epoch": 0.12312872001442915, "flos": 16762239688320.0, "grad_norm": 2.4514065428075664, "language_loss": 0.86293405, "learning_rate": 3.909806819161791e-06, "loss": 0.88552248, "num_input_tokens_seen": 21649120, "step": 1024, "time_per_iteration": 2.656277894973755 }, { "auxiliary_loss_clip": 0.01201907, "auxiliary_loss_mlp": 0.01045898, "balance_loss_clip": 1.06470561, "balance_loss_mlp": 1.03144968, "epoch": 0.12324896290506823, "flos": 18404937400320.0, "grad_norm": 2.0677040138651828, "language_loss": 0.86343825, "learning_rate": 3.909575384682086e-06, "loss": 0.88591635, "num_input_tokens_seen": 21668000, "step": 1025, "time_per_iteration": 2.676086664199829 }, { "auxiliary_loss_clip": 0.01228744, "auxiliary_loss_mlp": 0.01048478, "balance_loss_clip": 1.06496668, "balance_loss_mlp": 1.03469777, "epoch": 0.12336920579570733, "flos": 18915407533440.0, "grad_norm": 1.9156879961757882, "language_loss": 0.69065607, "learning_rate": 3.9093436605213144e-06, "loss": 0.71342826, "num_input_tokens_seen": 21688500, "step": 1026, "time_per_iteration": 2.6585140228271484 }, { "auxiliary_loss_clip": 0.01210787, "auxiliary_loss_mlp": 0.01044412, "balance_loss_clip": 1.06615758, "balance_loss_mlp": 1.02953434, "epoch": 0.12348944868634643, "flos": 23878369797120.0, "grad_norm": 1.9923160698273004, "language_loss": 0.79289633, "learning_rate": 3.909111646714627e-06, "loss": 0.81544828, "num_input_tokens_seen": 21709345, "step": 1027, "time_per_iteration": 2.698753833770752 }, { "auxiliary_loss_clip": 0.01245236, "auxiliary_loss_mlp": 0.01047136, "balance_loss_clip": 1.07181323, "balance_loss_mlp": 1.03308105, "epoch": 0.12360969157698551, "flos": 19026084314880.0, "grad_norm": 2.7231759183440136, "language_loss": 0.72474933, "learning_rate": 3.9088793432972206e-06, "loss": 0.74767309, "num_input_tokens_seen": 21728165, "step": 1028, "time_per_iteration": 2.606806993484497 }, { "auxiliary_loss_clip": 0.01180314, "auxiliary_loss_mlp": 0.01052137, "balance_loss_clip": 1.06617117, "balance_loss_mlp": 1.03817725, "epoch": 0.1237299344676246, "flos": 13224607983360.0, "grad_norm": 2.0049173439227777, "language_loss": 0.82388294, "learning_rate": 3.908646750304336e-06, "loss": 0.84620744, "num_input_tokens_seen": 21745850, "step": 1029, "time_per_iteration": 2.6677896976470947 }, { "auxiliary_loss_clip": 0.01218633, "auxiliary_loss_mlp": 0.01046059, "balance_loss_clip": 1.07188463, "balance_loss_mlp": 1.03069329, "epoch": 0.12385017735826369, "flos": 20485673470080.0, "grad_norm": 1.620491871333628, "language_loss": 0.87237883, "learning_rate": 3.908413867771257e-06, "loss": 0.89502573, "num_input_tokens_seen": 21764760, "step": 1030, "time_per_iteration": 2.7879834175109863 }, { "auxiliary_loss_clip": 0.01230105, "auxiliary_loss_mlp": 0.01051917, "balance_loss_clip": 1.07173502, "balance_loss_mlp": 1.03568053, "epoch": 0.12397042024890279, "flos": 17347835116800.0, "grad_norm": 1.8500686164538422, "language_loss": 0.80596066, "learning_rate": 3.908180695733311e-06, "loss": 0.82878077, "num_input_tokens_seen": 21784250, "step": 1031, "time_per_iteration": 3.5542349815368652 }, { "auxiliary_loss_clip": 0.01149631, "auxiliary_loss_mlp": 0.01050649, "balance_loss_clip": 1.05296469, "balance_loss_mlp": 1.03658819, "epoch": 0.12409066313954187, "flos": 20412343854720.0, "grad_norm": 1.785881100789941, "language_loss": 0.82840216, "learning_rate": 3.907947234225871e-06, "loss": 0.85040498, "num_input_tokens_seen": 21803260, "step": 1032, "time_per_iteration": 3.5173118114471436 }, { "auxiliary_loss_clip": 0.01150465, "auxiliary_loss_mlp": 0.01053265, "balance_loss_clip": 1.05868578, "balance_loss_mlp": 1.03933001, "epoch": 0.12421090603018096, "flos": 20736688688640.0, "grad_norm": 1.8873466694614538, "language_loss": 0.87258637, "learning_rate": 3.907713483284352e-06, "loss": 0.8946237, "num_input_tokens_seen": 21822735, "step": 1033, "time_per_iteration": 2.765359878540039 }, { "auxiliary_loss_clip": 0.0113037, "auxiliary_loss_mlp": 0.01051132, "balance_loss_clip": 1.05584836, "balance_loss_mlp": 1.03687406, "epoch": 0.12433114892082006, "flos": 24498834353280.0, "grad_norm": 2.950695073217228, "language_loss": 0.97194988, "learning_rate": 3.907479442944216e-06, "loss": 0.99376488, "num_input_tokens_seen": 21841140, "step": 1034, "time_per_iteration": 2.8030178546905518 }, { "auxiliary_loss_clip": 0.01225662, "auxiliary_loss_mlp": 0.01047239, "balance_loss_clip": 1.06938326, "balance_loss_mlp": 1.03274298, "epoch": 0.12445139181145914, "flos": 19682315838720.0, "grad_norm": 2.059351045171155, "language_loss": 0.92171502, "learning_rate": 3.907245113240963e-06, "loss": 0.94444406, "num_input_tokens_seen": 21859260, "step": 1035, "time_per_iteration": 4.441791534423828 }, { "auxiliary_loss_clip": 0.01189001, "auxiliary_loss_mlp": 0.01050453, "balance_loss_clip": 1.05912662, "balance_loss_mlp": 1.03499186, "epoch": 0.12457163470209824, "flos": 46423087522560.0, "grad_norm": 1.8326431246029171, "language_loss": 0.73739326, "learning_rate": 3.907010494210144e-06, "loss": 0.7597878, "num_input_tokens_seen": 21881920, "step": 1036, "time_per_iteration": 2.890490770339966 }, { "auxiliary_loss_clip": 0.01233412, "auxiliary_loss_mlp": 0.01046313, "balance_loss_clip": 1.0711062, "balance_loss_mlp": 1.03217471, "epoch": 0.12469187759273732, "flos": 20376289578240.0, "grad_norm": 2.0080832454207633, "language_loss": 0.91842806, "learning_rate": 3.9067755858873495e-06, "loss": 0.94122529, "num_input_tokens_seen": 21898720, "step": 1037, "time_per_iteration": 2.587590217590332 }, { "auxiliary_loss_clip": 0.0109421, "auxiliary_loss_mlp": 0.01005335, "balance_loss_clip": 1.0328629, "balance_loss_mlp": 0.99975568, "epoch": 0.12481212048337642, "flos": 69224641447680.0, "grad_norm": 0.8808297485725757, "language_loss": 0.62808776, "learning_rate": 3.906540388308214e-06, "loss": 0.64908326, "num_input_tokens_seen": 21958305, "step": 1038, "time_per_iteration": 3.2355124950408936 }, { "auxiliary_loss_clip": 0.01160908, "auxiliary_loss_mlp": 0.01046582, "balance_loss_clip": 1.06032372, "balance_loss_mlp": 1.03209782, "epoch": 0.12493236337401552, "flos": 18223696350720.0, "grad_norm": 1.6062092004183894, "language_loss": 0.81378961, "learning_rate": 3.906304901508417e-06, "loss": 0.83586448, "num_input_tokens_seen": 21977205, "step": 1039, "time_per_iteration": 2.703768253326416 }, { "auxiliary_loss_clip": 0.01233338, "auxiliary_loss_mlp": 0.01049851, "balance_loss_clip": 1.07216311, "balance_loss_mlp": 1.03524756, "epoch": 0.12505260626465461, "flos": 30044375303040.0, "grad_norm": 2.037963532620457, "language_loss": 0.75701225, "learning_rate": 3.9060691255236835e-06, "loss": 0.77984416, "num_input_tokens_seen": 21997770, "step": 1040, "time_per_iteration": 2.7465546131134033 }, { "auxiliary_loss_clip": 0.01225984, "auxiliary_loss_mlp": 0.01045138, "balance_loss_clip": 1.0667665, "balance_loss_mlp": 1.03086901, "epoch": 0.1251728491552937, "flos": 24433980347520.0, "grad_norm": 1.6885686180231079, "language_loss": 0.80494118, "learning_rate": 3.905833060389778e-06, "loss": 0.82765239, "num_input_tokens_seen": 22021890, "step": 1041, "time_per_iteration": 2.737687587738037 }, { "auxiliary_loss_clip": 0.01246685, "auxiliary_loss_mlp": 0.00714422, "balance_loss_clip": 1.07083702, "balance_loss_mlp": 1.00013316, "epoch": 0.12529309204593278, "flos": 27119809952640.0, "grad_norm": 1.844705550881063, "language_loss": 0.7811498, "learning_rate": 3.905596706142513e-06, "loss": 0.80076087, "num_input_tokens_seen": 22043300, "step": 1042, "time_per_iteration": 2.622330665588379 }, { "auxiliary_loss_clip": 0.01181573, "auxiliary_loss_mlp": 0.01044669, "balance_loss_clip": 1.05723155, "balance_loss_mlp": 1.03044748, "epoch": 0.12541333493657186, "flos": 30774151923840.0, "grad_norm": 2.3493666110912996, "language_loss": 0.85937011, "learning_rate": 3.9053600628177435e-06, "loss": 0.88163257, "num_input_tokens_seen": 22062910, "step": 1043, "time_per_iteration": 2.7794902324676514 }, { "auxiliary_loss_clip": 0.01246932, "auxiliary_loss_mlp": 0.0104519, "balance_loss_clip": 1.07060671, "balance_loss_mlp": 1.03070617, "epoch": 0.12553357782721097, "flos": 23659566099840.0, "grad_norm": 1.9054487199216856, "language_loss": 0.84510309, "learning_rate": 3.905123130451367e-06, "loss": 0.86802429, "num_input_tokens_seen": 22084010, "step": 1044, "time_per_iteration": 2.6539483070373535 }, { "auxiliary_loss_clip": 0.01247139, "auxiliary_loss_mlp": 0.01048226, "balance_loss_clip": 1.07212591, "balance_loss_mlp": 1.03438604, "epoch": 0.12565382071785006, "flos": 24863758577280.0, "grad_norm": 1.8739494197510198, "language_loss": 0.79318696, "learning_rate": 3.904885909079326e-06, "loss": 0.81614065, "num_input_tokens_seen": 22102795, "step": 1045, "time_per_iteration": 2.776099681854248 }, { "auxiliary_loss_clip": 0.01227932, "auxiliary_loss_mlp": 0.01051597, "balance_loss_clip": 1.066903, "balance_loss_mlp": 1.03637362, "epoch": 0.12577406360848914, "flos": 21360780518400.0, "grad_norm": 2.409112281289852, "language_loss": 0.77833772, "learning_rate": 3.904648398737607e-06, "loss": 0.80113304, "num_input_tokens_seen": 22121360, "step": 1046, "time_per_iteration": 2.6598708629608154 }, { "auxiliary_loss_clip": 0.01243441, "auxiliary_loss_mlp": 0.01042919, "balance_loss_clip": 1.06888509, "balance_loss_mlp": 1.02884614, "epoch": 0.12589430649912825, "flos": 36138056774400.0, "grad_norm": 1.774922658146648, "language_loss": 0.78003716, "learning_rate": 3.9044105994622406e-06, "loss": 0.80290079, "num_input_tokens_seen": 22142505, "step": 1047, "time_per_iteration": 2.7167463302612305 }, { "auxiliary_loss_clip": 0.01214519, "auxiliary_loss_mlp": 0.00715148, "balance_loss_clip": 1.06652987, "balance_loss_mlp": 1.00009465, "epoch": 0.12601454938976733, "flos": 25337671643520.0, "grad_norm": 1.8819550931333722, "language_loss": 0.81423903, "learning_rate": 3.9041725112893005e-06, "loss": 0.83353567, "num_input_tokens_seen": 22163730, "step": 1048, "time_per_iteration": 2.752594470977783 }, { "auxiliary_loss_clip": 0.01186807, "auxiliary_loss_mlp": 0.01046307, "balance_loss_clip": 1.06543171, "balance_loss_mlp": 1.03166831, "epoch": 0.12613479228040642, "flos": 15560094286080.0, "grad_norm": 1.78538337296101, "language_loss": 0.75087565, "learning_rate": 3.903934134254904e-06, "loss": 0.77320683, "num_input_tokens_seen": 22181520, "step": 1049, "time_per_iteration": 2.6792566776275635 }, { "auxiliary_loss_clip": 0.01229768, "auxiliary_loss_mlp": 0.0104879, "balance_loss_clip": 1.0661099, "balance_loss_mlp": 1.0339601, "epoch": 0.1262550351710455, "flos": 21470595373440.0, "grad_norm": 2.26747686278576, "language_loss": 0.84975219, "learning_rate": 3.903695468395213e-06, "loss": 0.87253773, "num_input_tokens_seen": 22199390, "step": 1050, "time_per_iteration": 2.6587345600128174 }, { "auxiliary_loss_clip": 0.01213327, "auxiliary_loss_mlp": 0.01042435, "balance_loss_clip": 1.06444025, "balance_loss_mlp": 1.02920318, "epoch": 0.1263752780616846, "flos": 31576719456000.0, "grad_norm": 1.9946066336898134, "language_loss": 0.55485392, "learning_rate": 3.903456513746434e-06, "loss": 0.57741159, "num_input_tokens_seen": 22220365, "step": 1051, "time_per_iteration": 2.7402536869049072 }, { "auxiliary_loss_clip": 0.01243551, "auxiliary_loss_mlp": 0.01045461, "balance_loss_clip": 1.06945825, "balance_loss_mlp": 1.03159666, "epoch": 0.1264955209523237, "flos": 28768217927040.0, "grad_norm": 1.7026389690577854, "language_loss": 0.8708809, "learning_rate": 3.903217270344815e-06, "loss": 0.89377105, "num_input_tokens_seen": 22240615, "step": 1052, "time_per_iteration": 2.744194746017456 }, { "auxiliary_loss_clip": 0.01183935, "auxiliary_loss_mlp": 0.01041693, "balance_loss_clip": 1.05918837, "balance_loss_mlp": 1.02819836, "epoch": 0.12661576384296278, "flos": 29241125412480.0, "grad_norm": 1.6300285531966532, "language_loss": 0.82205129, "learning_rate": 3.902977738226648e-06, "loss": 0.8443076, "num_input_tokens_seen": 22261350, "step": 1053, "time_per_iteration": 2.796062707901001 }, { "auxiliary_loss_clip": 0.01227126, "auxiliary_loss_mlp": 0.01048254, "balance_loss_clip": 1.06790996, "balance_loss_mlp": 1.03348958, "epoch": 0.12673600673360189, "flos": 20850346298880.0, "grad_norm": 1.9241938030417547, "language_loss": 0.911816, "learning_rate": 3.902737917428273e-06, "loss": 0.93456978, "num_input_tokens_seen": 22279515, "step": 1054, "time_per_iteration": 2.674783706665039 }, { "auxiliary_loss_clip": 0.01243482, "auxiliary_loss_mlp": 0.0104822, "balance_loss_clip": 1.06896687, "balance_loss_mlp": 1.03296733, "epoch": 0.12685624962424097, "flos": 25263695583360.0, "grad_norm": 2.2532880943789166, "language_loss": 0.84156859, "learning_rate": 3.902497807986068e-06, "loss": 0.86448562, "num_input_tokens_seen": 22299535, "step": 1055, "time_per_iteration": 2.6395249366760254 }, { "auxiliary_loss_clip": 0.0119394, "auxiliary_loss_mlp": 0.01045914, "balance_loss_clip": 1.06266677, "balance_loss_mlp": 1.03217483, "epoch": 0.12697649251488005, "flos": 27527109246720.0, "grad_norm": 1.548040464127309, "language_loss": 0.8372786, "learning_rate": 3.902257409936458e-06, "loss": 0.8596772, "num_input_tokens_seen": 22320300, "step": 1056, "time_per_iteration": 2.7393693923950195 }, { "auxiliary_loss_clip": 0.01211695, "auxiliary_loss_mlp": 0.01048747, "balance_loss_clip": 1.06834483, "balance_loss_mlp": 1.03516936, "epoch": 0.12709673540551916, "flos": 21251863503360.0, "grad_norm": 4.882236969629533, "language_loss": 0.84153974, "learning_rate": 3.902016723315912e-06, "loss": 0.86414415, "num_input_tokens_seen": 22338240, "step": 1057, "time_per_iteration": 3.5930991172790527 }, { "auxiliary_loss_clip": 0.01223726, "auxiliary_loss_mlp": 0.01041549, "balance_loss_clip": 1.06605279, "balance_loss_mlp": 1.02804267, "epoch": 0.12721697829615825, "flos": 25337707557120.0, "grad_norm": 2.3613449816317056, "language_loss": 0.69423926, "learning_rate": 3.901775748160941e-06, "loss": 0.71689194, "num_input_tokens_seen": 22357420, "step": 1058, "time_per_iteration": 3.575604200363159 }, { "auxiliary_loss_clip": 0.01115778, "auxiliary_loss_mlp": 0.01013571, "balance_loss_clip": 1.04831052, "balance_loss_mlp": 1.00782478, "epoch": 0.12733722118679733, "flos": 61943287754880.0, "grad_norm": 0.7967035279917636, "language_loss": 0.60923439, "learning_rate": 3.901534484508101e-06, "loss": 0.63052785, "num_input_tokens_seen": 22420095, "step": 1059, "time_per_iteration": 3.258101224899292 }, { "auxiliary_loss_clip": 0.01193101, "auxiliary_loss_mlp": 0.01042831, "balance_loss_clip": 1.05957174, "balance_loss_mlp": 1.02868092, "epoch": 0.1274574640774364, "flos": 26976742081920.0, "grad_norm": 1.876609150763176, "language_loss": 0.7447474, "learning_rate": 3.901292932393991e-06, "loss": 0.76710671, "num_input_tokens_seen": 22438975, "step": 1060, "time_per_iteration": 2.6692585945129395 }, { "auxiliary_loss_clip": 0.01245567, "auxiliary_loss_mlp": 0.01046463, "balance_loss_clip": 1.07160389, "balance_loss_mlp": 1.03177619, "epoch": 0.12757770696807552, "flos": 22236318529920.0, "grad_norm": 2.4109326807058338, "language_loss": 0.85103273, "learning_rate": 3.9010510918552555e-06, "loss": 0.87395298, "num_input_tokens_seen": 22458050, "step": 1061, "time_per_iteration": 3.589167833328247 }, { "auxiliary_loss_clip": 0.01205607, "auxiliary_loss_mlp": 0.01056186, "balance_loss_clip": 1.06385922, "balance_loss_mlp": 1.04085588, "epoch": 0.1276979498587146, "flos": 28547905858560.0, "grad_norm": 2.314342229356385, "language_loss": 0.74636012, "learning_rate": 3.900808962928581e-06, "loss": 0.76897806, "num_input_tokens_seen": 22475665, "step": 1062, "time_per_iteration": 3.687018394470215 }, { "auxiliary_loss_clip": 0.01242483, "auxiliary_loss_mlp": 0.01045625, "balance_loss_clip": 1.07195938, "balance_loss_mlp": 1.03196955, "epoch": 0.1278181927493537, "flos": 17420338719360.0, "grad_norm": 2.161420898974666, "language_loss": 0.89155126, "learning_rate": 3.900566545650698e-06, "loss": 0.91443235, "num_input_tokens_seen": 22493335, "step": 1063, "time_per_iteration": 2.5788612365722656 }, { "auxiliary_loss_clip": 0.01226956, "auxiliary_loss_mlp": 0.01041291, "balance_loss_clip": 1.07097673, "balance_loss_mlp": 1.02660453, "epoch": 0.1279384356399928, "flos": 21138636856320.0, "grad_norm": 2.196381643727136, "language_loss": 0.82055253, "learning_rate": 3.900323840058381e-06, "loss": 0.84323502, "num_input_tokens_seen": 22511045, "step": 1064, "time_per_iteration": 2.657589912414551 }, { "auxiliary_loss_clip": 0.01223216, "auxiliary_loss_mlp": 0.01039561, "balance_loss_clip": 1.06364012, "balance_loss_mlp": 1.02572048, "epoch": 0.12805867853063188, "flos": 26576733248640.0, "grad_norm": 1.752142179065913, "language_loss": 0.81561786, "learning_rate": 3.900080846188449e-06, "loss": 0.83824563, "num_input_tokens_seen": 22529635, "step": 1065, "time_per_iteration": 2.671121597290039 }, { "auxiliary_loss_clip": 0.01245294, "auxiliary_loss_mlp": 0.01044918, "balance_loss_clip": 1.07014394, "balance_loss_mlp": 1.03165579, "epoch": 0.12817892142127096, "flos": 16436206915200.0, "grad_norm": 1.7803891911925938, "language_loss": 0.81282187, "learning_rate": 3.8998375640777625e-06, "loss": 0.835724, "num_input_tokens_seen": 22547505, "step": 1066, "time_per_iteration": 2.5914041996002197 }, { "auxiliary_loss_clip": 0.01106361, "auxiliary_loss_mlp": 0.01012103, "balance_loss_clip": 1.03825235, "balance_loss_mlp": 1.00633335, "epoch": 0.12829916431191005, "flos": 60757049099520.0, "grad_norm": 0.7044513552650243, "language_loss": 0.52679992, "learning_rate": 3.899593993763229e-06, "loss": 0.5479846, "num_input_tokens_seen": 22608465, "step": 1067, "time_per_iteration": 3.1889798641204834 }, { "auxiliary_loss_clip": 0.0118197, "auxiliary_loss_mlp": 0.01043908, "balance_loss_clip": 1.0602777, "balance_loss_mlp": 1.02826738, "epoch": 0.12841940720254916, "flos": 29786895636480.0, "grad_norm": 3.95421269394101, "language_loss": 0.81577957, "learning_rate": 3.899350135281796e-06, "loss": 0.83803833, "num_input_tokens_seen": 22629465, "step": 1068, "time_per_iteration": 2.75980544090271 }, { "auxiliary_loss_clip": 0.01193926, "auxiliary_loss_mlp": 0.01050263, "balance_loss_clip": 1.06477189, "balance_loss_mlp": 1.0371139, "epoch": 0.12853965009318824, "flos": 25951851319680.0, "grad_norm": 1.8726118296343643, "language_loss": 0.79792082, "learning_rate": 3.8991059886704585e-06, "loss": 0.82036275, "num_input_tokens_seen": 22648970, "step": 1069, "time_per_iteration": 2.701838970184326 }, { "auxiliary_loss_clip": 0.01179562, "auxiliary_loss_mlp": 0.01051431, "balance_loss_clip": 1.06193542, "balance_loss_mlp": 1.03775764, "epoch": 0.12865989298382732, "flos": 30846871008000.0, "grad_norm": 2.0924316083858403, "language_loss": 0.82982224, "learning_rate": 3.898861553966252e-06, "loss": 0.8521322, "num_input_tokens_seen": 22668620, "step": 1070, "time_per_iteration": 2.757490634918213 }, { "auxiliary_loss_clip": 0.01136653, "auxiliary_loss_mlp": 0.01047593, "balance_loss_clip": 1.05447924, "balance_loss_mlp": 1.03424799, "epoch": 0.12878013587446643, "flos": 25885776251520.0, "grad_norm": 1.7027857698344935, "language_loss": 0.88070637, "learning_rate": 3.898616831206257e-06, "loss": 0.90254885, "num_input_tokens_seen": 22689045, "step": 1071, "time_per_iteration": 2.848029613494873 }, { "auxiliary_loss_clip": 0.01181252, "auxiliary_loss_mlp": 0.01040761, "balance_loss_clip": 1.05745089, "balance_loss_mlp": 1.02665854, "epoch": 0.12890037876510552, "flos": 23333138277120.0, "grad_norm": 1.7739988654282135, "language_loss": 0.7671417, "learning_rate": 3.8983718204276e-06, "loss": 0.78936177, "num_input_tokens_seen": 22711265, "step": 1072, "time_per_iteration": 2.7386434078216553 }, { "auxiliary_loss_clip": 0.01206605, "auxiliary_loss_mlp": 0.01039172, "balance_loss_clip": 1.06360221, "balance_loss_mlp": 1.02635109, "epoch": 0.1290206216557446, "flos": 23587242065280.0, "grad_norm": 2.0175536196047177, "language_loss": 0.8272453, "learning_rate": 3.898126521667446e-06, "loss": 0.84970307, "num_input_tokens_seen": 22731420, "step": 1073, "time_per_iteration": 2.7240114212036133 }, { "auxiliary_loss_clip": 0.01223626, "auxiliary_loss_mlp": 0.01048141, "balance_loss_clip": 1.06474805, "balance_loss_mlp": 1.03343081, "epoch": 0.12914086454638368, "flos": 24170610850560.0, "grad_norm": 1.6636542225843844, "language_loss": 0.83529723, "learning_rate": 3.897880934963007e-06, "loss": 0.85801494, "num_input_tokens_seen": 22750970, "step": 1074, "time_per_iteration": 2.6511573791503906 }, { "auxiliary_loss_clip": 0.01203738, "auxiliary_loss_mlp": 0.01050202, "balance_loss_clip": 1.06299698, "balance_loss_mlp": 1.03669596, "epoch": 0.1292611074370228, "flos": 20267157081600.0, "grad_norm": 2.1811964736126166, "language_loss": 0.78154612, "learning_rate": 3.89763506035154e-06, "loss": 0.80408549, "num_input_tokens_seen": 22768820, "step": 1075, "time_per_iteration": 2.657524347305298 }, { "auxiliary_loss_clip": 0.01212064, "auxiliary_loss_mlp": 0.01047907, "balance_loss_clip": 1.06358266, "balance_loss_mlp": 1.03460956, "epoch": 0.12938135032766188, "flos": 27377684668800.0, "grad_norm": 2.1942719571921825, "language_loss": 0.81204534, "learning_rate": 3.897388897870343e-06, "loss": 0.83464503, "num_input_tokens_seen": 22789460, "step": 1076, "time_per_iteration": 2.7493906021118164 }, { "auxiliary_loss_clip": 0.01217635, "auxiliary_loss_mlp": 0.01047517, "balance_loss_clip": 1.06405771, "balance_loss_mlp": 1.03362894, "epoch": 0.12950159321830096, "flos": 29277107861760.0, "grad_norm": 2.2105744545267467, "language_loss": 0.74680346, "learning_rate": 3.89714244755676e-06, "loss": 0.76945496, "num_input_tokens_seen": 22810820, "step": 1077, "time_per_iteration": 2.7376363277435303 }, { "auxiliary_loss_clip": 0.0115484, "auxiliary_loss_mlp": 0.01045652, "balance_loss_clip": 1.05455232, "balance_loss_mlp": 1.03177559, "epoch": 0.12962183610894007, "flos": 24534888629760.0, "grad_norm": 2.2857633949585416, "language_loss": 0.85758543, "learning_rate": 3.896895709448175e-06, "loss": 0.87959033, "num_input_tokens_seen": 22830570, "step": 1078, "time_per_iteration": 2.7813491821289062 }, { "auxiliary_loss_clip": 0.01146433, "auxiliary_loss_mlp": 0.01040344, "balance_loss_clip": 1.0550127, "balance_loss_mlp": 1.02656388, "epoch": 0.12974207899957915, "flos": 11215944552960.0, "grad_norm": 2.7205700784355966, "language_loss": 0.77343285, "learning_rate": 3.896648683582019e-06, "loss": 0.7953006, "num_input_tokens_seen": 22845905, "step": 1079, "time_per_iteration": 2.7254221439361572 }, { "auxiliary_loss_clip": 0.01169246, "auxiliary_loss_mlp": 0.01047038, "balance_loss_clip": 1.06038082, "balance_loss_mlp": 1.03276896, "epoch": 0.12986232189021824, "flos": 24717889445760.0, "grad_norm": 2.009992455258409, "language_loss": 0.80618536, "learning_rate": 3.896401369995766e-06, "loss": 0.82834822, "num_input_tokens_seen": 22865710, "step": 1080, "time_per_iteration": 2.7347941398620605 }, { "auxiliary_loss_clip": 0.01239758, "auxiliary_loss_mlp": 0.0104047, "balance_loss_clip": 1.06964231, "balance_loss_mlp": 1.02739894, "epoch": 0.12998256478085732, "flos": 23915357827200.0, "grad_norm": 1.7537903707073714, "language_loss": 0.79572713, "learning_rate": 3.896153768726932e-06, "loss": 0.81852943, "num_input_tokens_seen": 22886020, "step": 1081, "time_per_iteration": 2.624508857727051 }, { "auxiliary_loss_clip": 0.01224115, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.06979227, "balance_loss_mlp": 1.02886939, "epoch": 0.13010280767149643, "flos": 18624207974400.0, "grad_norm": 2.194392497639318, "language_loss": 0.87707722, "learning_rate": 3.8959058798130806e-06, "loss": 0.89973891, "num_input_tokens_seen": 22903995, "step": 1082, "time_per_iteration": 2.648239850997925 }, { "auxiliary_loss_clip": 0.01205698, "auxiliary_loss_mlp": 0.00715006, "balance_loss_clip": 1.06500816, "balance_loss_mlp": 1.00021362, "epoch": 0.1302230505621355, "flos": 22783992174720.0, "grad_norm": 1.624663991120166, "language_loss": 0.74883282, "learning_rate": 3.895657703291814e-06, "loss": 0.76803982, "num_input_tokens_seen": 22924100, "step": 1083, "time_per_iteration": 3.544126272201538 }, { "auxiliary_loss_clip": 0.0121473, "auxiliary_loss_mlp": 0.01045827, "balance_loss_clip": 1.06380534, "balance_loss_mlp": 1.03080678, "epoch": 0.1303432934527746, "flos": 21323612920320.0, "grad_norm": 5.723078220612679, "language_loss": 0.79898626, "learning_rate": 3.895409239200781e-06, "loss": 0.82159173, "num_input_tokens_seen": 22939985, "step": 1084, "time_per_iteration": 3.539923906326294 }, { "auxiliary_loss_clip": 0.0121513, "auxiliary_loss_mlp": 0.01041734, "balance_loss_clip": 1.06450617, "balance_loss_mlp": 1.02831089, "epoch": 0.1304635363434137, "flos": 20922490765440.0, "grad_norm": 2.1132499470243276, "language_loss": 0.91031945, "learning_rate": 3.895160487577673e-06, "loss": 0.93288809, "num_input_tokens_seen": 22957555, "step": 1085, "time_per_iteration": 2.635824203491211 }, { "auxiliary_loss_clip": 0.01130554, "auxiliary_loss_mlp": 0.01011836, "balance_loss_clip": 1.04909444, "balance_loss_mlp": 1.00656652, "epoch": 0.1305837792340528, "flos": 63245659080960.0, "grad_norm": 0.7828024937409512, "language_loss": 0.60928404, "learning_rate": 3.894911448460226e-06, "loss": 0.63070798, "num_input_tokens_seen": 23016870, "step": 1086, "time_per_iteration": 3.0871682167053223 }, { "auxiliary_loss_clip": 0.01110906, "auxiliary_loss_mlp": 0.01050388, "balance_loss_clip": 1.04949999, "balance_loss_mlp": 1.0363096, "epoch": 0.13070402212469187, "flos": 26428852955520.0, "grad_norm": 1.8019104422685595, "language_loss": 0.72756898, "learning_rate": 3.8946621218862195e-06, "loss": 0.74918193, "num_input_tokens_seen": 23037870, "step": 1087, "time_per_iteration": 4.704211950302124 }, { "auxiliary_loss_clip": 0.01185772, "auxiliary_loss_mlp": 0.01043971, "balance_loss_clip": 1.06328773, "balance_loss_mlp": 1.02986896, "epoch": 0.13082426501533098, "flos": 27673409341440.0, "grad_norm": 2.339804450327741, "language_loss": 0.89075279, "learning_rate": 3.894412507893475e-06, "loss": 0.91305017, "num_input_tokens_seen": 23058150, "step": 1088, "time_per_iteration": 2.7559800148010254 }, { "auxiliary_loss_clip": 0.01179601, "auxiliary_loss_mlp": 0.01044914, "balance_loss_clip": 1.06135273, "balance_loss_mlp": 1.03046644, "epoch": 0.13094450790597006, "flos": 24826770547200.0, "grad_norm": 3.113227557990151, "language_loss": 0.71955496, "learning_rate": 3.894162606519859e-06, "loss": 0.74180007, "num_input_tokens_seen": 23077100, "step": 1089, "time_per_iteration": 2.752171039581299 }, { "auxiliary_loss_clip": 0.01173141, "auxiliary_loss_mlp": 0.01036651, "balance_loss_clip": 1.06234515, "balance_loss_mlp": 1.02344298, "epoch": 0.13106475079660915, "flos": 19062605468160.0, "grad_norm": 1.8122065025017262, "language_loss": 0.77138293, "learning_rate": 3.893912417803282e-06, "loss": 0.79348087, "num_input_tokens_seen": 23096815, "step": 1090, "time_per_iteration": 2.8099400997161865 }, { "auxiliary_loss_clip": 0.01169546, "auxiliary_loss_mlp": 0.01048465, "balance_loss_clip": 1.05663037, "balance_loss_mlp": 1.03401732, "epoch": 0.13118499368724823, "flos": 28913189218560.0, "grad_norm": 1.7705335870052168, "language_loss": 0.76768219, "learning_rate": 3.8936619417816975e-06, "loss": 0.78986228, "num_input_tokens_seen": 23117145, "step": 1091, "time_per_iteration": 2.807415723800659 }, { "auxiliary_loss_clip": 0.01194094, "auxiliary_loss_mlp": 0.01049287, "balance_loss_clip": 1.06807053, "balance_loss_mlp": 1.03519607, "epoch": 0.13130523657788734, "flos": 14283398206080.0, "grad_norm": 1.9100074380852312, "language_loss": 0.71647054, "learning_rate": 3.8934111784931015e-06, "loss": 0.73890436, "num_input_tokens_seen": 23134595, "step": 1092, "time_per_iteration": 2.6678755283355713 }, { "auxiliary_loss_clip": 0.01117976, "auxiliary_loss_mlp": 0.01006759, "balance_loss_clip": 1.04696298, "balance_loss_mlp": 1.00156164, "epoch": 0.13142547946852642, "flos": 70174155519360.0, "grad_norm": 0.9191942511572885, "language_loss": 0.59090084, "learning_rate": 3.893160127975535e-06, "loss": 0.61214817, "num_input_tokens_seen": 23195285, "step": 1093, "time_per_iteration": 3.3257031440734863 }, { "auxiliary_loss_clip": 0.01174148, "auxiliary_loss_mlp": 0.01035283, "balance_loss_clip": 1.0584662, "balance_loss_mlp": 1.0221107, "epoch": 0.1315457223591655, "flos": 45805998844800.0, "grad_norm": 2.3265927517682217, "language_loss": 0.8115108, "learning_rate": 3.8929087902670826e-06, "loss": 0.83360505, "num_input_tokens_seen": 23216915, "step": 1094, "time_per_iteration": 2.9255902767181396 }, { "auxiliary_loss_clip": 0.01130514, "auxiliary_loss_mlp": 0.0100715, "balance_loss_clip": 1.04445553, "balance_loss_mlp": 1.00200009, "epoch": 0.13166596524980462, "flos": 62881165820160.0, "grad_norm": 0.9309618960758589, "language_loss": 0.60708988, "learning_rate": 3.8926571654058715e-06, "loss": 0.62846655, "num_input_tokens_seen": 23273560, "step": 1095, "time_per_iteration": 3.116931676864624 }, { "auxiliary_loss_clip": 0.0118738, "auxiliary_loss_mlp": 0.01045685, "balance_loss_clip": 1.06439865, "balance_loss_mlp": 1.0319761, "epoch": 0.1317862081404437, "flos": 23586523793280.0, "grad_norm": 2.1571052474627828, "language_loss": 0.77004218, "learning_rate": 3.892405253430074e-06, "loss": 0.79237288, "num_input_tokens_seen": 23291080, "step": 1096, "time_per_iteration": 2.6986052989959717 }, { "auxiliary_loss_clip": 0.01210043, "auxiliary_loss_mlp": 0.00715214, "balance_loss_clip": 1.06610656, "balance_loss_mlp": 1.00027299, "epoch": 0.13190645103108278, "flos": 20260764460800.0, "grad_norm": 2.03849934199867, "language_loss": 0.82501578, "learning_rate": 3.892153054377904e-06, "loss": 0.84426844, "num_input_tokens_seen": 23308485, "step": 1097, "time_per_iteration": 2.70063853263855 }, { "auxiliary_loss_clip": 0.01061007, "auxiliary_loss_mlp": 0.01008912, "balance_loss_clip": 1.03802609, "balance_loss_mlp": 1.00326192, "epoch": 0.13202669392172187, "flos": 53455440136320.0, "grad_norm": 0.9415976730824046, "language_loss": 0.59436643, "learning_rate": 3.891900568287619e-06, "loss": 0.61506557, "num_input_tokens_seen": 23360870, "step": 1098, "time_per_iteration": 3.2998905181884766 }, { "auxiliary_loss_clip": 0.01191097, "auxiliary_loss_mlp": 0.01042971, "balance_loss_clip": 1.06184852, "balance_loss_mlp": 1.02792716, "epoch": 0.13214693681236098, "flos": 15851293845120.0, "grad_norm": 2.44101712337845, "language_loss": 0.71670306, "learning_rate": 3.891647795197523e-06, "loss": 0.73904383, "num_input_tokens_seen": 23376910, "step": 1099, "time_per_iteration": 2.8594918251037598 }, { "auxiliary_loss_clip": 0.01189964, "auxiliary_loss_mlp": 0.01045251, "balance_loss_clip": 1.05783391, "balance_loss_mlp": 1.03099394, "epoch": 0.13226717970300006, "flos": 19353840940800.0, "grad_norm": 2.160717735788595, "language_loss": 0.68768287, "learning_rate": 3.8913947351459605e-06, "loss": 0.71003503, "num_input_tokens_seen": 23394450, "step": 1100, "time_per_iteration": 2.73524808883667 }, { "auxiliary_loss_clip": 0.01238288, "auxiliary_loss_mlp": 0.01049497, "balance_loss_clip": 1.06871545, "balance_loss_mlp": 1.0359906, "epoch": 0.13238742259363914, "flos": 20698084546560.0, "grad_norm": 1.9971775383331223, "language_loss": 0.67588198, "learning_rate": 3.89114138817132e-06, "loss": 0.69875979, "num_input_tokens_seen": 23411115, "step": 1101, "time_per_iteration": 2.574568271636963 }, { "auxiliary_loss_clip": 0.01222423, "auxiliary_loss_mlp": 0.01043956, "balance_loss_clip": 1.06987965, "balance_loss_mlp": 1.03122437, "epoch": 0.13250766548427825, "flos": 21032449274880.0, "grad_norm": 1.7304448570007331, "language_loss": 0.84253275, "learning_rate": 3.890887754312035e-06, "loss": 0.86519659, "num_input_tokens_seen": 23429360, "step": 1102, "time_per_iteration": 2.631948709487915 }, { "auxiliary_loss_clip": 0.01200727, "auxiliary_loss_mlp": 0.0104699, "balance_loss_clip": 1.06026077, "balance_loss_mlp": 1.03332293, "epoch": 0.13262790837491734, "flos": 22637871648000.0, "grad_norm": 1.8980208189308763, "language_loss": 0.87734413, "learning_rate": 3.890633833606581e-06, "loss": 0.89982128, "num_input_tokens_seen": 23449050, "step": 1103, "time_per_iteration": 2.5994465351104736 }, { "auxiliary_loss_clip": 0.01220452, "auxiliary_loss_mlp": 0.01051204, "balance_loss_clip": 1.06957996, "balance_loss_mlp": 1.03798354, "epoch": 0.13274815126555642, "flos": 19683141851520.0, "grad_norm": 1.8772570869556275, "language_loss": 0.69537568, "learning_rate": 3.890379626093477e-06, "loss": 0.7180922, "num_input_tokens_seen": 23468800, "step": 1104, "time_per_iteration": 2.5856282711029053 }, { "auxiliary_loss_clip": 0.01153864, "auxiliary_loss_mlp": 0.01042897, "balance_loss_clip": 1.05868077, "balance_loss_mlp": 1.02879405, "epoch": 0.1328683941561955, "flos": 21317687176320.0, "grad_norm": 1.99216997852568, "language_loss": 0.92510605, "learning_rate": 3.890125131811287e-06, "loss": 0.9470737, "num_input_tokens_seen": 23486850, "step": 1105, "time_per_iteration": 2.6786935329437256 }, { "auxiliary_loss_clip": 0.01188324, "auxiliary_loss_mlp": 0.01045903, "balance_loss_clip": 1.05633903, "balance_loss_mlp": 1.03292739, "epoch": 0.1329886370468346, "flos": 13699131580800.0, "grad_norm": 1.9906750842915781, "language_loss": 0.75425524, "learning_rate": 3.889870350798618e-06, "loss": 0.7765975, "num_input_tokens_seen": 23504195, "step": 1106, "time_per_iteration": 2.6899776458740234 }, { "auxiliary_loss_clip": 0.01236036, "auxiliary_loss_mlp": 0.01045035, "balance_loss_clip": 1.0642693, "balance_loss_mlp": 1.03140938, "epoch": 0.1331088799374737, "flos": 21032413361280.0, "grad_norm": 1.5865111748731502, "language_loss": 0.78638279, "learning_rate": 3.889615283094119e-06, "loss": 0.80919349, "num_input_tokens_seen": 23523385, "step": 1107, "time_per_iteration": 2.603739023208618 }, { "auxiliary_loss_clip": 0.01244188, "auxiliary_loss_mlp": 0.01044839, "balance_loss_clip": 1.06721926, "balance_loss_mlp": 1.03019357, "epoch": 0.13322912282811278, "flos": 18260432985600.0, "grad_norm": 2.1543786348605862, "language_loss": 0.84772277, "learning_rate": 3.889359928736485e-06, "loss": 0.87061298, "num_input_tokens_seen": 23541330, "step": 1108, "time_per_iteration": 2.5798566341400146 }, { "auxiliary_loss_clip": 0.01199837, "auxiliary_loss_mlp": 0.00714941, "balance_loss_clip": 1.06445479, "balance_loss_mlp": 1.00030327, "epoch": 0.1333493657187519, "flos": 24460876656000.0, "grad_norm": 2.1062541833258757, "language_loss": 0.91361368, "learning_rate": 3.889104287764451e-06, "loss": 0.93276143, "num_input_tokens_seen": 23561705, "step": 1109, "time_per_iteration": 4.401797771453857 }, { "auxiliary_loss_clip": 0.01205233, "auxiliary_loss_mlp": 0.01039785, "balance_loss_clip": 1.06801474, "balance_loss_mlp": 1.02646971, "epoch": 0.13346960860939097, "flos": 22158930677760.0, "grad_norm": 1.9308930099442714, "language_loss": 0.90258265, "learning_rate": 3.888848360216798e-06, "loss": 0.92503285, "num_input_tokens_seen": 23579350, "step": 1110, "time_per_iteration": 2.6237523555755615 }, { "auxiliary_loss_clip": 0.0111624, "auxiliary_loss_mlp": 0.01008507, "balance_loss_clip": 1.04099178, "balance_loss_mlp": 1.00333309, "epoch": 0.13358985150003005, "flos": 67931212608000.0, "grad_norm": 0.8063399740888191, "language_loss": 0.56606561, "learning_rate": 3.888592146132351e-06, "loss": 0.58731306, "num_input_tokens_seen": 23640620, "step": 1111, "time_per_iteration": 3.324424982070923 }, { "auxiliary_loss_clip": 0.01222128, "auxiliary_loss_mlp": 0.01047127, "balance_loss_clip": 1.06880713, "balance_loss_mlp": 1.03404379, "epoch": 0.13371009439066917, "flos": 26834284742400.0, "grad_norm": 2.122208673511983, "language_loss": 0.78392982, "learning_rate": 3.888335645549978e-06, "loss": 0.80662239, "num_input_tokens_seen": 23661040, "step": 1112, "time_per_iteration": 3.594377279281616 }, { "auxiliary_loss_clip": 0.01240889, "auxiliary_loss_mlp": 0.01046634, "balance_loss_clip": 1.06960201, "balance_loss_mlp": 1.03287709, "epoch": 0.13383033728130825, "flos": 26322844942080.0, "grad_norm": 2.4704315769177336, "language_loss": 0.80991602, "learning_rate": 3.888078858508588e-06, "loss": 0.83279121, "num_input_tokens_seen": 23680900, "step": 1113, "time_per_iteration": 2.615172863006592 }, { "auxiliary_loss_clip": 0.01203835, "auxiliary_loss_mlp": 0.01042649, "balance_loss_clip": 1.06752682, "balance_loss_mlp": 1.02846289, "epoch": 0.13395058017194733, "flos": 22563931501440.0, "grad_norm": 1.99877683032715, "language_loss": 0.84532261, "learning_rate": 3.8878217850471365e-06, "loss": 0.86778748, "num_input_tokens_seen": 23700815, "step": 1114, "time_per_iteration": 3.5788230895996094 }, { "auxiliary_loss_clip": 0.01242687, "auxiliary_loss_mlp": 0.0104554, "balance_loss_clip": 1.06950617, "balance_loss_mlp": 1.03174162, "epoch": 0.13407082306258641, "flos": 25810938264960.0, "grad_norm": 9.713214335961652, "language_loss": 0.73845983, "learning_rate": 3.887564425204621e-06, "loss": 0.76134205, "num_input_tokens_seen": 23722500, "step": 1115, "time_per_iteration": 2.6509177684783936 }, { "auxiliary_loss_clip": 0.01098975, "auxiliary_loss_mlp": 0.0100627, "balance_loss_clip": 1.0423491, "balance_loss_mlp": 1.00100124, "epoch": 0.13419106595322552, "flos": 68338365269760.0, "grad_norm": 0.8410669918346272, "language_loss": 0.54645342, "learning_rate": 3.887306779020083e-06, "loss": 0.56750584, "num_input_tokens_seen": 23777155, "step": 1116, "time_per_iteration": 3.1623358726501465 }, { "auxiliary_loss_clip": 0.01224664, "auxiliary_loss_mlp": 0.01047121, "balance_loss_clip": 1.06567192, "balance_loss_mlp": 1.03371012, "epoch": 0.1343113088438646, "flos": 20449080489600.0, "grad_norm": 2.0114919631644055, "language_loss": 0.7049365, "learning_rate": 3.887048846532608e-06, "loss": 0.72765434, "num_input_tokens_seen": 23794130, "step": 1117, "time_per_iteration": 2.6630547046661377 }, { "auxiliary_loss_clip": 0.01098993, "auxiliary_loss_mlp": 0.01006034, "balance_loss_clip": 1.04064369, "balance_loss_mlp": 1.00093198, "epoch": 0.1344315517345037, "flos": 67389784951680.0, "grad_norm": 0.7558819991022722, "language_loss": 0.58106947, "learning_rate": 3.8867906277813224e-06, "loss": 0.60211968, "num_input_tokens_seen": 23852285, "step": 1118, "time_per_iteration": 3.1145260334014893 }, { "auxiliary_loss_clip": 0.01225749, "auxiliary_loss_mlp": 0.00714298, "balance_loss_clip": 1.06713271, "balance_loss_mlp": 1.00029457, "epoch": 0.1345517946251428, "flos": 40734442788480.0, "grad_norm": 2.0156572698896986, "language_loss": 0.74361235, "learning_rate": 3.886532122805399e-06, "loss": 0.76301283, "num_input_tokens_seen": 23874765, "step": 1119, "time_per_iteration": 2.8304083347320557 }, { "auxiliary_loss_clip": 0.0113821, "auxiliary_loss_mlp": 0.01041659, "balance_loss_clip": 1.05322635, "balance_loss_mlp": 1.02722299, "epoch": 0.13467203751578188, "flos": 22816850140800.0, "grad_norm": 1.868294482104809, "language_loss": 0.89677387, "learning_rate": 3.886273331644053e-06, "loss": 0.9185726, "num_input_tokens_seen": 23893635, "step": 1120, "time_per_iteration": 2.765298843383789 }, { "auxiliary_loss_clip": 0.0116691, "auxiliary_loss_mlp": 0.01048004, "balance_loss_clip": 1.06147861, "balance_loss_mlp": 1.03507543, "epoch": 0.13479228040642097, "flos": 17091576512640.0, "grad_norm": 2.059436447213527, "language_loss": 0.8233813, "learning_rate": 3.886014254336542e-06, "loss": 0.84553045, "num_input_tokens_seen": 23910110, "step": 1121, "time_per_iteration": 2.6941206455230713 }, { "auxiliary_loss_clip": 0.01219299, "auxiliary_loss_mlp": 0.01044047, "balance_loss_clip": 1.06347179, "balance_loss_mlp": 1.03061223, "epoch": 0.13491252329706005, "flos": 23730525417600.0, "grad_norm": 1.6817065760009058, "language_loss": 0.92727721, "learning_rate": 3.885754890922168e-06, "loss": 0.9499107, "num_input_tokens_seen": 23930440, "step": 1122, "time_per_iteration": 2.6581196784973145 }, { "auxiliary_loss_clip": 0.0112663, "auxiliary_loss_mlp": 0.01046812, "balance_loss_clip": 1.05573452, "balance_loss_mlp": 1.03360915, "epoch": 0.13503276618769916, "flos": 34127058960000.0, "grad_norm": 1.7715021365807666, "language_loss": 0.78458202, "learning_rate": 3.885495241440277e-06, "loss": 0.80631638, "num_input_tokens_seen": 23954535, "step": 1123, "time_per_iteration": 2.929304361343384 }, { "auxiliary_loss_clip": 0.01239476, "auxiliary_loss_mlp": 0.0105292, "balance_loss_clip": 1.06847405, "balance_loss_mlp": 1.03923488, "epoch": 0.13515300907833824, "flos": 17712328377600.0, "grad_norm": 1.7930914625427024, "language_loss": 0.74308014, "learning_rate": 3.885235305930257e-06, "loss": 0.76600409, "num_input_tokens_seen": 23972735, "step": 1124, "time_per_iteration": 2.593362808227539 }, { "auxiliary_loss_clip": 0.01177311, "auxiliary_loss_mlp": 0.01048984, "balance_loss_clip": 1.06017125, "balance_loss_mlp": 1.03566194, "epoch": 0.13527325196897733, "flos": 20260872201600.0, "grad_norm": 2.3774275503484685, "language_loss": 0.85588443, "learning_rate": 3.884975084431539e-06, "loss": 0.87814742, "num_input_tokens_seen": 23987685, "step": 1125, "time_per_iteration": 2.6650502681732178 }, { "auxiliary_loss_clip": 0.01213652, "auxiliary_loss_mlp": 0.00714438, "balance_loss_clip": 1.06714439, "balance_loss_mlp": 1.00033665, "epoch": 0.13539349485961644, "flos": 18186492839040.0, "grad_norm": 2.265468341683967, "language_loss": 0.91626883, "learning_rate": 3.8847145769836e-06, "loss": 0.93554974, "num_input_tokens_seen": 24004105, "step": 1126, "time_per_iteration": 2.6702656745910645 }, { "auxiliary_loss_clip": 0.01241179, "auxiliary_loss_mlp": 0.01049559, "balance_loss_clip": 1.06916952, "balance_loss_mlp": 1.03503323, "epoch": 0.13551373775025552, "flos": 19317463441920.0, "grad_norm": 2.1232208570053945, "language_loss": 0.65998805, "learning_rate": 3.884453783625959e-06, "loss": 0.68289548, "num_input_tokens_seen": 24021715, "step": 1127, "time_per_iteration": 2.557293653488159 }, { "auxiliary_loss_clip": 0.01199349, "auxiliary_loss_mlp": 0.01042093, "balance_loss_clip": 1.06382537, "balance_loss_mlp": 1.02843213, "epoch": 0.1356339806408946, "flos": 20850813175680.0, "grad_norm": 2.260188840230319, "language_loss": 0.85284078, "learning_rate": 3.884192704398176e-06, "loss": 0.87525523, "num_input_tokens_seen": 24038915, "step": 1128, "time_per_iteration": 2.7425317764282227 }, { "auxiliary_loss_clip": 0.01221949, "auxiliary_loss_mlp": 0.01045276, "balance_loss_clip": 1.06537628, "balance_loss_mlp": 1.03178763, "epoch": 0.13575422353153369, "flos": 50476037696640.0, "grad_norm": 1.6445987395485833, "language_loss": 0.7451998, "learning_rate": 3.883931339339858e-06, "loss": 0.7678721, "num_input_tokens_seen": 24063300, "step": 1129, "time_per_iteration": 2.8499321937561035 }, { "auxiliary_loss_clip": 0.0122528, "auxiliary_loss_mlp": 0.01041875, "balance_loss_clip": 1.0657444, "balance_loss_mlp": 1.0284518, "epoch": 0.1358744664221728, "flos": 18150797698560.0, "grad_norm": 1.763259732720168, "language_loss": 0.78438884, "learning_rate": 3.883669688490654e-06, "loss": 0.80706036, "num_input_tokens_seen": 24081070, "step": 1130, "time_per_iteration": 2.6145901679992676 }, { "auxiliary_loss_clip": 0.01190638, "auxiliary_loss_mlp": 0.00714973, "balance_loss_clip": 1.06066823, "balance_loss_mlp": 1.00032067, "epoch": 0.13599470931281188, "flos": 18442966924800.0, "grad_norm": 1.8285732974440252, "language_loss": 0.85470337, "learning_rate": 3.883407751890256e-06, "loss": 0.87375951, "num_input_tokens_seen": 24099675, "step": 1131, "time_per_iteration": 2.7290549278259277 }, { "auxiliary_loss_clip": 0.01179736, "auxiliary_loss_mlp": 0.01040329, "balance_loss_clip": 1.05772471, "balance_loss_mlp": 1.02614284, "epoch": 0.13611495220345096, "flos": 26680766014080.0, "grad_norm": 1.891838173234138, "language_loss": 0.85670018, "learning_rate": 3.8831455295783994e-06, "loss": 0.87890077, "num_input_tokens_seen": 24118925, "step": 1132, "time_per_iteration": 2.7404067516326904 }, { "auxiliary_loss_clip": 0.0119527, "auxiliary_loss_mlp": 0.01043315, "balance_loss_clip": 1.0611093, "balance_loss_mlp": 1.02933729, "epoch": 0.13623519509409007, "flos": 21686238673920.0, "grad_norm": 1.8471799910280995, "language_loss": 0.74064243, "learning_rate": 3.882883021594864e-06, "loss": 0.76302826, "num_input_tokens_seen": 24137065, "step": 1133, "time_per_iteration": 2.667841911315918 }, { "auxiliary_loss_clip": 0.01182796, "auxiliary_loss_mlp": 0.01043791, "balance_loss_clip": 1.06389487, "balance_loss_mlp": 1.02977204, "epoch": 0.13635543798472916, "flos": 14830389492480.0, "grad_norm": 1.8620951375037127, "language_loss": 0.86919475, "learning_rate": 3.8826202279794705e-06, "loss": 0.89146066, "num_input_tokens_seen": 24154125, "step": 1134, "time_per_iteration": 2.748117208480835 }, { "auxiliary_loss_clip": 0.01241547, "auxiliary_loss_mlp": 0.01053444, "balance_loss_clip": 1.07046342, "balance_loss_mlp": 1.0396992, "epoch": 0.13647568087536824, "flos": 22890323410560.0, "grad_norm": 2.047535074204342, "language_loss": 0.70288956, "learning_rate": 3.882357148772085e-06, "loss": 0.7258395, "num_input_tokens_seen": 24171550, "step": 1135, "time_per_iteration": 3.518498182296753 }, { "auxiliary_loss_clip": 0.01174271, "auxiliary_loss_mlp": 0.01038999, "balance_loss_clip": 1.06147981, "balance_loss_mlp": 1.02476501, "epoch": 0.13659592376600732, "flos": 19937927998080.0, "grad_norm": 2.187153465333521, "language_loss": 0.84323519, "learning_rate": 3.882093784012617e-06, "loss": 0.86536789, "num_input_tokens_seen": 24190190, "step": 1136, "time_per_iteration": 3.5543453693389893 }, { "auxiliary_loss_clip": 0.01201099, "auxiliary_loss_mlp": 0.01043019, "balance_loss_clip": 1.06540251, "balance_loss_mlp": 1.02967322, "epoch": 0.13671616665664643, "flos": 21428579439360.0, "grad_norm": 1.9370892449871644, "language_loss": 0.84291762, "learning_rate": 3.881830133741019e-06, "loss": 0.86535883, "num_input_tokens_seen": 24209055, "step": 1137, "time_per_iteration": 2.6737334728240967 }, { "auxiliary_loss_clip": 0.01189376, "auxiliary_loss_mlp": 0.01041843, "balance_loss_clip": 1.06540418, "balance_loss_mlp": 1.02871263, "epoch": 0.13683640954728551, "flos": 22778138257920.0, "grad_norm": 2.8325833583752997, "language_loss": 0.7627992, "learning_rate": 3.881566197997285e-06, "loss": 0.78511143, "num_input_tokens_seen": 24225490, "step": 1138, "time_per_iteration": 3.611820936203003 }, { "auxiliary_loss_clip": 0.01198262, "auxiliary_loss_mlp": 0.01045349, "balance_loss_clip": 1.06678772, "balance_loss_mlp": 1.03252208, "epoch": 0.1369566524379246, "flos": 21725884310400.0, "grad_norm": 1.5474918967288982, "language_loss": 0.74520779, "learning_rate": 3.881301976821456e-06, "loss": 0.76764393, "num_input_tokens_seen": 24245520, "step": 1139, "time_per_iteration": 3.6266186237335205 }, { "auxiliary_loss_clip": 0.01219311, "auxiliary_loss_mlp": 0.01042278, "balance_loss_clip": 1.06709445, "balance_loss_mlp": 1.02833617, "epoch": 0.1370768953285637, "flos": 18624459369600.0, "grad_norm": 2.787731487061293, "language_loss": 0.90454328, "learning_rate": 3.881037470253612e-06, "loss": 0.92715907, "num_input_tokens_seen": 24265035, "step": 1140, "time_per_iteration": 2.601422071456909 }, { "auxiliary_loss_clip": 0.01163418, "auxiliary_loss_mlp": 0.01049871, "balance_loss_clip": 1.06005144, "balance_loss_mlp": 1.03685343, "epoch": 0.1371971382192028, "flos": 14939521989120.0, "grad_norm": 2.5625201997879206, "language_loss": 0.7968573, "learning_rate": 3.88077267833388e-06, "loss": 0.81899023, "num_input_tokens_seen": 24281550, "step": 1141, "time_per_iteration": 2.6659815311431885 }, { "auxiliary_loss_clip": 0.01160678, "auxiliary_loss_mlp": 0.01046018, "balance_loss_clip": 1.0584662, "balance_loss_mlp": 1.03280997, "epoch": 0.13731738110984187, "flos": 19023785844480.0, "grad_norm": 2.2087160871190687, "language_loss": 0.83975565, "learning_rate": 3.880507601102427e-06, "loss": 0.86182261, "num_input_tokens_seen": 24299485, "step": 1142, "time_per_iteration": 2.7221884727478027 }, { "auxiliary_loss_clip": 0.01236053, "auxiliary_loss_mlp": 0.010468, "balance_loss_clip": 1.06753039, "balance_loss_mlp": 1.03439617, "epoch": 0.13743762400048098, "flos": 18187462506240.0, "grad_norm": 1.733835159339442, "language_loss": 0.82270044, "learning_rate": 3.880242238599467e-06, "loss": 0.84552896, "num_input_tokens_seen": 24316010, "step": 1143, "time_per_iteration": 2.6452670097351074 }, { "auxiliary_loss_clip": 0.01234128, "auxiliary_loss_mlp": 0.01050348, "balance_loss_clip": 1.06703818, "balance_loss_mlp": 1.03684771, "epoch": 0.13755786689112007, "flos": 21031982398080.0, "grad_norm": 1.660975962361278, "language_loss": 0.82944632, "learning_rate": 3.879976590865254e-06, "loss": 0.85229117, "num_input_tokens_seen": 24335465, "step": 1144, "time_per_iteration": 2.587724208831787 }, { "auxiliary_loss_clip": 0.01208209, "auxiliary_loss_mlp": 0.01046829, "balance_loss_clip": 1.07070386, "balance_loss_mlp": 1.03290534, "epoch": 0.13767810978175915, "flos": 21360636864000.0, "grad_norm": 2.688907940962308, "language_loss": 0.87169695, "learning_rate": 3.879710657940087e-06, "loss": 0.89424729, "num_input_tokens_seen": 24354415, "step": 1145, "time_per_iteration": 2.6971755027770996 }, { "auxiliary_loss_clip": 0.01224803, "auxiliary_loss_mlp": 0.01048984, "balance_loss_clip": 1.06838703, "balance_loss_mlp": 1.03542399, "epoch": 0.13779835267239823, "flos": 30592084861440.0, "grad_norm": 1.814631679930406, "language_loss": 0.70537585, "learning_rate": 3.879444439864308e-06, "loss": 0.72811371, "num_input_tokens_seen": 24373990, "step": 1146, "time_per_iteration": 2.6893458366394043 }, { "auxiliary_loss_clip": 0.01216549, "auxiliary_loss_mlp": 0.00714449, "balance_loss_clip": 1.06273437, "balance_loss_mlp": 1.00048018, "epoch": 0.13791859556303734, "flos": 22669867687680.0, "grad_norm": 1.6540818796453534, "language_loss": 0.85782331, "learning_rate": 3.879177936678301e-06, "loss": 0.87713325, "num_input_tokens_seen": 24392995, "step": 1147, "time_per_iteration": 2.6866722106933594 }, { "auxiliary_loss_clip": 0.01206367, "auxiliary_loss_mlp": 0.01052021, "balance_loss_clip": 1.06450462, "balance_loss_mlp": 1.03812122, "epoch": 0.13803883845367643, "flos": 35224166016000.0, "grad_norm": 1.8342885194839897, "language_loss": 0.76768374, "learning_rate": 3.878911148422496e-06, "loss": 0.79026759, "num_input_tokens_seen": 24414470, "step": 1148, "time_per_iteration": 2.7523880004882812 }, { "auxiliary_loss_clip": 0.01223236, "auxiliary_loss_mlp": 0.01038928, "balance_loss_clip": 1.06589174, "balance_loss_mlp": 1.02546954, "epoch": 0.1381590813443155, "flos": 32014542332160.0, "grad_norm": 2.023968683303658, "language_loss": 0.70007396, "learning_rate": 3.878644075137364e-06, "loss": 0.72269559, "num_input_tokens_seen": 24435120, "step": 1149, "time_per_iteration": 2.773268938064575 }, { "auxiliary_loss_clip": 0.01159426, "auxiliary_loss_mlp": 0.01045798, "balance_loss_clip": 1.05445147, "balance_loss_mlp": 1.03302443, "epoch": 0.13827932423495462, "flos": 17821855923840.0, "grad_norm": 2.1043572018629413, "language_loss": 0.79363912, "learning_rate": 3.878376716863418e-06, "loss": 0.81569135, "num_input_tokens_seen": 24451420, "step": 1150, "time_per_iteration": 2.6458303928375244 }, { "auxiliary_loss_clip": 0.0119863, "auxiliary_loss_mlp": 0.01042629, "balance_loss_clip": 1.06107378, "balance_loss_mlp": 1.02924204, "epoch": 0.1383995671255937, "flos": 19427098728960.0, "grad_norm": 1.8308030772165604, "language_loss": 0.71388233, "learning_rate": 3.878109073641219e-06, "loss": 0.73629498, "num_input_tokens_seen": 24470450, "step": 1151, "time_per_iteration": 2.702230453491211 }, { "auxiliary_loss_clip": 0.0116341, "auxiliary_loss_mlp": 0.01042285, "balance_loss_clip": 1.0600208, "balance_loss_mlp": 1.02957737, "epoch": 0.13851981001623279, "flos": 28296603331200.0, "grad_norm": 1.5741881430038764, "language_loss": 0.81280696, "learning_rate": 3.877841145511366e-06, "loss": 0.83486396, "num_input_tokens_seen": 24493190, "step": 1152, "time_per_iteration": 2.7961268424987793 }, { "auxiliary_loss_clip": 0.01223939, "auxiliary_loss_mlp": 0.01045445, "balance_loss_clip": 1.06725383, "balance_loss_mlp": 1.03168225, "epoch": 0.13864005290687187, "flos": 21213079793280.0, "grad_norm": 1.6263822437571585, "language_loss": 0.82721162, "learning_rate": 3.8775729325145035e-06, "loss": 0.84990537, "num_input_tokens_seen": 24512425, "step": 1153, "time_per_iteration": 2.6277177333831787 }, { "auxiliary_loss_clip": 0.0108458, "auxiliary_loss_mlp": 0.01009493, "balance_loss_clip": 1.03791261, "balance_loss_mlp": 1.00458193, "epoch": 0.13876029579751098, "flos": 71653389413760.0, "grad_norm": 0.8309663449500876, "language_loss": 0.64748675, "learning_rate": 3.877304434691321e-06, "loss": 0.66842747, "num_input_tokens_seen": 24579275, "step": 1154, "time_per_iteration": 3.3434643745422363 }, { "auxiliary_loss_clip": 0.01185866, "auxiliary_loss_mlp": 0.0104147, "balance_loss_clip": 1.0637182, "balance_loss_mlp": 1.02866673, "epoch": 0.13888053868815006, "flos": 21941348042880.0, "grad_norm": 1.810786266771884, "language_loss": 0.79845923, "learning_rate": 3.877035652082548e-06, "loss": 0.82073259, "num_input_tokens_seen": 24598720, "step": 1155, "time_per_iteration": 2.725778579711914 }, { "auxiliary_loss_clip": 0.01193935, "auxiliary_loss_mlp": 0.01041622, "balance_loss_clip": 1.06431901, "balance_loss_mlp": 1.02781796, "epoch": 0.13900078157878915, "flos": 19608627087360.0, "grad_norm": 1.8019073692516712, "language_loss": 0.85296625, "learning_rate": 3.87676658472896e-06, "loss": 0.87532181, "num_input_tokens_seen": 24617530, "step": 1156, "time_per_iteration": 2.6480743885040283 }, { "auxiliary_loss_clip": 0.01220227, "auxiliary_loss_mlp": 0.01049501, "balance_loss_clip": 1.06273484, "balance_loss_mlp": 1.0358634, "epoch": 0.13912102446942826, "flos": 22638051216000.0, "grad_norm": 1.9387777572092586, "language_loss": 0.84840798, "learning_rate": 3.876497232671372e-06, "loss": 0.87110525, "num_input_tokens_seen": 24637485, "step": 1157, "time_per_iteration": 2.6578686237335205 }, { "auxiliary_loss_clip": 0.01170225, "auxiliary_loss_mlp": 0.01040975, "balance_loss_clip": 1.05751264, "balance_loss_mlp": 1.02824306, "epoch": 0.13924126736006734, "flos": 29643324975360.0, "grad_norm": 2.3629948856704073, "language_loss": 0.83913386, "learning_rate": 3.876227595950647e-06, "loss": 0.86124587, "num_input_tokens_seen": 24656915, "step": 1158, "time_per_iteration": 2.789924383163452 }, { "auxiliary_loss_clip": 0.01235887, "auxiliary_loss_mlp": 0.01044655, "balance_loss_clip": 1.06813967, "balance_loss_mlp": 1.0314343, "epoch": 0.13936151025070642, "flos": 27417653527680.0, "grad_norm": 1.617904300570204, "language_loss": 0.78951263, "learning_rate": 3.875957674607686e-06, "loss": 0.81231809, "num_input_tokens_seen": 24679190, "step": 1159, "time_per_iteration": 2.65168833732605 }, { "auxiliary_loss_clip": 0.01207461, "auxiliary_loss_mlp": 0.00715009, "balance_loss_clip": 1.06002355, "balance_loss_mlp": 1.00037551, "epoch": 0.1394817531413455, "flos": 16399326625920.0, "grad_norm": 1.8566872951680393, "language_loss": 0.87958568, "learning_rate": 3.8756874686834386e-06, "loss": 0.89881033, "num_input_tokens_seen": 24697405, "step": 1160, "time_per_iteration": 2.6423561573028564 }, { "auxiliary_loss_clip": 0.01223955, "auxiliary_loss_mlp": 0.00714685, "balance_loss_clip": 1.064551, "balance_loss_mlp": 1.00033307, "epoch": 0.13960199603198462, "flos": 30922319525760.0, "grad_norm": 1.6134511291657276, "language_loss": 0.80745268, "learning_rate": 3.875416978218893e-06, "loss": 0.82683909, "num_input_tokens_seen": 24720600, "step": 1161, "time_per_iteration": 2.738515853881836 }, { "auxiliary_loss_clip": 0.01190289, "auxiliary_loss_mlp": 0.0104522, "balance_loss_clip": 1.05714905, "balance_loss_mlp": 1.03124249, "epoch": 0.1397222389226237, "flos": 18113773754880.0, "grad_norm": 2.241407158630404, "language_loss": 0.82417923, "learning_rate": 3.8751462032550835e-06, "loss": 0.84653437, "num_input_tokens_seen": 24737605, "step": 1162, "time_per_iteration": 4.458430767059326 }, { "auxiliary_loss_clip": 0.01199864, "auxiliary_loss_mlp": 0.01038912, "balance_loss_clip": 1.06816399, "balance_loss_mlp": 1.02575755, "epoch": 0.13984248181326278, "flos": 16872772815360.0, "grad_norm": 2.1928717203389936, "language_loss": 0.82356536, "learning_rate": 3.874875143833085e-06, "loss": 0.84595311, "num_input_tokens_seen": 24755845, "step": 1163, "time_per_iteration": 2.695979118347168 }, { "auxiliary_loss_clip": 0.01223321, "auxiliary_loss_mlp": 0.01047576, "balance_loss_clip": 1.0673418, "balance_loss_mlp": 1.0330981, "epoch": 0.1399627247039019, "flos": 54121401267840.0, "grad_norm": 1.7950487517497702, "language_loss": 0.68901068, "learning_rate": 3.874603799994019e-06, "loss": 0.71171963, "num_input_tokens_seen": 24779380, "step": 1164, "time_per_iteration": 3.76627254486084 }, { "auxiliary_loss_clip": 0.01174159, "auxiliary_loss_mlp": 0.01042232, "balance_loss_clip": 1.05930412, "balance_loss_mlp": 1.02995968, "epoch": 0.14008296759454097, "flos": 11765521618560.0, "grad_norm": 1.952447178240724, "language_loss": 0.8679291, "learning_rate": 3.874332171779046e-06, "loss": 0.89009297, "num_input_tokens_seen": 24794260, "step": 1165, "time_per_iteration": 3.5568435192108154 }, { "auxiliary_loss_clip": 0.01180604, "auxiliary_loss_mlp": 0.01059295, "balance_loss_clip": 1.05987811, "balance_loss_mlp": 1.04597378, "epoch": 0.14020321048518006, "flos": 22017514832640.0, "grad_norm": 1.764439402355512, "language_loss": 0.75746787, "learning_rate": 3.874060259229373e-06, "loss": 0.77986687, "num_input_tokens_seen": 24815835, "step": 1166, "time_per_iteration": 2.7456512451171875 }, { "auxiliary_loss_clip": 0.01225139, "auxiliary_loss_mlp": 0.01050431, "balance_loss_clip": 1.06921136, "balance_loss_mlp": 1.03681183, "epoch": 0.14032345337581917, "flos": 23404313076480.0, "grad_norm": 2.0262011981429784, "language_loss": 0.93595207, "learning_rate": 3.873788062386249e-06, "loss": 0.95870781, "num_input_tokens_seen": 24834095, "step": 1167, "time_per_iteration": 2.6523144245147705 }, { "auxiliary_loss_clip": 0.01191159, "auxiliary_loss_mlp": 0.01049776, "balance_loss_clip": 1.06545079, "balance_loss_mlp": 1.0369432, "epoch": 0.14044369626645825, "flos": 29645767100160.0, "grad_norm": 1.7692256121174292, "language_loss": 0.81948721, "learning_rate": 3.873515581290965e-06, "loss": 0.84189659, "num_input_tokens_seen": 24858900, "step": 1168, "time_per_iteration": 2.773836851119995 }, { "auxiliary_loss_clip": 0.0118584, "auxiliary_loss_mlp": 0.01041028, "balance_loss_clip": 1.06463027, "balance_loss_mlp": 1.02789068, "epoch": 0.14056393915709733, "flos": 18332972501760.0, "grad_norm": 3.0425742271103142, "language_loss": 0.75081289, "learning_rate": 3.8732428159848575e-06, "loss": 0.77308154, "num_input_tokens_seen": 24877875, "step": 1169, "time_per_iteration": 2.6612467765808105 }, { "auxiliary_loss_clip": 0.01220613, "auxiliary_loss_mlp": 0.01053729, "balance_loss_clip": 1.0680542, "balance_loss_mlp": 1.04064631, "epoch": 0.14068418204773642, "flos": 26687517770880.0, "grad_norm": 1.8419937093237189, "language_loss": 0.77929354, "learning_rate": 3.872969766509304e-06, "loss": 0.802037, "num_input_tokens_seen": 24898430, "step": 1170, "time_per_iteration": 2.691622257232666 }, { "auxiliary_loss_clip": 0.01086781, "auxiliary_loss_mlp": 0.01015, "balance_loss_clip": 1.03924775, "balance_loss_mlp": 1.01030338, "epoch": 0.14080442493837553, "flos": 65259314501760.0, "grad_norm": 0.7622733221774014, "language_loss": 0.55649346, "learning_rate": 3.872696432905726e-06, "loss": 0.57751131, "num_input_tokens_seen": 24959250, "step": 1171, "time_per_iteration": 3.270108461380005 }, { "auxiliary_loss_clip": 0.01223836, "auxiliary_loss_mlp": 0.01045048, "balance_loss_clip": 1.06477392, "balance_loss_mlp": 1.03169668, "epoch": 0.1409246678290146, "flos": 25776715582080.0, "grad_norm": 4.2284397861597345, "language_loss": 0.71301031, "learning_rate": 3.872422815215589e-06, "loss": 0.73569918, "num_input_tokens_seen": 24978330, "step": 1172, "time_per_iteration": 2.6296541690826416 }, { "auxiliary_loss_clip": 0.01209714, "auxiliary_loss_mlp": 0.01045654, "balance_loss_clip": 1.05935645, "balance_loss_mlp": 1.03137302, "epoch": 0.1410449107196537, "flos": 21868521217920.0, "grad_norm": 1.9557758226883122, "language_loss": 0.741732, "learning_rate": 3.8721489134803994e-06, "loss": 0.76428562, "num_input_tokens_seen": 24997120, "step": 1173, "time_per_iteration": 2.7132153511047363 }, { "auxiliary_loss_clip": 0.01218179, "auxiliary_loss_mlp": 0.01045245, "balance_loss_clip": 1.06662691, "balance_loss_mlp": 1.031232, "epoch": 0.1411651536102928, "flos": 16684133564160.0, "grad_norm": 2.0187830653934227, "language_loss": 0.72937328, "learning_rate": 3.871874727741707e-06, "loss": 0.75200748, "num_input_tokens_seen": 25014350, "step": 1174, "time_per_iteration": 2.5884690284729004 }, { "auxiliary_loss_clip": 0.01219491, "auxiliary_loss_mlp": 0.01045813, "balance_loss_clip": 1.06846237, "balance_loss_mlp": 1.03252125, "epoch": 0.1412853965009319, "flos": 20992264934400.0, "grad_norm": 1.8725247487531262, "language_loss": 0.96745706, "learning_rate": 3.871600258041108e-06, "loss": 0.99011016, "num_input_tokens_seen": 25033875, "step": 1175, "time_per_iteration": 2.676903247833252 }, { "auxiliary_loss_clip": 0.01194731, "auxiliary_loss_mlp": 0.01043407, "balance_loss_clip": 1.06107366, "balance_loss_mlp": 1.02953124, "epoch": 0.14140563939157097, "flos": 20335279224960.0, "grad_norm": 2.259361284600561, "language_loss": 0.85732472, "learning_rate": 3.871325504420238e-06, "loss": 0.87970608, "num_input_tokens_seen": 25052865, "step": 1176, "time_per_iteration": 2.6563217639923096 }, { "auxiliary_loss_clip": 0.01236677, "auxiliary_loss_mlp": 0.01042647, "balance_loss_clip": 1.06862295, "balance_loss_mlp": 1.0297668, "epoch": 0.14152588228221005, "flos": 21068826773760.0, "grad_norm": 2.039646294845496, "language_loss": 0.81866252, "learning_rate": 3.871050466920776e-06, "loss": 0.84145582, "num_input_tokens_seen": 25072770, "step": 1177, "time_per_iteration": 2.6881487369537354 }, { "auxiliary_loss_clip": 0.01173548, "auxiliary_loss_mlp": 0.01056967, "balance_loss_clip": 1.05652404, "balance_loss_mlp": 1.04427767, "epoch": 0.14164612517284916, "flos": 18223157646720.0, "grad_norm": 2.0140478320261304, "language_loss": 0.79642904, "learning_rate": 3.870775145584447e-06, "loss": 0.81873417, "num_input_tokens_seen": 25090550, "step": 1178, "time_per_iteration": 2.6591506004333496 }, { "auxiliary_loss_clip": 0.0120834, "auxiliary_loss_mlp": 0.01048761, "balance_loss_clip": 1.06362033, "balance_loss_mlp": 1.03455091, "epoch": 0.14176636806348825, "flos": 22744454279040.0, "grad_norm": 2.9787301471561345, "language_loss": 0.64837468, "learning_rate": 3.8704995404530145e-06, "loss": 0.67094576, "num_input_tokens_seen": 25106175, "step": 1179, "time_per_iteration": 2.627631902694702 }, { "auxiliary_loss_clip": 0.01232456, "auxiliary_loss_mlp": 0.01046248, "balance_loss_clip": 1.06721294, "balance_loss_mlp": 1.0333612, "epoch": 0.14188661095412733, "flos": 22091095843200.0, "grad_norm": 1.7232188310593473, "language_loss": 0.84829563, "learning_rate": 3.87022365156829e-06, "loss": 0.87108272, "num_input_tokens_seen": 25126890, "step": 1180, "time_per_iteration": 2.505070686340332 }, { "auxiliary_loss_clip": 0.0112876, "auxiliary_loss_mlp": 0.01047466, "balance_loss_clip": 1.0537256, "balance_loss_mlp": 1.03446019, "epoch": 0.14200685384476644, "flos": 24352390604160.0, "grad_norm": 2.0691588114963984, "language_loss": 0.81052727, "learning_rate": 3.869947478972123e-06, "loss": 0.83228952, "num_input_tokens_seen": 25147915, "step": 1181, "time_per_iteration": 2.768678903579712 }, { "auxiliary_loss_clip": 0.01210145, "auxiliary_loss_mlp": 0.01045118, "balance_loss_clip": 1.06172347, "balance_loss_mlp": 1.03250515, "epoch": 0.14212709673540552, "flos": 24022048199040.0, "grad_norm": 2.0169953460521746, "language_loss": 0.82205892, "learning_rate": 3.869671022706412e-06, "loss": 0.84461153, "num_input_tokens_seen": 25166645, "step": 1182, "time_per_iteration": 2.781543254852295 }, { "auxiliary_loss_clip": 0.01153385, "auxiliary_loss_mlp": 0.01045292, "balance_loss_clip": 1.05720019, "balance_loss_mlp": 1.0324887, "epoch": 0.1422473396260446, "flos": 26431797870720.0, "grad_norm": 2.2335016085753003, "language_loss": 0.65141237, "learning_rate": 3.869394282813092e-06, "loss": 0.67339921, "num_input_tokens_seen": 25185845, "step": 1183, "time_per_iteration": 2.7594003677368164 }, { "auxiliary_loss_clip": 0.0118915, "auxiliary_loss_mlp": 0.01046269, "balance_loss_clip": 1.05818892, "balance_loss_mlp": 1.03298306, "epoch": 0.1423675825166837, "flos": 17055306754560.0, "grad_norm": 2.4312843451616164, "language_loss": 0.89175344, "learning_rate": 3.869117259334147e-06, "loss": 0.91410768, "num_input_tokens_seen": 25203770, "step": 1184, "time_per_iteration": 2.6513421535491943 }, { "auxiliary_loss_clip": 0.01214572, "auxiliary_loss_mlp": 0.01057317, "balance_loss_clip": 1.06503296, "balance_loss_mlp": 1.04368556, "epoch": 0.1424878254073228, "flos": 17929480049280.0, "grad_norm": 1.703521567724508, "language_loss": 0.81878293, "learning_rate": 3.868839952311599e-06, "loss": 0.84150183, "num_input_tokens_seen": 25221725, "step": 1185, "time_per_iteration": 2.619317054748535 }, { "auxiliary_loss_clip": 0.01196465, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.06235576, "balance_loss_mlp": 1.02945518, "epoch": 0.14260806829796188, "flos": 20303606407680.0, "grad_norm": 3.2478975024147085, "language_loss": 0.80641055, "learning_rate": 3.868562361787516e-06, "loss": 0.8287971, "num_input_tokens_seen": 25240855, "step": 1186, "time_per_iteration": 2.6649527549743652 }, { "auxiliary_loss_clip": 0.01126501, "auxiliary_loss_mlp": 0.01041077, "balance_loss_clip": 1.05156898, "balance_loss_mlp": 1.02819002, "epoch": 0.14272831118860096, "flos": 23185724860800.0, "grad_norm": 1.908559218704049, "language_loss": 0.68759036, "learning_rate": 3.868284487804009e-06, "loss": 0.70926607, "num_input_tokens_seen": 25260085, "step": 1187, "time_per_iteration": 3.6860344409942627 }, { "auxiliary_loss_clip": 0.01204863, "auxiliary_loss_mlp": 0.01047447, "balance_loss_clip": 1.06102228, "balance_loss_mlp": 1.0344708, "epoch": 0.14284855407924008, "flos": 27232210586880.0, "grad_norm": 1.5854947182711925, "language_loss": 0.77879667, "learning_rate": 3.86800633040323e-06, "loss": 0.80131972, "num_input_tokens_seen": 25280675, "step": 1188, "time_per_iteration": 3.5621209144592285 }, { "auxiliary_loss_clip": 0.0119931, "auxiliary_loss_mlp": 0.00714956, "balance_loss_clip": 1.06504202, "balance_loss_mlp": 1.00052309, "epoch": 0.14296879696987916, "flos": 28184202696960.0, "grad_norm": 2.0199741206253146, "language_loss": 0.7803399, "learning_rate": 3.867727889627376e-06, "loss": 0.79948258, "num_input_tokens_seen": 25300290, "step": 1189, "time_per_iteration": 2.7355353832244873 }, { "auxiliary_loss_clip": 0.01170547, "auxiliary_loss_mlp": 0.01038669, "balance_loss_clip": 1.05979908, "balance_loss_mlp": 1.02521062, "epoch": 0.14308903986051824, "flos": 19390290266880.0, "grad_norm": 2.2592900989695512, "language_loss": 0.78505325, "learning_rate": 3.867449165518687e-06, "loss": 0.80714548, "num_input_tokens_seen": 25316760, "step": 1190, "time_per_iteration": 3.6565985679626465 }, { "auxiliary_loss_clip": 0.01239138, "auxiliary_loss_mlp": 0.00714603, "balance_loss_clip": 1.06707287, "balance_loss_mlp": 1.00045109, "epoch": 0.14320928275115732, "flos": 17457506317440.0, "grad_norm": 2.1035014492714397, "language_loss": 0.71140087, "learning_rate": 3.867170158119444e-06, "loss": 0.73093832, "num_input_tokens_seen": 25335760, "step": 1191, "time_per_iteration": 3.493649959564209 }, { "auxiliary_loss_clip": 0.01241185, "auxiliary_loss_mlp": 0.01047364, "balance_loss_clip": 1.07066441, "balance_loss_mlp": 1.03450704, "epoch": 0.14332952564179643, "flos": 21466070259840.0, "grad_norm": 2.03281660507559, "language_loss": 0.75522381, "learning_rate": 3.866890867471972e-06, "loss": 0.77810931, "num_input_tokens_seen": 25354230, "step": 1192, "time_per_iteration": 2.6270129680633545 }, { "auxiliary_loss_clip": 0.01192665, "auxiliary_loss_mlp": 0.0104162, "balance_loss_clip": 1.05760384, "balance_loss_mlp": 1.02810192, "epoch": 0.14344976853243552, "flos": 16396992241920.0, "grad_norm": 3.4621727992892373, "language_loss": 0.89587069, "learning_rate": 3.86661129361864e-06, "loss": 0.91821349, "num_input_tokens_seen": 25368720, "step": 1193, "time_per_iteration": 2.617953300476074 }, { "auxiliary_loss_clip": 0.01201648, "auxiliary_loss_mlp": 0.01044004, "balance_loss_clip": 1.06657827, "balance_loss_mlp": 1.03071189, "epoch": 0.1435700114230746, "flos": 18916736336640.0, "grad_norm": 1.8949604715526518, "language_loss": 0.86142319, "learning_rate": 3.866331436601859e-06, "loss": 0.88387966, "num_input_tokens_seen": 25386715, "step": 1194, "time_per_iteration": 2.6558029651641846 }, { "auxiliary_loss_clip": 0.01237207, "auxiliary_loss_mlp": 0.01048277, "balance_loss_clip": 1.06936109, "balance_loss_mlp": 1.03570032, "epoch": 0.1436902543137137, "flos": 19755394058880.0, "grad_norm": 2.033396088585414, "language_loss": 0.73620373, "learning_rate": 3.866051296464083e-06, "loss": 0.75905854, "num_input_tokens_seen": 25405550, "step": 1195, "time_per_iteration": 2.5882179737091064 }, { "auxiliary_loss_clip": 0.01237743, "auxiliary_loss_mlp": 0.00714566, "balance_loss_clip": 1.06589651, "balance_loss_mlp": 1.00048685, "epoch": 0.1438104972043528, "flos": 14684807669760.0, "grad_norm": 2.205952834346954, "language_loss": 0.85225779, "learning_rate": 3.86577087324781e-06, "loss": 0.87178087, "num_input_tokens_seen": 25422040, "step": 1196, "time_per_iteration": 2.637570381164551 }, { "auxiliary_loss_clip": 0.01217395, "auxiliary_loss_mlp": 0.01051558, "balance_loss_clip": 1.06870222, "balance_loss_mlp": 1.03880262, "epoch": 0.14393074009499188, "flos": 17092330698240.0, "grad_norm": 2.822506774261269, "language_loss": 0.77023184, "learning_rate": 3.865490166995578e-06, "loss": 0.7929213, "num_input_tokens_seen": 25440270, "step": 1197, "time_per_iteration": 2.618842124938965 }, { "auxiliary_loss_clip": 0.01219764, "auxiliary_loss_mlp": 0.01049917, "balance_loss_clip": 1.06780636, "balance_loss_mlp": 1.03689384, "epoch": 0.144050982985631, "flos": 30476200608000.0, "grad_norm": 2.2702579502244626, "language_loss": 0.84237957, "learning_rate": 3.86520917774997e-06, "loss": 0.86507642, "num_input_tokens_seen": 25459705, "step": 1198, "time_per_iteration": 2.7238612174987793 }, { "auxiliary_loss_clip": 0.01218408, "auxiliary_loss_mlp": 0.01036153, "balance_loss_clip": 1.06702328, "balance_loss_mlp": 1.02375579, "epoch": 0.14417122587627007, "flos": 17858484817920.0, "grad_norm": 2.2244450561264055, "language_loss": 0.74787641, "learning_rate": 3.864927905553614e-06, "loss": 0.77042198, "num_input_tokens_seen": 25477615, "step": 1199, "time_per_iteration": 2.5978643894195557 }, { "auxiliary_loss_clip": 0.01180601, "auxiliary_loss_mlp": 0.01040304, "balance_loss_clip": 1.05924392, "balance_loss_mlp": 1.02788806, "epoch": 0.14429146876690915, "flos": 21613914639360.0, "grad_norm": 1.7628991056721424, "language_loss": 0.88907057, "learning_rate": 3.8646463504491765e-06, "loss": 0.91127968, "num_input_tokens_seen": 25497750, "step": 1200, "time_per_iteration": 2.7068030834198 }, { "auxiliary_loss_clip": 0.01222544, "auxiliary_loss_mlp": 0.0103784, "balance_loss_clip": 1.06860852, "balance_loss_mlp": 1.02501893, "epoch": 0.14441171165754824, "flos": 23258120722560.0, "grad_norm": 1.6296248336395196, "language_loss": 0.83233875, "learning_rate": 3.8643645124793705e-06, "loss": 0.85494256, "num_input_tokens_seen": 25516650, "step": 1201, "time_per_iteration": 2.611539840698242 }, { "auxiliary_loss_clip": 0.01220306, "auxiliary_loss_mlp": 0.01038924, "balance_loss_clip": 1.06765449, "balance_loss_mlp": 1.02584648, "epoch": 0.14453195454818735, "flos": 42854213963520.0, "grad_norm": 1.662584193558791, "language_loss": 0.7474016, "learning_rate": 3.8640823916869515e-06, "loss": 0.7699939, "num_input_tokens_seen": 25540960, "step": 1202, "time_per_iteration": 2.813715696334839 }, { "auxiliary_loss_clip": 0.01237479, "auxiliary_loss_mlp": 0.01042275, "balance_loss_clip": 1.0692395, "balance_loss_mlp": 1.03002596, "epoch": 0.14465219743882643, "flos": 27235873774080.0, "grad_norm": 1.4720087068769414, "language_loss": 0.78251714, "learning_rate": 3.863799988114714e-06, "loss": 0.80531466, "num_input_tokens_seen": 25562990, "step": 1203, "time_per_iteration": 2.642552375793457 }, { "auxiliary_loss_clip": 0.01238329, "auxiliary_loss_mlp": 0.01043557, "balance_loss_clip": 1.0668571, "balance_loss_mlp": 1.03064096, "epoch": 0.1447724403294655, "flos": 16690705752960.0, "grad_norm": 2.428223068310788, "language_loss": 0.70499241, "learning_rate": 3.863517301805502e-06, "loss": 0.72781122, "num_input_tokens_seen": 25581380, "step": 1204, "time_per_iteration": 2.5712015628814697 }, { "auxiliary_loss_clip": 0.01187831, "auxiliary_loss_mlp": 0.01048406, "balance_loss_clip": 1.06540489, "balance_loss_mlp": 1.03513217, "epoch": 0.14489268322010462, "flos": 20073741321600.0, "grad_norm": 2.3071752867539987, "language_loss": 0.96698821, "learning_rate": 3.863234332802196e-06, "loss": 0.98935062, "num_input_tokens_seen": 25593585, "step": 1205, "time_per_iteration": 2.671224355697632 }, { "auxiliary_loss_clip": 0.01191781, "auxiliary_loss_mlp": 0.01042755, "balance_loss_clip": 1.05837083, "balance_loss_mlp": 1.0292784, "epoch": 0.1450129261107437, "flos": 27125627955840.0, "grad_norm": 2.0967787371570794, "language_loss": 0.73995245, "learning_rate": 3.862951081147723e-06, "loss": 0.76229775, "num_input_tokens_seen": 25613750, "step": 1206, "time_per_iteration": 2.672356605529785 }, { "auxiliary_loss_clip": 0.01220412, "auxiliary_loss_mlp": 0.01039317, "balance_loss_clip": 1.06607115, "balance_loss_mlp": 1.02795005, "epoch": 0.1451331690013828, "flos": 25702344472320.0, "grad_norm": 2.72390509286691, "language_loss": 0.77884912, "learning_rate": 3.862667546885053e-06, "loss": 0.80144632, "num_input_tokens_seen": 25632300, "step": 1207, "time_per_iteration": 2.677889823913574 }, { "auxiliary_loss_clip": 0.01205702, "auxiliary_loss_mlp": 0.01038443, "balance_loss_clip": 1.06191111, "balance_loss_mlp": 1.02565813, "epoch": 0.14525341189202187, "flos": 25737393168000.0, "grad_norm": 2.8274752806982733, "language_loss": 0.7341333, "learning_rate": 3.8623837300571965e-06, "loss": 0.75657475, "num_input_tokens_seen": 25651285, "step": 1208, "time_per_iteration": 2.711087942123413 }, { "auxiliary_loss_clip": 0.01236525, "auxiliary_loss_mlp": 0.01044314, "balance_loss_clip": 1.067981, "balance_loss_mlp": 1.03115296, "epoch": 0.14537365478266098, "flos": 23073898844160.0, "grad_norm": 1.7970339434975207, "language_loss": 0.83978772, "learning_rate": 3.8620996307072085e-06, "loss": 0.86259609, "num_input_tokens_seen": 25671990, "step": 1209, "time_per_iteration": 2.6704864501953125 }, { "auxiliary_loss_clip": 0.01185339, "auxiliary_loss_mlp": 0.01037876, "balance_loss_clip": 1.05861926, "balance_loss_mlp": 1.02518582, "epoch": 0.14549389767330007, "flos": 20595021448320.0, "grad_norm": 1.9531595523721632, "language_loss": 0.64425814, "learning_rate": 3.861815248878188e-06, "loss": 0.66649032, "num_input_tokens_seen": 25689475, "step": 1210, "time_per_iteration": 2.6534523963928223 }, { "auxiliary_loss_clip": 0.01197025, "auxiliary_loss_mlp": 0.01039391, "balance_loss_clip": 1.06574261, "balance_loss_mlp": 1.02683234, "epoch": 0.14561414056393915, "flos": 15121804533120.0, "grad_norm": 2.366675968630505, "language_loss": 0.79830647, "learning_rate": 3.861530584613274e-06, "loss": 0.82067072, "num_input_tokens_seen": 25707475, "step": 1211, "time_per_iteration": 2.6627256870269775 }, { "auxiliary_loss_clip": 0.01223307, "auxiliary_loss_mlp": 0.00714748, "balance_loss_clip": 1.07036221, "balance_loss_mlp": 1.00075138, "epoch": 0.14573438345457826, "flos": 19427493778560.0, "grad_norm": 2.230206146246461, "language_loss": 0.82480818, "learning_rate": 3.86124563795565e-06, "loss": 0.84418869, "num_input_tokens_seen": 25726290, "step": 1212, "time_per_iteration": 2.5886075496673584 }, { "auxiliary_loss_clip": 0.01232644, "auxiliary_loss_mlp": 0.01041795, "balance_loss_clip": 1.06683636, "balance_loss_mlp": 1.0297972, "epoch": 0.14585462634521734, "flos": 24828422572800.0, "grad_norm": 1.6461367978659804, "language_loss": 0.69649839, "learning_rate": 3.860960408948543e-06, "loss": 0.71924275, "num_input_tokens_seen": 25748040, "step": 1213, "time_per_iteration": 3.5517194271087646 }, { "auxiliary_loss_clip": 0.01214095, "auxiliary_loss_mlp": 0.01036698, "balance_loss_clip": 1.0683502, "balance_loss_mlp": 1.02451515, "epoch": 0.14597486923585642, "flos": 15448627405440.0, "grad_norm": 2.2116210952888915, "language_loss": 0.89740616, "learning_rate": 3.860674897635222e-06, "loss": 0.91991413, "num_input_tokens_seen": 25764525, "step": 1214, "time_per_iteration": 3.4516491889953613 }, { "auxiliary_loss_clip": 0.01216798, "auxiliary_loss_mlp": 0.0104308, "balance_loss_clip": 1.06561601, "balance_loss_mlp": 1.03062308, "epoch": 0.1460951121264955, "flos": 16655154266880.0, "grad_norm": 1.723781338063208, "language_loss": 0.83360338, "learning_rate": 3.860389104058998e-06, "loss": 0.85620213, "num_input_tokens_seen": 25782755, "step": 1215, "time_per_iteration": 2.569699764251709 }, { "auxiliary_loss_clip": 0.01199397, "auxiliary_loss_mlp": 0.01032952, "balance_loss_clip": 1.0619545, "balance_loss_mlp": 1.0207448, "epoch": 0.14621535501713462, "flos": 24863291700480.0, "grad_norm": 2.994820317666417, "language_loss": 0.72537267, "learning_rate": 3.860103028263227e-06, "loss": 0.74769616, "num_input_tokens_seen": 25805860, "step": 1216, "time_per_iteration": 3.668060779571533 }, { "auxiliary_loss_clip": 0.01161094, "auxiliary_loss_mlp": 0.01040473, "balance_loss_clip": 1.05545747, "balance_loss_mlp": 1.02837908, "epoch": 0.1463355979077737, "flos": 25228000442880.0, "grad_norm": 2.077489899536294, "language_loss": 0.69992232, "learning_rate": 3.859816670291304e-06, "loss": 0.72193801, "num_input_tokens_seen": 25824955, "step": 1217, "time_per_iteration": 3.608248472213745 }, { "auxiliary_loss_clip": 0.01139314, "auxiliary_loss_mlp": 0.01041953, "balance_loss_clip": 1.05559874, "balance_loss_mlp": 1.0291028, "epoch": 0.14645584079841278, "flos": 22054143726720.0, "grad_norm": 2.1435304607450583, "language_loss": 0.89934981, "learning_rate": 3.859530030186672e-06, "loss": 0.92116249, "num_input_tokens_seen": 25841965, "step": 1218, "time_per_iteration": 2.747736930847168 }, { "auxiliary_loss_clip": 0.01201396, "auxiliary_loss_mlp": 0.01045457, "balance_loss_clip": 1.06292105, "balance_loss_mlp": 1.03261232, "epoch": 0.1465760836890519, "flos": 23623870959360.0, "grad_norm": 2.572333765308816, "language_loss": 0.82348454, "learning_rate": 3.859243107992813e-06, "loss": 0.84595305, "num_input_tokens_seen": 25860770, "step": 1219, "time_per_iteration": 2.701385974884033 }, { "auxiliary_loss_clip": 0.01182333, "auxiliary_loss_mlp": 0.01038377, "balance_loss_clip": 1.05475903, "balance_loss_mlp": 1.02569318, "epoch": 0.14669632657969098, "flos": 37407893356800.0, "grad_norm": 3.862041902868002, "language_loss": 0.78050733, "learning_rate": 3.858955903753252e-06, "loss": 0.80271447, "num_input_tokens_seen": 25879410, "step": 1220, "time_per_iteration": 2.7464911937713623 }, { "auxiliary_loss_clip": 0.01216672, "auxiliary_loss_mlp": 0.01039839, "balance_loss_clip": 1.06355441, "balance_loss_mlp": 1.02755463, "epoch": 0.14681656947033006, "flos": 28365910623360.0, "grad_norm": 1.5928393156736373, "language_loss": 0.83605719, "learning_rate": 3.858668417511559e-06, "loss": 0.85862231, "num_input_tokens_seen": 25902160, "step": 1221, "time_per_iteration": 2.701953411102295 }, { "auxiliary_loss_clip": 0.01201997, "auxiliary_loss_mlp": 0.01040946, "balance_loss_clip": 1.06382167, "balance_loss_mlp": 1.02904844, "epoch": 0.14693681236096917, "flos": 18479488078080.0, "grad_norm": 2.719101934900226, "language_loss": 0.7593447, "learning_rate": 3.8583806493113445e-06, "loss": 0.7817741, "num_input_tokens_seen": 25920505, "step": 1222, "time_per_iteration": 2.634108781814575 }, { "auxiliary_loss_clip": 0.01214545, "auxiliary_loss_mlp": 0.01041299, "balance_loss_clip": 1.06516123, "balance_loss_mlp": 1.02822137, "epoch": 0.14705705525160825, "flos": 20777806782720.0, "grad_norm": 2.0207853479384608, "language_loss": 0.82452762, "learning_rate": 3.858092599196263e-06, "loss": 0.84708607, "num_input_tokens_seen": 25938460, "step": 1223, "time_per_iteration": 2.698749303817749 }, { "auxiliary_loss_clip": 0.0121643, "auxiliary_loss_mlp": 0.01036422, "balance_loss_clip": 1.06571698, "balance_loss_mlp": 1.02370834, "epoch": 0.14717729814224734, "flos": 29932944336000.0, "grad_norm": 2.432853019845956, "language_loss": 0.82246214, "learning_rate": 3.857804267210012e-06, "loss": 0.84499067, "num_input_tokens_seen": 25957760, "step": 1224, "time_per_iteration": 2.664222478866577 }, { "auxiliary_loss_clip": 0.01170333, "auxiliary_loss_mlp": 0.01041234, "balance_loss_clip": 1.05564952, "balance_loss_mlp": 1.02947998, "epoch": 0.14729754103288642, "flos": 20047491457920.0, "grad_norm": 2.110745224822653, "language_loss": 0.88298202, "learning_rate": 3.857515653396331e-06, "loss": 0.90509772, "num_input_tokens_seen": 25974970, "step": 1225, "time_per_iteration": 2.730367422103882 }, { "auxiliary_loss_clip": 0.01167766, "auxiliary_loss_mlp": 0.01042765, "balance_loss_clip": 1.05992246, "balance_loss_mlp": 1.03093958, "epoch": 0.14741778392352553, "flos": 19281516906240.0, "grad_norm": 2.2774164424520773, "language_loss": 0.87091738, "learning_rate": 3.857226757799002e-06, "loss": 0.89302266, "num_input_tokens_seen": 25992525, "step": 1226, "time_per_iteration": 2.7290000915527344 }, { "auxiliary_loss_clip": 0.01198252, "auxiliary_loss_mlp": 0.01042472, "balance_loss_clip": 1.05959558, "balance_loss_mlp": 1.02988362, "epoch": 0.1475380268141646, "flos": 25411108999680.0, "grad_norm": 2.3398724352350615, "language_loss": 0.74017668, "learning_rate": 3.85693758046185e-06, "loss": 0.76258391, "num_input_tokens_seen": 26010815, "step": 1227, "time_per_iteration": 2.7035887241363525 }, { "auxiliary_loss_clip": 0.01233262, "auxiliary_loss_mlp": 0.01041687, "balance_loss_clip": 1.06738198, "balance_loss_mlp": 1.02942073, "epoch": 0.1476582697048037, "flos": 20847652778880.0, "grad_norm": 1.9347233472220708, "language_loss": 0.82769549, "learning_rate": 3.8566481214287435e-06, "loss": 0.85044503, "num_input_tokens_seen": 26028935, "step": 1228, "time_per_iteration": 2.692758083343506 }, { "auxiliary_loss_clip": 0.01174059, "auxiliary_loss_mlp": 0.0103718, "balance_loss_clip": 1.05537462, "balance_loss_mlp": 1.02524161, "epoch": 0.1477785125954428, "flos": 14028109269120.0, "grad_norm": 2.538579471965824, "language_loss": 0.90576327, "learning_rate": 3.8563583807435935e-06, "loss": 0.9278757, "num_input_tokens_seen": 26045080, "step": 1229, "time_per_iteration": 2.6903345584869385 }, { "auxiliary_loss_clip": 0.01220114, "auxiliary_loss_mlp": 0.007144, "balance_loss_clip": 1.06541848, "balance_loss_mlp": 1.00049567, "epoch": 0.1478987554860819, "flos": 20516699842560.0, "grad_norm": 2.825110604578241, "language_loss": 0.777022, "learning_rate": 3.856068358450353e-06, "loss": 0.79636717, "num_input_tokens_seen": 26065030, "step": 1230, "time_per_iteration": 2.7126922607421875 }, { "auxiliary_loss_clip": 0.0119856, "auxiliary_loss_mlp": 0.01044253, "balance_loss_clip": 1.06758523, "balance_loss_mlp": 1.03166461, "epoch": 0.14801899837672097, "flos": 17857012360320.0, "grad_norm": 1.7005285353808943, "language_loss": 0.8581866, "learning_rate": 3.8557780545930186e-06, "loss": 0.88061476, "num_input_tokens_seen": 26083445, "step": 1231, "time_per_iteration": 2.669485330581665 }, { "auxiliary_loss_clip": 0.0119535, "auxiliary_loss_mlp": 0.01035739, "balance_loss_clip": 1.06301749, "balance_loss_mlp": 1.02313268, "epoch": 0.14813924126736006, "flos": 20881408584960.0, "grad_norm": 1.7249680460017887, "language_loss": 0.79206693, "learning_rate": 3.855487469215628e-06, "loss": 0.8143779, "num_input_tokens_seen": 26102375, "step": 1232, "time_per_iteration": 2.677116870880127 }, { "auxiliary_loss_clip": 0.01182674, "auxiliary_loss_mlp": 0.01038486, "balance_loss_clip": 1.06279302, "balance_loss_mlp": 1.02627301, "epoch": 0.14825948415799917, "flos": 37414070496000.0, "grad_norm": 2.0293274476263705, "language_loss": 0.72270846, "learning_rate": 3.855196602362264e-06, "loss": 0.74492007, "num_input_tokens_seen": 26125295, "step": 1233, "time_per_iteration": 2.791293144226074 }, { "auxiliary_loss_clip": 0.01217035, "auxiliary_loss_mlp": 0.01043554, "balance_loss_clip": 1.06332576, "balance_loss_mlp": 1.03114438, "epoch": 0.14837972704863825, "flos": 22014641744640.0, "grad_norm": 2.1151338886419357, "language_loss": 0.9402836, "learning_rate": 3.854905454077051e-06, "loss": 0.96288949, "num_input_tokens_seen": 26142905, "step": 1234, "time_per_iteration": 2.6378424167633057 }, { "auxiliary_loss_clip": 0.01125133, "auxiliary_loss_mlp": 0.01045432, "balance_loss_clip": 1.05213881, "balance_loss_mlp": 1.03310013, "epoch": 0.14849996993927733, "flos": 20996323171200.0, "grad_norm": 1.8348653649737894, "language_loss": 0.88501465, "learning_rate": 3.854614024404155e-06, "loss": 0.90672034, "num_input_tokens_seen": 26161215, "step": 1235, "time_per_iteration": 2.776392936706543 }, { "auxiliary_loss_clip": 0.01183924, "auxiliary_loss_mlp": 0.01037825, "balance_loss_clip": 1.05707967, "balance_loss_mlp": 1.02585649, "epoch": 0.14862021282991644, "flos": 20047994248320.0, "grad_norm": 2.2731540795051517, "language_loss": 0.89389622, "learning_rate": 3.8543223133877865e-06, "loss": 0.91611367, "num_input_tokens_seen": 26179810, "step": 1236, "time_per_iteration": 2.7353854179382324 }, { "auxiliary_loss_clip": 0.01180562, "auxiliary_loss_mlp": 0.01037286, "balance_loss_clip": 1.057616, "balance_loss_mlp": 1.02395833, "epoch": 0.14874045572055553, "flos": 22712027276160.0, "grad_norm": 2.4914082346917943, "language_loss": 0.88184983, "learning_rate": 3.854030321072198e-06, "loss": 0.9040283, "num_input_tokens_seen": 26199715, "step": 1237, "time_per_iteration": 2.6880784034729004 }, { "auxiliary_loss_clip": 0.01187192, "auxiliary_loss_mlp": 0.01032829, "balance_loss_clip": 1.0595572, "balance_loss_mlp": 1.02063394, "epoch": 0.1488606986111946, "flos": 25411288567680.0, "grad_norm": 3.4029679521094867, "language_loss": 0.73165607, "learning_rate": 3.853738047501682e-06, "loss": 0.7538563, "num_input_tokens_seen": 26220275, "step": 1238, "time_per_iteration": 2.6724026203155518 }, { "auxiliary_loss_clip": 0.01219868, "auxiliary_loss_mlp": 0.01046094, "balance_loss_clip": 1.06739068, "balance_loss_mlp": 1.03333259, "epoch": 0.1489809415018337, "flos": 17018749687680.0, "grad_norm": 1.911569408442679, "language_loss": 0.78251958, "learning_rate": 3.85344549272058e-06, "loss": 0.80517918, "num_input_tokens_seen": 26238255, "step": 1239, "time_per_iteration": 3.5418765544891357 }, { "auxiliary_loss_clip": 0.01210782, "auxiliary_loss_mlp": 0.01040821, "balance_loss_clip": 1.06141734, "balance_loss_mlp": 1.02799404, "epoch": 0.1491011843924728, "flos": 33659394860160.0, "grad_norm": 1.7585154144192732, "language_loss": 0.82573569, "learning_rate": 3.853152656773269e-06, "loss": 0.8482517, "num_input_tokens_seen": 26259690, "step": 1240, "time_per_iteration": 3.5355563163757324 }, { "auxiliary_loss_clip": 0.01196291, "auxiliary_loss_mlp": 0.01036593, "balance_loss_clip": 1.06223392, "balance_loss_mlp": 1.02437389, "epoch": 0.14922142728311188, "flos": 21179000764800.0, "grad_norm": 2.426939039035769, "language_loss": 0.85003269, "learning_rate": 3.852859539704174e-06, "loss": 0.87236154, "num_input_tokens_seen": 26278990, "step": 1241, "time_per_iteration": 2.717862367630005 }, { "auxiliary_loss_clip": 0.01165188, "auxiliary_loss_mlp": 0.0104308, "balance_loss_clip": 1.05668485, "balance_loss_mlp": 1.0303843, "epoch": 0.14934167017375097, "flos": 29860548474240.0, "grad_norm": 1.8238778408719682, "language_loss": 0.76254296, "learning_rate": 3.85256614155776e-06, "loss": 0.78462565, "num_input_tokens_seen": 26299120, "step": 1242, "time_per_iteration": 4.5567262172698975 }, { "auxiliary_loss_clip": 0.01212873, "auxiliary_loss_mlp": 0.01040941, "balance_loss_clip": 1.06083202, "balance_loss_mlp": 1.02874637, "epoch": 0.14946191306439008, "flos": 17019216564480.0, "grad_norm": 2.1914915024594395, "language_loss": 0.74089444, "learning_rate": 3.852272462378535e-06, "loss": 0.76343262, "num_input_tokens_seen": 26316995, "step": 1243, "time_per_iteration": 2.62060284614563 }, { "auxiliary_loss_clip": 0.01197855, "auxiliary_loss_mlp": 0.01041234, "balance_loss_clip": 1.06092095, "balance_loss_mlp": 1.02899718, "epoch": 0.14958215595502916, "flos": 15669047214720.0, "grad_norm": 2.0846879043401687, "language_loss": 0.78161973, "learning_rate": 3.85197850221105e-06, "loss": 0.80401063, "num_input_tokens_seen": 26333295, "step": 1244, "time_per_iteration": 2.6751320362091064 }, { "auxiliary_loss_clip": 0.01211862, "auxiliary_loss_mlp": 0.01040661, "balance_loss_clip": 1.06413627, "balance_loss_mlp": 1.0284363, "epoch": 0.14970239884566824, "flos": 33108560818560.0, "grad_norm": 1.7377022735720373, "language_loss": 0.75623107, "learning_rate": 3.851684261099899e-06, "loss": 0.77875626, "num_input_tokens_seen": 26355035, "step": 1245, "time_per_iteration": 2.743762254714966 }, { "auxiliary_loss_clip": 0.0119338, "auxiliary_loss_mlp": 0.01046909, "balance_loss_clip": 1.05956674, "balance_loss_mlp": 1.03380775, "epoch": 0.14982264173630733, "flos": 17821245392640.0, "grad_norm": 2.395572487256485, "language_loss": 0.86590391, "learning_rate": 3.851389739089718e-06, "loss": 0.8883068, "num_input_tokens_seen": 26371655, "step": 1246, "time_per_iteration": 2.639528274536133 }, { "auxiliary_loss_clip": 0.01220718, "auxiliary_loss_mlp": 0.01042023, "balance_loss_clip": 1.06982827, "balance_loss_mlp": 1.02906477, "epoch": 0.14994288462694644, "flos": 32409559175040.0, "grad_norm": 1.9247713079275606, "language_loss": 0.80188644, "learning_rate": 3.851094936225186e-06, "loss": 0.82451391, "num_input_tokens_seen": 26392540, "step": 1247, "time_per_iteration": 2.7315211296081543 }, { "auxiliary_loss_clip": 0.01196854, "auxiliary_loss_mlp": 0.01032246, "balance_loss_clip": 1.06571054, "balance_loss_mlp": 1.02045655, "epoch": 0.15006312751758552, "flos": 31794661226880.0, "grad_norm": 1.4343678710031738, "language_loss": 0.76530814, "learning_rate": 3.850799852551024e-06, "loss": 0.78759921, "num_input_tokens_seen": 26414960, "step": 1248, "time_per_iteration": 2.751539945602417 }, { "auxiliary_loss_clip": 0.01208578, "auxiliary_loss_mlp": 0.01038354, "balance_loss_clip": 1.06126833, "balance_loss_mlp": 1.02491975, "epoch": 0.1501833704082246, "flos": 16618022582400.0, "grad_norm": 2.382444912955748, "language_loss": 0.85789406, "learning_rate": 3.850504488111995e-06, "loss": 0.8803634, "num_input_tokens_seen": 26431635, "step": 1249, "time_per_iteration": 2.6942014694213867 }, { "auxiliary_loss_clip": 0.01189964, "auxiliary_loss_mlp": 0.01045271, "balance_loss_clip": 1.0588305, "balance_loss_mlp": 1.03313553, "epoch": 0.15030361329886371, "flos": 23471178243840.0, "grad_norm": 1.8567470736782783, "language_loss": 0.82686538, "learning_rate": 3.850208842952907e-06, "loss": 0.84921777, "num_input_tokens_seen": 26450440, "step": 1250, "time_per_iteration": 2.6802380084991455 }, { "auxiliary_loss_clip": 0.0117367, "auxiliary_loss_mlp": 0.01045154, "balance_loss_clip": 1.05778599, "balance_loss_mlp": 1.03190422, "epoch": 0.1504238561895028, "flos": 25629409906560.0, "grad_norm": 1.983463650870811, "language_loss": 0.79143393, "learning_rate": 3.849912917118608e-06, "loss": 0.81362224, "num_input_tokens_seen": 26471480, "step": 1251, "time_per_iteration": 2.821319341659546 }, { "auxiliary_loss_clip": 0.01135415, "auxiliary_loss_mlp": 0.01017025, "balance_loss_clip": 1.04687667, "balance_loss_mlp": 1.01158893, "epoch": 0.15054409908014188, "flos": 52095146129280.0, "grad_norm": 0.895483525633579, "language_loss": 0.59272373, "learning_rate": 3.849616710653992e-06, "loss": 0.61424804, "num_input_tokens_seen": 26532950, "step": 1252, "time_per_iteration": 3.2123446464538574 }, { "auxiliary_loss_clip": 0.01213641, "auxiliary_loss_mlp": 0.01045319, "balance_loss_clip": 1.06264186, "balance_loss_mlp": 1.03261137, "epoch": 0.150664341970781, "flos": 18880251096960.0, "grad_norm": 1.657063267745722, "language_loss": 0.74859792, "learning_rate": 3.84932022360399e-06, "loss": 0.77118754, "num_input_tokens_seen": 26551615, "step": 1253, "time_per_iteration": 2.651237964630127 }, { "auxiliary_loss_clip": 0.01196051, "auxiliary_loss_mlp": 0.01040491, "balance_loss_clip": 1.06556988, "balance_loss_mlp": 1.02827168, "epoch": 0.15078458486142007, "flos": 22163240309760.0, "grad_norm": 3.031971875141812, "language_loss": 0.8445667, "learning_rate": 3.849023456013581e-06, "loss": 0.86693209, "num_input_tokens_seen": 26569175, "step": 1254, "time_per_iteration": 2.6732280254364014 }, { "auxiliary_loss_clip": 0.01222454, "auxiliary_loss_mlp": 0.01053878, "balance_loss_clip": 1.06435025, "balance_loss_mlp": 1.04128909, "epoch": 0.15090482775205916, "flos": 26651894457600.0, "grad_norm": 2.5540119869052904, "language_loss": 0.61790633, "learning_rate": 3.848726407927784e-06, "loss": 0.64066958, "num_input_tokens_seen": 26589560, "step": 1255, "time_per_iteration": 2.659020185470581 }, { "auxiliary_loss_clip": 0.01204696, "auxiliary_loss_mlp": 0.01040139, "balance_loss_clip": 1.06694436, "balance_loss_mlp": 1.02725852, "epoch": 0.15102507064269824, "flos": 21798998444160.0, "grad_norm": 4.028501916194408, "language_loss": 0.86349189, "learning_rate": 3.84842907939166e-06, "loss": 0.88594031, "num_input_tokens_seen": 26608785, "step": 1256, "time_per_iteration": 2.648057222366333 }, { "auxiliary_loss_clip": 0.01175675, "auxiliary_loss_mlp": 0.01040315, "balance_loss_clip": 1.0596981, "balance_loss_mlp": 1.02826858, "epoch": 0.15114531353333735, "flos": 22820908377600.0, "grad_norm": 2.7480955638058555, "language_loss": 0.71082795, "learning_rate": 3.8481314704503146e-06, "loss": 0.73298776, "num_input_tokens_seen": 26628615, "step": 1257, "time_per_iteration": 2.7218756675720215 }, { "auxiliary_loss_clip": 0.01216017, "auxiliary_loss_mlp": 0.01050425, "balance_loss_clip": 1.06670117, "balance_loss_mlp": 1.03729963, "epoch": 0.15126555642397643, "flos": 19682674974720.0, "grad_norm": 2.4448513142871047, "language_loss": 0.87989378, "learning_rate": 3.847833581148895e-06, "loss": 0.90255821, "num_input_tokens_seen": 26647525, "step": 1258, "time_per_iteration": 2.6189279556274414 }, { "auxiliary_loss_clip": 0.01229445, "auxiliary_loss_mlp": 0.0103905, "balance_loss_clip": 1.06252217, "balance_loss_mlp": 1.02641392, "epoch": 0.15138579931461552, "flos": 28726022424960.0, "grad_norm": 2.844694816945907, "language_loss": 0.80442262, "learning_rate": 3.84753541153259e-06, "loss": 0.82710755, "num_input_tokens_seen": 26667095, "step": 1259, "time_per_iteration": 2.6926422119140625 }, { "auxiliary_loss_clip": 0.01216817, "auxiliary_loss_mlp": 0.01040297, "balance_loss_clip": 1.06595945, "balance_loss_mlp": 1.02833414, "epoch": 0.15150604220525463, "flos": 22127006465280.0, "grad_norm": 1.6873164928869673, "language_loss": 0.83038592, "learning_rate": 3.847236961646633e-06, "loss": 0.85295707, "num_input_tokens_seen": 26686075, "step": 1260, "time_per_iteration": 2.643850326538086 }, { "auxiliary_loss_clip": 0.01188731, "auxiliary_loss_mlp": 0.01040715, "balance_loss_clip": 1.05819404, "balance_loss_mlp": 1.02762604, "epoch": 0.1516262850958937, "flos": 12968708515200.0, "grad_norm": 2.513884807961759, "language_loss": 0.78258169, "learning_rate": 3.846938231536296e-06, "loss": 0.80487609, "num_input_tokens_seen": 26701695, "step": 1261, "time_per_iteration": 2.575191020965576 }, { "auxiliary_loss_clip": 0.0122325, "auxiliary_loss_mlp": 0.01052243, "balance_loss_clip": 1.06896913, "balance_loss_mlp": 1.03896892, "epoch": 0.1517465279865328, "flos": 21797130936960.0, "grad_norm": 2.967360782256681, "language_loss": 0.80657965, "learning_rate": 3.8466392212468995e-06, "loss": 0.8293345, "num_input_tokens_seen": 26721885, "step": 1262, "time_per_iteration": 2.636617422103882 }, { "auxiliary_loss_clip": 0.01099058, "auxiliary_loss_mlp": 0.01021221, "balance_loss_clip": 1.03469849, "balance_loss_mlp": 1.01478338, "epoch": 0.15186677087717187, "flos": 58174569901440.0, "grad_norm": 0.820835355365584, "language_loss": 0.61971188, "learning_rate": 3.8463399308238e-06, "loss": 0.64091456, "num_input_tokens_seen": 26780990, "step": 1263, "time_per_iteration": 3.216688632965088 }, { "auxiliary_loss_clip": 0.01218635, "auxiliary_loss_mlp": 0.01041003, "balance_loss_clip": 1.06593084, "balance_loss_mlp": 1.02828312, "epoch": 0.15198701376781099, "flos": 32669696448000.0, "grad_norm": 1.8073297423457657, "language_loss": 0.63923812, "learning_rate": 3.846040360312402e-06, "loss": 0.66183448, "num_input_tokens_seen": 26804250, "step": 1264, "time_per_iteration": 2.70584774017334 }, { "auxiliary_loss_clip": 0.01233819, "auxiliary_loss_mlp": 0.01044424, "balance_loss_clip": 1.06484473, "balance_loss_mlp": 1.03225875, "epoch": 0.15210725665845007, "flos": 28402575431040.0, "grad_norm": 2.3912407129477176, "language_loss": 0.80837631, "learning_rate": 3.8457405097581485e-06, "loss": 0.83115876, "num_input_tokens_seen": 26823240, "step": 1265, "time_per_iteration": 2.632410764694214 }, { "auxiliary_loss_clip": 0.01166951, "auxiliary_loss_mlp": 0.01042903, "balance_loss_clip": 1.05519557, "balance_loss_mlp": 1.03052926, "epoch": 0.15222749954908915, "flos": 19938179393280.0, "grad_norm": 1.7195335719954843, "language_loss": 0.7807039, "learning_rate": 3.8454403792065275e-06, "loss": 0.80280244, "num_input_tokens_seen": 26842060, "step": 1266, "time_per_iteration": 3.5841922760009766 }, { "auxiliary_loss_clip": 0.01172275, "auxiliary_loss_mlp": 0.01047557, "balance_loss_clip": 1.05992889, "balance_loss_mlp": 1.03514171, "epoch": 0.15234774243972826, "flos": 21324223451520.0, "grad_norm": 2.061603601809274, "language_loss": 0.85563016, "learning_rate": 3.845139968703068e-06, "loss": 0.87782848, "num_input_tokens_seen": 26859580, "step": 1267, "time_per_iteration": 2.8589015007019043 }, { "auxiliary_loss_clip": 0.01164411, "auxiliary_loss_mlp": 0.01050542, "balance_loss_clip": 1.05889916, "balance_loss_mlp": 1.03727412, "epoch": 0.15246798533036734, "flos": 25957812977280.0, "grad_norm": 2.7793846973826817, "language_loss": 0.83032262, "learning_rate": 3.844839278293342e-06, "loss": 0.85247219, "num_input_tokens_seen": 26880430, "step": 1268, "time_per_iteration": 3.6799662113189697 }, { "auxiliary_loss_clip": 0.01236201, "auxiliary_loss_mlp": 0.0103958, "balance_loss_clip": 1.06817508, "balance_loss_mlp": 1.02712858, "epoch": 0.15258822822100643, "flos": 25811907932160.0, "grad_norm": 2.40520153432979, "language_loss": 0.76851612, "learning_rate": 3.8445383080229654e-06, "loss": 0.79127389, "num_input_tokens_seen": 26896445, "step": 1269, "time_per_iteration": 3.593707799911499 }, { "auxiliary_loss_clip": 0.01188581, "auxiliary_loss_mlp": 0.01044326, "balance_loss_clip": 1.06014132, "balance_loss_mlp": 1.03119481, "epoch": 0.1527084711116455, "flos": 25265455349760.0, "grad_norm": 2.2866180646220458, "language_loss": 0.73658621, "learning_rate": 3.844237057937593e-06, "loss": 0.75891519, "num_input_tokens_seen": 26915450, "step": 1270, "time_per_iteration": 2.680910587310791 }, { "auxiliary_loss_clip": 0.012218, "auxiliary_loss_mlp": 0.01040044, "balance_loss_clip": 1.06473231, "balance_loss_mlp": 1.02747929, "epoch": 0.15282871400228462, "flos": 29240227572480.0, "grad_norm": 2.3175700137357755, "language_loss": 0.77788568, "learning_rate": 3.843935528082926e-06, "loss": 0.80050415, "num_input_tokens_seen": 26936475, "step": 1271, "time_per_iteration": 2.6239631175994873 }, { "auxiliary_loss_clip": 0.01220916, "auxiliary_loss_mlp": 0.01040726, "balance_loss_clip": 1.06687212, "balance_loss_mlp": 1.02825117, "epoch": 0.1529489568929237, "flos": 20882952869760.0, "grad_norm": 5.301348447972378, "language_loss": 0.85027957, "learning_rate": 3.843633718504704e-06, "loss": 0.87289596, "num_input_tokens_seen": 26954920, "step": 1272, "time_per_iteration": 2.663470506668091 }, { "auxiliary_loss_clip": 0.01186638, "auxiliary_loss_mlp": 0.01039215, "balance_loss_clip": 1.0613997, "balance_loss_mlp": 1.02679372, "epoch": 0.1530691997835628, "flos": 20083833043200.0, "grad_norm": 2.365625462103936, "language_loss": 0.89845479, "learning_rate": 3.843331629248715e-06, "loss": 0.92071331, "num_input_tokens_seen": 26972520, "step": 1273, "time_per_iteration": 2.6704084873199463 }, { "auxiliary_loss_clip": 0.01236001, "auxiliary_loss_mlp": 0.01046238, "balance_loss_clip": 1.06963003, "balance_loss_mlp": 1.03360224, "epoch": 0.1531894426742019, "flos": 28759814144640.0, "grad_norm": 2.4566304525616243, "language_loss": 0.77077144, "learning_rate": 3.843029260360782e-06, "loss": 0.79359388, "num_input_tokens_seen": 26990890, "step": 1274, "time_per_iteration": 2.690626621246338 }, { "auxiliary_loss_clip": 0.01214909, "auxiliary_loss_mlp": 0.0104492, "balance_loss_clip": 1.06599092, "balance_loss_mlp": 1.03291011, "epoch": 0.15330968556484098, "flos": 22236282616320.0, "grad_norm": 1.737661271113986, "language_loss": 0.79158771, "learning_rate": 3.8427266118867755e-06, "loss": 0.81418598, "num_input_tokens_seen": 27010640, "step": 1275, "time_per_iteration": 2.6197736263275146 }, { "auxiliary_loss_clip": 0.0119832, "auxiliary_loss_mlp": 0.01042956, "balance_loss_clip": 1.06243324, "balance_loss_mlp": 1.02973616, "epoch": 0.15342992845548006, "flos": 27527504296320.0, "grad_norm": 1.9122953821356916, "language_loss": 0.82418239, "learning_rate": 3.842423683872608e-06, "loss": 0.84659517, "num_input_tokens_seen": 27031215, "step": 1276, "time_per_iteration": 2.7943172454833984 }, { "auxiliary_loss_clip": 0.01217254, "auxiliary_loss_mlp": 0.01043014, "balance_loss_clip": 1.06462908, "balance_loss_mlp": 1.03049684, "epoch": 0.15355017134611917, "flos": 19609596754560.0, "grad_norm": 2.8814704399199607, "language_loss": 0.7821542, "learning_rate": 3.842120476364232e-06, "loss": 0.80475688, "num_input_tokens_seen": 27049665, "step": 1277, "time_per_iteration": 2.5885379314422607 }, { "auxiliary_loss_clip": 0.01220433, "auxiliary_loss_mlp": 0.01042061, "balance_loss_clip": 1.06353688, "balance_loss_mlp": 1.02940726, "epoch": 0.15367041423675826, "flos": 18478590238080.0, "grad_norm": 2.273138543277162, "language_loss": 0.83862221, "learning_rate": 3.841816989407644e-06, "loss": 0.86124718, "num_input_tokens_seen": 27065155, "step": 1278, "time_per_iteration": 2.6448254585266113 }, { "auxiliary_loss_clip": 0.01175885, "auxiliary_loss_mlp": 0.01042242, "balance_loss_clip": 1.05789948, "balance_loss_mlp": 1.02989185, "epoch": 0.15379065712739734, "flos": 41427662342400.0, "grad_norm": 2.1914102284699815, "language_loss": 0.76764357, "learning_rate": 3.841513223048884e-06, "loss": 0.78982478, "num_input_tokens_seen": 27085840, "step": 1279, "time_per_iteration": 2.8034486770629883 }, { "auxiliary_loss_clip": 0.01175051, "auxiliary_loss_mlp": 0.01048082, "balance_loss_clip": 1.05794895, "balance_loss_mlp": 1.03512394, "epoch": 0.15391090001803642, "flos": 22054215553920.0, "grad_norm": 2.3282230364525534, "language_loss": 0.78693688, "learning_rate": 3.841209177334031e-06, "loss": 0.80916822, "num_input_tokens_seen": 27104200, "step": 1280, "time_per_iteration": 2.693808078765869 }, { "auxiliary_loss_clip": 0.01212934, "auxiliary_loss_mlp": 0.01050515, "balance_loss_clip": 1.0642457, "balance_loss_mlp": 1.03872478, "epoch": 0.15403114290867553, "flos": 15450351258240.0, "grad_norm": 1.7507450052138276, "language_loss": 0.7468369, "learning_rate": 3.84090485230921e-06, "loss": 0.76947141, "num_input_tokens_seen": 27122440, "step": 1281, "time_per_iteration": 2.600878953933716 }, { "auxiliary_loss_clip": 0.01232832, "auxiliary_loss_mlp": 0.01040847, "balance_loss_clip": 1.06685495, "balance_loss_mlp": 1.02766216, "epoch": 0.15415138579931462, "flos": 17929156826880.0, "grad_norm": 2.3007988750811696, "language_loss": 0.76247501, "learning_rate": 3.840600248020588e-06, "loss": 0.78521174, "num_input_tokens_seen": 27139380, "step": 1282, "time_per_iteration": 2.5559616088867188 }, { "auxiliary_loss_clip": 0.01203886, "auxiliary_loss_mlp": 0.01042809, "balance_loss_clip": 1.06103563, "balance_loss_mlp": 1.02966666, "epoch": 0.1542716286899537, "flos": 11429325296640.0, "grad_norm": 2.1557200354165906, "language_loss": 0.7977733, "learning_rate": 3.840295364514371e-06, "loss": 0.82024032, "num_input_tokens_seen": 27156760, "step": 1283, "time_per_iteration": 2.704000234603882 }, { "auxiliary_loss_clip": 0.01197612, "auxiliary_loss_mlp": 0.01043735, "balance_loss_clip": 1.06244719, "balance_loss_mlp": 1.03134894, "epoch": 0.1543918715805928, "flos": 17420338719360.0, "grad_norm": 2.353513658842278, "language_loss": 0.78238523, "learning_rate": 3.83999020183681e-06, "loss": 0.80479872, "num_input_tokens_seen": 27175455, "step": 1284, "time_per_iteration": 2.633074998855591 }, { "auxiliary_loss_clip": 0.01140735, "auxiliary_loss_mlp": 0.01043371, "balance_loss_clip": 1.05399942, "balance_loss_mlp": 1.02981114, "epoch": 0.1545121144712319, "flos": 17786376264960.0, "grad_norm": 2.4936470428207254, "language_loss": 0.78397155, "learning_rate": 3.839684760034199e-06, "loss": 0.80581266, "num_input_tokens_seen": 27193660, "step": 1285, "time_per_iteration": 2.77258563041687 }, { "auxiliary_loss_clip": 0.01176104, "auxiliary_loss_mlp": 0.01037969, "balance_loss_clip": 1.06082261, "balance_loss_mlp": 1.02566683, "epoch": 0.15463235736187098, "flos": 28220185146240.0, "grad_norm": 2.527789389143267, "language_loss": 0.65455627, "learning_rate": 3.8393790391528716e-06, "loss": 0.67669702, "num_input_tokens_seen": 27214355, "step": 1286, "time_per_iteration": 2.796055316925049 }, { "auxiliary_loss_clip": 0.01194805, "auxiliary_loss_mlp": 0.01040713, "balance_loss_clip": 1.0602138, "balance_loss_mlp": 1.02869046, "epoch": 0.15475260025251006, "flos": 22856890826880.0, "grad_norm": 1.8684201315247233, "language_loss": 0.89135742, "learning_rate": 3.8390730392392075e-06, "loss": 0.91371256, "num_input_tokens_seen": 27234335, "step": 1287, "time_per_iteration": 2.704982042312622 }, { "auxiliary_loss_clip": 0.01235338, "auxiliary_loss_mlp": 0.01039671, "balance_loss_clip": 1.06867456, "balance_loss_mlp": 1.02791107, "epoch": 0.15487284314314917, "flos": 17602872658560.0, "grad_norm": 2.07757948509843, "language_loss": 0.79219472, "learning_rate": 3.838766760339626e-06, "loss": 0.81494486, "num_input_tokens_seen": 27252860, "step": 1288, "time_per_iteration": 2.5693795680999756 }, { "auxiliary_loss_clip": 0.01158423, "auxiliary_loss_mlp": 0.01037239, "balance_loss_clip": 1.05379462, "balance_loss_mlp": 1.02461457, "epoch": 0.15499308603378825, "flos": 20082037363200.0, "grad_norm": 3.3459759121282207, "language_loss": 0.79097968, "learning_rate": 3.838460202500587e-06, "loss": 0.81293631, "num_input_tokens_seen": 27268650, "step": 1289, "time_per_iteration": 2.675830841064453 }, { "auxiliary_loss_clip": 0.01181116, "auxiliary_loss_mlp": 0.01045027, "balance_loss_clip": 1.06527591, "balance_loss_mlp": 1.03162217, "epoch": 0.15511332892442733, "flos": 15918051271680.0, "grad_norm": 2.169320413175059, "language_loss": 0.74633813, "learning_rate": 3.838153365768599e-06, "loss": 0.76859957, "num_input_tokens_seen": 27285160, "step": 1290, "time_per_iteration": 2.6248412132263184 }, { "auxiliary_loss_clip": 0.01180181, "auxiliary_loss_mlp": 0.01042601, "balance_loss_clip": 1.06631148, "balance_loss_mlp": 1.03012013, "epoch": 0.15523357181506645, "flos": 41282475569280.0, "grad_norm": 2.1796999010653906, "language_loss": 0.75221193, "learning_rate": 3.837846250190206e-06, "loss": 0.77443975, "num_input_tokens_seen": 27308025, "step": 1291, "time_per_iteration": 3.6986169815063477 }, { "auxiliary_loss_clip": 0.01161057, "auxiliary_loss_mlp": 0.00713666, "balance_loss_clip": 1.05888009, "balance_loss_mlp": 1.0004648, "epoch": 0.15535381470570553, "flos": 18478769806080.0, "grad_norm": 2.5399359616710986, "language_loss": 0.77295232, "learning_rate": 3.837538855811998e-06, "loss": 0.79169953, "num_input_tokens_seen": 27326200, "step": 1292, "time_per_iteration": 3.5793423652648926 }, { "auxiliary_loss_clip": 0.01203252, "auxiliary_loss_mlp": 0.01039105, "balance_loss_clip": 1.06344676, "balance_loss_mlp": 1.02689791, "epoch": 0.1554740575963446, "flos": 13918150759680.0, "grad_norm": 2.430702282434699, "language_loss": 0.70591992, "learning_rate": 3.837231182680606e-06, "loss": 0.72834349, "num_input_tokens_seen": 27344165, "step": 1293, "time_per_iteration": 3.5690155029296875 }, { "auxiliary_loss_clip": 0.0122045, "auxiliary_loss_mlp": 0.01039599, "balance_loss_clip": 1.06618929, "balance_loss_mlp": 1.02796447, "epoch": 0.1555943004869837, "flos": 20847078161280.0, "grad_norm": 1.535505831897656, "language_loss": 0.758385, "learning_rate": 3.836923230842706e-06, "loss": 0.78098547, "num_input_tokens_seen": 27363280, "step": 1294, "time_per_iteration": 2.6800761222839355 }, { "auxiliary_loss_clip": 0.01162019, "auxiliary_loss_mlp": 0.01038927, "balance_loss_clip": 1.05341005, "balance_loss_mlp": 1.02617168, "epoch": 0.1557145433776228, "flos": 22085888371200.0, "grad_norm": 2.018933032628439, "language_loss": 0.80901647, "learning_rate": 3.836615000345011e-06, "loss": 0.83102596, "num_input_tokens_seen": 27381460, "step": 1295, "time_per_iteration": 3.64932918548584 }, { "auxiliary_loss_clip": 0.01231238, "auxiliary_loss_mlp": 0.01040676, "balance_loss_clip": 1.06550217, "balance_loss_mlp": 1.02853501, "epoch": 0.1558347862682619, "flos": 19791987039360.0, "grad_norm": 2.0132014498752637, "language_loss": 0.78285503, "learning_rate": 3.836306491234282e-06, "loss": 0.80557418, "num_input_tokens_seen": 27399310, "step": 1296, "time_per_iteration": 2.5820508003234863 }, { "auxiliary_loss_clip": 0.01192878, "auxiliary_loss_mlp": 0.01036474, "balance_loss_clip": 1.06562233, "balance_loss_mlp": 1.02411842, "epoch": 0.15595502915890097, "flos": 17237086508160.0, "grad_norm": 2.1138587664579758, "language_loss": 0.75316054, "learning_rate": 3.835997703557317e-06, "loss": 0.77545404, "num_input_tokens_seen": 27416050, "step": 1297, "time_per_iteration": 2.627472400665283 }, { "auxiliary_loss_clip": 0.01159946, "auxiliary_loss_mlp": 0.01052849, "balance_loss_clip": 1.05150282, "balance_loss_mlp": 1.04100561, "epoch": 0.15607527204954008, "flos": 19719519350400.0, "grad_norm": 1.7257622680767195, "language_loss": 0.80109984, "learning_rate": 3.83568863736096e-06, "loss": 0.82322776, "num_input_tokens_seen": 27434920, "step": 1298, "time_per_iteration": 2.702059030532837 }, { "auxiliary_loss_clip": 0.01178055, "auxiliary_loss_mlp": 0.01039353, "balance_loss_clip": 1.0550735, "balance_loss_mlp": 1.02706265, "epoch": 0.15619551494017916, "flos": 18515650095360.0, "grad_norm": 2.3296131096562256, "language_loss": 0.89410448, "learning_rate": 3.8353792926920975e-06, "loss": 0.91627854, "num_input_tokens_seen": 27453570, "step": 1299, "time_per_iteration": 2.735001802444458 }, { "auxiliary_loss_clip": 0.01223874, "auxiliary_loss_mlp": 0.01039924, "balance_loss_clip": 1.06758094, "balance_loss_mlp": 1.02712083, "epoch": 0.15631575783081825, "flos": 19902125116800.0, "grad_norm": 2.1052838517806176, "language_loss": 0.8173337, "learning_rate": 3.835069669597655e-06, "loss": 0.83997166, "num_input_tokens_seen": 27471960, "step": 1300, "time_per_iteration": 2.7461888790130615 }, { "auxiliary_loss_clip": 0.0121837, "auxiliary_loss_mlp": 0.00714647, "balance_loss_clip": 1.06223297, "balance_loss_mlp": 1.00051129, "epoch": 0.15643600072145733, "flos": 20777663128320.0, "grad_norm": 2.024075660145853, "language_loss": 0.79634261, "learning_rate": 3.834759768124603e-06, "loss": 0.81567276, "num_input_tokens_seen": 27490835, "step": 1301, "time_per_iteration": 2.7048022747039795 }, { "auxiliary_loss_clip": 0.01184376, "auxiliary_loss_mlp": 0.01041589, "balance_loss_clip": 1.06459737, "balance_loss_mlp": 1.027969, "epoch": 0.15655624361209644, "flos": 18546389159040.0, "grad_norm": 4.149668882753558, "language_loss": 0.76228821, "learning_rate": 3.834449588319953e-06, "loss": 0.78454792, "num_input_tokens_seen": 27508870, "step": 1302, "time_per_iteration": 2.633981466293335 }, { "auxiliary_loss_clip": 0.01211973, "auxiliary_loss_mlp": 0.01047888, "balance_loss_clip": 1.06757545, "balance_loss_mlp": 1.035532, "epoch": 0.15667648650273552, "flos": 25229544727680.0, "grad_norm": 2.2726404669789115, "language_loss": 0.85197699, "learning_rate": 3.834139130230758e-06, "loss": 0.87457567, "num_input_tokens_seen": 27528175, "step": 1303, "time_per_iteration": 2.659503221511841 }, { "auxiliary_loss_clip": 0.01198065, "auxiliary_loss_mlp": 0.01039709, "balance_loss_clip": 1.06007433, "balance_loss_mlp": 1.02686405, "epoch": 0.1567967293933746, "flos": 24827093769600.0, "grad_norm": 1.6702618512800051, "language_loss": 0.81103313, "learning_rate": 3.833828393904117e-06, "loss": 0.83341086, "num_input_tokens_seen": 27548455, "step": 1304, "time_per_iteration": 2.6867823600769043 }, { "auxiliary_loss_clip": 0.0115764, "auxiliary_loss_mlp": 0.01044323, "balance_loss_clip": 1.05524004, "balance_loss_mlp": 1.03180003, "epoch": 0.15691697228401372, "flos": 19164555244800.0, "grad_norm": 2.0806557444626894, "language_loss": 0.77163684, "learning_rate": 3.833517379387165e-06, "loss": 0.79365647, "num_input_tokens_seen": 27564910, "step": 1305, "time_per_iteration": 2.6865596771240234 }, { "auxiliary_loss_clip": 0.01217203, "auxiliary_loss_mlp": 0.01046205, "balance_loss_clip": 1.06407034, "balance_loss_mlp": 1.03397393, "epoch": 0.1570372151746528, "flos": 24790931752320.0, "grad_norm": 2.3688922856692187, "language_loss": 0.88694948, "learning_rate": 3.833206086727085e-06, "loss": 0.90958357, "num_input_tokens_seen": 27584260, "step": 1306, "time_per_iteration": 2.6479432582855225 }, { "auxiliary_loss_clip": 0.0118054, "auxiliary_loss_mlp": 0.01038511, "balance_loss_clip": 1.05626273, "balance_loss_mlp": 1.02642298, "epoch": 0.15715745806529188, "flos": 24863650836480.0, "grad_norm": 2.097339963317741, "language_loss": 0.70440209, "learning_rate": 3.8328945159710994e-06, "loss": 0.72659266, "num_input_tokens_seen": 27604440, "step": 1307, "time_per_iteration": 2.683959722518921 }, { "auxiliary_loss_clip": 0.01224104, "auxiliary_loss_mlp": 0.00713722, "balance_loss_clip": 1.06951332, "balance_loss_mlp": 1.00046098, "epoch": 0.157277700955931, "flos": 21872148491520.0, "grad_norm": 2.0257054459170694, "language_loss": 0.88927174, "learning_rate": 3.832582667166473e-06, "loss": 0.90864998, "num_input_tokens_seen": 27624250, "step": 1308, "time_per_iteration": 2.680676221847534 }, { "auxiliary_loss_clip": 0.01194263, "auxiliary_loss_mlp": 0.0103831, "balance_loss_clip": 1.05996513, "balance_loss_mlp": 1.02568603, "epoch": 0.15739794384657008, "flos": 24533344344960.0, "grad_norm": 2.0412861777240545, "language_loss": 0.81750011, "learning_rate": 3.8322705403605125e-06, "loss": 0.83982581, "num_input_tokens_seen": 27644595, "step": 1309, "time_per_iteration": 2.687138795852661 }, { "auxiliary_loss_clip": 0.01190837, "auxiliary_loss_mlp": 0.01035925, "balance_loss_clip": 1.06164241, "balance_loss_mlp": 1.02454042, "epoch": 0.15751818673720916, "flos": 17745329998080.0, "grad_norm": 2.1896173979136413, "language_loss": 0.81307197, "learning_rate": 3.831958135600568e-06, "loss": 0.83533955, "num_input_tokens_seen": 27662145, "step": 1310, "time_per_iteration": 2.664212942123413 }, { "auxiliary_loss_clip": 0.01218103, "auxiliary_loss_mlp": 0.01045136, "balance_loss_clip": 1.06546652, "balance_loss_mlp": 1.03326845, "epoch": 0.15763842962784824, "flos": 17858520731520.0, "grad_norm": 32.99020578169518, "language_loss": 0.79605603, "learning_rate": 3.831645452934032e-06, "loss": 0.81868845, "num_input_tokens_seen": 27680575, "step": 1311, "time_per_iteration": 2.5868003368377686 }, { "auxiliary_loss_clip": 0.01232502, "auxiliary_loss_mlp": 0.01042986, "balance_loss_clip": 1.06851363, "balance_loss_mlp": 1.0309999, "epoch": 0.15775867251848735, "flos": 26980908059520.0, "grad_norm": 2.207675154065547, "language_loss": 0.79929799, "learning_rate": 3.831332492408336e-06, "loss": 0.8220529, "num_input_tokens_seen": 27701985, "step": 1312, "time_per_iteration": 2.6947505474090576 }, { "auxiliary_loss_clip": 0.01191932, "auxiliary_loss_mlp": 0.01036353, "balance_loss_clip": 1.05986977, "balance_loss_mlp": 1.02450943, "epoch": 0.15787891540912644, "flos": 19240398812160.0, "grad_norm": 1.9269209245852361, "language_loss": 0.6954211, "learning_rate": 3.831019254070957e-06, "loss": 0.71770394, "num_input_tokens_seen": 27719770, "step": 1313, "time_per_iteration": 2.634887933731079 }, { "auxiliary_loss_clip": 0.01166956, "auxiliary_loss_mlp": 0.01040601, "balance_loss_clip": 1.05870557, "balance_loss_mlp": 1.02876925, "epoch": 0.15799915829976552, "flos": 27271102037760.0, "grad_norm": 2.700944607039412, "language_loss": 0.95264769, "learning_rate": 3.8307057379694135e-06, "loss": 0.97472334, "num_input_tokens_seen": 27739105, "step": 1314, "time_per_iteration": 2.7625021934509277 }, { "auxiliary_loss_clip": 0.01230837, "auxiliary_loss_mlp": 0.01037782, "balance_loss_clip": 1.06413174, "balance_loss_mlp": 1.02559853, "epoch": 0.15811940119040463, "flos": 20405520270720.0, "grad_norm": 1.9957051726160573, "language_loss": 0.82182562, "learning_rate": 3.830391944151264e-06, "loss": 0.84451181, "num_input_tokens_seen": 27754985, "step": 1315, "time_per_iteration": 2.5605521202087402 }, { "auxiliary_loss_clip": 0.01199722, "auxiliary_loss_mlp": 0.01037035, "balance_loss_clip": 1.06129026, "balance_loss_mlp": 1.0249474, "epoch": 0.1582396440810437, "flos": 32599347661440.0, "grad_norm": 1.8615540413660174, "language_loss": 0.67417669, "learning_rate": 3.830077872664114e-06, "loss": 0.69654429, "num_input_tokens_seen": 27776110, "step": 1316, "time_per_iteration": 2.735535144805908 }, { "auxiliary_loss_clip": 0.01147829, "auxiliary_loss_mlp": 0.01036625, "balance_loss_clip": 1.05331922, "balance_loss_mlp": 1.02456141, "epoch": 0.1583598869716828, "flos": 33800559310080.0, "grad_norm": 1.704050849363249, "language_loss": 0.73109388, "learning_rate": 3.829763523555604e-06, "loss": 0.75293839, "num_input_tokens_seen": 27796510, "step": 1317, "time_per_iteration": 3.815819025039673 }, { "auxiliary_loss_clip": 0.01212396, "auxiliary_loss_mlp": 0.01040411, "balance_loss_clip": 1.06812978, "balance_loss_mlp": 1.02901435, "epoch": 0.15848012986232188, "flos": 24681332378880.0, "grad_norm": 2.2005947451963417, "language_loss": 0.78305805, "learning_rate": 3.829448896873423e-06, "loss": 0.80558616, "num_input_tokens_seen": 27815610, "step": 1318, "time_per_iteration": 2.6186652183532715 }, { "auxiliary_loss_clip": 0.01152251, "auxiliary_loss_mlp": 0.0071429, "balance_loss_clip": 1.05898309, "balance_loss_mlp": 1.00049031, "epoch": 0.158600372752961, "flos": 22602068766720.0, "grad_norm": 2.2739489457825712, "language_loss": 0.79032028, "learning_rate": 3.829133992665299e-06, "loss": 0.80898571, "num_input_tokens_seen": 27834735, "step": 1319, "time_per_iteration": 3.7739808559417725 }, { "auxiliary_loss_clip": 0.01201661, "auxiliary_loss_mlp": 0.01044046, "balance_loss_clip": 1.0606606, "balance_loss_mlp": 1.03073657, "epoch": 0.15872061564360007, "flos": 27927944092800.0, "grad_norm": 2.175366487353926, "language_loss": 0.88951117, "learning_rate": 3.828818810979002e-06, "loss": 0.91196823, "num_input_tokens_seen": 27853065, "step": 1320, "time_per_iteration": 2.737103223800659 }, { "auxiliary_loss_clip": 0.01231609, "auxiliary_loss_mlp": 0.01042223, "balance_loss_clip": 1.06771564, "balance_loss_mlp": 1.03037333, "epoch": 0.15884085853423915, "flos": 23696805525120.0, "grad_norm": 1.981708524569028, "language_loss": 0.80345476, "learning_rate": 3.8285033518623454e-06, "loss": 0.82619309, "num_input_tokens_seen": 27873315, "step": 1321, "time_per_iteration": 3.5938141345977783 }, { "auxiliary_loss_clip": 0.01220114, "auxiliary_loss_mlp": 0.0104113, "balance_loss_clip": 1.06650662, "balance_loss_mlp": 1.02868485, "epoch": 0.15896110142487826, "flos": 23112359331840.0, "grad_norm": 3.6796706783852478, "language_loss": 0.81389654, "learning_rate": 3.8281876153631845e-06, "loss": 0.83650899, "num_input_tokens_seen": 27890070, "step": 1322, "time_per_iteration": 2.7032833099365234 }, { "auxiliary_loss_clip": 0.0115915, "auxiliary_loss_mlp": 0.01045109, "balance_loss_clip": 1.05627513, "balance_loss_mlp": 1.03240156, "epoch": 0.15908134431551735, "flos": 14685238632960.0, "grad_norm": 2.7622586705056253, "language_loss": 0.6464408, "learning_rate": 3.827871601529416e-06, "loss": 0.66848338, "num_input_tokens_seen": 27908590, "step": 1323, "time_per_iteration": 2.7258217334747314 }, { "auxiliary_loss_clip": 0.01172147, "auxiliary_loss_mlp": 0.0104291, "balance_loss_clip": 1.05724001, "balance_loss_mlp": 1.03097677, "epoch": 0.15920158720615643, "flos": 20193611984640.0, "grad_norm": 1.6578755722378966, "language_loss": 0.80377918, "learning_rate": 3.827555310408979e-06, "loss": 0.82592976, "num_input_tokens_seen": 27927985, "step": 1324, "time_per_iteration": 2.686976432800293 }, { "auxiliary_loss_clip": 0.01178313, "auxiliary_loss_mlp": 0.01035363, "balance_loss_clip": 1.06183219, "balance_loss_mlp": 1.0230422, "epoch": 0.1593218300967955, "flos": 24826626892800.0, "grad_norm": 1.5853599290094602, "language_loss": 0.82740408, "learning_rate": 3.827238742049854e-06, "loss": 0.84954083, "num_input_tokens_seen": 27948280, "step": 1325, "time_per_iteration": 2.702451705932617 }, { "auxiliary_loss_clip": 0.01226112, "auxiliary_loss_mlp": 0.01038295, "balance_loss_clip": 1.0625912, "balance_loss_mlp": 1.02643418, "epoch": 0.15944207298743462, "flos": 28328707111680.0, "grad_norm": 2.5707678663853675, "language_loss": 0.51872659, "learning_rate": 3.826921896500066e-06, "loss": 0.54137063, "num_input_tokens_seen": 27969565, "step": 1326, "time_per_iteration": 2.742062568664551 }, { "auxiliary_loss_clip": 0.01183423, "auxiliary_loss_mlp": 0.01037249, "balance_loss_clip": 1.05890536, "balance_loss_mlp": 1.02473211, "epoch": 0.1595623158780737, "flos": 22964838174720.0, "grad_norm": 1.866201947998354, "language_loss": 0.78340364, "learning_rate": 3.826604773807678e-06, "loss": 0.80561036, "num_input_tokens_seen": 27987540, "step": 1327, "time_per_iteration": 2.7087364196777344 }, { "auxiliary_loss_clip": 0.01194637, "auxiliary_loss_mlp": 0.01044635, "balance_loss_clip": 1.055511, "balance_loss_mlp": 1.0312717, "epoch": 0.1596825587687128, "flos": 19710540950400.0, "grad_norm": 2.681970369090488, "language_loss": 0.7347821, "learning_rate": 3.826287374020798e-06, "loss": 0.75717473, "num_input_tokens_seen": 28002345, "step": 1328, "time_per_iteration": 2.6530563831329346 }, { "auxiliary_loss_clip": 0.01233165, "auxiliary_loss_mlp": 0.01039627, "balance_loss_clip": 1.06704044, "balance_loss_mlp": 1.02728283, "epoch": 0.1598028016593519, "flos": 22637727993600.0, "grad_norm": 1.9568635098876175, "language_loss": 0.82468176, "learning_rate": 3.825969697187575e-06, "loss": 0.84740973, "num_input_tokens_seen": 28021675, "step": 1329, "time_per_iteration": 2.641719102859497 }, { "auxiliary_loss_clip": 0.01176354, "auxiliary_loss_mlp": 0.01043468, "balance_loss_clip": 1.05650616, "balance_loss_mlp": 1.03128505, "epoch": 0.15992304454999098, "flos": 20482908122880.0, "grad_norm": 1.99095977686927, "language_loss": 0.69405389, "learning_rate": 3.8256517433562015e-06, "loss": 0.71625209, "num_input_tokens_seen": 28039615, "step": 1330, "time_per_iteration": 2.695533514022827 }, { "auxiliary_loss_clip": 0.01228074, "auxiliary_loss_mlp": 0.01040883, "balance_loss_clip": 1.06327295, "balance_loss_mlp": 1.02859235, "epoch": 0.16004328744063007, "flos": 17676094533120.0, "grad_norm": 2.392697643119918, "language_loss": 0.91500705, "learning_rate": 3.82533351257491e-06, "loss": 0.93769664, "num_input_tokens_seen": 28057565, "step": 1331, "time_per_iteration": 2.619652032852173 }, { "auxiliary_loss_clip": 0.01213284, "auxiliary_loss_mlp": 0.0104789, "balance_loss_clip": 1.06518543, "balance_loss_mlp": 1.03564715, "epoch": 0.16016353033126918, "flos": 24098717779200.0, "grad_norm": 1.9450444701614449, "language_loss": 0.88453245, "learning_rate": 3.825015004891975e-06, "loss": 0.90714413, "num_input_tokens_seen": 28076305, "step": 1332, "time_per_iteration": 2.6127681732177734 }, { "auxiliary_loss_clip": 0.01208867, "auxiliary_loss_mlp": 0.01037472, "balance_loss_clip": 1.0627799, "balance_loss_mlp": 1.02527714, "epoch": 0.16028377322190826, "flos": 27634841112960.0, "grad_norm": 1.7871225845158092, "language_loss": 0.75720143, "learning_rate": 3.824696220355716e-06, "loss": 0.77966487, "num_input_tokens_seen": 28097895, "step": 1333, "time_per_iteration": 2.659583568572998 }, { "auxiliary_loss_clip": 0.0119423, "auxiliary_loss_mlp": 0.01043272, "balance_loss_clip": 1.06185853, "balance_loss_mlp": 1.03200054, "epoch": 0.16040401611254734, "flos": 20961202648320.0, "grad_norm": 2.0041861311045057, "language_loss": 0.78581429, "learning_rate": 3.824377159014491e-06, "loss": 0.80818927, "num_input_tokens_seen": 28118790, "step": 1334, "time_per_iteration": 2.6520090103149414 }, { "auxiliary_loss_clip": 0.01208654, "auxiliary_loss_mlp": 0.01035421, "balance_loss_clip": 1.06344795, "balance_loss_mlp": 1.02303529, "epoch": 0.16052425900318643, "flos": 21247051080960.0, "grad_norm": 1.765227748650813, "language_loss": 0.84694302, "learning_rate": 3.824057820916702e-06, "loss": 0.86938381, "num_input_tokens_seen": 28135995, "step": 1335, "time_per_iteration": 2.6328649520874023 }, { "auxiliary_loss_clip": 0.0119852, "auxiliary_loss_mlp": 0.01038044, "balance_loss_clip": 1.06245184, "balance_loss_mlp": 1.02536011, "epoch": 0.16064450189382554, "flos": 15524004096000.0, "grad_norm": 2.164489527516578, "language_loss": 0.71703768, "learning_rate": 3.8237382061107904e-06, "loss": 0.73940337, "num_input_tokens_seen": 28152715, "step": 1336, "time_per_iteration": 2.6213417053222656 }, { "auxiliary_loss_clip": 0.0111346, "auxiliary_loss_mlp": 0.01034916, "balance_loss_clip": 1.04604769, "balance_loss_mlp": 1.02287579, "epoch": 0.16076474478446462, "flos": 21178497974400.0, "grad_norm": 1.833692436881882, "language_loss": 0.78618413, "learning_rate": 3.823418314645243e-06, "loss": 0.80766785, "num_input_tokens_seen": 28171590, "step": 1337, "time_per_iteration": 2.8232173919677734 }, { "auxiliary_loss_clip": 0.01142657, "auxiliary_loss_mlp": 0.01032306, "balance_loss_clip": 1.05636811, "balance_loss_mlp": 1.02070141, "epoch": 0.1608849876751037, "flos": 18366476912640.0, "grad_norm": 2.5366160564092706, "language_loss": 0.75333154, "learning_rate": 3.823098146568588e-06, "loss": 0.77508116, "num_input_tokens_seen": 28191295, "step": 1338, "time_per_iteration": 2.738290786743164 }, { "auxiliary_loss_clip": 0.01214121, "auxiliary_loss_mlp": 0.0103915, "balance_loss_clip": 1.06392276, "balance_loss_mlp": 1.02731228, "epoch": 0.1610052305657428, "flos": 29497024880640.0, "grad_norm": 1.7876788395125502, "language_loss": 0.7110101, "learning_rate": 3.822777701929394e-06, "loss": 0.7335428, "num_input_tokens_seen": 28213120, "step": 1339, "time_per_iteration": 2.654264450073242 }, { "auxiliary_loss_clip": 0.01201478, "auxiliary_loss_mlp": 0.01039614, "balance_loss_clip": 1.05998182, "balance_loss_mlp": 1.02739549, "epoch": 0.1611254734563819, "flos": 26797871329920.0, "grad_norm": 1.7564552848924653, "language_loss": 0.73263294, "learning_rate": 3.8224569807762714e-06, "loss": 0.75504386, "num_input_tokens_seen": 28232440, "step": 1340, "time_per_iteration": 2.652513027191162 }, { "auxiliary_loss_clip": 0.01139543, "auxiliary_loss_mlp": 0.01032907, "balance_loss_clip": 1.05113316, "balance_loss_mlp": 1.0209918, "epoch": 0.16124571634702098, "flos": 22419570741120.0, "grad_norm": 2.2641180132805685, "language_loss": 0.76182353, "learning_rate": 3.822135983157873e-06, "loss": 0.783548, "num_input_tokens_seen": 28251715, "step": 1341, "time_per_iteration": 2.754723072052002 }, { "auxiliary_loss_clip": 0.01226734, "auxiliary_loss_mlp": 0.00713556, "balance_loss_clip": 1.06374931, "balance_loss_mlp": 1.00056863, "epoch": 0.16136595923766006, "flos": 10999116103680.0, "grad_norm": 2.2459251365093036, "language_loss": 0.84261751, "learning_rate": 3.821814709122896e-06, "loss": 0.86202037, "num_input_tokens_seen": 28269765, "step": 1342, "time_per_iteration": 2.5353262424468994 }, { "auxiliary_loss_clip": 0.01195426, "auxiliary_loss_mlp": 0.0104972, "balance_loss_clip": 1.06349969, "balance_loss_mlp": 1.03840733, "epoch": 0.16148620212829917, "flos": 21214983214080.0, "grad_norm": 2.2713196223016734, "language_loss": 0.85200465, "learning_rate": 3.821493158720076e-06, "loss": 0.87445617, "num_input_tokens_seen": 28288870, "step": 1343, "time_per_iteration": 4.3472206592559814 }, { "auxiliary_loss_clip": 0.01176415, "auxiliary_loss_mlp": 0.01042589, "balance_loss_clip": 1.05415189, "balance_loss_mlp": 1.03025675, "epoch": 0.16160644501893826, "flos": 16758468760320.0, "grad_norm": 5.175778493178059, "language_loss": 0.73615491, "learning_rate": 3.821171331998191e-06, "loss": 0.75834501, "num_input_tokens_seen": 28305400, "step": 1344, "time_per_iteration": 2.674160957336426 }, { "auxiliary_loss_clip": 0.01094751, "auxiliary_loss_mlp": 0.01011579, "balance_loss_clip": 1.03665924, "balance_loss_mlp": 1.00595212, "epoch": 0.16172668790957734, "flos": 64444967308800.0, "grad_norm": 0.7108042564262799, "language_loss": 0.54459858, "learning_rate": 3.820849229006064e-06, "loss": 0.56566191, "num_input_tokens_seen": 28373150, "step": 1345, "time_per_iteration": 4.214728593826294 }, { "auxiliary_loss_clip": 0.01229174, "auxiliary_loss_mlp": 0.01043113, "balance_loss_clip": 1.0650537, "balance_loss_mlp": 1.03156197, "epoch": 0.16184693080021645, "flos": 23257689759360.0, "grad_norm": 2.252211257972217, "language_loss": 0.70793086, "learning_rate": 3.8205268497925564e-06, "loss": 0.7306537, "num_input_tokens_seen": 28393620, "step": 1346, "time_per_iteration": 2.5965723991394043 }, { "auxiliary_loss_clip": 0.01229377, "auxiliary_loss_mlp": 0.01041429, "balance_loss_clip": 1.06644619, "balance_loss_mlp": 1.02999139, "epoch": 0.16196717369085553, "flos": 17451113696640.0, "grad_norm": 2.5083714174228313, "language_loss": 0.78446293, "learning_rate": 3.8202041944065725e-06, "loss": 0.80717099, "num_input_tokens_seen": 28409440, "step": 1347, "time_per_iteration": 3.5337743759155273 }, { "auxiliary_loss_clip": 0.01229505, "auxiliary_loss_mlp": 0.01037631, "balance_loss_clip": 1.06718433, "balance_loss_mlp": 1.02594244, "epoch": 0.16208741658149461, "flos": 23873377806720.0, "grad_norm": 2.7948279423110436, "language_loss": 0.73880607, "learning_rate": 3.819881262897061e-06, "loss": 0.76147741, "num_input_tokens_seen": 28427575, "step": 1348, "time_per_iteration": 2.587383508682251 }, { "auxiliary_loss_clip": 0.01183593, "auxiliary_loss_mlp": 0.01050386, "balance_loss_clip": 1.06392694, "balance_loss_mlp": 1.03726685, "epoch": 0.1622076594721337, "flos": 25884806584320.0, "grad_norm": 2.0025047464084342, "language_loss": 0.73579484, "learning_rate": 3.819558055313008e-06, "loss": 0.7581346, "num_input_tokens_seen": 28448260, "step": 1349, "time_per_iteration": 2.7642173767089844 }, { "auxiliary_loss_clip": 0.01216824, "auxiliary_loss_mlp": 0.01037588, "balance_loss_clip": 1.06375241, "balance_loss_mlp": 1.0260371, "epoch": 0.1623279023627728, "flos": 21539759011200.0, "grad_norm": 2.7858198480804126, "language_loss": 0.77669472, "learning_rate": 3.819234571703444e-06, "loss": 0.7992388, "num_input_tokens_seen": 28467085, "step": 1350, "time_per_iteration": 2.662081718444824 }, { "auxiliary_loss_clip": 0.01203451, "auxiliary_loss_mlp": 0.01033837, "balance_loss_clip": 1.06056857, "balance_loss_mlp": 1.02168918, "epoch": 0.1624481452534119, "flos": 22085421494400.0, "grad_norm": 1.9436179301498135, "language_loss": 0.85430205, "learning_rate": 3.8189108121174435e-06, "loss": 0.87667495, "num_input_tokens_seen": 28486850, "step": 1351, "time_per_iteration": 2.6176276206970215 }, { "auxiliary_loss_clip": 0.0117507, "auxiliary_loss_mlp": 0.01038646, "balance_loss_clip": 1.06414628, "balance_loss_mlp": 1.02748775, "epoch": 0.16256838814405097, "flos": 27087490690560.0, "grad_norm": 1.6829829689196487, "language_loss": 0.83792084, "learning_rate": 3.818586776604118e-06, "loss": 0.86005795, "num_input_tokens_seen": 28507490, "step": 1352, "time_per_iteration": 2.783865213394165 }, { "auxiliary_loss_clip": 0.01187996, "auxiliary_loss_mlp": 0.01037627, "balance_loss_clip": 1.05961919, "balance_loss_mlp": 1.02598619, "epoch": 0.16268863103469008, "flos": 20120354196480.0, "grad_norm": 1.7685169206705706, "language_loss": 0.61672235, "learning_rate": 3.818262465212625e-06, "loss": 0.63897854, "num_input_tokens_seen": 28527615, "step": 1353, "time_per_iteration": 2.7535171508789062 }, { "auxiliary_loss_clip": 0.01203737, "auxiliary_loss_mlp": 0.0103779, "balance_loss_clip": 1.0640986, "balance_loss_mlp": 1.02551186, "epoch": 0.16280887392532917, "flos": 18332792933760.0, "grad_norm": 1.9715150120861995, "language_loss": 0.77052903, "learning_rate": 3.817937877992161e-06, "loss": 0.79294431, "num_input_tokens_seen": 28544910, "step": 1354, "time_per_iteration": 2.6001813411712646 }, { "auxiliary_loss_clip": 0.01171612, "auxiliary_loss_mlp": 0.00713847, "balance_loss_clip": 1.05497921, "balance_loss_mlp": 1.00053787, "epoch": 0.16292911681596825, "flos": 11874330892800.0, "grad_norm": 2.312448564962149, "language_loss": 0.86151552, "learning_rate": 3.817613014991967e-06, "loss": 0.88037008, "num_input_tokens_seen": 28561050, "step": 1355, "time_per_iteration": 2.6918442249298096 }, { "auxiliary_loss_clip": 0.01167051, "auxiliary_loss_mlp": 0.01044024, "balance_loss_clip": 1.05669153, "balance_loss_mlp": 1.03138793, "epoch": 0.16304935970660733, "flos": 26103466627200.0, "grad_norm": 2.1273049982332966, "language_loss": 0.76592749, "learning_rate": 3.817287876261323e-06, "loss": 0.78803819, "num_input_tokens_seen": 28581385, "step": 1356, "time_per_iteration": 2.7972335815429688 }, { "auxiliary_loss_clip": 0.01191071, "auxiliary_loss_mlp": 0.01044013, "balance_loss_clip": 1.06349826, "balance_loss_mlp": 1.03177035, "epoch": 0.16316960259724644, "flos": 29351945848320.0, "grad_norm": 1.8401334135009408, "language_loss": 0.79875898, "learning_rate": 3.816962461849553e-06, "loss": 0.82110989, "num_input_tokens_seen": 28603255, "step": 1357, "time_per_iteration": 2.696160078048706 }, { "auxiliary_loss_clip": 0.01187023, "auxiliary_loss_mlp": 0.01036784, "balance_loss_clip": 1.06164062, "balance_loss_mlp": 1.02544725, "epoch": 0.16328984548788553, "flos": 20886759711360.0, "grad_norm": 2.413941180454956, "language_loss": 0.84168643, "learning_rate": 3.8166367718060235e-06, "loss": 0.8639245, "num_input_tokens_seen": 28623145, "step": 1358, "time_per_iteration": 2.6468546390533447 }, { "auxiliary_loss_clip": 0.01205136, "auxiliary_loss_mlp": 0.01048569, "balance_loss_clip": 1.06038535, "balance_loss_mlp": 1.0364871, "epoch": 0.1634100883785246, "flos": 18041090584320.0, "grad_norm": 4.856593923478398, "language_loss": 0.76888454, "learning_rate": 3.816310806180139e-06, "loss": 0.79142153, "num_input_tokens_seen": 28641555, "step": 1359, "time_per_iteration": 2.6145622730255127 }, { "auxiliary_loss_clip": 0.01194066, "auxiliary_loss_mlp": 0.01038681, "balance_loss_clip": 1.06372273, "balance_loss_mlp": 1.02706969, "epoch": 0.16353033126916372, "flos": 24572128055040.0, "grad_norm": 1.638703983120611, "language_loss": 0.81048816, "learning_rate": 3.81598456502135e-06, "loss": 0.83281559, "num_input_tokens_seen": 28661575, "step": 1360, "time_per_iteration": 2.6595280170440674 }, { "auxiliary_loss_clip": 0.01190193, "auxiliary_loss_mlp": 0.01038442, "balance_loss_clip": 1.06363702, "balance_loss_mlp": 1.02658677, "epoch": 0.1636505741598028, "flos": 19892895321600.0, "grad_norm": 1.9859060193145042, "language_loss": 0.87059081, "learning_rate": 3.8156580483791455e-06, "loss": 0.89287716, "num_input_tokens_seen": 28676765, "step": 1361, "time_per_iteration": 2.62539005279541 }, { "auxiliary_loss_clip": 0.01228425, "auxiliary_loss_mlp": 0.01035691, "balance_loss_clip": 1.06561923, "balance_loss_mlp": 1.02419293, "epoch": 0.16377081705044189, "flos": 28402611344640.0, "grad_norm": 1.9594404608932117, "language_loss": 0.77304482, "learning_rate": 3.815331256303059e-06, "loss": 0.79568601, "num_input_tokens_seen": 28696795, "step": 1362, "time_per_iteration": 2.6659226417541504 }, { "auxiliary_loss_clip": 0.01174914, "auxiliary_loss_mlp": 0.01044689, "balance_loss_clip": 1.06288838, "balance_loss_mlp": 1.03299499, "epoch": 0.163891059941081, "flos": 21908059113600.0, "grad_norm": 2.153247641196982, "language_loss": 0.76938891, "learning_rate": 3.815004188842665e-06, "loss": 0.79158497, "num_input_tokens_seen": 28714835, "step": 1363, "time_per_iteration": 2.7535150051116943 }, { "auxiliary_loss_clip": 0.0118635, "auxiliary_loss_mlp": 0.0104205, "balance_loss_clip": 1.05762422, "balance_loss_mlp": 1.02985477, "epoch": 0.16401130283172008, "flos": 26797619934720.0, "grad_norm": 1.534861631604857, "language_loss": 0.79554749, "learning_rate": 3.814676846047578e-06, "loss": 0.81783146, "num_input_tokens_seen": 28735710, "step": 1364, "time_per_iteration": 2.695348024368286 }, { "auxiliary_loss_clip": 0.01208715, "auxiliary_loss_mlp": 0.01037667, "balance_loss_clip": 1.06347251, "balance_loss_mlp": 1.02626443, "epoch": 0.16413154572235916, "flos": 32997417160320.0, "grad_norm": 1.7352947921920427, "language_loss": 0.6944952, "learning_rate": 3.8143492279674565e-06, "loss": 0.716959, "num_input_tokens_seen": 28758405, "step": 1365, "time_per_iteration": 2.740950345993042 }, { "auxiliary_loss_clip": 0.01097316, "auxiliary_loss_mlp": 0.01007234, "balance_loss_clip": 1.04348588, "balance_loss_mlp": 1.00174999, "epoch": 0.16425178861299825, "flos": 40113622074240.0, "grad_norm": 1.396942425490369, "language_loss": 0.58395171, "learning_rate": 3.8140213346519997e-06, "loss": 0.60499716, "num_input_tokens_seen": 28809000, "step": 1366, "time_per_iteration": 2.9940879344940186 }, { "auxiliary_loss_clip": 0.01169042, "auxiliary_loss_mlp": 0.01038153, "balance_loss_clip": 1.05953205, "balance_loss_mlp": 1.0270431, "epoch": 0.16437203150363736, "flos": 25447486498560.0, "grad_norm": 1.7100004049419157, "language_loss": 0.76815981, "learning_rate": 3.813693166150948e-06, "loss": 0.7902317, "num_input_tokens_seen": 28829210, "step": 1367, "time_per_iteration": 2.8265256881713867 }, { "auxiliary_loss_clip": 0.01167911, "auxiliary_loss_mlp": 0.01042154, "balance_loss_clip": 1.05890584, "balance_loss_mlp": 1.02983356, "epoch": 0.16449227439427644, "flos": 23476888506240.0, "grad_norm": 2.190990093983896, "language_loss": 0.85398978, "learning_rate": 3.813364722514086e-06, "loss": 0.87609041, "num_input_tokens_seen": 28847545, "step": 1368, "time_per_iteration": 2.8346028327941895 }, { "auxiliary_loss_clip": 0.01208179, "auxiliary_loss_mlp": 0.01036503, "balance_loss_clip": 1.06294334, "balance_loss_mlp": 1.02504146, "epoch": 0.16461251728491552, "flos": 13545217802880.0, "grad_norm": 2.102909319507143, "language_loss": 0.80171168, "learning_rate": 3.8130360037912368e-06, "loss": 0.82415855, "num_input_tokens_seen": 28863990, "step": 1369, "time_per_iteration": 4.384136438369751 }, { "auxiliary_loss_clip": 0.01210402, "auxiliary_loss_mlp": 0.01039747, "balance_loss_clip": 1.06318569, "balance_loss_mlp": 1.02830315, "epoch": 0.16473276017555463, "flos": 23003298662400.0, "grad_norm": 2.520990868435035, "language_loss": 0.81705225, "learning_rate": 3.812707010032268e-06, "loss": 0.83955371, "num_input_tokens_seen": 28883045, "step": 1370, "time_per_iteration": 3.6074957847595215 }, { "auxiliary_loss_clip": 0.01216447, "auxiliary_loss_mlp": 0.01051341, "balance_loss_clip": 1.06668079, "balance_loss_mlp": 1.03900862, "epoch": 0.16485300306619372, "flos": 24790680357120.0, "grad_norm": 1.8057368957348081, "language_loss": 0.79479361, "learning_rate": 3.8123777412870863e-06, "loss": 0.8174715, "num_input_tokens_seen": 28902545, "step": 1371, "time_per_iteration": 2.6451265811920166 }, { "auxiliary_loss_clip": 0.01195497, "auxiliary_loss_mlp": 0.01046055, "balance_loss_clip": 1.060094, "balance_loss_mlp": 1.03399062, "epoch": 0.1649732459568328, "flos": 21106497162240.0, "grad_norm": 1.9349147548101622, "language_loss": 0.78014284, "learning_rate": 3.812048197605643e-06, "loss": 0.8025583, "num_input_tokens_seen": 28921440, "step": 1372, "time_per_iteration": 2.6275711059570312 }, { "auxiliary_loss_clip": 0.01207847, "auxiliary_loss_mlp": 0.01035808, "balance_loss_clip": 1.0621376, "balance_loss_mlp": 1.0244776, "epoch": 0.16509348884747188, "flos": 20266726118400.0, "grad_norm": 2.3658607498710102, "language_loss": 0.81361765, "learning_rate": 3.8117183790379277e-06, "loss": 0.83605421, "num_input_tokens_seen": 28939890, "step": 1373, "time_per_iteration": 3.5191071033477783 }, { "auxiliary_loss_clip": 0.01229283, "auxiliary_loss_mlp": 0.01038581, "balance_loss_clip": 1.06659627, "balance_loss_mlp": 1.02704799, "epoch": 0.165213731738111, "flos": 11035493602560.0, "grad_norm": 2.852533497671908, "language_loss": 0.93794155, "learning_rate": 3.811388285633976e-06, "loss": 0.96062016, "num_input_tokens_seen": 28955875, "step": 1374, "time_per_iteration": 2.5363059043884277 }, { "auxiliary_loss_clip": 0.01164159, "auxiliary_loss_mlp": 0.01035136, "balance_loss_clip": 1.05923843, "balance_loss_mlp": 1.02283978, "epoch": 0.16533397462875007, "flos": 29972051268480.0, "grad_norm": 1.9586562257563982, "language_loss": 0.61956573, "learning_rate": 3.811057917443861e-06, "loss": 0.64155865, "num_input_tokens_seen": 28975140, "step": 1375, "time_per_iteration": 2.802551507949829 }, { "auxiliary_loss_clip": 0.01113262, "auxiliary_loss_mlp": 0.01010505, "balance_loss_clip": 1.04104114, "balance_loss_mlp": 1.0048548, "epoch": 0.16545421751938916, "flos": 65556763027200.0, "grad_norm": 0.8526114755737236, "language_loss": 0.68249619, "learning_rate": 3.8107272745177e-06, "loss": 0.70373386, "num_input_tokens_seen": 29047470, "step": 1376, "time_per_iteration": 3.3565104007720947 }, { "auxiliary_loss_clip": 0.01176751, "auxiliary_loss_mlp": 0.01041874, "balance_loss_clip": 1.05979133, "balance_loss_mlp": 1.03053772, "epoch": 0.16557446041002827, "flos": 22492361652480.0, "grad_norm": 1.7853102808073007, "language_loss": 0.78678644, "learning_rate": 3.8103963569056513e-06, "loss": 0.80897272, "num_input_tokens_seen": 29066605, "step": 1377, "time_per_iteration": 2.6906847953796387 }, { "auxiliary_loss_clip": 0.01182142, "auxiliary_loss_mlp": 0.01037378, "balance_loss_clip": 1.05531442, "balance_loss_mlp": 1.02668524, "epoch": 0.16569470330066735, "flos": 24602723464320.0, "grad_norm": 1.650067516566532, "language_loss": 0.88199872, "learning_rate": 3.8100651646579146e-06, "loss": 0.904194, "num_input_tokens_seen": 29085815, "step": 1378, "time_per_iteration": 2.706163167953491 }, { "auxiliary_loss_clip": 0.01185855, "auxiliary_loss_mlp": 0.01040028, "balance_loss_clip": 1.05776536, "balance_loss_mlp": 1.02844071, "epoch": 0.16581494619130643, "flos": 15006207588480.0, "grad_norm": 2.005010837897673, "language_loss": 0.92212343, "learning_rate": 3.8097336978247317e-06, "loss": 0.94438231, "num_input_tokens_seen": 29102520, "step": 1379, "time_per_iteration": 2.6270902156829834 }, { "auxiliary_loss_clip": 0.01177513, "auxiliary_loss_mlp": 0.01037548, "balance_loss_clip": 1.05623686, "balance_loss_mlp": 1.0252161, "epoch": 0.16593518908194552, "flos": 17420338719360.0, "grad_norm": 3.8086634504186287, "language_loss": 0.89141667, "learning_rate": 3.8094019564563854e-06, "loss": 0.91356725, "num_input_tokens_seen": 29119450, "step": 1380, "time_per_iteration": 2.6238815784454346 }, { "auxiliary_loss_clip": 0.01225541, "auxiliary_loss_mlp": 0.00713774, "balance_loss_clip": 1.06309211, "balance_loss_mlp": 1.00069976, "epoch": 0.16605543197258463, "flos": 20412631163520.0, "grad_norm": 2.4504526222015737, "language_loss": 0.75456268, "learning_rate": 3.809069940603201e-06, "loss": 0.77395582, "num_input_tokens_seen": 29137405, "step": 1381, "time_per_iteration": 2.5694615840911865 }, { "auxiliary_loss_clip": 0.01180571, "auxiliary_loss_mlp": 0.01036969, "balance_loss_clip": 1.05899882, "balance_loss_mlp": 1.02525067, "epoch": 0.1661756748632237, "flos": 14209745368320.0, "grad_norm": 2.0438063520820577, "language_loss": 0.77990627, "learning_rate": 3.8087376503155452e-06, "loss": 0.8020817, "num_input_tokens_seen": 29154890, "step": 1382, "time_per_iteration": 2.654615640640259 }, { "auxiliary_loss_clip": 0.01099998, "auxiliary_loss_mlp": 0.01010144, "balance_loss_clip": 1.03401828, "balance_loss_mlp": 1.00466084, "epoch": 0.1662959177538628, "flos": 66080877350400.0, "grad_norm": 0.9011919433173057, "language_loss": 0.56232232, "learning_rate": 3.808405085643826e-06, "loss": 0.58342373, "num_input_tokens_seen": 29219770, "step": 1383, "time_per_iteration": 3.257272958755493 }, { "auxiliary_loss_clip": 0.01227932, "auxiliary_loss_mlp": 0.00714183, "balance_loss_clip": 1.06487179, "balance_loss_mlp": 1.00062954, "epoch": 0.1664161606445019, "flos": 20740567357440.0, "grad_norm": 2.0569474783144517, "language_loss": 0.88968307, "learning_rate": 3.8080722466384925e-06, "loss": 0.90910423, "num_input_tokens_seen": 29237620, "step": 1384, "time_per_iteration": 2.6598429679870605 }, { "auxiliary_loss_clip": 0.01227127, "auxiliary_loss_mlp": 0.01042202, "balance_loss_clip": 1.0619812, "balance_loss_mlp": 1.02937496, "epoch": 0.166536403535141, "flos": 25260930236160.0, "grad_norm": 2.283169652818388, "language_loss": 0.71060777, "learning_rate": 3.8077391333500376e-06, "loss": 0.73330104, "num_input_tokens_seen": 29256760, "step": 1385, "time_per_iteration": 2.632312059402466 }, { "auxiliary_loss_clip": 0.01193794, "auxiliary_loss_mlp": 0.01041597, "balance_loss_clip": 1.06317902, "balance_loss_mlp": 1.02981925, "epoch": 0.16665664642578007, "flos": 25447450584960.0, "grad_norm": 1.6261675480582094, "language_loss": 0.76360172, "learning_rate": 3.8074057458289934e-06, "loss": 0.78595567, "num_input_tokens_seen": 29277450, "step": 1386, "time_per_iteration": 2.6891913414001465 }, { "auxiliary_loss_clip": 0.01189257, "auxiliary_loss_mlp": 0.01038983, "balance_loss_clip": 1.0573988, "balance_loss_mlp": 1.02693725, "epoch": 0.16677688931641918, "flos": 22200767043840.0, "grad_norm": 1.9695768279233352, "language_loss": 0.82791412, "learning_rate": 3.807072084125934e-06, "loss": 0.85019648, "num_input_tokens_seen": 29299300, "step": 1387, "time_per_iteration": 2.6483917236328125 }, { "auxiliary_loss_clip": 0.01191764, "auxiliary_loss_mlp": 0.01036496, "balance_loss_clip": 1.06304574, "balance_loss_mlp": 1.02485561, "epoch": 0.16689713220705826, "flos": 16945958776320.0, "grad_norm": 4.05076171817723, "language_loss": 0.80344093, "learning_rate": 3.806738148291477e-06, "loss": 0.82572347, "num_input_tokens_seen": 29316125, "step": 1388, "time_per_iteration": 2.6413674354553223 }, { "auxiliary_loss_clip": 0.01142279, "auxiliary_loss_mlp": 0.01045864, "balance_loss_clip": 1.05395103, "balance_loss_mlp": 1.03303742, "epoch": 0.16701737509769735, "flos": 36244423923840.0, "grad_norm": 1.9389792453951593, "language_loss": 0.71256018, "learning_rate": 3.8064039383762793e-06, "loss": 0.73444158, "num_input_tokens_seen": 29338490, "step": 1389, "time_per_iteration": 2.8473618030548096 }, { "auxiliary_loss_clip": 0.01212168, "auxiliary_loss_mlp": 0.01034866, "balance_loss_clip": 1.06714511, "balance_loss_mlp": 1.02331495, "epoch": 0.16713761798833643, "flos": 23258659426560.0, "grad_norm": 4.167691849863895, "language_loss": 0.76761043, "learning_rate": 3.8060694544310396e-06, "loss": 0.79008079, "num_input_tokens_seen": 29357000, "step": 1390, "time_per_iteration": 2.611990213394165 }, { "auxiliary_loss_clip": 0.01228696, "auxiliary_loss_mlp": 0.01049572, "balance_loss_clip": 1.06509006, "balance_loss_mlp": 1.03672147, "epoch": 0.16725786087897554, "flos": 25302515207040.0, "grad_norm": 1.7402799290297424, "language_loss": 0.78513968, "learning_rate": 3.8057346965065006e-06, "loss": 0.80792236, "num_input_tokens_seen": 29378230, "step": 1391, "time_per_iteration": 2.6637141704559326 }, { "auxiliary_loss_clip": 0.01189112, "auxiliary_loss_mlp": 0.01036872, "balance_loss_clip": 1.05799174, "balance_loss_mlp": 1.02570248, "epoch": 0.16737810376961462, "flos": 31831541516160.0, "grad_norm": 1.637625216919347, "language_loss": 0.84684587, "learning_rate": 3.805399664653443e-06, "loss": 0.8691057, "num_input_tokens_seen": 29400370, "step": 1392, "time_per_iteration": 2.7015633583068848 }, { "auxiliary_loss_clip": 0.01226601, "auxiliary_loss_mlp": 0.01040126, "balance_loss_clip": 1.06326175, "balance_loss_mlp": 1.02737713, "epoch": 0.1674983466602537, "flos": 27961843553280.0, "grad_norm": 3.231645889658256, "language_loss": 0.74346268, "learning_rate": 3.805064358922692e-06, "loss": 0.76612997, "num_input_tokens_seen": 29418660, "step": 1393, "time_per_iteration": 2.6535680294036865 }, { "auxiliary_loss_clip": 0.01213387, "auxiliary_loss_mlp": 0.01038, "balance_loss_clip": 1.06183231, "balance_loss_mlp": 1.02611542, "epoch": 0.16761858955089282, "flos": 21762656858880.0, "grad_norm": 1.7048593107533916, "language_loss": 0.80931312, "learning_rate": 3.8047287793651136e-06, "loss": 0.83182698, "num_input_tokens_seen": 29440105, "step": 1394, "time_per_iteration": 2.6154096126556396 }, { "auxiliary_loss_clip": 0.01180275, "auxiliary_loss_mlp": 0.01043207, "balance_loss_clip": 1.060462, "balance_loss_mlp": 1.03153634, "epoch": 0.1677388324415319, "flos": 23805507058560.0, "grad_norm": 2.3152803148999967, "language_loss": 0.88566744, "learning_rate": 3.8043929260316137e-06, "loss": 0.9079023, "num_input_tokens_seen": 29458260, "step": 1395, "time_per_iteration": 4.422821283340454 }, { "auxiliary_loss_clip": 0.01197046, "auxiliary_loss_mlp": 0.01038305, "balance_loss_clip": 1.06684446, "balance_loss_mlp": 1.02672386, "epoch": 0.16785907533217098, "flos": 20558859431040.0, "grad_norm": 2.2235864424413854, "language_loss": 0.83258057, "learning_rate": 3.8040567989731417e-06, "loss": 0.85493416, "num_input_tokens_seen": 29476205, "step": 1396, "time_per_iteration": 3.5233306884765625 }, { "auxiliary_loss_clip": 0.01203399, "auxiliary_loss_mlp": 0.01033389, "balance_loss_clip": 1.06037903, "balance_loss_mlp": 1.02264214, "epoch": 0.16797931822281006, "flos": 15669657745920.0, "grad_norm": 2.2868626534958665, "language_loss": 0.79690003, "learning_rate": 3.8037203982406876e-06, "loss": 0.81926787, "num_input_tokens_seen": 29494370, "step": 1397, "time_per_iteration": 2.751006603240967 }, { "auxiliary_loss_clip": 0.01226614, "auxiliary_loss_mlp": 0.01034683, "balance_loss_clip": 1.06477404, "balance_loss_mlp": 1.02295899, "epoch": 0.16809956111344918, "flos": 16541101607040.0, "grad_norm": 1.8793474927943354, "language_loss": 0.73221803, "learning_rate": 3.8033837238852835e-06, "loss": 0.75483096, "num_input_tokens_seen": 29511070, "step": 1398, "time_per_iteration": 2.610044240951538 }, { "auxiliary_loss_clip": 0.01178371, "auxiliary_loss_mlp": 0.01035421, "balance_loss_clip": 1.05600572, "balance_loss_mlp": 1.02419174, "epoch": 0.16821980400408826, "flos": 23258084808960.0, "grad_norm": 1.9761329159912997, "language_loss": 0.69412887, "learning_rate": 3.8030467759580017e-06, "loss": 0.71626681, "num_input_tokens_seen": 29531990, "step": 1399, "time_per_iteration": 2.632361888885498 }, { "auxiliary_loss_clip": 0.01213449, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.06259012, "balance_loss_mlp": 1.02451587, "epoch": 0.16834004689472734, "flos": 20774754126720.0, "grad_norm": 1.9277400326123686, "language_loss": 0.86548197, "learning_rate": 3.802709554509958e-06, "loss": 0.88798106, "num_input_tokens_seen": 29549790, "step": 1400, "time_per_iteration": 3.559342622756958 }, { "auxiliary_loss_clip": 0.01186776, "auxiliary_loss_mlp": 0.01034984, "balance_loss_clip": 1.05662918, "balance_loss_mlp": 1.02314699, "epoch": 0.16846028978536645, "flos": 26687302289280.0, "grad_norm": 2.0240786747447177, "language_loss": 0.78972352, "learning_rate": 3.8023720595923083e-06, "loss": 0.81194115, "num_input_tokens_seen": 29569045, "step": 1401, "time_per_iteration": 2.7160794734954834 }, { "auxiliary_loss_clip": 0.01158229, "auxiliary_loss_mlp": 0.01036205, "balance_loss_clip": 1.05510843, "balance_loss_mlp": 1.02467108, "epoch": 0.16858053267600553, "flos": 18843298980480.0, "grad_norm": 3.9182895730251417, "language_loss": 0.87355828, "learning_rate": 3.80203429125625e-06, "loss": 0.89550269, "num_input_tokens_seen": 29587220, "step": 1402, "time_per_iteration": 2.6744344234466553 }, { "auxiliary_loss_clip": 0.01131683, "auxiliary_loss_mlp": 0.01038213, "balance_loss_clip": 1.05344152, "balance_loss_mlp": 1.02738297, "epoch": 0.16870077556664462, "flos": 27744548227200.0, "grad_norm": 1.944643886742212, "language_loss": 0.70234287, "learning_rate": 3.8016962495530225e-06, "loss": 0.72404182, "num_input_tokens_seen": 29606410, "step": 1403, "time_per_iteration": 2.8155956268310547 }, { "auxiliary_loss_clip": 0.01225801, "auxiliary_loss_mlp": 0.01049647, "balance_loss_clip": 1.06343007, "balance_loss_mlp": 1.03745794, "epoch": 0.1688210184572837, "flos": 13730768484480.0, "grad_norm": 2.4025210260171455, "language_loss": 0.76098144, "learning_rate": 3.8013579345339063e-06, "loss": 0.78373593, "num_input_tokens_seen": 29621275, "step": 1404, "time_per_iteration": 2.5476980209350586 }, { "auxiliary_loss_clip": 0.01180177, "auxiliary_loss_mlp": 0.01035768, "balance_loss_clip": 1.06045699, "balance_loss_mlp": 1.02359629, "epoch": 0.1689412613479228, "flos": 26468785900800.0, "grad_norm": 2.1742312080845414, "language_loss": 0.69234312, "learning_rate": 3.801019346250224e-06, "loss": 0.71450257, "num_input_tokens_seen": 29641420, "step": 1405, "time_per_iteration": 2.7545855045318604 }, { "auxiliary_loss_clip": 0.01206215, "auxiliary_loss_mlp": 0.01036726, "balance_loss_clip": 1.06265616, "balance_loss_mlp": 1.02534735, "epoch": 0.1690615042385619, "flos": 21138852337920.0, "grad_norm": 2.259388915184904, "language_loss": 0.83776659, "learning_rate": 3.8006804847533395e-06, "loss": 0.86019599, "num_input_tokens_seen": 29660935, "step": 1406, "time_per_iteration": 2.6346514225006104 }, { "auxiliary_loss_clip": 0.01225539, "auxiliary_loss_mlp": 0.01038066, "balance_loss_clip": 1.06468844, "balance_loss_mlp": 1.02637136, "epoch": 0.16918174712920098, "flos": 20849340718080.0, "grad_norm": 1.859738188319621, "language_loss": 0.85558403, "learning_rate": 3.8003413500946556e-06, "loss": 0.87822008, "num_input_tokens_seen": 29681045, "step": 1407, "time_per_iteration": 2.6494882106781006 }, { "auxiliary_loss_clip": 0.01195712, "auxiliary_loss_mlp": 0.01037928, "balance_loss_clip": 1.06298018, "balance_loss_mlp": 1.02665114, "epoch": 0.1693019900198401, "flos": 16983270028800.0, "grad_norm": 2.6628809717951967, "language_loss": 0.82818735, "learning_rate": 3.8000019423256216e-06, "loss": 0.85052371, "num_input_tokens_seen": 29698810, "step": 1408, "time_per_iteration": 2.685225009918213 }, { "auxiliary_loss_clip": 0.01182269, "auxiliary_loss_mlp": 0.0103935, "balance_loss_clip": 1.06030226, "balance_loss_mlp": 1.02831137, "epoch": 0.16942223291047917, "flos": 26796901662720.0, "grad_norm": 2.1497733616422825, "language_loss": 0.88126528, "learning_rate": 3.7996622614977234e-06, "loss": 0.90348148, "num_input_tokens_seen": 29720000, "step": 1409, "time_per_iteration": 2.726912021636963 }, { "auxiliary_loss_clip": 0.01194754, "auxiliary_loss_mlp": 0.01044294, "balance_loss_clip": 1.06575596, "balance_loss_mlp": 1.0322243, "epoch": 0.16954247580111825, "flos": 18583700411520.0, "grad_norm": 1.7994819411871548, "language_loss": 0.78907061, "learning_rate": 3.799322307662492e-06, "loss": 0.81146115, "num_input_tokens_seen": 29737820, "step": 1410, "time_per_iteration": 2.629979372024536 }, { "auxiliary_loss_clip": 0.01160779, "auxiliary_loss_mlp": 0.01042677, "balance_loss_clip": 1.05621684, "balance_loss_mlp": 1.03076255, "epoch": 0.16966271869175734, "flos": 13983651210240.0, "grad_norm": 2.1622514077574393, "language_loss": 0.84161431, "learning_rate": 3.798982080871496e-06, "loss": 0.86364889, "num_input_tokens_seen": 29752960, "step": 1411, "time_per_iteration": 2.7196121215820312 }, { "auxiliary_loss_clip": 0.01228383, "auxiliary_loss_mlp": 0.01037811, "balance_loss_clip": 1.06559777, "balance_loss_mlp": 1.02562761, "epoch": 0.16978296158239645, "flos": 37487328284160.0, "grad_norm": 1.9448060540109358, "language_loss": 0.67701781, "learning_rate": 3.798641581176349e-06, "loss": 0.69967967, "num_input_tokens_seen": 29775240, "step": 1412, "time_per_iteration": 2.745699167251587 }, { "auxiliary_loss_clip": 0.01192415, "auxiliary_loss_mlp": 0.01037339, "balance_loss_clip": 1.05949771, "balance_loss_mlp": 1.02581692, "epoch": 0.16990320447303553, "flos": 28328958506880.0, "grad_norm": 2.4589716751334354, "language_loss": 0.74353009, "learning_rate": 3.7983008086287044e-06, "loss": 0.76582766, "num_input_tokens_seen": 29796560, "step": 1413, "time_per_iteration": 2.7281861305236816 }, { "auxiliary_loss_clip": 0.01191752, "auxiliary_loss_mlp": 0.01044517, "balance_loss_clip": 1.05987477, "balance_loss_mlp": 1.03236389, "epoch": 0.1700234473636746, "flos": 20188189031040.0, "grad_norm": 2.823258490218152, "language_loss": 0.79612774, "learning_rate": 3.797959763280257e-06, "loss": 0.81849045, "num_input_tokens_seen": 29815245, "step": 1414, "time_per_iteration": 2.7139410972595215 }, { "auxiliary_loss_clip": 0.01213861, "auxiliary_loss_mlp": 0.01044376, "balance_loss_clip": 1.0647769, "balance_loss_mlp": 1.03256881, "epoch": 0.17014369025431372, "flos": 24858658846080.0, "grad_norm": 3.9813656707947103, "language_loss": 0.78759545, "learning_rate": 3.797618445182743e-06, "loss": 0.8101778, "num_input_tokens_seen": 29836640, "step": 1415, "time_per_iteration": 2.638658285140991 }, { "auxiliary_loss_clip": 0.01153788, "auxiliary_loss_mlp": 0.01040778, "balance_loss_clip": 1.05510736, "balance_loss_mlp": 1.02948272, "epoch": 0.1702639331449528, "flos": 16467233287680.0, "grad_norm": 2.041563475232795, "language_loss": 0.85190332, "learning_rate": 3.79727685438794e-06, "loss": 0.87384892, "num_input_tokens_seen": 29850830, "step": 1416, "time_per_iteration": 2.693218231201172 }, { "auxiliary_loss_clip": 0.01127438, "auxiliary_loss_mlp": 0.01010133, "balance_loss_clip": 1.04368901, "balance_loss_mlp": 1.00388622, "epoch": 0.1703841760355919, "flos": 52508870979840.0, "grad_norm": 0.839476594278951, "language_loss": 0.61675513, "learning_rate": 3.796934990947667e-06, "loss": 0.6381309, "num_input_tokens_seen": 29912515, "step": 1417, "time_per_iteration": 3.2418694496154785 }, { "auxiliary_loss_clip": 0.01125522, "auxiliary_loss_mlp": 0.01006048, "balance_loss_clip": 1.04304647, "balance_loss_mlp": 1.00039792, "epoch": 0.170504418926231, "flos": 49370637576960.0, "grad_norm": 0.8887413788231037, "language_loss": 0.62502801, "learning_rate": 3.7965928549137854e-06, "loss": 0.64634365, "num_input_tokens_seen": 29969330, "step": 1418, "time_per_iteration": 3.1377902030944824 }, { "auxiliary_loss_clip": 0.01180492, "auxiliary_loss_mlp": 0.01040074, "balance_loss_clip": 1.05611074, "balance_loss_mlp": 1.02709246, "epoch": 0.17062466181687008, "flos": 25849219184640.0, "grad_norm": 1.8556171106898478, "language_loss": 0.7738142, "learning_rate": 3.7962504463381953e-06, "loss": 0.79601991, "num_input_tokens_seen": 29990820, "step": 1419, "time_per_iteration": 2.7820072174072266 }, { "auxiliary_loss_clip": 0.01187853, "auxiliary_loss_mlp": 0.00714344, "balance_loss_clip": 1.06385469, "balance_loss_mlp": 1.00074637, "epoch": 0.17074490470750917, "flos": 20960412549120.0, "grad_norm": 9.05204504364822, "language_loss": 0.79023314, "learning_rate": 3.7959077652728412e-06, "loss": 0.80925512, "num_input_tokens_seen": 30009275, "step": 1420, "time_per_iteration": 3.567047119140625 }, { "auxiliary_loss_clip": 0.01191676, "auxiliary_loss_mlp": 0.01038731, "balance_loss_clip": 1.05860591, "balance_loss_mlp": 1.02720976, "epoch": 0.17086514759814825, "flos": 20959766104320.0, "grad_norm": 2.259985075438605, "language_loss": 0.77503508, "learning_rate": 3.795564811769707e-06, "loss": 0.79733914, "num_input_tokens_seen": 30027630, "step": 1421, "time_per_iteration": 3.6419527530670166 }, { "auxiliary_loss_clip": 0.01197064, "auxiliary_loss_mlp": 0.010377, "balance_loss_clip": 1.06723976, "balance_loss_mlp": 1.02614248, "epoch": 0.17098539048878736, "flos": 28474073452800.0, "grad_norm": 1.892923360503741, "language_loss": 0.77806348, "learning_rate": 3.795221585880818e-06, "loss": 0.80041111, "num_input_tokens_seen": 30048310, "step": 1422, "time_per_iteration": 2.679051637649536 }, { "auxiliary_loss_clip": 0.01181718, "auxiliary_loss_mlp": 0.01048977, "balance_loss_clip": 1.06598604, "balance_loss_mlp": 1.0376761, "epoch": 0.17110563337942644, "flos": 16290014561280.0, "grad_norm": 1.8122119039002251, "language_loss": 0.91051614, "learning_rate": 3.794878087658242e-06, "loss": 0.93282312, "num_input_tokens_seen": 30066080, "step": 1423, "time_per_iteration": 3.5857157707214355 }, { "auxiliary_loss_clip": 0.01213109, "auxiliary_loss_mlp": 0.01040272, "balance_loss_clip": 1.06315899, "balance_loss_mlp": 1.02882242, "epoch": 0.17122587627006552, "flos": 29674207693440.0, "grad_norm": 1.9954931423553157, "language_loss": 0.78905404, "learning_rate": 3.7945343171540873e-06, "loss": 0.81158781, "num_input_tokens_seen": 30086955, "step": 1424, "time_per_iteration": 2.719700574874878 }, { "auxiliary_loss_clip": 0.01228905, "auxiliary_loss_mlp": 0.01044773, "balance_loss_clip": 1.06573272, "balance_loss_mlp": 1.03297687, "epoch": 0.17134611916070464, "flos": 25338389915520.0, "grad_norm": 1.98406867750377, "language_loss": 0.79157794, "learning_rate": 3.7941902744205033e-06, "loss": 0.81431472, "num_input_tokens_seen": 30107990, "step": 1425, "time_per_iteration": 3.485013961791992 }, { "auxiliary_loss_clip": 0.01197729, "auxiliary_loss_mlp": 0.01044051, "balance_loss_clip": 1.06069517, "balance_loss_mlp": 1.03248727, "epoch": 0.17146636205134372, "flos": 13953845900160.0, "grad_norm": 2.800903529998178, "language_loss": 0.8333075, "learning_rate": 3.7938459595096817e-06, "loss": 0.85572535, "num_input_tokens_seen": 30126535, "step": 1426, "time_per_iteration": 2.6571035385131836 }, { "auxiliary_loss_clip": 0.01219352, "auxiliary_loss_mlp": 0.01048377, "balance_loss_clip": 1.06439757, "balance_loss_mlp": 1.03596687, "epoch": 0.1715866049419828, "flos": 23915214172800.0, "grad_norm": 1.8838953408875883, "language_loss": 0.86359102, "learning_rate": 3.7935013724738545e-06, "loss": 0.88626832, "num_input_tokens_seen": 30147035, "step": 1427, "time_per_iteration": 2.643220901489258 }, { "auxiliary_loss_clip": 0.01208053, "auxiliary_loss_mlp": 0.01042148, "balance_loss_clip": 1.06445909, "balance_loss_mlp": 1.03085315, "epoch": 0.17170684783262188, "flos": 22709369669760.0, "grad_norm": 1.8358678823381251, "language_loss": 0.7809552, "learning_rate": 3.7931565133652945e-06, "loss": 0.80345714, "num_input_tokens_seen": 30167110, "step": 1428, "time_per_iteration": 2.625222682952881 }, { "auxiliary_loss_clip": 0.01226823, "auxiliary_loss_mlp": 0.01041443, "balance_loss_clip": 1.06420279, "balance_loss_mlp": 1.02964127, "epoch": 0.171827090723261, "flos": 26613290315520.0, "grad_norm": 2.115229574747235, "language_loss": 0.67628491, "learning_rate": 3.792811382236317e-06, "loss": 0.69896758, "num_input_tokens_seen": 30185620, "step": 1429, "time_per_iteration": 2.640263319015503 }, { "auxiliary_loss_clip": 0.01215358, "auxiliary_loss_mlp": 0.01037419, "balance_loss_clip": 1.06291986, "balance_loss_mlp": 1.02494383, "epoch": 0.17194733361390008, "flos": 28148507556480.0, "grad_norm": 1.8568912658945833, "language_loss": 0.78038585, "learning_rate": 3.792465979139279e-06, "loss": 0.80291355, "num_input_tokens_seen": 30208225, "step": 1430, "time_per_iteration": 2.7135519981384277 }, { "auxiliary_loss_clip": 0.01095483, "auxiliary_loss_mlp": 0.01009993, "balance_loss_clip": 1.04113531, "balance_loss_mlp": 1.00405598, "epoch": 0.17206757650453916, "flos": 65530689753600.0, "grad_norm": 0.9356146345576113, "language_loss": 0.65678275, "learning_rate": 3.792120304126576e-06, "loss": 0.67783755, "num_input_tokens_seen": 30271600, "step": 1431, "time_per_iteration": 3.3304266929626465 }, { "auxiliary_loss_clip": 0.01128823, "auxiliary_loss_mlp": 0.01039118, "balance_loss_clip": 1.05331969, "balance_loss_mlp": 1.02692914, "epoch": 0.17218781939517827, "flos": 22273486128000.0, "grad_norm": 1.899701311540615, "language_loss": 0.83609951, "learning_rate": 3.791774357250649e-06, "loss": 0.85777891, "num_input_tokens_seen": 30290430, "step": 1432, "time_per_iteration": 2.947967529296875 }, { "auxiliary_loss_clip": 0.01188712, "auxiliary_loss_mlp": 0.01044522, "balance_loss_clip": 1.06008291, "balance_loss_mlp": 1.03268504, "epoch": 0.17230806228581735, "flos": 14137313592960.0, "grad_norm": 2.3130699543488102, "language_loss": 0.78852701, "learning_rate": 3.7914281385639757e-06, "loss": 0.81085932, "num_input_tokens_seen": 30308305, "step": 1433, "time_per_iteration": 2.697186231613159 }, { "auxiliary_loss_clip": 0.01207305, "auxiliary_loss_mlp": 0.01036526, "balance_loss_clip": 1.05818951, "balance_loss_mlp": 1.02525496, "epoch": 0.17242830517645644, "flos": 20704836303360.0, "grad_norm": 2.2022681122276837, "language_loss": 0.79597199, "learning_rate": 3.7910816481190784e-06, "loss": 0.81841028, "num_input_tokens_seen": 30328120, "step": 1434, "time_per_iteration": 2.6361939907073975 }, { "auxiliary_loss_clip": 0.01179834, "auxiliary_loss_mlp": 0.01047063, "balance_loss_clip": 1.05596638, "balance_loss_mlp": 1.03554177, "epoch": 0.17254854806709552, "flos": 30774582887040.0, "grad_norm": 2.0140151404761735, "language_loss": 0.75193542, "learning_rate": 3.7907348859685193e-06, "loss": 0.77420437, "num_input_tokens_seen": 30349825, "step": 1435, "time_per_iteration": 2.7982006072998047 }, { "auxiliary_loss_clip": 0.01203169, "auxiliary_loss_mlp": 0.01029977, "balance_loss_clip": 1.06185424, "balance_loss_mlp": 1.01914644, "epoch": 0.17266879095773463, "flos": 26614726859520.0, "grad_norm": 2.179677021341079, "language_loss": 0.80216134, "learning_rate": 3.790387852164902e-06, "loss": 0.82449281, "num_input_tokens_seen": 30370555, "step": 1436, "time_per_iteration": 2.7010083198547363 }, { "auxiliary_loss_clip": 0.01209819, "auxiliary_loss_mlp": 0.01042869, "balance_loss_clip": 1.06188011, "balance_loss_mlp": 1.03155589, "epoch": 0.1727890338483737, "flos": 20266295155200.0, "grad_norm": 2.1920458792469217, "language_loss": 0.76882708, "learning_rate": 3.7900405467608707e-06, "loss": 0.79135394, "num_input_tokens_seen": 30390100, "step": 1437, "time_per_iteration": 2.6150028705596924 }, { "auxiliary_loss_clip": 0.01142392, "auxiliary_loss_mlp": 0.01046212, "balance_loss_clip": 1.05075753, "balance_loss_mlp": 1.03379643, "epoch": 0.1729092767390128, "flos": 18179812909440.0, "grad_norm": 3.4351459443622514, "language_loss": 0.78880316, "learning_rate": 3.7896929698091114e-06, "loss": 0.81068921, "num_input_tokens_seen": 30402915, "step": 1438, "time_per_iteration": 2.767613649368286 }, { "auxiliary_loss_clip": 0.0122701, "auxiliary_loss_mlp": 0.0103786, "balance_loss_clip": 1.06427598, "balance_loss_mlp": 1.02544427, "epoch": 0.1730295196296519, "flos": 26759518583040.0, "grad_norm": 3.807746820042106, "language_loss": 0.67861986, "learning_rate": 3.7893451213623518e-06, "loss": 0.70126855, "num_input_tokens_seen": 30420145, "step": 1439, "time_per_iteration": 2.6194357872009277 }, { "auxiliary_loss_clip": 0.01209789, "auxiliary_loss_mlp": 0.00714431, "balance_loss_clip": 1.06477356, "balance_loss_mlp": 1.00065207, "epoch": 0.173149762520291, "flos": 23842531002240.0, "grad_norm": 1.9986616820805356, "language_loss": 0.82308745, "learning_rate": 3.7889970014733606e-06, "loss": 0.84232968, "num_input_tokens_seen": 30439250, "step": 1440, "time_per_iteration": 2.671729326248169 }, { "auxiliary_loss_clip": 0.01139355, "auxiliary_loss_mlp": 0.01046999, "balance_loss_clip": 1.05076444, "balance_loss_mlp": 1.03554308, "epoch": 0.17327000541093007, "flos": 23368186972800.0, "grad_norm": 1.74900715072101, "language_loss": 0.77842343, "learning_rate": 3.7886486101949463e-06, "loss": 0.80028695, "num_input_tokens_seen": 30460430, "step": 1441, "time_per_iteration": 2.7587485313415527 }, { "auxiliary_loss_clip": 0.01144947, "auxiliary_loss_mlp": 0.01042807, "balance_loss_clip": 1.05149221, "balance_loss_mlp": 1.0310353, "epoch": 0.17339024830156918, "flos": 18221290139520.0, "grad_norm": 1.9936579468616795, "language_loss": 0.87966037, "learning_rate": 3.7882999475799594e-06, "loss": 0.90153801, "num_input_tokens_seen": 30478465, "step": 1442, "time_per_iteration": 2.7259767055511475 }, { "auxiliary_loss_clip": 0.0113581, "auxiliary_loss_mlp": 0.01036943, "balance_loss_clip": 1.05199003, "balance_loss_mlp": 1.02583885, "epoch": 0.17351049119220827, "flos": 23332024955520.0, "grad_norm": 1.8473792888092537, "language_loss": 0.81505013, "learning_rate": 3.787951013681293e-06, "loss": 0.83677757, "num_input_tokens_seen": 30496510, "step": 1443, "time_per_iteration": 2.705719232559204 }, { "auxiliary_loss_clip": 0.01205841, "auxiliary_loss_mlp": 0.01035045, "balance_loss_clip": 1.06000853, "balance_loss_mlp": 1.02274895, "epoch": 0.17363073408284735, "flos": 23803495896960.0, "grad_norm": 2.3076434933141776, "language_loss": 0.77057791, "learning_rate": 3.787601808551879e-06, "loss": 0.79298675, "num_input_tokens_seen": 30516325, "step": 1444, "time_per_iteration": 2.6469244956970215 }, { "auxiliary_loss_clip": 0.0117292, "auxiliary_loss_mlp": 0.01041196, "balance_loss_clip": 1.05519581, "balance_loss_mlp": 1.030509, "epoch": 0.17375097697348643, "flos": 18515290959360.0, "grad_norm": 2.3371500008994084, "language_loss": 0.83895761, "learning_rate": 3.7872523322446926e-06, "loss": 0.86109877, "num_input_tokens_seen": 30535210, "step": 1445, "time_per_iteration": 2.569403886795044 }, { "auxiliary_loss_clip": 0.01161428, "auxiliary_loss_mlp": 0.01033144, "balance_loss_clip": 1.05185783, "balance_loss_mlp": 1.02121139, "epoch": 0.17387121986412554, "flos": 38877897456000.0, "grad_norm": 1.8985647413269338, "language_loss": 0.60097528, "learning_rate": 3.7869025848127478e-06, "loss": 0.62292099, "num_input_tokens_seen": 30559405, "step": 1446, "time_per_iteration": 4.527285099029541 }, { "auxiliary_loss_clip": 0.01209251, "auxiliary_loss_mlp": 0.01036191, "balance_loss_clip": 1.06060648, "balance_loss_mlp": 1.02490139, "epoch": 0.17399146275476463, "flos": 20375714960640.0, "grad_norm": 4.41556102310976, "language_loss": 0.80482465, "learning_rate": 3.786552566309102e-06, "loss": 0.82727909, "num_input_tokens_seen": 30577615, "step": 1447, "time_per_iteration": 2.6257972717285156 }, { "auxiliary_loss_clip": 0.01189122, "auxiliary_loss_mlp": 0.00714055, "balance_loss_clip": 1.06230474, "balance_loss_mlp": 1.0005796, "epoch": 0.1741117056454037, "flos": 19164339763200.0, "grad_norm": 2.067896884155242, "language_loss": 0.85609412, "learning_rate": 3.7862022767868517e-06, "loss": 0.87512589, "num_input_tokens_seen": 30595205, "step": 1448, "time_per_iteration": 3.559075355529785 }, { "auxiliary_loss_clip": 0.0117054, "auxiliary_loss_mlp": 0.0103774, "balance_loss_clip": 1.06127143, "balance_loss_mlp": 1.02656448, "epoch": 0.17423194853604282, "flos": 25374300537600.0, "grad_norm": 2.7075311709452565, "language_loss": 0.84365362, "learning_rate": 3.7858517162991367e-06, "loss": 0.86573637, "num_input_tokens_seen": 30615280, "step": 1449, "time_per_iteration": 2.7256975173950195 }, { "auxiliary_loss_clip": 0.01172948, "auxiliary_loss_mlp": 0.01038768, "balance_loss_clip": 1.05459571, "balance_loss_mlp": 1.02713335, "epoch": 0.1743521914266819, "flos": 25191874339200.0, "grad_norm": 2.7307354051033506, "language_loss": 0.60414153, "learning_rate": 3.7855008848991363e-06, "loss": 0.62625873, "num_input_tokens_seen": 30633485, "step": 1450, "time_per_iteration": 2.7556278705596924 }, { "auxiliary_loss_clip": 0.01189308, "auxiliary_loss_mlp": 0.01035751, "balance_loss_clip": 1.06281006, "balance_loss_mlp": 1.02491462, "epoch": 0.17447243431732098, "flos": 25666577504640.0, "grad_norm": 2.5238372287916273, "language_loss": 0.77602839, "learning_rate": 3.7851497826400714e-06, "loss": 0.79827899, "num_input_tokens_seen": 30653625, "step": 1451, "time_per_iteration": 3.6200051307678223 }, { "auxiliary_loss_clip": 0.01228202, "auxiliary_loss_mlp": 0.01039088, "balance_loss_clip": 1.06599021, "balance_loss_mlp": 1.02700043, "epoch": 0.17459267720796007, "flos": 36281950657920.0, "grad_norm": 1.9226983181521196, "language_loss": 0.76047492, "learning_rate": 3.7847984095752034e-06, "loss": 0.78314775, "num_input_tokens_seen": 30677080, "step": 1452, "time_per_iteration": 2.7455053329467773 }, { "auxiliary_loss_clip": 0.01222502, "auxiliary_loss_mlp": 0.01034807, "balance_loss_clip": 1.06270933, "balance_loss_mlp": 1.02402461, "epoch": 0.17471292009859918, "flos": 20011113959040.0, "grad_norm": 2.74551129528365, "language_loss": 0.80054152, "learning_rate": 3.784446765757836e-06, "loss": 0.82311463, "num_input_tokens_seen": 30695725, "step": 1453, "time_per_iteration": 2.5780179500579834 }, { "auxiliary_loss_clip": 0.011577, "auxiliary_loss_mlp": 0.0103678, "balance_loss_clip": 1.05490041, "balance_loss_mlp": 1.02479959, "epoch": 0.17483316298923826, "flos": 27819242559360.0, "grad_norm": 2.057930433055329, "language_loss": 0.77959287, "learning_rate": 3.7840948512413133e-06, "loss": 0.80153763, "num_input_tokens_seen": 30713310, "step": 1454, "time_per_iteration": 2.7214465141296387 }, { "auxiliary_loss_clip": 0.01170843, "auxiliary_loss_mlp": 0.01037896, "balance_loss_clip": 1.0608108, "balance_loss_mlp": 1.02592182, "epoch": 0.17495340587987734, "flos": 44017934791680.0, "grad_norm": 1.851242615426232, "language_loss": 0.78698492, "learning_rate": 3.7837426660790196e-06, "loss": 0.80907238, "num_input_tokens_seen": 30734725, "step": 1455, "time_per_iteration": 2.852980375289917 }, { "auxiliary_loss_clip": 0.01221613, "auxiliary_loss_mlp": 0.01036815, "balance_loss_clip": 1.06114864, "balance_loss_mlp": 1.02560973, "epoch": 0.17507364877051645, "flos": 20885825957760.0, "grad_norm": 2.1697073317909377, "language_loss": 0.8231976, "learning_rate": 3.783390210324382e-06, "loss": 0.84578192, "num_input_tokens_seen": 30754450, "step": 1456, "time_per_iteration": 2.642289638519287 }, { "auxiliary_loss_clip": 0.01173563, "auxiliary_loss_mlp": 0.01038583, "balance_loss_clip": 1.06038964, "balance_loss_mlp": 1.02757967, "epoch": 0.17519389166115554, "flos": 24717602136960.0, "grad_norm": 2.6028240865251364, "language_loss": 0.7280314, "learning_rate": 3.7830374840308676e-06, "loss": 0.75015289, "num_input_tokens_seen": 30774605, "step": 1457, "time_per_iteration": 2.730724811553955 }, { "auxiliary_loss_clip": 0.01211086, "auxiliary_loss_mlp": 0.01034786, "balance_loss_clip": 1.06393909, "balance_loss_mlp": 1.02294219, "epoch": 0.17531413455179462, "flos": 23798144770560.0, "grad_norm": 2.1955590894044654, "language_loss": 0.82844847, "learning_rate": 3.7826844872519842e-06, "loss": 0.85090721, "num_input_tokens_seen": 30792460, "step": 1458, "time_per_iteration": 2.6196978092193604 }, { "auxiliary_loss_clip": 0.01189901, "auxiliary_loss_mlp": 0.01034905, "balance_loss_clip": 1.062078, "balance_loss_mlp": 1.0239315, "epoch": 0.1754343774424337, "flos": 24572379450240.0, "grad_norm": 2.353583917844296, "language_loss": 0.72845089, "learning_rate": 3.782331220041282e-06, "loss": 0.75069892, "num_input_tokens_seen": 30812525, "step": 1459, "time_per_iteration": 2.849257469177246 }, { "auxiliary_loss_clip": 0.01183577, "auxiliary_loss_mlp": 0.0104153, "balance_loss_clip": 1.058828, "balance_loss_mlp": 1.02999032, "epoch": 0.17555462033307281, "flos": 18114599767680.0, "grad_norm": 2.074092378393725, "language_loss": 0.83019412, "learning_rate": 3.7819776824523504e-06, "loss": 0.85244513, "num_input_tokens_seen": 30830390, "step": 1460, "time_per_iteration": 2.677337884902954 }, { "auxiliary_loss_clip": 0.01199947, "auxiliary_loss_mlp": 0.01045354, "balance_loss_clip": 1.06135559, "balance_loss_mlp": 1.0339334, "epoch": 0.1756748632237119, "flos": 28366018364160.0, "grad_norm": 1.933354227099381, "language_loss": 0.83784127, "learning_rate": 3.7816238745388213e-06, "loss": 0.86029434, "num_input_tokens_seen": 30849935, "step": 1461, "time_per_iteration": 2.762279987335205 }, { "auxiliary_loss_clip": 0.01196107, "auxiliary_loss_mlp": 0.01037605, "balance_loss_clip": 1.05916548, "balance_loss_mlp": 1.02662539, "epoch": 0.17579510611435098, "flos": 25732939881600.0, "grad_norm": 5.199279806391458, "language_loss": 0.86810863, "learning_rate": 3.781269796354367e-06, "loss": 0.89044577, "num_input_tokens_seen": 30869555, "step": 1462, "time_per_iteration": 2.697831153869629 }, { "auxiliary_loss_clip": 0.01191312, "auxiliary_loss_mlp": 0.01044394, "balance_loss_clip": 1.0606333, "balance_loss_mlp": 1.03389156, "epoch": 0.1759153490049901, "flos": 18588081870720.0, "grad_norm": 1.76346110853191, "language_loss": 0.85920417, "learning_rate": 3.7809154479527006e-06, "loss": 0.88156128, "num_input_tokens_seen": 30888760, "step": 1463, "time_per_iteration": 2.6836884021759033 }, { "auxiliary_loss_clip": 0.01167773, "auxiliary_loss_mlp": 0.01038972, "balance_loss_clip": 1.05847847, "balance_loss_mlp": 1.02752805, "epoch": 0.17603559189562917, "flos": 18619323724800.0, "grad_norm": 2.3167350144619046, "language_loss": 0.84287, "learning_rate": 3.780560829387577e-06, "loss": 0.86493748, "num_input_tokens_seen": 30907260, "step": 1464, "time_per_iteration": 2.6820547580718994 }, { "auxiliary_loss_clip": 0.01130803, "auxiliary_loss_mlp": 0.01005228, "balance_loss_clip": 1.04738212, "balance_loss_mlp": 1.00107944, "epoch": 0.17615583478626826, "flos": 60530775373440.0, "grad_norm": 0.8547249951010103, "language_loss": 0.57880598, "learning_rate": 3.7802059407127915e-06, "loss": 0.60016626, "num_input_tokens_seen": 30965810, "step": 1465, "time_per_iteration": 3.1668996810913086 }, { "auxiliary_loss_clip": 0.01183672, "auxiliary_loss_mlp": 0.01036043, "balance_loss_clip": 1.0580312, "balance_loss_mlp": 1.02496886, "epoch": 0.17627607767690734, "flos": 23616221362560.0, "grad_norm": 2.117742741127464, "language_loss": 0.86090809, "learning_rate": 3.7798507819821797e-06, "loss": 0.88310522, "num_input_tokens_seen": 30982935, "step": 1466, "time_per_iteration": 2.6838760375976562 }, { "auxiliary_loss_clip": 0.01166015, "auxiliary_loss_mlp": 0.01041812, "balance_loss_clip": 1.05924773, "balance_loss_mlp": 1.02989161, "epoch": 0.17639632056754645, "flos": 17639070589440.0, "grad_norm": 2.208779600277767, "language_loss": 0.78686976, "learning_rate": 3.7794953532496197e-06, "loss": 0.80894804, "num_input_tokens_seen": 30998840, "step": 1467, "time_per_iteration": 2.692949056625366 }, { "auxiliary_loss_clip": 0.01083529, "auxiliary_loss_mlp": 0.00703769, "balance_loss_clip": 1.05422139, "balance_loss_mlp": 0.99988794, "epoch": 0.17651656345818553, "flos": 57932604910080.0, "grad_norm": 0.8731202890378877, "language_loss": 0.57961142, "learning_rate": 3.7791396545690295e-06, "loss": 0.59748441, "num_input_tokens_seen": 31060075, "step": 1468, "time_per_iteration": 3.2741451263427734 }, { "auxiliary_loss_clip": 0.01209916, "auxiliary_loss_mlp": 0.01033948, "balance_loss_clip": 1.06542206, "balance_loss_mlp": 1.02248549, "epoch": 0.17663680634882462, "flos": 22929502170240.0, "grad_norm": 2.33693733557991, "language_loss": 0.80391908, "learning_rate": 3.7787836859943685e-06, "loss": 0.82635772, "num_input_tokens_seen": 31078800, "step": 1469, "time_per_iteration": 2.6094024181365967 }, { "auxiliary_loss_clip": 0.01207008, "auxiliary_loss_mlp": 0.01038521, "balance_loss_clip": 1.06385636, "balance_loss_mlp": 1.02738118, "epoch": 0.17675704923946373, "flos": 22637979388800.0, "grad_norm": 2.423515764991595, "language_loss": 0.78837258, "learning_rate": 3.7784274475796363e-06, "loss": 0.81082785, "num_input_tokens_seen": 31097430, "step": 1470, "time_per_iteration": 2.6422371864318848 }, { "auxiliary_loss_clip": 0.01180225, "auxiliary_loss_mlp": 0.01038101, "balance_loss_clip": 1.05910742, "balance_loss_mlp": 1.02587032, "epoch": 0.1768772921301028, "flos": 27126525795840.0, "grad_norm": 2.0665903921187954, "language_loss": 0.7599147, "learning_rate": 3.7780709393788745e-06, "loss": 0.782098, "num_input_tokens_seen": 31117905, "step": 1471, "time_per_iteration": 2.6980180740356445 }, { "auxiliary_loss_clip": 0.01226405, "auxiliary_loss_mlp": 0.01039454, "balance_loss_clip": 1.06575823, "balance_loss_mlp": 1.02819502, "epoch": 0.1769975350207419, "flos": 19172133014400.0, "grad_norm": 1.9345223330915335, "language_loss": 0.74967676, "learning_rate": 3.777714161446165e-06, "loss": 0.77233529, "num_input_tokens_seen": 31137610, "step": 1472, "time_per_iteration": 4.246178865432739 }, { "auxiliary_loss_clip": 0.01211797, "auxiliary_loss_mlp": 0.01036309, "balance_loss_clip": 1.0645169, "balance_loss_mlp": 1.02521658, "epoch": 0.177117777911381, "flos": 36134932291200.0, "grad_norm": 1.9955935268432348, "language_loss": 0.69226766, "learning_rate": 3.7773571138356304e-06, "loss": 0.71474874, "num_input_tokens_seen": 31157780, "step": 1473, "time_per_iteration": 2.7542340755462646 }, { "auxiliary_loss_clip": 0.01144681, "auxiliary_loss_mlp": 0.01035873, "balance_loss_clip": 1.05811954, "balance_loss_mlp": 1.02472162, "epoch": 0.17723802080202009, "flos": 22090593052800.0, "grad_norm": 2.388938425512321, "language_loss": 0.8912853, "learning_rate": 3.776999796601435e-06, "loss": 0.91309094, "num_input_tokens_seen": 31176540, "step": 1474, "time_per_iteration": 3.644894599914551 }, { "auxiliary_loss_clip": 0.01215829, "auxiliary_loss_mlp": 0.01036576, "balance_loss_clip": 1.06549311, "balance_loss_mlp": 1.02499533, "epoch": 0.17735826369265917, "flos": 30222671437440.0, "grad_norm": 2.05896307369023, "language_loss": 0.72811043, "learning_rate": 3.776642209797783e-06, "loss": 0.75063449, "num_input_tokens_seen": 31198370, "step": 1475, "time_per_iteration": 2.6872880458831787 }, { "auxiliary_loss_clip": 0.01199377, "auxiliary_loss_mlp": 0.01036179, "balance_loss_clip": 1.05932295, "balance_loss_mlp": 1.02478266, "epoch": 0.17747850658329825, "flos": 21397588980480.0, "grad_norm": 1.938665235397083, "language_loss": 0.77803636, "learning_rate": 3.7762843534789205e-06, "loss": 0.80039191, "num_input_tokens_seen": 31217120, "step": 1476, "time_per_iteration": 2.6622700691223145 }, { "auxiliary_loss_clip": 0.01200108, "auxiliary_loss_mlp": 0.01037133, "balance_loss_clip": 1.06152773, "balance_loss_mlp": 1.02547455, "epoch": 0.17759874947393736, "flos": 16983341856000.0, "grad_norm": 2.1615885569037747, "language_loss": 0.88186061, "learning_rate": 3.7759262276991343e-06, "loss": 0.9042331, "num_input_tokens_seen": 31234730, "step": 1477, "time_per_iteration": 3.483306646347046 }, { "auxiliary_loss_clip": 0.01195784, "auxiliary_loss_mlp": 0.01042442, "balance_loss_clip": 1.06176329, "balance_loss_mlp": 1.03073621, "epoch": 0.17771899236457644, "flos": 11546107390080.0, "grad_norm": 2.3558927551534845, "language_loss": 0.80485928, "learning_rate": 3.7755678325127506e-06, "loss": 0.82724154, "num_input_tokens_seen": 31252410, "step": 1478, "time_per_iteration": 2.6678719520568848 }, { "auxiliary_loss_clip": 0.01156349, "auxiliary_loss_mlp": 0.01039296, "balance_loss_clip": 1.06148589, "balance_loss_mlp": 1.02750599, "epoch": 0.17783923525521553, "flos": 18807747494400.0, "grad_norm": 2.405304686886366, "language_loss": 0.75673777, "learning_rate": 3.7752091679741393e-06, "loss": 0.77869421, "num_input_tokens_seen": 31270200, "step": 1479, "time_per_iteration": 2.6841509342193604 }, { "auxiliary_loss_clip": 0.01208629, "auxiliary_loss_mlp": 0.01039583, "balance_loss_clip": 1.06509089, "balance_loss_mlp": 1.02740622, "epoch": 0.17795947814585464, "flos": 30408365773440.0, "grad_norm": 3.219160080178821, "language_loss": 0.77196169, "learning_rate": 3.774850234137708e-06, "loss": 0.79444385, "num_input_tokens_seen": 31287495, "step": 1480, "time_per_iteration": 2.7220311164855957 }, { "auxiliary_loss_clip": 0.0120461, "auxiliary_loss_mlp": 0.01036581, "balance_loss_clip": 1.06062031, "balance_loss_mlp": 1.02490401, "epoch": 0.17807972103649372, "flos": 24389055411840.0, "grad_norm": 2.0703465154602996, "language_loss": 0.82842535, "learning_rate": 3.7744910310579076e-06, "loss": 0.85083723, "num_input_tokens_seen": 31306420, "step": 1481, "time_per_iteration": 2.6576080322265625 }, { "auxiliary_loss_clip": 0.01224919, "auxiliary_loss_mlp": 0.01038101, "balance_loss_clip": 1.06482959, "balance_loss_mlp": 1.02663922, "epoch": 0.1781999639271328, "flos": 20301559332480.0, "grad_norm": 2.039542203739292, "language_loss": 0.85520911, "learning_rate": 3.774131558789229e-06, "loss": 0.87783933, "num_input_tokens_seen": 31325750, "step": 1482, "time_per_iteration": 2.675163984298706 }, { "auxiliary_loss_clip": 0.01230794, "auxiliary_loss_mlp": 0.0071372, "balance_loss_clip": 1.06858444, "balance_loss_mlp": 1.00053525, "epoch": 0.1783202068177719, "flos": 15924479806080.0, "grad_norm": 2.548078211734334, "language_loss": 0.69567716, "learning_rate": 3.773771817386203e-06, "loss": 0.71512228, "num_input_tokens_seen": 31343080, "step": 1483, "time_per_iteration": 2.540241003036499 }, { "auxiliary_loss_clip": 0.01190969, "auxiliary_loss_mlp": 0.010338, "balance_loss_clip": 1.06086946, "balance_loss_mlp": 1.02225518, "epoch": 0.178440449708411, "flos": 20631758083200.0, "grad_norm": 1.6968247826214131, "language_loss": 0.79069728, "learning_rate": 3.773411806903403e-06, "loss": 0.81294495, "num_input_tokens_seen": 31362160, "step": 1484, "time_per_iteration": 2.7111854553222656 }, { "auxiliary_loss_clip": 0.01143742, "auxiliary_loss_mlp": 0.01043984, "balance_loss_clip": 1.05497015, "balance_loss_mlp": 1.03260493, "epoch": 0.17856069259905008, "flos": 21686059105920.0, "grad_norm": 1.7416621122021785, "language_loss": 0.94568801, "learning_rate": 3.7730515273954415e-06, "loss": 0.96756518, "num_input_tokens_seen": 31380770, "step": 1485, "time_per_iteration": 2.743035078048706 }, { "auxiliary_loss_clip": 0.01225234, "auxiliary_loss_mlp": 0.01032304, "balance_loss_clip": 1.0653441, "balance_loss_mlp": 1.02159858, "epoch": 0.17868093548968916, "flos": 26572962320640.0, "grad_norm": 1.9044640613665986, "language_loss": 0.84794331, "learning_rate": 3.772690978916973e-06, "loss": 0.87051868, "num_input_tokens_seen": 31400525, "step": 1486, "time_per_iteration": 2.6679790019989014 }, { "auxiliary_loss_clip": 0.01206486, "auxiliary_loss_mlp": 0.01035578, "balance_loss_clip": 1.06317663, "balance_loss_mlp": 1.02423489, "epoch": 0.17880117838032827, "flos": 18581006891520.0, "grad_norm": 2.4272027261775206, "language_loss": 0.86600763, "learning_rate": 3.772330161522693e-06, "loss": 0.88842827, "num_input_tokens_seen": 31418435, "step": 1487, "time_per_iteration": 2.629131317138672 }, { "auxiliary_loss_clip": 0.01191559, "auxiliary_loss_mlp": 0.01046871, "balance_loss_clip": 1.06460452, "balance_loss_mlp": 1.03504503, "epoch": 0.17892142127096736, "flos": 26541217676160.0, "grad_norm": 2.049227597871401, "language_loss": 0.79701316, "learning_rate": 3.7719690752673365e-06, "loss": 0.81939745, "num_input_tokens_seen": 31439230, "step": 1488, "time_per_iteration": 2.670588731765747 }, { "auxiliary_loss_clip": 0.01174349, "auxiliary_loss_mlp": 0.01036843, "balance_loss_clip": 1.06075573, "balance_loss_mlp": 1.02584004, "epoch": 0.17904166416160644, "flos": 23872623621120.0, "grad_norm": 2.996701772974132, "language_loss": 0.78267241, "learning_rate": 3.7716077202056796e-06, "loss": 0.8047843, "num_input_tokens_seen": 31457705, "step": 1489, "time_per_iteration": 2.7371652126312256 }, { "auxiliary_loss_clip": 0.01178155, "auxiliary_loss_mlp": 0.01039908, "balance_loss_clip": 1.0577085, "balance_loss_mlp": 1.02799904, "epoch": 0.17916190705224552, "flos": 19134426712320.0, "grad_norm": 2.488888335266236, "language_loss": 0.93844378, "learning_rate": 3.7712460963925404e-06, "loss": 0.96062446, "num_input_tokens_seen": 31473645, "step": 1490, "time_per_iteration": 2.6356427669525146 }, { "auxiliary_loss_clip": 0.01180076, "auxiliary_loss_mlp": 0.01043285, "balance_loss_clip": 1.05408192, "balance_loss_mlp": 1.03210914, "epoch": 0.17928214994288463, "flos": 25152120961920.0, "grad_norm": 2.302931187417454, "language_loss": 0.7537784, "learning_rate": 3.7708842038827775e-06, "loss": 0.77601194, "num_input_tokens_seen": 31492605, "step": 1491, "time_per_iteration": 2.715546131134033 }, { "auxiliary_loss_clip": 0.01205771, "auxiliary_loss_mlp": 0.01040211, "balance_loss_clip": 1.05949914, "balance_loss_mlp": 1.02807546, "epoch": 0.17940239283352372, "flos": 22384629786240.0, "grad_norm": 2.1307015107777745, "language_loss": 0.85813379, "learning_rate": 3.770522042731288e-06, "loss": 0.8805936, "num_input_tokens_seen": 31514500, "step": 1492, "time_per_iteration": 2.6593756675720215 }, { "auxiliary_loss_clip": 0.01149409, "auxiliary_loss_mlp": 0.01043849, "balance_loss_clip": 1.05681968, "balance_loss_mlp": 1.03149867, "epoch": 0.1795226357241628, "flos": 23178685795200.0, "grad_norm": 1.9555392020383764, "language_loss": 0.87760383, "learning_rate": 3.7701596129930122e-06, "loss": 0.89953637, "num_input_tokens_seen": 31533225, "step": 1493, "time_per_iteration": 2.770052433013916 }, { "auxiliary_loss_clip": 0.01182247, "auxiliary_loss_mlp": 0.0103929, "balance_loss_clip": 1.05895114, "balance_loss_mlp": 1.02710724, "epoch": 0.1796428786148019, "flos": 22090413484800.0, "grad_norm": 1.9482212539941395, "language_loss": 0.73348635, "learning_rate": 3.7697969147229315e-06, "loss": 0.75570178, "num_input_tokens_seen": 31551385, "step": 1494, "time_per_iteration": 2.681746482849121 }, { "auxiliary_loss_clip": 0.01204092, "auxiliary_loss_mlp": 0.01038529, "balance_loss_clip": 1.06074154, "balance_loss_mlp": 1.027246, "epoch": 0.179763121505441, "flos": 21324618501120.0, "grad_norm": 2.115688365245183, "language_loss": 0.84994912, "learning_rate": 3.7694339479760647e-06, "loss": 0.87237537, "num_input_tokens_seen": 31570415, "step": 1495, "time_per_iteration": 2.687833786010742 }, { "auxiliary_loss_clip": 0.01109249, "auxiliary_loss_mlp": 0.01005368, "balance_loss_clip": 1.04332972, "balance_loss_mlp": 1.00110078, "epoch": 0.17988336439608008, "flos": 68161864815360.0, "grad_norm": 0.7771634119378937, "language_loss": 0.57339597, "learning_rate": 3.769070712807476e-06, "loss": 0.59454215, "num_input_tokens_seen": 31632445, "step": 1496, "time_per_iteration": 3.3524045944213867 }, { "auxiliary_loss_clip": 0.0112849, "auxiliary_loss_mlp": 0.01040851, "balance_loss_clip": 1.05516315, "balance_loss_mlp": 1.02968061, "epoch": 0.18000360728671919, "flos": 21945047143680.0, "grad_norm": 1.8006671193051282, "language_loss": 0.78541028, "learning_rate": 3.768707209272266e-06, "loss": 0.80710363, "num_input_tokens_seen": 31652575, "step": 1497, "time_per_iteration": 2.809541702270508 }, { "auxiliary_loss_clip": 0.01186233, "auxiliary_loss_mlp": 0.0103504, "balance_loss_clip": 1.05940759, "balance_loss_mlp": 1.02317905, "epoch": 0.18012385017735827, "flos": 18986330937600.0, "grad_norm": 2.2549076929621474, "language_loss": 0.76661682, "learning_rate": 3.768343437425579e-06, "loss": 0.78882957, "num_input_tokens_seen": 31671145, "step": 1498, "time_per_iteration": 3.59535551071167 }, { "auxiliary_loss_clip": 0.0111501, "auxiliary_loss_mlp": 0.01043377, "balance_loss_clip": 1.05208206, "balance_loss_mlp": 1.03135502, "epoch": 0.18024409306799735, "flos": 19748103598080.0, "grad_norm": 2.4000477702370318, "language_loss": 0.85709149, "learning_rate": 3.7679793973225987e-06, "loss": 0.87867534, "num_input_tokens_seen": 31686955, "step": 1499, "time_per_iteration": 3.6134133338928223 }, { "auxiliary_loss_clip": 0.01070886, "auxiliary_loss_mlp": 0.0101812, "balance_loss_clip": 1.03052044, "balance_loss_mlp": 1.01323247, "epoch": 0.18036433595863643, "flos": 67227183060480.0, "grad_norm": 0.8509199080615716, "language_loss": 0.61643875, "learning_rate": 3.767615089018549e-06, "loss": 0.63732886, "num_input_tokens_seen": 31749300, "step": 1500, "time_per_iteration": 4.130265951156616 }, { "auxiliary_loss_clip": 0.01180373, "auxiliary_loss_mlp": 0.01035991, "balance_loss_clip": 1.05803537, "balance_loss_mlp": 1.02390957, "epoch": 0.18048457884927555, "flos": 18181464935040.0, "grad_norm": 2.0727195871562016, "language_loss": 0.86380851, "learning_rate": 3.7672505125686966e-06, "loss": 0.8859722, "num_input_tokens_seen": 31765665, "step": 1501, "time_per_iteration": 2.6645827293395996 }, { "auxiliary_loss_clip": 0.01156253, "auxiliary_loss_mlp": 0.01046062, "balance_loss_clip": 1.05598485, "balance_loss_mlp": 1.03385508, "epoch": 0.18060482173991463, "flos": 15813767111040.0, "grad_norm": 2.7207189955061266, "language_loss": 0.84281492, "learning_rate": 3.7668856680283455e-06, "loss": 0.86483806, "num_input_tokens_seen": 31782690, "step": 1502, "time_per_iteration": 2.6839826107025146 }, { "auxiliary_loss_clip": 0.01196556, "auxiliary_loss_mlp": 0.01048342, "balance_loss_clip": 1.06063104, "balance_loss_mlp": 1.03665316, "epoch": 0.1807250646305537, "flos": 18587399512320.0, "grad_norm": 1.8765798715544606, "language_loss": 0.82388866, "learning_rate": 3.7665205554528437e-06, "loss": 0.84633762, "num_input_tokens_seen": 31802045, "step": 1503, "time_per_iteration": 3.461498498916626 }, { "auxiliary_loss_clip": 0.01189796, "auxiliary_loss_mlp": 0.01042898, "balance_loss_clip": 1.0608362, "balance_loss_mlp": 1.03140593, "epoch": 0.18084530752119282, "flos": 23149131880320.0, "grad_norm": 2.0383924631940284, "language_loss": 0.74488139, "learning_rate": 3.7661551748975782e-06, "loss": 0.76720834, "num_input_tokens_seen": 31820220, "step": 1504, "time_per_iteration": 2.749837636947632 }, { "auxiliary_loss_clip": 0.011081, "auxiliary_loss_mlp": 0.01010342, "balance_loss_clip": 1.04174232, "balance_loss_mlp": 1.00590742, "epoch": 0.1809655504118319, "flos": 59803153568640.0, "grad_norm": 0.8090722831079907, "language_loss": 0.60468411, "learning_rate": 3.7657895264179772e-06, "loss": 0.62586856, "num_input_tokens_seen": 31876195, "step": 1505, "time_per_iteration": 3.2076809406280518 }, { "auxiliary_loss_clip": 0.011842, "auxiliary_loss_mlp": 0.01042049, "balance_loss_clip": 1.05800605, "balance_loss_mlp": 1.02976465, "epoch": 0.181085793302471, "flos": 44201941188480.0, "grad_norm": 1.8905744414094043, "language_loss": 0.74507058, "learning_rate": 3.765423610069509e-06, "loss": 0.76733303, "num_input_tokens_seen": 31901585, "step": 1506, "time_per_iteration": 2.828094005584717 }, { "auxiliary_loss_clip": 0.01196492, "auxiliary_loss_mlp": 0.01035209, "balance_loss_clip": 1.06550837, "balance_loss_mlp": 1.0241046, "epoch": 0.18120603619311007, "flos": 34898384638080.0, "grad_norm": 1.7389188726552576, "language_loss": 0.72695053, "learning_rate": 3.765057425907683e-06, "loss": 0.74926752, "num_input_tokens_seen": 31923045, "step": 1507, "time_per_iteration": 2.7792410850524902 }, { "auxiliary_loss_clip": 0.01212496, "auxiliary_loss_mlp": 0.01034973, "balance_loss_clip": 1.06173277, "balance_loss_mlp": 1.02342772, "epoch": 0.18132627908374918, "flos": 21506757390720.0, "grad_norm": 2.1324883594308215, "language_loss": 0.7807427, "learning_rate": 3.764690973988048e-06, "loss": 0.80321741, "num_input_tokens_seen": 31943385, "step": 1508, "time_per_iteration": 2.664738416671753 }, { "auxiliary_loss_clip": 0.01181775, "auxiliary_loss_mlp": 0.01035928, "balance_loss_clip": 1.06039476, "balance_loss_mlp": 1.02430511, "epoch": 0.18144652197438826, "flos": 29057693633280.0, "grad_norm": 2.240370021881149, "language_loss": 0.73619533, "learning_rate": 3.7643242543661967e-06, "loss": 0.75837231, "num_input_tokens_seen": 31966045, "step": 1509, "time_per_iteration": 2.7853245735168457 }, { "auxiliary_loss_clip": 0.01098393, "auxiliary_loss_mlp": 0.01004417, "balance_loss_clip": 1.03800893, "balance_loss_mlp": 1.0001967, "epoch": 0.18156676486502735, "flos": 68675064382080.0, "grad_norm": 0.8136450487945902, "language_loss": 0.6054374, "learning_rate": 3.7639572670977573e-06, "loss": 0.62646544, "num_input_tokens_seen": 32021540, "step": 1510, "time_per_iteration": 3.1201865673065186 }, { "auxiliary_loss_clip": 0.01178255, "auxiliary_loss_mlp": 0.01049652, "balance_loss_clip": 1.05919051, "balance_loss_mlp": 1.03758824, "epoch": 0.18168700775566646, "flos": 26471515334400.0, "grad_norm": 1.6123142619859128, "language_loss": 0.76426065, "learning_rate": 3.7635900122384042e-06, "loss": 0.78653973, "num_input_tokens_seen": 32044535, "step": 1511, "time_per_iteration": 2.785614490509033 }, { "auxiliary_loss_clip": 0.01195276, "auxiliary_loss_mlp": 0.0103877, "balance_loss_clip": 1.06013632, "balance_loss_mlp": 1.02650344, "epoch": 0.18180725064630554, "flos": 15005668884480.0, "grad_norm": 2.445144228945381, "language_loss": 0.87186497, "learning_rate": 3.7632224898438477e-06, "loss": 0.89420545, "num_input_tokens_seen": 32061010, "step": 1512, "time_per_iteration": 2.6102054119110107 }, { "auxiliary_loss_clip": 0.0117884, "auxiliary_loss_mlp": 0.01036027, "balance_loss_clip": 1.05771756, "balance_loss_mlp": 1.02445185, "epoch": 0.18192749353694462, "flos": 19682387665920.0, "grad_norm": 1.6280741711387348, "language_loss": 0.792265, "learning_rate": 3.762854699969842e-06, "loss": 0.81441367, "num_input_tokens_seen": 32081520, "step": 1513, "time_per_iteration": 2.7199020385742188 }, { "auxiliary_loss_clip": 0.0120696, "auxiliary_loss_mlp": 0.01041089, "balance_loss_clip": 1.06345367, "balance_loss_mlp": 1.02889419, "epoch": 0.1820477364275837, "flos": 20702717400960.0, "grad_norm": 2.0003101525621076, "language_loss": 0.73301655, "learning_rate": 3.762486642672179e-06, "loss": 0.75549704, "num_input_tokens_seen": 32098460, "step": 1514, "time_per_iteration": 2.6098246574401855 }, { "auxiliary_loss_clip": 0.01195039, "auxiliary_loss_mlp": 0.01036892, "balance_loss_clip": 1.06091571, "balance_loss_mlp": 1.02472663, "epoch": 0.18216797931822282, "flos": 17128708197120.0, "grad_norm": 3.1198705136256235, "language_loss": 0.87404317, "learning_rate": 3.7621183180066946e-06, "loss": 0.89636254, "num_input_tokens_seen": 32116420, "step": 1515, "time_per_iteration": 2.667178153991699 }, { "auxiliary_loss_clip": 0.01185867, "auxiliary_loss_mlp": 0.01042127, "balance_loss_clip": 1.054515, "balance_loss_mlp": 1.03050995, "epoch": 0.1822882222088619, "flos": 29242561956480.0, "grad_norm": 1.5455685185928993, "language_loss": 0.73958039, "learning_rate": 3.7617497260292625e-06, "loss": 0.76186037, "num_input_tokens_seen": 32138475, "step": 1516, "time_per_iteration": 2.6606833934783936 }, { "auxiliary_loss_clip": 0.01191854, "auxiliary_loss_mlp": 0.01041082, "balance_loss_clip": 1.06467867, "balance_loss_mlp": 1.029006, "epoch": 0.18240846509950098, "flos": 17702739446400.0, "grad_norm": 2.5359507367029988, "language_loss": 0.79054546, "learning_rate": 3.7613808667957967e-06, "loss": 0.81287485, "num_input_tokens_seen": 32151165, "step": 1517, "time_per_iteration": 2.612189769744873 }, { "auxiliary_loss_clip": 0.01194198, "auxiliary_loss_mlp": 0.01035985, "balance_loss_clip": 1.061234, "balance_loss_mlp": 1.02396905, "epoch": 0.1825287079901401, "flos": 14790025584000.0, "grad_norm": 2.384333231859414, "language_loss": 0.91168857, "learning_rate": 3.7610117403622547e-06, "loss": 0.93399036, "num_input_tokens_seen": 32167725, "step": 1518, "time_per_iteration": 2.6288061141967773 }, { "auxiliary_loss_clip": 0.01165736, "auxiliary_loss_mlp": 0.010458, "balance_loss_clip": 1.05481923, "balance_loss_mlp": 1.03493989, "epoch": 0.18264895088077918, "flos": 21946232292480.0, "grad_norm": 1.7292303170080685, "language_loss": 0.90040869, "learning_rate": 3.7606423467846313e-06, "loss": 0.92252409, "num_input_tokens_seen": 32187330, "step": 1519, "time_per_iteration": 2.6898930072784424 }, { "auxiliary_loss_clip": 0.01183502, "auxiliary_loss_mlp": 0.01045156, "balance_loss_clip": 1.06311107, "balance_loss_mlp": 1.03243065, "epoch": 0.18276919377141826, "flos": 20886759711360.0, "grad_norm": 1.9599152739759014, "language_loss": 0.79654473, "learning_rate": 3.760272686118964e-06, "loss": 0.81883126, "num_input_tokens_seen": 32205550, "step": 1520, "time_per_iteration": 2.6804141998291016 }, { "auxiliary_loss_clip": 0.01193425, "auxiliary_loss_mlp": 0.01042607, "balance_loss_clip": 1.06005776, "balance_loss_mlp": 1.03062057, "epoch": 0.18288943666205737, "flos": 21469877101440.0, "grad_norm": 2.07103070568372, "language_loss": 0.92606866, "learning_rate": 3.7599027584213297e-06, "loss": 0.94842899, "num_input_tokens_seen": 32224430, "step": 1521, "time_per_iteration": 2.6449196338653564 }, { "auxiliary_loss_clip": 0.01213166, "auxiliary_loss_mlp": 0.01037714, "balance_loss_clip": 1.06279659, "balance_loss_mlp": 1.0261265, "epoch": 0.18300967955269645, "flos": 21539363961600.0, "grad_norm": 1.949015177058864, "language_loss": 0.77918792, "learning_rate": 3.7595325637478465e-06, "loss": 0.80169666, "num_input_tokens_seen": 32242455, "step": 1522, "time_per_iteration": 2.694709539413452 }, { "auxiliary_loss_clip": 0.01180436, "auxiliary_loss_mlp": 0.01044284, "balance_loss_clip": 1.05941474, "balance_loss_mlp": 1.03235149, "epoch": 0.18312992244333554, "flos": 28876237102080.0, "grad_norm": 1.692407141231368, "language_loss": 0.81890506, "learning_rate": 3.7591621021546723e-06, "loss": 0.84115225, "num_input_tokens_seen": 32264450, "step": 1523, "time_per_iteration": 2.685173273086548 }, { "auxiliary_loss_clip": 0.01202186, "auxiliary_loss_mlp": 0.01037372, "balance_loss_clip": 1.05966449, "balance_loss_mlp": 1.02546883, "epoch": 0.18325016533397462, "flos": 20120102801280.0, "grad_norm": 1.6772956393796443, "language_loss": 0.81283116, "learning_rate": 3.7587913736980062e-06, "loss": 0.83522671, "num_input_tokens_seen": 32284090, "step": 1524, "time_per_iteration": 3.470919609069824 }, { "auxiliary_loss_clip": 0.01126007, "auxiliary_loss_mlp": 0.01032706, "balance_loss_clip": 1.05077267, "balance_loss_mlp": 1.02074957, "epoch": 0.18337040822461373, "flos": 23329187781120.0, "grad_norm": 1.7015179050424325, "language_loss": 0.84671867, "learning_rate": 3.7584203784340865e-06, "loss": 0.8683058, "num_input_tokens_seen": 32303260, "step": 1525, "time_per_iteration": 4.591398000717163 }, { "auxiliary_loss_clip": 0.0118677, "auxiliary_loss_mlp": 0.0103589, "balance_loss_clip": 1.05749917, "balance_loss_mlp": 1.02435005, "epoch": 0.1834906511152528, "flos": 25009555881600.0, "grad_norm": 2.066340888083825, "language_loss": 0.85750115, "learning_rate": 3.7580491164191938e-06, "loss": 0.87972772, "num_input_tokens_seen": 32321570, "step": 1526, "time_per_iteration": 2.65408992767334 }, { "auxiliary_loss_clip": 0.01116099, "auxiliary_loss_mlp": 0.0100776, "balance_loss_clip": 1.0364182, "balance_loss_mlp": 1.00389755, "epoch": 0.1836108940058919, "flos": 67251493589760.0, "grad_norm": 0.750819526213013, "language_loss": 0.61236089, "learning_rate": 3.757677587709648e-06, "loss": 0.63359946, "num_input_tokens_seen": 32384835, "step": 1527, "time_per_iteration": 3.3572676181793213 }, { "auxiliary_loss_clip": 0.01172817, "auxiliary_loss_mlp": 0.01033042, "balance_loss_clip": 1.06250787, "balance_loss_mlp": 1.02123392, "epoch": 0.183731136896531, "flos": 25738721971200.0, "grad_norm": 1.9629556936350239, "language_loss": 0.75392473, "learning_rate": 3.7573057923618095e-06, "loss": 0.77598333, "num_input_tokens_seen": 32404930, "step": 1528, "time_per_iteration": 2.707627296447754 }, { "auxiliary_loss_clip": 0.01153246, "auxiliary_loss_mlp": 0.01042693, "balance_loss_clip": 1.05349755, "balance_loss_mlp": 1.02972353, "epoch": 0.1838513797871701, "flos": 20449403712000.0, "grad_norm": 1.8653548974905239, "language_loss": 0.74241859, "learning_rate": 3.7569337304320793e-06, "loss": 0.76437795, "num_input_tokens_seen": 32424515, "step": 1529, "time_per_iteration": 3.656923532485962 }, { "auxiliary_loss_clip": 0.01099173, "auxiliary_loss_mlp": 0.01005526, "balance_loss_clip": 1.03505898, "balance_loss_mlp": 1.00178266, "epoch": 0.18397162267780917, "flos": 68565141786240.0, "grad_norm": 0.8459927555988199, "language_loss": 0.64463979, "learning_rate": 3.756561401976899e-06, "loss": 0.66568679, "num_input_tokens_seen": 32484220, "step": 1530, "time_per_iteration": 3.1359896659851074 }, { "auxiliary_loss_clip": 0.01225295, "auxiliary_loss_mlp": 0.01036403, "balance_loss_clip": 1.06402802, "balance_loss_mlp": 1.02424955, "epoch": 0.18409186556844825, "flos": 31941104976000.0, "grad_norm": 1.9327809807604766, "language_loss": 0.82313091, "learning_rate": 3.7561888070527514e-06, "loss": 0.84574789, "num_input_tokens_seen": 32506260, "step": 1531, "time_per_iteration": 2.643618106842041 }, { "auxiliary_loss_clip": 0.01160418, "auxiliary_loss_mlp": 0.00714444, "balance_loss_clip": 1.05693698, "balance_loss_mlp": 1.0007143, "epoch": 0.18421210845908736, "flos": 20120533764480.0, "grad_norm": 2.493439599169795, "language_loss": 0.79871154, "learning_rate": 3.7558159457161577e-06, "loss": 0.81746018, "num_input_tokens_seen": 32524225, "step": 1532, "time_per_iteration": 2.7000017166137695 }, { "auxiliary_loss_clip": 0.01193139, "auxiliary_loss_mlp": 0.00714208, "balance_loss_clip": 1.06142712, "balance_loss_mlp": 1.00075901, "epoch": 0.18433235134972645, "flos": 23110491824640.0, "grad_norm": 2.9800269115829625, "language_loss": 0.78358221, "learning_rate": 3.755442818023681e-06, "loss": 0.8026557, "num_input_tokens_seen": 32543850, "step": 1533, "time_per_iteration": 2.721830129623413 }, { "auxiliary_loss_clip": 0.01174329, "auxiliary_loss_mlp": 0.01041657, "balance_loss_clip": 1.0582037, "balance_loss_mlp": 1.02982569, "epoch": 0.18445259424036553, "flos": 18291351617280.0, "grad_norm": 1.8511491041561965, "language_loss": 0.75963986, "learning_rate": 3.7550694240319246e-06, "loss": 0.78179973, "num_input_tokens_seen": 32561725, "step": 1534, "time_per_iteration": 2.6862361431121826 }, { "auxiliary_loss_clip": 0.01212903, "auxiliary_loss_mlp": 0.01034369, "balance_loss_clip": 1.06179929, "balance_loss_mlp": 1.02238226, "epoch": 0.18457283713100464, "flos": 21324079797120.0, "grad_norm": 2.0331234404206073, "language_loss": 0.76022148, "learning_rate": 3.7546957637975326e-06, "loss": 0.78269422, "num_input_tokens_seen": 32579135, "step": 1535, "time_per_iteration": 2.623321056365967 }, { "auxiliary_loss_clip": 0.0112753, "auxiliary_loss_mlp": 0.01039136, "balance_loss_clip": 1.04607046, "balance_loss_mlp": 1.02799618, "epoch": 0.18469308002164372, "flos": 20375679047040.0, "grad_norm": 1.6737878063826743, "language_loss": 0.74218887, "learning_rate": 3.7543218373771873e-06, "loss": 0.76385552, "num_input_tokens_seen": 32598460, "step": 1536, "time_per_iteration": 2.74283504486084 }, { "auxiliary_loss_clip": 0.01130452, "auxiliary_loss_mlp": 0.00713829, "balance_loss_clip": 1.05090082, "balance_loss_mlp": 1.00066805, "epoch": 0.1848133229122828, "flos": 26435892021120.0, "grad_norm": 1.4048188983523922, "language_loss": 0.78261596, "learning_rate": 3.753947644827615e-06, "loss": 0.80105877, "num_input_tokens_seen": 32621920, "step": 1537, "time_per_iteration": 2.836409568786621 }, { "auxiliary_loss_clip": 0.01102601, "auxiliary_loss_mlp": 0.01008607, "balance_loss_clip": 1.03475022, "balance_loss_mlp": 1.00472093, "epoch": 0.1849335658029219, "flos": 70547447612160.0, "grad_norm": 0.9311014267973381, "language_loss": 0.57249928, "learning_rate": 3.753573186205579e-06, "loss": 0.59361136, "num_input_tokens_seen": 32690040, "step": 1538, "time_per_iteration": 3.3533318042755127 }, { "auxiliary_loss_clip": 0.01176442, "auxiliary_loss_mlp": 0.00714058, "balance_loss_clip": 1.05301857, "balance_loss_mlp": 1.00065291, "epoch": 0.185053808693561, "flos": 17384140788480.0, "grad_norm": 2.1671425432533526, "language_loss": 0.78052658, "learning_rate": 3.753198461567885e-06, "loss": 0.79943156, "num_input_tokens_seen": 32707285, "step": 1539, "time_per_iteration": 2.6643290519714355 }, { "auxiliary_loss_clip": 0.01168344, "auxiliary_loss_mlp": 0.01036785, "balance_loss_clip": 1.06199408, "balance_loss_mlp": 1.02584767, "epoch": 0.18517405158420008, "flos": 28986159697920.0, "grad_norm": 2.435485087260162, "language_loss": 0.9180882, "learning_rate": 3.7528234709713783e-06, "loss": 0.94013947, "num_input_tokens_seen": 32730030, "step": 1540, "time_per_iteration": 2.7880921363830566 }, { "auxiliary_loss_clip": 0.01210266, "auxiliary_loss_mlp": 0.01032248, "balance_loss_clip": 1.06361365, "balance_loss_mlp": 1.02148974, "epoch": 0.18529429447483917, "flos": 26794962328320.0, "grad_norm": 1.857214398363654, "language_loss": 0.84234774, "learning_rate": 3.7524482144729447e-06, "loss": 0.86477286, "num_input_tokens_seen": 32749485, "step": 1541, "time_per_iteration": 2.7192656993865967 }, { "auxiliary_loss_clip": 0.01164447, "auxiliary_loss_mlp": 0.01043389, "balance_loss_clip": 1.05329919, "balance_loss_mlp": 1.03228474, "epoch": 0.18541453736547828, "flos": 13581595301760.0, "grad_norm": 2.439108092719882, "language_loss": 0.83453566, "learning_rate": 3.7520726921295106e-06, "loss": 0.85661405, "num_input_tokens_seen": 32766205, "step": 1542, "time_per_iteration": 2.737135410308838 }, { "auxiliary_loss_clip": 0.01200973, "auxiliary_loss_mlp": 0.0103405, "balance_loss_clip": 1.05785024, "balance_loss_mlp": 1.021819, "epoch": 0.18553478025611736, "flos": 24025424077440.0, "grad_norm": 4.574999264860119, "language_loss": 0.72607636, "learning_rate": 3.751696903998042e-06, "loss": 0.74842668, "num_input_tokens_seen": 32784840, "step": 1543, "time_per_iteration": 2.670257568359375 }, { "auxiliary_loss_clip": 0.01204386, "auxiliary_loss_mlp": 0.01038964, "balance_loss_clip": 1.06184292, "balance_loss_mlp": 1.02789497, "epoch": 0.18565502314675644, "flos": 25885165720320.0, "grad_norm": 1.6153496938149026, "language_loss": 0.7003113, "learning_rate": 3.7513208501355456e-06, "loss": 0.72274482, "num_input_tokens_seen": 32805945, "step": 1544, "time_per_iteration": 2.6353182792663574 }, { "auxiliary_loss_clip": 0.01189241, "auxiliary_loss_mlp": 0.01036644, "balance_loss_clip": 1.0604682, "balance_loss_mlp": 1.02574837, "epoch": 0.18577526603739553, "flos": 19610063631360.0, "grad_norm": 2.091064208122043, "language_loss": 0.83763516, "learning_rate": 3.750944530599069e-06, "loss": 0.85989392, "num_input_tokens_seen": 32825515, "step": 1545, "time_per_iteration": 2.6594810485839844 }, { "auxiliary_loss_clip": 0.01215641, "auxiliary_loss_mlp": 0.01038464, "balance_loss_clip": 1.06451488, "balance_loss_mlp": 1.02651381, "epoch": 0.18589550892803464, "flos": 18474891137280.0, "grad_norm": 2.196529650086653, "language_loss": 0.80727661, "learning_rate": 3.7505679454456992e-06, "loss": 0.82981765, "num_input_tokens_seen": 32842125, "step": 1546, "time_per_iteration": 2.581852436065674 }, { "auxiliary_loss_clip": 0.01114119, "auxiliary_loss_mlp": 0.01042786, "balance_loss_clip": 1.04986501, "balance_loss_mlp": 1.03105605, "epoch": 0.18601575181867372, "flos": 23549966726400.0, "grad_norm": 2.7041738920493965, "language_loss": 0.70163578, "learning_rate": 3.750191094732564e-06, "loss": 0.72320485, "num_input_tokens_seen": 32862990, "step": 1547, "time_per_iteration": 2.733701467514038 }, { "auxiliary_loss_clip": 0.01115325, "auxiliary_loss_mlp": 0.0071375, "balance_loss_clip": 1.04926085, "balance_loss_mlp": 1.00072944, "epoch": 0.1861359947093128, "flos": 26360192108160.0, "grad_norm": 2.476575159441208, "language_loss": 0.75400931, "learning_rate": 3.7498139785168313e-06, "loss": 0.7723, "num_input_tokens_seen": 32883595, "step": 1548, "time_per_iteration": 2.882796287536621 }, { "auxiliary_loss_clip": 0.01204399, "auxiliary_loss_mlp": 0.01034362, "balance_loss_clip": 1.06217027, "balance_loss_mlp": 1.02315056, "epoch": 0.1862562375999519, "flos": 23331198942720.0, "grad_norm": 1.806515643953037, "language_loss": 0.77330941, "learning_rate": 3.749436596855709e-06, "loss": 0.79569709, "num_input_tokens_seen": 32902895, "step": 1549, "time_per_iteration": 2.6406517028808594 }, { "auxiliary_loss_clip": 0.01197372, "auxiliary_loss_mlp": 0.01036891, "balance_loss_clip": 1.05851579, "balance_loss_mlp": 1.02576303, "epoch": 0.186376480490591, "flos": 16648222942080.0, "grad_norm": 2.019547830250684, "language_loss": 0.90400779, "learning_rate": 3.749058949806446e-06, "loss": 0.92635047, "num_input_tokens_seen": 32919620, "step": 1550, "time_per_iteration": 3.6686277389526367 }, { "auxiliary_loss_clip": 0.01207327, "auxiliary_loss_mlp": 0.0103707, "balance_loss_clip": 1.06092095, "balance_loss_mlp": 1.02569699, "epoch": 0.18649672338123008, "flos": 21468656039040.0, "grad_norm": 1.8374278560294361, "language_loss": 0.84426647, "learning_rate": 3.748681037426331e-06, "loss": 0.86671042, "num_input_tokens_seen": 32938830, "step": 1551, "time_per_iteration": 4.445532321929932 }, { "auxiliary_loss_clip": 0.0122137, "auxiliary_loss_mlp": 0.01044339, "balance_loss_clip": 1.06214309, "balance_loss_mlp": 1.03305578, "epoch": 0.1866169662718692, "flos": 12312728386560.0, "grad_norm": 2.1741163126302467, "language_loss": 0.91712677, "learning_rate": 3.7483028597726936e-06, "loss": 0.93978381, "num_input_tokens_seen": 32955600, "step": 1552, "time_per_iteration": 2.5497565269470215 }, { "auxiliary_loss_clip": 0.01172664, "auxiliary_loss_mlp": 0.0104407, "balance_loss_clip": 1.05942202, "balance_loss_mlp": 1.0311296, "epoch": 0.18673720916250827, "flos": 23581280407680.0, "grad_norm": 1.8172882282885086, "language_loss": 0.6243127, "learning_rate": 3.7479244169029017e-06, "loss": 0.64648008, "num_input_tokens_seen": 32975390, "step": 1553, "time_per_iteration": 2.7486257553100586 }, { "auxiliary_loss_clip": 0.01205298, "auxiliary_loss_mlp": 0.01040399, "balance_loss_clip": 1.05651212, "balance_loss_mlp": 1.02881217, "epoch": 0.18685745205314735, "flos": 19718370115200.0, "grad_norm": 2.324446355166273, "language_loss": 0.73155683, "learning_rate": 3.7475457088743658e-06, "loss": 0.75401384, "num_input_tokens_seen": 32992640, "step": 1554, "time_per_iteration": 3.500887632369995 }, { "auxiliary_loss_clip": 0.01179643, "auxiliary_loss_mlp": 0.0103625, "balance_loss_clip": 1.05736721, "balance_loss_mlp": 1.02537847, "epoch": 0.18697769494378644, "flos": 34204123589760.0, "grad_norm": 2.505752532179635, "language_loss": 0.74509287, "learning_rate": 3.7471667357445348e-06, "loss": 0.76725179, "num_input_tokens_seen": 33012470, "step": 1555, "time_per_iteration": 2.7385220527648926 }, { "auxiliary_loss_clip": 0.01141009, "auxiliary_loss_mlp": 0.01037591, "balance_loss_clip": 1.0562644, "balance_loss_mlp": 1.02652836, "epoch": 0.18709793783442555, "flos": 34241327101440.0, "grad_norm": 3.1051867608576043, "language_loss": 0.72143596, "learning_rate": 3.7467874975709e-06, "loss": 0.74322194, "num_input_tokens_seen": 33033275, "step": 1556, "time_per_iteration": 2.843656063079834 }, { "auxiliary_loss_clip": 0.01212997, "auxiliary_loss_mlp": 0.0104044, "balance_loss_clip": 1.06529391, "balance_loss_mlp": 1.02943087, "epoch": 0.18721818072506463, "flos": 40734550529280.0, "grad_norm": 2.181263255023496, "language_loss": 0.77796662, "learning_rate": 3.7464079944109904e-06, "loss": 0.80050099, "num_input_tokens_seen": 33055135, "step": 1557, "time_per_iteration": 2.7485554218292236 }, { "auxiliary_loss_clip": 0.0117422, "auxiliary_loss_mlp": 0.01037308, "balance_loss_clip": 1.05757856, "balance_loss_mlp": 1.02620411, "epoch": 0.18733842361570371, "flos": 22157386392960.0, "grad_norm": 38.07745082981312, "language_loss": 0.77391028, "learning_rate": 3.746028226322376e-06, "loss": 0.79602563, "num_input_tokens_seen": 33071015, "step": 1558, "time_per_iteration": 2.696925401687622 }, { "auxiliary_loss_clip": 0.01183987, "auxiliary_loss_mlp": 0.01036411, "balance_loss_clip": 1.05601859, "balance_loss_mlp": 1.0252707, "epoch": 0.18745866650634282, "flos": 18914940656640.0, "grad_norm": 1.753705925223245, "language_loss": 0.75131857, "learning_rate": 3.745648193362669e-06, "loss": 0.77352262, "num_input_tokens_seen": 33090370, "step": 1559, "time_per_iteration": 2.672760248184204 }, { "auxiliary_loss_clip": 0.01186883, "auxiliary_loss_mlp": 0.01040816, "balance_loss_clip": 1.05771971, "balance_loss_mlp": 1.0294435, "epoch": 0.1875789093969819, "flos": 19314626267520.0, "grad_norm": 2.3664516443769434, "language_loss": 0.72091877, "learning_rate": 3.745267895589518e-06, "loss": 0.74319577, "num_input_tokens_seen": 33108910, "step": 1560, "time_per_iteration": 2.6930665969848633 }, { "auxiliary_loss_clip": 0.01189843, "auxiliary_loss_mlp": 0.01040044, "balance_loss_clip": 1.06107092, "balance_loss_mlp": 1.02852225, "epoch": 0.187699152287621, "flos": 17018965169280.0, "grad_norm": 1.8804612895642345, "language_loss": 0.81932592, "learning_rate": 3.7448873330606154e-06, "loss": 0.84162486, "num_input_tokens_seen": 33126680, "step": 1561, "time_per_iteration": 2.616877794265747 }, { "auxiliary_loss_clip": 0.01160576, "auxiliary_loss_mlp": 0.01042638, "balance_loss_clip": 1.0553534, "balance_loss_mlp": 1.03074098, "epoch": 0.18781939517826007, "flos": 22346384780160.0, "grad_norm": 2.0527618863477906, "language_loss": 0.87501597, "learning_rate": 3.7445065058336914e-06, "loss": 0.89704812, "num_input_tokens_seen": 33145550, "step": 1562, "time_per_iteration": 2.7054443359375 }, { "auxiliary_loss_clip": 0.01139215, "auxiliary_loss_mlp": 0.01037714, "balance_loss_clip": 1.05060875, "balance_loss_mlp": 1.02751553, "epoch": 0.18793963806889918, "flos": 14611478054400.0, "grad_norm": 1.8301918535723534, "language_loss": 0.86212587, "learning_rate": 3.7441254139665176e-06, "loss": 0.88389516, "num_input_tokens_seen": 33161735, "step": 1563, "time_per_iteration": 2.6956534385681152 }, { "auxiliary_loss_clip": 0.01221496, "auxiliary_loss_mlp": 0.01041694, "balance_loss_clip": 1.06376588, "balance_loss_mlp": 1.03035712, "epoch": 0.18805988095953827, "flos": 17457075354240.0, "grad_norm": 2.1738019407292564, "language_loss": 0.82519948, "learning_rate": 3.743744057516905e-06, "loss": 0.84783137, "num_input_tokens_seen": 33179795, "step": 1564, "time_per_iteration": 2.619100570678711 }, { "auxiliary_loss_clip": 0.01150991, "auxiliary_loss_mlp": 0.01040081, "balance_loss_clip": 1.05403852, "balance_loss_mlp": 1.02792215, "epoch": 0.18818012385017735, "flos": 15043877976960.0, "grad_norm": 2.8785826684824443, "language_loss": 0.87243223, "learning_rate": 3.743362436542706e-06, "loss": 0.8943429, "num_input_tokens_seen": 33194485, "step": 1565, "time_per_iteration": 2.7005677223205566 }, { "auxiliary_loss_clip": 0.01220767, "auxiliary_loss_mlp": 0.01039821, "balance_loss_clip": 1.06153488, "balance_loss_mlp": 1.02869296, "epoch": 0.18830036674081646, "flos": 47551975136640.0, "grad_norm": 1.9448465627394569, "language_loss": 0.76792002, "learning_rate": 3.7429805511018115e-06, "loss": 0.79052585, "num_input_tokens_seen": 33216145, "step": 1566, "time_per_iteration": 2.8109357357025146 }, { "auxiliary_loss_clip": 0.01167707, "auxiliary_loss_mlp": 0.00714044, "balance_loss_clip": 1.0577898, "balance_loss_mlp": 1.00045252, "epoch": 0.18842060963145554, "flos": 30044626698240.0, "grad_norm": 1.8665825889094365, "language_loss": 0.78338969, "learning_rate": 3.7425984012521524e-06, "loss": 0.80220717, "num_input_tokens_seen": 33236345, "step": 1567, "time_per_iteration": 2.8697097301483154 }, { "auxiliary_loss_clip": 0.01082284, "auxiliary_loss_mlp": 0.00704417, "balance_loss_clip": 1.03249168, "balance_loss_mlp": 1.0001334, "epoch": 0.18854085252209463, "flos": 70318372625280.0, "grad_norm": 0.7426037707605297, "language_loss": 0.60442662, "learning_rate": 3.7422159870517025e-06, "loss": 0.62229359, "num_input_tokens_seen": 33301600, "step": 1568, "time_per_iteration": 3.235630989074707 }, { "auxiliary_loss_clip": 0.01184842, "auxiliary_loss_mlp": 0.01032213, "balance_loss_clip": 1.05774426, "balance_loss_mlp": 1.02138305, "epoch": 0.1886610954127337, "flos": 21289318410240.0, "grad_norm": 2.0682581528928647, "language_loss": 0.78598833, "learning_rate": 3.7418333085584717e-06, "loss": 0.80815881, "num_input_tokens_seen": 33322785, "step": 1569, "time_per_iteration": 2.7095582485198975 }, { "auxiliary_loss_clip": 0.01171542, "auxiliary_loss_mlp": 0.01037758, "balance_loss_clip": 1.0577426, "balance_loss_mlp": 1.02643895, "epoch": 0.18878133830337282, "flos": 17266819991040.0, "grad_norm": 2.1221392531026577, "language_loss": 0.91023326, "learning_rate": 3.7414503658305128e-06, "loss": 0.93232632, "num_input_tokens_seen": 33340020, "step": 1570, "time_per_iteration": 2.63332462310791 }, { "auxiliary_loss_clip": 0.01159039, "auxiliary_loss_mlp": 0.01043599, "balance_loss_clip": 1.05271792, "balance_loss_mlp": 1.03212476, "epoch": 0.1889015811940119, "flos": 25775207210880.0, "grad_norm": 2.2409847749336302, "language_loss": 0.77452934, "learning_rate": 3.7410671589259185e-06, "loss": 0.7965557, "num_input_tokens_seen": 33358620, "step": 1571, "time_per_iteration": 2.71614670753479 }, { "auxiliary_loss_clip": 0.01223509, "auxiliary_loss_mlp": 0.01038202, "balance_loss_clip": 1.06355071, "balance_loss_mlp": 1.02683556, "epoch": 0.18902182408465099, "flos": 21032197879680.0, "grad_norm": 2.005743821240342, "language_loss": 0.79560643, "learning_rate": 3.7406836879028205e-06, "loss": 0.81822354, "num_input_tokens_seen": 33378845, "step": 1572, "time_per_iteration": 2.578045606613159 }, { "auxiliary_loss_clip": 0.01205052, "auxiliary_loss_mlp": 0.01041573, "balance_loss_clip": 1.06242943, "balance_loss_mlp": 1.03033733, "epoch": 0.1891420669752901, "flos": 22272121411200.0, "grad_norm": 2.2060261964067656, "language_loss": 0.76546907, "learning_rate": 3.7402999528193907e-06, "loss": 0.78793526, "num_input_tokens_seen": 33398345, "step": 1573, "time_per_iteration": 2.625089168548584 }, { "auxiliary_loss_clip": 0.01158437, "auxiliary_loss_mlp": 0.00713924, "balance_loss_clip": 1.0566473, "balance_loss_mlp": 1.00049543, "epoch": 0.18926230986592918, "flos": 22017802141440.0, "grad_norm": 2.635823556844008, "language_loss": 0.85541725, "learning_rate": 3.739915953733842e-06, "loss": 0.87414086, "num_input_tokens_seen": 33416390, "step": 1574, "time_per_iteration": 2.6767046451568604 }, { "auxiliary_loss_clip": 0.01217914, "auxiliary_loss_mlp": 0.01036818, "balance_loss_clip": 1.06083047, "balance_loss_mlp": 1.02531481, "epoch": 0.18938255275656826, "flos": 24462672336000.0, "grad_norm": 1.6545673273803172, "language_loss": 0.82059646, "learning_rate": 3.7395316907044264e-06, "loss": 0.84314382, "num_input_tokens_seen": 33437175, "step": 1575, "time_per_iteration": 2.639601230621338 }, { "auxiliary_loss_clip": 0.01204786, "auxiliary_loss_mlp": 0.01036927, "balance_loss_clip": 1.05883551, "balance_loss_mlp": 1.02503562, "epoch": 0.18950279564720737, "flos": 24427049022720.0, "grad_norm": 1.6457028793396757, "language_loss": 0.79589105, "learning_rate": 3.7391471637894364e-06, "loss": 0.81830812, "num_input_tokens_seen": 33459440, "step": 1576, "time_per_iteration": 3.5846149921417236 }, { "auxiliary_loss_clip": 0.01169847, "auxiliary_loss_mlp": 0.0104534, "balance_loss_clip": 1.05404997, "balance_loss_mlp": 1.03462911, "epoch": 0.18962303853784646, "flos": 19756291898880.0, "grad_norm": 1.7336132026267828, "language_loss": 0.84887183, "learning_rate": 3.738762373047205e-06, "loss": 0.87102377, "num_input_tokens_seen": 33479360, "step": 1577, "time_per_iteration": 3.586150884628296 }, { "auxiliary_loss_clip": 0.01174147, "auxiliary_loss_mlp": 0.01044195, "balance_loss_clip": 1.05792522, "balance_loss_mlp": 1.03338909, "epoch": 0.18974328142848554, "flos": 21032054225280.0, "grad_norm": 3.004489049079369, "language_loss": 0.83220929, "learning_rate": 3.738377318536103e-06, "loss": 0.85439265, "num_input_tokens_seen": 33499245, "step": 1578, "time_per_iteration": 2.7299389839172363 }, { "auxiliary_loss_clip": 0.01217677, "auxiliary_loss_mlp": 0.01032877, "balance_loss_clip": 1.06142807, "balance_loss_mlp": 1.02202916, "epoch": 0.18986352431912462, "flos": 12966122736000.0, "grad_norm": 2.4007758354611655, "language_loss": 0.71057332, "learning_rate": 3.7379920003145447e-06, "loss": 0.73307884, "num_input_tokens_seen": 33513520, "step": 1579, "time_per_iteration": 2.5612375736236572 }, { "auxiliary_loss_clip": 0.01182448, "auxiliary_loss_mlp": 0.01042958, "balance_loss_clip": 1.06094837, "balance_loss_mlp": 1.03116775, "epoch": 0.18998376720976373, "flos": 23767908497280.0, "grad_norm": 1.658513406966128, "language_loss": 0.83717448, "learning_rate": 3.7376064184409817e-06, "loss": 0.85942852, "num_input_tokens_seen": 33533100, "step": 1580, "time_per_iteration": 3.6344668865203857 }, { "auxiliary_loss_clip": 0.0118545, "auxiliary_loss_mlp": 0.01039484, "balance_loss_clip": 1.06041479, "balance_loss_mlp": 1.02770603, "epoch": 0.19010401010040281, "flos": 22966023323520.0, "grad_norm": 2.140785240380155, "language_loss": 0.87005037, "learning_rate": 3.7372205729739063e-06, "loss": 0.89229977, "num_input_tokens_seen": 33554915, "step": 1581, "time_per_iteration": 2.6875407695770264 }, { "auxiliary_loss_clip": 0.01208126, "auxiliary_loss_mlp": 0.01041227, "balance_loss_clip": 1.0609746, "balance_loss_mlp": 1.02937722, "epoch": 0.1902242529910419, "flos": 19135647774720.0, "grad_norm": 2.6962858241629144, "language_loss": 0.71486127, "learning_rate": 3.7368344639718514e-06, "loss": 0.73735476, "num_input_tokens_seen": 33572850, "step": 1582, "time_per_iteration": 2.576963186264038 }, { "auxiliary_loss_clip": 0.01205122, "auxiliary_loss_mlp": 0.01037954, "balance_loss_clip": 1.05949354, "balance_loss_mlp": 1.02689743, "epoch": 0.190344495881681, "flos": 25483935824640.0, "grad_norm": 1.5794265606425222, "language_loss": 0.80470127, "learning_rate": 3.7364480914933895e-06, "loss": 0.82713205, "num_input_tokens_seen": 33593090, "step": 1583, "time_per_iteration": 2.6696178913116455 }, { "auxiliary_loss_clip": 0.01147789, "auxiliary_loss_mlp": 0.00714236, "balance_loss_clip": 1.05294836, "balance_loss_mlp": 1.0005306, "epoch": 0.1904647387723201, "flos": 26792843425920.0, "grad_norm": 1.8509366294676703, "language_loss": 0.81358075, "learning_rate": 3.7360614555971325e-06, "loss": 0.832201, "num_input_tokens_seen": 33612745, "step": 1584, "time_per_iteration": 2.717888832092285 }, { "auxiliary_loss_clip": 0.01203726, "auxiliary_loss_mlp": 0.00713299, "balance_loss_clip": 1.06139278, "balance_loss_mlp": 1.00046647, "epoch": 0.19058498166295917, "flos": 23987753688960.0, "grad_norm": 1.7399735973417276, "language_loss": 0.84960282, "learning_rate": 3.735674556341733e-06, "loss": 0.8687731, "num_input_tokens_seen": 33632360, "step": 1585, "time_per_iteration": 2.6326587200164795 }, { "auxiliary_loss_clip": 0.01186572, "auxiliary_loss_mlp": 0.01038225, "balance_loss_clip": 1.06276488, "balance_loss_mlp": 1.02626216, "epoch": 0.19070522455359826, "flos": 28293299280000.0, "grad_norm": 2.1360155493484445, "language_loss": 0.82788032, "learning_rate": 3.7352873937858835e-06, "loss": 0.85012829, "num_input_tokens_seen": 33653895, "step": 1586, "time_per_iteration": 2.702455759048462 }, { "auxiliary_loss_clip": 0.01165398, "auxiliary_loss_mlp": 0.00713523, "balance_loss_clip": 1.0581857, "balance_loss_mlp": 1.00052679, "epoch": 0.19082546744423737, "flos": 25660220797440.0, "grad_norm": 2.0317833875602096, "language_loss": 0.71544582, "learning_rate": 3.734899967988316e-06, "loss": 0.73423505, "num_input_tokens_seen": 33672075, "step": 1587, "time_per_iteration": 2.7335708141326904 }, { "auxiliary_loss_clip": 0.0115933, "auxiliary_loss_mlp": 0.01031777, "balance_loss_clip": 1.05294716, "balance_loss_mlp": 1.01999283, "epoch": 0.19094571033487645, "flos": 19719483436800.0, "grad_norm": 1.749900447071964, "language_loss": 0.83798057, "learning_rate": 3.7345122790078026e-06, "loss": 0.85989165, "num_input_tokens_seen": 33689640, "step": 1588, "time_per_iteration": 2.6747920513153076 }, { "auxiliary_loss_clip": 0.01201953, "auxiliary_loss_mlp": 0.01035804, "balance_loss_clip": 1.05947697, "balance_loss_mlp": 1.02455664, "epoch": 0.19106595322551553, "flos": 21616320850560.0, "grad_norm": 5.194164440944247, "language_loss": 0.9297207, "learning_rate": 3.7341243269031556e-06, "loss": 0.95209831, "num_input_tokens_seen": 33708630, "step": 1589, "time_per_iteration": 2.607897996902466 }, { "auxiliary_loss_clip": 0.01177719, "auxiliary_loss_mlp": 0.01041643, "balance_loss_clip": 1.05782557, "balance_loss_mlp": 1.03106308, "epoch": 0.19118619611615464, "flos": 29896890059520.0, "grad_norm": 1.6419991876782247, "language_loss": 0.77246642, "learning_rate": 3.7337361117332275e-06, "loss": 0.79466009, "num_input_tokens_seen": 33730370, "step": 1590, "time_per_iteration": 2.73803973197937 }, { "auxiliary_loss_clip": 0.0117225, "auxiliary_loss_mlp": 0.01032485, "balance_loss_clip": 1.05535662, "balance_loss_mlp": 1.02147579, "epoch": 0.19130643900679373, "flos": 17273428093440.0, "grad_norm": 1.9805511788448764, "language_loss": 0.76987904, "learning_rate": 3.7333476335569087e-06, "loss": 0.79192638, "num_input_tokens_seen": 33748370, "step": 1591, "time_per_iteration": 2.6518850326538086 }, { "auxiliary_loss_clip": 0.01184764, "auxiliary_loss_mlp": 0.01040212, "balance_loss_clip": 1.05743051, "balance_loss_mlp": 1.02878582, "epoch": 0.1914266818974328, "flos": 24826339584000.0, "grad_norm": 2.0735032470105064, "language_loss": 0.66505557, "learning_rate": 3.7329588924331325e-06, "loss": 0.68730533, "num_input_tokens_seen": 33769575, "step": 1592, "time_per_iteration": 2.6472463607788086 }, { "auxiliary_loss_clip": 0.01159758, "auxiliary_loss_mlp": 0.01033934, "balance_loss_clip": 1.0518657, "balance_loss_mlp": 1.02287173, "epoch": 0.1915469247880719, "flos": 18952467390720.0, "grad_norm": 1.7483780902168302, "language_loss": 0.8234576, "learning_rate": 3.732569888420871e-06, "loss": 0.84539449, "num_input_tokens_seen": 33789110, "step": 1593, "time_per_iteration": 2.646359920501709 }, { "auxiliary_loss_clip": 0.01218025, "auxiliary_loss_mlp": 0.01040767, "balance_loss_clip": 1.05803394, "balance_loss_mlp": 1.02925122, "epoch": 0.191667167678711, "flos": 21032952065280.0, "grad_norm": 2.5007822299634896, "language_loss": 0.82658803, "learning_rate": 3.732180621579134e-06, "loss": 0.84917599, "num_input_tokens_seen": 33808325, "step": 1594, "time_per_iteration": 2.6350927352905273 }, { "auxiliary_loss_clip": 0.01180765, "auxiliary_loss_mlp": 0.01044215, "balance_loss_clip": 1.05930352, "balance_loss_mlp": 1.03333688, "epoch": 0.1917874105693501, "flos": 34237663914240.0, "grad_norm": 1.8314709735167685, "language_loss": 0.8130253, "learning_rate": 3.7317910919669745e-06, "loss": 0.83527511, "num_input_tokens_seen": 33829520, "step": 1595, "time_per_iteration": 2.755828619003296 }, { "auxiliary_loss_clip": 0.01202507, "auxiliary_loss_mlp": 0.01037126, "balance_loss_clip": 1.06197321, "balance_loss_mlp": 1.0264982, "epoch": 0.19190765345998917, "flos": 23550613171200.0, "grad_norm": 2.2563121046882784, "language_loss": 0.75748777, "learning_rate": 3.7314012996434826e-06, "loss": 0.7798841, "num_input_tokens_seen": 33848250, "step": 1596, "time_per_iteration": 2.642859697341919 }, { "auxiliary_loss_clip": 0.01187521, "auxiliary_loss_mlp": 0.01035282, "balance_loss_clip": 1.05970991, "balance_loss_mlp": 1.0244813, "epoch": 0.19202789635062828, "flos": 19861330245120.0, "grad_norm": 1.9521578920102933, "language_loss": 0.81330985, "learning_rate": 3.7310112446677907e-06, "loss": 0.83553791, "num_input_tokens_seen": 33866160, "step": 1597, "time_per_iteration": 2.6643354892730713 }, { "auxiliary_loss_clip": 0.01221811, "auxiliary_loss_mlp": 0.01034981, "balance_loss_clip": 1.06356156, "balance_loss_mlp": 1.0235908, "epoch": 0.19214813924126736, "flos": 20922957642240.0, "grad_norm": 1.9935335074277003, "language_loss": 0.69226539, "learning_rate": 3.7306209270990695e-06, "loss": 0.71483326, "num_input_tokens_seen": 33884165, "step": 1598, "time_per_iteration": 2.626368999481201 }, { "auxiliary_loss_clip": 0.01185264, "auxiliary_loss_mlp": 0.010366, "balance_loss_clip": 1.05834365, "balance_loss_mlp": 1.02615142, "epoch": 0.19226838213190645, "flos": 26359725231360.0, "grad_norm": 1.7547792149122368, "language_loss": 0.86540002, "learning_rate": 3.7302303469965292e-06, "loss": 0.88761866, "num_input_tokens_seen": 33903705, "step": 1599, "time_per_iteration": 2.74992299079895 }, { "auxiliary_loss_clip": 0.01204473, "auxiliary_loss_mlp": 0.01035759, "balance_loss_clip": 1.06122816, "balance_loss_mlp": 1.02427912, "epoch": 0.19238862502254553, "flos": 20850525866880.0, "grad_norm": 1.7903103150307875, "language_loss": 0.70593601, "learning_rate": 3.7298395044194206e-06, "loss": 0.72833836, "num_input_tokens_seen": 33922515, "step": 1600, "time_per_iteration": 2.6941444873809814 }, { "auxiliary_loss_clip": 0.01220646, "auxiliary_loss_mlp": 0.01038748, "balance_loss_clip": 1.06306434, "balance_loss_mlp": 1.02732205, "epoch": 0.19250886791318464, "flos": 21726063878400.0, "grad_norm": 2.1493490229834222, "language_loss": 0.94298661, "learning_rate": 3.7294483994270356e-06, "loss": 0.96558052, "num_input_tokens_seen": 33940840, "step": 1601, "time_per_iteration": 2.553574800491333 }, { "auxiliary_loss_clip": 0.01145251, "auxiliary_loss_mlp": 0.01034076, "balance_loss_clip": 1.05395937, "balance_loss_mlp": 1.02284014, "epoch": 0.19262911080382372, "flos": 23367827836800.0, "grad_norm": 1.9816795573757966, "language_loss": 0.77615595, "learning_rate": 3.7290570320787033e-06, "loss": 0.79794925, "num_input_tokens_seen": 33960420, "step": 1602, "time_per_iteration": 3.664581775665283 }, { "auxiliary_loss_clip": 0.0120112, "auxiliary_loss_mlp": 0.01041277, "balance_loss_clip": 1.06192052, "balance_loss_mlp": 1.03048849, "epoch": 0.1927493536944628, "flos": 21943502858880.0, "grad_norm": 2.03080680673627, "language_loss": 0.71078795, "learning_rate": 3.728665402433793e-06, "loss": 0.73321187, "num_input_tokens_seen": 33978990, "step": 1603, "time_per_iteration": 4.28532338142395 }, { "auxiliary_loss_clip": 0.0118315, "auxiliary_loss_mlp": 0.01037594, "balance_loss_clip": 1.05963969, "balance_loss_mlp": 1.02703214, "epoch": 0.19286959658510192, "flos": 16545590807040.0, "grad_norm": 2.1855029760453575, "language_loss": 0.86573309, "learning_rate": 3.7282735105517164e-06, "loss": 0.88794053, "num_input_tokens_seen": 33997115, "step": 1604, "time_per_iteration": 2.6566412448883057 }, { "auxiliary_loss_clip": 0.01160015, "auxiliary_loss_mlp": 0.0103254, "balance_loss_clip": 1.0541203, "balance_loss_mlp": 1.02060103, "epoch": 0.192989839475741, "flos": 21616967295360.0, "grad_norm": 2.894129956222234, "language_loss": 0.67103291, "learning_rate": 3.727881356491922e-06, "loss": 0.69295847, "num_input_tokens_seen": 34015525, "step": 1605, "time_per_iteration": 2.7078943252563477 }, { "auxiliary_loss_clip": 0.01219665, "auxiliary_loss_mlp": 0.01037001, "balance_loss_clip": 1.06482899, "balance_loss_mlp": 1.02634335, "epoch": 0.19311008236638008, "flos": 19281516906240.0, "grad_norm": 2.1733709654938367, "language_loss": 0.75771046, "learning_rate": 3.7274889403139002e-06, "loss": 0.78027713, "num_input_tokens_seen": 34033150, "step": 1606, "time_per_iteration": 2.594774007797241 }, { "auxiliary_loss_clip": 0.01151251, "auxiliary_loss_mlp": 0.01033278, "balance_loss_clip": 1.05832386, "balance_loss_mlp": 1.02257299, "epoch": 0.1932303252570192, "flos": 28652369587200.0, "grad_norm": 4.7202209581775, "language_loss": 0.78834176, "learning_rate": 3.727096262077179e-06, "loss": 0.8101871, "num_input_tokens_seen": 34052145, "step": 1607, "time_per_iteration": 3.6695942878723145 }, { "auxiliary_loss_clip": 0.01203237, "auxiliary_loss_mlp": 0.01035769, "balance_loss_clip": 1.06123281, "balance_loss_mlp": 1.02492654, "epoch": 0.19335056814765827, "flos": 18368990864640.0, "grad_norm": 1.7641084875278898, "language_loss": 0.85313332, "learning_rate": 3.7267033218413285e-06, "loss": 0.87552345, "num_input_tokens_seen": 34069940, "step": 1608, "time_per_iteration": 2.633733034133911 }, { "auxiliary_loss_clip": 0.01132917, "auxiliary_loss_mlp": 0.0104355, "balance_loss_clip": 1.04785037, "balance_loss_mlp": 1.0323329, "epoch": 0.19347081103829736, "flos": 13260877741440.0, "grad_norm": 2.1128409831589896, "language_loss": 0.80755389, "learning_rate": 3.726310119665957e-06, "loss": 0.82931852, "num_input_tokens_seen": 34086275, "step": 1609, "time_per_iteration": 2.7182605266571045 }, { "auxiliary_loss_clip": 0.01200966, "auxiliary_loss_mlp": 0.01033759, "balance_loss_clip": 1.06019616, "balance_loss_mlp": 1.02300632, "epoch": 0.19359105392893644, "flos": 20300122788480.0, "grad_norm": 1.8122528188376987, "language_loss": 0.85598683, "learning_rate": 3.725916655610713e-06, "loss": 0.87833416, "num_input_tokens_seen": 34105605, "step": 1610, "time_per_iteration": 2.730788469314575 }, { "auxiliary_loss_clip": 0.01173842, "auxiliary_loss_mlp": 0.01035721, "balance_loss_clip": 1.05384743, "balance_loss_mlp": 1.02477789, "epoch": 0.19371129681957555, "flos": 20484596062080.0, "grad_norm": 6.810786673646129, "language_loss": 0.75193346, "learning_rate": 3.725522929735284e-06, "loss": 0.77402914, "num_input_tokens_seen": 34122540, "step": 1611, "time_per_iteration": 2.635021686553955 }, { "auxiliary_loss_clip": 0.01186085, "auxiliary_loss_mlp": 0.01036192, "balance_loss_clip": 1.05316162, "balance_loss_mlp": 1.02502775, "epoch": 0.19383153971021463, "flos": 30445497457920.0, "grad_norm": 2.0304147372983725, "language_loss": 0.74459738, "learning_rate": 3.725128942099399e-06, "loss": 0.76682013, "num_input_tokens_seen": 34142940, "step": 1612, "time_per_iteration": 2.797335386276245 }, { "auxiliary_loss_clip": 0.01176052, "auxiliary_loss_mlp": 0.01036019, "balance_loss_clip": 1.05536389, "balance_loss_mlp": 1.02520037, "epoch": 0.19395178260085372, "flos": 24569937325440.0, "grad_norm": 1.691875654797462, "language_loss": 0.79855263, "learning_rate": 3.7247346927628245e-06, "loss": 0.82067335, "num_input_tokens_seen": 34162875, "step": 1613, "time_per_iteration": 2.703458547592163 }, { "auxiliary_loss_clip": 0.01183798, "auxiliary_loss_mlp": 0.00714213, "balance_loss_clip": 1.05813646, "balance_loss_mlp": 1.0003823, "epoch": 0.19407202549149283, "flos": 28950608211840.0, "grad_norm": 1.749343046193309, "language_loss": 0.78904372, "learning_rate": 3.7243401817853694e-06, "loss": 0.80802387, "num_input_tokens_seen": 34183565, "step": 1614, "time_per_iteration": 2.7470972537994385 }, { "auxiliary_loss_clip": 0.01188344, "auxiliary_loss_mlp": 0.01036803, "balance_loss_clip": 1.05395663, "balance_loss_mlp": 1.02594864, "epoch": 0.1941922683821319, "flos": 18004497603840.0, "grad_norm": 1.916034855883715, "language_loss": 0.71330774, "learning_rate": 3.723945409226879e-06, "loss": 0.73555923, "num_input_tokens_seen": 34202055, "step": 1615, "time_per_iteration": 2.5898585319519043 }, { "auxiliary_loss_clip": 0.01197478, "auxiliary_loss_mlp": 0.01036435, "balance_loss_clip": 1.05534053, "balance_loss_mlp": 1.02538431, "epoch": 0.194312511272771, "flos": 9720337034880.0, "grad_norm": 2.26700378018001, "language_loss": 0.79828823, "learning_rate": 3.723550375147241e-06, "loss": 0.82062739, "num_input_tokens_seen": 34216830, "step": 1616, "time_per_iteration": 2.593966245651245 }, { "auxiliary_loss_clip": 0.011532, "auxiliary_loss_mlp": 0.01039125, "balance_loss_clip": 1.05030453, "balance_loss_mlp": 1.0275737, "epoch": 0.19443275416341008, "flos": 27016208150400.0, "grad_norm": 1.84833207669334, "language_loss": 0.80149287, "learning_rate": 3.7231550796063816e-06, "loss": 0.82341611, "num_input_tokens_seen": 34236840, "step": 1617, "time_per_iteration": 2.702533006668091 }, { "auxiliary_loss_clip": 0.01189371, "auxiliary_loss_mlp": 0.01035798, "balance_loss_clip": 1.05862045, "balance_loss_mlp": 1.0243535, "epoch": 0.1945529970540492, "flos": 15846625077120.0, "grad_norm": 1.8699219196411925, "language_loss": 0.64919573, "learning_rate": 3.722759522664266e-06, "loss": 0.6714474, "num_input_tokens_seen": 34254140, "step": 1618, "time_per_iteration": 2.6086690425872803 }, { "auxiliary_loss_clip": 0.01151684, "auxiliary_loss_mlp": 0.01031825, "balance_loss_clip": 1.05354595, "balance_loss_mlp": 1.02147818, "epoch": 0.19467323994468827, "flos": 19314985403520.0, "grad_norm": 2.75218590578657, "language_loss": 0.81625211, "learning_rate": 3.7223637043809016e-06, "loss": 0.83808714, "num_input_tokens_seen": 34273120, "step": 1619, "time_per_iteration": 2.7300946712493896 }, { "auxiliary_loss_clip": 0.01172713, "auxiliary_loss_mlp": 0.01039863, "balance_loss_clip": 1.05698502, "balance_loss_mlp": 1.02905643, "epoch": 0.19479348283532735, "flos": 24133227770880.0, "grad_norm": 1.8442990504969645, "language_loss": 0.86549973, "learning_rate": 3.7219676248163322e-06, "loss": 0.88762546, "num_input_tokens_seen": 34290285, "step": 1620, "time_per_iteration": 2.683486223220825 }, { "auxiliary_loss_clip": 0.01201987, "auxiliary_loss_mlp": 0.01035349, "balance_loss_clip": 1.05907524, "balance_loss_mlp": 1.02457297, "epoch": 0.19491372572596646, "flos": 25775638174080.0, "grad_norm": 1.8355449911536337, "language_loss": 0.93182766, "learning_rate": 3.721571284030643e-06, "loss": 0.95420098, "num_input_tokens_seen": 34310095, "step": 1621, "time_per_iteration": 2.6546478271484375 }, { "auxiliary_loss_clip": 0.0120087, "auxiliary_loss_mlp": 0.01031324, "balance_loss_clip": 1.05827498, "balance_loss_mlp": 1.02066064, "epoch": 0.19503396861660555, "flos": 19645220067840.0, "grad_norm": 2.075707167544059, "language_loss": 0.78700578, "learning_rate": 3.7211746820839587e-06, "loss": 0.80932772, "num_input_tokens_seen": 34327190, "step": 1622, "time_per_iteration": 2.6338329315185547 }, { "auxiliary_loss_clip": 0.01098209, "auxiliary_loss_mlp": 0.01035696, "balance_loss_clip": 1.04432821, "balance_loss_mlp": 1.0248003, "epoch": 0.19515421150724463, "flos": 21033023892480.0, "grad_norm": 1.5636105296306606, "language_loss": 0.80745113, "learning_rate": 3.7207778190364437e-06, "loss": 0.82879019, "num_input_tokens_seen": 34345615, "step": 1623, "time_per_iteration": 2.774749755859375 }, { "auxiliary_loss_clip": 0.01115003, "auxiliary_loss_mlp": 0.01035189, "balance_loss_clip": 1.04686308, "balance_loss_mlp": 1.02458572, "epoch": 0.1952744543978837, "flos": 32961255143040.0, "grad_norm": 1.5306674289997548, "language_loss": 0.73787463, "learning_rate": 3.720380694948302e-06, "loss": 0.75937653, "num_input_tokens_seen": 34368500, "step": 1624, "time_per_iteration": 2.8178153038024902 }, { "auxiliary_loss_clip": 0.01078487, "auxiliary_loss_mlp": 0.01021335, "balance_loss_clip": 1.03770852, "balance_loss_mlp": 1.01768708, "epoch": 0.19539469728852282, "flos": 64044312030720.0, "grad_norm": 1.0349151888321853, "language_loss": 0.71221209, "learning_rate": 3.719983309879777e-06, "loss": 0.73321033, "num_input_tokens_seen": 34428280, "step": 1625, "time_per_iteration": 3.2774603366851807 }, { "auxiliary_loss_clip": 0.01153499, "auxiliary_loss_mlp": 0.01033448, "balance_loss_clip": 1.0528152, "balance_loss_mlp": 1.0226717, "epoch": 0.1955149401791619, "flos": 13370908078080.0, "grad_norm": 2.3650120624961324, "language_loss": 0.77377844, "learning_rate": 3.719585663891151e-06, "loss": 0.79564792, "num_input_tokens_seen": 34445815, "step": 1626, "time_per_iteration": 2.7549874782562256 }, { "auxiliary_loss_clip": 0.01142051, "auxiliary_loss_mlp": 0.01040843, "balance_loss_clip": 1.05360961, "balance_loss_mlp": 1.02933359, "epoch": 0.195635183069801, "flos": 18728887184640.0, "grad_norm": 2.271339388597416, "language_loss": 0.78771126, "learning_rate": 3.719187757042747e-06, "loss": 0.80954021, "num_input_tokens_seen": 34463635, "step": 1627, "time_per_iteration": 2.7966136932373047 }, { "auxiliary_loss_clip": 0.01103513, "auxiliary_loss_mlp": 0.01004832, "balance_loss_clip": 1.04049897, "balance_loss_mlp": 1.00156546, "epoch": 0.1957554259604401, "flos": 69313952615040.0, "grad_norm": 0.7233298385946304, "language_loss": 0.54953682, "learning_rate": 3.7187895893949275e-06, "loss": 0.5706203, "num_input_tokens_seen": 34530105, "step": 1628, "time_per_iteration": 5.041233539581299 }, { "auxiliary_loss_clip": 0.0113428, "auxiliary_loss_mlp": 0.01034841, "balance_loss_clip": 1.04919648, "balance_loss_mlp": 1.02416539, "epoch": 0.19587566885107918, "flos": 21069257736960.0, "grad_norm": 2.760592256613911, "language_loss": 0.76469421, "learning_rate": 3.7183911610080937e-06, "loss": 0.78638542, "num_input_tokens_seen": 34546970, "step": 1629, "time_per_iteration": 3.4910619258880615 }, { "auxiliary_loss_clip": 0.01169993, "auxiliary_loss_mlp": 0.01040195, "balance_loss_clip": 1.05729198, "balance_loss_mlp": 1.02871549, "epoch": 0.19599591174171827, "flos": 22194661731840.0, "grad_norm": 2.68189608647863, "language_loss": 0.74992079, "learning_rate": 3.7179924719426872e-06, "loss": 0.77202272, "num_input_tokens_seen": 34564865, "step": 1630, "time_per_iteration": 2.6808645725250244 }, { "auxiliary_loss_clip": 0.01201025, "auxiliary_loss_mlp": 0.01040766, "balance_loss_clip": 1.05983257, "balance_loss_mlp": 1.02929258, "epoch": 0.19611615463235738, "flos": 23768375374080.0, "grad_norm": 2.292516185928717, "language_loss": 0.76022851, "learning_rate": 3.7175935222591885e-06, "loss": 0.78264642, "num_input_tokens_seen": 34584165, "step": 1631, "time_per_iteration": 2.6217548847198486 }, { "auxiliary_loss_clip": 0.01191287, "auxiliary_loss_mlp": 0.01036529, "balance_loss_clip": 1.06353962, "balance_loss_mlp": 1.02499557, "epoch": 0.19623639752299646, "flos": 28618218731520.0, "grad_norm": 1.8258755753804226, "language_loss": 0.74358702, "learning_rate": 3.717194312018118e-06, "loss": 0.76586509, "num_input_tokens_seen": 34603150, "step": 1632, "time_per_iteration": 3.6354284286499023 }, { "auxiliary_loss_clip": 0.01203988, "auxiliary_loss_mlp": 0.01039109, "balance_loss_clip": 1.06143165, "balance_loss_mlp": 1.02787375, "epoch": 0.19635664041363554, "flos": 21032700670080.0, "grad_norm": 2.179068186589718, "language_loss": 0.76008099, "learning_rate": 3.716794841280036e-06, "loss": 0.78251195, "num_input_tokens_seen": 34621855, "step": 1633, "time_per_iteration": 2.594757318496704 }, { "auxiliary_loss_clip": 0.0120496, "auxiliary_loss_mlp": 0.01038458, "balance_loss_clip": 1.05944288, "balance_loss_mlp": 1.02698374, "epoch": 0.19647688330427462, "flos": 18879748306560.0, "grad_norm": 2.720354217420318, "language_loss": 0.77521861, "learning_rate": 3.7163951101055407e-06, "loss": 0.79765284, "num_input_tokens_seen": 34639915, "step": 1634, "time_per_iteration": 2.6244020462036133 }, { "auxiliary_loss_clip": 0.01181038, "auxiliary_loss_mlp": 0.01038726, "balance_loss_clip": 1.05928671, "balance_loss_mlp": 1.02745533, "epoch": 0.19659712619491373, "flos": 24242503921920.0, "grad_norm": 1.9812406795332986, "language_loss": 0.79026192, "learning_rate": 3.715995118555273e-06, "loss": 0.81245959, "num_input_tokens_seen": 34659890, "step": 1635, "time_per_iteration": 2.6311981678009033 }, { "auxiliary_loss_clip": 0.01150435, "auxiliary_loss_mlp": 0.01039619, "balance_loss_clip": 1.05444384, "balance_loss_mlp": 1.0279659, "epoch": 0.19671736908555282, "flos": 24717422568960.0, "grad_norm": 1.9538824701723458, "language_loss": 0.86029887, "learning_rate": 3.71559486668991e-06, "loss": 0.88219947, "num_input_tokens_seen": 34678750, "step": 1636, "time_per_iteration": 2.7420542240142822 }, { "auxiliary_loss_clip": 0.01205308, "auxiliary_loss_mlp": 0.00713237, "balance_loss_clip": 1.06084657, "balance_loss_mlp": 1.00047123, "epoch": 0.1968376119761919, "flos": 23842279607040.0, "grad_norm": 1.5910368404449167, "language_loss": 0.77586877, "learning_rate": 3.715194354570169e-06, "loss": 0.79505426, "num_input_tokens_seen": 34698755, "step": 1637, "time_per_iteration": 2.656026601791382 }, { "auxiliary_loss_clip": 0.01200796, "auxiliary_loss_mlp": 0.01044745, "balance_loss_clip": 1.06298685, "balance_loss_mlp": 1.03386104, "epoch": 0.196957854866831, "flos": 18113917409280.0, "grad_norm": 1.8883691837626564, "language_loss": 0.83237433, "learning_rate": 3.714793582256809e-06, "loss": 0.85482973, "num_input_tokens_seen": 34715820, "step": 1638, "time_per_iteration": 2.592679500579834 }, { "auxiliary_loss_clip": 0.01219165, "auxiliary_loss_mlp": 0.01037585, "balance_loss_clip": 1.06291842, "balance_loss_mlp": 1.0267787, "epoch": 0.1970780977574701, "flos": 21653129312640.0, "grad_norm": 2.3820185148226205, "language_loss": 0.84656924, "learning_rate": 3.7143925498106253e-06, "loss": 0.86913681, "num_input_tokens_seen": 34734360, "step": 1639, "time_per_iteration": 2.5551061630249023 }, { "auxiliary_loss_clip": 0.01181558, "auxiliary_loss_mlp": 0.01042593, "balance_loss_clip": 1.05398297, "balance_loss_mlp": 1.03099394, "epoch": 0.19719834064810918, "flos": 20811813984000.0, "grad_norm": 1.921619028845954, "language_loss": 0.79434496, "learning_rate": 3.7139912572924558e-06, "loss": 0.81658643, "num_input_tokens_seen": 34753390, "step": 1640, "time_per_iteration": 2.6563074588775635 }, { "auxiliary_loss_clip": 0.01195871, "auxiliary_loss_mlp": 0.01035368, "balance_loss_clip": 1.05540657, "balance_loss_mlp": 1.02461553, "epoch": 0.19731858353874826, "flos": 23434800744960.0, "grad_norm": 2.6558334352249835, "language_loss": 0.80086577, "learning_rate": 3.7135897047631744e-06, "loss": 0.82317817, "num_input_tokens_seen": 34771275, "step": 1641, "time_per_iteration": 2.600938558578491 }, { "auxiliary_loss_clip": 0.0118873, "auxiliary_loss_mlp": 0.01034188, "balance_loss_clip": 1.05869758, "balance_loss_mlp": 1.02308381, "epoch": 0.19743882642938737, "flos": 23988184652160.0, "grad_norm": 1.9393992095499277, "language_loss": 0.76434338, "learning_rate": 3.713187892283698e-06, "loss": 0.78657258, "num_input_tokens_seen": 34790885, "step": 1642, "time_per_iteration": 2.6522939205169678 }, { "auxiliary_loss_clip": 0.01147917, "auxiliary_loss_mlp": 0.0103871, "balance_loss_clip": 1.05197358, "balance_loss_mlp": 1.02779603, "epoch": 0.19755906932002645, "flos": 15004340081280.0, "grad_norm": 3.81654475457016, "language_loss": 0.86930442, "learning_rate": 3.71278581991498e-06, "loss": 0.89117068, "num_input_tokens_seen": 34806745, "step": 1643, "time_per_iteration": 2.754760980606079 }, { "auxiliary_loss_clip": 0.01172848, "auxiliary_loss_mlp": 0.00714072, "balance_loss_clip": 1.0618788, "balance_loss_mlp": 1.00053692, "epoch": 0.19767931221066554, "flos": 19494466686720.0, "grad_norm": 1.8293410338105065, "language_loss": 0.78750288, "learning_rate": 3.712383487718015e-06, "loss": 0.80637205, "num_input_tokens_seen": 34824985, "step": 1644, "time_per_iteration": 2.6525559425354004 }, { "auxiliary_loss_clip": 0.01128561, "auxiliary_loss_mlp": 0.01032669, "balance_loss_clip": 1.05259967, "balance_loss_mlp": 1.02219701, "epoch": 0.19779955510130465, "flos": 25737895958400.0, "grad_norm": 1.765078605081407, "language_loss": 0.86527848, "learning_rate": 3.7119808957538365e-06, "loss": 0.88689083, "num_input_tokens_seen": 34843980, "step": 1645, "time_per_iteration": 2.7619268894195557 }, { "auxiliary_loss_clip": 0.01174866, "auxiliary_loss_mlp": 0.01039691, "balance_loss_clip": 1.05346632, "balance_loss_mlp": 1.0282948, "epoch": 0.19791979799194373, "flos": 20777699041920.0, "grad_norm": 2.186607218669044, "language_loss": 0.80188036, "learning_rate": 3.711578044083517e-06, "loss": 0.82402587, "num_input_tokens_seen": 34860780, "step": 1646, "time_per_iteration": 2.637892723083496 }, { "auxiliary_loss_clip": 0.01183405, "auxiliary_loss_mlp": 0.01034601, "balance_loss_clip": 1.05631804, "balance_loss_mlp": 1.02335405, "epoch": 0.1980400408825828, "flos": 25589010084480.0, "grad_norm": 2.2097420200761877, "language_loss": 0.74392498, "learning_rate": 3.7111749327681698e-06, "loss": 0.76610506, "num_input_tokens_seen": 34880815, "step": 1647, "time_per_iteration": 2.6572458744049072 }, { "auxiliary_loss_clip": 0.01207043, "auxiliary_loss_mlp": 0.01032171, "balance_loss_clip": 1.06258512, "balance_loss_mlp": 1.02175176, "epoch": 0.1981602837732219, "flos": 23513840622720.0, "grad_norm": 2.156848371209539, "language_loss": 0.85948265, "learning_rate": 3.7107715618689455e-06, "loss": 0.8818748, "num_input_tokens_seen": 34899790, "step": 1648, "time_per_iteration": 2.614078998565674 }, { "auxiliary_loss_clip": 0.01200802, "auxiliary_loss_mlp": 0.01045437, "balance_loss_clip": 1.06074083, "balance_loss_mlp": 1.03449988, "epoch": 0.198280526663861, "flos": 23185365724800.0, "grad_norm": 1.7090942757020844, "language_loss": 0.83564639, "learning_rate": 3.710367931447035e-06, "loss": 0.85810876, "num_input_tokens_seen": 34921570, "step": 1649, "time_per_iteration": 2.6685287952423096 }, { "auxiliary_loss_clip": 0.01207502, "auxiliary_loss_mlp": 0.0103879, "balance_loss_clip": 1.06089222, "balance_loss_mlp": 1.02811456, "epoch": 0.1984007695545001, "flos": 21689470897920.0, "grad_norm": 3.107063653823271, "language_loss": 0.86829638, "learning_rate": 3.70996404156367e-06, "loss": 0.89075929, "num_input_tokens_seen": 34941205, "step": 1650, "time_per_iteration": 2.6016008853912354 }, { "auxiliary_loss_clip": 0.01140571, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.05213773, "balance_loss_mlp": 1.02371156, "epoch": 0.19852101244513917, "flos": 36064008887040.0, "grad_norm": 1.9055641003921004, "language_loss": 0.72742528, "learning_rate": 3.7095598922801187e-06, "loss": 0.74917895, "num_input_tokens_seen": 34963280, "step": 1651, "time_per_iteration": 2.8532016277313232 }, { "auxiliary_loss_clip": 0.01220037, "auxiliary_loss_mlp": 0.010366, "balance_loss_clip": 1.06364465, "balance_loss_mlp": 1.02534628, "epoch": 0.19864125533577828, "flos": 23105894883840.0, "grad_norm": 2.5538767631181374, "language_loss": 0.76081765, "learning_rate": 3.7091554836576914e-06, "loss": 0.78338397, "num_input_tokens_seen": 34979955, "step": 1652, "time_per_iteration": 2.5561318397521973 }, { "auxiliary_loss_clip": 0.01199851, "auxiliary_loss_mlp": 0.00713841, "balance_loss_clip": 1.06037259, "balance_loss_mlp": 1.00049806, "epoch": 0.19876149822641737, "flos": 24608505553920.0, "grad_norm": 1.6925785455397195, "language_loss": 0.82778203, "learning_rate": 3.708750815757736e-06, "loss": 0.84691888, "num_input_tokens_seen": 35000725, "step": 1653, "time_per_iteration": 2.6121327877044678 }, { "auxiliary_loss_clip": 0.01204766, "auxiliary_loss_mlp": 0.01041161, "balance_loss_clip": 1.06215763, "balance_loss_mlp": 1.02958584, "epoch": 0.19888174111705645, "flos": 32196645308160.0, "grad_norm": 2.347121835979016, "language_loss": 0.73175579, "learning_rate": 3.7083458886416407e-06, "loss": 0.754215, "num_input_tokens_seen": 35019920, "step": 1654, "time_per_iteration": 5.400553226470947 }, { "auxiliary_loss_clip": 0.01139745, "auxiliary_loss_mlp": 0.01042642, "balance_loss_clip": 1.05529714, "balance_loss_mlp": 1.03127551, "epoch": 0.19900198400769553, "flos": 24608469640320.0, "grad_norm": 2.1155387108943007, "language_loss": 0.88103002, "learning_rate": 3.707940702370832e-06, "loss": 0.90285385, "num_input_tokens_seen": 35040765, "step": 1655, "time_per_iteration": 2.7997422218322754 }, { "auxiliary_loss_clip": 0.01115731, "auxiliary_loss_mlp": 0.01016142, "balance_loss_clip": 1.04075408, "balance_loss_mlp": 1.01301885, "epoch": 0.19912222689833464, "flos": 67915805673600.0, "grad_norm": 0.902461600657024, "language_loss": 0.58280295, "learning_rate": 3.707535257006777e-06, "loss": 0.60412169, "num_input_tokens_seen": 35106390, "step": 1656, "time_per_iteration": 3.282428503036499 }, { "auxiliary_loss_clip": 0.01188337, "auxiliary_loss_mlp": 0.01036677, "balance_loss_clip": 1.06073046, "balance_loss_mlp": 1.02553058, "epoch": 0.19924246978897373, "flos": 15742340916480.0, "grad_norm": 2.5690996598665885, "language_loss": 0.88463837, "learning_rate": 3.707129552610981e-06, "loss": 0.90688854, "num_input_tokens_seen": 35125040, "step": 1657, "time_per_iteration": 2.6165857315063477 }, { "auxiliary_loss_clip": 0.0117642, "auxiliary_loss_mlp": 0.01037169, "balance_loss_clip": 1.05786765, "balance_loss_mlp": 1.02618408, "epoch": 0.1993627126796128, "flos": 17566566986880.0, "grad_norm": 1.8611505508198758, "language_loss": 0.73473692, "learning_rate": 3.70672358924499e-06, "loss": 0.75687277, "num_input_tokens_seen": 35144280, "step": 1658, "time_per_iteration": 3.5760536193847656 }, { "auxiliary_loss_clip": 0.01172276, "auxiliary_loss_mlp": 0.01043721, "balance_loss_clip": 1.06204045, "balance_loss_mlp": 1.03316462, "epoch": 0.19948295557025192, "flos": 40843826680320.0, "grad_norm": 1.9483770604656536, "language_loss": 0.78303742, "learning_rate": 3.706317366970386e-06, "loss": 0.80519748, "num_input_tokens_seen": 35165280, "step": 1659, "time_per_iteration": 2.8608574867248535 }, { "auxiliary_loss_clip": 0.01221372, "auxiliary_loss_mlp": 0.00714396, "balance_loss_clip": 1.06113863, "balance_loss_mlp": 1.00055683, "epoch": 0.199603198460891, "flos": 25082418620160.0, "grad_norm": 3.1673227751258652, "language_loss": 0.8369177, "learning_rate": 3.705910885848795e-06, "loss": 0.85627538, "num_input_tokens_seen": 35183655, "step": 1660, "time_per_iteration": 2.6773762702941895 }, { "auxiliary_loss_clip": 0.0120144, "auxiliary_loss_mlp": 0.01040875, "balance_loss_clip": 1.06082201, "balance_loss_mlp": 1.02984214, "epoch": 0.19972344135153008, "flos": 20084120352000.0, "grad_norm": 2.33520139845263, "language_loss": 0.84732074, "learning_rate": 3.705504145941879e-06, "loss": 0.86974388, "num_input_tokens_seen": 35201825, "step": 1661, "time_per_iteration": 2.6093356609344482 }, { "auxiliary_loss_clip": 0.01217606, "auxiliary_loss_mlp": 0.01037014, "balance_loss_clip": 1.06075811, "balance_loss_mlp": 1.0263989, "epoch": 0.1998436842421692, "flos": 23727472761600.0, "grad_norm": 2.474632619209451, "language_loss": 0.78672254, "learning_rate": 3.7050971473113403e-06, "loss": 0.80926871, "num_input_tokens_seen": 35221600, "step": 1662, "time_per_iteration": 2.6512815952301025 }, { "auxiliary_loss_clip": 0.01197327, "auxiliary_loss_mlp": 0.00713634, "balance_loss_clip": 1.05825341, "balance_loss_mlp": 1.0004921, "epoch": 0.19996392713280828, "flos": 36102361633920.0, "grad_norm": 1.8307910002839463, "language_loss": 0.79811144, "learning_rate": 3.7046898900189196e-06, "loss": 0.81722111, "num_input_tokens_seen": 35245935, "step": 1663, "time_per_iteration": 2.733370304107666 }, { "auxiliary_loss_clip": 0.01173114, "auxiliary_loss_mlp": 0.01044302, "balance_loss_clip": 1.05936408, "balance_loss_mlp": 1.03251815, "epoch": 0.20008417002344736, "flos": 23657662679040.0, "grad_norm": 1.7202235416950378, "language_loss": 0.82957625, "learning_rate": 3.704282374126398e-06, "loss": 0.85175037, "num_input_tokens_seen": 35265615, "step": 1664, "time_per_iteration": 2.7268877029418945 }, { "auxiliary_loss_clip": 0.01169132, "auxiliary_loss_mlp": 0.01044082, "balance_loss_clip": 1.05632973, "balance_loss_mlp": 1.03219056, "epoch": 0.20020441291408644, "flos": 21872076664320.0, "grad_norm": 1.7484007791009508, "language_loss": 0.87252462, "learning_rate": 3.7038745996955954e-06, "loss": 0.89465678, "num_input_tokens_seen": 35284960, "step": 1665, "time_per_iteration": 264.80313420295715 }, { "auxiliary_loss_clip": 0.01173594, "auxiliary_loss_mlp": 0.0104058, "balance_loss_clip": 1.05626845, "balance_loss_mlp": 1.02872515, "epoch": 0.20032465580472555, "flos": 23179691376000.0, "grad_norm": 3.9654985782178835, "language_loss": 0.72171748, "learning_rate": 3.703466566788371e-06, "loss": 0.74385929, "num_input_tokens_seen": 35304090, "step": 1666, "time_per_iteration": 2.666299343109131 }, { "auxiliary_loss_clip": 0.01177873, "auxiliary_loss_mlp": 0.01037743, "balance_loss_clip": 1.05908251, "balance_loss_mlp": 1.02731216, "epoch": 0.20044489869536464, "flos": 23873521461120.0, "grad_norm": 1.9166513530217524, "language_loss": 0.74415386, "learning_rate": 3.703058275466622e-06, "loss": 0.76630998, "num_input_tokens_seen": 35323325, "step": 1667, "time_per_iteration": 2.723318099975586 }, { "auxiliary_loss_clip": 0.01187026, "auxiliary_loss_mlp": 0.01041399, "balance_loss_clip": 1.05825424, "balance_loss_mlp": 1.03038442, "epoch": 0.20056514158600372, "flos": 21945226711680.0, "grad_norm": 2.0004025243354384, "language_loss": 0.77878225, "learning_rate": 3.7026497257922877e-06, "loss": 0.80106652, "num_input_tokens_seen": 35343635, "step": 1668, "time_per_iteration": 2.6224710941314697 }, { "auxiliary_loss_clip": 0.01147827, "auxiliary_loss_mlp": 0.01045243, "balance_loss_clip": 1.05235386, "balance_loss_mlp": 1.03437138, "epoch": 0.20068538447664283, "flos": 23879159896320.0, "grad_norm": 1.624833835149715, "language_loss": 0.85205603, "learning_rate": 3.7022409178273436e-06, "loss": 0.87398672, "num_input_tokens_seen": 35364615, "step": 1669, "time_per_iteration": 2.773052930831909 }, { "auxiliary_loss_clip": 0.01198328, "auxiliary_loss_mlp": 0.01036347, "balance_loss_clip": 1.0587579, "balance_loss_mlp": 1.02552223, "epoch": 0.2008056273672819, "flos": 18442823270400.0, "grad_norm": 1.9403667200183699, "language_loss": 0.78444183, "learning_rate": 3.7018318516338054e-06, "loss": 0.80678862, "num_input_tokens_seen": 35383775, "step": 1670, "time_per_iteration": 2.577632427215576 }, { "auxiliary_loss_clip": 0.01208107, "auxiliary_loss_mlp": 0.01035753, "balance_loss_clip": 1.06173515, "balance_loss_mlp": 1.02548957, "epoch": 0.200925870257921, "flos": 23659530186240.0, "grad_norm": 2.17892653120236, "language_loss": 0.8179118, "learning_rate": 3.7014225272737284e-06, "loss": 0.84035039, "num_input_tokens_seen": 35403000, "step": 1671, "time_per_iteration": 2.623086929321289 }, { "auxiliary_loss_clip": 0.01191455, "auxiliary_loss_mlp": 0.01039166, "balance_loss_clip": 1.05608153, "balance_loss_mlp": 1.02797198, "epoch": 0.20104611314856008, "flos": 16217115909120.0, "grad_norm": 2.281511577309999, "language_loss": 0.73855174, "learning_rate": 3.701012944809207e-06, "loss": 0.76085794, "num_input_tokens_seen": 35420115, "step": 1672, "time_per_iteration": 2.593851327896118 }, { "auxiliary_loss_clip": 0.01187713, "auxiliary_loss_mlp": 0.00712982, "balance_loss_clip": 1.06094944, "balance_loss_mlp": 1.00057149, "epoch": 0.2011663560391992, "flos": 21397373498880.0, "grad_norm": 2.5051025486604073, "language_loss": 0.78730452, "learning_rate": 3.700603104302374e-06, "loss": 0.80631149, "num_input_tokens_seen": 35439925, "step": 1673, "time_per_iteration": 2.680213212966919 }, { "auxiliary_loss_clip": 0.01072994, "auxiliary_loss_mlp": 0.01005012, "balance_loss_clip": 1.03573215, "balance_loss_mlp": 1.00169802, "epoch": 0.20128659892983827, "flos": 62229459409920.0, "grad_norm": 0.9089947718980956, "language_loss": 0.56030732, "learning_rate": 3.7001930058154027e-06, "loss": 0.58108741, "num_input_tokens_seen": 35504885, "step": 1674, "time_per_iteration": 3.3094727993011475 }, { "auxiliary_loss_clip": 0.01167067, "auxiliary_loss_mlp": 0.01045292, "balance_loss_clip": 1.05553365, "balance_loss_mlp": 1.03358531, "epoch": 0.20140684182047736, "flos": 28438737448320.0, "grad_norm": 2.858055023410873, "language_loss": 0.79393309, "learning_rate": 3.6997826494105037e-06, "loss": 0.81605667, "num_input_tokens_seen": 35525330, "step": 1675, "time_per_iteration": 2.7201061248779297 }, { "auxiliary_loss_clip": 0.01184434, "auxiliary_loss_mlp": 0.01043602, "balance_loss_clip": 1.05752099, "balance_loss_mlp": 1.03255737, "epoch": 0.20152708471111647, "flos": 28074064619520.0, "grad_norm": 2.256793798003621, "language_loss": 0.69230044, "learning_rate": 3.6993720351499286e-06, "loss": 0.71458083, "num_input_tokens_seen": 35546455, "step": 1676, "time_per_iteration": 2.6965832710266113 }, { "auxiliary_loss_clip": 0.01181534, "auxiliary_loss_mlp": 0.0103616, "balance_loss_clip": 1.06190419, "balance_loss_mlp": 1.02605665, "epoch": 0.20164732760175555, "flos": 23549751244800.0, "grad_norm": 1.7982611095910834, "language_loss": 0.76792234, "learning_rate": 3.6989611630959666e-06, "loss": 0.79009926, "num_input_tokens_seen": 35565010, "step": 1677, "time_per_iteration": 2.6186230182647705 }, { "auxiliary_loss_clip": 0.01112448, "auxiliary_loss_mlp": 0.01002886, "balance_loss_clip": 1.03939962, "balance_loss_mlp": 0.99976313, "epoch": 0.20176757049239463, "flos": 71100616037760.0, "grad_norm": 0.6850454128863355, "language_loss": 0.58318794, "learning_rate": 3.6985500333109474e-06, "loss": 0.60434127, "num_input_tokens_seen": 35633340, "step": 1678, "time_per_iteration": 3.309236764907837 }, { "auxiliary_loss_clip": 0.01156866, "auxiliary_loss_mlp": 0.01035281, "balance_loss_clip": 1.05281067, "balance_loss_mlp": 1.02520216, "epoch": 0.20188781338303372, "flos": 21430159637760.0, "grad_norm": 2.125696999172843, "language_loss": 0.76782101, "learning_rate": 3.6981386458572385e-06, "loss": 0.78974247, "num_input_tokens_seen": 35651315, "step": 1679, "time_per_iteration": 4.4892847537994385 }, { "auxiliary_loss_clip": 0.01160802, "auxiliary_loss_mlp": 0.01030097, "balance_loss_clip": 1.05324852, "balance_loss_mlp": 1.01893282, "epoch": 0.20200805627367283, "flos": 11546215130880.0, "grad_norm": 2.3501066198812706, "language_loss": 0.76304954, "learning_rate": 3.6977270007972468e-06, "loss": 0.78495848, "num_input_tokens_seen": 35668850, "step": 1680, "time_per_iteration": 3.543776035308838 }, { "auxiliary_loss_clip": 0.0118954, "auxiliary_loss_mlp": 0.01038659, "balance_loss_clip": 1.06067002, "balance_loss_mlp": 1.02798963, "epoch": 0.2021282991643119, "flos": 28545391906560.0, "grad_norm": 2.2978462615251476, "language_loss": 0.72324884, "learning_rate": 3.6973150981934196e-06, "loss": 0.74553078, "num_input_tokens_seen": 35690080, "step": 1681, "time_per_iteration": 2.7010507583618164 }, { "auxiliary_loss_clip": 0.01221497, "auxiliary_loss_mlp": 0.01038975, "balance_loss_clip": 1.06252384, "balance_loss_mlp": 1.02760231, "epoch": 0.202248542054951, "flos": 17923446564480.0, "grad_norm": 2.4602292401150136, "language_loss": 0.8343159, "learning_rate": 3.6969029381082415e-06, "loss": 0.8569206, "num_input_tokens_seen": 35706075, "step": 1682, "time_per_iteration": 2.5289950370788574 }, { "auxiliary_loss_clip": 0.01182718, "auxiliary_loss_mlp": 0.01039174, "balance_loss_clip": 1.05855584, "balance_loss_mlp": 1.02923799, "epoch": 0.2023687849455901, "flos": 19864634296320.0, "grad_norm": 1.9586171238094865, "language_loss": 0.79316199, "learning_rate": 3.696490520604237e-06, "loss": 0.81538093, "num_input_tokens_seen": 35724765, "step": 1683, "time_per_iteration": 2.6707756519317627 }, { "auxiliary_loss_clip": 0.01198853, "auxiliary_loss_mlp": 0.01040622, "balance_loss_clip": 1.06161654, "balance_loss_mlp": 1.02995324, "epoch": 0.20248902783622919, "flos": 22564721600640.0, "grad_norm": 1.994624958425923, "language_loss": 0.80516982, "learning_rate": 3.696077845743968e-06, "loss": 0.8275646, "num_input_tokens_seen": 35744355, "step": 1684, "time_per_iteration": 3.6461124420166016 }, { "auxiliary_loss_clip": 0.01221237, "auxiliary_loss_mlp": 0.01039276, "balance_loss_clip": 1.06273282, "balance_loss_mlp": 1.0281601, "epoch": 0.20260927072686827, "flos": 22709728805760.0, "grad_norm": 2.478619391350561, "language_loss": 0.73628938, "learning_rate": 3.69566491359004e-06, "loss": 0.75889444, "num_input_tokens_seen": 35761000, "step": 1685, "time_per_iteration": 2.613410472869873 }, { "auxiliary_loss_clip": 0.01181074, "auxiliary_loss_mlp": 0.01038331, "balance_loss_clip": 1.05595636, "balance_loss_mlp": 1.02720881, "epoch": 0.20272951361750738, "flos": 51023998650240.0, "grad_norm": 1.8913660080592942, "language_loss": 0.69142962, "learning_rate": 3.695251724205092e-06, "loss": 0.71362364, "num_input_tokens_seen": 35785360, "step": 1686, "time_per_iteration": 2.936973810195923 }, { "auxiliary_loss_clip": 0.0121711, "auxiliary_loss_mlp": 0.01030946, "balance_loss_clip": 1.06007648, "balance_loss_mlp": 1.01986575, "epoch": 0.20284975650814646, "flos": 26578133879040.0, "grad_norm": 1.9870542103332094, "language_loss": 0.86389184, "learning_rate": 3.6948382776518054e-06, "loss": 0.88637245, "num_input_tokens_seen": 35806065, "step": 1687, "time_per_iteration": 2.5943007469177246 }, { "auxiliary_loss_clip": 0.01176275, "auxiliary_loss_mlp": 0.0104007, "balance_loss_clip": 1.05585742, "balance_loss_mlp": 1.02941239, "epoch": 0.20296999939878554, "flos": 16034222833920.0, "grad_norm": 2.131256498777638, "language_loss": 0.7929337, "learning_rate": 3.6944245739929e-06, "loss": 0.81509715, "num_input_tokens_seen": 35822225, "step": 1688, "time_per_iteration": 2.833486557006836 }, { "auxiliary_loss_clip": 0.0120272, "auxiliary_loss_mlp": 0.01037932, "balance_loss_clip": 1.06245649, "balance_loss_mlp": 1.02709007, "epoch": 0.20309024228942463, "flos": 19203374868480.0, "grad_norm": 2.4393360888083433, "language_loss": 0.71594894, "learning_rate": 3.6940106132911332e-06, "loss": 0.73835546, "num_input_tokens_seen": 35839410, "step": 1689, "time_per_iteration": 2.549591302871704 }, { "auxiliary_loss_clip": 0.0120547, "auxiliary_loss_mlp": 0.01034532, "balance_loss_clip": 1.06271064, "balance_loss_mlp": 1.02387476, "epoch": 0.20321048518006374, "flos": 22821087945600.0, "grad_norm": 1.9316583093722544, "language_loss": 0.88662708, "learning_rate": 3.6935963956093037e-06, "loss": 0.9090271, "num_input_tokens_seen": 35859495, "step": 1690, "time_per_iteration": 2.6321969032287598 }, { "auxiliary_loss_clip": 0.01192186, "auxiliary_loss_mlp": 0.01039522, "balance_loss_clip": 1.05855668, "balance_loss_mlp": 1.02922249, "epoch": 0.20333072807070282, "flos": 19096397187840.0, "grad_norm": 1.7078639844584131, "language_loss": 0.6905781, "learning_rate": 3.6931819210102474e-06, "loss": 0.71289515, "num_input_tokens_seen": 35878890, "step": 1691, "time_per_iteration": 2.578887462615967 }, { "auxiliary_loss_clip": 0.01217187, "auxiliary_loss_mlp": 0.01036849, "balance_loss_clip": 1.0604291, "balance_loss_mlp": 1.0257504, "epoch": 0.2034509709613419, "flos": 18180962144640.0, "grad_norm": 1.7852120370658577, "language_loss": 0.84452087, "learning_rate": 3.6927671895568402e-06, "loss": 0.86706126, "num_input_tokens_seen": 35897950, "step": 1692, "time_per_iteration": 2.6453373432159424 }, { "auxiliary_loss_clip": 0.01221481, "auxiliary_loss_mlp": 0.01038162, "balance_loss_clip": 1.06476903, "balance_loss_mlp": 1.0275166, "epoch": 0.20357121385198101, "flos": 22923899648640.0, "grad_norm": 2.3966158438286214, "language_loss": 0.86674827, "learning_rate": 3.692352201311996e-06, "loss": 0.88934469, "num_input_tokens_seen": 35916800, "step": 1693, "time_per_iteration": 2.5832903385162354 }, { "auxiliary_loss_clip": 0.01162751, "auxiliary_loss_mlp": 0.01039818, "balance_loss_clip": 1.05525589, "balance_loss_mlp": 1.02910137, "epoch": 0.2036914567426201, "flos": 20922131629440.0, "grad_norm": 1.8137181849775965, "language_loss": 0.76287824, "learning_rate": 3.6919369563386687e-06, "loss": 0.78490394, "num_input_tokens_seen": 35936600, "step": 1694, "time_per_iteration": 2.720130205154419 }, { "auxiliary_loss_clip": 0.01183184, "auxiliary_loss_mlp": 0.01037446, "balance_loss_clip": 1.05916095, "balance_loss_mlp": 1.0264194, "epoch": 0.20381169963325918, "flos": 15519155760000.0, "grad_norm": 2.460762057355219, "language_loss": 0.7874729, "learning_rate": 3.69152145469985e-06, "loss": 0.80967927, "num_input_tokens_seen": 35953645, "step": 1695, "time_per_iteration": 2.651702642440796 }, { "auxiliary_loss_clip": 0.01154219, "auxiliary_loss_mlp": 0.01038239, "balance_loss_clip": 1.05507731, "balance_loss_mlp": 1.02680719, "epoch": 0.20393194252389826, "flos": 28833143760000.0, "grad_norm": 2.00662574232149, "language_loss": 0.8213551, "learning_rate": 3.691105696458572e-06, "loss": 0.84327972, "num_input_tokens_seen": 35970940, "step": 1696, "time_per_iteration": 2.7160286903381348 }, { "auxiliary_loss_clip": 0.0121885, "auxiliary_loss_mlp": 0.0103496, "balance_loss_clip": 1.06421411, "balance_loss_mlp": 1.02457666, "epoch": 0.20405218541453737, "flos": 22488554810880.0, "grad_norm": 2.4685304669884953, "language_loss": 0.67733651, "learning_rate": 3.690689681677904e-06, "loss": 0.69987458, "num_input_tokens_seen": 35989410, "step": 1697, "time_per_iteration": 2.597928762435913 }, { "auxiliary_loss_clip": 0.01185989, "auxiliary_loss_mlp": 0.0103763, "balance_loss_clip": 1.05739737, "balance_loss_mlp": 1.02737176, "epoch": 0.20417242830517646, "flos": 25374408278400.0, "grad_norm": 1.8113464997957762, "language_loss": 0.88503587, "learning_rate": 3.690273410420956e-06, "loss": 0.9072721, "num_input_tokens_seen": 36009175, "step": 1698, "time_per_iteration": 2.680419921875 }, { "auxiliary_loss_clip": 0.01198522, "auxiliary_loss_mlp": 0.01033184, "balance_loss_clip": 1.05936301, "balance_loss_mlp": 1.02216887, "epoch": 0.20429267119581554, "flos": 14793078240000.0, "grad_norm": 2.5235738372558836, "language_loss": 0.76358002, "learning_rate": 3.689856882750875e-06, "loss": 0.78589708, "num_input_tokens_seen": 36024375, "step": 1699, "time_per_iteration": 2.632753610610962 }, { "auxiliary_loss_clip": 0.01195224, "auxiliary_loss_mlp": 0.01039298, "balance_loss_clip": 1.05997193, "balance_loss_mlp": 1.0295527, "epoch": 0.20441291408645465, "flos": 17781851151360.0, "grad_norm": 3.406987617495968, "language_loss": 0.78657067, "learning_rate": 3.6894400987308486e-06, "loss": 0.80891585, "num_input_tokens_seen": 36041895, "step": 1700, "time_per_iteration": 2.584219217300415 }, { "auxiliary_loss_clip": 0.01202174, "auxiliary_loss_mlp": 0.01035713, "balance_loss_clip": 1.05841458, "balance_loss_mlp": 1.02510989, "epoch": 0.20453315697709373, "flos": 16435668211200.0, "grad_norm": 1.907452088256081, "language_loss": 0.85177553, "learning_rate": 3.6890230584241024e-06, "loss": 0.87415445, "num_input_tokens_seen": 36058825, "step": 1701, "time_per_iteration": 2.5721628665924072 }, { "auxiliary_loss_clip": 0.01124554, "auxiliary_loss_mlp": 0.01005108, "balance_loss_clip": 1.04007483, "balance_loss_mlp": 1.00172269, "epoch": 0.20465339986773282, "flos": 66713085653760.0, "grad_norm": 1.080175904023805, "language_loss": 0.66433716, "learning_rate": 3.6886057618939016e-06, "loss": 0.68563378, "num_input_tokens_seen": 36121645, "step": 1702, "time_per_iteration": 3.1882874965667725 }, { "auxiliary_loss_clip": 0.01159228, "auxiliary_loss_mlp": 0.01035643, "balance_loss_clip": 1.05365038, "balance_loss_mlp": 1.0242641, "epoch": 0.2047736427583719, "flos": 41974114924800.0, "grad_norm": 2.3743480779233304, "language_loss": 0.69317567, "learning_rate": 3.6881882092035492e-06, "loss": 0.71512437, "num_input_tokens_seen": 36143030, "step": 1703, "time_per_iteration": 2.8097732067108154 }, { "auxiliary_loss_clip": 0.01090422, "auxiliary_loss_mlp": 0.00704289, "balance_loss_clip": 1.03803134, "balance_loss_mlp": 0.99990129, "epoch": 0.204893885649011, "flos": 69940878641280.0, "grad_norm": 0.9397019204096593, "language_loss": 0.61239159, "learning_rate": 3.6877704004163873e-06, "loss": 0.63033867, "num_input_tokens_seen": 36203435, "step": 1704, "time_per_iteration": 3.408236026763916 }, { "auxiliary_loss_clip": 0.01216067, "auxiliary_loss_mlp": 0.01027833, "balance_loss_clip": 1.06019771, "balance_loss_mlp": 1.0175693, "epoch": 0.2050141285396501, "flos": 22200012858240.0, "grad_norm": 4.1350624868744745, "language_loss": 0.77529436, "learning_rate": 3.6873523355957984e-06, "loss": 0.79773331, "num_input_tokens_seen": 36222435, "step": 1705, "time_per_iteration": 3.4751431941986084 }, { "auxiliary_loss_clip": 0.0112589, "auxiliary_loss_mlp": 0.01005417, "balance_loss_clip": 1.04125881, "balance_loss_mlp": 1.00203156, "epoch": 0.20513437143028918, "flos": 46283721730560.0, "grad_norm": 0.9934101601261978, "language_loss": 0.64043802, "learning_rate": 3.686934014805201e-06, "loss": 0.66175109, "num_input_tokens_seen": 36273065, "step": 1706, "time_per_iteration": 4.721646785736084 }, { "auxiliary_loss_clip": 0.01200852, "auxiliary_loss_mlp": 0.01037364, "balance_loss_clip": 1.0618546, "balance_loss_mlp": 1.02652812, "epoch": 0.20525461432092829, "flos": 21904324099200.0, "grad_norm": 1.8324014513603188, "language_loss": 0.80898273, "learning_rate": 3.6865154381080552e-06, "loss": 0.83136487, "num_input_tokens_seen": 36293750, "step": 1707, "time_per_iteration": 2.730767250061035 }, { "auxiliary_loss_clip": 0.01118544, "auxiliary_loss_mlp": 0.01037206, "balance_loss_clip": 1.04981446, "balance_loss_mlp": 1.02723455, "epoch": 0.20537485721156737, "flos": 21214264942080.0, "grad_norm": 1.9667088552391159, "language_loss": 0.8249833, "learning_rate": 3.6860966055678585e-06, "loss": 0.84654081, "num_input_tokens_seen": 36310105, "step": 1708, "time_per_iteration": 2.756995439529419 }, { "auxiliary_loss_clip": 0.01203433, "auxiliary_loss_mlp": 0.01036817, "balance_loss_clip": 1.06270742, "balance_loss_mlp": 1.02607012, "epoch": 0.20549510010220645, "flos": 20191205773440.0, "grad_norm": 1.6939651546364087, "language_loss": 0.86116046, "learning_rate": 3.685677517248147e-06, "loss": 0.88356292, "num_input_tokens_seen": 36328995, "step": 1709, "time_per_iteration": 2.6677045822143555 }, { "auxiliary_loss_clip": 0.01180918, "auxiliary_loss_mlp": 0.00713369, "balance_loss_clip": 1.06194282, "balance_loss_mlp": 1.00054181, "epoch": 0.20561534299284553, "flos": 17016702612480.0, "grad_norm": 1.9087204674547293, "language_loss": 0.80417997, "learning_rate": 3.6852581732124967e-06, "loss": 0.82312286, "num_input_tokens_seen": 36346340, "step": 1710, "time_per_iteration": 3.6628665924072266 }, { "auxiliary_loss_clip": 0.01199957, "auxiliary_loss_mlp": 0.01042632, "balance_loss_clip": 1.06081319, "balance_loss_mlp": 1.03097939, "epoch": 0.20573558588348465, "flos": 22890467064960.0, "grad_norm": 1.7436965854177011, "language_loss": 0.76048499, "learning_rate": 3.6848385735245213e-06, "loss": 0.78291082, "num_input_tokens_seen": 36365430, "step": 1711, "time_per_iteration": 2.5978128910064697 }, { "auxiliary_loss_clip": 0.01187273, "auxiliary_loss_mlp": 0.01036204, "balance_loss_clip": 1.05561852, "balance_loss_mlp": 1.02593434, "epoch": 0.20585582877412373, "flos": 24643123286400.0, "grad_norm": 2.0080548372897837, "language_loss": 0.86071742, "learning_rate": 3.6844187182478734e-06, "loss": 0.88295209, "num_input_tokens_seen": 36386285, "step": 1712, "time_per_iteration": 2.681009292602539 }, { "auxiliary_loss_clip": 0.01171753, "auxiliary_loss_mlp": 0.01036529, "balance_loss_clip": 1.05478024, "balance_loss_mlp": 1.02595544, "epoch": 0.2059760716647628, "flos": 24206952435840.0, "grad_norm": 2.0946659214048773, "language_loss": 0.74788141, "learning_rate": 3.683998607446246e-06, "loss": 0.76996422, "num_input_tokens_seen": 36404935, "step": 1713, "time_per_iteration": 2.6402089595794678 }, { "auxiliary_loss_clip": 0.01200958, "auxiliary_loss_mlp": 0.01038579, "balance_loss_clip": 1.06031668, "balance_loss_mlp": 1.02837467, "epoch": 0.20609631455540192, "flos": 20229522606720.0, "grad_norm": 2.277746593058756, "language_loss": 0.7502898, "learning_rate": 3.6835782411833686e-06, "loss": 0.77268517, "num_input_tokens_seen": 36424455, "step": 1714, "time_per_iteration": 2.608322858810425 }, { "auxiliary_loss_clip": 0.01153543, "auxiliary_loss_mlp": 0.01035741, "balance_loss_clip": 1.05531466, "balance_loss_mlp": 1.02590632, "epoch": 0.206216557446041, "flos": 19864957518720.0, "grad_norm": 1.6574408273295875, "language_loss": 0.7408756, "learning_rate": 3.68315761952301e-06, "loss": 0.76276845, "num_input_tokens_seen": 36441685, "step": 1715, "time_per_iteration": -0.012935876846313477 }, { "auxiliary_loss_clip": 0.0121501, "auxiliary_loss_mlp": 0.01039453, "balance_loss_clip": 1.06025493, "balance_loss_mlp": 1.02880192, "epoch": 0.2063368003366801, "flos": 24096311568000.0, "grad_norm": 1.906664363622809, "language_loss": 0.83268535, "learning_rate": 3.6827367425289797e-06, "loss": 0.85522997, "num_input_tokens_seen": 36461460, "step": 1716, "time_per_iteration": 2.6079368591308594 }, { "auxiliary_loss_clip": 0.01182589, "auxiliary_loss_mlp": 0.01033088, "balance_loss_clip": 1.05891442, "balance_loss_mlp": 1.0218581, "epoch": 0.2064570432273192, "flos": 20340163474560.0, "grad_norm": 2.2357345235755868, "language_loss": 0.72402561, "learning_rate": 3.6823156102651225e-06, "loss": 0.74618244, "num_input_tokens_seen": 36479615, "step": 1717, "time_per_iteration": 2.637969493865967 }, { "auxiliary_loss_clip": 0.01122505, "auxiliary_loss_mlp": 0.01034742, "balance_loss_clip": 1.05376637, "balance_loss_mlp": 1.02379894, "epoch": 0.20657728611795828, "flos": 20520363029760.0, "grad_norm": 1.8568421713701095, "language_loss": 0.70515877, "learning_rate": 3.6818942227953257e-06, "loss": 0.7267313, "num_input_tokens_seen": 36500160, "step": 1718, "time_per_iteration": 2.817654609680176 }, { "auxiliary_loss_clip": 0.01169164, "auxiliary_loss_mlp": 0.01029897, "balance_loss_clip": 1.05651021, "balance_loss_mlp": 1.01929307, "epoch": 0.20669752900859736, "flos": 21799285752960.0, "grad_norm": 1.9988220325950008, "language_loss": 0.68690836, "learning_rate": 3.681472580183512e-06, "loss": 0.70889902, "num_input_tokens_seen": 36518810, "step": 1719, "time_per_iteration": 2.8447399139404297 }, { "auxiliary_loss_clip": 0.01196229, "auxiliary_loss_mlp": 0.01034016, "balance_loss_clip": 1.06139684, "balance_loss_mlp": 1.02360916, "epoch": 0.20681777189923645, "flos": 15122020014720.0, "grad_norm": 1.9205242826424065, "language_loss": 0.86111474, "learning_rate": 3.6810506824936455e-06, "loss": 0.88341725, "num_input_tokens_seen": 36536890, "step": 1720, "time_per_iteration": 2.6389853954315186 }, { "auxiliary_loss_clip": 0.01093161, "auxiliary_loss_mlp": 0.01004178, "balance_loss_clip": 1.03651047, "balance_loss_mlp": 1.00055444, "epoch": 0.20693801478987556, "flos": 56481021509760.0, "grad_norm": 1.1102103704913917, "language_loss": 0.6256088, "learning_rate": 3.680628529789726e-06, "loss": 0.64658219, "num_input_tokens_seen": 36589300, "step": 1721, "time_per_iteration": 3.0869970321655273 }, { "auxiliary_loss_clip": 0.0122177, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.06408167, "balance_loss_mlp": 1.02620971, "epoch": 0.20705825768051464, "flos": 21614201948160.0, "grad_norm": 2.0396619445093775, "language_loss": 0.86247599, "learning_rate": 3.680206122135796e-06, "loss": 0.88506818, "num_input_tokens_seen": 36609905, "step": 1722, "time_per_iteration": 2.5793564319610596 }, { "auxiliary_loss_clip": 0.01159496, "auxiliary_loss_mlp": 0.01038044, "balance_loss_clip": 1.06021273, "balance_loss_mlp": 1.02788174, "epoch": 0.20717850057115372, "flos": 25848895962240.0, "grad_norm": 1.8798762275904004, "language_loss": 0.78602535, "learning_rate": 3.6797834595959323e-06, "loss": 0.8080008, "num_input_tokens_seen": 36629805, "step": 1723, "time_per_iteration": 2.7800095081329346 }, { "auxiliary_loss_clip": 0.01137976, "auxiliary_loss_mlp": 0.01039976, "balance_loss_clip": 1.04986024, "balance_loss_mlp": 1.02885425, "epoch": 0.20729874346179283, "flos": 29130807767040.0, "grad_norm": 3.369934834286752, "language_loss": 0.77112341, "learning_rate": 3.679360542234254e-06, "loss": 0.79290289, "num_input_tokens_seen": 36649150, "step": 1724, "time_per_iteration": 2.7313897609710693 }, { "auxiliary_loss_clip": 0.01172206, "auxiliary_loss_mlp": 0.00713079, "balance_loss_clip": 1.05223858, "balance_loss_mlp": 1.00061226, "epoch": 0.20741898635243192, "flos": 29023363209600.0, "grad_norm": 1.5918549622297904, "language_loss": 0.72318864, "learning_rate": 3.678937370114916e-06, "loss": 0.74204147, "num_input_tokens_seen": 36668955, "step": 1725, "time_per_iteration": 2.761467933654785 }, { "auxiliary_loss_clip": 0.01177151, "auxiliary_loss_mlp": 0.01031949, "balance_loss_clip": 1.05683517, "balance_loss_mlp": 1.02188826, "epoch": 0.207539229243071, "flos": 15559447841280.0, "grad_norm": 2.0482097871424063, "language_loss": 0.78569102, "learning_rate": 3.678513943302114e-06, "loss": 0.80778193, "num_input_tokens_seen": 36685730, "step": 1726, "time_per_iteration": 2.623189687728882 }, { "auxiliary_loss_clip": 0.01213695, "auxiliary_loss_mlp": 0.01035319, "balance_loss_clip": 1.0604192, "balance_loss_mlp": 1.02535868, "epoch": 0.20765947213371008, "flos": 20521081301760.0, "grad_norm": 1.767530982847294, "language_loss": 0.8477388, "learning_rate": 3.678090261860082e-06, "loss": 0.87022895, "num_input_tokens_seen": 36705460, "step": 1727, "time_per_iteration": 2.6387996673583984 }, { "auxiliary_loss_clip": 0.01164488, "auxiliary_loss_mlp": 0.01037456, "balance_loss_clip": 1.04974627, "balance_loss_mlp": 1.02733469, "epoch": 0.2077797150243492, "flos": 19354415558400.0, "grad_norm": 1.9843319348966781, "language_loss": 0.78033817, "learning_rate": 3.6776663258530906e-06, "loss": 0.80235761, "num_input_tokens_seen": 36724110, "step": 1728, "time_per_iteration": 2.645158529281616 }, { "auxiliary_loss_clip": 0.01203431, "auxiliary_loss_mlp": 0.01035657, "balance_loss_clip": 1.06078887, "balance_loss_mlp": 1.02566755, "epoch": 0.20789995791498828, "flos": 21829952989440.0, "grad_norm": 1.8710035424093276, "language_loss": 0.71160924, "learning_rate": 3.6772421353454516e-06, "loss": 0.73400009, "num_input_tokens_seen": 36742705, "step": 1729, "time_per_iteration": 2.6363179683685303 }, { "auxiliary_loss_clip": 0.01199409, "auxiliary_loss_mlp": 0.01037324, "balance_loss_clip": 1.06290817, "balance_loss_mlp": 1.02697694, "epoch": 0.20802020080562736, "flos": 23148844571520.0, "grad_norm": 1.7892453987435843, "language_loss": 0.88522422, "learning_rate": 3.6768176904015153e-06, "loss": 0.90759152, "num_input_tokens_seen": 36762510, "step": 1730, "time_per_iteration": 2.657458543777466 }, { "auxiliary_loss_clip": 0.01200687, "auxiliary_loss_mlp": 0.01032134, "balance_loss_clip": 1.05721021, "balance_loss_mlp": 1.0216496, "epoch": 0.20814044369626647, "flos": 23072677781760.0, "grad_norm": 2.0491501762329825, "language_loss": 0.59898531, "learning_rate": 3.6763929910856674e-06, "loss": 0.62131357, "num_input_tokens_seen": 36780960, "step": 1731, "time_per_iteration": 3.582050085067749 }, { "auxiliary_loss_clip": 0.01197108, "auxiliary_loss_mlp": 0.01031854, "balance_loss_clip": 1.06009793, "balance_loss_mlp": 1.02170968, "epoch": 0.20826068658690555, "flos": 19608016556160.0, "grad_norm": 2.617209695562762, "language_loss": 0.77845365, "learning_rate": 3.6759680374623365e-06, "loss": 0.80074328, "num_input_tokens_seen": 36798875, "step": 1732, "time_per_iteration": 4.278313875198364 }, { "auxiliary_loss_clip": 0.0121467, "auxiliary_loss_mlp": 0.01041031, "balance_loss_clip": 1.06263316, "balance_loss_mlp": 1.0304513, "epoch": 0.20838092947754464, "flos": 25374049142400.0, "grad_norm": 3.1800717173559856, "language_loss": 0.7515533, "learning_rate": 3.675542829595986e-06, "loss": 0.77411032, "num_input_tokens_seen": 36818540, "step": 1733, "time_per_iteration": 2.6310770511627197 }, { "auxiliary_loss_clip": 0.01181925, "auxiliary_loss_mlp": 0.01034575, "balance_loss_clip": 1.05673683, "balance_loss_mlp": 1.02392936, "epoch": 0.20850117236818372, "flos": 24061729749120.0, "grad_norm": 2.076818902954661, "language_loss": 0.79431456, "learning_rate": 3.6751173675511213e-06, "loss": 0.81647956, "num_input_tokens_seen": 36840585, "step": 1734, "time_per_iteration": 2.7059946060180664 }, { "auxiliary_loss_clip": 0.01181287, "auxiliary_loss_mlp": 0.01034392, "balance_loss_clip": 1.05388367, "balance_loss_mlp": 1.02404428, "epoch": 0.20862141525882283, "flos": 20077799558400.0, "grad_norm": 2.07147711412915, "language_loss": 0.87541854, "learning_rate": 3.674691651392283e-06, "loss": 0.89757526, "num_input_tokens_seen": 36858255, "step": 1735, "time_per_iteration": 2.6866707801818848 }, { "auxiliary_loss_clip": 0.01188975, "auxiliary_loss_mlp": 0.01038906, "balance_loss_clip": 1.06052685, "balance_loss_mlp": 1.02774179, "epoch": 0.2087416581494619, "flos": 39015183237120.0, "grad_norm": 13.297509614125708, "language_loss": 0.75796545, "learning_rate": 3.674265681184053e-06, "loss": 0.78024423, "num_input_tokens_seen": 36881515, "step": 1736, "time_per_iteration": 3.714754819869995 }, { "auxiliary_loss_clip": 0.01183648, "auxiliary_loss_mlp": 0.01037982, "balance_loss_clip": 1.05771935, "balance_loss_mlp": 1.02820635, "epoch": 0.208861901040101, "flos": 26101994169600.0, "grad_norm": 1.6755610933032699, "language_loss": 0.86681163, "learning_rate": 3.6738394569910504e-06, "loss": 0.88902795, "num_input_tokens_seen": 36902055, "step": 1737, "time_per_iteration": 2.7287683486938477 }, { "auxiliary_loss_clip": 0.01199323, "auxiliary_loss_mlp": 0.01034539, "balance_loss_clip": 1.06167924, "balance_loss_mlp": 1.02391171, "epoch": 0.2089821439307401, "flos": 28398732675840.0, "grad_norm": 2.0456839427334983, "language_loss": 0.83179563, "learning_rate": 3.6734129788779333e-06, "loss": 0.85413432, "num_input_tokens_seen": 36921230, "step": 1738, "time_per_iteration": 2.7258260250091553 }, { "auxiliary_loss_clip": 0.01163713, "auxiliary_loss_mlp": 0.0103633, "balance_loss_clip": 1.05837369, "balance_loss_mlp": 1.02650666, "epoch": 0.2091023868213792, "flos": 21069616872960.0, "grad_norm": 1.8265883499018019, "language_loss": 0.90178263, "learning_rate": 3.6729862469093976e-06, "loss": 0.92378306, "num_input_tokens_seen": 36940325, "step": 1739, "time_per_iteration": 2.710806131362915 }, { "auxiliary_loss_clip": 0.01168844, "auxiliary_loss_mlp": 0.01036376, "balance_loss_clip": 1.05518055, "balance_loss_mlp": 1.0264281, "epoch": 0.20922262971201827, "flos": 22455481363200.0, "grad_norm": 2.393780034968771, "language_loss": 0.82804215, "learning_rate": 3.6725592611501782e-06, "loss": 0.85009432, "num_input_tokens_seen": 36959000, "step": 1740, "time_per_iteration": 2.677335739135742 }, { "auxiliary_loss_clip": 0.01195629, "auxiliary_loss_mlp": 0.01034173, "balance_loss_clip": 1.05652893, "balance_loss_mlp": 1.02322328, "epoch": 0.20934287260265738, "flos": 27852244179840.0, "grad_norm": 2.089452503221546, "language_loss": 0.76289636, "learning_rate": 3.6721320216650496e-06, "loss": 0.78519434, "num_input_tokens_seen": 36979615, "step": 1741, "time_per_iteration": 2.6592748165130615 }, { "auxiliary_loss_clip": 0.01181863, "auxiliary_loss_mlp": 0.0103387, "balance_loss_clip": 1.0594418, "balance_loss_mlp": 1.02311754, "epoch": 0.20946311549329646, "flos": 16435309075200.0, "grad_norm": 2.2651698786755405, "language_loss": 0.83472836, "learning_rate": 3.6717045285188215e-06, "loss": 0.85688573, "num_input_tokens_seen": 36997310, "step": 1742, "time_per_iteration": 2.646519184112549 }, { "auxiliary_loss_clip": 0.01135885, "auxiliary_loss_mlp": 0.01037551, "balance_loss_clip": 1.04899323, "balance_loss_mlp": 1.0269475, "epoch": 0.20958335838393555, "flos": 22492720788480.0, "grad_norm": 2.228837659991577, "language_loss": 0.86612439, "learning_rate": 3.671276781776346e-06, "loss": 0.88785875, "num_input_tokens_seen": 37015965, "step": 1743, "time_per_iteration": 2.714918613433838 }, { "auxiliary_loss_clip": 0.01174835, "auxiliary_loss_mlp": 0.0102666, "balance_loss_clip": 1.05465651, "balance_loss_mlp": 1.01662898, "epoch": 0.20970360127457463, "flos": 25224768218880.0, "grad_norm": 2.3640495756617534, "language_loss": 0.66847992, "learning_rate": 3.6708487815025128e-06, "loss": 0.69049478, "num_input_tokens_seen": 37036545, "step": 1744, "time_per_iteration": 2.7431390285491943 }, { "auxiliary_loss_clip": 0.01167357, "auxiliary_loss_mlp": 0.01036745, "balance_loss_clip": 1.05802166, "balance_loss_mlp": 1.02624857, "epoch": 0.20982384416521374, "flos": 18479164855680.0, "grad_norm": 2.706377747350896, "language_loss": 0.74240661, "learning_rate": 3.6704205277622463e-06, "loss": 0.76444757, "num_input_tokens_seen": 37054985, "step": 1745, "time_per_iteration": 2.631469488143921 }, { "auxiliary_loss_clip": 0.01186565, "auxiliary_loss_mlp": 0.0103495, "balance_loss_clip": 1.05743039, "balance_loss_mlp": 1.02435207, "epoch": 0.20994408705585282, "flos": 25373546352000.0, "grad_norm": 1.816570193990303, "language_loss": 0.80625653, "learning_rate": 3.6699920206205146e-06, "loss": 0.82847166, "num_input_tokens_seen": 37075725, "step": 1746, "time_per_iteration": 2.7163054943084717 }, { "auxiliary_loss_clip": 0.0120222, "auxiliary_loss_mlp": 0.01033775, "balance_loss_clip": 1.06024337, "balance_loss_mlp": 1.0231297, "epoch": 0.2100643299464919, "flos": 21320955313920.0, "grad_norm": 1.6862205374519503, "language_loss": 0.81907415, "learning_rate": 3.669563260142321e-06, "loss": 0.84143412, "num_input_tokens_seen": 37094615, "step": 1747, "time_per_iteration": 2.6159024238586426 }, { "auxiliary_loss_clip": 0.01181753, "auxiliary_loss_mlp": 0.01044603, "balance_loss_clip": 1.06066811, "balance_loss_mlp": 1.03388, "epoch": 0.21018457283713102, "flos": 19354379644800.0, "grad_norm": 2.138342089304348, "language_loss": 0.84521449, "learning_rate": 3.6691342463927083e-06, "loss": 0.86747801, "num_input_tokens_seen": 37113610, "step": 1748, "time_per_iteration": 2.6789934635162354 }, { "auxiliary_loss_clip": 0.0117246, "auxiliary_loss_mlp": 0.01039734, "balance_loss_clip": 1.05820477, "balance_loss_mlp": 1.02876687, "epoch": 0.2103048157277701, "flos": 28330035914880.0, "grad_norm": 1.6490246026683957, "language_loss": 0.81830335, "learning_rate": 3.668704979436758e-06, "loss": 0.84042525, "num_input_tokens_seen": 37133705, "step": 1749, "time_per_iteration": 2.7543108463287354 }, { "auxiliary_loss_clip": 0.01174857, "auxiliary_loss_mlp": 0.01033774, "balance_loss_clip": 1.05392075, "balance_loss_mlp": 1.02308655, "epoch": 0.21042505861840918, "flos": 17457290835840.0, "grad_norm": 2.486547724619498, "language_loss": 0.78702533, "learning_rate": 3.668275459339588e-06, "loss": 0.8091116, "num_input_tokens_seen": 37152185, "step": 1750, "time_per_iteration": 2.6436872482299805 }, { "auxiliary_loss_clip": 0.01216796, "auxiliary_loss_mlp": 0.01029773, "balance_loss_clip": 1.06260538, "balance_loss_mlp": 1.01898432, "epoch": 0.21054530150904827, "flos": 14209817195520.0, "grad_norm": 2.1673121750526234, "language_loss": 0.80008727, "learning_rate": 3.667845686166358e-06, "loss": 0.82255298, "num_input_tokens_seen": 37169110, "step": 1751, "time_per_iteration": 2.5424697399139404 }, { "auxiliary_loss_clip": 0.0114844, "auxiliary_loss_mlp": 0.01031606, "balance_loss_clip": 1.0531553, "balance_loss_mlp": 1.02091932, "epoch": 0.21066554439968738, "flos": 18618210403200.0, "grad_norm": 1.6467540307573607, "language_loss": 0.85931093, "learning_rate": 3.6674156599822634e-06, "loss": 0.88111138, "num_input_tokens_seen": 37184905, "step": 1752, "time_per_iteration": 2.743877649307251 }, { "auxiliary_loss_clip": 0.0115173, "auxiliary_loss_mlp": 0.01036119, "balance_loss_clip": 1.05248618, "balance_loss_mlp": 1.02611709, "epoch": 0.21078578729032646, "flos": 23658883741440.0, "grad_norm": 1.9353396807412349, "language_loss": 0.82330626, "learning_rate": 3.666985380852539e-06, "loss": 0.8451848, "num_input_tokens_seen": 37203910, "step": 1753, "time_per_iteration": 2.717716693878174 }, { "auxiliary_loss_clip": 0.01185075, "auxiliary_loss_mlp": 0.01037872, "balance_loss_clip": 1.06145477, "balance_loss_mlp": 1.02751899, "epoch": 0.21090603018096554, "flos": 29346379240320.0, "grad_norm": 3.1084294142169573, "language_loss": 0.74026304, "learning_rate": 3.6665548488424576e-06, "loss": 0.76249248, "num_input_tokens_seen": 37222670, "step": 1754, "time_per_iteration": 2.7199509143829346 }, { "auxiliary_loss_clip": 0.01218065, "auxiliary_loss_mlp": 0.01037678, "balance_loss_clip": 1.06235814, "balance_loss_mlp": 1.02683568, "epoch": 0.21102627307160465, "flos": 23261245205760.0, "grad_norm": 1.8586992695511046, "language_loss": 0.87867981, "learning_rate": 3.6661240640173307e-06, "loss": 0.90123725, "num_input_tokens_seen": 37244140, "step": 1755, "time_per_iteration": 2.6193878650665283 }, { "auxiliary_loss_clip": 0.01090719, "auxiliary_loss_mlp": 0.01008525, "balance_loss_clip": 1.04205036, "balance_loss_mlp": 1.00485325, "epoch": 0.21114651596224374, "flos": 54633454577280.0, "grad_norm": 0.8621549921932751, "language_loss": 0.57899886, "learning_rate": 3.6656930264425085e-06, "loss": 0.59999126, "num_input_tokens_seen": 37308185, "step": 1756, "time_per_iteration": 3.2874577045440674 }, { "auxiliary_loss_clip": 0.01216364, "auxiliary_loss_mlp": 0.01034432, "balance_loss_clip": 1.06248498, "balance_loss_mlp": 1.02476406, "epoch": 0.21126675885288282, "flos": 21543314457600.0, "grad_norm": 1.8133779311194695, "language_loss": 0.75299788, "learning_rate": 3.665261736183378e-06, "loss": 0.77550578, "num_input_tokens_seen": 37328220, "step": 1757, "time_per_iteration": 4.474575996398926 }, { "auxiliary_loss_clip": 0.0116726, "auxiliary_loss_mlp": 0.01035973, "balance_loss_clip": 1.05840087, "balance_loss_mlp": 1.02489281, "epoch": 0.2113870017435219, "flos": 10961876678400.0, "grad_norm": 2.6291919972121622, "language_loss": 0.88814396, "learning_rate": 3.664830193305366e-06, "loss": 0.91017628, "num_input_tokens_seen": 37345995, "step": 1758, "time_per_iteration": 3.55092453956604 }, { "auxiliary_loss_clip": 0.01159908, "auxiliary_loss_mlp": 0.010353, "balance_loss_clip": 1.054389, "balance_loss_mlp": 1.02424312, "epoch": 0.211507244634161, "flos": 16653825463680.0, "grad_norm": 2.71790238690886, "language_loss": 0.76755929, "learning_rate": 3.6643983978739373e-06, "loss": 0.78951138, "num_input_tokens_seen": 37362610, "step": 1759, "time_per_iteration": 2.657363176345825 }, { "auxiliary_loss_clip": 0.01179224, "auxiliary_loss_mlp": 0.0103821, "balance_loss_clip": 1.06209183, "balance_loss_mlp": 1.02748072, "epoch": 0.2116274875248001, "flos": 20954091755520.0, "grad_norm": 1.7826732858024275, "language_loss": 0.82096332, "learning_rate": 3.663966349954596e-06, "loss": 0.84313762, "num_input_tokens_seen": 37382790, "step": 1760, "time_per_iteration": 2.70682954788208 }, { "auxiliary_loss_clip": 0.01114876, "auxiliary_loss_mlp": 0.01003084, "balance_loss_clip": 1.04231, "balance_loss_mlp": 0.99948353, "epoch": 0.21174773041543918, "flos": 68196949424640.0, "grad_norm": 0.7834179011123311, "language_loss": 0.59766459, "learning_rate": 3.6635340496128816e-06, "loss": 0.61884415, "num_input_tokens_seen": 37439720, "step": 1761, "time_per_iteration": 3.1208832263946533 }, { "auxiliary_loss_clip": 0.01147183, "auxiliary_loss_mlp": 0.01036582, "balance_loss_clip": 1.05510473, "balance_loss_mlp": 1.02641308, "epoch": 0.2118679733060783, "flos": 20668315150080.0, "grad_norm": 1.7983224313492552, "language_loss": 0.92671257, "learning_rate": 3.6631014969143747e-06, "loss": 0.94855022, "num_input_tokens_seen": 37459410, "step": 1762, "time_per_iteration": 3.7217659950256348 }, { "auxiliary_loss_clip": 0.01200647, "auxiliary_loss_mlp": 0.01041813, "balance_loss_clip": 1.06138515, "balance_loss_mlp": 1.03108406, "epoch": 0.21198821619671737, "flos": 23223431162880.0, "grad_norm": 2.622160010194154, "language_loss": 0.88857472, "learning_rate": 3.662668691924693e-06, "loss": 0.91099936, "num_input_tokens_seen": 37480460, "step": 1763, "time_per_iteration": 2.805621385574341 }, { "auxiliary_loss_clip": 0.01165334, "auxiliary_loss_mlp": 0.01038274, "balance_loss_clip": 1.05544066, "balance_loss_mlp": 1.02719378, "epoch": 0.21210845908735645, "flos": 24498547044480.0, "grad_norm": 2.1359658466290656, "language_loss": 0.71192992, "learning_rate": 3.6622356347094927e-06, "loss": 0.73396599, "num_input_tokens_seen": 37502025, "step": 1764, "time_per_iteration": 2.686122417449951 }, { "auxiliary_loss_clip": 0.01167358, "auxiliary_loss_mlp": 0.01032648, "balance_loss_clip": 1.05666113, "balance_loss_mlp": 1.02120399, "epoch": 0.21222870197799554, "flos": 27089789160960.0, "grad_norm": 2.240643084692075, "language_loss": 0.78695548, "learning_rate": 3.6618023253344684e-06, "loss": 0.80895555, "num_input_tokens_seen": 37520885, "step": 1765, "time_per_iteration": 2.775390148162842 }, { "auxiliary_loss_clip": 0.01195474, "auxiliary_loss_mlp": 0.01037791, "balance_loss_clip": 1.05718088, "balance_loss_mlp": 1.02678823, "epoch": 0.21234894486863465, "flos": 16873850223360.0, "grad_norm": 1.8875656265966534, "language_loss": 0.83437645, "learning_rate": 3.6613687638653527e-06, "loss": 0.85670906, "num_input_tokens_seen": 37539055, "step": 1766, "time_per_iteration": 2.5867693424224854 }, { "auxiliary_loss_clip": 0.0117986, "auxiliary_loss_mlp": 0.01034631, "balance_loss_clip": 1.05979776, "balance_loss_mlp": 1.02387857, "epoch": 0.21246918775927373, "flos": 23474949171840.0, "grad_norm": 1.8873703696030575, "language_loss": 0.77986062, "learning_rate": 3.660934950367916e-06, "loss": 0.80200553, "num_input_tokens_seen": 37558300, "step": 1767, "time_per_iteration": 2.7019765377044678 }, { "auxiliary_loss_clip": 0.01200827, "auxiliary_loss_mlp": 0.01032271, "balance_loss_clip": 1.05958021, "balance_loss_mlp": 1.02131581, "epoch": 0.21258943064991281, "flos": 22382295402240.0, "grad_norm": 2.127162906805994, "language_loss": 0.83635938, "learning_rate": 3.660500884907968e-06, "loss": 0.85869038, "num_input_tokens_seen": 37579040, "step": 1768, "time_per_iteration": 2.644702672958374 }, { "auxiliary_loss_clip": 0.0108305, "auxiliary_loss_mlp": 0.01003967, "balance_loss_clip": 1.04473507, "balance_loss_mlp": 1.00010478, "epoch": 0.21270967354055192, "flos": 59440168679040.0, "grad_norm": 0.8222082903750625, "language_loss": 0.60043865, "learning_rate": 3.660066567551356e-06, "loss": 0.6213088, "num_input_tokens_seen": 37639185, "step": 1769, "time_per_iteration": 3.194499969482422 }, { "auxiliary_loss_clip": 0.0120128, "auxiliary_loss_mlp": 0.00713584, "balance_loss_clip": 1.06058002, "balance_loss_mlp": 1.00075638, "epoch": 0.212829916431191, "flos": 21544032729600.0, "grad_norm": 2.5829782049767767, "language_loss": 0.83663231, "learning_rate": 3.6596319983639657e-06, "loss": 0.85578096, "num_input_tokens_seen": 37657765, "step": 1770, "time_per_iteration": 2.6624915599823 }, { "auxiliary_loss_clip": 0.01171151, "auxiliary_loss_mlp": 0.00713801, "balance_loss_clip": 1.06065321, "balance_loss_mlp": 1.00069523, "epoch": 0.2129501593218301, "flos": 28987739896320.0, "grad_norm": 1.8488258472791752, "language_loss": 0.8599537, "learning_rate": 3.6591971774117214e-06, "loss": 0.87880325, "num_input_tokens_seen": 37680740, "step": 1771, "time_per_iteration": 2.7441110610961914 }, { "auxiliary_loss_clip": 0.01205598, "auxiliary_loss_mlp": 0.01044173, "balance_loss_clip": 1.06322634, "balance_loss_mlp": 1.03352189, "epoch": 0.2130704022124692, "flos": 18806993308800.0, "grad_norm": 2.334272395252782, "language_loss": 0.79880536, "learning_rate": 3.6587621047605833e-06, "loss": 0.82130301, "num_input_tokens_seen": 37697910, "step": 1772, "time_per_iteration": 2.672348976135254 }, { "auxiliary_loss_clip": 0.01202587, "auxiliary_loss_mlp": 0.01042152, "balance_loss_clip": 1.06240284, "balance_loss_mlp": 1.03153634, "epoch": 0.21319064510310828, "flos": 13918150759680.0, "grad_norm": 2.1580792376647002, "language_loss": 0.87153804, "learning_rate": 3.6583267804765542e-06, "loss": 0.89398539, "num_input_tokens_seen": 37712245, "step": 1773, "time_per_iteration": 2.6145615577697754 }, { "auxiliary_loss_clip": 0.01199622, "auxiliary_loss_mlp": 0.01038909, "balance_loss_clip": 1.0609926, "balance_loss_mlp": 1.02739334, "epoch": 0.21331088799374737, "flos": 20959694277120.0, "grad_norm": 1.8213016074308144, "language_loss": 0.85533381, "learning_rate": 3.6578912046256702e-06, "loss": 0.8777191, "num_input_tokens_seen": 37730765, "step": 1774, "time_per_iteration": 2.63858699798584 }, { "auxiliary_loss_clip": 0.01162817, "auxiliary_loss_mlp": 0.01041222, "balance_loss_clip": 1.05596268, "balance_loss_mlp": 1.02947366, "epoch": 0.21343113088438645, "flos": 18624638937600.0, "grad_norm": 2.992403152666515, "language_loss": 0.76279867, "learning_rate": 3.6574553772740083e-06, "loss": 0.78483903, "num_input_tokens_seen": 37748695, "step": 1775, "time_per_iteration": 2.6406941413879395 }, { "auxiliary_loss_clip": 0.01111614, "auxiliary_loss_mlp": 0.0104548, "balance_loss_clip": 1.04801738, "balance_loss_mlp": 1.04272652, "epoch": 0.21355137377502556, "flos": 67413128791680.0, "grad_norm": 0.8756982825377365, "language_loss": 0.61918116, "learning_rate": 3.657019298487684e-06, "loss": 0.64075208, "num_input_tokens_seen": 37813705, "step": 1776, "time_per_iteration": 3.2369799613952637 }, { "auxiliary_loss_clip": 0.01203963, "auxiliary_loss_mlp": 0.0071501, "balance_loss_clip": 1.05969107, "balance_loss_mlp": 1.00088072, "epoch": 0.21367161666566464, "flos": 34532095697280.0, "grad_norm": 1.79917202724613, "language_loss": 0.83453643, "learning_rate": 3.6565829683328495e-06, "loss": 0.85372615, "num_input_tokens_seen": 37836330, "step": 1777, "time_per_iteration": 2.746366024017334 }, { "auxiliary_loss_clip": 0.01194191, "auxiliary_loss_mlp": 0.01036928, "balance_loss_clip": 1.06019974, "balance_loss_mlp": 1.02683091, "epoch": 0.21379185955630373, "flos": 18989347680000.0, "grad_norm": 3.058506558444282, "language_loss": 0.86426103, "learning_rate": 3.6561463868756965e-06, "loss": 0.88657218, "num_input_tokens_seen": 37855030, "step": 1778, "time_per_iteration": 2.6368660926818848 }, { "auxiliary_loss_clip": 0.01197381, "auxiliary_loss_mlp": 0.01039117, "balance_loss_clip": 1.06167281, "balance_loss_mlp": 1.0283165, "epoch": 0.21391210244694284, "flos": 28218497207040.0, "grad_norm": 1.9045556048450032, "language_loss": 0.78095889, "learning_rate": 3.655709554182452e-06, "loss": 0.80332392, "num_input_tokens_seen": 37875370, "step": 1779, "time_per_iteration": 2.7218422889709473 }, { "auxiliary_loss_clip": 0.01202468, "auxiliary_loss_mlp": 0.01034834, "balance_loss_clip": 1.06045771, "balance_loss_mlp": 1.02365828, "epoch": 0.21403234533758192, "flos": 17455064192640.0, "grad_norm": 1.780324347914342, "language_loss": 0.8436861, "learning_rate": 3.6552724703193855e-06, "loss": 0.86605906, "num_input_tokens_seen": 37892560, "step": 1780, "time_per_iteration": 2.610365390777588 }, { "auxiliary_loss_clip": 0.01081377, "auxiliary_loss_mlp": 0.01026935, "balance_loss_clip": 1.04680264, "balance_loss_mlp": 1.0230248, "epoch": 0.214152588228221, "flos": 51637606686720.0, "grad_norm": 0.7993915083610912, "language_loss": 0.55930489, "learning_rate": 3.654835135352801e-06, "loss": 0.58038801, "num_input_tokens_seen": 37947370, "step": 1781, "time_per_iteration": 3.18857741355896 }, { "auxiliary_loss_clip": 0.01148357, "auxiliary_loss_mlp": 0.01037651, "balance_loss_clip": 1.05052245, "balance_loss_mlp": 1.02710104, "epoch": 0.21427283111886009, "flos": 19496154625920.0, "grad_norm": 3.1749681418430726, "language_loss": 0.87312591, "learning_rate": 3.654397549349043e-06, "loss": 0.89498597, "num_input_tokens_seen": 37964745, "step": 1782, "time_per_iteration": 2.6974949836730957 }, { "auxiliary_loss_clip": 0.01182813, "auxiliary_loss_mlp": 0.01039375, "balance_loss_clip": 1.06104529, "balance_loss_mlp": 1.02853847, "epoch": 0.2143930740094992, "flos": 20084802710400.0, "grad_norm": 2.0703811105094996, "language_loss": 0.75142378, "learning_rate": 3.653959712374491e-06, "loss": 0.7736457, "num_input_tokens_seen": 37982850, "step": 1783, "time_per_iteration": 4.578454256057739 }, { "auxiliary_loss_clip": 0.01161678, "auxiliary_loss_mlp": 0.01034852, "balance_loss_clip": 1.06055427, "balance_loss_mlp": 1.02384889, "epoch": 0.21451331690013828, "flos": 21798603394560.0, "grad_norm": 1.7790105465617159, "language_loss": 0.82660556, "learning_rate": 3.6535216244955663e-06, "loss": 0.84857082, "num_input_tokens_seen": 38002745, "step": 1784, "time_per_iteration": 3.5368621349334717 }, { "auxiliary_loss_clip": 0.01182653, "auxiliary_loss_mlp": 0.01029088, "balance_loss_clip": 1.05986738, "balance_loss_mlp": 1.01842463, "epoch": 0.21463355979077736, "flos": 32853882412800.0, "grad_norm": 3.303115405024562, "language_loss": 0.70799768, "learning_rate": 3.653083285778726e-06, "loss": 0.73011506, "num_input_tokens_seen": 38024115, "step": 1785, "time_per_iteration": 2.71860933303833 }, { "auxiliary_loss_clip": 0.01205071, "auxiliary_loss_mlp": 0.01039031, "balance_loss_clip": 1.06188285, "balance_loss_mlp": 1.0286299, "epoch": 0.21475380268141647, "flos": 21543817248000.0, "grad_norm": 2.436123196633981, "language_loss": 0.81060988, "learning_rate": 3.6526446962904653e-06, "loss": 0.83305091, "num_input_tokens_seen": 38042830, "step": 1786, "time_per_iteration": 2.620867967605591 }, { "auxiliary_loss_clip": 0.01197007, "auxiliary_loss_mlp": 0.01042553, "balance_loss_clip": 1.06188047, "balance_loss_mlp": 1.03197944, "epoch": 0.21487404557205556, "flos": 32159082660480.0, "grad_norm": 1.460460516105577, "language_loss": 0.74273282, "learning_rate": 3.652205856097318e-06, "loss": 0.76512837, "num_input_tokens_seen": 38066015, "step": 1787, "time_per_iteration": 2.7057244777679443 }, { "auxiliary_loss_clip": 0.0117639, "auxiliary_loss_mlp": 0.00713945, "balance_loss_clip": 1.05767155, "balance_loss_mlp": 1.00077188, "epoch": 0.21499428846269464, "flos": 12673091583360.0, "grad_norm": 1.9688806184859247, "language_loss": 0.7939077, "learning_rate": 3.651766765265856e-06, "loss": 0.81281102, "num_input_tokens_seen": 38083025, "step": 1788, "time_per_iteration": 3.575662612915039 }, { "auxiliary_loss_clip": 0.01175968, "auxiliary_loss_mlp": 0.0103466, "balance_loss_clip": 1.05473018, "balance_loss_mlp": 1.02429509, "epoch": 0.21511453135333372, "flos": 23471573293440.0, "grad_norm": 2.6597902156895237, "language_loss": 0.81042278, "learning_rate": 3.65132742386269e-06, "loss": 0.83252907, "num_input_tokens_seen": 38098245, "step": 1789, "time_per_iteration": 2.6007678508758545 }, { "auxiliary_loss_clip": 0.01215507, "auxiliary_loss_mlp": 0.01034022, "balance_loss_clip": 1.06152558, "balance_loss_mlp": 1.02401996, "epoch": 0.21523477424397283, "flos": 26943560893440.0, "grad_norm": 1.7806244067744366, "language_loss": 0.84468591, "learning_rate": 3.6508878319544656e-06, "loss": 0.86718118, "num_input_tokens_seen": 38118460, "step": 1790, "time_per_iteration": 2.556539535522461 }, { "auxiliary_loss_clip": 0.01173171, "auxiliary_loss_mlp": 0.01040493, "balance_loss_clip": 1.05696666, "balance_loss_mlp": 1.02986538, "epoch": 0.21535501713461191, "flos": 18916161719040.0, "grad_norm": 2.693803106151193, "language_loss": 0.80732751, "learning_rate": 3.65044798960787e-06, "loss": 0.82946414, "num_input_tokens_seen": 38136800, "step": 1791, "time_per_iteration": 2.6015448570251465 }, { "auxiliary_loss_clip": 0.01157537, "auxiliary_loss_mlp": 0.01034019, "balance_loss_clip": 1.05430079, "balance_loss_mlp": 1.0234983, "epoch": 0.215475260025251, "flos": 17895113712000.0, "grad_norm": 2.0931309839779964, "language_loss": 0.78365874, "learning_rate": 3.650007896889627e-06, "loss": 0.8055743, "num_input_tokens_seen": 38155380, "step": 1792, "time_per_iteration": 2.555351734161377 }, { "auxiliary_loss_clip": 0.01218228, "auxiliary_loss_mlp": 0.01032379, "balance_loss_clip": 1.06611991, "balance_loss_mlp": 1.02198958, "epoch": 0.2155955029158901, "flos": 16654292340480.0, "grad_norm": 2.8710001783033747, "language_loss": 0.80769646, "learning_rate": 3.6495675538664974e-06, "loss": 0.83020258, "num_input_tokens_seen": 38174395, "step": 1793, "time_per_iteration": 2.5122265815734863 }, { "auxiliary_loss_clip": 0.01180185, "auxiliary_loss_mlp": 0.01034253, "balance_loss_clip": 1.05411768, "balance_loss_mlp": 1.02404892, "epoch": 0.2157157458065292, "flos": 23621213352960.0, "grad_norm": 1.7646250216202137, "language_loss": 0.82334954, "learning_rate": 3.649126960605282e-06, "loss": 0.84549391, "num_input_tokens_seen": 38195380, "step": 1794, "time_per_iteration": 2.669778823852539 }, { "auxiliary_loss_clip": 0.0117882, "auxiliary_loss_mlp": 0.01037742, "balance_loss_clip": 1.05871201, "balance_loss_mlp": 1.02661443, "epoch": 0.21583598869716827, "flos": 22127078292480.0, "grad_norm": 2.6153819419084954, "language_loss": 0.83890796, "learning_rate": 3.6486861171728174e-06, "loss": 0.86107355, "num_input_tokens_seen": 38213775, "step": 1795, "time_per_iteration": 2.695052146911621 }, { "auxiliary_loss_clip": 0.01164774, "auxiliary_loss_mlp": 0.01035773, "balance_loss_clip": 1.05327809, "balance_loss_mlp": 1.02503824, "epoch": 0.21595623158780738, "flos": 23441229279360.0, "grad_norm": 1.6548942848079913, "language_loss": 0.78463995, "learning_rate": 3.6482450236359803e-06, "loss": 0.80664551, "num_input_tokens_seen": 38235630, "step": 1796, "time_per_iteration": 2.7233355045318604 }, { "auxiliary_loss_clip": 0.01197708, "auxiliary_loss_mlp": 0.01039714, "balance_loss_clip": 1.06176543, "balance_loss_mlp": 1.02900314, "epoch": 0.21607647447844647, "flos": 26906501036160.0, "grad_norm": 2.131721437111872, "language_loss": 0.77560329, "learning_rate": 3.647803680061683e-06, "loss": 0.79797757, "num_input_tokens_seen": 38256045, "step": 1797, "time_per_iteration": 2.65537691116333 }, { "auxiliary_loss_clip": 0.01184093, "auxiliary_loss_mlp": 0.01030576, "balance_loss_clip": 1.05820167, "balance_loss_mlp": 1.01963294, "epoch": 0.21619671736908555, "flos": 14495378319360.0, "grad_norm": 3.4380463342750422, "language_loss": 0.74457788, "learning_rate": 3.6473620865168776e-06, "loss": 0.76672459, "num_input_tokens_seen": 38272915, "step": 1798, "time_per_iteration": 2.6520299911499023 }, { "auxiliary_loss_clip": 0.01182767, "auxiliary_loss_mlp": 0.01039772, "balance_loss_clip": 1.06037438, "balance_loss_mlp": 1.02914453, "epoch": 0.21631696025972463, "flos": 17931096161280.0, "grad_norm": 1.826648862260189, "language_loss": 0.81385994, "learning_rate": 3.646920243068554e-06, "loss": 0.83608538, "num_input_tokens_seen": 38290810, "step": 1799, "time_per_iteration": 2.644392490386963 }, { "auxiliary_loss_clip": 0.01168589, "auxiliary_loss_mlp": 0.01033917, "balance_loss_clip": 1.05530167, "balance_loss_mlp": 1.02315259, "epoch": 0.21643720315036374, "flos": 24462385027200.0, "grad_norm": 1.7586190184108166, "language_loss": 0.7485441, "learning_rate": 3.6464781497837384e-06, "loss": 0.77056921, "num_input_tokens_seen": 38312785, "step": 1800, "time_per_iteration": 2.730957508087158 }, { "auxiliary_loss_clip": 0.01187993, "auxiliary_loss_mlp": 0.01031003, "balance_loss_clip": 1.05915236, "balance_loss_mlp": 1.02086449, "epoch": 0.21655744604100283, "flos": 28474432588800.0, "grad_norm": 1.8908653473062984, "language_loss": 0.72493756, "learning_rate": 3.6460358067294965e-06, "loss": 0.74712753, "num_input_tokens_seen": 38334015, "step": 1801, "time_per_iteration": 2.688319683074951 }, { "auxiliary_loss_clip": 0.01216629, "auxiliary_loss_mlp": 0.01031681, "balance_loss_clip": 1.06103563, "balance_loss_mlp": 1.02126241, "epoch": 0.2166776889316419, "flos": 20152960767360.0, "grad_norm": 2.478178535229784, "language_loss": 0.77490211, "learning_rate": 3.645593213972932e-06, "loss": 0.79738522, "num_input_tokens_seen": 38352920, "step": 1802, "time_per_iteration": 2.6594319343566895 }, { "auxiliary_loss_clip": 0.01194855, "auxiliary_loss_mlp": 0.01037604, "balance_loss_clip": 1.06121969, "balance_loss_mlp": 1.02683926, "epoch": 0.21679793182228102, "flos": 15193482122880.0, "grad_norm": 2.235030586448516, "language_loss": 0.80183411, "learning_rate": 3.6451503715811852e-06, "loss": 0.82415867, "num_input_tokens_seen": 38371230, "step": 1803, "time_per_iteration": 2.6066946983337402 }, { "auxiliary_loss_clip": 0.01181914, "auxiliary_loss_mlp": 0.01035923, "balance_loss_clip": 1.06162441, "balance_loss_mlp": 1.02577269, "epoch": 0.2169181747129201, "flos": 17384464010880.0, "grad_norm": 1.9670536160999632, "language_loss": 0.79822063, "learning_rate": 3.6447072796214345e-06, "loss": 0.82039905, "num_input_tokens_seen": 38389795, "step": 1804, "time_per_iteration": 2.661491870880127 }, { "auxiliary_loss_clip": 0.01068915, "auxiliary_loss_mlp": 0.01019639, "balance_loss_clip": 1.03751862, "balance_loss_mlp": 1.01603878, "epoch": 0.21703841760355919, "flos": 58760955429120.0, "grad_norm": 0.9119606228533754, "language_loss": 0.63174778, "learning_rate": 3.644263938160898e-06, "loss": 0.65263331, "num_input_tokens_seen": 38445760, "step": 1805, "time_per_iteration": 3.169644832611084 }, { "auxiliary_loss_clip": 0.01168325, "auxiliary_loss_mlp": 0.01038238, "balance_loss_clip": 1.060179, "balance_loss_mlp": 1.0275985, "epoch": 0.21715866049419827, "flos": 22418457419520.0, "grad_norm": 1.893896370945883, "language_loss": 0.71818471, "learning_rate": 3.6438203472668293e-06, "loss": 0.74025035, "num_input_tokens_seen": 38465405, "step": 1806, "time_per_iteration": 2.7386481761932373 }, { "auxiliary_loss_clip": 0.01185838, "auxiliary_loss_mlp": 0.0103374, "balance_loss_clip": 1.06068444, "balance_loss_mlp": 1.02335739, "epoch": 0.21727890338483738, "flos": 17237732952960.0, "grad_norm": 3.9507089665257067, "language_loss": 0.81555468, "learning_rate": 3.6433765070065206e-06, "loss": 0.83775043, "num_input_tokens_seen": 38483195, "step": 1807, "time_per_iteration": 2.678070306777954 }, { "auxiliary_loss_clip": 0.01215369, "auxiliary_loss_mlp": 0.01035409, "balance_loss_clip": 1.06220961, "balance_loss_mlp": 1.0244422, "epoch": 0.21739914627547646, "flos": 13434792416640.0, "grad_norm": 2.3153209198064424, "language_loss": 0.8750785, "learning_rate": 3.6429324174473025e-06, "loss": 0.89758635, "num_input_tokens_seen": 38496735, "step": 1808, "time_per_iteration": 2.610929250717163 }, { "auxiliary_loss_clip": 0.01199533, "auxiliary_loss_mlp": 0.01037172, "balance_loss_clip": 1.05982351, "balance_loss_mlp": 1.02708697, "epoch": 0.21751938916611555, "flos": 20959514709120.0, "grad_norm": 3.6275307284868643, "language_loss": 0.84949416, "learning_rate": 3.6424880786565425e-06, "loss": 0.87186122, "num_input_tokens_seen": 38512880, "step": 1809, "time_per_iteration": 4.485093355178833 }, { "auxiliary_loss_clip": 0.01145852, "auxiliary_loss_mlp": 0.01035875, "balance_loss_clip": 1.05737638, "balance_loss_mlp": 1.02456248, "epoch": 0.21763963205675466, "flos": 27599936071680.0, "grad_norm": 3.064505593374719, "language_loss": 0.79850686, "learning_rate": 3.6420434907016482e-06, "loss": 0.82032412, "num_input_tokens_seen": 38532570, "step": 1810, "time_per_iteration": 3.5717530250549316 }, { "auxiliary_loss_clip": 0.01202168, "auxiliary_loss_mlp": 0.01029653, "balance_loss_clip": 1.06407428, "balance_loss_mlp": 1.01971102, "epoch": 0.21775987494739374, "flos": 21430411032960.0, "grad_norm": 2.1944729895161292, "language_loss": 0.81097794, "learning_rate": 3.6415986536500606e-06, "loss": 0.83329606, "num_input_tokens_seen": 38550900, "step": 1811, "time_per_iteration": 2.594817876815796 }, { "auxiliary_loss_clip": 0.01144306, "auxiliary_loss_mlp": 0.01032634, "balance_loss_clip": 1.06008816, "balance_loss_mlp": 1.02257252, "epoch": 0.21788011783803282, "flos": 18332972501760.0, "grad_norm": 2.0811518516229777, "language_loss": 0.80542123, "learning_rate": 3.641153567569263e-06, "loss": 0.82719064, "num_input_tokens_seen": 38569215, "step": 1812, "time_per_iteration": 2.6936681270599365 }, { "auxiliary_loss_clip": 0.01194285, "auxiliary_loss_mlp": 0.01036307, "balance_loss_clip": 1.0589509, "balance_loss_mlp": 1.02616858, "epoch": 0.2180003607286719, "flos": 30262748037120.0, "grad_norm": 2.4249881717478106, "language_loss": 0.95559108, "learning_rate": 3.640708232526774e-06, "loss": 0.97789705, "num_input_tokens_seen": 38587870, "step": 1813, "time_per_iteration": 2.6677212715148926 }, { "auxiliary_loss_clip": 0.01127749, "auxiliary_loss_mlp": 0.01041524, "balance_loss_clip": 1.04818773, "balance_loss_mlp": 1.03089023, "epoch": 0.21812060361931102, "flos": 25480272637440.0, "grad_norm": 1.9979358737563844, "language_loss": 0.78361237, "learning_rate": 3.6402626485901504e-06, "loss": 0.80530506, "num_input_tokens_seen": 38606965, "step": 1814, "time_per_iteration": 3.7129876613616943 }, { "auxiliary_loss_clip": 0.01195539, "auxiliary_loss_mlp": 0.01038166, "balance_loss_clip": 1.06301975, "balance_loss_mlp": 1.02829552, "epoch": 0.2182408465099501, "flos": 21908166854400.0, "grad_norm": 2.051373420295439, "language_loss": 0.78022015, "learning_rate": 3.639816815826988e-06, "loss": 0.80255717, "num_input_tokens_seen": 38626290, "step": 1815, "time_per_iteration": 2.609076976776123 }, { "auxiliary_loss_clip": 0.01179653, "auxiliary_loss_mlp": 0.01040317, "balance_loss_clip": 1.05997276, "balance_loss_mlp": 1.03047001, "epoch": 0.21836108940058918, "flos": 23657339456640.0, "grad_norm": 2.2620039376651633, "language_loss": 0.77802038, "learning_rate": 3.6393707343049176e-06, "loss": 0.80022001, "num_input_tokens_seen": 38646620, "step": 1816, "time_per_iteration": 2.6556477546691895 }, { "auxiliary_loss_clip": 0.0120146, "auxiliary_loss_mlp": 0.01034483, "balance_loss_clip": 1.06003785, "balance_loss_mlp": 1.02455902, "epoch": 0.2184813322912283, "flos": 24681009156480.0, "grad_norm": 2.548481621512073, "language_loss": 0.73189676, "learning_rate": 3.6389244040916104e-06, "loss": 0.75425625, "num_input_tokens_seen": 38665695, "step": 1817, "time_per_iteration": 2.684131383895874 }, { "auxiliary_loss_clip": 0.01170892, "auxiliary_loss_mlp": 0.00714881, "balance_loss_clip": 1.05584717, "balance_loss_mlp": 1.00072956, "epoch": 0.21860157518186737, "flos": 26574650259840.0, "grad_norm": 2.0598135732690603, "language_loss": 0.79442209, "learning_rate": 3.6384778252547747e-06, "loss": 0.81327981, "num_input_tokens_seen": 38681575, "step": 1818, "time_per_iteration": 2.7165844440460205 }, { "auxiliary_loss_clip": 0.01178816, "auxiliary_loss_mlp": 0.00714155, "balance_loss_clip": 1.06172884, "balance_loss_mlp": 1.00083315, "epoch": 0.21872181807250646, "flos": 20886292834560.0, "grad_norm": 2.416386537152908, "language_loss": 0.77864397, "learning_rate": 3.638030997862155e-06, "loss": 0.79757375, "num_input_tokens_seen": 38700510, "step": 1819, "time_per_iteration": 2.6906869411468506 }, { "auxiliary_loss_clip": 0.01095923, "auxiliary_loss_mlp": 0.01005204, "balance_loss_clip": 1.04130816, "balance_loss_mlp": 1.00169897, "epoch": 0.21884206096314554, "flos": 61209452897280.0, "grad_norm": 0.7641964159670924, "language_loss": 0.59418178, "learning_rate": 3.6375839219815356e-06, "loss": 0.61519301, "num_input_tokens_seen": 38758310, "step": 1820, "time_per_iteration": 3.181307077407837 }, { "auxiliary_loss_clip": 0.01216566, "auxiliary_loss_mlp": 0.01037368, "balance_loss_clip": 1.06235886, "balance_loss_mlp": 1.02688301, "epoch": 0.21896230385378465, "flos": 23473835850240.0, "grad_norm": 2.2627704179503576, "language_loss": 0.82553226, "learning_rate": 3.6371365976807375e-06, "loss": 0.84807158, "num_input_tokens_seen": 38778705, "step": 1821, "time_per_iteration": 2.6443498134613037 }, { "auxiliary_loss_clip": 0.01140203, "auxiliary_loss_mlp": 0.01046935, "balance_loss_clip": 1.0553515, "balance_loss_mlp": 1.03679585, "epoch": 0.21908254674442373, "flos": 25081915829760.0, "grad_norm": 1.8436266516185933, "language_loss": 0.8331483, "learning_rate": 3.6366890250276185e-06, "loss": 0.85501963, "num_input_tokens_seen": 38799660, "step": 1822, "time_per_iteration": 2.747483253479004 }, { "auxiliary_loss_clip": 0.01214108, "auxiliary_loss_mlp": 0.01034028, "balance_loss_clip": 1.06214762, "balance_loss_mlp": 1.02462208, "epoch": 0.21920278963506282, "flos": 23513768795520.0, "grad_norm": 2.340811993914169, "language_loss": 0.89996362, "learning_rate": 3.6362412040900764e-06, "loss": 0.92244494, "num_input_tokens_seen": 38819450, "step": 1823, "time_per_iteration": 2.6863279342651367 }, { "auxiliary_loss_clip": 0.01199854, "auxiliary_loss_mlp": 0.01031564, "balance_loss_clip": 1.05914879, "balance_loss_mlp": 1.02094817, "epoch": 0.21932303252570193, "flos": 29242238734080.0, "grad_norm": 2.49251240704782, "language_loss": 0.80263484, "learning_rate": 3.635793134936044e-06, "loss": 0.82494903, "num_input_tokens_seen": 38840460, "step": 1824, "time_per_iteration": 2.6676042079925537 }, { "auxiliary_loss_clip": 0.01197896, "auxiliary_loss_mlp": 0.01035993, "balance_loss_clip": 1.06122112, "balance_loss_mlp": 1.02600288, "epoch": 0.219443275416341, "flos": 20806857907200.0, "grad_norm": 4.983725506460881, "language_loss": 0.73003101, "learning_rate": 3.635344817633494e-06, "loss": 0.75236988, "num_input_tokens_seen": 38859775, "step": 1825, "time_per_iteration": 2.741034746170044 }, { "auxiliary_loss_clip": 0.01196041, "auxiliary_loss_mlp": 0.01035022, "balance_loss_clip": 1.06072116, "balance_loss_mlp": 1.02468061, "epoch": 0.2195635183069801, "flos": 14501555458560.0, "grad_norm": 2.1369921389096014, "language_loss": 0.75330186, "learning_rate": 3.634896252250436e-06, "loss": 0.77561247, "num_input_tokens_seen": 38876540, "step": 1826, "time_per_iteration": 2.597778797149658 }, { "auxiliary_loss_clip": 0.01219106, "auxiliary_loss_mlp": 0.01036802, "balance_loss_clip": 1.06414223, "balance_loss_mlp": 1.0260675, "epoch": 0.2196837611976192, "flos": 24243473589120.0, "grad_norm": 2.2747034352458133, "language_loss": 0.82342911, "learning_rate": 3.6344474388549157e-06, "loss": 0.84598815, "num_input_tokens_seen": 38896195, "step": 1827, "time_per_iteration": 2.6592447757720947 }, { "auxiliary_loss_clip": 0.0119981, "auxiliary_loss_mlp": 0.01036347, "balance_loss_clip": 1.06186223, "balance_loss_mlp": 1.02616024, "epoch": 0.2198040040882583, "flos": 18074523168000.0, "grad_norm": 2.053752096044472, "language_loss": 0.80006397, "learning_rate": 3.6339983775150183e-06, "loss": 0.8224256, "num_input_tokens_seen": 38912755, "step": 1828, "time_per_iteration": 2.5621416568756104 }, { "auxiliary_loss_clip": 0.01196714, "auxiliary_loss_mlp": 0.01031368, "balance_loss_clip": 1.06202221, "balance_loss_mlp": 1.0206933, "epoch": 0.21992424697889737, "flos": 17784185535360.0, "grad_norm": 3.3849270083899525, "language_loss": 0.84591556, "learning_rate": 3.6335490682988664e-06, "loss": 0.86819637, "num_input_tokens_seen": 38928365, "step": 1829, "time_per_iteration": 2.6237545013427734 }, { "auxiliary_loss_clip": 0.01125816, "auxiliary_loss_mlp": 0.01035761, "balance_loss_clip": 1.05191505, "balance_loss_mlp": 1.02484739, "epoch": 0.22004448986953645, "flos": 17638495971840.0, "grad_norm": 1.926193965308287, "language_loss": 0.83142197, "learning_rate": 3.63309951127462e-06, "loss": 0.85303771, "num_input_tokens_seen": 38945275, "step": 1830, "time_per_iteration": 2.7742581367492676 }, { "auxiliary_loss_clip": 0.01164867, "auxiliary_loss_mlp": 0.01038618, "balance_loss_clip": 1.05862284, "balance_loss_mlp": 1.0277338, "epoch": 0.22016473276017556, "flos": 22275533203200.0, "grad_norm": 2.6050545473717293, "language_loss": 0.75204527, "learning_rate": 3.6326497065104757e-06, "loss": 0.77408016, "num_input_tokens_seen": 38965740, "step": 1831, "time_per_iteration": 2.7317497730255127 }, { "auxiliary_loss_clip": 0.0120347, "auxiliary_loss_mlp": 0.01035483, "balance_loss_clip": 1.0621078, "balance_loss_mlp": 1.02504015, "epoch": 0.22028497565081465, "flos": 25556259859200.0, "grad_norm": 5.895607582105031, "language_loss": 0.77350426, "learning_rate": 3.6321996540746697e-06, "loss": 0.79589379, "num_input_tokens_seen": 38984815, "step": 1832, "time_per_iteration": 2.687469005584717 }, { "auxiliary_loss_clip": 0.01162242, "auxiliary_loss_mlp": 0.01037035, "balance_loss_clip": 1.05622756, "balance_loss_mlp": 1.02694356, "epoch": 0.22040521854145373, "flos": 36247332925440.0, "grad_norm": 3.4156155833830835, "language_loss": 0.80485713, "learning_rate": 3.6317493540354733e-06, "loss": 0.82684988, "num_input_tokens_seen": 39008230, "step": 1833, "time_per_iteration": 2.772521734237671 }, { "auxiliary_loss_clip": 0.01191228, "auxiliary_loss_mlp": 0.01036131, "balance_loss_clip": 1.05756211, "balance_loss_mlp": 1.02546799, "epoch": 0.22052546143209284, "flos": 11838420270720.0, "grad_norm": 1.8956321753528438, "language_loss": 0.76693618, "learning_rate": 3.6312988064611976e-06, "loss": 0.78920984, "num_input_tokens_seen": 39026540, "step": 1834, "time_per_iteration": 2.621013879776001 }, { "auxiliary_loss_clip": 0.01162852, "auxiliary_loss_mlp": 0.01038029, "balance_loss_clip": 1.05289614, "balance_loss_mlp": 1.02781272, "epoch": 0.22064570432273192, "flos": 24209250906240.0, "grad_norm": 1.757870933174858, "language_loss": 0.81235409, "learning_rate": 3.6308480114201896e-06, "loss": 0.83436292, "num_input_tokens_seen": 39048460, "step": 1835, "time_per_iteration": 3.5845730304718018 }, { "auxiliary_loss_clip": 0.01217925, "auxiliary_loss_mlp": 0.01037961, "balance_loss_clip": 1.06583858, "balance_loss_mlp": 1.02747107, "epoch": 0.220765947213371, "flos": 17931347556480.0, "grad_norm": 1.8158144935526652, "language_loss": 0.76363122, "learning_rate": 3.630396968980835e-06, "loss": 0.78619003, "num_input_tokens_seen": 39066335, "step": 1836, "time_per_iteration": 3.500368356704712 }, { "auxiliary_loss_clip": 0.01181262, "auxiliary_loss_mlp": 0.01032853, "balance_loss_clip": 1.05735707, "balance_loss_mlp": 1.02214789, "epoch": 0.2208861901040101, "flos": 26757040544640.0, "grad_norm": 3.0924906881101926, "language_loss": 0.83345503, "learning_rate": 3.6299456792115575e-06, "loss": 0.85559618, "num_input_tokens_seen": 39087590, "step": 1837, "time_per_iteration": 2.758342981338501 }, { "auxiliary_loss_clip": 0.0109435, "auxiliary_loss_mlp": 0.01037118, "balance_loss_clip": 1.04494095, "balance_loss_mlp": 1.02622843, "epoch": 0.2210064329946492, "flos": 17817977255040.0, "grad_norm": 3.253683953661522, "language_loss": 0.80939376, "learning_rate": 3.629494142180815e-06, "loss": 0.83070844, "num_input_tokens_seen": 39106335, "step": 1838, "time_per_iteration": 2.763315200805664 }, { "auxiliary_loss_clip": 0.01213739, "auxiliary_loss_mlp": 0.01034843, "balance_loss_clip": 1.06112075, "balance_loss_mlp": 1.02426338, "epoch": 0.22112667588528828, "flos": 17967401832960.0, "grad_norm": 2.4077265909885868, "language_loss": 0.84959185, "learning_rate": 3.6290423579571075e-06, "loss": 0.87207764, "num_input_tokens_seen": 39122875, "step": 1839, "time_per_iteration": 3.484236001968384 }, { "auxiliary_loss_clip": 0.01195388, "auxiliary_loss_mlp": 0.0103633, "balance_loss_clip": 1.06063867, "balance_loss_mlp": 1.0260129, "epoch": 0.22124691877592736, "flos": 18369206346240.0, "grad_norm": 1.7101460305814649, "language_loss": 0.79700732, "learning_rate": 3.6285903266089694e-06, "loss": 0.81932449, "num_input_tokens_seen": 39142150, "step": 1840, "time_per_iteration": 2.6397409439086914 }, { "auxiliary_loss_clip": 0.01182416, "auxiliary_loss_mlp": 0.01033456, "balance_loss_clip": 1.0596869, "balance_loss_mlp": 1.02340055, "epoch": 0.22136716166656648, "flos": 20813286441600.0, "grad_norm": 2.343226636258376, "language_loss": 0.7687552, "learning_rate": 3.628138048204974e-06, "loss": 0.79091394, "num_input_tokens_seen": 39162835, "step": 1841, "time_per_iteration": 2.6912739276885986 }, { "auxiliary_loss_clip": 0.01137161, "auxiliary_loss_mlp": 0.01037247, "balance_loss_clip": 1.05173397, "balance_loss_mlp": 1.02664399, "epoch": 0.22148740455720556, "flos": 17675699483520.0, "grad_norm": 2.0017960954061143, "language_loss": 0.75979024, "learning_rate": 3.6276855228137304e-06, "loss": 0.78153431, "num_input_tokens_seen": 39181040, "step": 1842, "time_per_iteration": 2.671478748321533 }, { "auxiliary_loss_clip": 0.01214628, "auxiliary_loss_mlp": 0.00713959, "balance_loss_clip": 1.06237662, "balance_loss_mlp": 1.00082695, "epoch": 0.22160764744784464, "flos": 21726710323200.0, "grad_norm": 2.3885981014513016, "language_loss": 0.81818211, "learning_rate": 3.6272327505038874e-06, "loss": 0.83746797, "num_input_tokens_seen": 39197505, "step": 1843, "time_per_iteration": 2.60728120803833 }, { "auxiliary_loss_clip": 0.01154643, "auxiliary_loss_mlp": 0.0103234, "balance_loss_clip": 1.05473852, "balance_loss_mlp": 1.02253461, "epoch": 0.22172789033848372, "flos": 23764712186880.0, "grad_norm": 2.042826265717023, "language_loss": 0.77935773, "learning_rate": 3.626779731344131e-06, "loss": 0.80122757, "num_input_tokens_seen": 39217295, "step": 1844, "time_per_iteration": 2.9060943126678467 }, { "auxiliary_loss_clip": 0.01211104, "auxiliary_loss_mlp": 0.01034767, "balance_loss_clip": 1.05928063, "balance_loss_mlp": 1.02485514, "epoch": 0.22184813322912283, "flos": 16982300361600.0, "grad_norm": 2.1739700012240384, "language_loss": 0.85242593, "learning_rate": 3.6263264654031814e-06, "loss": 0.87488461, "num_input_tokens_seen": 39234195, "step": 1845, "time_per_iteration": 2.6151959896087646 }, { "auxiliary_loss_clip": 0.01108619, "auxiliary_loss_mlp": 0.0102318, "balance_loss_clip": 1.05852842, "balance_loss_mlp": 1.01967549, "epoch": 0.22196837611976192, "flos": 61823740314240.0, "grad_norm": 0.751208082863956, "language_loss": 0.59157586, "learning_rate": 3.6258729527498008e-06, "loss": 0.61289382, "num_input_tokens_seen": 39295040, "step": 1846, "time_per_iteration": 3.239999294281006 }, { "auxiliary_loss_clip": 0.01184324, "auxiliary_loss_mlp": 0.01032427, "balance_loss_clip": 1.05782056, "balance_loss_mlp": 1.02177596, "epoch": 0.222088619010401, "flos": 25558019625600.0, "grad_norm": 2.5747837577835213, "language_loss": 0.64489239, "learning_rate": 3.6254191934527854e-06, "loss": 0.6670599, "num_input_tokens_seen": 39314395, "step": 1847, "time_per_iteration": 2.6882147789001465 }, { "auxiliary_loss_clip": 0.01164125, "auxiliary_loss_mlp": 0.01038149, "balance_loss_clip": 1.05976033, "balance_loss_mlp": 1.02776623, "epoch": 0.2222088619010401, "flos": 19318612677120.0, "grad_norm": 2.1558389373226614, "language_loss": 0.64527333, "learning_rate": 3.6249651875809715e-06, "loss": 0.66729605, "num_input_tokens_seen": 39334275, "step": 1848, "time_per_iteration": 2.7419652938842773 }, { "auxiliary_loss_clip": 0.01176019, "auxiliary_loss_mlp": 0.01037432, "balance_loss_clip": 1.05862224, "balance_loss_mlp": 1.02727485, "epoch": 0.2223291047916792, "flos": 19099342103040.0, "grad_norm": 2.1879597580338417, "language_loss": 0.89179683, "learning_rate": 3.62451093520323e-06, "loss": 0.91393137, "num_input_tokens_seen": 39352180, "step": 1849, "time_per_iteration": 2.653888463973999 }, { "auxiliary_loss_clip": 0.01140409, "auxiliary_loss_mlp": 0.01034395, "balance_loss_clip": 1.05193913, "balance_loss_mlp": 1.02351153, "epoch": 0.22244934768231828, "flos": 20850418126080.0, "grad_norm": 2.126753298389836, "language_loss": 0.90506238, "learning_rate": 3.6240564363884714e-06, "loss": 0.92681044, "num_input_tokens_seen": 39372125, "step": 1850, "time_per_iteration": 2.6997182369232178 }, { "auxiliary_loss_clip": 0.01198546, "auxiliary_loss_mlp": 0.01037991, "balance_loss_clip": 1.0561868, "balance_loss_mlp": 1.02739334, "epoch": 0.2225695905729574, "flos": 15632921111040.0, "grad_norm": 1.9649502457674517, "language_loss": 0.70495957, "learning_rate": 3.623601691205643e-06, "loss": 0.72732502, "num_input_tokens_seen": 39391200, "step": 1851, "time_per_iteration": 2.6288623809814453 }, { "auxiliary_loss_clip": 0.01193051, "auxiliary_loss_mlp": 0.0103464, "balance_loss_clip": 1.05614233, "balance_loss_mlp": 1.02426243, "epoch": 0.22268983346359647, "flos": 25373582265600.0, "grad_norm": 3.095077396123972, "language_loss": 0.81391543, "learning_rate": 3.623146699723729e-06, "loss": 0.83619237, "num_input_tokens_seen": 39410660, "step": 1852, "time_per_iteration": 2.64542293548584 }, { "auxiliary_loss_clip": 0.01182247, "auxiliary_loss_mlp": 0.01030206, "balance_loss_clip": 1.06306267, "balance_loss_mlp": 1.01997817, "epoch": 0.22281007635423555, "flos": 13261452359040.0, "grad_norm": 1.6760426964086743, "language_loss": 0.77365941, "learning_rate": 3.6226914620117507e-06, "loss": 0.79578394, "num_input_tokens_seen": 39429280, "step": 1853, "time_per_iteration": 2.603379487991333 }, { "auxiliary_loss_clip": 0.0116028, "auxiliary_loss_mlp": 0.01031227, "balance_loss_clip": 1.04997754, "balance_loss_mlp": 1.02141011, "epoch": 0.22293031924487464, "flos": 15340536403200.0, "grad_norm": 2.6329196155090115, "language_loss": 0.80650967, "learning_rate": 3.622235978138768e-06, "loss": 0.82842475, "num_input_tokens_seen": 39446905, "step": 1854, "time_per_iteration": 2.6982498168945312 }, { "auxiliary_loss_clip": 0.01198236, "auxiliary_loss_mlp": 0.01045495, "balance_loss_clip": 1.06246257, "balance_loss_mlp": 1.03552866, "epoch": 0.22305056213551375, "flos": 22564649773440.0, "grad_norm": 2.0349353362149714, "language_loss": 0.8094179, "learning_rate": 3.621780248173877e-06, "loss": 0.83185518, "num_input_tokens_seen": 39465105, "step": 1855, "time_per_iteration": 2.669551372528076 }, { "auxiliary_loss_clip": 0.01133022, "auxiliary_loss_mlp": 0.01007304, "balance_loss_clip": 1.0532757, "balance_loss_mlp": 1.00320363, "epoch": 0.22317080502615283, "flos": 64880419887360.0, "grad_norm": 0.8351932727921922, "language_loss": 0.61005872, "learning_rate": 3.6213242721862125e-06, "loss": 0.63146204, "num_input_tokens_seen": 39523560, "step": 1856, "time_per_iteration": 3.1949334144592285 }, { "auxiliary_loss_clip": 0.01172545, "auxiliary_loss_mlp": 0.01038276, "balance_loss_clip": 1.05854154, "balance_loss_mlp": 1.0286442, "epoch": 0.2232910479167919, "flos": 25775997310080.0, "grad_norm": 1.7926464495617176, "language_loss": 0.74902534, "learning_rate": 3.620868050244945e-06, "loss": 0.7711336, "num_input_tokens_seen": 39544040, "step": 1857, "time_per_iteration": 2.727710247039795 }, { "auxiliary_loss_clip": 0.01172945, "auxiliary_loss_mlp": 0.01035638, "balance_loss_clip": 1.05386043, "balance_loss_mlp": 1.02473021, "epoch": 0.22341129080743102, "flos": 23251799928960.0, "grad_norm": 1.8929098395430786, "language_loss": 0.77800536, "learning_rate": 3.6204115824192817e-06, "loss": 0.80009121, "num_input_tokens_seen": 39561515, "step": 1858, "time_per_iteration": 2.6457931995391846 }, { "auxiliary_loss_clip": 0.01171801, "auxiliary_loss_mlp": 0.01034205, "balance_loss_clip": 1.05469048, "balance_loss_mlp": 1.02341652, "epoch": 0.2235315336980701, "flos": 21214552250880.0, "grad_norm": 2.536183508173937, "language_loss": 0.76547915, "learning_rate": 3.619954868778471e-06, "loss": 0.78753924, "num_input_tokens_seen": 39578210, "step": 1859, "time_per_iteration": 2.6927356719970703 }, { "auxiliary_loss_clip": 0.01177866, "auxiliary_loss_mlp": 0.01044249, "balance_loss_clip": 1.05639625, "balance_loss_mlp": 1.03449821, "epoch": 0.2236517765887092, "flos": 19901945548800.0, "grad_norm": 1.8453057223700078, "language_loss": 0.8227697, "learning_rate": 3.6194979093917944e-06, "loss": 0.84499085, "num_input_tokens_seen": 39597625, "step": 1860, "time_per_iteration": 2.625425100326538 }, { "auxiliary_loss_clip": 0.01172224, "auxiliary_loss_mlp": 0.01036507, "balance_loss_clip": 1.05566549, "balance_loss_mlp": 1.02651727, "epoch": 0.22377201947934827, "flos": 23214847812480.0, "grad_norm": 2.212053628287465, "language_loss": 0.86642444, "learning_rate": 3.6190407043285724e-06, "loss": 0.88851178, "num_input_tokens_seen": 39615360, "step": 1861, "time_per_iteration": 4.447360515594482 }, { "auxiliary_loss_clip": 0.01212556, "auxiliary_loss_mlp": 0.01040439, "balance_loss_clip": 1.05907798, "balance_loss_mlp": 1.03010345, "epoch": 0.22389226236998738, "flos": 26794244056320.0, "grad_norm": 2.2643063027009362, "language_loss": 0.75716209, "learning_rate": 3.618583253658163e-06, "loss": 0.77969205, "num_input_tokens_seen": 39635460, "step": 1862, "time_per_iteration": 2.720869779586792 }, { "auxiliary_loss_clip": 0.01146328, "auxiliary_loss_mlp": 0.0071439, "balance_loss_clip": 1.05465221, "balance_loss_mlp": 1.00089896, "epoch": 0.22401250526062647, "flos": 24170359455360.0, "grad_norm": 1.853880205033495, "language_loss": 0.86266452, "learning_rate": 3.618125557449961e-06, "loss": 0.88127172, "num_input_tokens_seen": 39653515, "step": 1863, "time_per_iteration": 3.5599210262298584 }, { "auxiliary_loss_clip": 0.01193096, "auxiliary_loss_mlp": 0.0104234, "balance_loss_clip": 1.0589875, "balance_loss_mlp": 1.03195071, "epoch": 0.22413274815126555, "flos": 16759761649920.0, "grad_norm": 2.0650289866197484, "language_loss": 0.83506137, "learning_rate": 3.6176676157733983e-06, "loss": 0.8574158, "num_input_tokens_seen": 39668525, "step": 1864, "time_per_iteration": 2.6103663444519043 }, { "auxiliary_loss_clip": 0.01156761, "auxiliary_loss_mlp": 0.01033119, "balance_loss_clip": 1.05382299, "balance_loss_mlp": 1.02295041, "epoch": 0.22425299104190466, "flos": 21360205900800.0, "grad_norm": 2.4526491620383446, "language_loss": 0.7591157, "learning_rate": 3.6172094286979443e-06, "loss": 0.7810145, "num_input_tokens_seen": 39685895, "step": 1865, "time_per_iteration": 3.5709946155548096 }, { "auxiliary_loss_clip": 0.0117817, "auxiliary_loss_mlp": 0.01034473, "balance_loss_clip": 1.05556238, "balance_loss_mlp": 1.02413738, "epoch": 0.22437323393254374, "flos": 32165547108480.0, "grad_norm": 1.4262576777830807, "language_loss": 0.81402141, "learning_rate": 3.6167509962931064e-06, "loss": 0.83614784, "num_input_tokens_seen": 39711595, "step": 1866, "time_per_iteration": 2.7264151573181152 }, { "auxiliary_loss_clip": 0.01151742, "auxiliary_loss_mlp": 0.01026886, "balance_loss_clip": 1.05388403, "balance_loss_mlp": 1.0170629, "epoch": 0.22449347682318282, "flos": 18002809664640.0, "grad_norm": 2.8456306441102526, "language_loss": 0.76898575, "learning_rate": 3.6162923186284276e-06, "loss": 0.79077202, "num_input_tokens_seen": 39727555, "step": 1867, "time_per_iteration": 2.684427261352539 }, { "auxiliary_loss_clip": 0.01176235, "auxiliary_loss_mlp": 0.0104148, "balance_loss_clip": 1.05537546, "balance_loss_mlp": 1.03144228, "epoch": 0.2246137197138219, "flos": 18697286194560.0, "grad_norm": 2.026333106656427, "language_loss": 0.85847902, "learning_rate": 3.6158333957734888e-06, "loss": 0.88065624, "num_input_tokens_seen": 39746145, "step": 1868, "time_per_iteration": 2.637906789779663 }, { "auxiliary_loss_clip": 0.01167045, "auxiliary_loss_mlp": 0.01032938, "balance_loss_clip": 1.05452776, "balance_loss_mlp": 1.02328169, "epoch": 0.22473396260446102, "flos": 15590653781760.0, "grad_norm": 2.2067169434337264, "language_loss": 0.82794762, "learning_rate": 3.6153742277979088e-06, "loss": 0.84994745, "num_input_tokens_seen": 39763575, "step": 1869, "time_per_iteration": 2.6979997158050537 }, { "auxiliary_loss_clip": 0.01180503, "auxiliary_loss_mlp": 0.01037835, "balance_loss_clip": 1.05731475, "balance_loss_mlp": 1.02682579, "epoch": 0.2248542054951001, "flos": 14465501182080.0, "grad_norm": 2.2568749580093668, "language_loss": 0.77899516, "learning_rate": 3.6149148147713434e-06, "loss": 0.80117857, "num_input_tokens_seen": 39781810, "step": 1870, "time_per_iteration": 2.6059515476226807 }, { "auxiliary_loss_clip": 0.01205123, "auxiliary_loss_mlp": 0.01035408, "balance_loss_clip": 1.06459904, "balance_loss_mlp": 1.02495337, "epoch": 0.22497444838573918, "flos": 19243882431360.0, "grad_norm": 1.9907227138281745, "language_loss": 0.8682791, "learning_rate": 3.614455156763484e-06, "loss": 0.89068443, "num_input_tokens_seen": 39800115, "step": 1871, "time_per_iteration": 2.652172327041626 }, { "auxiliary_loss_clip": 0.0114073, "auxiliary_loss_mlp": 0.01039976, "balance_loss_clip": 1.04929662, "balance_loss_mlp": 1.02994502, "epoch": 0.2250946912763783, "flos": 16910299549440.0, "grad_norm": 2.4155781074396008, "language_loss": 0.71097869, "learning_rate": 3.613995253844061e-06, "loss": 0.7327857, "num_input_tokens_seen": 39817795, "step": 1872, "time_per_iteration": 2.673729419708252 }, { "auxiliary_loss_clip": 0.01191478, "auxiliary_loss_mlp": 0.01030575, "balance_loss_clip": 1.05828142, "balance_loss_mlp": 1.02072239, "epoch": 0.22521493416701738, "flos": 24681368292480.0, "grad_norm": 1.8922937938394955, "language_loss": 0.80703676, "learning_rate": 3.6135351060828414e-06, "loss": 0.82925737, "num_input_tokens_seen": 39838270, "step": 1873, "time_per_iteration": 2.7469348907470703 }, { "auxiliary_loss_clip": 0.0122046, "auxiliary_loss_mlp": 0.0104136, "balance_loss_clip": 1.06507564, "balance_loss_mlp": 1.03017211, "epoch": 0.22533517705765646, "flos": 17821963664640.0, "grad_norm": 2.1520510163540454, "language_loss": 0.6925329, "learning_rate": 3.6130747135496285e-06, "loss": 0.71515107, "num_input_tokens_seen": 39857270, "step": 1874, "time_per_iteration": 2.62284517288208 }, { "auxiliary_loss_clip": 0.01209775, "auxiliary_loss_mlp": 0.01031804, "balance_loss_clip": 1.05962515, "balance_loss_mlp": 1.02160025, "epoch": 0.22545541994829554, "flos": 33691390899840.0, "grad_norm": 1.738504413428303, "language_loss": 0.65745842, "learning_rate": 3.6126140763142646e-06, "loss": 0.67987418, "num_input_tokens_seen": 39882300, "step": 1875, "time_per_iteration": 2.669570207595825 }, { "auxiliary_loss_clip": 0.01214539, "auxiliary_loss_mlp": 0.01044065, "balance_loss_clip": 1.06163049, "balance_loss_mlp": 1.03293097, "epoch": 0.22557566283893465, "flos": 19171594310400.0, "grad_norm": 5.618545495393197, "language_loss": 0.8665033, "learning_rate": 3.6121531944466275e-06, "loss": 0.88908935, "num_input_tokens_seen": 39899625, "step": 1876, "time_per_iteration": 2.6136178970336914 }, { "auxiliary_loss_clip": 0.01195996, "auxiliary_loss_mlp": 0.01035557, "balance_loss_clip": 1.05986881, "balance_loss_mlp": 1.02509677, "epoch": 0.22569590572957374, "flos": 20773281669120.0, "grad_norm": 2.4003286672787696, "language_loss": 0.78614068, "learning_rate": 3.611692068016633e-06, "loss": 0.80845618, "num_input_tokens_seen": 39915955, "step": 1877, "time_per_iteration": 2.6042284965515137 }, { "auxiliary_loss_clip": 0.01158951, "auxiliary_loss_mlp": 0.01037561, "balance_loss_clip": 1.05264711, "balance_loss_mlp": 1.0270232, "epoch": 0.22581614862021282, "flos": 18442715529600.0, "grad_norm": 2.2136315201032692, "language_loss": 0.75270104, "learning_rate": 3.611230697094233e-06, "loss": 0.77466619, "num_input_tokens_seen": 39932655, "step": 1878, "time_per_iteration": 2.6866748332977295 }, { "auxiliary_loss_clip": 0.0118385, "auxiliary_loss_mlp": 0.01035635, "balance_loss_clip": 1.05853295, "balance_loss_mlp": 1.0254724, "epoch": 0.22593639151085193, "flos": 20048389297920.0, "grad_norm": 2.731366026576694, "language_loss": 0.87167931, "learning_rate": 3.6107690817494173e-06, "loss": 0.89387417, "num_input_tokens_seen": 39952875, "step": 1879, "time_per_iteration": 2.6777772903442383 }, { "auxiliary_loss_clip": 0.01138705, "auxiliary_loss_mlp": 0.01028644, "balance_loss_clip": 1.05143142, "balance_loss_mlp": 1.0188868, "epoch": 0.226056634401491, "flos": 13115116350720.0, "grad_norm": 3.1172894351086944, "language_loss": 0.70551085, "learning_rate": 3.6103072220522117e-06, "loss": 0.7271843, "num_input_tokens_seen": 39968405, "step": 1880, "time_per_iteration": 2.721430778503418 }, { "auxiliary_loss_clip": 0.01167555, "auxiliary_loss_mlp": 0.01033498, "balance_loss_clip": 1.05634308, "balance_loss_mlp": 1.02322221, "epoch": 0.2261768772921301, "flos": 18988378012800.0, "grad_norm": 3.2780669221049368, "language_loss": 0.918145, "learning_rate": 3.609845118072682e-06, "loss": 0.94015551, "num_input_tokens_seen": 39987075, "step": 1881, "time_per_iteration": 2.661548614501953 }, { "auxiliary_loss_clip": 0.01202692, "auxiliary_loss_mlp": 0.00714025, "balance_loss_clip": 1.05928969, "balance_loss_mlp": 1.00085735, "epoch": 0.2262971201827692, "flos": 19974054101760.0, "grad_norm": 1.7010884683450411, "language_loss": 0.79689157, "learning_rate": 3.6093827698809276e-06, "loss": 0.81605875, "num_input_tokens_seen": 40006175, "step": 1882, "time_per_iteration": 2.7475669384002686 }, { "auxiliary_loss_clip": 0.01192747, "auxiliary_loss_mlp": 0.01032776, "balance_loss_clip": 1.05517375, "balance_loss_mlp": 1.02321565, "epoch": 0.2264173630734083, "flos": 16654543735680.0, "grad_norm": 2.4781362020491047, "language_loss": 0.84738481, "learning_rate": 3.6089201775470864e-06, "loss": 0.86964011, "num_input_tokens_seen": 40021630, "step": 1883, "time_per_iteration": 2.647404670715332 }, { "auxiliary_loss_clip": 0.01150245, "auxiliary_loss_mlp": 0.01037162, "balance_loss_clip": 1.05260849, "balance_loss_mlp": 1.02694535, "epoch": 0.22653760596404737, "flos": 24389809597440.0, "grad_norm": 2.2519083846004464, "language_loss": 0.77413398, "learning_rate": 3.6084573411413334e-06, "loss": 0.79600799, "num_input_tokens_seen": 40041025, "step": 1884, "time_per_iteration": 2.750765085220337 }, { "auxiliary_loss_clip": 0.01157423, "auxiliary_loss_mlp": 0.01036152, "balance_loss_clip": 1.05391824, "balance_loss_mlp": 1.02546525, "epoch": 0.22665784885468646, "flos": 18332541538560.0, "grad_norm": 2.142400813375679, "language_loss": 0.808635, "learning_rate": 3.607994260733881e-06, "loss": 0.8305707, "num_input_tokens_seen": 40060265, "step": 1885, "time_per_iteration": 2.686760425567627 }, { "auxiliary_loss_clip": 0.01184647, "auxiliary_loss_mlp": 0.01033626, "balance_loss_clip": 1.05603349, "balance_loss_mlp": 1.02382648, "epoch": 0.22677809174532557, "flos": 24058102475520.0, "grad_norm": 1.59833985055055, "language_loss": 0.74342424, "learning_rate": 3.6075309363949776e-06, "loss": 0.765607, "num_input_tokens_seen": 40079435, "step": 1886, "time_per_iteration": 2.6418111324310303 }, { "auxiliary_loss_clip": 0.01214146, "auxiliary_loss_mlp": 0.01043512, "balance_loss_clip": 1.06239581, "balance_loss_mlp": 1.03265238, "epoch": 0.22689833463596465, "flos": 20374242503040.0, "grad_norm": 2.217499141610938, "language_loss": 0.81434107, "learning_rate": 3.6070673681949094e-06, "loss": 0.83691764, "num_input_tokens_seen": 40097800, "step": 1887, "time_per_iteration": 4.355543375015259 }, { "auxiliary_loss_clip": 0.01182591, "auxiliary_loss_mlp": 0.00713588, "balance_loss_clip": 1.06002092, "balance_loss_mlp": 1.00084043, "epoch": 0.22701857752660373, "flos": 30120398438400.0, "grad_norm": 1.8489835724634844, "language_loss": 0.81127203, "learning_rate": 3.606603556203999e-06, "loss": 0.83023381, "num_input_tokens_seen": 40122745, "step": 1888, "time_per_iteration": 2.687675952911377 }, { "auxiliary_loss_clip": 0.01196089, "auxiliary_loss_mlp": 0.01036131, "balance_loss_clip": 1.05674767, "balance_loss_mlp": 1.02614081, "epoch": 0.22713882041724284, "flos": 22492182084480.0, "grad_norm": 1.8598500758816798, "language_loss": 0.83787215, "learning_rate": 3.6061395004926066e-06, "loss": 0.86019427, "num_input_tokens_seen": 40141680, "step": 1889, "time_per_iteration": 3.4157793521881104 }, { "auxiliary_loss_clip": 0.01212298, "auxiliary_loss_mlp": 0.01033122, "balance_loss_clip": 1.06045878, "balance_loss_mlp": 1.0223577, "epoch": 0.22725906330788193, "flos": 20521548178560.0, "grad_norm": 15.654048991006452, "language_loss": 0.84711158, "learning_rate": 3.605675201131129e-06, "loss": 0.86956578, "num_input_tokens_seen": 40160140, "step": 1890, "time_per_iteration": 2.6969501972198486 }, { "auxiliary_loss_clip": 0.01202474, "auxiliary_loss_mlp": 0.01037866, "balance_loss_clip": 1.06217003, "balance_loss_mlp": 1.02764964, "epoch": 0.227379306198521, "flos": 18989922297600.0, "grad_norm": 2.3778695990400487, "language_loss": 0.79513919, "learning_rate": 3.60521065819e-06, "loss": 0.81754261, "num_input_tokens_seen": 40177450, "step": 1891, "time_per_iteration": 3.5495829582214355 }, { "auxiliary_loss_clip": 0.01184073, "auxiliary_loss_mlp": 0.01033673, "balance_loss_clip": 1.05792105, "balance_loss_mlp": 1.02365947, "epoch": 0.2274995490891601, "flos": 21798351999360.0, "grad_norm": 1.8547460588859048, "language_loss": 0.87736547, "learning_rate": 3.60474587173969e-06, "loss": 0.89954293, "num_input_tokens_seen": 40195935, "step": 1892, "time_per_iteration": 2.7017252445220947 }, { "auxiliary_loss_clip": 0.01194658, "auxiliary_loss_mlp": 0.01036744, "balance_loss_clip": 1.06183958, "balance_loss_mlp": 1.02644467, "epoch": 0.2276197919797992, "flos": 19058654972160.0, "grad_norm": 1.958675290015162, "language_loss": 0.8427667, "learning_rate": 3.6042808418507084e-06, "loss": 0.86508071, "num_input_tokens_seen": 40213620, "step": 1893, "time_per_iteration": 2.634535551071167 }, { "auxiliary_loss_clip": 0.01197151, "auxiliary_loss_mlp": 0.01040556, "balance_loss_clip": 1.06166267, "balance_loss_mlp": 1.03019667, "epoch": 0.22774003487043828, "flos": 18806777827200.0, "grad_norm": 2.2801015324849154, "language_loss": 0.76972085, "learning_rate": 3.6038155685935976e-06, "loss": 0.79209793, "num_input_tokens_seen": 40230190, "step": 1894, "time_per_iteration": 2.6386196613311768 }, { "auxiliary_loss_clip": 0.0119473, "auxiliary_loss_mlp": 0.01040697, "balance_loss_clip": 1.05779409, "balance_loss_mlp": 1.02954471, "epoch": 0.22786027776107737, "flos": 23002544476800.0, "grad_norm": 2.072048556172505, "language_loss": 0.70730418, "learning_rate": 3.6033500520389404e-06, "loss": 0.72965848, "num_input_tokens_seen": 40246860, "step": 1895, "time_per_iteration": 2.6137936115264893 }, { "auxiliary_loss_clip": 0.01076231, "auxiliary_loss_mlp": 0.01005208, "balance_loss_clip": 1.03821158, "balance_loss_mlp": 1.00148892, "epoch": 0.22798052065171648, "flos": 66706872600960.0, "grad_norm": 0.7977782200222588, "language_loss": 0.64812732, "learning_rate": 3.6028842922573553e-06, "loss": 0.66894174, "num_input_tokens_seen": 40311005, "step": 1896, "time_per_iteration": 3.3560354709625244 }, { "auxiliary_loss_clip": 0.01093651, "auxiliary_loss_mlp": 0.00705028, "balance_loss_clip": 1.04273927, "balance_loss_mlp": 1.00044203, "epoch": 0.22810076354235556, "flos": 62080896758400.0, "grad_norm": 0.8566869905032044, "language_loss": 0.62895185, "learning_rate": 3.602418289319497e-06, "loss": 0.64693862, "num_input_tokens_seen": 40369560, "step": 1897, "time_per_iteration": 3.228332042694092 }, { "auxiliary_loss_clip": 0.01143067, "auxiliary_loss_mlp": 0.01036669, "balance_loss_clip": 1.05128384, "balance_loss_mlp": 1.02657759, "epoch": 0.22822100643299464, "flos": 23876358635520.0, "grad_norm": 1.7755050659697764, "language_loss": 0.73177612, "learning_rate": 3.601952043296059e-06, "loss": 0.75357354, "num_input_tokens_seen": 40389555, "step": 1898, "time_per_iteration": 2.7573904991149902 }, { "auxiliary_loss_clip": 0.01182984, "auxiliary_loss_mlp": 0.01037241, "balance_loss_clip": 1.05544424, "balance_loss_mlp": 1.02679801, "epoch": 0.22834124932363373, "flos": 20991331180800.0, "grad_norm": 2.7463127039175896, "language_loss": 0.80388039, "learning_rate": 3.6014855542577696e-06, "loss": 0.82608271, "num_input_tokens_seen": 40406765, "step": 1899, "time_per_iteration": 2.8024799823760986 }, { "auxiliary_loss_clip": 0.01179675, "auxiliary_loss_mlp": 0.01029628, "balance_loss_clip": 1.05917668, "balance_loss_mlp": 1.018929, "epoch": 0.22846149221427284, "flos": 24901572620160.0, "grad_norm": 2.3229378503806353, "language_loss": 0.8430723, "learning_rate": 3.6010188222753943e-06, "loss": 0.86516535, "num_input_tokens_seen": 40427535, "step": 1900, "time_per_iteration": 2.8666441440582275 }, { "auxiliary_loss_clip": 0.01101534, "auxiliary_loss_mlp": 0.01003557, "balance_loss_clip": 1.03990722, "balance_loss_mlp": 1.0001719, "epoch": 0.22858173510491192, "flos": 56132294319360.0, "grad_norm": 0.9003807826479658, "language_loss": 0.64162815, "learning_rate": 3.6005518474197372e-06, "loss": 0.66267908, "num_input_tokens_seen": 40479580, "step": 1901, "time_per_iteration": 3.1489901542663574 }, { "auxiliary_loss_clip": 0.01197694, "auxiliary_loss_mlp": 0.01042933, "balance_loss_clip": 1.06151152, "balance_loss_mlp": 1.03197789, "epoch": 0.228701977995551, "flos": 24170826332160.0, "grad_norm": 1.8814895012188142, "language_loss": 0.78023487, "learning_rate": 3.6000846297616373e-06, "loss": 0.80264115, "num_input_tokens_seen": 40497880, "step": 1902, "time_per_iteration": 2.694826126098633 }, { "auxiliary_loss_clip": 0.01217951, "auxiliary_loss_mlp": 0.0104503, "balance_loss_clip": 1.06529284, "balance_loss_mlp": 1.03500402, "epoch": 0.22882222088619011, "flos": 21387892308480.0, "grad_norm": 2.522143770737377, "language_loss": 0.72657895, "learning_rate": 3.5996171693719717e-06, "loss": 0.74920881, "num_input_tokens_seen": 40513975, "step": 1903, "time_per_iteration": 2.6045234203338623 }, { "auxiliary_loss_clip": 0.01122659, "auxiliary_loss_mlp": 0.01003387, "balance_loss_clip": 1.0463388, "balance_loss_mlp": 0.99976295, "epoch": 0.2289424637768292, "flos": 64589615377920.0, "grad_norm": 0.8348901745921806, "language_loss": 0.64782077, "learning_rate": 3.5991494663216528e-06, "loss": 0.66908121, "num_input_tokens_seen": 40576960, "step": 1904, "time_per_iteration": 3.3084871768951416 }, { "auxiliary_loss_clip": 0.01213138, "auxiliary_loss_mlp": 0.01040778, "balance_loss_clip": 1.06182718, "balance_loss_mlp": 1.03009653, "epoch": 0.22906270666746828, "flos": 22163419877760.0, "grad_norm": 2.016402209058151, "language_loss": 0.87635833, "learning_rate": 3.5986815206816314e-06, "loss": 0.89889747, "num_input_tokens_seen": 40595780, "step": 1905, "time_per_iteration": 2.6632819175720215 }, { "auxiliary_loss_clip": 0.01212147, "auxiliary_loss_mlp": 0.01036384, "balance_loss_clip": 1.06086683, "balance_loss_mlp": 1.02617931, "epoch": 0.2291829495581074, "flos": 25772334122880.0, "grad_norm": 3.0409437809134965, "language_loss": 0.74570715, "learning_rate": 3.598213332522895e-06, "loss": 0.76819241, "num_input_tokens_seen": 40615810, "step": 1906, "time_per_iteration": 2.6973180770874023 }, { "auxiliary_loss_clip": 0.01197486, "auxiliary_loss_mlp": 0.01033621, "balance_loss_clip": 1.05993772, "balance_loss_mlp": 1.02406001, "epoch": 0.22930319244874647, "flos": 31172760126720.0, "grad_norm": 2.2296153630959665, "language_loss": 0.77365655, "learning_rate": 3.597744901916466e-06, "loss": 0.79596758, "num_input_tokens_seen": 40637095, "step": 1907, "time_per_iteration": 2.677931308746338 }, { "auxiliary_loss_clip": 0.01216436, "auxiliary_loss_mlp": 0.01036191, "balance_loss_clip": 1.06134832, "balance_loss_mlp": 1.02533114, "epoch": 0.22942343533938556, "flos": 23254098399360.0, "grad_norm": 2.180699196381083, "language_loss": 0.76481706, "learning_rate": 3.5972762289334058e-06, "loss": 0.78734338, "num_input_tokens_seen": 40656725, "step": 1908, "time_per_iteration": 2.6073110103607178 }, { "auxiliary_loss_clip": 0.01126153, "auxiliary_loss_mlp": 0.01032261, "balance_loss_clip": 1.05404043, "balance_loss_mlp": 1.02240252, "epoch": 0.22954367823002464, "flos": 14610903436800.0, "grad_norm": 2.4200904694321923, "language_loss": 0.84710896, "learning_rate": 3.5968073136448116e-06, "loss": 0.86869311, "num_input_tokens_seen": 40674745, "step": 1909, "time_per_iteration": 2.710219144821167 }, { "auxiliary_loss_clip": 0.01201985, "auxiliary_loss_mlp": 0.01031789, "balance_loss_clip": 1.06088197, "balance_loss_mlp": 1.02153146, "epoch": 0.22966392112066375, "flos": 16763604405120.0, "grad_norm": 1.7795042891508634, "language_loss": 0.91239417, "learning_rate": 3.596338156121818e-06, "loss": 0.93473184, "num_input_tokens_seen": 40693630, "step": 1910, "time_per_iteration": 2.61238169670105 }, { "auxiliary_loss_clip": 0.01100538, "auxiliary_loss_mlp": 0.01013193, "balance_loss_clip": 1.03897202, "balance_loss_mlp": 1.0096879, "epoch": 0.22978416401130283, "flos": 67474247783040.0, "grad_norm": 0.7530148204464693, "language_loss": 0.59351486, "learning_rate": 3.595868756435595e-06, "loss": 0.61465222, "num_input_tokens_seen": 40761310, "step": 1911, "time_per_iteration": 3.3334078788757324 }, { "auxiliary_loss_clip": 0.01168532, "auxiliary_loss_mlp": 0.01038955, "balance_loss_clip": 1.05920696, "balance_loss_mlp": 1.02842879, "epoch": 0.22990440690194192, "flos": 19865137086720.0, "grad_norm": 2.217910061883053, "language_loss": 0.8062259, "learning_rate": 3.5953991146573504e-06, "loss": 0.82830083, "num_input_tokens_seen": 40779955, "step": 1912, "time_per_iteration": 2.6714420318603516 }, { "auxiliary_loss_clip": 0.01198868, "auxiliary_loss_mlp": 0.01036776, "balance_loss_clip": 1.05851591, "balance_loss_mlp": 1.02585006, "epoch": 0.23002464979258103, "flos": 13289246507520.0, "grad_norm": 3.779197292256915, "language_loss": 0.83884263, "learning_rate": 3.5949292308583294e-06, "loss": 0.86119908, "num_input_tokens_seen": 40793200, "step": 1913, "time_per_iteration": 4.422197103500366 }, { "auxiliary_loss_clip": 0.01216281, "auxiliary_loss_mlp": 0.01034646, "balance_loss_clip": 1.06485391, "balance_loss_mlp": 1.02378571, "epoch": 0.2301448926832201, "flos": 22163779013760.0, "grad_norm": 2.015641331314443, "language_loss": 0.8115083, "learning_rate": 3.594459105109811e-06, "loss": 0.83401752, "num_input_tokens_seen": 40812380, "step": 1914, "time_per_iteration": 3.4504330158233643 }, { "auxiliary_loss_clip": 0.01201965, "auxiliary_loss_mlp": 0.0103682, "balance_loss_clip": 1.06206799, "balance_loss_mlp": 1.02711058, "epoch": 0.2302651355738592, "flos": 20704477167360.0, "grad_norm": 1.8176337985258133, "language_loss": 0.81132424, "learning_rate": 3.593988737483115e-06, "loss": 0.8337121, "num_input_tokens_seen": 40832320, "step": 1915, "time_per_iteration": 2.709685802459717 }, { "auxiliary_loss_clip": 0.01183656, "auxiliary_loss_mlp": 0.01038084, "balance_loss_clip": 1.06032848, "balance_loss_mlp": 1.02773631, "epoch": 0.23038537846449827, "flos": 18588943797120.0, "grad_norm": 1.9340222435789163, "language_loss": 0.7818107, "learning_rate": 3.5935181280495947e-06, "loss": 0.80402809, "num_input_tokens_seen": 40850900, "step": 1916, "time_per_iteration": 2.711808681488037 }, { "auxiliary_loss_clip": 0.0109314, "auxiliary_loss_mlp": 0.01004268, "balance_loss_clip": 1.03477621, "balance_loss_mlp": 1.00066781, "epoch": 0.23050562135513739, "flos": 64224260190720.0, "grad_norm": 0.7985314201675089, "language_loss": 0.54350102, "learning_rate": 3.5930472768806412e-06, "loss": 0.56447506, "num_input_tokens_seen": 40909570, "step": 1917, "time_per_iteration": 4.147808790206909 }, { "auxiliary_loss_clip": 0.01217131, "auxiliary_loss_mlp": 0.01043318, "balance_loss_clip": 1.06618726, "balance_loss_mlp": 1.0329411, "epoch": 0.23062586424577647, "flos": 17313396952320.0, "grad_norm": 1.9261848935388104, "language_loss": 0.77063596, "learning_rate": 3.5925761840476826e-06, "loss": 0.79324043, "num_input_tokens_seen": 40928180, "step": 1918, "time_per_iteration": 2.633836507797241 }, { "auxiliary_loss_clip": 0.01179459, "auxiliary_loss_mlp": 0.01040313, "balance_loss_clip": 1.0632112, "balance_loss_mlp": 1.02984667, "epoch": 0.23074610713641555, "flos": 27855979194240.0, "grad_norm": 2.28325964566224, "language_loss": 0.81103641, "learning_rate": 3.592104849622183e-06, "loss": 0.83323413, "num_input_tokens_seen": 40950435, "step": 1919, "time_per_iteration": 2.731173276901245 }, { "auxiliary_loss_clip": 0.01137195, "auxiliary_loss_mlp": 0.01037604, "balance_loss_clip": 1.05280828, "balance_loss_mlp": 1.02719712, "epoch": 0.23086635002705466, "flos": 28841798937600.0, "grad_norm": 1.4798274421719408, "language_loss": 0.73149413, "learning_rate": 3.591633273675644e-06, "loss": 0.75324214, "num_input_tokens_seen": 40972670, "step": 1920, "time_per_iteration": 2.8066179752349854 }, { "auxiliary_loss_clip": 0.01074735, "auxiliary_loss_mlp": 0.01004189, "balance_loss_clip": 1.04343247, "balance_loss_mlp": 1.00139987, "epoch": 0.23098659291769374, "flos": 62923681566720.0, "grad_norm": 0.9027419934415623, "language_loss": 0.58210588, "learning_rate": 3.591161456279602e-06, "loss": 0.60289514, "num_input_tokens_seen": 41018215, "step": 1921, "time_per_iteration": 3.1005451679229736 }, { "auxiliary_loss_clip": 0.01187666, "auxiliary_loss_mlp": 0.01035716, "balance_loss_clip": 1.05868196, "balance_loss_mlp": 1.02468896, "epoch": 0.23110683580833283, "flos": 23476816679040.0, "grad_norm": 1.6480561344939682, "language_loss": 0.80005383, "learning_rate": 3.590689397505633e-06, "loss": 0.82228768, "num_input_tokens_seen": 41039125, "step": 1922, "time_per_iteration": 2.6689419746398926 }, { "auxiliary_loss_clip": 0.01215186, "auxiliary_loss_mlp": 0.0103888, "balance_loss_clip": 1.06491375, "balance_loss_mlp": 1.02862227, "epoch": 0.2312270786989719, "flos": 27271066124160.0, "grad_norm": 1.731266234613004, "language_loss": 0.86255908, "learning_rate": 3.590217097425347e-06, "loss": 0.88509977, "num_input_tokens_seen": 41059025, "step": 1923, "time_per_iteration": 2.6804397106170654 }, { "auxiliary_loss_clip": 0.01216247, "auxiliary_loss_mlp": 0.01038595, "balance_loss_clip": 1.06348729, "balance_loss_mlp": 1.02781868, "epoch": 0.23134732158961102, "flos": 13261344618240.0, "grad_norm": 3.0962231512180347, "language_loss": 0.7098074, "learning_rate": 3.589744556110391e-06, "loss": 0.73235583, "num_input_tokens_seen": 41077015, "step": 1924, "time_per_iteration": 2.580885648727417 }, { "auxiliary_loss_clip": 0.01175828, "auxiliary_loss_mlp": 0.01033895, "balance_loss_clip": 1.05461979, "balance_loss_mlp": 1.02239704, "epoch": 0.2314675644802501, "flos": 36977648250240.0, "grad_norm": 1.89633185742062, "language_loss": 0.84058869, "learning_rate": 3.58927177363245e-06, "loss": 0.86268592, "num_input_tokens_seen": 41099840, "step": 1925, "time_per_iteration": 2.7901813983917236 }, { "auxiliary_loss_clip": 0.0115728, "auxiliary_loss_mlp": 0.01039706, "balance_loss_clip": 1.05432701, "balance_loss_mlp": 1.02823186, "epoch": 0.2315878073708892, "flos": 23842207779840.0, "grad_norm": 2.132190916754157, "language_loss": 0.7261709, "learning_rate": 3.5887987500632447e-06, "loss": 0.74814081, "num_input_tokens_seen": 41117845, "step": 1926, "time_per_iteration": 2.7481601238250732 }, { "auxiliary_loss_clip": 0.01171422, "auxiliary_loss_mlp": 0.01046618, "balance_loss_clip": 1.05881202, "balance_loss_mlp": 1.03649092, "epoch": 0.2317080502615283, "flos": 23039424766080.0, "grad_norm": 1.9475745556477733, "language_loss": 0.83920944, "learning_rate": 3.5883254854745325e-06, "loss": 0.86138982, "num_input_tokens_seen": 41136235, "step": 1927, "time_per_iteration": 2.802497386932373 }, { "auxiliary_loss_clip": 0.01202445, "auxiliary_loss_mlp": 0.01039586, "balance_loss_clip": 1.06063902, "balance_loss_mlp": 1.02765346, "epoch": 0.23182829315216738, "flos": 11254656435840.0, "grad_norm": 1.946079662149102, "language_loss": 0.74785537, "learning_rate": 3.587851979938107e-06, "loss": 0.77027571, "num_input_tokens_seen": 41153125, "step": 1928, "time_per_iteration": 2.6194982528686523 }, { "auxiliary_loss_clip": 0.01197822, "auxiliary_loss_mlp": 0.01037821, "balance_loss_clip": 1.06107903, "balance_loss_mlp": 1.02748513, "epoch": 0.23194853604280646, "flos": 19828939155840.0, "grad_norm": 1.959169165368408, "language_loss": 0.77333665, "learning_rate": 3.5873782335257985e-06, "loss": 0.79569304, "num_input_tokens_seen": 41171290, "step": 1929, "time_per_iteration": 2.5724804401397705 }, { "auxiliary_loss_clip": 0.01167145, "auxiliary_loss_mlp": 0.01039615, "balance_loss_clip": 1.05943966, "balance_loss_mlp": 1.02880907, "epoch": 0.23206877893344555, "flos": 15305020830720.0, "grad_norm": 2.4925415835210583, "language_loss": 0.78440684, "learning_rate": 3.5869042463094744e-06, "loss": 0.80647445, "num_input_tokens_seen": 41189005, "step": 1930, "time_per_iteration": 2.7098934650421143 }, { "auxiliary_loss_clip": 0.0112931, "auxiliary_loss_mlp": 0.01043677, "balance_loss_clip": 1.05043137, "balance_loss_mlp": 1.03293014, "epoch": 0.23218902182408466, "flos": 22711488572160.0, "grad_norm": 1.907080459670313, "language_loss": 0.76940525, "learning_rate": 3.586430018361038e-06, "loss": 0.79113507, "num_input_tokens_seen": 41208775, "step": 1931, "time_per_iteration": 2.7159879207611084 }, { "auxiliary_loss_clip": 0.01169039, "auxiliary_loss_mlp": 0.01032427, "balance_loss_clip": 1.05692983, "balance_loss_mlp": 1.02171028, "epoch": 0.23230926471472374, "flos": 22710734386560.0, "grad_norm": 3.780876846669879, "language_loss": 0.76027483, "learning_rate": 3.5859555497524283e-06, "loss": 0.78228951, "num_input_tokens_seen": 41226010, "step": 1932, "time_per_iteration": 2.715630292892456 }, { "auxiliary_loss_clip": 0.01201032, "auxiliary_loss_mlp": 0.01043887, "balance_loss_clip": 1.06285977, "balance_loss_mlp": 1.03324747, "epoch": 0.23242950760536282, "flos": 20375499479040.0, "grad_norm": 3.810059724723499, "language_loss": 0.91892803, "learning_rate": 3.5854808405556237e-06, "loss": 0.94137728, "num_input_tokens_seen": 41245245, "step": 1933, "time_per_iteration": 2.618767499923706 }, { "auxiliary_loss_clip": 0.01166889, "auxiliary_loss_mlp": 0.01039131, "balance_loss_clip": 1.05612516, "balance_loss_mlp": 1.02896261, "epoch": 0.23254975049600193, "flos": 16908324301440.0, "grad_norm": 2.6397304075882504, "language_loss": 0.75385118, "learning_rate": 3.5850058908426355e-06, "loss": 0.77591145, "num_input_tokens_seen": 41263795, "step": 1934, "time_per_iteration": 2.682107448577881 }, { "auxiliary_loss_clip": 0.01183102, "auxiliary_loss_mlp": 0.01030757, "balance_loss_clip": 1.05682302, "balance_loss_mlp": 1.02074325, "epoch": 0.23266999338664102, "flos": 23294821443840.0, "grad_norm": 1.931938584601312, "language_loss": 0.85439098, "learning_rate": 3.584530700685514e-06, "loss": 0.87652951, "num_input_tokens_seen": 41284055, "step": 1935, "time_per_iteration": 2.6726415157318115 }, { "auxiliary_loss_clip": 0.01174972, "auxiliary_loss_mlp": 0.01037828, "balance_loss_clip": 1.05947447, "balance_loss_mlp": 1.02730823, "epoch": 0.2327902362772801, "flos": 19569987031680.0, "grad_norm": 2.1123676339943804, "language_loss": 0.88735378, "learning_rate": 3.5840552701563448e-06, "loss": 0.90948176, "num_input_tokens_seen": 41300255, "step": 1936, "time_per_iteration": 2.717454671859741 }, { "auxiliary_loss_clip": 0.01216449, "auxiliary_loss_mlp": 0.01044131, "balance_loss_clip": 1.06371284, "balance_loss_mlp": 1.03350306, "epoch": 0.2329104791679192, "flos": 16727514215040.0, "grad_norm": 2.181614603000004, "language_loss": 0.8187952, "learning_rate": 3.5835795993272513e-06, "loss": 0.84140098, "num_input_tokens_seen": 41318540, "step": 1937, "time_per_iteration": 2.7057735919952393 }, { "auxiliary_loss_clip": 0.01093173, "auxiliary_loss_mlp": 0.0103824, "balance_loss_clip": 1.04811263, "balance_loss_mlp": 1.0275532, "epoch": 0.2330307220585583, "flos": 22163743100160.0, "grad_norm": 2.4909561444226, "language_loss": 0.71026993, "learning_rate": 3.583103688270391e-06, "loss": 0.73158407, "num_input_tokens_seen": 41338320, "step": 1938, "time_per_iteration": 3.1655614376068115 }, { "auxiliary_loss_clip": 0.01163597, "auxiliary_loss_mlp": 0.01032859, "balance_loss_clip": 1.05405474, "balance_loss_mlp": 1.0227623, "epoch": 0.23315096494919738, "flos": 19317319787520.0, "grad_norm": 2.5857661091991706, "language_loss": 0.89336383, "learning_rate": 3.58262753705796e-06, "loss": 0.91532838, "num_input_tokens_seen": 41353210, "step": 1939, "time_per_iteration": 4.653745889663696 }, { "auxiliary_loss_clip": 0.01102225, "auxiliary_loss_mlp": 0.01006098, "balance_loss_clip": 1.04445982, "balance_loss_mlp": 1.00317788, "epoch": 0.23327120783983646, "flos": 53031048946560.0, "grad_norm": 0.7641409144728184, "language_loss": 0.55539644, "learning_rate": 3.5821511457621902e-06, "loss": 0.57647967, "num_input_tokens_seen": 41410510, "step": 1940, "time_per_iteration": 4.0620362758636475 }, { "auxiliary_loss_clip": 0.0117332, "auxiliary_loss_mlp": 0.0103974, "balance_loss_clip": 1.05736017, "balance_loss_mlp": 1.02852833, "epoch": 0.23339145073047557, "flos": 17126984344320.0, "grad_norm": 3.7365808235847533, "language_loss": 0.80740738, "learning_rate": 3.5816745144553497e-06, "loss": 0.82953799, "num_input_tokens_seen": 41425830, "step": 1941, "time_per_iteration": 2.6098363399505615 }, { "auxiliary_loss_clip": 0.01141422, "auxiliary_loss_mlp": 0.01030829, "balance_loss_clip": 1.05530167, "balance_loss_mlp": 1.02070212, "epoch": 0.23351169362111465, "flos": 13078918419840.0, "grad_norm": 3.4793627170485424, "language_loss": 0.75498486, "learning_rate": 3.5811976432097424e-06, "loss": 0.77670729, "num_input_tokens_seen": 41443500, "step": 1942, "time_per_iteration": 2.713529348373413 }, { "auxiliary_loss_clip": 0.01199574, "auxiliary_loss_mlp": 0.00713932, "balance_loss_clip": 1.06439948, "balance_loss_mlp": 1.00088215, "epoch": 0.23363193651175373, "flos": 15851257931520.0, "grad_norm": 1.9061078725690603, "language_loss": 0.84104609, "learning_rate": 3.58072053209771e-06, "loss": 0.86018121, "num_input_tokens_seen": 41460055, "step": 1943, "time_per_iteration": 3.5631256103515625 }, { "auxiliary_loss_clip": 0.01173414, "auxiliary_loss_mlp": 0.01038151, "balance_loss_clip": 1.05566764, "balance_loss_mlp": 1.02689195, "epoch": 0.23375217940239285, "flos": 21025769345280.0, "grad_norm": 2.213123652319052, "language_loss": 0.78971839, "learning_rate": 3.5802431811916296e-06, "loss": 0.81183398, "num_input_tokens_seen": 41476665, "step": 1944, "time_per_iteration": 2.6715331077575684 }, { "auxiliary_loss_clip": 0.01175895, "auxiliary_loss_mlp": 0.01035329, "balance_loss_clip": 1.05802894, "balance_loss_mlp": 1.02567279, "epoch": 0.23387242229303193, "flos": 20594698225920.0, "grad_norm": 2.3577771882048775, "language_loss": 0.8077175, "learning_rate": 3.579765590563916e-06, "loss": 0.82982969, "num_input_tokens_seen": 41496065, "step": 1945, "time_per_iteration": 2.6449246406555176 }, { "auxiliary_loss_clip": 0.01188879, "auxiliary_loss_mlp": 0.01042704, "balance_loss_clip": 1.06052947, "balance_loss_mlp": 1.03196919, "epoch": 0.233992665183671, "flos": 24279491952000.0, "grad_norm": 2.7247955220682303, "language_loss": 0.81325608, "learning_rate": 3.579287760287017e-06, "loss": 0.83557194, "num_input_tokens_seen": 41516815, "step": 1946, "time_per_iteration": 2.6174933910369873 }, { "auxiliary_loss_clip": 0.01196247, "auxiliary_loss_mlp": 0.01031679, "balance_loss_clip": 1.0617044, "balance_loss_mlp": 1.02135503, "epoch": 0.2341129080743101, "flos": 30154621121280.0, "grad_norm": 2.3972582538671, "language_loss": 0.72509849, "learning_rate": 3.578809690433421e-06, "loss": 0.74737775, "num_input_tokens_seen": 41538525, "step": 1947, "time_per_iteration": 2.804471015930176 }, { "auxiliary_loss_clip": 0.01216655, "auxiliary_loss_mlp": 0.01035236, "balance_loss_clip": 1.06334221, "balance_loss_mlp": 1.02464461, "epoch": 0.2342331509649492, "flos": 22784135829120.0, "grad_norm": 2.5057205206434165, "language_loss": 0.81961089, "learning_rate": 3.578331381075651e-06, "loss": 0.84212983, "num_input_tokens_seen": 41559025, "step": 1948, "time_per_iteration": 2.5512008666992188 }, { "auxiliary_loss_clip": 0.01195906, "auxiliary_loss_mlp": 0.01038179, "balance_loss_clip": 1.05841565, "balance_loss_mlp": 1.02760553, "epoch": 0.2343533938555883, "flos": 23623152687360.0, "grad_norm": 2.602177209371545, "language_loss": 0.69789732, "learning_rate": 3.5778528322862646e-06, "loss": 0.72023815, "num_input_tokens_seen": 41577845, "step": 1949, "time_per_iteration": 2.7053017616271973 }, { "auxiliary_loss_clip": 0.012008, "auxiliary_loss_mlp": 0.01043505, "balance_loss_clip": 1.06197953, "balance_loss_mlp": 1.03344369, "epoch": 0.23447363674622737, "flos": 24570332375040.0, "grad_norm": 1.5603763936307975, "language_loss": 0.86455309, "learning_rate": 3.5773740441378585e-06, "loss": 0.88699615, "num_input_tokens_seen": 41598600, "step": 1950, "time_per_iteration": 2.6369903087615967 }, { "auxiliary_loss_clip": 0.01194702, "auxiliary_loss_mlp": 0.01036654, "balance_loss_clip": 1.06132507, "balance_loss_mlp": 1.02683735, "epoch": 0.23459387963686648, "flos": 53140322119680.0, "grad_norm": 2.0821569289778927, "language_loss": 0.73736119, "learning_rate": 3.5768950167030633e-06, "loss": 0.75967467, "num_input_tokens_seen": 41623300, "step": 1951, "time_per_iteration": 2.879920721054077 }, { "auxiliary_loss_clip": 0.01166953, "auxiliary_loss_mlp": 0.01042397, "balance_loss_clip": 1.05426478, "balance_loss_mlp": 1.03167963, "epoch": 0.23471412252750556, "flos": 23951412103680.0, "grad_norm": 1.7584792131065028, "language_loss": 0.78419226, "learning_rate": 3.576415750054548e-06, "loss": 0.80628574, "num_input_tokens_seen": 41643420, "step": 1952, "time_per_iteration": 2.70013165473938 }, { "auxiliary_loss_clip": 0.01170219, "auxiliary_loss_mlp": 0.01033118, "balance_loss_clip": 1.057163, "balance_loss_mlp": 1.02261543, "epoch": 0.23483436541814465, "flos": 15706573948800.0, "grad_norm": 2.4778449398574964, "language_loss": 0.85421968, "learning_rate": 3.5759362442650172e-06, "loss": 0.87625307, "num_input_tokens_seen": 41660170, "step": 1953, "time_per_iteration": 2.6644535064697266 }, { "auxiliary_loss_clip": 0.01192849, "auxiliary_loss_mlp": 0.01035755, "balance_loss_clip": 1.05947614, "balance_loss_mlp": 1.02521682, "epoch": 0.23495460830878373, "flos": 24936262179840.0, "grad_norm": 2.0120515893996105, "language_loss": 0.84847051, "learning_rate": 3.5754564994072113e-06, "loss": 0.87075657, "num_input_tokens_seen": 41679010, "step": 1954, "time_per_iteration": 2.6945278644561768 }, { "auxiliary_loss_clip": 0.01173459, "auxiliary_loss_mlp": 0.01034681, "balance_loss_clip": 1.05569148, "balance_loss_mlp": 1.02500081, "epoch": 0.23507485119942284, "flos": 30482665056000.0, "grad_norm": 2.230957035568008, "language_loss": 0.60342479, "learning_rate": 3.5749765155539067e-06, "loss": 0.62550616, "num_input_tokens_seen": 41699495, "step": 1955, "time_per_iteration": 2.72711181640625 }, { "auxiliary_loss_clip": 0.01159257, "auxiliary_loss_mlp": 0.01035227, "balance_loss_clip": 1.05463004, "balance_loss_mlp": 1.02392602, "epoch": 0.23519509409006192, "flos": 18329129746560.0, "grad_norm": 3.268430411149922, "language_loss": 0.92565519, "learning_rate": 3.574496292777917e-06, "loss": 0.94760001, "num_input_tokens_seen": 41717705, "step": 1956, "time_per_iteration": 2.698676347732544 }, { "auxiliary_loss_clip": 0.01183835, "auxiliary_loss_mlp": 0.01033905, "balance_loss_clip": 1.05881906, "balance_loss_mlp": 1.02323031, "epoch": 0.235315336980701, "flos": 29643217234560.0, "grad_norm": 2.176841225022306, "language_loss": 0.71492618, "learning_rate": 3.574015831152092e-06, "loss": 0.73710358, "num_input_tokens_seen": 41738120, "step": 1957, "time_per_iteration": 2.7107656002044678 }, { "auxiliary_loss_clip": 0.01167495, "auxiliary_loss_mlp": 0.0103547, "balance_loss_clip": 1.05647874, "balance_loss_mlp": 1.02562928, "epoch": 0.23543557987134012, "flos": 18551704371840.0, "grad_norm": 2.295066968300137, "language_loss": 0.83376199, "learning_rate": 3.573535130749316e-06, "loss": 0.85579169, "num_input_tokens_seen": 41756070, "step": 1958, "time_per_iteration": 2.590343475341797 }, { "auxiliary_loss_clip": 0.01171089, "auxiliary_loss_mlp": 0.01034091, "balance_loss_clip": 1.05973244, "balance_loss_mlp": 1.02398252, "epoch": 0.2355558227619792, "flos": 24679033908480.0, "grad_norm": 1.8535857097470243, "language_loss": 0.7381922, "learning_rate": 3.5730541916425127e-06, "loss": 0.76024401, "num_input_tokens_seen": 41777550, "step": 1959, "time_per_iteration": 2.664031505584717 }, { "auxiliary_loss_clip": 0.0116219, "auxiliary_loss_mlp": 0.01031554, "balance_loss_clip": 1.0558126, "balance_loss_mlp": 1.02176058, "epoch": 0.23567606565261828, "flos": 21944795748480.0, "grad_norm": 2.45947740458711, "language_loss": 0.86338824, "learning_rate": 3.572573013904639e-06, "loss": 0.88532567, "num_input_tokens_seen": 41797460, "step": 1960, "time_per_iteration": 2.7085978984832764 }, { "auxiliary_loss_clip": 0.01210216, "auxiliary_loss_mlp": 0.010328, "balance_loss_clip": 1.06165028, "balance_loss_mlp": 1.02272701, "epoch": 0.2357963085432574, "flos": 13589352639360.0, "grad_norm": 2.0426886015752217, "language_loss": 0.91958499, "learning_rate": 3.572091597608689e-06, "loss": 0.94201517, "num_input_tokens_seen": 41815585, "step": 1961, "time_per_iteration": 2.5505354404449463 }, { "auxiliary_loss_clip": 0.0118305, "auxiliary_loss_mlp": 0.01033556, "balance_loss_clip": 1.05842805, "balance_loss_mlp": 1.02370358, "epoch": 0.23591655143389648, "flos": 22088689632000.0, "grad_norm": 5.962068324020568, "language_loss": 0.73486137, "learning_rate": 3.571609942827694e-06, "loss": 0.75702739, "num_input_tokens_seen": 41834700, "step": 1962, "time_per_iteration": 2.7510130405426025 }, { "auxiliary_loss_clip": 0.0117263, "auxiliary_loss_mlp": 0.01029848, "balance_loss_clip": 1.05436635, "balance_loss_mlp": 1.01998949, "epoch": 0.23603679432453556, "flos": 17017349057280.0, "grad_norm": 1.6487699238168698, "language_loss": 0.88287294, "learning_rate": 3.57112804963472e-06, "loss": 0.90489769, "num_input_tokens_seen": 41852915, "step": 1963, "time_per_iteration": 2.627138137817383 }, { "auxiliary_loss_clip": 0.01158339, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.06004703, "balance_loss_mlp": 1.02226996, "epoch": 0.23615703721517464, "flos": 19171307001600.0, "grad_norm": 3.728400685462266, "language_loss": 0.76438069, "learning_rate": 3.57064591810287e-06, "loss": 0.78628016, "num_input_tokens_seen": 41870415, "step": 1964, "time_per_iteration": 2.6572771072387695 }, { "auxiliary_loss_clip": 0.01209703, "auxiliary_loss_mlp": 0.00713017, "balance_loss_clip": 1.06142998, "balance_loss_mlp": 1.0008297, "epoch": 0.23627728010581375, "flos": 19098803399040.0, "grad_norm": 2.1507844804692793, "language_loss": 0.80351412, "learning_rate": 3.570163548305284e-06, "loss": 0.82274133, "num_input_tokens_seen": 41889345, "step": 1965, "time_per_iteration": 3.57922625541687 }, { "auxiliary_loss_clip": 0.01177943, "auxiliary_loss_mlp": 0.01035065, "balance_loss_clip": 1.05715764, "balance_loss_mlp": 1.02483082, "epoch": 0.23639752299645284, "flos": 14282213057280.0, "grad_norm": 3.136372811115447, "language_loss": 0.69667411, "learning_rate": 3.569680940315135e-06, "loss": 0.71880424, "num_input_tokens_seen": 41905745, "step": 1966, "time_per_iteration": 3.3818767070770264 }, { "auxiliary_loss_clip": 0.01165028, "auxiliary_loss_mlp": 0.01034132, "balance_loss_clip": 1.05672121, "balance_loss_mlp": 1.02337337, "epoch": 0.23651776588709192, "flos": 22893411980160.0, "grad_norm": 2.083630551917327, "language_loss": 0.81912076, "learning_rate": 3.5691980942056356e-06, "loss": 0.84111238, "num_input_tokens_seen": 41925115, "step": 1967, "time_per_iteration": 2.72270131111145 }, { "auxiliary_loss_clip": 0.01194108, "auxiliary_loss_mlp": 0.01034196, "balance_loss_clip": 1.05739641, "balance_loss_mlp": 1.02442098, "epoch": 0.23663800877773103, "flos": 18624531196800.0, "grad_norm": 1.9535650007380159, "language_loss": 0.79384279, "learning_rate": 3.5687150100500332e-06, "loss": 0.81612587, "num_input_tokens_seen": 41944815, "step": 1968, "time_per_iteration": 2.5969104766845703 }, { "auxiliary_loss_clip": 0.01196962, "auxiliary_loss_mlp": 0.01035359, "balance_loss_clip": 1.06008136, "balance_loss_mlp": 1.02534556, "epoch": 0.2367582516683701, "flos": 25555828896000.0, "grad_norm": 1.797305137348625, "language_loss": 0.74492264, "learning_rate": 3.568231687921611e-06, "loss": 0.76724583, "num_input_tokens_seen": 41964990, "step": 1969, "time_per_iteration": 3.589895486831665 }, { "auxiliary_loss_clip": 0.01208014, "auxiliary_loss_mlp": 0.01033284, "balance_loss_clip": 1.06201339, "balance_loss_mlp": 1.02428973, "epoch": 0.2368784945590092, "flos": 23295072839040.0, "grad_norm": 1.5617247372208782, "language_loss": 0.80543089, "learning_rate": 3.5677481278936883e-06, "loss": 0.82784384, "num_input_tokens_seen": 41984570, "step": 1970, "time_per_iteration": 2.636923313140869 }, { "auxiliary_loss_clip": 0.01128151, "auxiliary_loss_mlp": 0.01005509, "balance_loss_clip": 1.06804788, "balance_loss_mlp": 1.00265968, "epoch": 0.23699873744964828, "flos": 69859291875840.0, "grad_norm": 0.842517981191934, "language_loss": 0.57857138, "learning_rate": 3.5672643300396214e-06, "loss": 0.59990799, "num_input_tokens_seen": 42053715, "step": 1971, "time_per_iteration": 3.2951643466949463 }, { "auxiliary_loss_clip": 0.01162765, "auxiliary_loss_mlp": 0.01032188, "balance_loss_clip": 1.05922818, "balance_loss_mlp": 1.02215064, "epoch": 0.2371189803402874, "flos": 21835052720640.0, "grad_norm": 2.6746616871521174, "language_loss": 0.67244744, "learning_rate": 3.566780294432802e-06, "loss": 0.69439697, "num_input_tokens_seen": 42070890, "step": 1972, "time_per_iteration": 2.7229106426239014 }, { "auxiliary_loss_clip": 0.0120978, "auxiliary_loss_mlp": 0.01039531, "balance_loss_clip": 1.06134772, "balance_loss_mlp": 1.02927291, "epoch": 0.23723922323092647, "flos": 21908490076800.0, "grad_norm": 2.2369166544126933, "language_loss": 0.74439096, "learning_rate": 3.566296021146657e-06, "loss": 0.76688403, "num_input_tokens_seen": 42090270, "step": 1973, "time_per_iteration": 2.5852084159851074 }, { "auxiliary_loss_clip": 0.01214208, "auxiliary_loss_mlp": 0.01033325, "balance_loss_clip": 1.064749, "balance_loss_mlp": 1.02319789, "epoch": 0.23735946612156555, "flos": 32708803380480.0, "grad_norm": 2.243607492880279, "language_loss": 0.73174751, "learning_rate": 3.565811510254652e-06, "loss": 0.75422281, "num_input_tokens_seen": 42111150, "step": 1974, "time_per_iteration": 2.698394536972046 }, { "auxiliary_loss_clip": 0.01110854, "auxiliary_loss_mlp": 0.0101259, "balance_loss_clip": 1.04753137, "balance_loss_mlp": 1.00990736, "epoch": 0.23747970901220466, "flos": 70546944821760.0, "grad_norm": 0.8580191189303099, "language_loss": 0.58260489, "learning_rate": 3.5653267618302845e-06, "loss": 0.60383928, "num_input_tokens_seen": 42178730, "step": 1975, "time_per_iteration": 3.2634880542755127 }, { "auxiliary_loss_clip": 0.01211433, "auxiliary_loss_mlp": 0.01028269, "balance_loss_clip": 1.06249797, "balance_loss_mlp": 1.01814175, "epoch": 0.23759995190284375, "flos": 20849807594880.0, "grad_norm": 1.8891041376007773, "language_loss": 0.8550964, "learning_rate": 3.564841775947093e-06, "loss": 0.87749338, "num_input_tokens_seen": 42199620, "step": 1976, "time_per_iteration": 2.641676902770996 }, { "auxiliary_loss_clip": 0.01158079, "auxiliary_loss_mlp": 0.01034839, "balance_loss_clip": 1.05419123, "balance_loss_mlp": 1.02533793, "epoch": 0.23772019479348283, "flos": 32921645420160.0, "grad_norm": 2.041311879687113, "language_loss": 0.7631042, "learning_rate": 3.5643565526786475e-06, "loss": 0.78503335, "num_input_tokens_seen": 42219560, "step": 1977, "time_per_iteration": 2.819221258163452 }, { "auxiliary_loss_clip": 0.01211836, "auxiliary_loss_mlp": 0.01037547, "balance_loss_clip": 1.06273937, "balance_loss_mlp": 1.02776003, "epoch": 0.2378404376841219, "flos": 32342765834880.0, "grad_norm": 1.6529915867325997, "language_loss": 0.77131748, "learning_rate": 3.5638710920985574e-06, "loss": 0.79381132, "num_input_tokens_seen": 42241020, "step": 1978, "time_per_iteration": 2.67405104637146 }, { "auxiliary_loss_clip": 0.01199403, "auxiliary_loss_mlp": 0.00713557, "balance_loss_clip": 1.06021345, "balance_loss_mlp": 1.00085998, "epoch": 0.23796068057476102, "flos": 22997624313600.0, "grad_norm": 2.8030271738057917, "language_loss": 0.82111931, "learning_rate": 3.5633853942804655e-06, "loss": 0.84024894, "num_input_tokens_seen": 42259345, "step": 1979, "time_per_iteration": 2.6645402908325195 }, { "auxiliary_loss_clip": 0.01159353, "auxiliary_loss_mlp": 0.01028754, "balance_loss_clip": 1.05442226, "balance_loss_mlp": 1.01869226, "epoch": 0.2380809234654001, "flos": 13480938414720.0, "grad_norm": 3.6798679893978026, "language_loss": 0.76455069, "learning_rate": 3.5628994592980527e-06, "loss": 0.78643179, "num_input_tokens_seen": 42277250, "step": 1980, "time_per_iteration": 2.6353063583374023 }, { "auxiliary_loss_clip": 0.01211192, "auxiliary_loss_mlp": 0.01039989, "balance_loss_clip": 1.06183398, "balance_loss_mlp": 1.03023803, "epoch": 0.2382011663560392, "flos": 16871803148160.0, "grad_norm": 1.8950701903057146, "language_loss": 0.70345199, "learning_rate": 3.562413287225034e-06, "loss": 0.72596383, "num_input_tokens_seen": 42295360, "step": 1981, "time_per_iteration": 2.6188111305236816 }, { "auxiliary_loss_clip": 0.01188313, "auxiliary_loss_mlp": 0.01038929, "balance_loss_clip": 1.06107867, "balance_loss_mlp": 1.02907634, "epoch": 0.2383214092466783, "flos": 18441135331200.0, "grad_norm": 2.599367533383737, "language_loss": 0.89170897, "learning_rate": 3.5619268781351623e-06, "loss": 0.91398144, "num_input_tokens_seen": 42313430, "step": 1982, "time_per_iteration": 2.593646287918091 }, { "auxiliary_loss_clip": 0.01174759, "auxiliary_loss_mlp": 0.01040849, "balance_loss_clip": 1.06142831, "balance_loss_mlp": 1.03131771, "epoch": 0.23844165213731738, "flos": 19755717281280.0, "grad_norm": 2.0362624846580832, "language_loss": 0.76638615, "learning_rate": 3.5614402321022256e-06, "loss": 0.78854227, "num_input_tokens_seen": 42331260, "step": 1983, "time_per_iteration": 2.6795618534088135 }, { "auxiliary_loss_clip": 0.01136135, "auxiliary_loss_mlp": 0.01040995, "balance_loss_clip": 1.05495632, "balance_loss_mlp": 1.0310117, "epoch": 0.23856189502795647, "flos": 23367360960000.0, "grad_norm": 1.766108597300903, "language_loss": 0.87047201, "learning_rate": 3.5609533492000463e-06, "loss": 0.89224333, "num_input_tokens_seen": 42350150, "step": 1984, "time_per_iteration": 2.7548038959503174 }, { "auxiliary_loss_clip": 0.01173966, "auxiliary_loss_mlp": 0.01035509, "balance_loss_clip": 1.06173348, "balance_loss_mlp": 1.0259186, "epoch": 0.23868213791859555, "flos": 23475056912640.0, "grad_norm": 2.004347931600013, "language_loss": 0.78822476, "learning_rate": 3.560466229502485e-06, "loss": 0.81031954, "num_input_tokens_seen": 42369495, "step": 1985, "time_per_iteration": 2.737761974334717 }, { "auxiliary_loss_clip": 0.01175739, "auxiliary_loss_mlp": 0.00713844, "balance_loss_clip": 1.06279337, "balance_loss_mlp": 1.00081277, "epoch": 0.23880238080923466, "flos": 16617340224000.0, "grad_norm": 2.4823989778930926, "language_loss": 0.89483654, "learning_rate": 3.5599788730834384e-06, "loss": 0.91373241, "num_input_tokens_seen": 42387455, "step": 1986, "time_per_iteration": 2.6169283390045166 }, { "auxiliary_loss_clip": 0.0119791, "auxiliary_loss_mlp": 0.01035663, "balance_loss_clip": 1.06085706, "balance_loss_mlp": 1.02591753, "epoch": 0.23892262369987374, "flos": 17348409734400.0, "grad_norm": 2.3832654396080204, "language_loss": 0.78569049, "learning_rate": 3.559491280016836e-06, "loss": 0.80802619, "num_input_tokens_seen": 42405400, "step": 1987, "time_per_iteration": 2.6410064697265625 }, { "auxiliary_loss_clip": 0.01179179, "auxiliary_loss_mlp": 0.01030272, "balance_loss_clip": 1.05987859, "balance_loss_mlp": 1.02013946, "epoch": 0.23904286659051283, "flos": 22309899540480.0, "grad_norm": 1.9427540585698146, "language_loss": 0.71010995, "learning_rate": 3.5590034503766465e-06, "loss": 0.73220444, "num_input_tokens_seen": 42425065, "step": 1988, "time_per_iteration": 2.6551287174224854 }, { "auxiliary_loss_clip": 0.01211813, "auxiliary_loss_mlp": 0.01031133, "balance_loss_clip": 1.06435752, "balance_loss_mlp": 1.02212036, "epoch": 0.23916310948115194, "flos": 21178246579200.0, "grad_norm": 8.346658725586122, "language_loss": 0.81399846, "learning_rate": 3.558515384236874e-06, "loss": 0.83642793, "num_input_tokens_seen": 42442495, "step": 1989, "time_per_iteration": 2.599367380142212 }, { "auxiliary_loss_clip": 0.01146437, "auxiliary_loss_mlp": 0.00713731, "balance_loss_clip": 1.05304837, "balance_loss_mlp": 1.00079274, "epoch": 0.23928335237179102, "flos": 14137349506560.0, "grad_norm": 2.220287713500763, "language_loss": 0.83967471, "learning_rate": 3.558027081671556e-06, "loss": 0.85827637, "num_input_tokens_seen": 42459480, "step": 1990, "time_per_iteration": 2.71329665184021 }, { "auxiliary_loss_clip": 0.01195858, "auxiliary_loss_mlp": 0.01033335, "balance_loss_clip": 1.05973315, "balance_loss_mlp": 1.02297521, "epoch": 0.2394035952624301, "flos": 23769596436480.0, "grad_norm": 2.030042953824937, "language_loss": 0.68661803, "learning_rate": 3.557538542754769e-06, "loss": 0.70890999, "num_input_tokens_seen": 42479175, "step": 1991, "time_per_iteration": 3.5460448265075684 }, { "auxiliary_loss_clip": 0.01215082, "auxiliary_loss_mlp": 0.0103273, "balance_loss_clip": 1.06582117, "balance_loss_mlp": 1.02283549, "epoch": 0.2395238381530692, "flos": 24206198250240.0, "grad_norm": 1.9228380146381496, "language_loss": 0.66772294, "learning_rate": 3.557049767560623e-06, "loss": 0.69020104, "num_input_tokens_seen": 42498090, "step": 1992, "time_per_iteration": 3.512108087539673 }, { "auxiliary_loss_clip": 0.01147792, "auxiliary_loss_mlp": 0.01032651, "balance_loss_clip": 1.05861282, "balance_loss_mlp": 1.02200532, "epoch": 0.2396440810437083, "flos": 25295763450240.0, "grad_norm": 1.9030483207465483, "language_loss": 0.85679698, "learning_rate": 3.5565607561632655e-06, "loss": 0.87860131, "num_input_tokens_seen": 42516930, "step": 1993, "time_per_iteration": 2.707486629486084 }, { "auxiliary_loss_clip": 0.01175594, "auxiliary_loss_mlp": 0.0103024, "balance_loss_clip": 1.06057942, "balance_loss_mlp": 1.02041745, "epoch": 0.23976432393434738, "flos": 28543093436160.0, "grad_norm": 2.4257106743676156, "language_loss": 0.78833526, "learning_rate": 3.5560715086368787e-06, "loss": 0.81039357, "num_input_tokens_seen": 42534800, "step": 1994, "time_per_iteration": 2.9049477577209473 }, { "auxiliary_loss_clip": 0.0117477, "auxiliary_loss_mlp": 0.01033564, "balance_loss_clip": 1.06126833, "balance_loss_mlp": 1.02361643, "epoch": 0.23988456682498646, "flos": 19494358945920.0, "grad_norm": 2.1309672388279384, "language_loss": 0.82077754, "learning_rate": 3.5555820250556816e-06, "loss": 0.84286094, "num_input_tokens_seen": 42552000, "step": 1995, "time_per_iteration": 3.5292553901672363 }, { "auxiliary_loss_clip": 0.01179576, "auxiliary_loss_mlp": 0.0104395, "balance_loss_clip": 1.0612421, "balance_loss_mlp": 1.03304791, "epoch": 0.24000480971562557, "flos": 20266331068800.0, "grad_norm": 2.7776779241773477, "language_loss": 0.6952492, "learning_rate": 3.5550923054939278e-06, "loss": 0.71748447, "num_input_tokens_seen": 42571455, "step": 1996, "time_per_iteration": 2.6336231231689453 }, { "auxiliary_loss_clip": 0.01136702, "auxiliary_loss_mlp": 0.01033828, "balance_loss_clip": 1.05097938, "balance_loss_mlp": 1.02367735, "epoch": 0.24012505260626466, "flos": 25443176866560.0, "grad_norm": 1.9673395120473538, "language_loss": 0.7443983, "learning_rate": 3.5546023500259083e-06, "loss": 0.76610357, "num_input_tokens_seen": 42592550, "step": 1997, "time_per_iteration": 2.7583348751068115 }, { "auxiliary_loss_clip": 0.0114945, "auxiliary_loss_mlp": 0.01032044, "balance_loss_clip": 1.05510795, "balance_loss_mlp": 1.02192926, "epoch": 0.24024529549690374, "flos": 15553342529280.0, "grad_norm": 2.2018349742365637, "language_loss": 0.81039178, "learning_rate": 3.5541121587259477e-06, "loss": 0.83220673, "num_input_tokens_seen": 42610385, "step": 1998, "time_per_iteration": 2.678804874420166 }, { "auxiliary_loss_clip": 0.01133911, "auxiliary_loss_mlp": 0.01027403, "balance_loss_clip": 1.06482041, "balance_loss_mlp": 1.0248158, "epoch": 0.24036553838754285, "flos": 57122351867520.0, "grad_norm": 0.8326876496325804, "language_loss": 0.57887292, "learning_rate": 3.553621731668408e-06, "loss": 0.6004861, "num_input_tokens_seen": 42673595, "step": 1999, "time_per_iteration": 3.2091104984283447 }, { "auxiliary_loss_clip": 0.01186896, "auxiliary_loss_mlp": 0.01030818, "balance_loss_clip": 1.05807257, "balance_loss_mlp": 1.02103055, "epoch": 0.24048578127818193, "flos": 24969946158720.0, "grad_norm": 1.9586387096430296, "language_loss": 0.8320365, "learning_rate": 3.553131068927688e-06, "loss": 0.85421371, "num_input_tokens_seen": 42692000, "step": 2000, "time_per_iteration": 2.680187702178955 }, { "auxiliary_loss_clip": 0.01157755, "auxiliary_loss_mlp": 0.01032044, "balance_loss_clip": 1.05756545, "balance_loss_mlp": 1.02272773, "epoch": 0.24060602416882101, "flos": 23330947547520.0, "grad_norm": 2.680441462237069, "language_loss": 0.8056466, "learning_rate": 3.552640170578219e-06, "loss": 0.82754457, "num_input_tokens_seen": 42712250, "step": 2001, "time_per_iteration": 2.6823275089263916 }, { "auxiliary_loss_clip": 0.0117772, "auxiliary_loss_mlp": 0.01031937, "balance_loss_clip": 1.05856633, "balance_loss_mlp": 1.02191186, "epoch": 0.2407262670594601, "flos": 14173260128640.0, "grad_norm": 3.707644916185044, "language_loss": 0.7793768, "learning_rate": 3.5521490366944703e-06, "loss": 0.80147338, "num_input_tokens_seen": 42729900, "step": 2002, "time_per_iteration": 2.692460775375366 }, { "auxiliary_loss_clip": 0.01160827, "auxiliary_loss_mlp": 0.01041527, "balance_loss_clip": 1.0584228, "balance_loss_mlp": 1.03206229, "epoch": 0.2408465099500992, "flos": 13663113217920.0, "grad_norm": 2.083106128944533, "language_loss": 0.80111372, "learning_rate": 3.5516576673509474e-06, "loss": 0.82313728, "num_input_tokens_seen": 42747900, "step": 2003, "time_per_iteration": 2.6530299186706543 }, { "auxiliary_loss_clip": 0.01215201, "auxiliary_loss_mlp": 0.01032301, "balance_loss_clip": 1.0664804, "balance_loss_mlp": 1.02164364, "epoch": 0.2409667528407383, "flos": 31248029076480.0, "grad_norm": 1.6537510677516591, "language_loss": 0.86189258, "learning_rate": 3.5511660626221896e-06, "loss": 0.88436759, "num_input_tokens_seen": 42768540, "step": 2004, "time_per_iteration": 2.7113213539123535 }, { "auxiliary_loss_clip": 0.01174314, "auxiliary_loss_mlp": 0.00714563, "balance_loss_clip": 1.06133056, "balance_loss_mlp": 1.00084233, "epoch": 0.24108699573137737, "flos": 22199941031040.0, "grad_norm": 3.400600081035225, "language_loss": 0.88592637, "learning_rate": 3.5506742225827744e-06, "loss": 0.90481514, "num_input_tokens_seen": 42785395, "step": 2005, "time_per_iteration": 2.7038512229919434 }, { "auxiliary_loss_clip": 0.01161119, "auxiliary_loss_mlp": 0.01042267, "balance_loss_clip": 1.05930805, "balance_loss_mlp": 1.03192532, "epoch": 0.24120723862201648, "flos": 26103035664000.0, "grad_norm": 2.3596518271483093, "language_loss": 0.90266764, "learning_rate": 3.5501821473073116e-06, "loss": 0.92470145, "num_input_tokens_seen": 42801980, "step": 2006, "time_per_iteration": 2.708305597305298 }, { "auxiliary_loss_clip": 0.01156062, "auxiliary_loss_mlp": 0.01044518, "balance_loss_clip": 1.05820262, "balance_loss_mlp": 1.03456426, "epoch": 0.24132748151265557, "flos": 18624926246400.0, "grad_norm": 2.2270775162718373, "language_loss": 0.86749077, "learning_rate": 3.54968983687045e-06, "loss": 0.88949662, "num_input_tokens_seen": 42818850, "step": 2007, "time_per_iteration": 2.636338233947754 }, { "auxiliary_loss_clip": 0.0118166, "auxiliary_loss_mlp": 0.01037373, "balance_loss_clip": 1.05955398, "balance_loss_mlp": 1.02701366, "epoch": 0.24144772440329465, "flos": 15267673664640.0, "grad_norm": 4.294988919114049, "language_loss": 0.89526832, "learning_rate": 3.549197291346872e-06, "loss": 0.91745865, "num_input_tokens_seen": 42835375, "step": 2008, "time_per_iteration": 2.675809860229492 }, { "auxiliary_loss_clip": 0.01198575, "auxiliary_loss_mlp": 0.01033477, "balance_loss_clip": 1.06382775, "balance_loss_mlp": 1.02385712, "epoch": 0.24156796729393373, "flos": 24024274842240.0, "grad_norm": 3.3165861582842076, "language_loss": 0.79516333, "learning_rate": 3.548704510811297e-06, "loss": 0.8174839, "num_input_tokens_seen": 42854570, "step": 2009, "time_per_iteration": 2.604729652404785 }, { "auxiliary_loss_clip": 0.01146549, "auxiliary_loss_mlp": 0.01040679, "balance_loss_clip": 1.05328178, "balance_loss_mlp": 1.03040349, "epoch": 0.24168821018457284, "flos": 26286790665600.0, "grad_norm": 2.4229906031429946, "language_loss": 0.74161839, "learning_rate": 3.5482114953384787e-06, "loss": 0.76349068, "num_input_tokens_seen": 42873800, "step": 2010, "time_per_iteration": 2.751631259918213 }, { "auxiliary_loss_clip": 0.01195957, "auxiliary_loss_mlp": 0.01030675, "balance_loss_clip": 1.0606221, "balance_loss_mlp": 1.02073264, "epoch": 0.24180845307521193, "flos": 18223193560320.0, "grad_norm": 2.307772993734464, "language_loss": 0.84760094, "learning_rate": 3.5477182450032077e-06, "loss": 0.86986727, "num_input_tokens_seen": 42892400, "step": 2011, "time_per_iteration": 2.6372451782226562 }, { "auxiliary_loss_clip": 0.01191304, "auxiliary_loss_mlp": 0.01034013, "balance_loss_clip": 1.06113482, "balance_loss_mlp": 1.02385056, "epoch": 0.241928695965851, "flos": 20449260057600.0, "grad_norm": 1.9699763120283997, "language_loss": 0.83439839, "learning_rate": 3.5472247598803097e-06, "loss": 0.85665148, "num_input_tokens_seen": 42911745, "step": 2012, "time_per_iteration": 2.676652193069458 }, { "auxiliary_loss_clip": 0.01212536, "auxiliary_loss_mlp": 0.01031684, "balance_loss_clip": 1.06299496, "balance_loss_mlp": 1.02203369, "epoch": 0.24204893885649012, "flos": 25556475340800.0, "grad_norm": 2.8076200357296326, "language_loss": 0.85284621, "learning_rate": 3.546731040044645e-06, "loss": 0.87528837, "num_input_tokens_seen": 42926915, "step": 2013, "time_per_iteration": 2.628411293029785 }, { "auxiliary_loss_clip": 0.01212336, "auxiliary_loss_mlp": 0.01037119, "balance_loss_clip": 1.06528246, "balance_loss_mlp": 1.02716506, "epoch": 0.2421691817471292, "flos": 30660207004800.0, "grad_norm": 1.7705560831369989, "language_loss": 0.7524842, "learning_rate": 3.546237085571112e-06, "loss": 0.77497876, "num_input_tokens_seen": 42945350, "step": 2014, "time_per_iteration": 2.622875928878784 }, { "auxiliary_loss_clip": 0.01194543, "auxiliary_loss_mlp": 0.01034417, "balance_loss_clip": 1.06323171, "balance_loss_mlp": 1.02436125, "epoch": 0.24228942463776829, "flos": 21945011230080.0, "grad_norm": 2.1178466743006084, "language_loss": 0.72869134, "learning_rate": 3.5457428965346425e-06, "loss": 0.75098097, "num_input_tokens_seen": 42964290, "step": 2015, "time_per_iteration": 2.647225856781006 }, { "auxiliary_loss_clip": 0.01127307, "auxiliary_loss_mlp": 0.01040588, "balance_loss_clip": 1.05420649, "balance_loss_mlp": 1.03053236, "epoch": 0.2424096675284074, "flos": 33984493879680.0, "grad_norm": 1.5882655827197318, "language_loss": 0.74873781, "learning_rate": 3.545248473010205e-06, "loss": 0.77041674, "num_input_tokens_seen": 42987095, "step": 2016, "time_per_iteration": 2.915891408920288 }, { "auxiliary_loss_clip": 0.0121271, "auxiliary_loss_mlp": 0.00714346, "balance_loss_clip": 1.06280231, "balance_loss_mlp": 1.00078785, "epoch": 0.24252991041904648, "flos": 21653416621440.0, "grad_norm": 1.8005206198699202, "language_loss": 0.87747186, "learning_rate": 3.544753815072802e-06, "loss": 0.8967424, "num_input_tokens_seen": 43005750, "step": 2017, "time_per_iteration": 3.621037721633911 }, { "auxiliary_loss_clip": 0.01103111, "auxiliary_loss_mlp": 0.01035673, "balance_loss_clip": 1.04693413, "balance_loss_mlp": 1.02570081, "epoch": 0.24265015330968556, "flos": 21870065502720.0, "grad_norm": 1.8947503153884437, "language_loss": 0.88330674, "learning_rate": 3.544258922797474e-06, "loss": 0.90469462, "num_input_tokens_seen": 43023870, "step": 2018, "time_per_iteration": 2.882289171218872 }, { "auxiliary_loss_clip": 0.0121122, "auxiliary_loss_mlp": 0.01029946, "balance_loss_clip": 1.06412029, "balance_loss_mlp": 1.02065325, "epoch": 0.24277039620032465, "flos": 25628260671360.0, "grad_norm": 2.1366994402813626, "language_loss": 0.77949727, "learning_rate": 3.543763796259295e-06, "loss": 0.80190891, "num_input_tokens_seen": 43043825, "step": 2019, "time_per_iteration": 3.8386311531066895 }, { "auxiliary_loss_clip": 0.01194925, "auxiliary_loss_mlp": 0.0103441, "balance_loss_clip": 1.06273031, "balance_loss_mlp": 1.02386558, "epoch": 0.24289063909096376, "flos": 26286575184000.0, "grad_norm": 1.8021329835242414, "language_loss": 0.90943766, "learning_rate": 3.5432684355333754e-06, "loss": 0.93173093, "num_input_tokens_seen": 43062480, "step": 2020, "time_per_iteration": 2.7294721603393555 }, { "auxiliary_loss_clip": 0.01196833, "auxiliary_loss_mlp": 0.0103545, "balance_loss_clip": 1.06344604, "balance_loss_mlp": 1.0259726, "epoch": 0.24301088198160284, "flos": 25075056332160.0, "grad_norm": 2.107909966232605, "language_loss": 0.76734293, "learning_rate": 3.5427728406948613e-06, "loss": 0.78966576, "num_input_tokens_seen": 43081595, "step": 2021, "time_per_iteration": 3.5259788036346436 }, { "auxiliary_loss_clip": 0.01129016, "auxiliary_loss_mlp": 0.01007272, "balance_loss_clip": 1.0662508, "balance_loss_mlp": 1.004673, "epoch": 0.24313112487224192, "flos": 69900948673920.0, "grad_norm": 0.7579210120351136, "language_loss": 0.57942307, "learning_rate": 3.542277011818934e-06, "loss": 0.60078585, "num_input_tokens_seen": 43145430, "step": 2022, "time_per_iteration": 3.4256958961486816 }, { "auxiliary_loss_clip": 0.01181792, "auxiliary_loss_mlp": 0.01032649, "balance_loss_clip": 1.06562352, "balance_loss_mlp": 1.02279007, "epoch": 0.24325136776288103, "flos": 40662334235520.0, "grad_norm": 2.35234251509492, "language_loss": 0.74349469, "learning_rate": 3.5417809489808104e-06, "loss": 0.76563907, "num_input_tokens_seen": 43167040, "step": 2023, "time_per_iteration": 2.841672420501709 }, { "auxiliary_loss_clip": 0.01200154, "auxiliary_loss_mlp": 0.01030947, "balance_loss_clip": 1.06657982, "balance_loss_mlp": 1.02115953, "epoch": 0.24337161065352012, "flos": 25046400257280.0, "grad_norm": 1.778464628538289, "language_loss": 0.72137779, "learning_rate": 3.5412846522557422e-06, "loss": 0.74368876, "num_input_tokens_seen": 43187930, "step": 2024, "time_per_iteration": 2.6846439838409424 }, { "auxiliary_loss_clip": 0.01212318, "auxiliary_loss_mlp": 0.01030245, "balance_loss_clip": 1.06578374, "balance_loss_mlp": 1.02016604, "epoch": 0.2434918535441592, "flos": 18661160090880.0, "grad_norm": 2.0644704685117263, "language_loss": 0.74620879, "learning_rate": 3.540788121719018e-06, "loss": 0.76863438, "num_input_tokens_seen": 43206350, "step": 2025, "time_per_iteration": 2.6120989322662354 }, { "auxiliary_loss_clip": 0.01153039, "auxiliary_loss_mlp": 0.01024809, "balance_loss_clip": 1.0594393, "balance_loss_mlp": 1.01505828, "epoch": 0.24361209643479828, "flos": 23915142345600.0, "grad_norm": 1.8968661306785806, "language_loss": 0.81685632, "learning_rate": 3.5402913574459604e-06, "loss": 0.83863485, "num_input_tokens_seen": 43226255, "step": 2026, "time_per_iteration": 2.6831912994384766 }, { "auxiliary_loss_clip": 0.01122025, "auxiliary_loss_mlp": 0.01030419, "balance_loss_clip": 1.0526309, "balance_loss_mlp": 1.02116823, "epoch": 0.2437323393254374, "flos": 28657505232000.0, "grad_norm": 1.7866769853292594, "language_loss": 0.85955083, "learning_rate": 3.5397943595119297e-06, "loss": 0.88107526, "num_input_tokens_seen": 43247675, "step": 2027, "time_per_iteration": 2.866351366043091 }, { "auxiliary_loss_clip": 0.01173094, "auxiliary_loss_mlp": 0.01033952, "balance_loss_clip": 1.0609926, "balance_loss_mlp": 1.02458191, "epoch": 0.24385258221607647, "flos": 23550325862400.0, "grad_norm": 2.5377134778653985, "language_loss": 0.77352834, "learning_rate": 3.5392971279923177e-06, "loss": 0.79559886, "num_input_tokens_seen": 43265895, "step": 2028, "time_per_iteration": 2.6802480220794678 }, { "auxiliary_loss_clip": 0.01156161, "auxiliary_loss_mlp": 0.01029718, "balance_loss_clip": 1.05671358, "balance_loss_mlp": 1.0197165, "epoch": 0.24397282510671556, "flos": 25336091445120.0, "grad_norm": 2.2211607717561486, "language_loss": 0.82658851, "learning_rate": 3.5387996629625557e-06, "loss": 0.84844732, "num_input_tokens_seen": 43283485, "step": 2029, "time_per_iteration": 2.707881450653076 }, { "auxiliary_loss_clip": 0.01167201, "auxiliary_loss_mlp": 0.01003727, "balance_loss_clip": 1.07982171, "balance_loss_mlp": 1.00096118, "epoch": 0.24409306799735467, "flos": 65187421430400.0, "grad_norm": 0.8012375572216593, "language_loss": 0.54982245, "learning_rate": 3.5383019644981083e-06, "loss": 0.57153177, "num_input_tokens_seen": 43347180, "step": 2030, "time_per_iteration": 3.2332749366760254 }, { "auxiliary_loss_clip": 0.01175257, "auxiliary_loss_mlp": 0.01038683, "balance_loss_clip": 1.06014884, "balance_loss_mlp": 1.0287708, "epoch": 0.24421331088799375, "flos": 19537093152000.0, "grad_norm": 2.803645378687907, "language_loss": 0.73096848, "learning_rate": 3.5378040326744763e-06, "loss": 0.75310791, "num_input_tokens_seen": 43366665, "step": 2031, "time_per_iteration": 2.6508209705352783 }, { "auxiliary_loss_clip": 0.01162899, "auxiliary_loss_mlp": 0.01033034, "balance_loss_clip": 1.0631249, "balance_loss_mlp": 1.02324152, "epoch": 0.24433355377863283, "flos": 21068575378560.0, "grad_norm": 2.0967746885702314, "language_loss": 0.85523915, "learning_rate": 3.5373058675671946e-06, "loss": 0.87719852, "num_input_tokens_seen": 43384670, "step": 2032, "time_per_iteration": 2.7265799045562744 }, { "auxiliary_loss_clip": 0.01139292, "auxiliary_loss_mlp": 0.01035684, "balance_loss_clip": 1.05789077, "balance_loss_mlp": 1.02512777, "epoch": 0.24445379666927192, "flos": 22637189289600.0, "grad_norm": 2.1869005317788424, "language_loss": 0.73029339, "learning_rate": 3.536807469251836e-06, "loss": 0.75204313, "num_input_tokens_seen": 43403825, "step": 2033, "time_per_iteration": 2.7330005168914795 }, { "auxiliary_loss_clip": 0.01165961, "auxiliary_loss_mlp": 0.01032009, "balance_loss_clip": 1.05893958, "balance_loss_mlp": 1.02250814, "epoch": 0.24457403955991103, "flos": 21251612108160.0, "grad_norm": 2.7033564007356055, "language_loss": 0.8267771, "learning_rate": 3.5363088378040055e-06, "loss": 0.84875679, "num_input_tokens_seen": 43422715, "step": 2034, "time_per_iteration": 2.7384886741638184 }, { "auxiliary_loss_clip": 0.0116745, "auxiliary_loss_mlp": 0.00704299, "balance_loss_clip": 1.08049965, "balance_loss_mlp": 1.00036705, "epoch": 0.2446942824505501, "flos": 66997820764800.0, "grad_norm": 0.7562380719054262, "language_loss": 0.64325523, "learning_rate": 3.5358099732993463e-06, "loss": 0.66197276, "num_input_tokens_seen": 43481825, "step": 2035, "time_per_iteration": 3.1269049644470215 }, { "auxiliary_loss_clip": 0.01185983, "auxiliary_loss_mlp": 0.01032749, "balance_loss_clip": 1.06261206, "balance_loss_mlp": 1.02308679, "epoch": 0.2448145253411892, "flos": 20411122792320.0, "grad_norm": 1.9918904672483433, "language_loss": 0.89554954, "learning_rate": 3.535310875813535e-06, "loss": 0.91773689, "num_input_tokens_seen": 43500220, "step": 2036, "time_per_iteration": 2.7009499073028564 }, { "auxiliary_loss_clip": 0.01196761, "auxiliary_loss_mlp": 0.01031531, "balance_loss_clip": 1.06629527, "balance_loss_mlp": 1.02185142, "epoch": 0.2449347682318283, "flos": 28804739080320.0, "grad_norm": 1.6990312682840296, "language_loss": 0.81509912, "learning_rate": 3.5348115454222843e-06, "loss": 0.83738202, "num_input_tokens_seen": 43522805, "step": 2037, "time_per_iteration": 2.718475341796875 }, { "auxiliary_loss_clip": 0.01173482, "auxiliary_loss_mlp": 0.01036208, "balance_loss_clip": 1.05958259, "balance_loss_mlp": 1.02686191, "epoch": 0.2450550111224674, "flos": 22528990546560.0, "grad_norm": 2.3968635833625007, "language_loss": 0.86019611, "learning_rate": 3.5343119822013425e-06, "loss": 0.88229299, "num_input_tokens_seen": 43541915, "step": 2038, "time_per_iteration": 2.7721152305603027 }, { "auxiliary_loss_clip": 0.01200171, "auxiliary_loss_mlp": 0.01038164, "balance_loss_clip": 1.06408787, "balance_loss_mlp": 1.02862716, "epoch": 0.24517525401310647, "flos": 21759137326080.0, "grad_norm": 1.9864943831420097, "language_loss": 0.77625209, "learning_rate": 3.533812186226493e-06, "loss": 0.79863548, "num_input_tokens_seen": 43562625, "step": 2039, "time_per_iteration": 2.7047417163848877 }, { "auxiliary_loss_clip": 0.01209251, "auxiliary_loss_mlp": 0.01030794, "balance_loss_clip": 1.06668913, "balance_loss_mlp": 1.02182317, "epoch": 0.24529549690374555, "flos": 25043311687680.0, "grad_norm": 1.8086370725751242, "language_loss": 0.76016271, "learning_rate": 3.5333121575735545e-06, "loss": 0.78256321, "num_input_tokens_seen": 43582265, "step": 2040, "time_per_iteration": 2.618422269821167 }, { "auxiliary_loss_clip": 0.01178061, "auxiliary_loss_mlp": 0.01036054, "balance_loss_clip": 1.06386435, "balance_loss_mlp": 1.0259037, "epoch": 0.24541573979438466, "flos": 32123638915200.0, "grad_norm": 1.7314670844273383, "language_loss": 0.75850952, "learning_rate": 3.532811896318381e-06, "loss": 0.78065068, "num_input_tokens_seen": 43604335, "step": 2041, "time_per_iteration": 2.826265335083008 }, { "auxiliary_loss_clip": 0.01162154, "auxiliary_loss_mlp": 0.01028408, "balance_loss_clip": 1.06011701, "balance_loss_mlp": 1.01837087, "epoch": 0.24553598268502375, "flos": 31357556622720.0, "grad_norm": 3.787675707942063, "language_loss": 0.81491494, "learning_rate": 3.5323114025368615e-06, "loss": 0.8368206, "num_input_tokens_seen": 43619400, "step": 2042, "time_per_iteration": 3.636288642883301 }, { "auxiliary_loss_clip": 0.01189604, "auxiliary_loss_mlp": 0.01038376, "balance_loss_clip": 1.06029713, "balance_loss_mlp": 1.02887523, "epoch": 0.24565622557566283, "flos": 14027462824320.0, "grad_norm": 2.807584955153649, "language_loss": 0.80739051, "learning_rate": 3.53181067630492e-06, "loss": 0.82967031, "num_input_tokens_seen": 43636870, "step": 2043, "time_per_iteration": 3.5285229682922363 }, { "auxiliary_loss_clip": 0.01169118, "auxiliary_loss_mlp": 0.01030619, "balance_loss_clip": 1.05951166, "balance_loss_mlp": 1.02065337, "epoch": 0.24577646846630194, "flos": 16581465515520.0, "grad_norm": 1.8410990059553267, "language_loss": 0.75792027, "learning_rate": 3.5313097176985175e-06, "loss": 0.7799176, "num_input_tokens_seen": 43655180, "step": 2044, "time_per_iteration": 2.663276195526123 }, { "auxiliary_loss_clip": 0.01197844, "auxiliary_loss_mlp": 0.01031692, "balance_loss_clip": 1.06769145, "balance_loss_mlp": 1.02270949, "epoch": 0.24589671135694102, "flos": 18807424272000.0, "grad_norm": 1.8787977803871407, "language_loss": 0.8155911, "learning_rate": 3.5308085267936482e-06, "loss": 0.83788645, "num_input_tokens_seen": 43672895, "step": 2045, "time_per_iteration": 3.459075450897217 }, { "auxiliary_loss_clip": 0.01132044, "auxiliary_loss_mlp": 0.00712443, "balance_loss_clip": 1.05949962, "balance_loss_mlp": 1.00068974, "epoch": 0.2460169542475801, "flos": 19938538529280.0, "grad_norm": 1.7725085043156559, "language_loss": 0.89951801, "learning_rate": 3.530307103666342e-06, "loss": 0.91796279, "num_input_tokens_seen": 43691975, "step": 2046, "time_per_iteration": 2.818830728530884 }, { "auxiliary_loss_clip": 0.01166487, "auxiliary_loss_mlp": 0.01036146, "balance_loss_clip": 1.06175125, "balance_loss_mlp": 1.02616787, "epoch": 0.24613719713821922, "flos": 24171221381760.0, "grad_norm": 1.9331105025866973, "language_loss": 0.79853928, "learning_rate": 3.5298054483926658e-06, "loss": 0.82056564, "num_input_tokens_seen": 43712670, "step": 2047, "time_per_iteration": 3.639050245285034 }, { "auxiliary_loss_clip": 0.01203097, "auxiliary_loss_mlp": 0.01035262, "balance_loss_clip": 1.06737113, "balance_loss_mlp": 1.02586794, "epoch": 0.2462574400288583, "flos": 30221055325440.0, "grad_norm": 2.7238710789038287, "language_loss": 0.82851911, "learning_rate": 3.5293035610487187e-06, "loss": 0.85090268, "num_input_tokens_seen": 43732035, "step": 2048, "time_per_iteration": 2.667782783508301 }, { "auxiliary_loss_clip": 0.01134085, "auxiliary_loss_mlp": 0.01008892, "balance_loss_clip": 1.07527924, "balance_loss_mlp": 1.00543523, "epoch": 0.24637768291949738, "flos": 68943030819840.0, "grad_norm": 0.7271925941743576, "language_loss": 0.61974466, "learning_rate": 3.5288014417106374e-06, "loss": 0.64117444, "num_input_tokens_seen": 43798055, "step": 2049, "time_per_iteration": 3.297342538833618 }, { "auxiliary_loss_clip": 0.01166037, "auxiliary_loss_mlp": 0.01028282, "balance_loss_clip": 1.06479764, "balance_loss_mlp": 1.01873326, "epoch": 0.24649792581013646, "flos": 34383999922560.0, "grad_norm": 1.9306853744267267, "language_loss": 0.75803679, "learning_rate": 3.528299090454593e-06, "loss": 0.77997994, "num_input_tokens_seen": 43818590, "step": 2050, "time_per_iteration": 2.7990872859954834 }, { "auxiliary_loss_clip": 0.01198063, "auxiliary_loss_mlp": 0.01029954, "balance_loss_clip": 1.06436265, "balance_loss_mlp": 1.02025676, "epoch": 0.24661816870077558, "flos": 19680448331520.0, "grad_norm": 2.4794375439442518, "language_loss": 0.82999051, "learning_rate": 3.527796507356792e-06, "loss": 0.85227072, "num_input_tokens_seen": 43832480, "step": 2051, "time_per_iteration": 2.6260690689086914 }, { "auxiliary_loss_clip": 0.01200542, "auxiliary_loss_mlp": 0.01031466, "balance_loss_clip": 1.06450319, "balance_loss_mlp": 1.02165484, "epoch": 0.24673841159141466, "flos": 20002279213440.0, "grad_norm": 2.775915115516534, "language_loss": 0.9012509, "learning_rate": 3.527293692493475e-06, "loss": 0.92357099, "num_input_tokens_seen": 43848345, "step": 2052, "time_per_iteration": 2.5686593055725098 }, { "auxiliary_loss_clip": 0.01200859, "auxiliary_loss_mlp": 0.01029017, "balance_loss_clip": 1.06644893, "balance_loss_mlp": 1.01949799, "epoch": 0.24685865448205374, "flos": 21646593037440.0, "grad_norm": 2.704714780215944, "language_loss": 0.7326566, "learning_rate": 3.52679064594092e-06, "loss": 0.75495541, "num_input_tokens_seen": 43865685, "step": 2053, "time_per_iteration": 2.658583402633667 }, { "auxiliary_loss_clip": 0.01131298, "auxiliary_loss_mlp": 0.01027163, "balance_loss_clip": 1.05152202, "balance_loss_mlp": 1.01812077, "epoch": 0.24697889737269285, "flos": 17960470508160.0, "grad_norm": 2.0206903086275934, "language_loss": 0.74850374, "learning_rate": 3.5262873677754375e-06, "loss": 0.77008832, "num_input_tokens_seen": 43883690, "step": 2054, "time_per_iteration": 2.7360401153564453 }, { "auxiliary_loss_clip": 0.01212948, "auxiliary_loss_mlp": 0.01031895, "balance_loss_clip": 1.0674988, "balance_loss_mlp": 1.02276373, "epoch": 0.24709914026333193, "flos": 27344611221120.0, "grad_norm": 1.7378103141000718, "language_loss": 0.80708718, "learning_rate": 3.5257838580733745e-06, "loss": 0.8295356, "num_input_tokens_seen": 43903295, "step": 2055, "time_per_iteration": 2.654571771621704 }, { "auxiliary_loss_clip": 0.01201158, "auxiliary_loss_mlp": 0.0102918, "balance_loss_clip": 1.06727612, "balance_loss_mlp": 1.01995313, "epoch": 0.24721938315397102, "flos": 19275519335040.0, "grad_norm": 2.0535710932376077, "language_loss": 0.87059623, "learning_rate": 3.5252801169111138e-06, "loss": 0.89289957, "num_input_tokens_seen": 43920960, "step": 2056, "time_per_iteration": 2.672907829284668 }, { "auxiliary_loss_clip": 0.01175473, "auxiliary_loss_mlp": 0.01042336, "balance_loss_clip": 1.06372166, "balance_loss_mlp": 1.03281689, "epoch": 0.2473396260446101, "flos": 23185796688000.0, "grad_norm": 1.8212236896536176, "language_loss": 0.80240685, "learning_rate": 3.524776144365072e-06, "loss": 0.82458496, "num_input_tokens_seen": 43939415, "step": 2057, "time_per_iteration": 2.6884877681732178 }, { "auxiliary_loss_clip": 0.01176676, "auxiliary_loss_mlp": 0.0103842, "balance_loss_clip": 1.06848681, "balance_loss_mlp": 1.0284723, "epoch": 0.2474598689352492, "flos": 21142443697920.0, "grad_norm": 1.6364507043890382, "language_loss": 0.79283679, "learning_rate": 3.5242719405117016e-06, "loss": 0.81498772, "num_input_tokens_seen": 43959220, "step": 2058, "time_per_iteration": 2.7060256004333496 }, { "auxiliary_loss_clip": 0.01180652, "auxiliary_loss_mlp": 0.00713049, "balance_loss_clip": 1.06264901, "balance_loss_mlp": 1.00089049, "epoch": 0.2475801118258883, "flos": 21648352803840.0, "grad_norm": 2.5405958100518338, "language_loss": 0.74831867, "learning_rate": 3.5237675054274893e-06, "loss": 0.76725566, "num_input_tokens_seen": 43978420, "step": 2059, "time_per_iteration": 2.639577865600586 }, { "auxiliary_loss_clip": 0.01196246, "auxiliary_loss_mlp": 0.0103603, "balance_loss_clip": 1.06633043, "balance_loss_mlp": 1.0259093, "epoch": 0.24770035471652738, "flos": 22674500542080.0, "grad_norm": 2.1170652193181763, "language_loss": 0.79899669, "learning_rate": 3.5232628391889584e-06, "loss": 0.82131946, "num_input_tokens_seen": 43996710, "step": 2060, "time_per_iteration": 2.7111072540283203 }, { "auxiliary_loss_clip": 0.01144237, "auxiliary_loss_mlp": 0.01035553, "balance_loss_clip": 1.0599407, "balance_loss_mlp": 1.02643919, "epoch": 0.2478205976071665, "flos": 22163814927360.0, "grad_norm": 3.316356108352845, "language_loss": 0.64060748, "learning_rate": 3.522757941872666e-06, "loss": 0.66240537, "num_input_tokens_seen": 44014865, "step": 2061, "time_per_iteration": 2.772901773452759 }, { "auxiliary_loss_clip": 0.01215911, "auxiliary_loss_mlp": 0.0071262, "balance_loss_clip": 1.07048106, "balance_loss_mlp": 1.00083852, "epoch": 0.24794084049780557, "flos": 24973106555520.0, "grad_norm": 1.7139404103009168, "language_loss": 0.82466137, "learning_rate": 3.5222528135552042e-06, "loss": 0.8439467, "num_input_tokens_seen": 44036325, "step": 2062, "time_per_iteration": 2.6551902294158936 }, { "auxiliary_loss_clip": 0.01194453, "auxiliary_loss_mlp": 0.01029726, "balance_loss_clip": 1.06738186, "balance_loss_mlp": 1.02055323, "epoch": 0.24806108338844465, "flos": 18296379521280.0, "grad_norm": 2.139992656130849, "language_loss": 0.80231786, "learning_rate": 3.521747454313201e-06, "loss": 0.82455969, "num_input_tokens_seen": 44055005, "step": 2063, "time_per_iteration": 2.6039280891418457 }, { "auxiliary_loss_clip": 0.01154372, "auxiliary_loss_mlp": 0.01029506, "balance_loss_clip": 1.05900121, "balance_loss_mlp": 1.01997554, "epoch": 0.24818132627908374, "flos": 19282163351040.0, "grad_norm": 3.4992137989681393, "language_loss": 0.66665757, "learning_rate": 3.521241864223319e-06, "loss": 0.68849635, "num_input_tokens_seen": 44073965, "step": 2064, "time_per_iteration": 2.7323601245880127 }, { "auxiliary_loss_clip": 0.01177482, "auxiliary_loss_mlp": 0.01013341, "balance_loss_clip": 1.10729504, "balance_loss_mlp": 1.00890684, "epoch": 0.24830156916972285, "flos": 70285837881600.0, "grad_norm": 0.8083139194536985, "language_loss": 0.61958408, "learning_rate": 3.5207360433622552e-06, "loss": 0.64149231, "num_input_tokens_seen": 44135965, "step": 2065, "time_per_iteration": 3.287496566772461 }, { "auxiliary_loss_clip": 0.01175969, "auxiliary_loss_mlp": 0.01033678, "balance_loss_clip": 1.06627965, "balance_loss_mlp": 1.02487445, "epoch": 0.24842181206036193, "flos": 40409128287360.0, "grad_norm": 1.551388797376857, "language_loss": 0.7455827, "learning_rate": 3.5202299918067437e-06, "loss": 0.76767921, "num_input_tokens_seen": 44159560, "step": 2066, "time_per_iteration": 2.8673856258392334 }, { "auxiliary_loss_clip": 0.01198498, "auxiliary_loss_mlp": 0.01027355, "balance_loss_clip": 1.07015634, "balance_loss_mlp": 1.01874781, "epoch": 0.248542054951001, "flos": 20082432412800.0, "grad_norm": 2.3077968072751722, "language_loss": 0.70065695, "learning_rate": 3.519723709633551e-06, "loss": 0.72291547, "num_input_tokens_seen": 44178320, "step": 2067, "time_per_iteration": 2.6671347618103027 }, { "auxiliary_loss_clip": 0.01171962, "auxiliary_loss_mlp": 0.01032728, "balance_loss_clip": 1.06386518, "balance_loss_mlp": 1.02356052, "epoch": 0.24866229784164012, "flos": 23513948363520.0, "grad_norm": 1.8044107375861365, "language_loss": 0.83551949, "learning_rate": 3.519217196919479e-06, "loss": 0.85756636, "num_input_tokens_seen": 44197305, "step": 2068, "time_per_iteration": 3.588968515396118 }, { "auxiliary_loss_clip": 0.01185425, "auxiliary_loss_mlp": 0.01026379, "balance_loss_clip": 1.06742525, "balance_loss_mlp": 1.017712, "epoch": 0.2487825407322792, "flos": 19865101173120.0, "grad_norm": 2.216459089470618, "language_loss": 0.7320385, "learning_rate": 3.518710453741367e-06, "loss": 0.75415659, "num_input_tokens_seen": 44216505, "step": 2069, "time_per_iteration": 3.5885069370269775 }, { "auxiliary_loss_clip": 0.01165596, "auxiliary_loss_mlp": 0.00712537, "balance_loss_clip": 1.05897498, "balance_loss_mlp": 1.00080276, "epoch": 0.2489027836229183, "flos": 22017622573440.0, "grad_norm": 2.110612055372916, "language_loss": 0.67887253, "learning_rate": 3.518203480176086e-06, "loss": 0.69765383, "num_input_tokens_seen": 44235435, "step": 2070, "time_per_iteration": 2.6991958618164062 }, { "auxiliary_loss_clip": 0.01115096, "auxiliary_loss_mlp": 0.01028569, "balance_loss_clip": 1.0551126, "balance_loss_mlp": 1.01983142, "epoch": 0.2490230265135574, "flos": 23294354567040.0, "grad_norm": 1.8076255420147205, "language_loss": 0.80999506, "learning_rate": 3.517696276300545e-06, "loss": 0.83143175, "num_input_tokens_seen": 44256975, "step": 2071, "time_per_iteration": 3.8074402809143066 }, { "auxiliary_loss_clip": 0.0119289, "auxiliary_loss_mlp": 0.01027815, "balance_loss_clip": 1.06511068, "balance_loss_mlp": 1.01875544, "epoch": 0.24914326940419648, "flos": 19826784339840.0, "grad_norm": 2.354292800087785, "language_loss": 0.69271278, "learning_rate": 3.517188842191685e-06, "loss": 0.71491981, "num_input_tokens_seen": 44275125, "step": 2072, "time_per_iteration": 2.7820866107940674 }, { "auxiliary_loss_clip": 0.01189769, "auxiliary_loss_mlp": 0.01029512, "balance_loss_clip": 1.0616858, "balance_loss_mlp": 1.02026749, "epoch": 0.24926351229483557, "flos": 20229271211520.0, "grad_norm": 1.5553437091744453, "language_loss": 0.73983073, "learning_rate": 3.5166811779264837e-06, "loss": 0.76202357, "num_input_tokens_seen": 44295445, "step": 2073, "time_per_iteration": 3.636564254760742 }, { "auxiliary_loss_clip": 0.01210579, "auxiliary_loss_mlp": 0.01029142, "balance_loss_clip": 1.06594419, "balance_loss_mlp": 1.01990891, "epoch": 0.24938375518547465, "flos": 23294570048640.0, "grad_norm": 1.7687630187019945, "language_loss": 0.77988172, "learning_rate": 3.5161732835819545e-06, "loss": 0.80227894, "num_input_tokens_seen": 44314755, "step": 2074, "time_per_iteration": 2.62817645072937 }, { "auxiliary_loss_clip": 0.01211449, "auxiliary_loss_mlp": 0.01033865, "balance_loss_clip": 1.06673265, "balance_loss_mlp": 1.02496636, "epoch": 0.24950399807611376, "flos": 17311673099520.0, "grad_norm": 2.386743190670824, "language_loss": 0.8334682, "learning_rate": 3.515665159235143e-06, "loss": 0.85592133, "num_input_tokens_seen": 44333640, "step": 2075, "time_per_iteration": 2.6329433917999268 }, { "auxiliary_loss_clip": 0.01174222, "auxiliary_loss_mlp": 0.01029308, "balance_loss_clip": 1.06010151, "balance_loss_mlp": 1.02121413, "epoch": 0.24962424096675284, "flos": 19024863252480.0, "grad_norm": 1.5905347000178063, "language_loss": 0.75192791, "learning_rate": 3.5151568049631318e-06, "loss": 0.77396321, "num_input_tokens_seen": 44352355, "step": 2076, "time_per_iteration": 2.660029172897339 }, { "auxiliary_loss_clip": 0.01211178, "auxiliary_loss_mlp": 0.01027816, "balance_loss_clip": 1.06573617, "balance_loss_mlp": 1.01899445, "epoch": 0.24974448385739192, "flos": 33398790710400.0, "grad_norm": 1.6813822542130066, "language_loss": 0.80453551, "learning_rate": 3.5146482208430385e-06, "loss": 0.82692546, "num_input_tokens_seen": 44374185, "step": 2077, "time_per_iteration": 2.7592597007751465 }, { "auxiliary_loss_clip": 0.01109253, "auxiliary_loss_mlp": 0.01031643, "balance_loss_clip": 1.04606676, "balance_loss_mlp": 1.0224458, "epoch": 0.24986472674803104, "flos": 30007279532160.0, "grad_norm": 3.049241758084063, "language_loss": 0.68353236, "learning_rate": 3.514139406952014e-06, "loss": 0.70494133, "num_input_tokens_seen": 44396210, "step": 2078, "time_per_iteration": 2.819101095199585 }, { "auxiliary_loss_clip": 0.0119092, "auxiliary_loss_mlp": 0.0102994, "balance_loss_clip": 1.06267786, "balance_loss_mlp": 1.02139831, "epoch": 0.24998496963867012, "flos": 26613074833920.0, "grad_norm": 3.702769111802068, "language_loss": 0.83626771, "learning_rate": 3.5136303633672454e-06, "loss": 0.85847634, "num_input_tokens_seen": 44416340, "step": 2079, "time_per_iteration": 2.6828811168670654 }, { "auxiliary_loss_clip": 0.01168888, "auxiliary_loss_mlp": 0.00712562, "balance_loss_clip": 1.06136811, "balance_loss_mlp": 1.00086308, "epoch": 0.25010521252930923, "flos": 23553989049600.0, "grad_norm": 1.7707625652618715, "language_loss": 0.74823451, "learning_rate": 3.5131210901659544e-06, "loss": 0.76704895, "num_input_tokens_seen": 44438095, "step": 2080, "time_per_iteration": 2.733020067214966 }, { "auxiliary_loss_clip": 0.01156226, "auxiliary_loss_mlp": 0.01023055, "balance_loss_clip": 1.05860186, "balance_loss_mlp": 1.01475215, "epoch": 0.2502254554199483, "flos": 23441193365760.0, "grad_norm": 3.078403517647968, "language_loss": 0.83031142, "learning_rate": 3.5126115874253967e-06, "loss": 0.85210425, "num_input_tokens_seen": 44457650, "step": 2081, "time_per_iteration": 2.7437727451324463 }, { "auxiliary_loss_clip": 0.01158133, "auxiliary_loss_mlp": 0.01028407, "balance_loss_clip": 1.05965662, "balance_loss_mlp": 1.01870942, "epoch": 0.2503456983105874, "flos": 28761681651840.0, "grad_norm": 2.021456773985943, "language_loss": 0.81071681, "learning_rate": 3.5121018552228644e-06, "loss": 0.83258224, "num_input_tokens_seen": 44476155, "step": 2082, "time_per_iteration": 2.786219835281372 }, { "auxiliary_loss_clip": 0.01154973, "auxiliary_loss_mlp": 0.01028572, "balance_loss_clip": 1.05623984, "balance_loss_mlp": 1.01900005, "epoch": 0.2504659412012265, "flos": 18770256673920.0, "grad_norm": 1.9773089790382674, "language_loss": 0.76655817, "learning_rate": 3.5115918936356827e-06, "loss": 0.78839368, "num_input_tokens_seen": 44492910, "step": 2083, "time_per_iteration": 2.6822738647460938 }, { "auxiliary_loss_clip": 0.0113623, "auxiliary_loss_mlp": 0.01029481, "balance_loss_clip": 1.05176234, "balance_loss_mlp": 1.02090931, "epoch": 0.25058618409186556, "flos": 16873383346560.0, "grad_norm": 1.837611552284942, "language_loss": 0.78959072, "learning_rate": 3.5110817027412123e-06, "loss": 0.81124789, "num_input_tokens_seen": 44512000, "step": 2084, "time_per_iteration": 2.70139479637146 }, { "auxiliary_loss_clip": 0.01151644, "auxiliary_loss_mlp": 0.01033897, "balance_loss_clip": 1.05478036, "balance_loss_mlp": 1.02487326, "epoch": 0.25070642698250467, "flos": 24425540651520.0, "grad_norm": 2.4407318851705466, "language_loss": 0.68852508, "learning_rate": 3.5105712826168493e-06, "loss": 0.71038043, "num_input_tokens_seen": 44531650, "step": 2085, "time_per_iteration": 2.6686930656433105 }, { "auxiliary_loss_clip": 0.01189324, "auxiliary_loss_mlp": 0.00712004, "balance_loss_clip": 1.06066585, "balance_loss_mlp": 1.0008055, "epoch": 0.2508266698731437, "flos": 20260944028800.0, "grad_norm": 2.498272874419082, "language_loss": 0.7112087, "learning_rate": 3.5100606333400235e-06, "loss": 0.73022199, "num_input_tokens_seen": 44548785, "step": 2086, "time_per_iteration": 2.6478192806243896 }, { "auxiliary_loss_clip": 0.01180243, "auxiliary_loss_mlp": 0.01033424, "balance_loss_clip": 1.05712116, "balance_loss_mlp": 1.02388716, "epoch": 0.25094691276378284, "flos": 19245318975360.0, "grad_norm": 2.6934253792635587, "language_loss": 0.77209067, "learning_rate": 3.5095497549882006e-06, "loss": 0.79422736, "num_input_tokens_seen": 44567230, "step": 2087, "time_per_iteration": 2.630256414413452 }, { "auxiliary_loss_clip": 0.01190727, "auxiliary_loss_mlp": 0.01030912, "balance_loss_clip": 1.06316352, "balance_loss_mlp": 1.02170253, "epoch": 0.25106715565442195, "flos": 26943237671040.0, "grad_norm": 2.8489194148500046, "language_loss": 0.7276057, "learning_rate": 3.50903864763888e-06, "loss": 0.74982208, "num_input_tokens_seen": 44588020, "step": 2088, "time_per_iteration": 2.662914276123047 }, { "auxiliary_loss_clip": 0.01192842, "auxiliary_loss_mlp": 0.01025063, "balance_loss_clip": 1.06050909, "balance_loss_mlp": 1.01611018, "epoch": 0.251187398545061, "flos": 48359570572800.0, "grad_norm": 1.9645906708509437, "language_loss": 0.76529658, "learning_rate": 3.5085273113695965e-06, "loss": 0.78747565, "num_input_tokens_seen": 44612590, "step": 2089, "time_per_iteration": 2.8695266246795654 }, { "auxiliary_loss_clip": 0.01208778, "auxiliary_loss_mlp": 0.01031508, "balance_loss_clip": 1.06440628, "balance_loss_mlp": 1.02271032, "epoch": 0.2513076414357001, "flos": 27016100409600.0, "grad_norm": 1.9798108378779862, "language_loss": 0.78540421, "learning_rate": 3.508015746257919e-06, "loss": 0.80780709, "num_input_tokens_seen": 44631630, "step": 2090, "time_per_iteration": 2.657365560531616 }, { "auxiliary_loss_clip": 0.01158348, "auxiliary_loss_mlp": 0.01032416, "balance_loss_clip": 1.05664396, "balance_loss_mlp": 1.02371979, "epoch": 0.2514278843263392, "flos": 19463619882240.0, "grad_norm": 1.873881805772497, "language_loss": 0.83334982, "learning_rate": 3.5075039523814518e-06, "loss": 0.85525751, "num_input_tokens_seen": 44650820, "step": 2091, "time_per_iteration": 2.6681549549102783 }, { "auxiliary_loss_clip": 0.01192224, "auxiliary_loss_mlp": 0.01029573, "balance_loss_clip": 1.05778861, "balance_loss_mlp": 1.01951742, "epoch": 0.2515481272169783, "flos": 16866092885760.0, "grad_norm": 2.4432539822270445, "language_loss": 0.82084668, "learning_rate": 3.506991929817834e-06, "loss": 0.84306461, "num_input_tokens_seen": 44667540, "step": 2092, "time_per_iteration": 2.59299898147583 }, { "auxiliary_loss_clip": 0.01206182, "auxiliary_loss_mlp": 0.01023708, "balance_loss_clip": 1.06570888, "balance_loss_mlp": 1.01526833, "epoch": 0.2516683701076174, "flos": 23732464752000.0, "grad_norm": 1.7532038232325684, "language_loss": 0.82561302, "learning_rate": 3.506479678644738e-06, "loss": 0.84791195, "num_input_tokens_seen": 44687935, "step": 2093, "time_per_iteration": 2.5895164012908936 }, { "auxiliary_loss_clip": 0.01129677, "auxiliary_loss_mlp": 0.01032833, "balance_loss_clip": 1.05237484, "balance_loss_mlp": 1.02326024, "epoch": 0.2517886129982565, "flos": 27635954434560.0, "grad_norm": 2.4205275699616946, "language_loss": 0.74083751, "learning_rate": 3.505967198939873e-06, "loss": 0.76246262, "num_input_tokens_seen": 44704975, "step": 2094, "time_per_iteration": 3.6664798259735107 }, { "auxiliary_loss_clip": 0.01169147, "auxiliary_loss_mlp": 0.0103254, "balance_loss_clip": 1.05330825, "balance_loss_mlp": 1.02385592, "epoch": 0.25190885588889556, "flos": 38104596529920.0, "grad_norm": 2.5369905989435195, "language_loss": 0.78410697, "learning_rate": 3.5054544907809813e-06, "loss": 0.80612385, "num_input_tokens_seen": 44725475, "step": 2095, "time_per_iteration": 3.6078081130981445 }, { "auxiliary_loss_clip": 0.01169738, "auxiliary_loss_mlp": 0.00712958, "balance_loss_clip": 1.0582521, "balance_loss_mlp": 1.00089908, "epoch": 0.25202909877953467, "flos": 22269894768000.0, "grad_norm": 2.0573003616172487, "language_loss": 0.80579472, "learning_rate": 3.50494155424584e-06, "loss": 0.82462168, "num_input_tokens_seen": 44744380, "step": 2096, "time_per_iteration": 3.5386009216308594 }, { "auxiliary_loss_clip": 0.01191042, "auxiliary_loss_mlp": 0.01029621, "balance_loss_clip": 1.05947995, "balance_loss_mlp": 1.02075148, "epoch": 0.2521493416701738, "flos": 21761759018880.0, "grad_norm": 1.6540318243715277, "language_loss": 0.8342663, "learning_rate": 3.504428389412262e-06, "loss": 0.85647285, "num_input_tokens_seen": 44765190, "step": 2097, "time_per_iteration": 2.632359743118286 }, { "auxiliary_loss_clip": 0.01185562, "auxiliary_loss_mlp": 0.01028973, "balance_loss_clip": 1.05809212, "balance_loss_mlp": 1.0192399, "epoch": 0.25226958456081283, "flos": 27746738956800.0, "grad_norm": 6.690617276527134, "language_loss": 0.73418725, "learning_rate": 3.5039149963580927e-06, "loss": 0.75633264, "num_input_tokens_seen": 44785210, "step": 2098, "time_per_iteration": 2.679511308670044 }, { "auxiliary_loss_clip": 0.01167374, "auxiliary_loss_mlp": 0.01035083, "balance_loss_clip": 1.05739379, "balance_loss_mlp": 1.02592134, "epoch": 0.25238982745145194, "flos": 30732171903360.0, "grad_norm": 2.1179487760267577, "language_loss": 0.70400441, "learning_rate": 3.503401375161215e-06, "loss": 0.72602904, "num_input_tokens_seen": 44804955, "step": 2099, "time_per_iteration": 3.595360279083252 }, { "auxiliary_loss_clip": 0.0120314, "auxiliary_loss_mlp": 0.0102977, "balance_loss_clip": 1.06079817, "balance_loss_mlp": 1.02091861, "epoch": 0.252510070342091, "flos": 20266331068800.0, "grad_norm": 2.075145126544793, "language_loss": 0.83553767, "learning_rate": 3.502887525899544e-06, "loss": 0.85786676, "num_input_tokens_seen": 44823935, "step": 2100, "time_per_iteration": 2.606438398361206 }, { "auxiliary_loss_clip": 0.01171332, "auxiliary_loss_mlp": 0.01031541, "balance_loss_clip": 1.05650067, "balance_loss_mlp": 1.02219486, "epoch": 0.2526303132327301, "flos": 22747399194240.0, "grad_norm": 1.6917327807633948, "language_loss": 0.83147299, "learning_rate": 3.50237344865103e-06, "loss": 0.85350168, "num_input_tokens_seen": 44844935, "step": 2101, "time_per_iteration": 2.67832612991333 }, { "auxiliary_loss_clip": 0.01206531, "auxiliary_loss_mlp": 0.01034287, "balance_loss_clip": 1.06146061, "balance_loss_mlp": 1.02500045, "epoch": 0.2527505561233692, "flos": 30263466309120.0, "grad_norm": 2.234170355244803, "language_loss": 0.76359236, "learning_rate": 3.501859143493658e-06, "loss": 0.78600061, "num_input_tokens_seen": 44865565, "step": 2102, "time_per_iteration": 2.7168760299682617 }, { "auxiliary_loss_clip": 0.01170809, "auxiliary_loss_mlp": 0.01023426, "balance_loss_clip": 1.07782888, "balance_loss_mlp": 1.01891947, "epoch": 0.2528707990140083, "flos": 58492917164160.0, "grad_norm": 0.9366284503381362, "language_loss": 0.60601866, "learning_rate": 3.5013446105054488e-06, "loss": 0.62796092, "num_input_tokens_seen": 44918485, "step": 2103, "time_per_iteration": 2.9735400676727295 }, { "auxiliary_loss_clip": 0.01139784, "auxiliary_loss_mlp": 0.01030562, "balance_loss_clip": 1.051247, "balance_loss_mlp": 1.02099562, "epoch": 0.2529910419046474, "flos": 24645134448000.0, "grad_norm": 2.3131499296836435, "language_loss": 0.75234056, "learning_rate": 3.5008298497644555e-06, "loss": 0.77404404, "num_input_tokens_seen": 44937530, "step": 2104, "time_per_iteration": 2.680241107940674 }, { "auxiliary_loss_clip": 0.01158656, "auxiliary_loss_mlp": 0.0103133, "balance_loss_clip": 1.05521917, "balance_loss_mlp": 1.02221656, "epoch": 0.2531112847952865, "flos": 23842135952640.0, "grad_norm": 2.1292287881023446, "language_loss": 0.87974232, "learning_rate": 3.500314861348767e-06, "loss": 0.9016422, "num_input_tokens_seen": 44958165, "step": 2105, "time_per_iteration": 2.7378058433532715 }, { "auxiliary_loss_clip": 0.01148446, "auxiliary_loss_mlp": 0.01032427, "balance_loss_clip": 1.05665874, "balance_loss_mlp": 1.02353382, "epoch": 0.25323152768592555, "flos": 16143822207360.0, "grad_norm": 2.537789954447323, "language_loss": 0.77488101, "learning_rate": 3.499799645336507e-06, "loss": 0.79668975, "num_input_tokens_seen": 44975060, "step": 2106, "time_per_iteration": 2.6319222450256348 }, { "auxiliary_loss_clip": 0.01190374, "auxiliary_loss_mlp": 0.01029492, "balance_loss_clip": 1.06044853, "balance_loss_mlp": 1.02099824, "epoch": 0.25335177057656466, "flos": 28405161210240.0, "grad_norm": 1.4739396267199643, "language_loss": 0.86806619, "learning_rate": 3.4992842018058336e-06, "loss": 0.89026487, "num_input_tokens_seen": 44997960, "step": 2107, "time_per_iteration": 2.73313045501709 }, { "auxiliary_loss_clip": 0.01158801, "auxiliary_loss_mlp": 0.01028559, "balance_loss_clip": 1.05376959, "balance_loss_mlp": 1.01963067, "epoch": 0.25347201346720377, "flos": 18799666934400.0, "grad_norm": 2.426450589315638, "language_loss": 0.88451356, "learning_rate": 3.4987685308349384e-06, "loss": 0.90638721, "num_input_tokens_seen": 45015690, "step": 2108, "time_per_iteration": 2.656461477279663 }, { "auxiliary_loss_clip": 0.01151642, "auxiliary_loss_mlp": 0.01029263, "balance_loss_clip": 1.05081773, "balance_loss_mlp": 1.02028072, "epoch": 0.2535922563578428, "flos": 15815490963840.0, "grad_norm": 2.0592232684489082, "language_loss": 0.61537826, "learning_rate": 3.4982526325020497e-06, "loss": 0.63718736, "num_input_tokens_seen": 45032660, "step": 2109, "time_per_iteration": 2.7027764320373535 }, { "auxiliary_loss_clip": 0.01173517, "auxiliary_loss_mlp": 0.01029566, "balance_loss_clip": 1.05695665, "balance_loss_mlp": 1.02029765, "epoch": 0.25371249924848194, "flos": 16318922031360.0, "grad_norm": 2.323056546654434, "language_loss": 0.81859732, "learning_rate": 3.4977365068854273e-06, "loss": 0.84062821, "num_input_tokens_seen": 45048280, "step": 2110, "time_per_iteration": 2.575968027114868 }, { "auxiliary_loss_clip": 0.01164567, "auxiliary_loss_mlp": 0.01027921, "balance_loss_clip": 1.05557525, "balance_loss_mlp": 1.01829481, "epoch": 0.25383274213912105, "flos": 21761615364480.0, "grad_norm": 1.7635209218168018, "language_loss": 0.73568916, "learning_rate": 3.4972201540633676e-06, "loss": 0.75761408, "num_input_tokens_seen": 45067635, "step": 2111, "time_per_iteration": 2.6977698802948 }, { "auxiliary_loss_clip": 0.01164574, "auxiliary_loss_mlp": 0.01040375, "balance_loss_clip": 1.05808198, "balance_loss_mlp": 1.03153551, "epoch": 0.2539529850297601, "flos": 21396870708480.0, "grad_norm": 2.2546466491750894, "language_loss": 0.85090148, "learning_rate": 3.4967035741142008e-06, "loss": 0.87295097, "num_input_tokens_seen": 45086455, "step": 2112, "time_per_iteration": 2.62650465965271 }, { "auxiliary_loss_clip": 0.01172278, "auxiliary_loss_mlp": 0.01024812, "balance_loss_clip": 1.06661248, "balance_loss_mlp": 1.0162828, "epoch": 0.2540732279203992, "flos": 25228467319680.0, "grad_norm": 2.0783529433789254, "language_loss": 0.81857109, "learning_rate": 3.4961867671162917e-06, "loss": 0.84054202, "num_input_tokens_seen": 45106385, "step": 2113, "time_per_iteration": 2.676030158996582 }, { "auxiliary_loss_clip": 0.0120825, "auxiliary_loss_mlp": 0.01029262, "balance_loss_clip": 1.06210434, "balance_loss_mlp": 1.01967716, "epoch": 0.2541934708110383, "flos": 19427386037760.0, "grad_norm": 2.594709477645065, "language_loss": 0.77581215, "learning_rate": 3.4956697331480402e-06, "loss": 0.79818726, "num_input_tokens_seen": 45124955, "step": 2114, "time_per_iteration": 2.5699195861816406 }, { "auxiliary_loss_clip": 0.01160015, "auxiliary_loss_mlp": 0.01030612, "balance_loss_clip": 1.05375266, "balance_loss_mlp": 1.02129579, "epoch": 0.2543137137016774, "flos": 23949436855680.0, "grad_norm": 1.5027340536561473, "language_loss": 0.8005234, "learning_rate": 3.495152472287879e-06, "loss": 0.82242966, "num_input_tokens_seen": 45145665, "step": 2115, "time_per_iteration": 2.725971221923828 }, { "auxiliary_loss_clip": 0.01161284, "auxiliary_loss_mlp": 0.01026982, "balance_loss_clip": 1.05957115, "balance_loss_mlp": 1.01727211, "epoch": 0.2544339565923165, "flos": 25593283802880.0, "grad_norm": 1.8485011585766242, "language_loss": 0.73918313, "learning_rate": 3.4946349846142766e-06, "loss": 0.76106572, "num_input_tokens_seen": 45164805, "step": 2116, "time_per_iteration": 2.694375991821289 }, { "auxiliary_loss_clip": 0.0120397, "auxiliary_loss_mlp": 0.01030521, "balance_loss_clip": 1.05952942, "balance_loss_mlp": 1.02134204, "epoch": 0.25455419948295555, "flos": 21689470897920.0, "grad_norm": 2.374965719015931, "language_loss": 0.75805175, "learning_rate": 3.4941172702057353e-06, "loss": 0.7803967, "num_input_tokens_seen": 45184865, "step": 2117, "time_per_iteration": 2.6130709648132324 }, { "auxiliary_loss_clip": 0.01173096, "auxiliary_loss_mlp": 0.01034949, "balance_loss_clip": 1.05986726, "balance_loss_mlp": 1.0253942, "epoch": 0.25467444237359466, "flos": 26250341339520.0, "grad_norm": 2.4202355403951827, "language_loss": 0.80810201, "learning_rate": 3.4935993291407924e-06, "loss": 0.83018243, "num_input_tokens_seen": 45203690, "step": 2118, "time_per_iteration": 2.708625555038452 }, { "auxiliary_loss_clip": 0.01170518, "auxiliary_loss_mlp": 0.01029321, "balance_loss_clip": 1.0584383, "balance_loss_mlp": 1.01972437, "epoch": 0.25479468526423377, "flos": 26979686997120.0, "grad_norm": 2.220786693828613, "language_loss": 0.70895958, "learning_rate": 3.4930811614980183e-06, "loss": 0.73095793, "num_input_tokens_seen": 45225385, "step": 2119, "time_per_iteration": 2.6669530868530273 }, { "auxiliary_loss_clip": 0.0118536, "auxiliary_loss_mlp": 0.01036852, "balance_loss_clip": 1.06028247, "balance_loss_mlp": 1.02762508, "epoch": 0.2549149281548728, "flos": 23475811098240.0, "grad_norm": 1.8067776371665383, "language_loss": 0.78526211, "learning_rate": 3.4925627673560198e-06, "loss": 0.80748427, "num_input_tokens_seen": 45246045, "step": 2120, "time_per_iteration": 2.70405650138855 }, { "auxiliary_loss_clip": 0.01163326, "auxiliary_loss_mlp": 0.01036532, "balance_loss_clip": 1.06084394, "balance_loss_mlp": 1.02725744, "epoch": 0.25503517104551193, "flos": 25812302981760.0, "grad_norm": 2.2433383973474115, "language_loss": 0.88601112, "learning_rate": 3.4920441467934357e-06, "loss": 0.90800965, "num_input_tokens_seen": 45266560, "step": 2121, "time_per_iteration": 4.8361804485321045 }, { "auxiliary_loss_clip": 0.01148421, "auxiliary_loss_mlp": 0.01030518, "balance_loss_clip": 1.05470777, "balance_loss_mlp": 1.02084446, "epoch": 0.25515541393615104, "flos": 26645106787200.0, "grad_norm": 1.8660008519180353, "language_loss": 0.83028626, "learning_rate": 3.491525299888941e-06, "loss": 0.8520757, "num_input_tokens_seen": 45285405, "step": 2122, "time_per_iteration": 3.5678610801696777 }, { "auxiliary_loss_clip": 0.01134605, "auxiliary_loss_mlp": 0.00704623, "balance_loss_clip": 1.0774374, "balance_loss_mlp": 1.00017858, "epoch": 0.2552756568267901, "flos": 65955945847680.0, "grad_norm": 0.8842434027499138, "language_loss": 0.62614119, "learning_rate": 3.491006226721244e-06, "loss": 0.64453346, "num_input_tokens_seen": 45349615, "step": 2123, "time_per_iteration": 3.3191134929656982 }, { "auxiliary_loss_clip": 0.01179273, "auxiliary_loss_mlp": 0.00713137, "balance_loss_clip": 1.06283212, "balance_loss_mlp": 1.0007925, "epoch": 0.2553958997174292, "flos": 17931096161280.0, "grad_norm": 2.154436750115468, "language_loss": 0.78072548, "learning_rate": 3.4904869273690882e-06, "loss": 0.7996496, "num_input_tokens_seen": 45367505, "step": 2124, "time_per_iteration": 2.8988771438598633 }, { "auxiliary_loss_clip": 0.01195466, "auxiliary_loss_mlp": 0.01035075, "balance_loss_clip": 1.0619483, "balance_loss_mlp": 1.02574682, "epoch": 0.2555161426080683, "flos": 23367791923200.0, "grad_norm": 1.985625598280465, "language_loss": 0.89123654, "learning_rate": 3.489967401911251e-06, "loss": 0.91354191, "num_input_tokens_seen": 45386805, "step": 2125, "time_per_iteration": 3.5052034854888916 }, { "auxiliary_loss_clip": 0.0121008, "auxiliary_loss_mlp": 0.01035086, "balance_loss_clip": 1.06306148, "balance_loss_mlp": 1.02473223, "epoch": 0.2556363854987074, "flos": 40625130723840.0, "grad_norm": 1.6802281322700883, "language_loss": 0.69377929, "learning_rate": 3.4894476504265428e-06, "loss": 0.71623099, "num_input_tokens_seen": 45411045, "step": 2126, "time_per_iteration": 2.8004469871520996 }, { "auxiliary_loss_clip": 0.01136987, "auxiliary_loss_mlp": 0.01013101, "balance_loss_clip": 1.05792713, "balance_loss_mlp": 1.00957286, "epoch": 0.2557566283893465, "flos": 68019443389440.0, "grad_norm": 0.7456539162276875, "language_loss": 0.54499876, "learning_rate": 3.4889276729938104e-06, "loss": 0.56649959, "num_input_tokens_seen": 45469575, "step": 2127, "time_per_iteration": 3.1214096546173096 }, { "auxiliary_loss_clip": 0.01172791, "auxiliary_loss_mlp": 0.01028367, "balance_loss_clip": 1.0611248, "balance_loss_mlp": 1.01879442, "epoch": 0.2558768712799856, "flos": 22635645004800.0, "grad_norm": 2.77771262505198, "language_loss": 0.80650991, "learning_rate": 3.488407469691934e-06, "loss": 0.82852143, "num_input_tokens_seen": 45490270, "step": 2128, "time_per_iteration": 2.663843870162964 }, { "auxiliary_loss_clip": 0.0116832, "auxiliary_loss_mlp": 0.01034743, "balance_loss_clip": 1.05389905, "balance_loss_mlp": 1.02481246, "epoch": 0.25599711417062465, "flos": 26396354125440.0, "grad_norm": 1.9906822793442795, "language_loss": 0.80703014, "learning_rate": 3.487887040599828e-06, "loss": 0.82906079, "num_input_tokens_seen": 45510070, "step": 2129, "time_per_iteration": 2.6703741550445557 }, { "auxiliary_loss_clip": 0.01210843, "auxiliary_loss_mlp": 0.01033134, "balance_loss_clip": 1.06494975, "balance_loss_mlp": 1.0233115, "epoch": 0.25611735706126376, "flos": 22852042490880.0, "grad_norm": 2.2579901845590173, "language_loss": 0.75945765, "learning_rate": 3.4873663857964407e-06, "loss": 0.78189743, "num_input_tokens_seen": 45527285, "step": 2130, "time_per_iteration": 2.602631092071533 }, { "auxiliary_loss_clip": 0.01148283, "auxiliary_loss_mlp": 0.01030687, "balance_loss_clip": 1.06127572, "balance_loss_mlp": 1.02116787, "epoch": 0.2562375999519028, "flos": 23367863750400.0, "grad_norm": 2.44652705344337, "language_loss": 0.66531193, "learning_rate": 3.4868455053607556e-06, "loss": 0.6871016, "num_input_tokens_seen": 45546900, "step": 2131, "time_per_iteration": 2.7470836639404297 }, { "auxiliary_loss_clip": 0.01194549, "auxiliary_loss_mlp": 0.01031043, "balance_loss_clip": 1.06002343, "balance_loss_mlp": 1.02098751, "epoch": 0.2563578428425419, "flos": 22856962654080.0, "grad_norm": 1.908146398515167, "language_loss": 0.70859087, "learning_rate": 3.486324399371789e-06, "loss": 0.73084682, "num_input_tokens_seen": 45566200, "step": 2132, "time_per_iteration": 2.5940890312194824 }, { "auxiliary_loss_clip": 0.01156691, "auxiliary_loss_mlp": 0.01029388, "balance_loss_clip": 1.0581392, "balance_loss_mlp": 1.02048314, "epoch": 0.25647808573318104, "flos": 21653883498240.0, "grad_norm": 1.8883670118952367, "language_loss": 0.78342015, "learning_rate": 3.485803067908593e-06, "loss": 0.80528098, "num_input_tokens_seen": 45585710, "step": 2133, "time_per_iteration": 2.7188150882720947 }, { "auxiliary_loss_clip": 0.010991, "auxiliary_loss_mlp": 0.01028302, "balance_loss_clip": 1.04524732, "balance_loss_mlp": 1.01875329, "epoch": 0.2565983286238201, "flos": 33730569659520.0, "grad_norm": 2.116351247509541, "language_loss": 0.79898727, "learning_rate": 3.485281511050253e-06, "loss": 0.82026124, "num_input_tokens_seen": 45607845, "step": 2134, "time_per_iteration": 2.8644235134124756 }, { "auxiliary_loss_clip": 0.01194462, "auxiliary_loss_mlp": 0.01030028, "balance_loss_clip": 1.06171131, "balance_loss_mlp": 1.02039611, "epoch": 0.2567185715144592, "flos": 16216002587520.0, "grad_norm": 2.915051744315488, "language_loss": 0.90005195, "learning_rate": 3.484759728875889e-06, "loss": 0.92229688, "num_input_tokens_seen": 45623210, "step": 2135, "time_per_iteration": 2.596928119659424 }, { "auxiliary_loss_clip": 0.01132242, "auxiliary_loss_mlp": 0.01026892, "balance_loss_clip": 1.05584359, "balance_loss_mlp": 1.01759386, "epoch": 0.2568388144050983, "flos": 17458475984640.0, "grad_norm": 1.8152263855653623, "language_loss": 0.80822295, "learning_rate": 3.4842377214646543e-06, "loss": 0.82981431, "num_input_tokens_seen": 45641505, "step": 2136, "time_per_iteration": 2.6773431301116943 }, { "auxiliary_loss_clip": 0.01206807, "auxiliary_loss_mlp": 0.01027047, "balance_loss_clip": 1.06283426, "balance_loss_mlp": 1.01789176, "epoch": 0.25695905729573737, "flos": 20887442069760.0, "grad_norm": 1.9211570398036761, "language_loss": 0.66727763, "learning_rate": 3.483715488895737e-06, "loss": 0.6896162, "num_input_tokens_seen": 45661835, "step": 2137, "time_per_iteration": 2.6310813426971436 }, { "auxiliary_loss_clip": 0.01140361, "auxiliary_loss_mlp": 0.01029886, "balance_loss_clip": 1.05391026, "balance_loss_mlp": 1.02044499, "epoch": 0.2570793001863765, "flos": 24717278914560.0, "grad_norm": 1.696495395833548, "language_loss": 0.78588092, "learning_rate": 3.48319303124836e-06, "loss": 0.80758333, "num_input_tokens_seen": 45682215, "step": 2138, "time_per_iteration": 2.7397677898406982 }, { "auxiliary_loss_clip": 0.01175992, "auxiliary_loss_mlp": 0.01036319, "balance_loss_clip": 1.0644443, "balance_loss_mlp": 1.0262996, "epoch": 0.2571995430770156, "flos": 26906896085760.0, "grad_norm": 2.349972713931593, "language_loss": 0.6712079, "learning_rate": 3.4826703486017798e-06, "loss": 0.69333112, "num_input_tokens_seen": 45701840, "step": 2139, "time_per_iteration": 2.681812047958374 }, { "auxiliary_loss_clip": 0.01187394, "auxiliary_loss_mlp": 0.01029394, "balance_loss_clip": 1.05899835, "balance_loss_mlp": 1.02012539, "epoch": 0.25731978596765465, "flos": 19792561656960.0, "grad_norm": 2.0301512924076794, "language_loss": 0.76748645, "learning_rate": 3.4821474410352867e-06, "loss": 0.78965437, "num_input_tokens_seen": 45720500, "step": 2140, "time_per_iteration": 2.5980732440948486 }, { "auxiliary_loss_clip": 0.01093372, "auxiliary_loss_mlp": 0.01016495, "balance_loss_clip": 1.05322981, "balance_loss_mlp": 1.01408672, "epoch": 0.25744002885829376, "flos": 70564970471040.0, "grad_norm": 0.9070789343078768, "language_loss": 0.62660098, "learning_rate": 3.481624308628205e-06, "loss": 0.64769971, "num_input_tokens_seen": 45781870, "step": 2141, "time_per_iteration": 3.3804643154144287 }, { "auxiliary_loss_clip": 0.01172368, "auxiliary_loss_mlp": 0.01032935, "balance_loss_clip": 1.05736446, "balance_loss_mlp": 1.02358854, "epoch": 0.25756027174893287, "flos": 18038181582720.0, "grad_norm": 3.539501081557183, "language_loss": 1.00592947, "learning_rate": 3.481100951459893e-06, "loss": 1.02798247, "num_input_tokens_seen": 45794890, "step": 2142, "time_per_iteration": 2.647310256958008 }, { "auxiliary_loss_clip": 0.01188579, "auxiliary_loss_mlp": 0.01031807, "balance_loss_clip": 1.06202197, "balance_loss_mlp": 1.02269983, "epoch": 0.2576805146395719, "flos": 22674069578880.0, "grad_norm": 1.8109999923515354, "language_loss": 0.78724712, "learning_rate": 3.4805773696097453e-06, "loss": 0.80945098, "num_input_tokens_seen": 45815780, "step": 2143, "time_per_iteration": 2.613351345062256 }, { "auxiliary_loss_clip": 0.01171726, "auxiliary_loss_mlp": 0.01037106, "balance_loss_clip": 1.06208789, "balance_loss_mlp": 1.02757561, "epoch": 0.25780075753021103, "flos": 16472225278080.0, "grad_norm": 2.146061486559152, "language_loss": 0.87425369, "learning_rate": 3.4800535631571874e-06, "loss": 0.89634204, "num_input_tokens_seen": 45831310, "step": 2144, "time_per_iteration": 2.6203064918518066 }, { "auxiliary_loss_clip": 0.01179254, "auxiliary_loss_mlp": 0.01030635, "balance_loss_clip": 1.06036615, "balance_loss_mlp": 1.02108598, "epoch": 0.25792100042085014, "flos": 22820297846400.0, "grad_norm": 3.566389924740293, "language_loss": 0.76467431, "learning_rate": 3.4795295321816804e-06, "loss": 0.7867732, "num_input_tokens_seen": 45850135, "step": 2145, "time_per_iteration": 2.635857343673706 }, { "auxiliary_loss_clip": 0.01160928, "auxiliary_loss_mlp": 0.01025925, "balance_loss_clip": 1.05473113, "balance_loss_mlp": 1.01706815, "epoch": 0.2580412433114892, "flos": 18697286194560.0, "grad_norm": 4.0069437055799115, "language_loss": 0.90944421, "learning_rate": 3.47900527676272e-06, "loss": 0.9313128, "num_input_tokens_seen": 45868470, "step": 2146, "time_per_iteration": 3.4992847442626953 }, { "auxiliary_loss_clip": 0.01206279, "auxiliary_loss_mlp": 0.0103393, "balance_loss_clip": 1.0638026, "balance_loss_mlp": 1.02439284, "epoch": 0.2581614862021283, "flos": 14283146810880.0, "grad_norm": 1.9939932567979919, "language_loss": 0.88650942, "learning_rate": 3.478480796979835e-06, "loss": 0.90891147, "num_input_tokens_seen": 45886355, "step": 2147, "time_per_iteration": 3.5912013053894043 }, { "auxiliary_loss_clip": 0.01168852, "auxiliary_loss_mlp": 0.01030735, "balance_loss_clip": 1.05731714, "balance_loss_mlp": 1.02135336, "epoch": 0.25828172909276736, "flos": 29498281856640.0, "grad_norm": 1.8258626196897694, "language_loss": 0.77933002, "learning_rate": 3.4779560929125894e-06, "loss": 0.80132592, "num_input_tokens_seen": 45907900, "step": 2148, "time_per_iteration": 3.5402214527130127 }, { "auxiliary_loss_clip": 0.01114738, "auxiliary_loss_mlp": 0.01013371, "balance_loss_clip": 1.06309617, "balance_loss_mlp": 1.00919867, "epoch": 0.2584019719834065, "flos": 67114387376640.0, "grad_norm": 0.7000779582054517, "language_loss": 0.5703482, "learning_rate": 3.4774311646405783e-06, "loss": 0.59162933, "num_input_tokens_seen": 45977805, "step": 2149, "time_per_iteration": 3.336500406265259 }, { "auxiliary_loss_clip": 0.0114523, "auxiliary_loss_mlp": 0.01029854, "balance_loss_clip": 1.05192161, "balance_loss_mlp": 1.02076411, "epoch": 0.2585222148740456, "flos": 22893555634560.0, "grad_norm": 1.940593254787867, "language_loss": 0.83269477, "learning_rate": 3.476906012243435e-06, "loss": 0.85444558, "num_input_tokens_seen": 45996715, "step": 2150, "time_per_iteration": 2.719296932220459 }, { "auxiliary_loss_clip": 0.01178914, "auxiliary_loss_mlp": 0.01027015, "balance_loss_clip": 1.05740452, "balance_loss_mlp": 1.01779461, "epoch": 0.25864245776468464, "flos": 28909202808960.0, "grad_norm": 1.5807717491796227, "language_loss": 0.81081355, "learning_rate": 3.476380635800824e-06, "loss": 0.83287287, "num_input_tokens_seen": 46017915, "step": 2151, "time_per_iteration": 3.667198419570923 }, { "auxiliary_loss_clip": 0.01169941, "auxiliary_loss_mlp": 0.01028965, "balance_loss_clip": 1.05741727, "balance_loss_mlp": 1.01942825, "epoch": 0.25876270065532375, "flos": 14793185980800.0, "grad_norm": 2.236797308746971, "language_loss": 0.86516613, "learning_rate": 3.475855035392444e-06, "loss": 0.88715518, "num_input_tokens_seen": 46033235, "step": 2152, "time_per_iteration": 2.6530649662017822 }, { "auxiliary_loss_clip": 0.01119084, "auxiliary_loss_mlp": 0.01027035, "balance_loss_clip": 1.05091691, "balance_loss_mlp": 1.01718879, "epoch": 0.25888294354596286, "flos": 60467821810560.0, "grad_norm": 2.4435335499848425, "language_loss": 0.71633047, "learning_rate": 3.475329211098029e-06, "loss": 0.73779166, "num_input_tokens_seen": 46056390, "step": 2153, "time_per_iteration": 3.05550217628479 }, { "auxiliary_loss_clip": 0.01142416, "auxiliary_loss_mlp": 0.01028494, "balance_loss_clip": 1.05649948, "balance_loss_mlp": 1.01902843, "epoch": 0.2590031864366019, "flos": 27851166771840.0, "grad_norm": 1.7099576111962906, "language_loss": 0.82380033, "learning_rate": 3.4748031629973453e-06, "loss": 0.84550947, "num_input_tokens_seen": 46077120, "step": 2154, "time_per_iteration": 2.7572054862976074 }, { "auxiliary_loss_clip": 0.01081763, "auxiliary_loss_mlp": 0.01007546, "balance_loss_clip": 1.0524981, "balance_loss_mlp": 1.00423241, "epoch": 0.25912342932724103, "flos": 62422444206720.0, "grad_norm": 0.904968947948901, "language_loss": 0.56527007, "learning_rate": 3.4742768911701944e-06, "loss": 0.58616316, "num_input_tokens_seen": 46139815, "step": 2155, "time_per_iteration": 3.4630377292633057 }, { "auxiliary_loss_clip": 0.01192668, "auxiliary_loss_mlp": 0.01038969, "balance_loss_clip": 1.0613184, "balance_loss_mlp": 1.02889585, "epoch": 0.25924367221788014, "flos": 12378839368320.0, "grad_norm": 3.1894586973955574, "language_loss": 0.70770186, "learning_rate": 3.4737503956964113e-06, "loss": 0.73001826, "num_input_tokens_seen": 46152120, "step": 2156, "time_per_iteration": 2.7873024940490723 }, { "auxiliary_loss_clip": 0.01162321, "auxiliary_loss_mlp": 0.01028594, "balance_loss_clip": 1.05475998, "balance_loss_mlp": 1.01846123, "epoch": 0.2593639151085192, "flos": 14575208296320.0, "grad_norm": 2.302782213187179, "language_loss": 0.67736328, "learning_rate": 3.473223676655865e-06, "loss": 0.69927245, "num_input_tokens_seen": 46170120, "step": 2157, "time_per_iteration": 2.6836321353912354 }, { "auxiliary_loss_clip": 0.01162163, "auxiliary_loss_mlp": 0.01036489, "balance_loss_clip": 1.05278575, "balance_loss_mlp": 1.02641559, "epoch": 0.2594841579991583, "flos": 15230937029760.0, "grad_norm": 1.9156690505485396, "language_loss": 0.7957601, "learning_rate": 3.472696734128459e-06, "loss": 0.81774664, "num_input_tokens_seen": 46187985, "step": 2158, "time_per_iteration": 2.648737907409668 }, { "auxiliary_loss_clip": 0.01185787, "auxiliary_loss_mlp": 0.01031996, "balance_loss_clip": 1.05781317, "balance_loss_mlp": 1.02280474, "epoch": 0.2596044008897974, "flos": 23623583650560.0, "grad_norm": 2.009657993179718, "language_loss": 0.75474155, "learning_rate": 3.4721695681941286e-06, "loss": 0.77691936, "num_input_tokens_seen": 46207025, "step": 2159, "time_per_iteration": 2.6754257678985596 }, { "auxiliary_loss_clip": 0.01167596, "auxiliary_loss_mlp": 0.00712934, "balance_loss_clip": 1.05465233, "balance_loss_mlp": 1.00096488, "epoch": 0.25972464378043647, "flos": 13772281628160.0, "grad_norm": 2.554674314639196, "language_loss": 0.82200587, "learning_rate": 3.471642178932845e-06, "loss": 0.84081113, "num_input_tokens_seen": 46225670, "step": 2160, "time_per_iteration": 2.6479785442352295 }, { "auxiliary_loss_clip": 0.01171582, "auxiliary_loss_mlp": 0.01032788, "balance_loss_clip": 1.05504704, "balance_loss_mlp": 1.02438378, "epoch": 0.2598448866710756, "flos": 19573578391680.0, "grad_norm": 2.4278039867802135, "language_loss": 0.8994689, "learning_rate": 3.471114566424613e-06, "loss": 0.9215126, "num_input_tokens_seen": 46244130, "step": 2161, "time_per_iteration": 2.7016632556915283 }, { "auxiliary_loss_clip": 0.01166357, "auxiliary_loss_mlp": 0.01030108, "balance_loss_clip": 1.05735373, "balance_loss_mlp": 1.02135825, "epoch": 0.25996512956171464, "flos": 21653237053440.0, "grad_norm": 2.137771641321994, "language_loss": 0.75553989, "learning_rate": 3.4705867307494715e-06, "loss": 0.77750456, "num_input_tokens_seen": 46263200, "step": 2162, "time_per_iteration": 2.6230051517486572 }, { "auxiliary_loss_clip": 0.01189518, "auxiliary_loss_mlp": 0.01032336, "balance_loss_clip": 1.05878162, "balance_loss_mlp": 1.02325249, "epoch": 0.26008537245235375, "flos": 18223480869120.0, "grad_norm": 2.4529271859541097, "language_loss": 0.84715158, "learning_rate": 3.470058671987492e-06, "loss": 0.8693701, "num_input_tokens_seen": 46281465, "step": 2163, "time_per_iteration": 2.624484062194824 }, { "auxiliary_loss_clip": 0.01192094, "auxiliary_loss_mlp": 0.01032178, "balance_loss_clip": 1.05806291, "balance_loss_mlp": 1.02267146, "epoch": 0.26020561534299286, "flos": 24645385843200.0, "grad_norm": 2.0639818461616573, "language_loss": 0.84567451, "learning_rate": 3.4695303902187805e-06, "loss": 0.8679173, "num_input_tokens_seen": 46301020, "step": 2164, "time_per_iteration": 2.620389223098755 }, { "auxiliary_loss_clip": 0.01147415, "auxiliary_loss_mlp": 0.01036299, "balance_loss_clip": 1.05044913, "balance_loss_mlp": 1.02636909, "epoch": 0.2603258582336319, "flos": 25773662926080.0, "grad_norm": 1.9816134019887088, "language_loss": 0.78978997, "learning_rate": 3.469001885523478e-06, "loss": 0.81162715, "num_input_tokens_seen": 46321740, "step": 2165, "time_per_iteration": 2.7614645957946777 }, { "auxiliary_loss_clip": 0.01202255, "auxiliary_loss_mlp": 0.01027491, "balance_loss_clip": 1.05896795, "balance_loss_mlp": 1.01838899, "epoch": 0.260446101124271, "flos": 28766314506240.0, "grad_norm": 1.6412264151851137, "language_loss": 0.80966592, "learning_rate": 3.4684731579817568e-06, "loss": 0.83196342, "num_input_tokens_seen": 46342730, "step": 2166, "time_per_iteration": 2.6324596405029297 }, { "auxiliary_loss_clip": 0.01117742, "auxiliary_loss_mlp": 0.01032785, "balance_loss_clip": 1.05181503, "balance_loss_mlp": 1.02354026, "epoch": 0.26056634401491013, "flos": 25666757072640.0, "grad_norm": 1.606904871926908, "language_loss": 0.7661112, "learning_rate": 3.4679442076738247e-06, "loss": 0.78761649, "num_input_tokens_seen": 46362445, "step": 2167, "time_per_iteration": 2.7765426635742188 }, { "auxiliary_loss_clip": 0.01206014, "auxiliary_loss_mlp": 0.01029467, "balance_loss_clip": 1.06007707, "balance_loss_mlp": 1.019907, "epoch": 0.2606865869055492, "flos": 27052765217280.0, "grad_norm": 5.1753547211813355, "language_loss": 0.83834976, "learning_rate": 3.4674150346799245e-06, "loss": 0.86070454, "num_input_tokens_seen": 46382145, "step": 2168, "time_per_iteration": 2.646401882171631 }, { "auxiliary_loss_clip": 0.0116834, "auxiliary_loss_mlp": 0.01029645, "balance_loss_clip": 1.05467594, "balance_loss_mlp": 1.02035308, "epoch": 0.2608068297961883, "flos": 17712615686400.0, "grad_norm": 2.0373280575875783, "language_loss": 0.79558432, "learning_rate": 3.4668856390803295e-06, "loss": 0.81756413, "num_input_tokens_seen": 46400025, "step": 2169, "time_per_iteration": 2.614623785018921 }, { "auxiliary_loss_clip": 0.01172184, "auxiliary_loss_mlp": 0.01030944, "balance_loss_clip": 1.05247593, "balance_loss_mlp": 1.02193141, "epoch": 0.2609270726868274, "flos": 18551632544640.0, "grad_norm": 2.1852986769648624, "language_loss": 0.89611638, "learning_rate": 3.4663560209553495e-06, "loss": 0.91814756, "num_input_tokens_seen": 46418090, "step": 2170, "time_per_iteration": 2.6242752075195312 }, { "auxiliary_loss_clip": 0.01157105, "auxiliary_loss_mlp": 0.0102893, "balance_loss_clip": 1.05013442, "balance_loss_mlp": 1.01966715, "epoch": 0.26104731557746647, "flos": 21835699165440.0, "grad_norm": 1.680773343406847, "language_loss": 0.79245526, "learning_rate": 3.4658261803853267e-06, "loss": 0.81431568, "num_input_tokens_seen": 46436015, "step": 2171, "time_per_iteration": 2.637523651123047 }, { "auxiliary_loss_clip": 0.01168942, "auxiliary_loss_mlp": 0.01029538, "balance_loss_clip": 1.05942702, "balance_loss_mlp": 1.02007866, "epoch": 0.2611675584681056, "flos": 21689650465920.0, "grad_norm": 2.257584739227022, "language_loss": 0.8077445, "learning_rate": 3.4652961174506383e-06, "loss": 0.82972932, "num_input_tokens_seen": 46455885, "step": 2172, "time_per_iteration": 3.5911359786987305 }, { "auxiliary_loss_clip": 0.01109158, "auxiliary_loss_mlp": 0.01005247, "balance_loss_clip": 1.03865457, "balance_loss_mlp": 1.00267243, "epoch": 0.2612878013587447, "flos": 71862101389440.0, "grad_norm": 0.9727032030803914, "language_loss": 0.58124423, "learning_rate": 3.464765832231694e-06, "loss": 0.60238826, "num_input_tokens_seen": 46510050, "step": 2173, "time_per_iteration": 4.408897399902344 }, { "auxiliary_loss_clip": 0.0118738, "auxiliary_loss_mlp": 0.01036042, "balance_loss_clip": 1.05852699, "balance_loss_mlp": 1.02667224, "epoch": 0.26140804424938374, "flos": 20227511445120.0, "grad_norm": 1.9428101106921813, "language_loss": 0.70703357, "learning_rate": 3.4642353248089373e-06, "loss": 0.72926784, "num_input_tokens_seen": 46528810, "step": 2174, "time_per_iteration": 2.5711686611175537 }, { "auxiliary_loss_clip": 0.01161525, "auxiliary_loss_mlp": 0.01032035, "balance_loss_clip": 1.05339706, "balance_loss_mlp": 1.02247429, "epoch": 0.26152828714002285, "flos": 25557085872000.0, "grad_norm": 1.801597814182979, "language_loss": 0.80186689, "learning_rate": 3.463704595262846e-06, "loss": 0.82380247, "num_input_tokens_seen": 46549690, "step": 2175, "time_per_iteration": 3.5484344959259033 }, { "auxiliary_loss_clip": 0.01151425, "auxiliary_loss_mlp": 0.01029142, "balance_loss_clip": 1.0550909, "balance_loss_mlp": 1.02028465, "epoch": 0.26164853003066196, "flos": 25446516831360.0, "grad_norm": 2.7746199805653116, "language_loss": 0.70732653, "learning_rate": 3.463173643673931e-06, "loss": 0.72913218, "num_input_tokens_seen": 46572215, "step": 2176, "time_per_iteration": 2.7244350910186768 }, { "auxiliary_loss_clip": 0.01109035, "auxiliary_loss_mlp": 0.01002011, "balance_loss_clip": 1.0331552, "balance_loss_mlp": 0.99924487, "epoch": 0.261768772921301, "flos": 53944580568960.0, "grad_norm": 0.9055516195330978, "language_loss": 0.63511461, "learning_rate": 3.4626424701227387e-06, "loss": 0.65622503, "num_input_tokens_seen": 46627275, "step": 2177, "time_per_iteration": 4.088858127593994 }, { "auxiliary_loss_clip": 0.01125917, "auxiliary_loss_mlp": 0.01001027, "balance_loss_clip": 1.0378902, "balance_loss_mlp": 0.99846363, "epoch": 0.26188901581194013, "flos": 70687606481280.0, "grad_norm": 0.8249937260187951, "language_loss": 0.55746239, "learning_rate": 3.4621110746898452e-06, "loss": 0.57873183, "num_input_tokens_seen": 46695135, "step": 2178, "time_per_iteration": 3.240710735321045 }, { "auxiliary_loss_clip": 0.01188184, "auxiliary_loss_mlp": 0.01033377, "balance_loss_clip": 1.05766094, "balance_loss_mlp": 1.02475858, "epoch": 0.2620092587025792, "flos": 21069580959360.0, "grad_norm": 2.116906162314476, "language_loss": 0.74783325, "learning_rate": 3.4615794574558654e-06, "loss": 0.77004886, "num_input_tokens_seen": 46714145, "step": 2179, "time_per_iteration": 2.6348085403442383 }, { "auxiliary_loss_clip": 0.01168622, "auxiliary_loss_mlp": 0.01030616, "balance_loss_clip": 1.05387282, "balance_loss_mlp": 1.02184844, "epoch": 0.2621295015932183, "flos": 18369601395840.0, "grad_norm": 10.268164122965945, "language_loss": 0.83824396, "learning_rate": 3.4610476185014436e-06, "loss": 0.86023641, "num_input_tokens_seen": 46731405, "step": 2180, "time_per_iteration": 2.6466047763824463 }, { "auxiliary_loss_clip": 0.01201078, "auxiliary_loss_mlp": 0.01033163, "balance_loss_clip": 1.05582285, "balance_loss_mlp": 1.02397799, "epoch": 0.2622497444838574, "flos": 23659997063040.0, "grad_norm": 1.8634724775332776, "language_loss": 0.79423803, "learning_rate": 3.4605155579072597e-06, "loss": 0.81658041, "num_input_tokens_seen": 46751260, "step": 2181, "time_per_iteration": 2.6118860244750977 }, { "auxiliary_loss_clip": 0.01135414, "auxiliary_loss_mlp": 0.010343, "balance_loss_clip": 1.05456996, "balance_loss_mlp": 1.02496624, "epoch": 0.26236998737449646, "flos": 22123810154880.0, "grad_norm": 2.4333032316492313, "language_loss": 0.71061391, "learning_rate": 3.459983275754027e-06, "loss": 0.73231107, "num_input_tokens_seen": 46770155, "step": 2182, "time_per_iteration": 2.7805123329162598 }, { "auxiliary_loss_clip": 0.01203407, "auxiliary_loss_mlp": 0.0102952, "balance_loss_clip": 1.05894041, "balance_loss_mlp": 1.02043009, "epoch": 0.26249023026513557, "flos": 17895185539200.0, "grad_norm": 3.18805176525969, "language_loss": 0.80081207, "learning_rate": 3.4594507721224918e-06, "loss": 0.82314128, "num_input_tokens_seen": 46788805, "step": 2183, "time_per_iteration": 2.590059280395508 }, { "auxiliary_loss_clip": 0.01168542, "auxiliary_loss_mlp": 0.01036368, "balance_loss_clip": 1.05265498, "balance_loss_mlp": 1.02769578, "epoch": 0.2626104731557747, "flos": 18332936588160.0, "grad_norm": 2.269900625318536, "language_loss": 0.8173725, "learning_rate": 3.4589180470934353e-06, "loss": 0.83942163, "num_input_tokens_seen": 46808670, "step": 2184, "time_per_iteration": 2.6287431716918945 }, { "auxiliary_loss_clip": 0.01189295, "auxiliary_loss_mlp": 0.01034074, "balance_loss_clip": 1.05457413, "balance_loss_mlp": 1.02410793, "epoch": 0.26273071604641374, "flos": 19317714837120.0, "grad_norm": 1.9712205824028997, "language_loss": 0.77033043, "learning_rate": 3.4583851007476713e-06, "loss": 0.79256415, "num_input_tokens_seen": 46827140, "step": 2185, "time_per_iteration": 2.595355749130249 }, { "auxiliary_loss_clip": 0.01154931, "auxiliary_loss_mlp": 0.01030267, "balance_loss_clip": 1.05380321, "balance_loss_mlp": 1.02101016, "epoch": 0.26285095893705285, "flos": 18327477720960.0, "grad_norm": 2.6731603424767805, "language_loss": 0.68749547, "learning_rate": 3.4578519331660464e-06, "loss": 0.70934743, "num_input_tokens_seen": 46844135, "step": 2186, "time_per_iteration": 2.6380598545074463 }, { "auxiliary_loss_clip": 0.01182423, "auxiliary_loss_mlp": 0.01033358, "balance_loss_clip": 1.05758774, "balance_loss_mlp": 1.02490592, "epoch": 0.26297120182769196, "flos": 20193827466240.0, "grad_norm": 2.2314737869299535, "language_loss": 0.8198818, "learning_rate": 3.4573185444294426e-06, "loss": 0.84203959, "num_input_tokens_seen": 46862500, "step": 2187, "time_per_iteration": 2.652538299560547 }, { "auxiliary_loss_clip": 0.01166615, "auxiliary_loss_mlp": 0.0071273, "balance_loss_clip": 1.05295038, "balance_loss_mlp": 1.00112343, "epoch": 0.263091444718331, "flos": 22418421505920.0, "grad_norm": 1.5919176773164354, "language_loss": 0.78853035, "learning_rate": 3.456784934618774e-06, "loss": 0.80732381, "num_input_tokens_seen": 46883665, "step": 2188, "time_per_iteration": 2.6669914722442627 }, { "auxiliary_loss_clip": 0.01164392, "auxiliary_loss_mlp": 0.01029348, "balance_loss_clip": 1.05241835, "balance_loss_mlp": 1.02071679, "epoch": 0.2632116876089701, "flos": 19024827338880.0, "grad_norm": 3.9093466627580633, "language_loss": 0.79917961, "learning_rate": 3.4562511038149897e-06, "loss": 0.82111704, "num_input_tokens_seen": 46899160, "step": 2189, "time_per_iteration": 2.7032394409179688 }, { "auxiliary_loss_clip": 0.01060131, "auxiliary_loss_mlp": 0.01011711, "balance_loss_clip": 1.02820396, "balance_loss_mlp": 1.00914752, "epoch": 0.26333193049960923, "flos": 67308054531840.0, "grad_norm": 0.8595623965048611, "language_loss": 0.57693094, "learning_rate": 3.4557170520990705e-06, "loss": 0.5976494, "num_input_tokens_seen": 46959835, "step": 2190, "time_per_iteration": 3.377438545227051 }, { "auxiliary_loss_clip": 0.01181706, "auxiliary_loss_mlp": 0.01033502, "balance_loss_clip": 1.05788732, "balance_loss_mlp": 1.02490103, "epoch": 0.2634521733902483, "flos": 25048806468480.0, "grad_norm": 1.5991877393292997, "language_loss": 0.86247438, "learning_rate": 3.4551827795520324e-06, "loss": 0.88462645, "num_input_tokens_seen": 46982720, "step": 2191, "time_per_iteration": 2.6339094638824463 }, { "auxiliary_loss_clip": 0.01180859, "auxiliary_loss_mlp": 0.01026814, "balance_loss_clip": 1.05247235, "balance_loss_mlp": 1.01842213, "epoch": 0.2635724162808874, "flos": 20594985534720.0, "grad_norm": 2.261711715399465, "language_loss": 0.85269433, "learning_rate": 3.4546482862549226e-06, "loss": 0.87477112, "num_input_tokens_seen": 47003035, "step": 2192, "time_per_iteration": 2.6864094734191895 }, { "auxiliary_loss_clip": 0.01142593, "auxiliary_loss_mlp": 0.01035896, "balance_loss_clip": 1.04915071, "balance_loss_mlp": 1.02739644, "epoch": 0.2636926591715265, "flos": 19244636616960.0, "grad_norm": 2.660864997917317, "language_loss": 0.78645754, "learning_rate": 3.4541135722888253e-06, "loss": 0.80824244, "num_input_tokens_seen": 47019625, "step": 2193, "time_per_iteration": 2.758788824081421 }, { "auxiliary_loss_clip": 0.01198334, "auxiliary_loss_mlp": 0.01034025, "balance_loss_clip": 1.05556166, "balance_loss_mlp": 1.02572191, "epoch": 0.26381290206216557, "flos": 28804882734720.0, "grad_norm": 5.108820011878091, "language_loss": 0.8005172, "learning_rate": 3.453578637734854e-06, "loss": 0.82284081, "num_input_tokens_seen": 47040815, "step": 2194, "time_per_iteration": 2.74310302734375 }, { "auxiliary_loss_clip": 0.01203838, "auxiliary_loss_mlp": 0.01033527, "balance_loss_clip": 1.06100082, "balance_loss_mlp": 1.02456832, "epoch": 0.2639331449528047, "flos": 25008909436800.0, "grad_norm": 4.156317854890191, "language_loss": 0.78660715, "learning_rate": 3.4530434826741605e-06, "loss": 0.80898082, "num_input_tokens_seen": 47061755, "step": 2195, "time_per_iteration": 2.6082165241241455 }, { "auxiliary_loss_clip": 0.01162906, "auxiliary_loss_mlp": 0.01023208, "balance_loss_clip": 1.05463791, "balance_loss_mlp": 1.01485693, "epoch": 0.26405338784344373, "flos": 46535775465600.0, "grad_norm": 1.8133819667534923, "language_loss": 0.68925571, "learning_rate": 3.452508107187926e-06, "loss": 0.71111685, "num_input_tokens_seen": 47085130, "step": 2196, "time_per_iteration": 2.863661766052246 }, { "auxiliary_loss_clip": 0.01118128, "auxiliary_loss_mlp": 0.01024467, "balance_loss_clip": 1.04697132, "balance_loss_mlp": 1.01509738, "epoch": 0.26417363073408284, "flos": 21179467641600.0, "grad_norm": 2.1935430213376503, "language_loss": 0.7731415, "learning_rate": 3.451972511357366e-06, "loss": 0.79456753, "num_input_tokens_seen": 47104675, "step": 2197, "time_per_iteration": 2.716158390045166 }, { "auxiliary_loss_clip": 0.01179432, "auxiliary_loss_mlp": 0.01029806, "balance_loss_clip": 1.05580461, "balance_loss_mlp": 1.02085924, "epoch": 0.26429387362472195, "flos": 22674751937280.0, "grad_norm": 1.7024800052628393, "language_loss": 0.8528372, "learning_rate": 3.45143669526373e-06, "loss": 0.87492955, "num_input_tokens_seen": 47124435, "step": 2198, "time_per_iteration": 3.6412434577941895 }, { "auxiliary_loss_clip": 0.01095658, "auxiliary_loss_mlp": 0.01003457, "balance_loss_clip": 1.03404737, "balance_loss_mlp": 1.00096536, "epoch": 0.264414116515361, "flos": 67180534272000.0, "grad_norm": 0.7840128890499642, "language_loss": 0.63258332, "learning_rate": 3.450900658988302e-06, "loss": 0.65357441, "num_input_tokens_seen": 47185985, "step": 2199, "time_per_iteration": 4.086545944213867 }, { "auxiliary_loss_clip": 0.01159083, "auxiliary_loss_mlp": 0.01030426, "balance_loss_clip": 1.05442071, "balance_loss_mlp": 1.02151465, "epoch": 0.2645343594060001, "flos": 25664709997440.0, "grad_norm": 3.0607315482939317, "language_loss": 0.77798593, "learning_rate": 3.450364402612397e-06, "loss": 0.79988104, "num_input_tokens_seen": 47203140, "step": 2200, "time_per_iteration": 3.7185611724853516 }, { "auxiliary_loss_clip": 0.01164149, "auxiliary_loss_mlp": 0.01031048, "balance_loss_clip": 1.05342174, "balance_loss_mlp": 1.0218575, "epoch": 0.26465460229663923, "flos": 22491822948480.0, "grad_norm": 2.233327905795491, "language_loss": 0.83892894, "learning_rate": 3.449827926217366e-06, "loss": 0.86088091, "num_input_tokens_seen": 47222575, "step": 2201, "time_per_iteration": 2.6028900146484375 }, { "auxiliary_loss_clip": 0.01171541, "auxiliary_loss_mlp": 0.01027518, "balance_loss_clip": 1.05219626, "balance_loss_mlp": 1.01854134, "epoch": 0.2647748451872783, "flos": 29388036038400.0, "grad_norm": 1.7914703246121124, "language_loss": 0.80541897, "learning_rate": 3.449291229884591e-06, "loss": 0.82740951, "num_input_tokens_seen": 47243815, "step": 2202, "time_per_iteration": 2.707152843475342 }, { "auxiliary_loss_clip": 0.01154188, "auxiliary_loss_mlp": 0.01028068, "balance_loss_clip": 1.051265, "balance_loss_mlp": 1.01902044, "epoch": 0.2648950880779174, "flos": 26797799502720.0, "grad_norm": 2.1028509127072263, "language_loss": 0.86678958, "learning_rate": 3.4487543136954887e-06, "loss": 0.88861215, "num_input_tokens_seen": 47263435, "step": 2203, "time_per_iteration": 3.6293702125549316 }, { "auxiliary_loss_clip": 0.01150899, "auxiliary_loss_mlp": 0.01033054, "balance_loss_clip": 1.05395937, "balance_loss_mlp": 1.02391613, "epoch": 0.2650153309685565, "flos": 28841008838400.0, "grad_norm": 2.1310342705458587, "language_loss": 0.90983391, "learning_rate": 3.448217177731509e-06, "loss": 0.93167341, "num_input_tokens_seen": 47283920, "step": 2204, "time_per_iteration": 2.706526756286621 }, { "auxiliary_loss_clip": 0.01163644, "auxiliary_loss_mlp": 0.01031314, "balance_loss_clip": 1.05723333, "balance_loss_mlp": 1.02230144, "epoch": 0.26513557385919556, "flos": 20303247271680.0, "grad_norm": 3.2027126986793486, "language_loss": 0.77812898, "learning_rate": 3.4476798220741348e-06, "loss": 0.80007863, "num_input_tokens_seen": 47302800, "step": 2205, "time_per_iteration": 2.6376793384552 }, { "auxiliary_loss_clip": 0.01200939, "auxiliary_loss_mlp": 0.01027632, "balance_loss_clip": 1.05996048, "balance_loss_mlp": 1.01885819, "epoch": 0.26525581674983467, "flos": 17676274101120.0, "grad_norm": 1.764761413651103, "language_loss": 0.78270394, "learning_rate": 3.4471422468048826e-06, "loss": 0.80498964, "num_input_tokens_seen": 47321525, "step": 2206, "time_per_iteration": 2.6059987545013428 }, { "auxiliary_loss_clip": 0.01177955, "auxiliary_loss_mlp": 0.01030696, "balance_loss_clip": 1.05657816, "balance_loss_mlp": 1.02165353, "epoch": 0.2653760596404738, "flos": 26833746038400.0, "grad_norm": 3.0525808228653997, "language_loss": 0.72819257, "learning_rate": 3.4466044520053022e-06, "loss": 0.75027907, "num_input_tokens_seen": 47340530, "step": 2207, "time_per_iteration": 2.632659435272217 }, { "auxiliary_loss_clip": 0.01155469, "auxiliary_loss_mlp": 0.01029299, "balance_loss_clip": 1.05118477, "balance_loss_mlp": 1.01988149, "epoch": 0.26549630253111284, "flos": 22782160581120.0, "grad_norm": 1.9592936806854633, "language_loss": 0.60199451, "learning_rate": 3.446066437756977e-06, "loss": 0.62384218, "num_input_tokens_seen": 47359735, "step": 2208, "time_per_iteration": 2.639787435531616 }, { "auxiliary_loss_clip": 0.01166006, "auxiliary_loss_mlp": 0.01024938, "balance_loss_clip": 1.05556059, "balance_loss_mlp": 1.01607442, "epoch": 0.26561654542175195, "flos": 23550002640000.0, "grad_norm": 3.7680544444213036, "language_loss": 0.75611085, "learning_rate": 3.4455282041415224e-06, "loss": 0.77802026, "num_input_tokens_seen": 47378945, "step": 2209, "time_per_iteration": 2.6698696613311768 }, { "auxiliary_loss_clip": 0.01155896, "auxiliary_loss_mlp": 0.01035756, "balance_loss_clip": 1.05521333, "balance_loss_mlp": 1.02676761, "epoch": 0.265736788312391, "flos": 26906680604160.0, "grad_norm": 3.1963627075587753, "language_loss": 0.87001008, "learning_rate": 3.4449897512405894e-06, "loss": 0.89192665, "num_input_tokens_seen": 47398095, "step": 2210, "time_per_iteration": 2.7499101161956787 }, { "auxiliary_loss_clip": 0.01117678, "auxiliary_loss_mlp": 0.0071278, "balance_loss_clip": 1.0499382, "balance_loss_mlp": 1.00122392, "epoch": 0.2658570312030301, "flos": 23477139901440.0, "grad_norm": 1.896169981755788, "language_loss": 0.75063562, "learning_rate": 3.444451079135859e-06, "loss": 0.76894021, "num_input_tokens_seen": 47417605, "step": 2211, "time_per_iteration": 2.800837516784668 }, { "auxiliary_loss_clip": 0.01124389, "auxiliary_loss_mlp": 0.00713148, "balance_loss_clip": 1.0472374, "balance_loss_mlp": 1.00107646, "epoch": 0.2659772740936692, "flos": 21866402315520.0, "grad_norm": 1.9908963953271794, "language_loss": 0.74092579, "learning_rate": 3.4439121879090493e-06, "loss": 0.75930119, "num_input_tokens_seen": 47435385, "step": 2212, "time_per_iteration": 2.7101006507873535 }, { "auxiliary_loss_clip": 0.01170917, "auxiliary_loss_mlp": 0.01032725, "balance_loss_clip": 1.05298615, "balance_loss_mlp": 1.02408183, "epoch": 0.2660975169843083, "flos": 19793100360960.0, "grad_norm": 2.199572576271222, "language_loss": 0.83194858, "learning_rate": 3.4433730776419082e-06, "loss": 0.85398495, "num_input_tokens_seen": 47454310, "step": 2213, "time_per_iteration": 2.681516647338867 }, { "auxiliary_loss_clip": 0.01189151, "auxiliary_loss_mlp": 0.00712933, "balance_loss_clip": 1.0567646, "balance_loss_mlp": 1.00121713, "epoch": 0.2662177598749474, "flos": 29018981750400.0, "grad_norm": 2.4642444897629256, "language_loss": 0.80726707, "learning_rate": 3.4428337484162183e-06, "loss": 0.82628787, "num_input_tokens_seen": 47475120, "step": 2214, "time_per_iteration": 2.6875860691070557 }, { "auxiliary_loss_clip": 0.01162236, "auxiliary_loss_mlp": 0.0103026, "balance_loss_clip": 1.05305147, "balance_loss_mlp": 1.02156413, "epoch": 0.2663380027655865, "flos": 21762549118080.0, "grad_norm": 1.8613721034011486, "language_loss": 0.84584528, "learning_rate": 3.442294200313797e-06, "loss": 0.86777025, "num_input_tokens_seen": 47493150, "step": 2215, "time_per_iteration": 2.711503267288208 }, { "auxiliary_loss_clip": 0.01123737, "auxiliary_loss_mlp": 0.01002689, "balance_loss_clip": 1.03996181, "balance_loss_mlp": 0.99997061, "epoch": 0.26645824565622556, "flos": 66980333819520.0, "grad_norm": 0.7814606884022197, "language_loss": 0.52712095, "learning_rate": 3.4417544334164916e-06, "loss": 0.54838514, "num_input_tokens_seen": 47557295, "step": 2216, "time_per_iteration": 3.273787498474121 }, { "auxiliary_loss_clip": 0.01148771, "auxiliary_loss_mlp": 0.01026548, "balance_loss_clip": 1.05423021, "balance_loss_mlp": 1.01784563, "epoch": 0.26657848854686467, "flos": 25264198373760.0, "grad_norm": 1.7514660909135344, "language_loss": 0.77276045, "learning_rate": 3.4412144478061854e-06, "loss": 0.79451364, "num_input_tokens_seen": 47579705, "step": 2217, "time_per_iteration": 2.786586046218872 }, { "auxiliary_loss_clip": 0.01081511, "auxiliary_loss_mlp": 0.0103184, "balance_loss_clip": 1.04426694, "balance_loss_mlp": 1.02234554, "epoch": 0.2666987314375038, "flos": 23696769611520.0, "grad_norm": 2.5364395168903546, "language_loss": 0.75595546, "learning_rate": 3.4406742435647925e-06, "loss": 0.77708894, "num_input_tokens_seen": 47599770, "step": 2218, "time_per_iteration": 2.8349316120147705 }, { "auxiliary_loss_clip": 0.01183041, "auxiliary_loss_mlp": 0.01027216, "balance_loss_clip": 1.05841589, "balance_loss_mlp": 1.01820397, "epoch": 0.26681897432814283, "flos": 27048958375680.0, "grad_norm": 1.9210945438700529, "language_loss": 0.78749466, "learning_rate": 3.440133820774263e-06, "loss": 0.80959725, "num_input_tokens_seen": 47619580, "step": 2219, "time_per_iteration": 2.632056713104248 }, { "auxiliary_loss_clip": 0.01171415, "auxiliary_loss_mlp": 0.01037367, "balance_loss_clip": 1.05575728, "balance_loss_mlp": 1.0275563, "epoch": 0.26693921721878194, "flos": 28985944216320.0, "grad_norm": 2.28733394066729, "language_loss": 0.81509936, "learning_rate": 3.439593179516578e-06, "loss": 0.83718717, "num_input_tokens_seen": 47639490, "step": 2220, "time_per_iteration": 2.7321038246154785 }, { "auxiliary_loss_clip": 0.01172144, "auxiliary_loss_mlp": 0.01025157, "balance_loss_clip": 1.0554682, "balance_loss_mlp": 1.01610279, "epoch": 0.26705946010942105, "flos": 21507834798720.0, "grad_norm": 2.693470179096924, "language_loss": 0.81095767, "learning_rate": 3.4390523198737524e-06, "loss": 0.83293068, "num_input_tokens_seen": 47658650, "step": 2221, "time_per_iteration": 2.6407551765441895 }, { "auxiliary_loss_clip": 0.01203424, "auxiliary_loss_mlp": 0.00712977, "balance_loss_clip": 1.06049109, "balance_loss_mlp": 1.00111854, "epoch": 0.2671797030000601, "flos": 21471277731840.0, "grad_norm": 2.324142108806525, "language_loss": 0.73669058, "learning_rate": 3.4385112419278333e-06, "loss": 0.75585461, "num_input_tokens_seen": 47679875, "step": 2222, "time_per_iteration": 2.6552488803863525 }, { "auxiliary_loss_clip": 0.01112637, "auxiliary_loss_mlp": 0.01001961, "balance_loss_clip": 1.03984642, "balance_loss_mlp": 0.99930251, "epoch": 0.2672999458906992, "flos": 64189929767040.0, "grad_norm": 0.7968229264075074, "language_loss": 0.64810026, "learning_rate": 3.4379699457609033e-06, "loss": 0.6692462, "num_input_tokens_seen": 47737700, "step": 2223, "time_per_iteration": 3.0228145122528076 }, { "auxiliary_loss_clip": 0.01158866, "auxiliary_loss_mlp": 0.01026748, "balance_loss_clip": 1.05222428, "balance_loss_mlp": 1.01694858, "epoch": 0.26742018878133833, "flos": 16909042573440.0, "grad_norm": 2.1330787142162584, "language_loss": 0.89943993, "learning_rate": 3.4374284314550755e-06, "loss": 0.92129612, "num_input_tokens_seen": 47756740, "step": 2224, "time_per_iteration": 2.6712801456451416 }, { "auxiliary_loss_clip": 0.01200138, "auxiliary_loss_mlp": 0.01033152, "balance_loss_clip": 1.05707026, "balance_loss_mlp": 1.02408624, "epoch": 0.2675404316719774, "flos": 20667560964480.0, "grad_norm": 2.2534160097514992, "language_loss": 0.80751014, "learning_rate": 3.436886699092498e-06, "loss": 0.82984293, "num_input_tokens_seen": 47775255, "step": 2225, "time_per_iteration": 4.3817126750946045 }, { "auxiliary_loss_clip": 0.01203987, "auxiliary_loss_mlp": 0.01029468, "balance_loss_clip": 1.05911255, "balance_loss_mlp": 1.0202713, "epoch": 0.2676606745626165, "flos": 17485013157120.0, "grad_norm": 3.504817880257466, "language_loss": 0.71826792, "learning_rate": 3.4363447487553502e-06, "loss": 0.74060243, "num_input_tokens_seen": 47788570, "step": 2226, "time_per_iteration": 3.3137667179107666 }, { "auxiliary_loss_clip": 0.01164422, "auxiliary_loss_mlp": 0.01029413, "balance_loss_clip": 1.055287, "balance_loss_mlp": 1.02065146, "epoch": 0.26778091745325555, "flos": 27852675143040.0, "grad_norm": 2.0717491258779934, "language_loss": 0.77559763, "learning_rate": 3.4358025805258455e-06, "loss": 0.79753602, "num_input_tokens_seen": 47808275, "step": 2227, "time_per_iteration": 2.7338714599609375 }, { "auxiliary_loss_clip": 0.01137644, "auxiliary_loss_mlp": 0.01029878, "balance_loss_clip": 1.04962158, "balance_loss_mlp": 1.02014422, "epoch": 0.26790116034389466, "flos": 20955995176320.0, "grad_norm": 2.5684679069396013, "language_loss": 0.83417249, "learning_rate": 3.435260194486232e-06, "loss": 0.85584766, "num_input_tokens_seen": 47826245, "step": 2228, "time_per_iteration": 2.812394857406616 }, { "auxiliary_loss_clip": 0.01167147, "auxiliary_loss_mlp": 0.0102907, "balance_loss_clip": 1.05534983, "balance_loss_mlp": 1.01971865, "epoch": 0.2680214032345338, "flos": 18040659621120.0, "grad_norm": 3.5978478709837804, "language_loss": 0.82026368, "learning_rate": 3.4347175907187875e-06, "loss": 0.84222591, "num_input_tokens_seen": 47843235, "step": 2229, "time_per_iteration": 3.575589179992676 }, { "auxiliary_loss_clip": 0.0118118, "auxiliary_loss_mlp": 0.01026062, "balance_loss_clip": 1.05514908, "balance_loss_mlp": 1.01742482, "epoch": 0.26814164612517283, "flos": 22419427086720.0, "grad_norm": 1.7993495632153387, "language_loss": 0.87826645, "learning_rate": 3.4341747693058254e-06, "loss": 0.90033889, "num_input_tokens_seen": 47861710, "step": 2230, "time_per_iteration": 2.659339427947998 }, { "auxiliary_loss_clip": 0.0107668, "auxiliary_loss_mlp": 0.01029746, "balance_loss_clip": 1.04187787, "balance_loss_mlp": 1.02058494, "epoch": 0.26826188901581194, "flos": 35627371159680.0, "grad_norm": 1.9425453028453543, "language_loss": 0.77305222, "learning_rate": 3.4336317303296916e-06, "loss": 0.79411644, "num_input_tokens_seen": 47882685, "step": 2231, "time_per_iteration": 2.9662764072418213 }, { "auxiliary_loss_clip": 0.0118047, "auxiliary_loss_mlp": 0.01025998, "balance_loss_clip": 1.05489075, "balance_loss_mlp": 1.01728964, "epoch": 0.26838213190645105, "flos": 17639788861440.0, "grad_norm": 2.6539618273694003, "language_loss": 0.75595975, "learning_rate": 3.4330884738727635e-06, "loss": 0.77802444, "num_input_tokens_seen": 47900860, "step": 2232, "time_per_iteration": 2.759172201156616 }, { "auxiliary_loss_clip": 0.01127458, "auxiliary_loss_mlp": 0.01033633, "balance_loss_clip": 1.05013704, "balance_loss_mlp": 1.02410793, "epoch": 0.2685023747970901, "flos": 22674823764480.0, "grad_norm": 2.3957036142382684, "language_loss": 0.705172, "learning_rate": 3.4325450000174535e-06, "loss": 0.72678292, "num_input_tokens_seen": 47917500, "step": 2233, "time_per_iteration": 2.652076244354248 }, { "auxiliary_loss_clip": 0.01129725, "auxiliary_loss_mlp": 0.0103411, "balance_loss_clip": 1.05191791, "balance_loss_mlp": 1.02441192, "epoch": 0.2686226176877292, "flos": 20120533764480.0, "grad_norm": 2.1853780387190382, "language_loss": 0.7430203, "learning_rate": 3.4320013088462067e-06, "loss": 0.76465863, "num_input_tokens_seen": 47934860, "step": 2234, "time_per_iteration": 2.724113941192627 }, { "auxiliary_loss_clip": 0.01152261, "auxiliary_loss_mlp": 0.0102748, "balance_loss_clip": 1.05001211, "balance_loss_mlp": 1.01905799, "epoch": 0.2687428605783683, "flos": 21872040750720.0, "grad_norm": 1.7570674503097554, "language_loss": 0.81493163, "learning_rate": 3.431457400441499e-06, "loss": 0.83672905, "num_input_tokens_seen": 47955255, "step": 2235, "time_per_iteration": 2.6896798610687256 }, { "auxiliary_loss_clip": 0.01036149, "auxiliary_loss_mlp": 0.01005587, "balance_loss_clip": 1.02787876, "balance_loss_mlp": 1.00295246, "epoch": 0.2688631034690074, "flos": 69943320766080.0, "grad_norm": 1.0782398989322448, "language_loss": 0.60776985, "learning_rate": 3.4309132748858424e-06, "loss": 0.62818718, "num_input_tokens_seen": 48016245, "step": 2236, "time_per_iteration": 3.2879586219787598 }, { "auxiliary_loss_clip": 0.0118117, "auxiliary_loss_mlp": 0.01036344, "balance_loss_clip": 1.05760074, "balance_loss_mlp": 1.02774346, "epoch": 0.2689833463596465, "flos": 22856639431680.0, "grad_norm": 1.628522160301827, "language_loss": 0.83792794, "learning_rate": 3.430368932261779e-06, "loss": 0.86010307, "num_input_tokens_seen": 48036600, "step": 2237, "time_per_iteration": 2.6586074829101562 }, { "auxiliary_loss_clip": 0.01164145, "auxiliary_loss_mlp": 0.01035077, "balance_loss_clip": 1.0537045, "balance_loss_mlp": 1.02575517, "epoch": 0.2691035892502856, "flos": 17200242132480.0, "grad_norm": 2.2486014072631373, "language_loss": 0.75009537, "learning_rate": 3.429824372651886e-06, "loss": 0.77208757, "num_input_tokens_seen": 48054750, "step": 2238, "time_per_iteration": 2.6847355365753174 }, { "auxiliary_loss_clip": 0.01145094, "auxiliary_loss_mlp": 0.01027422, "balance_loss_clip": 1.05331457, "balance_loss_mlp": 1.01804066, "epoch": 0.26922383214092466, "flos": 17747484814080.0, "grad_norm": 3.8759610055597014, "language_loss": 0.83636928, "learning_rate": 3.4292795961387732e-06, "loss": 0.85809445, "num_input_tokens_seen": 48072650, "step": 2239, "time_per_iteration": 2.776093006134033 }, { "auxiliary_loss_clip": 0.01201548, "auxiliary_loss_mlp": 0.01030278, "balance_loss_clip": 1.05938196, "balance_loss_mlp": 1.02146864, "epoch": 0.26934407503156377, "flos": 16173376122240.0, "grad_norm": 2.250514011103861, "language_loss": 0.87760758, "learning_rate": 3.4287346028050818e-06, "loss": 0.89992583, "num_input_tokens_seen": 48088720, "step": 2240, "time_per_iteration": 2.5375356674194336 }, { "auxiliary_loss_clip": 0.01168306, "auxiliary_loss_mlp": 0.01028843, "balance_loss_clip": 1.05549312, "balance_loss_mlp": 1.02067077, "epoch": 0.2694643179222028, "flos": 23732895715200.0, "grad_norm": 1.7151861732182612, "language_loss": 0.79855824, "learning_rate": 3.4281893927334866e-06, "loss": 0.8205297, "num_input_tokens_seen": 48108630, "step": 2241, "time_per_iteration": 2.6650683879852295 }, { "auxiliary_loss_clip": 0.01183774, "auxiliary_loss_mlp": 0.01031062, "balance_loss_clip": 1.05545211, "balance_loss_mlp": 1.02190113, "epoch": 0.26958456081284193, "flos": 24718140840960.0, "grad_norm": 2.09876553844208, "language_loss": 0.75195813, "learning_rate": 3.4276439660066963e-06, "loss": 0.7741065, "num_input_tokens_seen": 48128330, "step": 2242, "time_per_iteration": 2.63893985748291 }, { "auxiliary_loss_clip": 0.01196059, "auxiliary_loss_mlp": 0.01026621, "balance_loss_clip": 1.05490065, "balance_loss_mlp": 1.01851463, "epoch": 0.26970480370348104, "flos": 18112588606080.0, "grad_norm": 2.3460557436545293, "language_loss": 0.8415643, "learning_rate": 3.427098322707452e-06, "loss": 0.86379117, "num_input_tokens_seen": 48144295, "step": 2243, "time_per_iteration": 2.648404836654663 }, { "auxiliary_loss_clip": 0.01190789, "auxiliary_loss_mlp": 0.01028263, "balance_loss_clip": 1.06317544, "balance_loss_mlp": 1.01942348, "epoch": 0.2698250465941201, "flos": 10816546250880.0, "grad_norm": 2.4514142401609167, "language_loss": 0.89718139, "learning_rate": 3.426552462918526e-06, "loss": 0.9193719, "num_input_tokens_seen": 48162230, "step": 2244, "time_per_iteration": 2.592318058013916 }, { "auxiliary_loss_clip": 0.01200763, "auxiliary_loss_mlp": 0.01031239, "balance_loss_clip": 1.06089497, "balance_loss_mlp": 1.02232814, "epoch": 0.2699452894847592, "flos": 17308117653120.0, "grad_norm": 2.544583234920542, "language_loss": 0.73448229, "learning_rate": 3.426006386722726e-06, "loss": 0.75680232, "num_input_tokens_seen": 48180290, "step": 2245, "time_per_iteration": 2.6282544136047363 }, { "auxiliary_loss_clip": 0.01153714, "auxiliary_loss_mlp": 0.01027572, "balance_loss_clip": 1.05577612, "balance_loss_mlp": 1.01875067, "epoch": 0.2700655323753983, "flos": 18078150441600.0, "grad_norm": 2.3532422280143526, "language_loss": 0.92450845, "learning_rate": 3.4254600942028914e-06, "loss": 0.94632125, "num_input_tokens_seen": 48198165, "step": 2246, "time_per_iteration": 2.6575329303741455 }, { "auxiliary_loss_clip": 0.0116415, "auxiliary_loss_mlp": 0.01030059, "balance_loss_clip": 1.05539107, "balance_loss_mlp": 1.02160704, "epoch": 0.2701857752660374, "flos": 18186636493440.0, "grad_norm": 2.269001027976043, "language_loss": 0.82437307, "learning_rate": 3.424913585441893e-06, "loss": 0.84631515, "num_input_tokens_seen": 48216000, "step": 2247, "time_per_iteration": 2.6387407779693604 }, { "auxiliary_loss_clip": 0.01180873, "auxiliary_loss_mlp": 0.01032067, "balance_loss_clip": 1.05652177, "balance_loss_mlp": 1.02315652, "epoch": 0.2703060181566765, "flos": 16319496648960.0, "grad_norm": 2.043860012328802, "language_loss": 0.87428272, "learning_rate": 3.4243668605226374e-06, "loss": 0.89641213, "num_input_tokens_seen": 48233025, "step": 2248, "time_per_iteration": 2.6230998039245605 }, { "auxiliary_loss_clip": 0.01151192, "auxiliary_loss_mlp": 0.01030272, "balance_loss_clip": 1.05427694, "balance_loss_mlp": 1.02048504, "epoch": 0.2704262610473156, "flos": 19572357329280.0, "grad_norm": 2.5940327156073186, "language_loss": 0.83163762, "learning_rate": 3.423819919528061e-06, "loss": 0.85345227, "num_input_tokens_seen": 48251110, "step": 2249, "time_per_iteration": 2.683678150177002 }, { "auxiliary_loss_clip": 0.01139965, "auxiliary_loss_mlp": 0.01027706, "balance_loss_clip": 1.0494833, "balance_loss_mlp": 1.01861072, "epoch": 0.27054650393795465, "flos": 20740746925440.0, "grad_norm": 2.9160708177720465, "language_loss": 0.78318918, "learning_rate": 3.4232727625411355e-06, "loss": 0.8048659, "num_input_tokens_seen": 48270215, "step": 2250, "time_per_iteration": 3.5741329193115234 }, { "auxiliary_loss_clip": 0.01114491, "auxiliary_loss_mlp": 0.01026119, "balance_loss_clip": 1.0506078, "balance_loss_mlp": 1.01718438, "epoch": 0.27066674682859376, "flos": 18658322916480.0, "grad_norm": 1.906622689895817, "language_loss": 0.86332846, "learning_rate": 3.4227253896448626e-06, "loss": 0.88473457, "num_input_tokens_seen": 48288075, "step": 2251, "time_per_iteration": 3.592926502227783 }, { "auxiliary_loss_clip": 0.01197584, "auxiliary_loss_mlp": 0.01023753, "balance_loss_clip": 1.05643511, "balance_loss_mlp": 1.01540208, "epoch": 0.2707869897192329, "flos": 23002759958400.0, "grad_norm": 2.463742625605479, "language_loss": 0.82525694, "learning_rate": 3.42217780092228e-06, "loss": 0.84747028, "num_input_tokens_seen": 48306415, "step": 2252, "time_per_iteration": 3.4065024852752686 }, { "auxiliary_loss_clip": 0.01082398, "auxiliary_loss_mlp": 0.01004218, "balance_loss_clip": 1.03184938, "balance_loss_mlp": 1.00142884, "epoch": 0.27090723260987193, "flos": 58323240293760.0, "grad_norm": 1.1299019431457544, "language_loss": 0.60311878, "learning_rate": 3.421629996456456e-06, "loss": 0.62398493, "num_input_tokens_seen": 48365035, "step": 2253, "time_per_iteration": 3.190777063369751 }, { "auxiliary_loss_clip": 0.01179382, "auxiliary_loss_mlp": 0.01024415, "balance_loss_clip": 1.05462921, "balance_loss_mlp": 1.01563537, "epoch": 0.27102747550051104, "flos": 11984540797440.0, "grad_norm": 3.6762379407414345, "language_loss": 0.82685471, "learning_rate": 3.421081976330491e-06, "loss": 0.84889269, "num_input_tokens_seen": 48383550, "step": 2254, "time_per_iteration": 2.581921339035034 }, { "auxiliary_loss_clip": 0.01162864, "auxiliary_loss_mlp": 0.01033097, "balance_loss_clip": 1.05305648, "balance_loss_mlp": 1.02408457, "epoch": 0.27114771839115015, "flos": 19900401264000.0, "grad_norm": 1.7860495486880132, "language_loss": 0.87481904, "learning_rate": 3.4205337406275207e-06, "loss": 0.89677864, "num_input_tokens_seen": 48403670, "step": 2255, "time_per_iteration": 3.5980539321899414 }, { "auxiliary_loss_clip": 0.0119718, "auxiliary_loss_mlp": 0.01030714, "balance_loss_clip": 1.05501354, "balance_loss_mlp": 1.02198744, "epoch": 0.2712679612817892, "flos": 18331966920960.0, "grad_norm": 2.586634498568021, "language_loss": 0.7540074, "learning_rate": 3.4199852894307114e-06, "loss": 0.77628636, "num_input_tokens_seen": 48420420, "step": 2256, "time_per_iteration": 2.6105852127075195 }, { "auxiliary_loss_clip": 0.0112449, "auxiliary_loss_mlp": 0.01030903, "balance_loss_clip": 1.05515933, "balance_loss_mlp": 1.02098441, "epoch": 0.2713882041724283, "flos": 24460302038400.0, "grad_norm": 2.1767939614071303, "language_loss": 0.78699166, "learning_rate": 3.419436622823262e-06, "loss": 0.80854559, "num_input_tokens_seen": 48441140, "step": 2257, "time_per_iteration": 2.765347957611084 }, { "auxiliary_loss_clip": 0.01165085, "auxiliary_loss_mlp": 0.01026704, "balance_loss_clip": 1.05436897, "balance_loss_mlp": 1.01850247, "epoch": 0.27150844706306737, "flos": 23039317025280.0, "grad_norm": 1.6531973959618445, "language_loss": 0.74408913, "learning_rate": 3.4188877408884063e-06, "loss": 0.76600707, "num_input_tokens_seen": 48461845, "step": 2258, "time_per_iteration": 2.6798529624938965 }, { "auxiliary_loss_clip": 0.01158911, "auxiliary_loss_mlp": 0.01029327, "balance_loss_clip": 1.05300474, "balance_loss_mlp": 1.02079129, "epoch": 0.2716286899537065, "flos": 22563644192640.0, "grad_norm": 2.546177353345084, "language_loss": 0.65630203, "learning_rate": 3.4183386437094088e-06, "loss": 0.67818439, "num_input_tokens_seen": 48478510, "step": 2259, "time_per_iteration": 2.6185014247894287 }, { "auxiliary_loss_clip": 0.01164092, "auxiliary_loss_mlp": 0.0102529, "balance_loss_clip": 1.05042934, "balance_loss_mlp": 1.01596224, "epoch": 0.2717489328443456, "flos": 13115044523520.0, "grad_norm": 8.024541204118503, "language_loss": 0.82452095, "learning_rate": 3.417789331369565e-06, "loss": 0.8464148, "num_input_tokens_seen": 48494300, "step": 2260, "time_per_iteration": 2.6315529346466064 }, { "auxiliary_loss_clip": 0.01200826, "auxiliary_loss_mlp": 0.01028466, "balance_loss_clip": 1.05790019, "balance_loss_mlp": 1.01947761, "epoch": 0.27186917573498465, "flos": 29278688060160.0, "grad_norm": 2.208801119403613, "language_loss": 0.90996492, "learning_rate": 3.4172398039522088e-06, "loss": 0.93225777, "num_input_tokens_seen": 48515585, "step": 2261, "time_per_iteration": 2.7129647731781006 }, { "auxiliary_loss_clip": 0.01181627, "auxiliary_loss_mlp": 0.01029891, "balance_loss_clip": 1.05362451, "balance_loss_mlp": 1.02075315, "epoch": 0.27198941862562376, "flos": 26032220000640.0, "grad_norm": 2.5973898822574295, "language_loss": 0.79877049, "learning_rate": 3.4166900615407e-06, "loss": 0.82088566, "num_input_tokens_seen": 48533500, "step": 2262, "time_per_iteration": 2.694749593734741 }, { "auxiliary_loss_clip": 0.01180117, "auxiliary_loss_mlp": 0.01028925, "balance_loss_clip": 1.05456781, "balance_loss_mlp": 1.01982296, "epoch": 0.27210966151626287, "flos": 32780983760640.0, "grad_norm": 1.8450583488542343, "language_loss": 0.74840701, "learning_rate": 3.416140104218436e-06, "loss": 0.77049744, "num_input_tokens_seen": 48552865, "step": 2263, "time_per_iteration": 2.6936910152435303 }, { "auxiliary_loss_clip": 0.01096205, "auxiliary_loss_mlp": 0.0070422, "balance_loss_clip": 1.03681839, "balance_loss_mlp": 1.00069678, "epoch": 0.2722299044069019, "flos": 65471043219840.0, "grad_norm": 0.8401346464657873, "language_loss": 0.69733095, "learning_rate": 3.4155899320688437e-06, "loss": 0.71533513, "num_input_tokens_seen": 48618940, "step": 2264, "time_per_iteration": 3.2621636390686035 }, { "auxiliary_loss_clip": 0.01122104, "auxiliary_loss_mlp": 0.01028736, "balance_loss_clip": 1.05098915, "balance_loss_mlp": 1.01993275, "epoch": 0.27235014729754103, "flos": 15334143782400.0, "grad_norm": 2.5067431944130796, "language_loss": 0.74262154, "learning_rate": 3.415039545175384e-06, "loss": 0.76412994, "num_input_tokens_seen": 48634665, "step": 2265, "time_per_iteration": 2.703080177307129 }, { "auxiliary_loss_clip": 0.01184359, "auxiliary_loss_mlp": 0.01026159, "balance_loss_clip": 1.05516338, "balance_loss_mlp": 1.01727176, "epoch": 0.27247039018818014, "flos": 21872363973120.0, "grad_norm": 2.5563161491550983, "language_loss": 0.65276134, "learning_rate": 3.414488943621551e-06, "loss": 0.67486656, "num_input_tokens_seen": 48653330, "step": 2266, "time_per_iteration": 2.610626697540283 }, { "auxiliary_loss_clip": 0.01182944, "auxiliary_loss_mlp": 0.01036611, "balance_loss_clip": 1.05777168, "balance_loss_mlp": 1.02739608, "epoch": 0.2725906330788192, "flos": 18695490514560.0, "grad_norm": 1.8120727441218794, "language_loss": 0.73644495, "learning_rate": 3.41393812749087e-06, "loss": 0.75864041, "num_input_tokens_seen": 48671375, "step": 2267, "time_per_iteration": 2.6158783435821533 }, { "auxiliary_loss_clip": 0.0116253, "auxiliary_loss_mlp": 0.01034433, "balance_loss_clip": 1.05469596, "balance_loss_mlp": 1.02592778, "epoch": 0.2727108759694583, "flos": 17886099398400.0, "grad_norm": 2.737742218703258, "language_loss": 0.72242558, "learning_rate": 3.4133870968668984e-06, "loss": 0.7443952, "num_input_tokens_seen": 48686175, "step": 2268, "time_per_iteration": 2.569082736968994 }, { "auxiliary_loss_clip": 0.01165521, "auxiliary_loss_mlp": 0.01031231, "balance_loss_clip": 1.05395222, "balance_loss_mlp": 1.02206957, "epoch": 0.2728311188600974, "flos": 24461666755200.0, "grad_norm": 3.0606291843486635, "language_loss": 0.78259569, "learning_rate": 3.412835851833229e-06, "loss": 0.80456328, "num_input_tokens_seen": 48708370, "step": 2269, "time_per_iteration": 2.716832399368286 }, { "auxiliary_loss_clip": 0.01183235, "auxiliary_loss_mlp": 0.01034444, "balance_loss_clip": 1.05989122, "balance_loss_mlp": 1.02611172, "epoch": 0.2729513617507365, "flos": 30993314757120.0, "grad_norm": 1.990505449253445, "language_loss": 0.77649289, "learning_rate": 3.4122843924734834e-06, "loss": 0.7986697, "num_input_tokens_seen": 48730670, "step": 2270, "time_per_iteration": 2.6615352630615234 }, { "auxiliary_loss_clip": 0.01160314, "auxiliary_loss_mlp": 0.01035247, "balance_loss_clip": 1.05348325, "balance_loss_mlp": 1.02594829, "epoch": 0.2730716046413756, "flos": 19094637421440.0, "grad_norm": 2.1071604647431266, "language_loss": 0.88099146, "learning_rate": 3.411732718871319e-06, "loss": 0.90294707, "num_input_tokens_seen": 48746510, "step": 2271, "time_per_iteration": 2.68962025642395 }, { "auxiliary_loss_clip": 0.01199168, "auxiliary_loss_mlp": 0.01029508, "balance_loss_clip": 1.06093001, "balance_loss_mlp": 1.02139568, "epoch": 0.27319184753201464, "flos": 26944566474240.0, "grad_norm": 1.9111240240055833, "language_loss": 0.78587425, "learning_rate": 3.4111808311104227e-06, "loss": 0.8081609, "num_input_tokens_seen": 48768825, "step": 2272, "time_per_iteration": 2.636989116668701 }, { "auxiliary_loss_clip": 0.01170725, "auxiliary_loss_mlp": 0.01032438, "balance_loss_clip": 1.05197763, "balance_loss_mlp": 1.02335441, "epoch": 0.27331209042265375, "flos": 31759828012800.0, "grad_norm": 1.698381132788969, "language_loss": 0.69306386, "learning_rate": 3.410628729274517e-06, "loss": 0.71509546, "num_input_tokens_seen": 48790345, "step": 2273, "time_per_iteration": 2.754784345626831 }, { "auxiliary_loss_clip": 0.01161069, "auxiliary_loss_mlp": 0.0071298, "balance_loss_clip": 1.05373645, "balance_loss_mlp": 1.00099409, "epoch": 0.27343233331329286, "flos": 25739081107200.0, "grad_norm": 1.8636402867033077, "language_loss": 0.82610345, "learning_rate": 3.4100764134473546e-06, "loss": 0.84484398, "num_input_tokens_seen": 48809630, "step": 2274, "time_per_iteration": 2.677182197570801 }, { "auxiliary_loss_clip": 0.01197412, "auxiliary_loss_mlp": 0.01032299, "balance_loss_clip": 1.0579145, "balance_loss_mlp": 1.02369785, "epoch": 0.2735525762039319, "flos": 24389414547840.0, "grad_norm": 2.2875609235186567, "language_loss": 0.85098869, "learning_rate": 3.4095238837127215e-06, "loss": 0.87328577, "num_input_tokens_seen": 48828770, "step": 2275, "time_per_iteration": 2.581907272338867 }, { "auxiliary_loss_clip": 0.01147095, "auxiliary_loss_mlp": 0.01029392, "balance_loss_clip": 1.05147219, "balance_loss_mlp": 1.02048683, "epoch": 0.27367281909457103, "flos": 14465357527680.0, "grad_norm": 1.9905879991125452, "language_loss": 0.79591191, "learning_rate": 3.4089711401544355e-06, "loss": 0.81767666, "num_input_tokens_seen": 48846365, "step": 2276, "time_per_iteration": 3.595738410949707 }, { "auxiliary_loss_clip": 0.0118062, "auxiliary_loss_mlp": 0.01035428, "balance_loss_clip": 1.05466783, "balance_loss_mlp": 1.02638018, "epoch": 0.27379306198521014, "flos": 23476996247040.0, "grad_norm": 2.500893435501235, "language_loss": 0.67823523, "learning_rate": 3.4084181828563486e-06, "loss": 0.7003957, "num_input_tokens_seen": 48863085, "step": 2277, "time_per_iteration": 2.610836982727051 }, { "auxiliary_loss_clip": 0.0112776, "auxiliary_loss_mlp": 0.01037876, "balance_loss_clip": 1.04950547, "balance_loss_mlp": 1.02905488, "epoch": 0.2739133048758492, "flos": 17458152762240.0, "grad_norm": 1.808209100037763, "language_loss": 0.70418632, "learning_rate": 3.4078650119023428e-06, "loss": 0.72584271, "num_input_tokens_seen": 48881400, "step": 2278, "time_per_iteration": 3.6052191257476807 }, { "auxiliary_loss_clip": 0.01119257, "auxiliary_loss_mlp": 0.0103454, "balance_loss_clip": 1.04732752, "balance_loss_mlp": 1.02531266, "epoch": 0.2740335477664883, "flos": 19273113123840.0, "grad_norm": 2.5143440512669475, "language_loss": 0.73944211, "learning_rate": 3.4073116273763337e-06, "loss": 0.76098013, "num_input_tokens_seen": 48895845, "step": 2279, "time_per_iteration": 2.709552764892578 }, { "auxiliary_loss_clip": 0.01170448, "auxiliary_loss_mlp": 0.01028604, "balance_loss_clip": 1.05462766, "balance_loss_mlp": 1.01877546, "epoch": 0.2741537906571274, "flos": 26104723603200.0, "grad_norm": 6.591077454954256, "language_loss": 0.81145322, "learning_rate": 3.40675802936227e-06, "loss": 0.83344376, "num_input_tokens_seen": 48916630, "step": 2280, "time_per_iteration": 2.67912220954895 }, { "auxiliary_loss_clip": 0.01159596, "auxiliary_loss_mlp": 0.01030464, "balance_loss_clip": 1.05525661, "balance_loss_mlp": 1.02168989, "epoch": 0.27427403354776647, "flos": 34164190644480.0, "grad_norm": 1.8631067900372242, "language_loss": 0.71878111, "learning_rate": 3.4062042179441318e-06, "loss": 0.74068171, "num_input_tokens_seen": 48937100, "step": 2281, "time_per_iteration": 3.8220736980438232 }, { "auxiliary_loss_clip": 0.01180026, "auxiliary_loss_mlp": 0.01031301, "balance_loss_clip": 1.05809116, "balance_loss_mlp": 1.02273631, "epoch": 0.2743942764384056, "flos": 18766988536320.0, "grad_norm": 2.0371010528490476, "language_loss": 0.80654287, "learning_rate": 3.4056501932059314e-06, "loss": 0.8286562, "num_input_tokens_seen": 48955175, "step": 2282, "time_per_iteration": 2.5929369926452637 }, { "auxiliary_loss_clip": 0.01127244, "auxiliary_loss_mlp": 0.01035738, "balance_loss_clip": 1.04319072, "balance_loss_mlp": 1.03251922, "epoch": 0.2745145193290447, "flos": 64904048058240.0, "grad_norm": 1.164735308026092, "language_loss": 0.58102548, "learning_rate": 3.405095955231715e-06, "loss": 0.60265529, "num_input_tokens_seen": 49006830, "step": 2283, "time_per_iteration": 3.1254875659942627 }, { "auxiliary_loss_clip": 0.01184962, "auxiliary_loss_mlp": 0.01029495, "balance_loss_clip": 1.05465376, "balance_loss_mlp": 1.02111411, "epoch": 0.27463476221968375, "flos": 16136926796160.0, "grad_norm": 3.263587123816308, "language_loss": 0.94117785, "learning_rate": 3.4045415041055585e-06, "loss": 0.9633224, "num_input_tokens_seen": 49022470, "step": 2284, "time_per_iteration": 2.571162462234497 }, { "auxiliary_loss_clip": 0.01169742, "auxiliary_loss_mlp": 0.01032094, "balance_loss_clip": 1.05429685, "balance_loss_mlp": 1.02314091, "epoch": 0.27475500511032286, "flos": 10376712213120.0, "grad_norm": 2.421943489188237, "language_loss": 0.78089166, "learning_rate": 3.4039868399115728e-06, "loss": 0.80290997, "num_input_tokens_seen": 49037110, "step": 2285, "time_per_iteration": 2.669959783554077 }, { "auxiliary_loss_clip": 0.01119556, "auxiliary_loss_mlp": 0.01040448, "balance_loss_clip": 1.05241799, "balance_loss_mlp": 1.03175807, "epoch": 0.27487524800096197, "flos": 17311062568320.0, "grad_norm": 1.8781629428270574, "language_loss": 0.80493617, "learning_rate": 3.4034319627339003e-06, "loss": 0.82653624, "num_input_tokens_seen": 49053975, "step": 2286, "time_per_iteration": 2.6949195861816406 }, { "auxiliary_loss_clip": 0.01168087, "auxiliary_loss_mlp": 0.0103597, "balance_loss_clip": 1.05597353, "balance_loss_mlp": 1.02618337, "epoch": 0.274995490891601, "flos": 27120205002240.0, "grad_norm": 2.42882540596662, "language_loss": 0.69446731, "learning_rate": 3.402876872656715e-06, "loss": 0.71650791, "num_input_tokens_seen": 49072295, "step": 2287, "time_per_iteration": 2.6599273681640625 }, { "auxiliary_loss_clip": 0.01163804, "auxiliary_loss_mlp": 0.01030907, "balance_loss_clip": 1.0561645, "balance_loss_mlp": 1.02236545, "epoch": 0.27511573378224013, "flos": 23436093634560.0, "grad_norm": 2.2143867257058196, "language_loss": 0.89721245, "learning_rate": 3.402321569764223e-06, "loss": 0.91915953, "num_input_tokens_seen": 49091600, "step": 2288, "time_per_iteration": 2.6424131393432617 }, { "auxiliary_loss_clip": 0.01134754, "auxiliary_loss_mlp": 0.00712708, "balance_loss_clip": 1.05032897, "balance_loss_mlp": 1.00106525, "epoch": 0.2752359766728792, "flos": 16722019434240.0, "grad_norm": 1.8128768831993745, "language_loss": 0.83431184, "learning_rate": 3.4017660541406635e-06, "loss": 0.85278642, "num_input_tokens_seen": 49107665, "step": 2289, "time_per_iteration": 2.6962685585021973 }, { "auxiliary_loss_clip": 0.01171595, "auxiliary_loss_mlp": 0.01038764, "balance_loss_clip": 1.05337036, "balance_loss_mlp": 1.02912593, "epoch": 0.2753562195635183, "flos": 25297738698240.0, "grad_norm": 1.7315283151158172, "language_loss": 0.74340677, "learning_rate": 3.4012103258703092e-06, "loss": 0.76551032, "num_input_tokens_seen": 49126420, "step": 2290, "time_per_iteration": 2.6861531734466553 }, { "auxiliary_loss_clip": 0.01148902, "auxiliary_loss_mlp": 0.01034356, "balance_loss_clip": 1.05142713, "balance_loss_mlp": 1.0259037, "epoch": 0.2754764624541574, "flos": 27338972785920.0, "grad_norm": 2.0379160291530467, "language_loss": 0.82945418, "learning_rate": 3.4006543850374616e-06, "loss": 0.85128677, "num_input_tokens_seen": 49141470, "step": 2291, "time_per_iteration": 2.709010601043701 }, { "auxiliary_loss_clip": 0.01183315, "auxiliary_loss_mlp": 0.01034978, "balance_loss_clip": 1.05460131, "balance_loss_mlp": 1.02638292, "epoch": 0.27559670534479647, "flos": 17238379397760.0, "grad_norm": 2.062189971819152, "language_loss": 0.7515505, "learning_rate": 3.400098231726458e-06, "loss": 0.77373338, "num_input_tokens_seen": 49158570, "step": 2292, "time_per_iteration": 2.61283278465271 }, { "auxiliary_loss_clip": 0.01152017, "auxiliary_loss_mlp": 0.01033544, "balance_loss_clip": 1.04928255, "balance_loss_mlp": 1.02389431, "epoch": 0.2757169482354356, "flos": 21939085486080.0, "grad_norm": 2.47697649717562, "language_loss": 0.8674798, "learning_rate": 3.3995418660216657e-06, "loss": 0.88933545, "num_input_tokens_seen": 49176025, "step": 2293, "time_per_iteration": 2.659482717514038 }, { "auxiliary_loss_clip": 0.01201775, "auxiliary_loss_mlp": 0.01030392, "balance_loss_clip": 1.05589557, "balance_loss_mlp": 1.02040875, "epoch": 0.2758371911260747, "flos": 20850669521280.0, "grad_norm": 2.5445644323489836, "language_loss": 0.80407459, "learning_rate": 3.3989852880074848e-06, "loss": 0.82639623, "num_input_tokens_seen": 49197455, "step": 2294, "time_per_iteration": 2.6753833293914795 }, { "auxiliary_loss_clip": 0.01088908, "auxiliary_loss_mlp": 0.0100769, "balance_loss_clip": 1.04179192, "balance_loss_mlp": 1.0055083, "epoch": 0.27595743401671374, "flos": 69269063592960.0, "grad_norm": 0.7472296597091843, "language_loss": 0.60558325, "learning_rate": 3.398428497768348e-06, "loss": 0.62654918, "num_input_tokens_seen": 49262625, "step": 2295, "time_per_iteration": 3.3166351318359375 }, { "auxiliary_loss_clip": 0.011556, "auxiliary_loss_mlp": 0.01032035, "balance_loss_clip": 1.05006647, "balance_loss_mlp": 1.02267146, "epoch": 0.27607767690735285, "flos": 21215019127680.0, "grad_norm": 1.7669214084435352, "language_loss": 0.71921074, "learning_rate": 3.3978714953887205e-06, "loss": 0.74108708, "num_input_tokens_seen": 49282380, "step": 2296, "time_per_iteration": 2.676903486251831 }, { "auxiliary_loss_clip": 0.01116462, "auxiliary_loss_mlp": 0.01033905, "balance_loss_clip": 1.04336536, "balance_loss_mlp": 1.02494001, "epoch": 0.27619791979799196, "flos": 24825334003200.0, "grad_norm": 1.7024848250086233, "language_loss": 0.86159015, "learning_rate": 3.397314280953098e-06, "loss": 0.88309383, "num_input_tokens_seen": 49303205, "step": 2297, "time_per_iteration": 2.704285144805908 }, { "auxiliary_loss_clip": 0.01158226, "auxiliary_loss_mlp": 0.01035012, "balance_loss_clip": 1.05234683, "balance_loss_mlp": 1.02591634, "epoch": 0.276318162688631, "flos": 24753548672640.0, "grad_norm": 2.0023212431588466, "language_loss": 0.80467439, "learning_rate": 3.3967568545460108e-06, "loss": 0.82660675, "num_input_tokens_seen": 49322745, "step": 2298, "time_per_iteration": 2.6615357398986816 }, { "auxiliary_loss_clip": 0.01175547, "auxiliary_loss_mlp": 0.01027358, "balance_loss_clip": 1.05435336, "balance_loss_mlp": 1.01844144, "epoch": 0.27643840557927013, "flos": 18150007599360.0, "grad_norm": 2.687173394153017, "language_loss": 0.80724502, "learning_rate": 3.3961992162520185e-06, "loss": 0.82927406, "num_input_tokens_seen": 49341370, "step": 2299, "time_per_iteration": 2.594985008239746 }, { "auxiliary_loss_clip": 0.01179695, "auxiliary_loss_mlp": 0.01034755, "balance_loss_clip": 1.05464149, "balance_loss_mlp": 1.0256592, "epoch": 0.27655864846990924, "flos": 24823933372800.0, "grad_norm": 2.0496435187217426, "language_loss": 0.71714824, "learning_rate": 3.3956413661557156e-06, "loss": 0.73929274, "num_input_tokens_seen": 49361545, "step": 2300, "time_per_iteration": 2.641282320022583 }, { "auxiliary_loss_clip": 0.0115807, "auxiliary_loss_mlp": 0.01034253, "balance_loss_clip": 1.05213046, "balance_loss_mlp": 1.02542567, "epoch": 0.2766788913605483, "flos": 20266582464000.0, "grad_norm": 2.342275154413283, "language_loss": 0.66457611, "learning_rate": 3.3950833043417273e-06, "loss": 0.68649936, "num_input_tokens_seen": 49379690, "step": 2301, "time_per_iteration": 2.6621217727661133 }, { "auxiliary_loss_clip": 0.01184624, "auxiliary_loss_mlp": 0.01029125, "balance_loss_clip": 1.05887151, "balance_loss_mlp": 1.02013659, "epoch": 0.2767991342511874, "flos": 21470272151040.0, "grad_norm": 3.664792842008677, "language_loss": 0.7352916, "learning_rate": 3.3945250308947105e-06, "loss": 0.75742912, "num_input_tokens_seen": 49395995, "step": 2302, "time_per_iteration": 4.401453495025635 }, { "auxiliary_loss_clip": 0.0111316, "auxiliary_loss_mlp": 0.01006656, "balance_loss_clip": 1.04153442, "balance_loss_mlp": 1.00355613, "epoch": 0.2769193771418265, "flos": 66002627571840.0, "grad_norm": 1.1827754274078561, "language_loss": 0.68332773, "learning_rate": 3.3939665458993556e-06, "loss": 0.70452589, "num_input_tokens_seen": 49450415, "step": 2303, "time_per_iteration": 3.1877923011779785 }, { "auxiliary_loss_clip": 0.0115599, "auxiliary_loss_mlp": 0.01033365, "balance_loss_clip": 1.05234993, "balance_loss_mlp": 1.02488899, "epoch": 0.27703962003246557, "flos": 20704441253760.0, "grad_norm": 2.161857727352798, "language_loss": 0.77049446, "learning_rate": 3.3934078494403843e-06, "loss": 0.79238796, "num_input_tokens_seen": 49469990, "step": 2304, "time_per_iteration": 3.556288719177246 }, { "auxiliary_loss_clip": 0.010984, "auxiliary_loss_mlp": 0.007135, "balance_loss_clip": 1.04925704, "balance_loss_mlp": 1.00109458, "epoch": 0.2771598629231047, "flos": 22929897219840.0, "grad_norm": 1.9306937361085088, "language_loss": 0.81424761, "learning_rate": 3.3928489416025495e-06, "loss": 0.83236659, "num_input_tokens_seen": 49490835, "step": 2305, "time_per_iteration": 2.8326523303985596 }, { "auxiliary_loss_clip": 0.01164313, "auxiliary_loss_mlp": 0.01033814, "balance_loss_clip": 1.0543623, "balance_loss_mlp": 1.02486122, "epoch": 0.27728010581374374, "flos": 18369457741440.0, "grad_norm": 2.179952120627867, "language_loss": 0.79069984, "learning_rate": 3.392289822470638e-06, "loss": 0.81268108, "num_input_tokens_seen": 49508815, "step": 2306, "time_per_iteration": 2.661774158477783 }, { "auxiliary_loss_clip": 0.01163401, "auxiliary_loss_mlp": 0.01031353, "balance_loss_clip": 1.05308533, "balance_loss_mlp": 1.02253139, "epoch": 0.27740034870438285, "flos": 19427637432960.0, "grad_norm": 3.1833425664578985, "language_loss": 0.7609092, "learning_rate": 3.3917304921294674e-06, "loss": 0.78285676, "num_input_tokens_seen": 49526980, "step": 2307, "time_per_iteration": 3.4049203395843506 }, { "auxiliary_loss_clip": 0.01180046, "auxiliary_loss_mlp": 0.01030115, "balance_loss_clip": 1.05321646, "balance_loss_mlp": 1.02088785, "epoch": 0.27752059159502196, "flos": 21614776565760.0, "grad_norm": 2.0737996467583013, "language_loss": 0.80823958, "learning_rate": 3.3911709506638876e-06, "loss": 0.83034122, "num_input_tokens_seen": 49546290, "step": 2308, "time_per_iteration": 2.595858573913574 }, { "auxiliary_loss_clip": 0.01135312, "auxiliary_loss_mlp": 0.00712834, "balance_loss_clip": 1.04670787, "balance_loss_mlp": 1.00094151, "epoch": 0.277640834485661, "flos": 26608011016320.0, "grad_norm": 4.382916203884769, "language_loss": 0.81553149, "learning_rate": 3.390611198158781e-06, "loss": 0.83401293, "num_input_tokens_seen": 49564165, "step": 2309, "time_per_iteration": 2.7465970516204834 }, { "auxiliary_loss_clip": 0.01200434, "auxiliary_loss_mlp": 0.01035169, "balance_loss_clip": 1.05804121, "balance_loss_mlp": 1.02516699, "epoch": 0.2777610773763001, "flos": 19492814661120.0, "grad_norm": 2.174375142183059, "language_loss": 0.89877367, "learning_rate": 3.3900512346990612e-06, "loss": 0.9211297, "num_input_tokens_seen": 49580155, "step": 2310, "time_per_iteration": 2.5344295501708984 }, { "auxiliary_loss_clip": 0.01131172, "auxiliary_loss_mlp": 0.010264, "balance_loss_clip": 1.04666889, "balance_loss_mlp": 1.01677418, "epoch": 0.27788132026693924, "flos": 38290650001920.0, "grad_norm": 1.8783652938443092, "language_loss": 0.65553308, "learning_rate": 3.389491060369674e-06, "loss": 0.67710888, "num_input_tokens_seen": 49605830, "step": 2311, "time_per_iteration": 2.881859064102173 }, { "auxiliary_loss_clip": 0.0112551, "auxiliary_loss_mlp": 0.01030921, "balance_loss_clip": 1.04973888, "balance_loss_mlp": 1.02189124, "epoch": 0.2780015631575783, "flos": 22382546797440.0, "grad_norm": 2.0070344512948948, "language_loss": 0.89215255, "learning_rate": 3.388930675255598e-06, "loss": 0.91371685, "num_input_tokens_seen": 49625680, "step": 2312, "time_per_iteration": 2.673800230026245 }, { "auxiliary_loss_clip": 0.011681, "auxiliary_loss_mlp": 0.01027939, "balance_loss_clip": 1.05262947, "balance_loss_mlp": 1.01775837, "epoch": 0.2781218060482174, "flos": 12203200840320.0, "grad_norm": 4.6473180071857945, "language_loss": 0.79639387, "learning_rate": 3.388370079441843e-06, "loss": 0.81835431, "num_input_tokens_seen": 49641195, "step": 2313, "time_per_iteration": 2.627930164337158 }, { "auxiliary_loss_clip": 0.01152966, "auxiliary_loss_mlp": 0.0103131, "balance_loss_clip": 1.05644536, "balance_loss_mlp": 1.0220952, "epoch": 0.2782420489388565, "flos": 18107632529280.0, "grad_norm": 1.955324592657311, "language_loss": 0.92708802, "learning_rate": 3.3878092730134505e-06, "loss": 0.9489308, "num_input_tokens_seen": 49659180, "step": 2314, "time_per_iteration": 2.6288514137268066 }, { "auxiliary_loss_clip": 0.01175155, "auxiliary_loss_mlp": 0.010317, "balance_loss_clip": 1.05531192, "balance_loss_mlp": 1.02181184, "epoch": 0.27836229182949557, "flos": 18514752255360.0, "grad_norm": 2.048286750596755, "language_loss": 0.80561727, "learning_rate": 3.3872482560554947e-06, "loss": 0.82768583, "num_input_tokens_seen": 49677955, "step": 2315, "time_per_iteration": 2.5877890586853027 }, { "auxiliary_loss_clip": 0.01101194, "auxiliary_loss_mlp": 0.01016058, "balance_loss_clip": 1.03231108, "balance_loss_mlp": 1.0128392, "epoch": 0.2784825347201347, "flos": 67079230940160.0, "grad_norm": 0.8050836952783035, "language_loss": 0.56979227, "learning_rate": 3.386687028653082e-06, "loss": 0.59096479, "num_input_tokens_seen": 49740800, "step": 2316, "time_per_iteration": 3.214683771133423 }, { "auxiliary_loss_clip": 0.0113395, "auxiliary_loss_mlp": 0.01030998, "balance_loss_clip": 1.05163634, "balance_loss_mlp": 1.02171779, "epoch": 0.2786027776107738, "flos": 22631119891200.0, "grad_norm": 1.8433275800866684, "language_loss": 0.85089922, "learning_rate": 3.386125590891349e-06, "loss": 0.8725487, "num_input_tokens_seen": 49757675, "step": 2317, "time_per_iteration": 2.7324094772338867 }, { "auxiliary_loss_clip": 0.01155124, "auxiliary_loss_mlp": 0.01028253, "balance_loss_clip": 1.05200863, "balance_loss_mlp": 1.01999807, "epoch": 0.27872302050141284, "flos": 15778826156160.0, "grad_norm": 1.9989913987039896, "language_loss": 0.82819486, "learning_rate": 3.3855639428554657e-06, "loss": 0.85002857, "num_input_tokens_seen": 49775205, "step": 2318, "time_per_iteration": 2.6305413246154785 }, { "auxiliary_loss_clip": 0.01137796, "auxiliary_loss_mlp": 0.01029155, "balance_loss_clip": 1.04968596, "balance_loss_mlp": 1.02005911, "epoch": 0.27884326339205195, "flos": 22126970551680.0, "grad_norm": 2.0071945260644175, "language_loss": 0.80449164, "learning_rate": 3.385002084630635e-06, "loss": 0.82616115, "num_input_tokens_seen": 49794175, "step": 2319, "time_per_iteration": 2.727024793624878 }, { "auxiliary_loss_clip": 0.01188334, "auxiliary_loss_mlp": 0.01028598, "balance_loss_clip": 1.05705547, "balance_loss_mlp": 1.01880527, "epoch": 0.278963506282691, "flos": 20558715776640.0, "grad_norm": 2.1772883143934174, "language_loss": 0.84553325, "learning_rate": 3.384440016302088e-06, "loss": 0.8677026, "num_input_tokens_seen": 49812850, "step": 2320, "time_per_iteration": 2.6365630626678467 }, { "auxiliary_loss_clip": 0.01177304, "auxiliary_loss_mlp": 0.01031262, "balance_loss_clip": 1.05493009, "balance_loss_mlp": 1.02208304, "epoch": 0.2790837491733301, "flos": 21942928241280.0, "grad_norm": 2.0722762099748055, "language_loss": 0.62209821, "learning_rate": 3.3838777379550923e-06, "loss": 0.64418387, "num_input_tokens_seen": 49832295, "step": 2321, "time_per_iteration": 2.632857084274292 }, { "auxiliary_loss_clip": 0.01169042, "auxiliary_loss_mlp": 0.01029532, "balance_loss_clip": 1.05597413, "balance_loss_mlp": 1.0204128, "epoch": 0.27920399206396923, "flos": 26286790665600.0, "grad_norm": 2.1943289865141105, "language_loss": 0.78136933, "learning_rate": 3.383315249674944e-06, "loss": 0.80335504, "num_input_tokens_seen": 49850860, "step": 2322, "time_per_iteration": 2.717973232269287 }, { "auxiliary_loss_clip": 0.0115236, "auxiliary_loss_mlp": 0.01040155, "balance_loss_clip": 1.05483902, "balance_loss_mlp": 1.03128624, "epoch": 0.2793242349546083, "flos": 25400981364480.0, "grad_norm": 3.208763721196978, "language_loss": 0.85959566, "learning_rate": 3.3827525515469715e-06, "loss": 0.88152081, "num_input_tokens_seen": 49865765, "step": 2323, "time_per_iteration": 2.6835405826568604 }, { "auxiliary_loss_clip": 0.01140904, "auxiliary_loss_mlp": 0.01029556, "balance_loss_clip": 1.049757, "balance_loss_mlp": 1.0206753, "epoch": 0.2794444778452474, "flos": 20850346298880.0, "grad_norm": 2.825795085201793, "language_loss": 0.70952213, "learning_rate": 3.3821896436565367e-06, "loss": 0.73122668, "num_input_tokens_seen": 49885425, "step": 2324, "time_per_iteration": 2.733396053314209 }, { "auxiliary_loss_clip": 0.01187861, "auxiliary_loss_mlp": 0.01030821, "balance_loss_clip": 1.06050467, "balance_loss_mlp": 1.02167726, "epoch": 0.2795647207358865, "flos": 21576244250880.0, "grad_norm": 1.8338337284265098, "language_loss": 0.70876515, "learning_rate": 3.381626526089032e-06, "loss": 0.73095196, "num_input_tokens_seen": 49904990, "step": 2325, "time_per_iteration": 2.619157314300537 }, { "auxiliary_loss_clip": 0.0116101, "auxiliary_loss_mlp": 0.01027858, "balance_loss_clip": 1.05165327, "balance_loss_mlp": 1.01845837, "epoch": 0.27968496362652556, "flos": 21471744608640.0, "grad_norm": 2.177205903228475, "language_loss": 0.78767073, "learning_rate": 3.3810631989298815e-06, "loss": 0.80955946, "num_input_tokens_seen": 49924600, "step": 2326, "time_per_iteration": 2.695211887359619 }, { "auxiliary_loss_clip": 0.01138139, "auxiliary_loss_mlp": 0.01031192, "balance_loss_clip": 1.05271065, "balance_loss_mlp": 1.02181005, "epoch": 0.2798052065171647, "flos": 23258695340160.0, "grad_norm": 2.187775678053507, "language_loss": 0.84441316, "learning_rate": 3.3804996622645423e-06, "loss": 0.86610651, "num_input_tokens_seen": 49942600, "step": 2327, "time_per_iteration": 2.7326409816741943 }, { "auxiliary_loss_clip": 0.01200248, "auxiliary_loss_mlp": 0.01032654, "balance_loss_clip": 1.05901444, "balance_loss_mlp": 1.02328455, "epoch": 0.2799254494078038, "flos": 21539328048000.0, "grad_norm": 4.094390623210936, "language_loss": 0.89485896, "learning_rate": 3.3799359161785015e-06, "loss": 0.91718799, "num_input_tokens_seen": 49962250, "step": 2328, "time_per_iteration": 3.615081787109375 }, { "auxiliary_loss_clip": 0.01180597, "auxiliary_loss_mlp": 0.01027289, "balance_loss_clip": 1.05612874, "balance_loss_mlp": 1.01750159, "epoch": 0.28004569229844284, "flos": 26393912000640.0, "grad_norm": 1.796463292802837, "language_loss": 0.85774875, "learning_rate": 3.3793719607572798e-06, "loss": 0.87982762, "num_input_tokens_seen": 49983215, "step": 2329, "time_per_iteration": 3.54256534576416 }, { "auxiliary_loss_clip": 0.01148376, "auxiliary_loss_mlp": 0.01027054, "balance_loss_clip": 1.04960346, "balance_loss_mlp": 1.01780295, "epoch": 0.28016593518908195, "flos": 33547676584320.0, "grad_norm": 2.3547418242298406, "language_loss": 0.76889348, "learning_rate": 3.378807796086428e-06, "loss": 0.79064775, "num_input_tokens_seen": 50006075, "step": 2330, "time_per_iteration": 3.5321640968322754 }, { "auxiliary_loss_clip": 0.01198462, "auxiliary_loss_mlp": 0.01028172, "balance_loss_clip": 1.05920255, "balance_loss_mlp": 1.01867676, "epoch": 0.28028617807972106, "flos": 15340823712000.0, "grad_norm": 2.679549762359801, "language_loss": 0.76991665, "learning_rate": 3.37824342225153e-06, "loss": 0.79218298, "num_input_tokens_seen": 50022495, "step": 2331, "time_per_iteration": 2.6171579360961914 }, { "auxiliary_loss_clip": 0.01135378, "auxiliary_loss_mlp": 0.01029266, "balance_loss_clip": 1.05223846, "balance_loss_mlp": 1.02037323, "epoch": 0.2804064209703601, "flos": 25520277409920.0, "grad_norm": 2.608531650652872, "language_loss": 0.77626252, "learning_rate": 3.3776788393382006e-06, "loss": 0.7979089, "num_input_tokens_seen": 50041975, "step": 2332, "time_per_iteration": 2.7517788410186768 }, { "auxiliary_loss_clip": 0.01198973, "auxiliary_loss_mlp": 0.0103371, "balance_loss_clip": 1.05836463, "balance_loss_mlp": 1.02380943, "epoch": 0.2805266638609992, "flos": 29351766280320.0, "grad_norm": 2.835085737370056, "language_loss": 0.76632893, "learning_rate": 3.3771140474320872e-06, "loss": 0.78865576, "num_input_tokens_seen": 50061925, "step": 2333, "time_per_iteration": 3.557053327560425 }, { "auxiliary_loss_clip": 0.01161341, "auxiliary_loss_mlp": 0.01035649, "balance_loss_clip": 1.05754685, "balance_loss_mlp": 1.02574825, "epoch": 0.28064690675163834, "flos": 21463735875840.0, "grad_norm": 1.7964032426655083, "language_loss": 0.79551703, "learning_rate": 3.3765490466188664e-06, "loss": 0.81748694, "num_input_tokens_seen": 50079325, "step": 2334, "time_per_iteration": 2.7794554233551025 }, { "auxiliary_loss_clip": 0.01148301, "auxiliary_loss_mlp": 0.01028682, "balance_loss_clip": 1.0538404, "balance_loss_mlp": 1.01905608, "epoch": 0.2807671496422774, "flos": 20995640812800.0, "grad_norm": 6.136495070318198, "language_loss": 0.74322355, "learning_rate": 3.3759838369842508e-06, "loss": 0.76499343, "num_input_tokens_seen": 50097400, "step": 2335, "time_per_iteration": 2.6645729541778564 }, { "auxiliary_loss_clip": 0.01151548, "auxiliary_loss_mlp": 0.01032541, "balance_loss_clip": 1.05374098, "balance_loss_mlp": 1.02259946, "epoch": 0.2808873925329165, "flos": 21506577822720.0, "grad_norm": 3.8270820148355824, "language_loss": 0.73344779, "learning_rate": 3.375418418613981e-06, "loss": 0.75528866, "num_input_tokens_seen": 50116425, "step": 2336, "time_per_iteration": 2.6875109672546387 }, { "auxiliary_loss_clip": 0.01169618, "auxiliary_loss_mlp": 0.01030654, "balance_loss_clip": 1.05687642, "balance_loss_mlp": 1.02142739, "epoch": 0.28100763542355556, "flos": 16070815814400.0, "grad_norm": 5.464490818776602, "language_loss": 0.8336463, "learning_rate": 3.374852791593831e-06, "loss": 0.85564899, "num_input_tokens_seen": 50132625, "step": 2337, "time_per_iteration": 2.582435369491577 }, { "auxiliary_loss_clip": 0.01143618, "auxiliary_loss_mlp": 0.01029412, "balance_loss_clip": 1.05090833, "balance_loss_mlp": 1.02004838, "epoch": 0.28112787831419467, "flos": 19062605468160.0, "grad_norm": 2.5569901545291818, "language_loss": 0.54036558, "learning_rate": 3.374286956009605e-06, "loss": 0.56209588, "num_input_tokens_seen": 50151190, "step": 2338, "time_per_iteration": 2.7046449184417725 }, { "auxiliary_loss_clip": 0.01184346, "auxiliary_loss_mlp": 0.01036744, "balance_loss_clip": 1.06035876, "balance_loss_mlp": 1.0272665, "epoch": 0.2812481212048338, "flos": 12823629482880.0, "grad_norm": 2.348090610692671, "language_loss": 0.75046813, "learning_rate": 3.3737209119471405e-06, "loss": 0.77267909, "num_input_tokens_seen": 50167700, "step": 2339, "time_per_iteration": 2.5687975883483887 }, { "auxiliary_loss_clip": 0.01190826, "auxiliary_loss_mlp": 0.01029315, "balance_loss_clip": 1.06007218, "balance_loss_mlp": 1.01930726, "epoch": 0.28136836409547283, "flos": 15633064765440.0, "grad_norm": 3.9251046169675328, "language_loss": 0.63635486, "learning_rate": 3.373154659492306e-06, "loss": 0.65855628, "num_input_tokens_seen": 50185840, "step": 2340, "time_per_iteration": 2.6060891151428223 }, { "auxiliary_loss_clip": 0.01170778, "auxiliary_loss_mlp": 0.01026795, "balance_loss_clip": 1.05702353, "balance_loss_mlp": 1.01775908, "epoch": 0.28148860698611194, "flos": 19933726106880.0, "grad_norm": 1.8942286793743088, "language_loss": 0.85404313, "learning_rate": 3.3725881987310016e-06, "loss": 0.87601888, "num_input_tokens_seen": 50203375, "step": 2341, "time_per_iteration": 2.6023964881896973 }, { "auxiliary_loss_clip": 0.01164807, "auxiliary_loss_mlp": 0.0102696, "balance_loss_clip": 1.05497515, "balance_loss_mlp": 1.01760221, "epoch": 0.28160884987675106, "flos": 17457219008640.0, "grad_norm": 1.836123359344509, "language_loss": 0.87922233, "learning_rate": 3.372021529749159e-06, "loss": 0.90113997, "num_input_tokens_seen": 50222435, "step": 2342, "time_per_iteration": 2.6612963676452637 }, { "auxiliary_loss_clip": 0.01122456, "auxiliary_loss_mlp": 0.01034301, "balance_loss_clip": 1.05378497, "balance_loss_mlp": 1.02431762, "epoch": 0.2817290927673901, "flos": 16834743290880.0, "grad_norm": 2.2923175043105704, "language_loss": 0.9244405, "learning_rate": 3.3714546526327405e-06, "loss": 0.94600809, "num_input_tokens_seen": 50240435, "step": 2343, "time_per_iteration": 2.7367842197418213 }, { "auxiliary_loss_clip": 0.01155971, "auxiliary_loss_mlp": 0.01034324, "balance_loss_clip": 1.05371487, "balance_loss_mlp": 1.0249064, "epoch": 0.2818493356580292, "flos": 15414081500160.0, "grad_norm": 1.983146878697135, "language_loss": 0.87937021, "learning_rate": 3.3708875674677423e-06, "loss": 0.90127313, "num_input_tokens_seen": 50258410, "step": 2344, "time_per_iteration": 2.6754190921783447 }, { "auxiliary_loss_clip": 0.01181029, "auxiliary_loss_mlp": 0.01029808, "balance_loss_clip": 1.06233501, "balance_loss_mlp": 1.01975322, "epoch": 0.28196957854866833, "flos": 20412451595520.0, "grad_norm": 3.3661598955806644, "language_loss": 0.83413947, "learning_rate": 3.37032027434019e-06, "loss": 0.85624778, "num_input_tokens_seen": 50277930, "step": 2345, "time_per_iteration": 2.6894681453704834 }, { "auxiliary_loss_clip": 0.01194208, "auxiliary_loss_mlp": 0.01034183, "balance_loss_clip": 1.05901563, "balance_loss_mlp": 1.02377009, "epoch": 0.2820898214393074, "flos": 19973120348160.0, "grad_norm": 1.773221176906481, "language_loss": 0.83163643, "learning_rate": 3.369752773336141e-06, "loss": 0.85392034, "num_input_tokens_seen": 50297410, "step": 2346, "time_per_iteration": 2.609661340713501 }, { "auxiliary_loss_clip": 0.01172019, "auxiliary_loss_mlp": 0.01039599, "balance_loss_clip": 1.05914354, "balance_loss_mlp": 1.02890015, "epoch": 0.2822100643299465, "flos": 22528308188160.0, "grad_norm": 1.6364407529071452, "language_loss": 0.78380245, "learning_rate": 3.3691850645416864e-06, "loss": 0.80591869, "num_input_tokens_seen": 50317120, "step": 2347, "time_per_iteration": 2.696127414703369 }, { "auxiliary_loss_clip": 0.01189949, "auxiliary_loss_mlp": 0.01029926, "balance_loss_clip": 1.05787086, "balance_loss_mlp": 1.02028811, "epoch": 0.2823303072205856, "flos": 11546682007680.0, "grad_norm": 2.7458129774177342, "language_loss": 0.83359981, "learning_rate": 3.368617148042945e-06, "loss": 0.8557986, "num_input_tokens_seen": 50334790, "step": 2348, "time_per_iteration": 2.571220874786377 }, { "auxiliary_loss_clip": 0.01162948, "auxiliary_loss_mlp": 0.01038803, "balance_loss_clip": 1.05424881, "balance_loss_mlp": 1.02952814, "epoch": 0.28245055011122466, "flos": 18259894281600.0, "grad_norm": 2.4420317363156614, "language_loss": 0.84726501, "learning_rate": 3.368049023926071e-06, "loss": 0.86928248, "num_input_tokens_seen": 50353785, "step": 2349, "time_per_iteration": 2.7433083057403564 }, { "auxiliary_loss_clip": 0.01183824, "auxiliary_loss_mlp": 0.01034038, "balance_loss_clip": 1.05966139, "balance_loss_mlp": 1.02485871, "epoch": 0.2825707930018638, "flos": 24608110504320.0, "grad_norm": 1.6346910266700714, "language_loss": 0.83658808, "learning_rate": 3.3674806922772476e-06, "loss": 0.85876667, "num_input_tokens_seen": 50374670, "step": 2350, "time_per_iteration": 2.660379409790039 }, { "auxiliary_loss_clip": 0.01159798, "auxiliary_loss_mlp": 0.01026629, "balance_loss_clip": 1.05746126, "balance_loss_mlp": 1.01783085, "epoch": 0.28269103589250283, "flos": 25226994862080.0, "grad_norm": 2.145948383325832, "language_loss": 0.74860883, "learning_rate": 3.3669121531826904e-06, "loss": 0.77047306, "num_input_tokens_seen": 50395650, "step": 2351, "time_per_iteration": 2.7640178203582764 }, { "auxiliary_loss_clip": 0.01148451, "auxiliary_loss_mlp": 0.0102802, "balance_loss_clip": 1.05703592, "balance_loss_mlp": 1.01863229, "epoch": 0.28281127878314194, "flos": 19281552819840.0, "grad_norm": 1.9395225563011784, "language_loss": 0.82982612, "learning_rate": 3.366343406728647e-06, "loss": 0.85159081, "num_input_tokens_seen": 50415100, "step": 2352, "time_per_iteration": 2.693366289138794 }, { "auxiliary_loss_clip": 0.01175868, "auxiliary_loss_mlp": 0.01029023, "balance_loss_clip": 1.05429411, "balance_loss_mlp": 1.01983213, "epoch": 0.28293152167378105, "flos": 23878405710720.0, "grad_norm": 4.333788851228812, "language_loss": 0.68632543, "learning_rate": 3.3657744530013946e-06, "loss": 0.70837432, "num_input_tokens_seen": 50434335, "step": 2353, "time_per_iteration": 2.6199426651000977 }, { "auxiliary_loss_clip": 0.01191608, "auxiliary_loss_mlp": 0.01026301, "balance_loss_clip": 1.06109262, "balance_loss_mlp": 1.01693749, "epoch": 0.2830517645644201, "flos": 43866965928960.0, "grad_norm": 1.9608663074414812, "language_loss": 0.7115438, "learning_rate": 3.3652052920872437e-06, "loss": 0.73372293, "num_input_tokens_seen": 50457200, "step": 2354, "time_per_iteration": 3.6950857639312744 }, { "auxiliary_loss_clip": 0.0116846, "auxiliary_loss_mlp": 0.01035513, "balance_loss_clip": 1.05512857, "balance_loss_mlp": 1.02602434, "epoch": 0.2831720074550592, "flos": 26651750803200.0, "grad_norm": 1.9520443138468802, "language_loss": 0.8581205, "learning_rate": 3.3646359240725355e-06, "loss": 0.88016021, "num_input_tokens_seen": 50476390, "step": 2355, "time_per_iteration": 3.6188607215881348 }, { "auxiliary_loss_clip": 0.01178439, "auxiliary_loss_mlp": 0.00712822, "balance_loss_clip": 1.05683744, "balance_loss_mlp": 1.00085974, "epoch": 0.2832922503456983, "flos": 31029979564800.0, "grad_norm": 1.999789951960809, "language_loss": 0.67840487, "learning_rate": 3.364066349043643e-06, "loss": 0.69731748, "num_input_tokens_seen": 50497595, "step": 2356, "time_per_iteration": 3.5687854290008545 }, { "auxiliary_loss_clip": 0.01164356, "auxiliary_loss_mlp": 0.01026614, "balance_loss_clip": 1.05589151, "balance_loss_mlp": 1.01816797, "epoch": 0.2834124932363374, "flos": 20405699838720.0, "grad_norm": 2.444832900185278, "language_loss": 0.82141989, "learning_rate": 3.363496567086969e-06, "loss": 0.84332955, "num_input_tokens_seen": 50514690, "step": 2357, "time_per_iteration": 2.6699323654174805 }, { "auxiliary_loss_clip": 0.01201572, "auxiliary_loss_mlp": 0.01032884, "balance_loss_clip": 1.06084085, "balance_loss_mlp": 1.02354968, "epoch": 0.2835327361269765, "flos": 39384848056320.0, "grad_norm": 3.5310042028813355, "language_loss": 0.75691795, "learning_rate": 3.3629265782889506e-06, "loss": 0.77926254, "num_input_tokens_seen": 50536515, "step": 2358, "time_per_iteration": 2.7658300399780273 }, { "auxiliary_loss_clip": 0.01148476, "auxiliary_loss_mlp": 0.01033195, "balance_loss_clip": 1.05296922, "balance_loss_mlp": 1.0229609, "epoch": 0.2836529790176156, "flos": 30261598801920.0, "grad_norm": 1.873405495833831, "language_loss": 0.72155464, "learning_rate": 3.362356382736054e-06, "loss": 0.74337131, "num_input_tokens_seen": 50557120, "step": 2359, "time_per_iteration": 3.6645333766937256 }, { "auxiliary_loss_clip": 0.01151751, "auxiliary_loss_mlp": 0.01032705, "balance_loss_clip": 1.05128527, "balance_loss_mlp": 1.02402687, "epoch": 0.28377322190825466, "flos": 12677796264960.0, "grad_norm": 2.134198465338697, "language_loss": 0.91052312, "learning_rate": 3.361785980514777e-06, "loss": 0.93236774, "num_input_tokens_seen": 50573320, "step": 2360, "time_per_iteration": 2.663130760192871 }, { "auxiliary_loss_clip": 0.01113991, "auxiliary_loss_mlp": 0.01033877, "balance_loss_clip": 1.05195165, "balance_loss_mlp": 1.02383327, "epoch": 0.28389346479889377, "flos": 18296666830080.0, "grad_norm": 5.069269230519324, "language_loss": 0.76376295, "learning_rate": 3.361215371711649e-06, "loss": 0.7852416, "num_input_tokens_seen": 50592415, "step": 2361, "time_per_iteration": 2.805978298187256 }, { "auxiliary_loss_clip": 0.01146552, "auxiliary_loss_mlp": 0.01029921, "balance_loss_clip": 1.0541544, "balance_loss_mlp": 1.02068806, "epoch": 0.2840137076895329, "flos": 20406992728320.0, "grad_norm": 1.764866740542567, "language_loss": 0.83247709, "learning_rate": 3.3606445564132326e-06, "loss": 0.85424185, "num_input_tokens_seen": 50609710, "step": 2362, "time_per_iteration": 2.6831281185150146 }, { "auxiliary_loss_clip": 0.01203866, "auxiliary_loss_mlp": 0.00713101, "balance_loss_clip": 1.06232643, "balance_loss_mlp": 1.00084686, "epoch": 0.28413395058017193, "flos": 20048030161920.0, "grad_norm": 2.3265215870026386, "language_loss": 0.82340461, "learning_rate": 3.360073534706118e-06, "loss": 0.8425743, "num_input_tokens_seen": 50626865, "step": 2363, "time_per_iteration": 2.5410895347595215 }, { "auxiliary_loss_clip": 0.01169622, "auxiliary_loss_mlp": 0.01027126, "balance_loss_clip": 1.05577183, "balance_loss_mlp": 1.01726115, "epoch": 0.28425419347081105, "flos": 37663613256960.0, "grad_norm": 2.527579603665139, "language_loss": 0.75830543, "learning_rate": 3.35950230667693e-06, "loss": 0.7802729, "num_input_tokens_seen": 50648560, "step": 2364, "time_per_iteration": 2.7983813285827637 }, { "auxiliary_loss_clip": 0.01187754, "auxiliary_loss_mlp": 0.01026976, "balance_loss_clip": 1.05829036, "balance_loss_mlp": 1.01770782, "epoch": 0.28437443636145016, "flos": 13845072539520.0, "grad_norm": 2.123728178775122, "language_loss": 0.85758233, "learning_rate": 3.358930872412323e-06, "loss": 0.87972963, "num_input_tokens_seen": 50665725, "step": 2365, "time_per_iteration": 2.576488733291626 }, { "auxiliary_loss_clip": 0.01183775, "auxiliary_loss_mlp": 0.01035308, "balance_loss_clip": 1.05844474, "balance_loss_mlp": 1.02605188, "epoch": 0.2844946792520892, "flos": 22747794243840.0, "grad_norm": 1.7262040794875648, "language_loss": 0.80735075, "learning_rate": 3.3583592319989825e-06, "loss": 0.82954156, "num_input_tokens_seen": 50685095, "step": 2366, "time_per_iteration": 2.672694683074951 }, { "auxiliary_loss_clip": 0.01193778, "auxiliary_loss_mlp": 0.01032045, "balance_loss_clip": 1.06045115, "balance_loss_mlp": 1.02228141, "epoch": 0.2846149221427283, "flos": 32415987709440.0, "grad_norm": 1.9294071763117506, "language_loss": 0.68806022, "learning_rate": 3.357787385523627e-06, "loss": 0.71031845, "num_input_tokens_seen": 50706500, "step": 2367, "time_per_iteration": 2.7219772338867188 }, { "auxiliary_loss_clip": 0.01127461, "auxiliary_loss_mlp": 0.01031581, "balance_loss_clip": 1.04932356, "balance_loss_mlp": 1.02235472, "epoch": 0.2847351650333674, "flos": 28475976873600.0, "grad_norm": 2.546790833495769, "language_loss": 0.82373643, "learning_rate": 3.3572153330730048e-06, "loss": 0.8453269, "num_input_tokens_seen": 50727595, "step": 2368, "time_per_iteration": 2.8181028366088867 }, { "auxiliary_loss_clip": 0.01087498, "auxiliary_loss_mlp": 0.01009128, "balance_loss_clip": 1.03885794, "balance_loss_mlp": 1.00581419, "epoch": 0.2848554079240065, "flos": 55753399704960.0, "grad_norm": 0.8550325000775466, "language_loss": 0.64691722, "learning_rate": 3.3566430747338956e-06, "loss": 0.66788352, "num_input_tokens_seen": 50782800, "step": 2369, "time_per_iteration": 3.1149582862854004 }, { "auxiliary_loss_clip": 0.01188258, "auxiliary_loss_mlp": 0.01030381, "balance_loss_clip": 1.05730212, "balance_loss_mlp": 1.02091622, "epoch": 0.2849756508146456, "flos": 11836875985920.0, "grad_norm": 2.1710628632841993, "language_loss": 0.86773419, "learning_rate": 3.35607061059311e-06, "loss": 0.88992059, "num_input_tokens_seen": 50797730, "step": 2370, "time_per_iteration": 2.6190285682678223 }, { "auxiliary_loss_clip": 0.01197684, "auxiliary_loss_mlp": 0.01029249, "balance_loss_clip": 1.05933797, "balance_loss_mlp": 1.01910985, "epoch": 0.28509589370528465, "flos": 25155209531520.0, "grad_norm": 2.0211012900909333, "language_loss": 0.75013936, "learning_rate": 3.3554979407374917e-06, "loss": 0.7724086, "num_input_tokens_seen": 50819840, "step": 2371, "time_per_iteration": 2.6279044151306152 }, { "auxiliary_loss_clip": 0.01186603, "auxiliary_loss_mlp": 0.01034539, "balance_loss_clip": 1.05737591, "balance_loss_mlp": 1.02581334, "epoch": 0.28521613659592376, "flos": 19974808287360.0, "grad_norm": 1.655008088038864, "language_loss": 0.73723149, "learning_rate": 3.3549250652539134e-06, "loss": 0.75944293, "num_input_tokens_seen": 50838935, "step": 2372, "time_per_iteration": 2.733241319656372 }, { "auxiliary_loss_clip": 0.01168118, "auxiliary_loss_mlp": 0.01031525, "balance_loss_clip": 1.05356383, "balance_loss_mlp": 1.02180374, "epoch": 0.2853363794865629, "flos": 23367971491200.0, "grad_norm": 3.577874754406678, "language_loss": 0.81524825, "learning_rate": 3.3543519842292794e-06, "loss": 0.83724463, "num_input_tokens_seen": 50858590, "step": 2373, "time_per_iteration": 2.793409585952759 }, { "auxiliary_loss_clip": 0.01200571, "auxiliary_loss_mlp": 0.00713171, "balance_loss_clip": 1.05995965, "balance_loss_mlp": 1.00083005, "epoch": 0.28545662237720193, "flos": 19861940776320.0, "grad_norm": 1.8826690977311753, "language_loss": 0.8374089, "learning_rate": 3.353778697750527e-06, "loss": 0.85654634, "num_input_tokens_seen": 50876995, "step": 2374, "time_per_iteration": 2.6871466636657715 }, { "auxiliary_loss_clip": 0.01159206, "auxiliary_loss_mlp": 0.01027849, "balance_loss_clip": 1.05359006, "balance_loss_mlp": 1.01865172, "epoch": 0.28557686526784104, "flos": 23879016241920.0, "grad_norm": 1.7317580615198527, "language_loss": 0.89340287, "learning_rate": 3.353205205904622e-06, "loss": 0.91527343, "num_input_tokens_seen": 50896105, "step": 2375, "time_per_iteration": 2.7942004203796387 }, { "auxiliary_loss_clip": 0.01166965, "auxiliary_loss_mlp": 0.01030387, "balance_loss_clip": 1.05720544, "balance_loss_mlp": 1.02055788, "epoch": 0.28569710815848015, "flos": 44890384233600.0, "grad_norm": 1.9245674641348602, "language_loss": 0.71704316, "learning_rate": 3.3526315087785637e-06, "loss": 0.73901665, "num_input_tokens_seen": 50917220, "step": 2376, "time_per_iteration": 2.9094295501708984 }, { "auxiliary_loss_clip": 0.01119241, "auxiliary_loss_mlp": 0.01030067, "balance_loss_clip": 1.05162621, "balance_loss_mlp": 1.02115011, "epoch": 0.2858173510491192, "flos": 26829759628800.0, "grad_norm": 1.811626135272478, "language_loss": 0.81032294, "learning_rate": 3.3520576064593805e-06, "loss": 0.83181602, "num_input_tokens_seen": 50937175, "step": 2377, "time_per_iteration": 2.826223373413086 }, { "auxiliary_loss_clip": 0.01193118, "auxiliary_loss_mlp": 0.01031801, "balance_loss_clip": 1.06037927, "balance_loss_mlp": 1.02243662, "epoch": 0.2859375939397583, "flos": 23148916398720.0, "grad_norm": 1.641934341642305, "language_loss": 0.81773257, "learning_rate": 3.3514834990341337e-06, "loss": 0.83998179, "num_input_tokens_seen": 50957500, "step": 2378, "time_per_iteration": 2.7618660926818848 }, { "auxiliary_loss_clip": 0.01176644, "auxiliary_loss_mlp": 0.01026545, "balance_loss_clip": 1.05753136, "balance_loss_mlp": 1.01786065, "epoch": 0.2860578368303974, "flos": 12129799397760.0, "grad_norm": 5.835806276661247, "language_loss": 0.9353857, "learning_rate": 3.3509091865899144e-06, "loss": 0.95741755, "num_input_tokens_seen": 50972690, "step": 2379, "time_per_iteration": 2.6642906665802 }, { "auxiliary_loss_clip": 0.01202255, "auxiliary_loss_mlp": 0.01029408, "balance_loss_clip": 1.05878901, "balance_loss_mlp": 1.02038383, "epoch": 0.2861780797210365, "flos": 19938035738880.0, "grad_norm": 2.2463988668798907, "language_loss": 0.7046349, "learning_rate": 3.350334669213846e-06, "loss": 0.72695148, "num_input_tokens_seen": 50990095, "step": 2380, "time_per_iteration": 4.38777232170105 }, { "auxiliary_loss_clip": 0.01187507, "auxiliary_loss_mlp": 0.01036725, "balance_loss_clip": 1.06167293, "balance_loss_mlp": 1.02789783, "epoch": 0.2862983226116756, "flos": 27563127609600.0, "grad_norm": 1.968933199994368, "language_loss": 0.75520539, "learning_rate": 3.3497599469930816e-06, "loss": 0.7774477, "num_input_tokens_seen": 51008305, "step": 2381, "time_per_iteration": 3.6164023876190186 }, { "auxiliary_loss_clip": 0.01205441, "auxiliary_loss_mlp": 0.01033629, "balance_loss_clip": 1.06055808, "balance_loss_mlp": 1.02400851, "epoch": 0.28641856550231465, "flos": 22053964158720.0, "grad_norm": 2.740413439210789, "language_loss": 0.83328408, "learning_rate": 3.349185020014807e-06, "loss": 0.85567474, "num_input_tokens_seen": 51025570, "step": 2382, "time_per_iteration": 2.6137144565582275 }, { "auxiliary_loss_clip": 0.01184237, "auxiliary_loss_mlp": 0.01029977, "balance_loss_clip": 1.05495536, "balance_loss_mlp": 1.02061331, "epoch": 0.28653880839295376, "flos": 22378775869440.0, "grad_norm": 1.9762925722554938, "language_loss": 0.74399924, "learning_rate": 3.348609888366237e-06, "loss": 0.76614136, "num_input_tokens_seen": 51044585, "step": 2383, "time_per_iteration": 2.6572914123535156 }, { "auxiliary_loss_clip": 0.01113653, "auxiliary_loss_mlp": 0.01028439, "balance_loss_clip": 1.04772961, "balance_loss_mlp": 1.01926017, "epoch": 0.28665905128359287, "flos": 23367971491200.0, "grad_norm": 2.7484919014258478, "language_loss": 0.62904143, "learning_rate": 3.348034552134619e-06, "loss": 0.65046239, "num_input_tokens_seen": 51063990, "step": 2384, "time_per_iteration": 2.8186354637145996 }, { "auxiliary_loss_clip": 0.01129122, "auxiliary_loss_mlp": 0.01026677, "balance_loss_clip": 1.05311108, "balance_loss_mlp": 1.01819563, "epoch": 0.2867792941742319, "flos": 20881695893760.0, "grad_norm": 2.1946002944862757, "language_loss": 0.84429032, "learning_rate": 3.3474590114072316e-06, "loss": 0.8658483, "num_input_tokens_seen": 51081990, "step": 2385, "time_per_iteration": 3.9320380687713623 }, { "auxiliary_loss_clip": 0.01148561, "auxiliary_loss_mlp": 0.01028583, "balance_loss_clip": 1.05440593, "balance_loss_mlp": 1.01885557, "epoch": 0.28689953706487104, "flos": 20664005518080.0, "grad_norm": 1.7691204203423103, "language_loss": 0.82728988, "learning_rate": 3.3468832662713836e-06, "loss": 0.84906137, "num_input_tokens_seen": 51100235, "step": 2386, "time_per_iteration": 2.6706392765045166 }, { "auxiliary_loss_clip": 0.01149261, "auxiliary_loss_mlp": 0.01027671, "balance_loss_clip": 1.05442524, "balance_loss_mlp": 1.01852775, "epoch": 0.28701977995551015, "flos": 12675533708160.0, "grad_norm": 3.190047114995235, "language_loss": 0.83464491, "learning_rate": 3.346307316814415e-06, "loss": 0.8564142, "num_input_tokens_seen": 51115405, "step": 2387, "time_per_iteration": 2.6972410678863525 }, { "auxiliary_loss_clip": 0.01188092, "auxiliary_loss_mlp": 0.01027783, "balance_loss_clip": 1.06136, "balance_loss_mlp": 1.01808512, "epoch": 0.2871400228461492, "flos": 21252366293760.0, "grad_norm": 3.4572787948598673, "language_loss": 0.75781846, "learning_rate": 3.3457311631236965e-06, "loss": 0.7799772, "num_input_tokens_seen": 51136390, "step": 2388, "time_per_iteration": 2.647129774093628 }, { "auxiliary_loss_clip": 0.01160731, "auxiliary_loss_mlp": 0.01027583, "balance_loss_clip": 1.05598378, "balance_loss_mlp": 1.01844549, "epoch": 0.2872602657367883, "flos": 25119262995840.0, "grad_norm": 1.7164529330747733, "language_loss": 0.84529138, "learning_rate": 3.345154805286631e-06, "loss": 0.86717451, "num_input_tokens_seen": 51156650, "step": 2389, "time_per_iteration": 2.754201889038086 }, { "auxiliary_loss_clip": 0.01178791, "auxiliary_loss_mlp": 0.01030263, "balance_loss_clip": 1.05677032, "balance_loss_mlp": 1.02064252, "epoch": 0.2873805086274274, "flos": 16646606830080.0, "grad_norm": 3.808639682756494, "language_loss": 0.761617, "learning_rate": 3.344578243390651e-06, "loss": 0.7837075, "num_input_tokens_seen": 51172210, "step": 2390, "time_per_iteration": 2.6176066398620605 }, { "auxiliary_loss_clip": 0.01170312, "auxiliary_loss_mlp": 0.0102639, "balance_loss_clip": 1.05859971, "balance_loss_mlp": 1.01756299, "epoch": 0.2875007515180665, "flos": 17420123237760.0, "grad_norm": 3.7793111485654047, "language_loss": 0.78314704, "learning_rate": 3.3440014775232206e-06, "loss": 0.80511403, "num_input_tokens_seen": 51190265, "step": 2391, "time_per_iteration": 2.611001491546631 }, { "auxiliary_loss_clip": 0.01156905, "auxiliary_loss_mlp": 0.0102681, "balance_loss_clip": 1.05552506, "balance_loss_mlp": 1.01769042, "epoch": 0.2876209944087056, "flos": 23434190213760.0, "grad_norm": 2.4883869189621595, "language_loss": 0.71479452, "learning_rate": 3.343424507771834e-06, "loss": 0.73663163, "num_input_tokens_seen": 51208475, "step": 2392, "time_per_iteration": 2.723940372467041 }, { "auxiliary_loss_clip": 0.01155399, "auxiliary_loss_mlp": 0.01034289, "balance_loss_clip": 1.05662441, "balance_loss_mlp": 1.02575982, "epoch": 0.2877412372993447, "flos": 13735509079680.0, "grad_norm": 1.751002419995342, "language_loss": 0.8646794, "learning_rate": 3.342847334224018e-06, "loss": 0.88657624, "num_input_tokens_seen": 51225875, "step": 2393, "time_per_iteration": 2.647325038909912 }, { "auxiliary_loss_clip": 0.01117697, "auxiliary_loss_mlp": 0.01003816, "balance_loss_clip": 1.04357958, "balance_loss_mlp": 1.00105083, "epoch": 0.28786148018998375, "flos": 58079695104000.0, "grad_norm": 0.9571772465811088, "language_loss": 0.62443691, "learning_rate": 3.342269956967329e-06, "loss": 0.645652, "num_input_tokens_seen": 51287780, "step": 2394, "time_per_iteration": 3.3271524906158447 }, { "auxiliary_loss_clip": 0.01190216, "auxiliary_loss_mlp": 0.01034677, "balance_loss_clip": 1.06058192, "balance_loss_mlp": 1.02438927, "epoch": 0.28798172308062286, "flos": 23435052140160.0, "grad_norm": 2.4225438681089977, "language_loss": 0.72209448, "learning_rate": 3.341692376089355e-06, "loss": 0.7443434, "num_input_tokens_seen": 51303335, "step": 2395, "time_per_iteration": 2.6880855560302734 }, { "auxiliary_loss_clip": 0.01183699, "auxiliary_loss_mlp": 0.01031101, "balance_loss_clip": 1.05977368, "balance_loss_mlp": 1.02166533, "epoch": 0.288101965971262, "flos": 25110033200640.0, "grad_norm": 4.031182261737202, "language_loss": 0.84144408, "learning_rate": 3.3411145916777146e-06, "loss": 0.86359209, "num_input_tokens_seen": 51317495, "step": 2396, "time_per_iteration": 2.6328842639923096 }, { "auxiliary_loss_clip": 0.01164456, "auxiliary_loss_mlp": 0.0103215, "balance_loss_clip": 1.05659294, "balance_loss_mlp": 1.02273881, "epoch": 0.28822220886190103, "flos": 16252559654400.0, "grad_norm": 2.340180006103668, "language_loss": 0.90999162, "learning_rate": 3.3405366038200566e-06, "loss": 0.93195766, "num_input_tokens_seen": 51336430, "step": 2397, "time_per_iteration": 2.6383910179138184 }, { "auxiliary_loss_clip": 0.01177311, "auxiliary_loss_mlp": 0.01034313, "balance_loss_clip": 1.0623455, "balance_loss_mlp": 1.02480006, "epoch": 0.28834245175254014, "flos": 24535642815360.0, "grad_norm": 3.836041039204882, "language_loss": 0.84925735, "learning_rate": 3.3399584126040617e-06, "loss": 0.87137353, "num_input_tokens_seen": 51355930, "step": 2398, "time_per_iteration": 2.6899585723876953 }, { "auxiliary_loss_clip": 0.01203186, "auxiliary_loss_mlp": 0.0071272, "balance_loss_clip": 1.06208587, "balance_loss_mlp": 1.00071168, "epoch": 0.2884626946431792, "flos": 24571445696640.0, "grad_norm": 2.0396962661531406, "language_loss": 0.90635622, "learning_rate": 3.339380018117441e-06, "loss": 0.92551529, "num_input_tokens_seen": 51376765, "step": 2399, "time_per_iteration": 2.6106069087982178 }, { "auxiliary_loss_clip": 0.01185841, "auxiliary_loss_mlp": 0.01024497, "balance_loss_clip": 1.06272531, "balance_loss_mlp": 1.0152638, "epoch": 0.2885829375338183, "flos": 16544657053440.0, "grad_norm": 2.736924015387379, "language_loss": 0.78719813, "learning_rate": 3.3388014204479366e-06, "loss": 0.80930156, "num_input_tokens_seen": 51394570, "step": 2400, "time_per_iteration": 2.6572983264923096 }, { "auxiliary_loss_clip": 0.01207034, "auxiliary_loss_mlp": 0.01030849, "balance_loss_clip": 1.06335902, "balance_loss_mlp": 1.02113962, "epoch": 0.2887031804244574, "flos": 24061226958720.0, "grad_norm": 2.6943040888031606, "language_loss": 0.91438121, "learning_rate": 3.338222619683321e-06, "loss": 0.93676007, "num_input_tokens_seen": 51414535, "step": 2401, "time_per_iteration": 2.6416821479797363 }, { "auxiliary_loss_clip": 0.01175359, "auxiliary_loss_mlp": 0.01035633, "balance_loss_clip": 1.06166029, "balance_loss_mlp": 1.02640045, "epoch": 0.2888234233150965, "flos": 23330696152320.0, "grad_norm": 3.0358125547482944, "language_loss": 0.73910612, "learning_rate": 3.337643615911398e-06, "loss": 0.76121604, "num_input_tokens_seen": 51434160, "step": 2402, "time_per_iteration": 2.6643624305725098 }, { "auxiliary_loss_clip": 0.01192347, "auxiliary_loss_mlp": 0.01033499, "balance_loss_clip": 1.06135261, "balance_loss_mlp": 1.02343774, "epoch": 0.2889436662057356, "flos": 22272767856000.0, "grad_norm": 1.893536488554304, "language_loss": 0.78558862, "learning_rate": 3.3370644092200026e-06, "loss": 0.80784708, "num_input_tokens_seen": 51451435, "step": 2403, "time_per_iteration": 2.6088247299194336 }, { "auxiliary_loss_clip": 0.01140399, "auxiliary_loss_mlp": 0.01032412, "balance_loss_clip": 1.05115521, "balance_loss_mlp": 1.02329898, "epoch": 0.2890639090963747, "flos": 21616931381760.0, "grad_norm": 2.060288220111248, "language_loss": 0.78611195, "learning_rate": 3.3364849996969985e-06, "loss": 0.80784005, "num_input_tokens_seen": 51471455, "step": 2404, "time_per_iteration": 2.8328187465667725 }, { "auxiliary_loss_clip": 0.01188913, "auxiliary_loss_mlp": 0.01035501, "balance_loss_clip": 1.06222761, "balance_loss_mlp": 1.02638769, "epoch": 0.28918415198701375, "flos": 28585540333440.0, "grad_norm": 2.426202192495305, "language_loss": 0.85629714, "learning_rate": 3.335905387430283e-06, "loss": 0.87854129, "num_input_tokens_seen": 51492890, "step": 2405, "time_per_iteration": 2.7283990383148193 }, { "auxiliary_loss_clip": 0.01174458, "auxiliary_loss_mlp": 0.01033879, "balance_loss_clip": 1.05624413, "balance_loss_mlp": 1.02481914, "epoch": 0.28930439487765286, "flos": 21944688007680.0, "grad_norm": 3.1639208199799373, "language_loss": 0.82830989, "learning_rate": 3.335325572507782e-06, "loss": 0.8503933, "num_input_tokens_seen": 51513390, "step": 2406, "time_per_iteration": 4.568315505981445 }, { "auxiliary_loss_clip": 0.01206417, "auxiliary_loss_mlp": 0.00712955, "balance_loss_clip": 1.06522655, "balance_loss_mlp": 1.00086427, "epoch": 0.28942463776829197, "flos": 19281911955840.0, "grad_norm": 1.7866062408536927, "language_loss": 0.73552167, "learning_rate": 3.3347455550174537e-06, "loss": 0.75471532, "num_input_tokens_seen": 51532730, "step": 2407, "time_per_iteration": 3.519747018814087 }, { "auxiliary_loss_clip": 0.01149385, "auxiliary_loss_mlp": 0.01023479, "balance_loss_clip": 1.05436337, "balance_loss_mlp": 1.01407301, "epoch": 0.289544880658931, "flos": 14645700737280.0, "grad_norm": 2.1121691710251462, "language_loss": 0.68565226, "learning_rate": 3.3341653350472864e-06, "loss": 0.70738089, "num_input_tokens_seen": 51549560, "step": 2408, "time_per_iteration": 2.6658637523651123 }, { "auxiliary_loss_clip": 0.01211377, "auxiliary_loss_mlp": 0.01032232, "balance_loss_clip": 1.06336391, "balance_loss_mlp": 1.02183723, "epoch": 0.28966512354957014, "flos": 28621881918720.0, "grad_norm": 2.331566418894808, "language_loss": 0.69573748, "learning_rate": 3.333584912685298e-06, "loss": 0.71817356, "num_input_tokens_seen": 51568180, "step": 2409, "time_per_iteration": 2.6475071907043457 }, { "auxiliary_loss_clip": 0.01077198, "auxiliary_loss_mlp": 0.01004977, "balance_loss_clip": 1.03608871, "balance_loss_mlp": 1.00205612, "epoch": 0.28978536644020925, "flos": 64711784511360.0, "grad_norm": 0.8709881663752131, "language_loss": 0.55560571, "learning_rate": 3.3330042880195385e-06, "loss": 0.57642746, "num_input_tokens_seen": 51622530, "step": 2410, "time_per_iteration": 4.173578977584839 }, { "auxiliary_loss_clip": 0.01167352, "auxiliary_loss_mlp": 0.01025259, "balance_loss_clip": 1.05608904, "balance_loss_mlp": 1.01638365, "epoch": 0.2899056093308483, "flos": 18624638937600.0, "grad_norm": 2.252839639943793, "language_loss": 0.78236699, "learning_rate": 3.3324234611380888e-06, "loss": 0.8042931, "num_input_tokens_seen": 51641260, "step": 2411, "time_per_iteration": 2.685213565826416 }, { "auxiliary_loss_clip": 0.01147019, "auxiliary_loss_mlp": 0.01028033, "balance_loss_clip": 1.05546451, "balance_loss_mlp": 1.01914573, "epoch": 0.2900258522214874, "flos": 22893735202560.0, "grad_norm": 5.8218014388597314, "language_loss": 0.81587553, "learning_rate": 3.3318424321290596e-06, "loss": 0.83762604, "num_input_tokens_seen": 51660975, "step": 2412, "time_per_iteration": 2.6940009593963623 }, { "auxiliary_loss_clip": 0.01076939, "auxiliary_loss_mlp": 0.01005612, "balance_loss_clip": 1.03379762, "balance_loss_mlp": 1.00272703, "epoch": 0.2901460951121265, "flos": 71106036013440.0, "grad_norm": 0.9809981310027176, "language_loss": 0.59881651, "learning_rate": 3.3312612010805917e-06, "loss": 0.61964202, "num_input_tokens_seen": 51720550, "step": 2413, "time_per_iteration": 3.3286657333374023 }, { "auxiliary_loss_clip": 0.01158299, "auxiliary_loss_mlp": 0.01027409, "balance_loss_clip": 1.05412483, "balance_loss_mlp": 1.0178777, "epoch": 0.2902663380027656, "flos": 32160986081280.0, "grad_norm": 1.7618559983933249, "language_loss": 0.69926894, "learning_rate": 3.330679768080858e-06, "loss": 0.72112602, "num_input_tokens_seen": 51744435, "step": 2414, "time_per_iteration": 2.7797188758850098 }, { "auxiliary_loss_clip": 0.01187011, "auxiliary_loss_mlp": 0.01034198, "balance_loss_clip": 1.06163073, "balance_loss_mlp": 1.02531683, "epoch": 0.2903865808934047, "flos": 29351658539520.0, "grad_norm": 2.087960253158457, "language_loss": 0.83574665, "learning_rate": 3.3300981332180627e-06, "loss": 0.85795879, "num_input_tokens_seen": 51763640, "step": 2415, "time_per_iteration": 2.7124667167663574 }, { "auxiliary_loss_clip": 0.01156996, "auxiliary_loss_mlp": 0.01032036, "balance_loss_clip": 1.05509043, "balance_loss_mlp": 1.02263069, "epoch": 0.29050682378404374, "flos": 17089026647040.0, "grad_norm": 1.9319057643765876, "language_loss": 0.80090451, "learning_rate": 3.3295162965804373e-06, "loss": 0.8227948, "num_input_tokens_seen": 51782135, "step": 2416, "time_per_iteration": 2.7807343006134033 }, { "auxiliary_loss_clip": 0.01152345, "auxiliary_loss_mlp": 0.01029723, "balance_loss_clip": 1.0578928, "balance_loss_mlp": 1.02109265, "epoch": 0.29062706667468285, "flos": 17858233422720.0, "grad_norm": 2.1487633632173373, "language_loss": 0.78565884, "learning_rate": 3.328934258256247e-06, "loss": 0.8074795, "num_input_tokens_seen": 51800200, "step": 2417, "time_per_iteration": 2.6494996547698975 }, { "auxiliary_loss_clip": 0.01183307, "auxiliary_loss_mlp": 0.01028551, "balance_loss_clip": 1.05611444, "balance_loss_mlp": 1.01911569, "epoch": 0.29074730956532197, "flos": 24279815174400.0, "grad_norm": 3.0607954308992995, "language_loss": 0.67262137, "learning_rate": 3.3283520183337856e-06, "loss": 0.69474, "num_input_tokens_seen": 51819905, "step": 2418, "time_per_iteration": 2.65256929397583 }, { "auxiliary_loss_clip": 0.01166351, "auxiliary_loss_mlp": 0.01031938, "balance_loss_clip": 1.05567861, "balance_loss_mlp": 1.02263379, "epoch": 0.290867552455961, "flos": 22340961826560.0, "grad_norm": 1.7537049380172813, "language_loss": 0.68985128, "learning_rate": 3.3277695769013797e-06, "loss": 0.71183419, "num_input_tokens_seen": 51839350, "step": 2419, "time_per_iteration": 2.644068956375122 }, { "auxiliary_loss_clip": 0.01185963, "auxiliary_loss_mlp": 0.01029283, "balance_loss_clip": 1.05897677, "balance_loss_mlp": 1.02009225, "epoch": 0.29098779534660013, "flos": 23186155824000.0, "grad_norm": 2.327820577016537, "language_loss": 0.77340734, "learning_rate": 3.327186934047385e-06, "loss": 0.79555976, "num_input_tokens_seen": 51858045, "step": 2420, "time_per_iteration": 2.6274681091308594 }, { "auxiliary_loss_clip": 0.01157934, "auxiliary_loss_mlp": 0.01031079, "balance_loss_clip": 1.05193269, "balance_loss_mlp": 1.02231705, "epoch": 0.29110803823723924, "flos": 15304194817920.0, "grad_norm": 2.2252783810448307, "language_loss": 0.66111517, "learning_rate": 3.3266040898601877e-06, "loss": 0.68300527, "num_input_tokens_seen": 51875880, "step": 2421, "time_per_iteration": 2.6665782928466797 }, { "auxiliary_loss_clip": 0.01127945, "auxiliary_loss_mlp": 0.01034095, "balance_loss_clip": 1.05036545, "balance_loss_mlp": 1.02498746, "epoch": 0.2912282811278783, "flos": 22595352923520.0, "grad_norm": 1.8856462555375748, "language_loss": 0.78027117, "learning_rate": 3.3260210444282045e-06, "loss": 0.80189157, "num_input_tokens_seen": 51893835, "step": 2422, "time_per_iteration": 2.7095165252685547 }, { "auxiliary_loss_clip": 0.01181444, "auxiliary_loss_mlp": 0.01025531, "balance_loss_clip": 1.05939794, "balance_loss_mlp": 1.01631594, "epoch": 0.2913485240185174, "flos": 24497900599680.0, "grad_norm": 2.0313896833247274, "language_loss": 0.73368943, "learning_rate": 3.325437797839883e-06, "loss": 0.75575924, "num_input_tokens_seen": 51912205, "step": 2423, "time_per_iteration": 2.6515605449676514 }, { "auxiliary_loss_clip": 0.01201897, "auxiliary_loss_mlp": 0.01030594, "balance_loss_clip": 1.06035149, "balance_loss_mlp": 1.02136731, "epoch": 0.2914687669091565, "flos": 17931024334080.0, "grad_norm": 3.049436348797285, "language_loss": 0.75288737, "learning_rate": 3.3248543501837015e-06, "loss": 0.77521229, "num_input_tokens_seen": 51929410, "step": 2424, "time_per_iteration": 2.5874710083007812 }, { "auxiliary_loss_clip": 0.01144436, "auxiliary_loss_mlp": 0.01029778, "balance_loss_clip": 1.05719948, "balance_loss_mlp": 1.02090919, "epoch": 0.2915890097997956, "flos": 22529313768960.0, "grad_norm": 1.861105304908983, "language_loss": 0.77485383, "learning_rate": 3.3242707015481684e-06, "loss": 0.79659593, "num_input_tokens_seen": 51949345, "step": 2425, "time_per_iteration": 2.7261104583740234 }, { "auxiliary_loss_clip": 0.01166015, "auxiliary_loss_mlp": 0.01030302, "balance_loss_clip": 1.0538516, "balance_loss_mlp": 1.0211823, "epoch": 0.2917092526904347, "flos": 13845216193920.0, "grad_norm": 2.0103697099258206, "language_loss": 0.8092047, "learning_rate": 3.323686852021823e-06, "loss": 0.83116782, "num_input_tokens_seen": 51966855, "step": 2426, "time_per_iteration": 2.6887240409851074 }, { "auxiliary_loss_clip": 0.01153406, "auxiliary_loss_mlp": 0.01029895, "balance_loss_clip": 1.05104351, "balance_loss_mlp": 1.02120399, "epoch": 0.2918294955810738, "flos": 22674859678080.0, "grad_norm": 2.335970385429009, "language_loss": 0.79822189, "learning_rate": 3.323102801693235e-06, "loss": 0.82005489, "num_input_tokens_seen": 51985620, "step": 2427, "time_per_iteration": 2.6978657245635986 }, { "auxiliary_loss_clip": 0.01177545, "auxiliary_loss_mlp": 0.01027411, "balance_loss_clip": 1.05627096, "balance_loss_mlp": 1.01823235, "epoch": 0.29194973847171285, "flos": 23438284364160.0, "grad_norm": 2.082737959457188, "language_loss": 0.8026396, "learning_rate": 3.322518550651003e-06, "loss": 0.82468915, "num_input_tokens_seen": 52004930, "step": 2428, "time_per_iteration": 2.720926523208618 }, { "auxiliary_loss_clip": 0.011733, "auxiliary_loss_mlp": 0.01032472, "balance_loss_clip": 1.05517995, "balance_loss_mlp": 1.02331638, "epoch": 0.29206998136235196, "flos": 21909064694400.0, "grad_norm": 1.7245880469216615, "language_loss": 0.81113935, "learning_rate": 3.3219340989837586e-06, "loss": 0.833197, "num_input_tokens_seen": 52024920, "step": 2429, "time_per_iteration": 2.708495616912842 }, { "auxiliary_loss_clip": 0.01169085, "auxiliary_loss_mlp": 0.01024235, "balance_loss_clip": 1.05677843, "balance_loss_mlp": 1.01577687, "epoch": 0.292190224252991, "flos": 23215925220480.0, "grad_norm": 1.862006834489211, "language_loss": 0.80303609, "learning_rate": 3.3213494467801625e-06, "loss": 0.82496929, "num_input_tokens_seen": 52044095, "step": 2430, "time_per_iteration": 2.697244644165039 }, { "auxiliary_loss_clip": 0.0109931, "auxiliary_loss_mlp": 0.01030216, "balance_loss_clip": 1.04609656, "balance_loss_mlp": 1.02109027, "epoch": 0.2923104671436301, "flos": 20740818752640.0, "grad_norm": 2.5671904420113885, "language_loss": 0.71559393, "learning_rate": 3.3207645941289063e-06, "loss": 0.73688924, "num_input_tokens_seen": 52062440, "step": 2431, "time_per_iteration": 3.7729883193969727 }, { "auxiliary_loss_clip": 0.01183474, "auxiliary_loss_mlp": 0.00712158, "balance_loss_clip": 1.05985928, "balance_loss_mlp": 1.00067806, "epoch": 0.29243071003426924, "flos": 35809114999680.0, "grad_norm": 1.908052887265438, "language_loss": 0.80075467, "learning_rate": 3.320179541118711e-06, "loss": 0.81971097, "num_input_tokens_seen": 52084940, "step": 2432, "time_per_iteration": 3.839810848236084 }, { "auxiliary_loss_clip": 0.01106948, "auxiliary_loss_mlp": 0.01006825, "balance_loss_clip": 1.036865, "balance_loss_mlp": 1.0038451, "epoch": 0.2925509529249083, "flos": 58081598524800.0, "grad_norm": 0.9987846068798529, "language_loss": 0.60266566, "learning_rate": 3.3195942878383293e-06, "loss": 0.62380338, "num_input_tokens_seen": 52141040, "step": 2433, "time_per_iteration": 4.047684192657471 }, { "auxiliary_loss_clip": 0.0118437, "auxiliary_loss_mlp": 0.01031069, "balance_loss_clip": 1.05850625, "balance_loss_mlp": 1.02173495, "epoch": 0.2926711958155474, "flos": 21397122103680.0, "grad_norm": 1.953200761013257, "language_loss": 0.77654529, "learning_rate": 3.319008834376543e-06, "loss": 0.79869974, "num_input_tokens_seen": 52160730, "step": 2434, "time_per_iteration": 2.6528215408325195 }, { "auxiliary_loss_clip": 0.01155011, "auxiliary_loss_mlp": 0.01025108, "balance_loss_clip": 1.05011368, "balance_loss_mlp": 1.01583898, "epoch": 0.2927914387061865, "flos": 23185796688000.0, "grad_norm": 2.1771189138985862, "language_loss": 0.8892898, "learning_rate": 3.3184231808221654e-06, "loss": 0.91109097, "num_input_tokens_seen": 52175055, "step": 2435, "time_per_iteration": 2.681217908859253 }, { "auxiliary_loss_clip": 0.01155118, "auxiliary_loss_mlp": 0.01033263, "balance_loss_clip": 1.05694175, "balance_loss_mlp": 1.02406633, "epoch": 0.29291168159682557, "flos": 22455553190400.0, "grad_norm": 2.262849928895612, "language_loss": 0.62589145, "learning_rate": 3.3178373272640394e-06, "loss": 0.64777529, "num_input_tokens_seen": 52194150, "step": 2436, "time_per_iteration": 3.6749255657196045 }, { "auxiliary_loss_clip": 0.01199499, "auxiliary_loss_mlp": 0.01031743, "balance_loss_clip": 1.06032228, "balance_loss_mlp": 1.02245629, "epoch": 0.2930319244874647, "flos": 21170632896000.0, "grad_norm": 2.649043749734089, "language_loss": 0.85151625, "learning_rate": 3.3172512737910387e-06, "loss": 0.87382865, "num_input_tokens_seen": 52211660, "step": 2437, "time_per_iteration": 2.6237781047821045 }, { "auxiliary_loss_clip": 0.01184956, "auxiliary_loss_mlp": 0.0102857, "balance_loss_clip": 1.05711496, "balance_loss_mlp": 1.01940835, "epoch": 0.2931521673781038, "flos": 31357843931520.0, "grad_norm": 2.708051576170102, "language_loss": 0.88571012, "learning_rate": 3.3166650204920674e-06, "loss": 0.90784532, "num_input_tokens_seen": 52232830, "step": 2438, "time_per_iteration": 2.701833963394165 }, { "auxiliary_loss_clip": 0.01184241, "auxiliary_loss_mlp": 0.01033857, "balance_loss_clip": 1.0588665, "balance_loss_mlp": 1.02451062, "epoch": 0.29327241026874284, "flos": 24200990778240.0, "grad_norm": 1.5989241184061427, "language_loss": 0.81407964, "learning_rate": 3.316078567456059e-06, "loss": 0.83626056, "num_input_tokens_seen": 52250670, "step": 2439, "time_per_iteration": 2.7151200771331787 }, { "auxiliary_loss_clip": 0.01129022, "auxiliary_loss_mlp": 0.010324, "balance_loss_clip": 1.0554471, "balance_loss_mlp": 1.02286315, "epoch": 0.29339265315938196, "flos": 24242611662720.0, "grad_norm": 1.7282678725741347, "language_loss": 0.75882787, "learning_rate": 3.3154919147719786e-06, "loss": 0.78044212, "num_input_tokens_seen": 52271685, "step": 2440, "time_per_iteration": 2.954669237136841 }, { "auxiliary_loss_clip": 0.01185612, "auxiliary_loss_mlp": 0.01035556, "balance_loss_clip": 1.0587225, "balance_loss_mlp": 1.0268656, "epoch": 0.29351289605002107, "flos": 16946641134720.0, "grad_norm": 2.282757641572648, "language_loss": 0.86325878, "learning_rate": 3.31490506252882e-06, "loss": 0.88547045, "num_input_tokens_seen": 52291065, "step": 2441, "time_per_iteration": 2.8536105155944824 }, { "auxiliary_loss_clip": 0.01141829, "auxiliary_loss_mlp": 0.01030231, "balance_loss_clip": 1.0506649, "balance_loss_mlp": 1.02162457, "epoch": 0.2936331389406601, "flos": 19829082810240.0, "grad_norm": 1.7276403473924113, "language_loss": 0.83911133, "learning_rate": 3.31431801081561e-06, "loss": 0.86083192, "num_input_tokens_seen": 52310000, "step": 2442, "time_per_iteration": 2.7136945724487305 }, { "auxiliary_loss_clip": 0.01083139, "auxiliary_loss_mlp": 0.01005715, "balance_loss_clip": 1.03152466, "balance_loss_mlp": 1.00324762, "epoch": 0.29375338183129923, "flos": 71416844398080.0, "grad_norm": 0.9091216996388198, "language_loss": 0.67911696, "learning_rate": 3.313730759721402e-06, "loss": 0.70000553, "num_input_tokens_seen": 52372930, "step": 2443, "time_per_iteration": 3.348517656326294 }, { "auxiliary_loss_clip": 0.01163153, "auxiliary_loss_mlp": 0.01028669, "balance_loss_clip": 1.05725777, "balance_loss_mlp": 1.02001429, "epoch": 0.29387362472193834, "flos": 22054502862720.0, "grad_norm": 2.2625838467425416, "language_loss": 0.86426973, "learning_rate": 3.313143309335282e-06, "loss": 0.88618797, "num_input_tokens_seen": 52391420, "step": 2444, "time_per_iteration": 2.671774387359619 }, { "auxiliary_loss_clip": 0.01151801, "auxiliary_loss_mlp": 0.01030574, "balance_loss_clip": 1.05703032, "balance_loss_mlp": 1.02175212, "epoch": 0.2939938676125774, "flos": 22966418373120.0, "grad_norm": 1.9559878626705283, "language_loss": 0.84937912, "learning_rate": 3.3125556597463665e-06, "loss": 0.87120289, "num_input_tokens_seen": 52410725, "step": 2445, "time_per_iteration": 2.6699187755584717 }, { "auxiliary_loss_clip": 0.01183102, "auxiliary_loss_mlp": 0.01030615, "balance_loss_clip": 1.06058955, "balance_loss_mlp": 1.0214715, "epoch": 0.2941141105032165, "flos": 31358705857920.0, "grad_norm": 1.6511662066611528, "language_loss": 0.66286552, "learning_rate": 3.311967811043801e-06, "loss": 0.68500268, "num_input_tokens_seen": 52432645, "step": 2446, "time_per_iteration": 2.7378838062286377 }, { "auxiliary_loss_clip": 0.01183257, "auxiliary_loss_mlp": 0.01030797, "balance_loss_clip": 1.0600853, "balance_loss_mlp": 1.02120054, "epoch": 0.29423435339385556, "flos": 23222138273280.0, "grad_norm": 2.218451196362931, "language_loss": 0.81825912, "learning_rate": 3.3113797633167617e-06, "loss": 0.84039962, "num_input_tokens_seen": 52450940, "step": 2447, "time_per_iteration": 2.6145102977752686 }, { "auxiliary_loss_clip": 0.01197044, "auxiliary_loss_mlp": 0.01032415, "balance_loss_clip": 1.0579406, "balance_loss_mlp": 1.0230751, "epoch": 0.2943545962844947, "flos": 26864054138880.0, "grad_norm": 2.3481141780173105, "language_loss": 0.68441308, "learning_rate": 3.310791516654455e-06, "loss": 0.70670772, "num_input_tokens_seen": 52468000, "step": 2448, "time_per_iteration": 2.6181368827819824 }, { "auxiliary_loss_clip": 0.01156112, "auxiliary_loss_mlp": 0.0102897, "balance_loss_clip": 1.05421269, "balance_loss_mlp": 1.01957619, "epoch": 0.2944748391751338, "flos": 20231677422720.0, "grad_norm": 3.699104951376861, "language_loss": 0.79046154, "learning_rate": 3.3102030711461177e-06, "loss": 0.8123123, "num_input_tokens_seen": 52487575, "step": 2449, "time_per_iteration": 2.649047613143921 }, { "auxiliary_loss_clip": 0.01153336, "auxiliary_loss_mlp": 0.01031063, "balance_loss_clip": 1.05436635, "balance_loss_mlp": 1.021276, "epoch": 0.29459508206577284, "flos": 15960965045760.0, "grad_norm": 19.244693339019584, "language_loss": 0.68442953, "learning_rate": 3.3096144268810156e-06, "loss": 0.7062735, "num_input_tokens_seen": 52506335, "step": 2450, "time_per_iteration": 2.724727153778076 }, { "auxiliary_loss_clip": 0.01175493, "auxiliary_loss_mlp": 0.01032275, "balance_loss_clip": 1.05587721, "balance_loss_mlp": 1.02245176, "epoch": 0.29471532495641195, "flos": 20412882558720.0, "grad_norm": 1.983155077048195, "language_loss": 0.72823983, "learning_rate": 3.3090255839484462e-06, "loss": 0.75031757, "num_input_tokens_seen": 52524330, "step": 2451, "time_per_iteration": 2.626408338546753 }, { "auxiliary_loss_clip": 0.01164504, "auxiliary_loss_mlp": 0.0102957, "balance_loss_clip": 1.05198157, "balance_loss_mlp": 1.01998615, "epoch": 0.29483556784705106, "flos": 20376576887040.0, "grad_norm": 2.6618947715174057, "language_loss": 0.8534559, "learning_rate": 3.3084365424377366e-06, "loss": 0.87539667, "num_input_tokens_seen": 52543095, "step": 2452, "time_per_iteration": 2.6407482624053955 }, { "auxiliary_loss_clip": 0.01079652, "auxiliary_loss_mlp": 0.01009801, "balance_loss_clip": 1.05523419, "balance_loss_mlp": 1.007882, "epoch": 0.2949558107376901, "flos": 68555660595840.0, "grad_norm": 0.7382694604603491, "language_loss": 0.55996478, "learning_rate": 3.307847302438245e-06, "loss": 0.5808593, "num_input_tokens_seen": 52597075, "step": 2453, "time_per_iteration": 3.29021954536438 }, { "auxiliary_loss_clip": 0.01119244, "auxiliary_loss_mlp": 0.0103151, "balance_loss_clip": 1.04786384, "balance_loss_mlp": 1.02232492, "epoch": 0.2950760536283292, "flos": 16107085572480.0, "grad_norm": 37.893082299418225, "language_loss": 0.77598834, "learning_rate": 3.3072578640393562e-06, "loss": 0.79749596, "num_input_tokens_seen": 52614410, "step": 2454, "time_per_iteration": 3.0256595611572266 }, { "auxiliary_loss_clip": 0.0116568, "auxiliary_loss_mlp": 0.01026543, "balance_loss_clip": 1.05454004, "balance_loss_mlp": 1.0175308, "epoch": 0.29519629651896834, "flos": 20483626394880.0, "grad_norm": 1.9647516645951086, "language_loss": 0.79724342, "learning_rate": 3.3066682273304886e-06, "loss": 0.81916565, "num_input_tokens_seen": 52632055, "step": 2455, "time_per_iteration": 2.701439142227173 }, { "auxiliary_loss_clip": 0.01186727, "auxiliary_loss_mlp": 0.007135, "balance_loss_clip": 1.05821466, "balance_loss_mlp": 1.00080407, "epoch": 0.2953165394096074, "flos": 18916484941440.0, "grad_norm": 6.5983346968316985, "language_loss": 0.78307986, "learning_rate": 3.3060783924010904e-06, "loss": 0.80208218, "num_input_tokens_seen": 52649980, "step": 2456, "time_per_iteration": 2.6198043823242188 }, { "auxiliary_loss_clip": 0.01151548, "auxiliary_loss_mlp": 0.0103251, "balance_loss_clip": 1.05437088, "balance_loss_mlp": 1.02340817, "epoch": 0.2954367823002465, "flos": 20624467622400.0, "grad_norm": 2.316378463080622, "language_loss": 0.85064238, "learning_rate": 3.3054883593406387e-06, "loss": 0.87248296, "num_input_tokens_seen": 52664730, "step": 2457, "time_per_iteration": 3.6115033626556396 }, { "auxiliary_loss_clip": 0.01168948, "auxiliary_loss_mlp": 0.01030083, "balance_loss_clip": 1.05435622, "balance_loss_mlp": 1.02102923, "epoch": 0.2955570251908856, "flos": 31175525473920.0, "grad_norm": 2.5055305585510403, "language_loss": 0.64922726, "learning_rate": 3.3048981282386404e-06, "loss": 0.67121756, "num_input_tokens_seen": 52686040, "step": 2458, "time_per_iteration": 3.6527960300445557 }, { "auxiliary_loss_clip": 0.01135352, "auxiliary_loss_mlp": 0.01029446, "balance_loss_clip": 1.04962683, "balance_loss_mlp": 1.01972461, "epoch": 0.29567726808152467, "flos": 21650328051840.0, "grad_norm": 2.0008767781397867, "language_loss": 0.82922822, "learning_rate": 3.304307699184634e-06, "loss": 0.85087621, "num_input_tokens_seen": 52704630, "step": 2459, "time_per_iteration": 3.6032657623291016 }, { "auxiliary_loss_clip": 0.01170874, "auxiliary_loss_mlp": 0.01030436, "balance_loss_clip": 1.06036234, "balance_loss_mlp": 1.02163243, "epoch": 0.2957975109721638, "flos": 24243868638720.0, "grad_norm": 1.671921072327402, "language_loss": 0.79111803, "learning_rate": 3.3037170722681866e-06, "loss": 0.81313115, "num_input_tokens_seen": 52725465, "step": 2460, "time_per_iteration": 2.7454683780670166 }, { "auxiliary_loss_clip": 0.01143031, "auxiliary_loss_mlp": 0.01029619, "balance_loss_clip": 1.0527761, "balance_loss_mlp": 1.02073812, "epoch": 0.29591775386280283, "flos": 13479717352320.0, "grad_norm": 2.054969863670383, "language_loss": 0.67904854, "learning_rate": 3.3031262475788956e-06, "loss": 0.70077497, "num_input_tokens_seen": 52742405, "step": 2461, "time_per_iteration": 2.715660333633423 }, { "auxiliary_loss_clip": 0.01161018, "auxiliary_loss_mlp": 0.01032183, "balance_loss_clip": 1.05362058, "balance_loss_mlp": 1.02343285, "epoch": 0.29603799675344195, "flos": 17749783284480.0, "grad_norm": 1.72522555856317, "language_loss": 0.73336565, "learning_rate": 3.3025352252063897e-06, "loss": 0.75529766, "num_input_tokens_seen": 52761100, "step": 2462, "time_per_iteration": 3.5413191318511963 }, { "auxiliary_loss_clip": 0.01181341, "auxiliary_loss_mlp": 0.0103213, "balance_loss_clip": 1.05878758, "balance_loss_mlp": 1.02302873, "epoch": 0.29615823964408106, "flos": 22783920347520.0, "grad_norm": 2.332002183960939, "language_loss": 0.74627054, "learning_rate": 3.3019440052403252e-06, "loss": 0.76840532, "num_input_tokens_seen": 52780965, "step": 2463, "time_per_iteration": 2.6618549823760986 }, { "auxiliary_loss_clip": 0.01165741, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.05392885, "balance_loss_mlp": 1.02316403, "epoch": 0.2962784825347201, "flos": 23514199758720.0, "grad_norm": 1.8341048989778264, "language_loss": 0.70900542, "learning_rate": 3.30135258777039e-06, "loss": 0.73098952, "num_input_tokens_seen": 52800335, "step": 2464, "time_per_iteration": 2.707671642303467 }, { "auxiliary_loss_clip": 0.01184808, "auxiliary_loss_mlp": 0.00713102, "balance_loss_clip": 1.05545056, "balance_loss_mlp": 1.0008347, "epoch": 0.2963987254253592, "flos": 16362769559040.0, "grad_norm": 2.2094851024424056, "language_loss": 0.70447075, "learning_rate": 3.3007609728863024e-06, "loss": 0.72344989, "num_input_tokens_seen": 52818425, "step": 2465, "time_per_iteration": 2.6452906131744385 }, { "auxiliary_loss_clip": 0.01106475, "auxiliary_loss_mlp": 0.01027431, "balance_loss_clip": 1.05022645, "balance_loss_mlp": 1.01818061, "epoch": 0.29651896831599833, "flos": 33472263980160.0, "grad_norm": 1.792456294711337, "language_loss": 0.7289362, "learning_rate": 3.300169160677809e-06, "loss": 0.75027525, "num_input_tokens_seen": 52842340, "step": 2466, "time_per_iteration": 2.8943581581115723 }, { "auxiliary_loss_clip": 0.01157116, "auxiliary_loss_mlp": 0.01030688, "balance_loss_clip": 1.05489957, "balance_loss_mlp": 1.02118731, "epoch": 0.2966392112066374, "flos": 23805363404160.0, "grad_norm": 2.7452407273116406, "language_loss": 0.78606182, "learning_rate": 3.2995771512346878e-06, "loss": 0.80793989, "num_input_tokens_seen": 52860690, "step": 2467, "time_per_iteration": 2.8973631858825684 }, { "auxiliary_loss_clip": 0.01201774, "auxiliary_loss_mlp": 0.00712923, "balance_loss_clip": 1.0604372, "balance_loss_mlp": 1.00086617, "epoch": 0.2967594540972765, "flos": 19938466702080.0, "grad_norm": 2.238434489225264, "language_loss": 0.73507679, "learning_rate": 3.298984944646746e-06, "loss": 0.7542237, "num_input_tokens_seen": 52879370, "step": 2468, "time_per_iteration": 2.712719678878784 }, { "auxiliary_loss_clip": 0.01187039, "auxiliary_loss_mlp": 0.00712586, "balance_loss_clip": 1.05947924, "balance_loss_mlp": 1.00079346, "epoch": 0.2968796969879156, "flos": 23732823888000.0, "grad_norm": 1.9649077657878415, "language_loss": 0.81702459, "learning_rate": 3.298392541003822e-06, "loss": 0.83602077, "num_input_tokens_seen": 52898775, "step": 2469, "time_per_iteration": 2.6472134590148926 }, { "auxiliary_loss_clip": 0.0116406, "auxiliary_loss_mlp": 0.01026617, "balance_loss_clip": 1.05455053, "balance_loss_mlp": 1.01791477, "epoch": 0.29699993987855466, "flos": 22893699288960.0, "grad_norm": 1.7072884665151324, "language_loss": 0.89571583, "learning_rate": 3.2977999403957806e-06, "loss": 0.91762257, "num_input_tokens_seen": 52917535, "step": 2470, "time_per_iteration": 2.6962430477142334 }, { "auxiliary_loss_clip": 0.01200782, "auxiliary_loss_mlp": 0.01033947, "balance_loss_clip": 1.06136703, "balance_loss_mlp": 1.02412391, "epoch": 0.2971201827691938, "flos": 33832555349760.0, "grad_norm": 2.64160032872478, "language_loss": 0.67192376, "learning_rate": 3.2972071429125207e-06, "loss": 0.69427097, "num_input_tokens_seen": 52938755, "step": 2471, "time_per_iteration": 2.676610231399536 }, { "auxiliary_loss_clip": 0.01143333, "auxiliary_loss_mlp": 0.01030919, "balance_loss_clip": 1.05300975, "balance_loss_mlp": 1.02163219, "epoch": 0.2972404256598329, "flos": 22054359208320.0, "grad_norm": 2.192549688950857, "language_loss": 0.88728237, "learning_rate": 3.2966141486439682e-06, "loss": 0.90902495, "num_input_tokens_seen": 52957945, "step": 2472, "time_per_iteration": 2.7822508811950684 }, { "auxiliary_loss_clip": 0.01117505, "auxiliary_loss_mlp": 0.01030306, "balance_loss_clip": 1.0461328, "balance_loss_mlp": 1.02062583, "epoch": 0.29736066855047194, "flos": 31978595796480.0, "grad_norm": 3.362445573200131, "language_loss": 0.64524078, "learning_rate": 3.29602095768008e-06, "loss": 0.6667189, "num_input_tokens_seen": 52978460, "step": 2473, "time_per_iteration": 2.858103036880493 }, { "auxiliary_loss_clip": 0.01154901, "auxiliary_loss_mlp": 0.0103105, "balance_loss_clip": 1.05273855, "balance_loss_mlp": 1.0222578, "epoch": 0.29748091144111105, "flos": 33510401245440.0, "grad_norm": 1.858659655265789, "language_loss": 0.63437748, "learning_rate": 3.2954275701108437e-06, "loss": 0.65623701, "num_input_tokens_seen": 52999640, "step": 2474, "time_per_iteration": 2.738752603530884 }, { "auxiliary_loss_clip": 0.01126763, "auxiliary_loss_mlp": 0.01036558, "balance_loss_clip": 1.04930615, "balance_loss_mlp": 1.02682447, "epoch": 0.29760115433175016, "flos": 41283373409280.0, "grad_norm": 1.9487056398857905, "language_loss": 0.68915141, "learning_rate": 3.294833986026275e-06, "loss": 0.71078461, "num_input_tokens_seen": 53022880, "step": 2475, "time_per_iteration": 2.7735891342163086 }, { "auxiliary_loss_clip": 0.01140933, "auxiliary_loss_mlp": 0.01029562, "balance_loss_clip": 1.05244291, "balance_loss_mlp": 1.02079415, "epoch": 0.2977213972223892, "flos": 24493339572480.0, "grad_norm": 2.048869744701036, "language_loss": 0.85509497, "learning_rate": 3.29424020551642e-06, "loss": 0.87679994, "num_input_tokens_seen": 53041515, "step": 2476, "time_per_iteration": 2.635303020477295 }, { "auxiliary_loss_clip": 0.01201412, "auxiliary_loss_mlp": 0.0103674, "balance_loss_clip": 1.05951846, "balance_loss_mlp": 1.02711999, "epoch": 0.2978416401130283, "flos": 21285116519040.0, "grad_norm": 2.393480037998195, "language_loss": 0.71826363, "learning_rate": 3.2936462286713546e-06, "loss": 0.74064517, "num_input_tokens_seen": 53059865, "step": 2477, "time_per_iteration": 2.573296070098877 }, { "auxiliary_loss_clip": 0.01181109, "auxiliary_loss_mlp": 0.01032849, "balance_loss_clip": 1.05596733, "balance_loss_mlp": 1.02387309, "epoch": 0.2979618830036674, "flos": 25772154554880.0, "grad_norm": 2.3443239342169093, "language_loss": 0.77156287, "learning_rate": 3.2930520555811846e-06, "loss": 0.79370242, "num_input_tokens_seen": 53079490, "step": 2478, "time_per_iteration": 2.640331983566284 }, { "auxiliary_loss_clip": 0.01075616, "auxiliary_loss_mlp": 0.00713846, "balance_loss_clip": 1.04378057, "balance_loss_mlp": 1.00084829, "epoch": 0.2980821258943065, "flos": 23476996247040.0, "grad_norm": 2.0007327415238145, "language_loss": 0.80246723, "learning_rate": 3.292457686336046e-06, "loss": 0.82036185, "num_input_tokens_seen": 53098810, "step": 2479, "time_per_iteration": 2.873800754547119 }, { "auxiliary_loss_clip": 0.01097533, "auxiliary_loss_mlp": 0.01003796, "balance_loss_clip": 1.03829932, "balance_loss_mlp": 1.00149536, "epoch": 0.2982023687849456, "flos": 69752314195200.0, "grad_norm": 0.8517970549266984, "language_loss": 0.61232489, "learning_rate": 3.291863121026105e-06, "loss": 0.63333815, "num_input_tokens_seen": 53162590, "step": 2480, "time_per_iteration": 3.614772319793701 }, { "auxiliary_loss_clip": 0.0117961, "auxiliary_loss_mlp": 0.01034864, "balance_loss_clip": 1.05532169, "balance_loss_mlp": 1.02572083, "epoch": 0.29832261167558466, "flos": 29825930741760.0, "grad_norm": 2.221877262783085, "language_loss": 0.76590091, "learning_rate": 3.2912683597415547e-06, "loss": 0.78804564, "num_input_tokens_seen": 53186675, "step": 2481, "time_per_iteration": 2.7329277992248535 }, { "auxiliary_loss_clip": 0.01151414, "auxiliary_loss_mlp": 0.01030673, "balance_loss_clip": 1.05241466, "balance_loss_mlp": 1.02144635, "epoch": 0.29844285456622377, "flos": 33910158683520.0, "grad_norm": 2.0249924171031997, "language_loss": 0.78484428, "learning_rate": 3.2906734025726213e-06, "loss": 0.80666518, "num_input_tokens_seen": 53205940, "step": 2482, "time_per_iteration": 2.7844173908233643 }, { "auxiliary_loss_clip": 0.01187082, "auxiliary_loss_mlp": 0.01033647, "balance_loss_clip": 1.0564189, "balance_loss_mlp": 1.02431345, "epoch": 0.2985630974568629, "flos": 23876933253120.0, "grad_norm": 4.549031980255289, "language_loss": 0.88408875, "learning_rate": 3.290078249609559e-06, "loss": 0.90629601, "num_input_tokens_seen": 53225360, "step": 2483, "time_per_iteration": 4.4043967723846436 }, { "auxiliary_loss_clip": 0.01180361, "auxiliary_loss_mlp": 0.01038033, "balance_loss_clip": 1.0604434, "balance_loss_mlp": 1.02900922, "epoch": 0.29868334034750194, "flos": 21799106184960.0, "grad_norm": 2.4353772678792067, "language_loss": 0.88304472, "learning_rate": 3.2894829009426514e-06, "loss": 0.90522873, "num_input_tokens_seen": 53243195, "step": 2484, "time_per_iteration": 2.6476492881774902 }, { "auxiliary_loss_clip": 0.01177828, "auxiliary_loss_mlp": 0.01029322, "balance_loss_clip": 1.05573928, "balance_loss_mlp": 1.02075648, "epoch": 0.29880358323814105, "flos": 25666649331840.0, "grad_norm": 1.9623131759196726, "language_loss": 0.77776897, "learning_rate": 3.288887356662213e-06, "loss": 0.79984045, "num_input_tokens_seen": 53264530, "step": 2485, "time_per_iteration": 3.6494548320770264 }, { "auxiliary_loss_clip": 0.01106364, "auxiliary_loss_mlp": 0.01005218, "balance_loss_clip": 1.03898191, "balance_loss_mlp": 1.00279832, "epoch": 0.29892382612878016, "flos": 71005846003200.0, "grad_norm": 0.7846474416261118, "language_loss": 0.59778881, "learning_rate": 3.288291616858588e-06, "loss": 0.61890465, "num_input_tokens_seen": 53319920, "step": 2486, "time_per_iteration": 3.0699422359466553 }, { "auxiliary_loss_clip": 0.01129876, "auxiliary_loss_mlp": 0.01031012, "balance_loss_clip": 1.05446351, "balance_loss_mlp": 1.02195787, "epoch": 0.2990440690194192, "flos": 25481134563840.0, "grad_norm": 1.6766497746628735, "language_loss": 0.76602554, "learning_rate": 3.287695681622149e-06, "loss": 0.78763443, "num_input_tokens_seen": 53339270, "step": 2487, "time_per_iteration": 2.796882152557373 }, { "auxiliary_loss_clip": 0.01168641, "auxiliary_loss_mlp": 0.01027749, "balance_loss_clip": 1.05341804, "balance_loss_mlp": 1.01883817, "epoch": 0.2991643119100583, "flos": 23732357011200.0, "grad_norm": 1.7748594265677553, "language_loss": 0.80881715, "learning_rate": 3.2870995510432982e-06, "loss": 0.8307811, "num_input_tokens_seen": 53357750, "step": 2488, "time_per_iteration": 3.6220920085906982 }, { "auxiliary_loss_clip": 0.0117386, "auxiliary_loss_mlp": 0.01031567, "balance_loss_clip": 1.05566168, "balance_loss_mlp": 1.02323413, "epoch": 0.29928455480069743, "flos": 27417545786880.0, "grad_norm": 1.8745642369574058, "language_loss": 0.772506, "learning_rate": 3.2865032252124697e-06, "loss": 0.79456031, "num_input_tokens_seen": 53378265, "step": 2489, "time_per_iteration": 2.71344256401062 }, { "auxiliary_loss_clip": 0.0116647, "auxiliary_loss_mlp": 0.01035347, "balance_loss_clip": 1.055686, "balance_loss_mlp": 1.02649617, "epoch": 0.2994047976913365, "flos": 33692935184640.0, "grad_norm": 1.744976783508759, "language_loss": 0.77812982, "learning_rate": 3.2859067042201243e-06, "loss": 0.80014801, "num_input_tokens_seen": 53400305, "step": 2490, "time_per_iteration": 2.810715913772583 }, { "auxiliary_loss_clip": 0.01092784, "auxiliary_loss_mlp": 0.0102837, "balance_loss_clip": 1.04627836, "balance_loss_mlp": 1.01976275, "epoch": 0.2995250405819756, "flos": 16763963541120.0, "grad_norm": 2.2151418534948193, "language_loss": 0.78077453, "learning_rate": 3.2853099881567544e-06, "loss": 0.8019861, "num_input_tokens_seen": 53418705, "step": 2491, "time_per_iteration": 2.7604634761810303 }, { "auxiliary_loss_clip": 0.01193916, "auxiliary_loss_mlp": 0.01031406, "balance_loss_clip": 1.05760944, "balance_loss_mlp": 1.02268612, "epoch": 0.29964528347261465, "flos": 22963976248320.0, "grad_norm": 4.626611070335892, "language_loss": 0.79032016, "learning_rate": 3.284713077112881e-06, "loss": 0.81257337, "num_input_tokens_seen": 53438135, "step": 2492, "time_per_iteration": 2.6565539836883545 }, { "auxiliary_loss_clip": 0.01158116, "auxiliary_loss_mlp": 0.01031674, "balance_loss_clip": 1.05735421, "balance_loss_mlp": 1.02211308, "epoch": 0.29976552636325376, "flos": 16938021870720.0, "grad_norm": 2.80758827473736, "language_loss": 0.86742806, "learning_rate": 3.284115971179056e-06, "loss": 0.88932592, "num_input_tokens_seen": 53452165, "step": 2493, "time_per_iteration": 2.5913288593292236 }, { "auxiliary_loss_clip": 0.01122481, "auxiliary_loss_mlp": 0.01029315, "balance_loss_clip": 1.05402017, "balance_loss_mlp": 1.02067804, "epoch": 0.2998857692538929, "flos": 17056455989760.0, "grad_norm": 1.8084925963576157, "language_loss": 0.78569162, "learning_rate": 3.283518670445859e-06, "loss": 0.80720955, "num_input_tokens_seen": 53470075, "step": 2494, "time_per_iteration": 2.8069956302642822 }, { "auxiliary_loss_clip": 0.01085175, "auxiliary_loss_mlp": 0.00703979, "balance_loss_clip": 1.03466535, "balance_loss_mlp": 1.00055695, "epoch": 0.30000601214453193, "flos": 68831528025600.0, "grad_norm": 0.6846431812123948, "language_loss": 0.54330951, "learning_rate": 3.2829211750038995e-06, "loss": 0.56120104, "num_input_tokens_seen": 53538705, "step": 2495, "time_per_iteration": 3.2679944038391113 }, { "auxiliary_loss_clip": 0.01143331, "auxiliary_loss_mlp": 0.01025593, "balance_loss_clip": 1.0522027, "balance_loss_mlp": 1.01658678, "epoch": 0.30012625503517104, "flos": 17603267708160.0, "grad_norm": 1.9384353519735302, "language_loss": 0.89294839, "learning_rate": 3.2823234849438183e-06, "loss": 0.91463763, "num_input_tokens_seen": 53556740, "step": 2496, "time_per_iteration": 2.6400227546691895 }, { "auxiliary_loss_clip": 0.01167708, "auxiliary_loss_mlp": 0.01033067, "balance_loss_clip": 1.05551231, "balance_loss_mlp": 1.02469254, "epoch": 0.30024649792581015, "flos": 21252581775360.0, "grad_norm": 2.657897005002268, "language_loss": 0.75996625, "learning_rate": 3.2817256003562836e-06, "loss": 0.78197408, "num_input_tokens_seen": 53577115, "step": 2497, "time_per_iteration": 2.6972885131835938 }, { "auxiliary_loss_clip": 0.01120735, "auxiliary_loss_mlp": 0.01028867, "balance_loss_clip": 1.05116034, "balance_loss_mlp": 1.01947892, "epoch": 0.3003667408164492, "flos": 23003262748800.0, "grad_norm": 1.7883455548763894, "language_loss": 0.6575557, "learning_rate": 3.281127521331995e-06, "loss": 0.67905176, "num_input_tokens_seen": 53598295, "step": 2498, "time_per_iteration": 2.980675458908081 }, { "auxiliary_loss_clip": 0.01119012, "auxiliary_loss_mlp": 0.01003621, "balance_loss_clip": 1.03467226, "balance_loss_mlp": 1.00114179, "epoch": 0.3004869837070883, "flos": 64232340750720.0, "grad_norm": 0.8804507559976907, "language_loss": 0.60682893, "learning_rate": 3.2805292479616798e-06, "loss": 0.62805527, "num_input_tokens_seen": 53657160, "step": 2499, "time_per_iteration": 3.09578275680542 }, { "auxiliary_loss_clip": 0.0116892, "auxiliary_loss_mlp": 0.0102687, "balance_loss_clip": 1.05712616, "balance_loss_mlp": 1.01757181, "epoch": 0.30060722659772743, "flos": 26248653400320.0, "grad_norm": 2.5205318499160034, "language_loss": 0.91964114, "learning_rate": 3.2799307803360955e-06, "loss": 0.94159901, "num_input_tokens_seen": 53673090, "step": 2500, "time_per_iteration": 2.739744186401367 }, { "auxiliary_loss_clip": 0.01195208, "auxiliary_loss_mlp": 0.01029576, "balance_loss_clip": 1.0575664, "balance_loss_mlp": 1.02064133, "epoch": 0.3007274694883665, "flos": 24970879912320.0, "grad_norm": 2.4353564397183174, "language_loss": 0.81683832, "learning_rate": 3.27933211854603e-06, "loss": 0.83908617, "num_input_tokens_seen": 53692145, "step": 2501, "time_per_iteration": 2.646367073059082 }, { "auxiliary_loss_clip": 0.01169444, "auxiliary_loss_mlp": 0.01032625, "balance_loss_clip": 1.05886805, "balance_loss_mlp": 1.02293277, "epoch": 0.3008477123790056, "flos": 17055845458560.0, "grad_norm": 1.7897333765073966, "language_loss": 0.87006772, "learning_rate": 3.278733262682299e-06, "loss": 0.89208835, "num_input_tokens_seen": 53710000, "step": 2502, "time_per_iteration": 2.6154849529266357 }, { "auxiliary_loss_clip": 0.01197556, "auxiliary_loss_mlp": 0.01028006, "balance_loss_clip": 1.05764854, "balance_loss_mlp": 1.01938152, "epoch": 0.3009679552696447, "flos": 21506398254720.0, "grad_norm": 2.54043251365646, "language_loss": 0.8297528, "learning_rate": 3.2781342128357484e-06, "loss": 0.8520084, "num_input_tokens_seen": 53729355, "step": 2503, "time_per_iteration": 2.6274092197418213 }, { "auxiliary_loss_clip": 0.01151174, "auxiliary_loss_mlp": 0.01030837, "balance_loss_clip": 1.05477965, "balance_loss_mlp": 1.02252853, "epoch": 0.30108819816028376, "flos": 21134004001920.0, "grad_norm": 3.227542497601485, "language_loss": 0.80608559, "learning_rate": 3.2775349690972547e-06, "loss": 0.82790571, "num_input_tokens_seen": 53743505, "step": 2504, "time_per_iteration": 2.663074493408203 }, { "auxiliary_loss_clip": 0.01102878, "auxiliary_loss_mlp": 0.01010651, "balance_loss_clip": 1.03640985, "balance_loss_mlp": 1.00819516, "epoch": 0.30120844105092287, "flos": 71126434938240.0, "grad_norm": 0.7846204219647662, "language_loss": 0.51829243, "learning_rate": 3.276935531557722e-06, "loss": 0.53942776, "num_input_tokens_seen": 53808725, "step": 2505, "time_per_iteration": 3.345060348510742 }, { "auxiliary_loss_clip": 0.01135434, "auxiliary_loss_mlp": 0.01033684, "balance_loss_clip": 1.05096972, "balance_loss_mlp": 1.02430832, "epoch": 0.301328683941562, "flos": 20264571302400.0, "grad_norm": 2.978486821253971, "language_loss": 0.79640657, "learning_rate": 3.2763359003080837e-06, "loss": 0.81809771, "num_input_tokens_seen": 53825680, "step": 2506, "time_per_iteration": 2.689582109451294 }, { "auxiliary_loss_clip": 0.01090192, "auxiliary_loss_mlp": 0.01007084, "balance_loss_clip": 1.03211188, "balance_loss_mlp": 1.00461626, "epoch": 0.30144892683220104, "flos": 70648212240000.0, "grad_norm": 0.801744480617798, "language_loss": 0.62453008, "learning_rate": 3.2757360754393047e-06, "loss": 0.64550292, "num_input_tokens_seen": 53889750, "step": 2507, "time_per_iteration": 3.348723888397217 }, { "auxiliary_loss_clip": 0.01180761, "auxiliary_loss_mlp": 0.01033024, "balance_loss_clip": 1.05622339, "balance_loss_mlp": 1.02362478, "epoch": 0.30156916972284015, "flos": 22820549241600.0, "grad_norm": 4.040760991286955, "language_loss": 0.6383149, "learning_rate": 3.2751360570423767e-06, "loss": 0.66045272, "num_input_tokens_seen": 53908135, "step": 2508, "time_per_iteration": 2.642322540283203 }, { "auxiliary_loss_clip": 0.01163865, "auxiliary_loss_mlp": 0.01027598, "balance_loss_clip": 1.05600834, "balance_loss_mlp": 1.01851988, "epoch": 0.3016894126134792, "flos": 29899188529920.0, "grad_norm": 2.1953681694683356, "language_loss": 0.76229274, "learning_rate": 3.2745358452083236e-06, "loss": 0.7842074, "num_input_tokens_seen": 53931035, "step": 2509, "time_per_iteration": 4.67042350769043 }, { "auxiliary_loss_clip": 0.01182206, "auxiliary_loss_mlp": 0.01035208, "balance_loss_clip": 1.05735207, "balance_loss_mlp": 1.02607083, "epoch": 0.3018096555041183, "flos": 21546331200000.0, "grad_norm": 1.4411897499530135, "language_loss": 0.82328403, "learning_rate": 3.2739354400281955e-06, "loss": 0.84545821, "num_input_tokens_seen": 53952255, "step": 2510, "time_per_iteration": 3.557131052017212 }, { "auxiliary_loss_clip": 0.01078004, "auxiliary_loss_mlp": 0.00703846, "balance_loss_clip": 1.03169441, "balance_loss_mlp": 1.00042784, "epoch": 0.3019298983947574, "flos": 59136294597120.0, "grad_norm": 0.867968386592477, "language_loss": 0.63713109, "learning_rate": 3.2733348415930744e-06, "loss": 0.65494955, "num_input_tokens_seen": 54014125, "step": 2511, "time_per_iteration": 3.3091249465942383 }, { "auxiliary_loss_clip": 0.0114808, "auxiliary_loss_mlp": 0.01027745, "balance_loss_clip": 1.05542862, "balance_loss_mlp": 1.01836324, "epoch": 0.3020501412853965, "flos": 34423070941440.0, "grad_norm": 1.7973955539074087, "language_loss": 0.80738848, "learning_rate": 3.27273404999407e-06, "loss": 0.82914674, "num_input_tokens_seen": 54036345, "step": 2512, "time_per_iteration": 2.809788942337036 }, { "auxiliary_loss_clip": 0.01090367, "auxiliary_loss_mlp": 0.01003024, "balance_loss_clip": 1.03198838, "balance_loss_mlp": 1.00048506, "epoch": 0.3021703841760356, "flos": 71008288128000.0, "grad_norm": 0.8013205914810279, "language_loss": 0.6048162, "learning_rate": 3.272133065322322e-06, "loss": 0.62575006, "num_input_tokens_seen": 54094615, "step": 2513, "time_per_iteration": 3.2619969844818115 }, { "auxiliary_loss_clip": 0.01196036, "auxiliary_loss_mlp": 0.01031527, "balance_loss_clip": 1.05695128, "balance_loss_mlp": 1.02308142, "epoch": 0.3022906270666747, "flos": 21510528318720.0, "grad_norm": 1.6838339088496412, "language_loss": 0.79403961, "learning_rate": 3.271531887669e-06, "loss": 0.81631529, "num_input_tokens_seen": 54114675, "step": 2514, "time_per_iteration": 3.5599048137664795 }, { "auxiliary_loss_clip": 0.01134813, "auxiliary_loss_mlp": 0.01033755, "balance_loss_clip": 1.0482924, "balance_loss_mlp": 1.02356267, "epoch": 0.30241086995731375, "flos": 31132001168640.0, "grad_norm": 2.624581479143909, "language_loss": 0.63488925, "learning_rate": 3.2709305171253015e-06, "loss": 0.65657496, "num_input_tokens_seen": 54134795, "step": 2515, "time_per_iteration": 2.82083797454834 }, { "auxiliary_loss_clip": 0.01181238, "auxiliary_loss_mlp": 0.0103142, "balance_loss_clip": 1.05829167, "balance_loss_mlp": 1.02246737, "epoch": 0.30253111284795287, "flos": 23511542152320.0, "grad_norm": 2.506400756908259, "language_loss": 0.77612603, "learning_rate": 3.2703289537824536e-06, "loss": 0.79825258, "num_input_tokens_seen": 54154595, "step": 2516, "time_per_iteration": 2.6113269329071045 }, { "auxiliary_loss_clip": 0.01138164, "auxiliary_loss_mlp": 0.01031599, "balance_loss_clip": 1.05422401, "balance_loss_mlp": 1.02286112, "epoch": 0.302651355738592, "flos": 18725367651840.0, "grad_norm": 2.55392780948832, "language_loss": 0.78523302, "learning_rate": 3.269727197731714e-06, "loss": 0.80693066, "num_input_tokens_seen": 54167360, "step": 2517, "time_per_iteration": 2.6500399112701416 }, { "auxiliary_loss_clip": 0.01127384, "auxiliary_loss_mlp": 0.01028603, "balance_loss_clip": 1.05193591, "balance_loss_mlp": 1.02007961, "epoch": 0.30277159862923103, "flos": 22418888382720.0, "grad_norm": 1.6547908388646848, "language_loss": 0.77722013, "learning_rate": 3.269125249064367e-06, "loss": 0.79878008, "num_input_tokens_seen": 54187055, "step": 2518, "time_per_iteration": 2.718950033187866 }, { "auxiliary_loss_clip": 0.01199816, "auxiliary_loss_mlp": 0.0103005, "balance_loss_clip": 1.05839121, "balance_loss_mlp": 1.02120459, "epoch": 0.30289184151987014, "flos": 22273126992000.0, "grad_norm": 1.7426732305383286, "language_loss": 0.83624732, "learning_rate": 3.2685231078717297e-06, "loss": 0.8585459, "num_input_tokens_seen": 54207245, "step": 2519, "time_per_iteration": 2.623004913330078 }, { "auxiliary_loss_clip": 0.01140702, "auxiliary_loss_mlp": 0.00712991, "balance_loss_clip": 1.05296135, "balance_loss_mlp": 1.0008111, "epoch": 0.30301208441050925, "flos": 25225594231680.0, "grad_norm": 2.0956037298019057, "language_loss": 0.75774252, "learning_rate": 3.267920774245145e-06, "loss": 0.77627945, "num_input_tokens_seen": 54226650, "step": 2520, "time_per_iteration": 2.7119686603546143 }, { "auxiliary_loss_clip": 0.01182409, "auxiliary_loss_mlp": 0.01034554, "balance_loss_clip": 1.05774522, "balance_loss_mlp": 1.02530301, "epoch": 0.3031323273011483, "flos": 23039245198080.0, "grad_norm": 2.517243384793231, "language_loss": 0.8494069, "learning_rate": 3.2673182482759876e-06, "loss": 0.87157661, "num_input_tokens_seen": 54245765, "step": 2521, "time_per_iteration": 2.640887975692749 }, { "auxiliary_loss_clip": 0.01182208, "auxiliary_loss_mlp": 0.0103625, "balance_loss_clip": 1.05782676, "balance_loss_mlp": 1.02745199, "epoch": 0.3032525701917874, "flos": 18876695650560.0, "grad_norm": 1.8663008694027274, "language_loss": 0.66236204, "learning_rate": 3.266715530055659e-06, "loss": 0.68454659, "num_input_tokens_seen": 54263915, "step": 2522, "time_per_iteration": 2.5399723052978516 }, { "auxiliary_loss_clip": 0.01170034, "auxiliary_loss_mlp": 0.01030396, "balance_loss_clip": 1.05171192, "balance_loss_mlp": 1.02153301, "epoch": 0.30337281308242653, "flos": 17782641250560.0, "grad_norm": 2.3821004037441313, "language_loss": 0.80390596, "learning_rate": 3.2661126196755927e-06, "loss": 0.82591033, "num_input_tokens_seen": 54283025, "step": 2523, "time_per_iteration": 2.642327308654785 }, { "auxiliary_loss_clip": 0.0111282, "auxiliary_loss_mlp": 0.01005598, "balance_loss_clip": 1.03067613, "balance_loss_mlp": 1.00324917, "epoch": 0.3034930559730656, "flos": 57824298426240.0, "grad_norm": 0.7963770712343338, "language_loss": 0.5599317, "learning_rate": 3.265509517227248e-06, "loss": 0.5811159, "num_input_tokens_seen": 54339840, "step": 2524, "time_per_iteration": 3.141850471496582 }, { "auxiliary_loss_clip": 0.01163066, "auxiliary_loss_mlp": 0.01029222, "balance_loss_clip": 1.05091023, "balance_loss_mlp": 1.02048445, "epoch": 0.3036132988637047, "flos": 14755587419520.0, "grad_norm": 3.1864938550814084, "language_loss": 0.810422, "learning_rate": 3.264906222802115e-06, "loss": 0.83234489, "num_input_tokens_seen": 54357690, "step": 2525, "time_per_iteration": 2.6438612937927246 }, { "auxiliary_loss_clip": 0.01196768, "auxiliary_loss_mlp": 0.01032514, "balance_loss_clip": 1.05621481, "balance_loss_mlp": 1.02322137, "epoch": 0.30373354175434375, "flos": 21033203460480.0, "grad_norm": 2.413012414517776, "language_loss": 0.78436482, "learning_rate": 3.264302736491715e-06, "loss": 0.80665761, "num_input_tokens_seen": 54377810, "step": 2526, "time_per_iteration": 2.618408441543579 }, { "auxiliary_loss_clip": 0.01179171, "auxiliary_loss_mlp": 0.01029928, "balance_loss_clip": 1.058617, "balance_loss_mlp": 1.02018857, "epoch": 0.30385378464498286, "flos": 21143233797120.0, "grad_norm": 2.061178399847624, "language_loss": 0.86964387, "learning_rate": 3.263699058387594e-06, "loss": 0.89173484, "num_input_tokens_seen": 54395245, "step": 2527, "time_per_iteration": 2.6495730876922607 }, { "auxiliary_loss_clip": 0.01144003, "auxiliary_loss_mlp": 0.01032277, "balance_loss_clip": 1.04986405, "balance_loss_mlp": 1.02243066, "epoch": 0.30397402753562197, "flos": 20629244131200.0, "grad_norm": 2.74350696749128, "language_loss": 0.90416002, "learning_rate": 3.2630951885813315e-06, "loss": 0.92592281, "num_input_tokens_seen": 54412640, "step": 2528, "time_per_iteration": 2.6281046867370605 }, { "auxiliary_loss_clip": 0.01164471, "auxiliary_loss_mlp": 0.01027904, "balance_loss_clip": 1.05130482, "balance_loss_mlp": 1.01902282, "epoch": 0.304094270426261, "flos": 15085678429440.0, "grad_norm": 1.9547324440569667, "language_loss": 0.78390503, "learning_rate": 3.262491127164533e-06, "loss": 0.80582869, "num_input_tokens_seen": 54431455, "step": 2529, "time_per_iteration": 2.6398978233337402 }, { "auxiliary_loss_clip": 0.01171072, "auxiliary_loss_mlp": 0.00712871, "balance_loss_clip": 1.05540645, "balance_loss_mlp": 1.00088763, "epoch": 0.30421451331690014, "flos": 13845216193920.0, "grad_norm": 2.1960004307283723, "language_loss": 0.8018446, "learning_rate": 3.2618868742288337e-06, "loss": 0.82068402, "num_input_tokens_seen": 54448380, "step": 2530, "time_per_iteration": 2.6916146278381348 }, { "auxiliary_loss_clip": 0.01181021, "auxiliary_loss_mlp": 0.0103368, "balance_loss_clip": 1.05767953, "balance_loss_mlp": 1.02451897, "epoch": 0.30433475620753925, "flos": 17384212615680.0, "grad_norm": 2.269275625718882, "language_loss": 0.72492051, "learning_rate": 3.261282429865899e-06, "loss": 0.74706745, "num_input_tokens_seen": 54466385, "step": 2531, "time_per_iteration": 2.5848865509033203 }, { "auxiliary_loss_clip": 0.01170898, "auxiliary_loss_mlp": 0.00712273, "balance_loss_clip": 1.05739439, "balance_loss_mlp": 1.00072408, "epoch": 0.3044549990981783, "flos": 18916951818240.0, "grad_norm": 1.630317938660815, "language_loss": 0.72194117, "learning_rate": 3.2606777941674225e-06, "loss": 0.74077284, "num_input_tokens_seen": 54485040, "step": 2532, "time_per_iteration": 2.6531105041503906 }, { "auxiliary_loss_clip": 0.01124032, "auxiliary_loss_mlp": 0.01033429, "balance_loss_clip": 1.05076718, "balance_loss_mlp": 1.02407098, "epoch": 0.3045752419888174, "flos": 21068431724160.0, "grad_norm": 2.1612963599130968, "language_loss": 0.84382498, "learning_rate": 3.2600729672251276e-06, "loss": 0.8653996, "num_input_tokens_seen": 54502755, "step": 2533, "time_per_iteration": 2.68994402885437 }, { "auxiliary_loss_clip": 0.01197264, "auxiliary_loss_mlp": 0.00712972, "balance_loss_clip": 1.05787706, "balance_loss_mlp": 1.00076187, "epoch": 0.3046954848794565, "flos": 29096405516160.0, "grad_norm": 2.0073560528315166, "language_loss": 0.64981723, "learning_rate": 3.259467949130765e-06, "loss": 0.66891962, "num_input_tokens_seen": 54524165, "step": 2534, "time_per_iteration": 3.558602809906006 }, { "auxiliary_loss_clip": 0.01167181, "auxiliary_loss_mlp": 0.01032193, "balance_loss_clip": 1.05510545, "balance_loss_mlp": 1.0231812, "epoch": 0.3048157277700956, "flos": 20295346279680.0, "grad_norm": 2.3713242867607116, "language_loss": 0.82441199, "learning_rate": 3.2588627399761164e-06, "loss": 0.84640574, "num_input_tokens_seen": 54540160, "step": 2535, "time_per_iteration": 3.5256149768829346 }, { "auxiliary_loss_clip": 0.01166114, "auxiliary_loss_mlp": 0.01027334, "balance_loss_clip": 1.05530548, "balance_loss_mlp": 1.01848888, "epoch": 0.3049359706607347, "flos": 22739929165440.0, "grad_norm": 2.034817329692336, "language_loss": 0.70756221, "learning_rate": 3.2582573398529903e-06, "loss": 0.72949666, "num_input_tokens_seen": 54557515, "step": 2536, "time_per_iteration": 3.6237926483154297 }, { "auxiliary_loss_clip": 0.01151023, "auxiliary_loss_mlp": 0.01029782, "balance_loss_clip": 1.05304527, "balance_loss_mlp": 1.02047193, "epoch": 0.3050562135513738, "flos": 18434634969600.0, "grad_norm": 2.8238407860749315, "language_loss": 0.74225426, "learning_rate": 3.2576517488532265e-06, "loss": 0.76406229, "num_input_tokens_seen": 54573865, "step": 2537, "time_per_iteration": 2.763843536376953 }, { "auxiliary_loss_clip": 0.01179835, "auxiliary_loss_mlp": 0.01030461, "balance_loss_clip": 1.05399275, "balance_loss_mlp": 1.02194905, "epoch": 0.30517645644201286, "flos": 20370327920640.0, "grad_norm": 1.817907895600717, "language_loss": 0.87491453, "learning_rate": 3.257045967068692e-06, "loss": 0.89701742, "num_input_tokens_seen": 54593120, "step": 2538, "time_per_iteration": 2.598862409591675 }, { "auxiliary_loss_clip": 0.01201054, "auxiliary_loss_mlp": 0.01034466, "balance_loss_clip": 1.05906844, "balance_loss_mlp": 1.02596676, "epoch": 0.30529669933265197, "flos": 21945118970880.0, "grad_norm": 1.731781118447252, "language_loss": 0.82089317, "learning_rate": 3.2564399945912848e-06, "loss": 0.84324837, "num_input_tokens_seen": 54612910, "step": 2539, "time_per_iteration": 2.6247098445892334 }, { "auxiliary_loss_clip": 0.01134938, "auxiliary_loss_mlp": 0.01033703, "balance_loss_clip": 1.04920495, "balance_loss_mlp": 1.02467287, "epoch": 0.305416942223291, "flos": 21835411856640.0, "grad_norm": 2.3308774517179196, "language_loss": 0.82386851, "learning_rate": 3.2558338315129287e-06, "loss": 0.84555495, "num_input_tokens_seen": 54631055, "step": 2540, "time_per_iteration": 3.6596031188964844 }, { "auxiliary_loss_clip": 0.01174107, "auxiliary_loss_mlp": 0.01032093, "balance_loss_clip": 1.054968, "balance_loss_mlp": 1.02255666, "epoch": 0.30553718511393013, "flos": 33911810709120.0, "grad_norm": 2.134326120941438, "language_loss": 0.76131403, "learning_rate": 3.2552274779255785e-06, "loss": 0.7833761, "num_input_tokens_seen": 54651985, "step": 2541, "time_per_iteration": 2.6881589889526367 }, { "auxiliary_loss_clip": 0.01179057, "auxiliary_loss_mlp": 0.01029045, "balance_loss_clip": 1.0545249, "balance_loss_mlp": 1.01996684, "epoch": 0.30565742800456924, "flos": 22268530051200.0, "grad_norm": 1.998064671936765, "language_loss": 0.77346605, "learning_rate": 3.2546209339212184e-06, "loss": 0.79554707, "num_input_tokens_seen": 54671005, "step": 2542, "time_per_iteration": 2.647956371307373 }, { "auxiliary_loss_clip": 0.0116579, "auxiliary_loss_mlp": 0.01029197, "balance_loss_clip": 1.05236447, "balance_loss_mlp": 1.02076292, "epoch": 0.3057776708952083, "flos": 22565044823040.0, "grad_norm": 1.7487442719509807, "language_loss": 0.77700198, "learning_rate": 3.25401419959186e-06, "loss": 0.79895186, "num_input_tokens_seen": 54691615, "step": 2543, "time_per_iteration": 2.654876232147217 }, { "auxiliary_loss_clip": 0.01175093, "auxiliary_loss_mlp": 0.01029452, "balance_loss_clip": 1.0572474, "balance_loss_mlp": 1.02084458, "epoch": 0.3058979137858474, "flos": 21799213925760.0, "grad_norm": 6.168805152460574, "language_loss": 0.7679466, "learning_rate": 3.253407275029545e-06, "loss": 0.78999203, "num_input_tokens_seen": 54710520, "step": 2544, "time_per_iteration": 2.639817714691162 }, { "auxiliary_loss_clip": 0.0115196, "auxiliary_loss_mlp": 0.01033675, "balance_loss_clip": 1.05513453, "balance_loss_mlp": 1.02447212, "epoch": 0.3060181566764865, "flos": 26979435601920.0, "grad_norm": 2.3806117466619052, "language_loss": 0.79978669, "learning_rate": 3.2528001603263425e-06, "loss": 0.82164299, "num_input_tokens_seen": 54732590, "step": 2545, "time_per_iteration": 2.7425990104675293 }, { "auxiliary_loss_clip": 0.01181431, "auxiliary_loss_mlp": 0.01034472, "balance_loss_clip": 1.05910444, "balance_loss_mlp": 1.02500105, "epoch": 0.3061383995671256, "flos": 19865101173120.0, "grad_norm": 1.6851351369439131, "language_loss": 0.81383574, "learning_rate": 3.2521928555743514e-06, "loss": 0.83599472, "num_input_tokens_seen": 54749935, "step": 2546, "time_per_iteration": 2.754854202270508 }, { "auxiliary_loss_clip": 0.0115821, "auxiliary_loss_mlp": 0.00713217, "balance_loss_clip": 1.05225182, "balance_loss_mlp": 1.00078464, "epoch": 0.3062586424577647, "flos": 22127509255680.0, "grad_norm": 1.9593550696485362, "language_loss": 0.67638052, "learning_rate": 3.2515853608657e-06, "loss": 0.6950947, "num_input_tokens_seen": 54767935, "step": 2547, "time_per_iteration": 2.6941540241241455 }, { "auxiliary_loss_clip": 0.01174725, "auxiliary_loss_mlp": 0.01031999, "balance_loss_clip": 1.05371189, "balance_loss_mlp": 1.02248609, "epoch": 0.3063788853484038, "flos": 20845497962880.0, "grad_norm": 2.3341228102890836, "language_loss": 0.75145829, "learning_rate": 3.250977676292545e-06, "loss": 0.77352554, "num_input_tokens_seen": 54786175, "step": 2548, "time_per_iteration": 2.6990604400634766 }, { "auxiliary_loss_clip": 0.01165788, "auxiliary_loss_mlp": 0.01027151, "balance_loss_clip": 1.0536859, "balance_loss_mlp": 1.01782918, "epoch": 0.30649912823904285, "flos": 16209717707520.0, "grad_norm": 2.5256306387225442, "language_loss": 0.7946732, "learning_rate": 3.2503698019470712e-06, "loss": 0.81660253, "num_input_tokens_seen": 54801945, "step": 2549, "time_per_iteration": 2.627056837081909 }, { "auxiliary_loss_clip": 0.01178384, "auxiliary_loss_mlp": 0.0102921, "balance_loss_clip": 1.05279565, "balance_loss_mlp": 1.02033484, "epoch": 0.30661937112968196, "flos": 18617815353600.0, "grad_norm": 5.788290005771904, "language_loss": 0.78322476, "learning_rate": 3.249761737921492e-06, "loss": 0.80530071, "num_input_tokens_seen": 54818475, "step": 2550, "time_per_iteration": 2.6438231468200684 }, { "auxiliary_loss_clip": 0.01162329, "auxiliary_loss_mlp": 0.01031318, "balance_loss_clip": 1.05533862, "balance_loss_mlp": 1.02259803, "epoch": 0.30673961402032107, "flos": 31390809638400.0, "grad_norm": 3.672903764894724, "language_loss": 0.7454778, "learning_rate": 3.249153484308051e-06, "loss": 0.76741421, "num_input_tokens_seen": 54837090, "step": 2551, "time_per_iteration": 2.7675743103027344 }, { "auxiliary_loss_clip": 0.01119268, "auxiliary_loss_mlp": 0.01029824, "balance_loss_clip": 1.04765451, "balance_loss_mlp": 1.02113938, "epoch": 0.3068598569109601, "flos": 20229809915520.0, "grad_norm": 3.591179761963812, "language_loss": 0.77795172, "learning_rate": 3.2485450411990194e-06, "loss": 0.79944265, "num_input_tokens_seen": 54856445, "step": 2552, "time_per_iteration": 2.72118878364563 }, { "auxiliary_loss_clip": 0.01194826, "auxiliary_loss_mlp": 0.0102937, "balance_loss_clip": 1.05380821, "balance_loss_mlp": 1.02010179, "epoch": 0.30698009980159924, "flos": 29601991399680.0, "grad_norm": 2.502697098934329, "language_loss": 0.82156837, "learning_rate": 3.2479364086866983e-06, "loss": 0.84381032, "num_input_tokens_seen": 54876700, "step": 2553, "time_per_iteration": 2.649313449859619 }, { "auxiliary_loss_clip": 0.01168545, "auxiliary_loss_mlp": 0.00712583, "balance_loss_clip": 1.05905533, "balance_loss_mlp": 1.00089657, "epoch": 0.30710034269223835, "flos": 23842423261440.0, "grad_norm": 1.8311872973424677, "language_loss": 0.81553036, "learning_rate": 3.247327586863416e-06, "loss": 0.83434159, "num_input_tokens_seen": 54897580, "step": 2554, "time_per_iteration": 2.668168783187866 }, { "auxiliary_loss_clip": 0.01152226, "auxiliary_loss_mlp": 0.01029252, "balance_loss_clip": 1.05181479, "balance_loss_mlp": 1.02034068, "epoch": 0.3072205855828774, "flos": 25884986152320.0, "grad_norm": 2.4186045025122302, "language_loss": 0.77076459, "learning_rate": 3.2467185758215304e-06, "loss": 0.79257935, "num_input_tokens_seen": 54917320, "step": 2555, "time_per_iteration": 2.7619340419769287 }, { "auxiliary_loss_clip": 0.01153947, "auxiliary_loss_mlp": 0.00712361, "balance_loss_clip": 1.05510569, "balance_loss_mlp": 1.00092602, "epoch": 0.3073408284735165, "flos": 22236390357120.0, "grad_norm": 2.6472267537414154, "language_loss": 0.85841644, "learning_rate": 3.246109375653428e-06, "loss": 0.87707949, "num_input_tokens_seen": 54934085, "step": 2556, "time_per_iteration": 2.701894521713257 }, { "auxiliary_loss_clip": 0.01198947, "auxiliary_loss_mlp": 0.01029799, "balance_loss_clip": 1.05932689, "balance_loss_mlp": 1.0212276, "epoch": 0.30746107136415557, "flos": 19500284689920.0, "grad_norm": 2.0447623312722407, "language_loss": 0.78707755, "learning_rate": 3.2454999864515243e-06, "loss": 0.80936503, "num_input_tokens_seen": 54953460, "step": 2557, "time_per_iteration": 2.5870723724365234 }, { "auxiliary_loss_clip": 0.01157086, "auxiliary_loss_mlp": 0.00713208, "balance_loss_clip": 1.05010259, "balance_loss_mlp": 1.00085711, "epoch": 0.3075813142547947, "flos": 21724806902400.0, "grad_norm": 2.203070240431047, "language_loss": 0.69417703, "learning_rate": 3.244890408308263e-06, "loss": 0.7128799, "num_input_tokens_seen": 54974165, "step": 2558, "time_per_iteration": 2.6441967487335205 }, { "auxiliary_loss_clip": 0.01132802, "auxiliary_loss_mlp": 0.01031682, "balance_loss_clip": 1.04982448, "balance_loss_mlp": 1.0229733, "epoch": 0.3077015571454338, "flos": 24097963593600.0, "grad_norm": 2.3904710085875878, "language_loss": 0.61000919, "learning_rate": 3.2442806413161165e-06, "loss": 0.63165402, "num_input_tokens_seen": 54993810, "step": 2559, "time_per_iteration": 2.758042335510254 }, { "auxiliary_loss_clip": 0.01136144, "auxiliary_loss_mlp": 0.0102949, "balance_loss_clip": 1.05114734, "balance_loss_mlp": 1.01985228, "epoch": 0.30782180003607285, "flos": 18405476104320.0, "grad_norm": 2.028288871400844, "language_loss": 0.7626394, "learning_rate": 3.243670685567586e-06, "loss": 0.78429574, "num_input_tokens_seen": 55011210, "step": 2560, "time_per_iteration": 3.7105302810668945 }, { "auxiliary_loss_clip": 0.01162747, "auxiliary_loss_mlp": 0.00712677, "balance_loss_clip": 1.05369699, "balance_loss_mlp": 1.00081611, "epoch": 0.30794204292671196, "flos": 23878549365120.0, "grad_norm": 2.0693165462065077, "language_loss": 0.80310524, "learning_rate": 3.2430605411552012e-06, "loss": 0.82185948, "num_input_tokens_seen": 55031325, "step": 2561, "time_per_iteration": 3.5347437858581543 }, { "auxiliary_loss_clip": 0.01079282, "auxiliary_loss_mlp": 0.01006242, "balance_loss_clip": 1.03099918, "balance_loss_mlp": 1.00385821, "epoch": 0.30806228581735107, "flos": 67927800816000.0, "grad_norm": 0.8829704640380798, "language_loss": 0.70604569, "learning_rate": 3.2424502081715205e-06, "loss": 0.72690094, "num_input_tokens_seen": 55094440, "step": 2562, "time_per_iteration": 4.242029190063477 }, { "auxiliary_loss_clip": 0.01165101, "auxiliary_loss_mlp": 0.01034185, "balance_loss_clip": 1.05346775, "balance_loss_mlp": 1.02473724, "epoch": 0.3081825287079901, "flos": 23843213360640.0, "grad_norm": 2.0561076269850576, "language_loss": 0.77995527, "learning_rate": 3.241839686709132e-06, "loss": 0.80194813, "num_input_tokens_seen": 55115375, "step": 2563, "time_per_iteration": 2.7439961433410645 }, { "auxiliary_loss_clip": 0.01178643, "auxiliary_loss_mlp": 0.01032408, "balance_loss_clip": 1.05402112, "balance_loss_mlp": 1.02256095, "epoch": 0.30830277159862923, "flos": 16209969102720.0, "grad_norm": 3.0026847018169804, "language_loss": 0.82429868, "learning_rate": 3.2412289768606495e-06, "loss": 0.8464092, "num_input_tokens_seen": 55131945, "step": 2564, "time_per_iteration": 2.614872932434082 }, { "auxiliary_loss_clip": 0.01182057, "auxiliary_loss_mlp": 0.0102534, "balance_loss_clip": 1.05517483, "balance_loss_mlp": 1.01639938, "epoch": 0.30842301448926834, "flos": 29349503723520.0, "grad_norm": 1.811474340377324, "language_loss": 0.82908618, "learning_rate": 3.240618078718718e-06, "loss": 0.85116017, "num_input_tokens_seen": 55153405, "step": 2565, "time_per_iteration": 2.7029781341552734 }, { "auxiliary_loss_clip": 0.01144299, "auxiliary_loss_mlp": 0.01034116, "balance_loss_clip": 1.05058408, "balance_loss_mlp": 1.02434087, "epoch": 0.3085432573799074, "flos": 21945190798080.0, "grad_norm": 2.166859171145511, "language_loss": 0.74329793, "learning_rate": 3.240006992376011e-06, "loss": 0.76508212, "num_input_tokens_seen": 55173030, "step": 2566, "time_per_iteration": 3.6515040397644043 }, { "auxiliary_loss_clip": 0.01169378, "auxiliary_loss_mlp": 0.01026878, "balance_loss_clip": 1.05616879, "balance_loss_mlp": 1.01860523, "epoch": 0.3086635002705465, "flos": 22054718344320.0, "grad_norm": 2.185182879798361, "language_loss": 0.76542592, "learning_rate": 3.2393957179252284e-06, "loss": 0.78738844, "num_input_tokens_seen": 55189565, "step": 2567, "time_per_iteration": 2.648334264755249 }, { "auxiliary_loss_clip": 0.01199313, "auxiliary_loss_mlp": 0.01031104, "balance_loss_clip": 1.05823052, "balance_loss_mlp": 1.02203822, "epoch": 0.3087837431611856, "flos": 32665925520000.0, "grad_norm": 6.510338437615475, "language_loss": 0.80617726, "learning_rate": 3.2387842554591016e-06, "loss": 0.82848144, "num_input_tokens_seen": 55210380, "step": 2568, "time_per_iteration": 2.67206072807312 }, { "auxiliary_loss_clip": 0.01200821, "auxiliary_loss_mlp": 0.01030352, "balance_loss_clip": 1.05975914, "balance_loss_mlp": 1.02085114, "epoch": 0.3089039860518247, "flos": 17599245384960.0, "grad_norm": 3.7834487130604066, "language_loss": 0.87877965, "learning_rate": 3.238172605070388e-06, "loss": 0.90109134, "num_input_tokens_seen": 55225795, "step": 2569, "time_per_iteration": 2.5254480838775635 }, { "auxiliary_loss_clip": 0.01179785, "auxiliary_loss_mlp": 0.00714043, "balance_loss_clip": 1.05498004, "balance_loss_mlp": 1.00083649, "epoch": 0.3090242289424638, "flos": 14383839611520.0, "grad_norm": 2.325437902550166, "language_loss": 0.78719193, "learning_rate": 3.2375607668518745e-06, "loss": 0.80613017, "num_input_tokens_seen": 55238830, "step": 2570, "time_per_iteration": 2.6238598823547363 }, { "auxiliary_loss_clip": 0.01152692, "auxiliary_loss_mlp": 0.01031535, "balance_loss_clip": 1.05070949, "balance_loss_mlp": 1.02244544, "epoch": 0.30914447183310284, "flos": 16068625084800.0, "grad_norm": 2.1470615434117657, "language_loss": 0.89524913, "learning_rate": 3.236948740896377e-06, "loss": 0.91709149, "num_input_tokens_seen": 55253630, "step": 2571, "time_per_iteration": 2.6301138401031494 }, { "auxiliary_loss_clip": 0.01181037, "auxiliary_loss_mlp": 0.01028902, "balance_loss_clip": 1.05566108, "balance_loss_mlp": 1.01936579, "epoch": 0.30926471472374195, "flos": 32230221546240.0, "grad_norm": 1.5184334918663163, "language_loss": 0.8421644, "learning_rate": 3.2363365272967384e-06, "loss": 0.86426377, "num_input_tokens_seen": 55276200, "step": 2572, "time_per_iteration": 2.7024641036987305 }, { "auxiliary_loss_clip": 0.01184287, "auxiliary_loss_mlp": 0.01031303, "balance_loss_clip": 1.06095171, "balance_loss_mlp": 1.02271378, "epoch": 0.30938495761438106, "flos": 20370722970240.0, "grad_norm": 1.9354587364368425, "language_loss": 0.81424069, "learning_rate": 3.235724126145832e-06, "loss": 0.83639663, "num_input_tokens_seen": 55292235, "step": 2573, "time_per_iteration": 2.6061344146728516 }, { "auxiliary_loss_clip": 0.01171354, "auxiliary_loss_mlp": 0.01032678, "balance_loss_clip": 1.05287361, "balance_loss_mlp": 1.02376103, "epoch": 0.3095052005050201, "flos": 24061155131520.0, "grad_norm": 1.4933455496887502, "language_loss": 0.77566081, "learning_rate": 3.235111537536558e-06, "loss": 0.79770112, "num_input_tokens_seen": 55313050, "step": 2574, "time_per_iteration": 2.6575512886047363 }, { "auxiliary_loss_clip": 0.01182394, "auxiliary_loss_mlp": 0.01027504, "balance_loss_clip": 1.05674267, "balance_loss_mlp": 1.01861048, "epoch": 0.30962544339565923, "flos": 23401547729280.0, "grad_norm": 1.7812074441922248, "language_loss": 0.82704389, "learning_rate": 3.2344987615618456e-06, "loss": 0.84914285, "num_input_tokens_seen": 55332885, "step": 2575, "time_per_iteration": 2.666935682296753 }, { "auxiliary_loss_clip": 0.01149813, "auxiliary_loss_mlp": 0.01029959, "balance_loss_clip": 1.05589437, "balance_loss_mlp": 1.02112544, "epoch": 0.30974568628629834, "flos": 33799984692480.0, "grad_norm": 1.7486330999044504, "language_loss": 0.78381902, "learning_rate": 3.2338857983146533e-06, "loss": 0.8056168, "num_input_tokens_seen": 55354385, "step": 2576, "time_per_iteration": 2.8463282585144043 }, { "auxiliary_loss_clip": 0.01157121, "auxiliary_loss_mlp": 0.010338, "balance_loss_clip": 1.05535626, "balance_loss_mlp": 1.02432275, "epoch": 0.3098659291769374, "flos": 20229594433920.0, "grad_norm": 1.8966322418783426, "language_loss": 0.76662183, "learning_rate": 3.233272647887966e-06, "loss": 0.78853106, "num_input_tokens_seen": 55373275, "step": 2577, "time_per_iteration": 2.612037181854248 }, { "auxiliary_loss_clip": 0.01201499, "auxiliary_loss_mlp": 0.01035508, "balance_loss_clip": 1.06081605, "balance_loss_mlp": 1.02602482, "epoch": 0.3099861720675765, "flos": 24748556682240.0, "grad_norm": 1.73199495532542, "language_loss": 0.90074837, "learning_rate": 3.2326593103747985e-06, "loss": 0.92311841, "num_input_tokens_seen": 55392290, "step": 2578, "time_per_iteration": 2.6343483924865723 }, { "auxiliary_loss_clip": 0.0118257, "auxiliary_loss_mlp": 0.01035615, "balance_loss_clip": 1.06060874, "balance_loss_mlp": 1.02648914, "epoch": 0.3101064149582156, "flos": 11765485704960.0, "grad_norm": 8.612514418616907, "language_loss": 0.84935796, "learning_rate": 3.2320457858681936e-06, "loss": 0.87153983, "num_input_tokens_seen": 55410680, "step": 2579, "time_per_iteration": 2.626885175704956 }, { "auxiliary_loss_clip": 0.0116542, "auxiliary_loss_mlp": 0.01033856, "balance_loss_clip": 1.0551542, "balance_loss_mlp": 1.02522492, "epoch": 0.31022665784885467, "flos": 23033247626880.0, "grad_norm": 2.8151382225853383, "language_loss": 0.84670234, "learning_rate": 3.2314320744612228e-06, "loss": 0.86869514, "num_input_tokens_seen": 55425980, "step": 2580, "time_per_iteration": 2.8005268573760986 }, { "auxiliary_loss_clip": 0.0117679, "auxiliary_loss_mlp": 0.01035101, "balance_loss_clip": 1.05569267, "balance_loss_mlp": 1.02582049, "epoch": 0.3103469007394938, "flos": 16289188548480.0, "grad_norm": 2.5749206120299863, "language_loss": 0.76677775, "learning_rate": 3.2308181762469854e-06, "loss": 0.78889668, "num_input_tokens_seen": 55443925, "step": 2581, "time_per_iteration": 2.690800905227661 }, { "auxiliary_loss_clip": 0.01203162, "auxiliary_loss_mlp": 0.01036749, "balance_loss_clip": 1.05854928, "balance_loss_mlp": 1.02677131, "epoch": 0.3104671436301329, "flos": 30515271626880.0, "grad_norm": 2.0942329905656023, "language_loss": 0.78844547, "learning_rate": 3.230204091318609e-06, "loss": 0.81084454, "num_input_tokens_seen": 55464465, "step": 2582, "time_per_iteration": 2.6986632347106934 }, { "auxiliary_loss_clip": 0.01193409, "auxiliary_loss_mlp": 0.00713125, "balance_loss_clip": 1.05521059, "balance_loss_mlp": 1.00070941, "epoch": 0.31058738652077195, "flos": 20047240062720.0, "grad_norm": 2.316048812592329, "language_loss": 0.84479588, "learning_rate": 3.2295898197692503e-06, "loss": 0.8638612, "num_input_tokens_seen": 55483425, "step": 2583, "time_per_iteration": 2.5514752864837646 }, { "auxiliary_loss_clip": 0.01196495, "auxiliary_loss_mlp": 0.01032567, "balance_loss_clip": 1.05834198, "balance_loss_mlp": 1.02446067, "epoch": 0.31070762941141106, "flos": 28074639237120.0, "grad_norm": 1.6601491859967084, "language_loss": 0.79326844, "learning_rate": 3.228975361692094e-06, "loss": 0.81555903, "num_input_tokens_seen": 55504445, "step": 2584, "time_per_iteration": 2.677029848098755 }, { "auxiliary_loss_clip": 0.01187104, "auxiliary_loss_mlp": 0.00713959, "balance_loss_clip": 1.05631351, "balance_loss_mlp": 1.000772, "epoch": 0.31082787230205017, "flos": 20521907314560.0, "grad_norm": 2.3368904716059835, "language_loss": 0.80520207, "learning_rate": 3.228360717180352e-06, "loss": 0.82421267, "num_input_tokens_seen": 55521970, "step": 2585, "time_per_iteration": 2.670118570327759 }, { "auxiliary_loss_clip": 0.01114673, "auxiliary_loss_mlp": 0.00703844, "balance_loss_clip": 1.033283, "balance_loss_mlp": 1.00034666, "epoch": 0.3109481151926892, "flos": 62445928723200.0, "grad_norm": 0.80858109693556, "language_loss": 0.59415668, "learning_rate": 3.227745886327266e-06, "loss": 0.61234182, "num_input_tokens_seen": 55580665, "step": 2586, "time_per_iteration": 3.129953145980835 }, { "auxiliary_loss_clip": 0.011136, "auxiliary_loss_mlp": 0.01001271, "balance_loss_clip": 1.03261852, "balance_loss_mlp": 0.99918443, "epoch": 0.31106835808332833, "flos": 44746744723200.0, "grad_norm": 0.8827516154603364, "language_loss": 0.558245, "learning_rate": 3.227130869226105e-06, "loss": 0.57939368, "num_input_tokens_seen": 55637825, "step": 2587, "time_per_iteration": 4.930521011352539 }, { "auxiliary_loss_clip": 0.01183817, "auxiliary_loss_mlp": 0.01028528, "balance_loss_clip": 1.05802512, "balance_loss_mlp": 1.01947415, "epoch": 0.3111886009739674, "flos": 23403056100480.0, "grad_norm": 3.9402276533889884, "language_loss": 0.82771921, "learning_rate": 3.226515665970167e-06, "loss": 0.84984267, "num_input_tokens_seen": 55655365, "step": 2588, "time_per_iteration": 3.6235342025756836 }, { "auxiliary_loss_clip": 0.01180778, "auxiliary_loss_mlp": 0.01032484, "balance_loss_clip": 1.05582762, "balance_loss_mlp": 1.02320397, "epoch": 0.3113088438646065, "flos": 17530728192000.0, "grad_norm": 3.2647681751373496, "language_loss": 0.86335433, "learning_rate": 3.225900276652777e-06, "loss": 0.88548696, "num_input_tokens_seen": 55672140, "step": 2589, "time_per_iteration": 2.5719082355499268 }, { "auxiliary_loss_clip": 0.01171645, "auxiliary_loss_mlp": 0.01036751, "balance_loss_clip": 1.05571556, "balance_loss_mlp": 1.02794123, "epoch": 0.3114290867552456, "flos": 28365802882560.0, "grad_norm": 1.5843521683089277, "language_loss": 0.75645673, "learning_rate": 3.2252847013672906e-06, "loss": 0.77854073, "num_input_tokens_seen": 55694800, "step": 2590, "time_per_iteration": 2.7040858268737793 }, { "auxiliary_loss_clip": 0.01139098, "auxiliary_loss_mlp": 0.01028364, "balance_loss_clip": 1.04824054, "balance_loss_mlp": 1.01860666, "epoch": 0.31154932964588467, "flos": 27379157126400.0, "grad_norm": 2.0397767559168707, "language_loss": 0.75869113, "learning_rate": 3.224668940207089e-06, "loss": 0.78036571, "num_input_tokens_seen": 55713785, "step": 2591, "time_per_iteration": 2.7463929653167725 }, { "auxiliary_loss_clip": 0.01123732, "auxiliary_loss_mlp": 0.01035975, "balance_loss_clip": 1.047297, "balance_loss_mlp": 1.02639651, "epoch": 0.3116695725365238, "flos": 26541864120960.0, "grad_norm": 1.906613114170268, "language_loss": 0.86826873, "learning_rate": 3.2240529932655828e-06, "loss": 0.88986588, "num_input_tokens_seen": 55733050, "step": 2592, "time_per_iteration": 3.6618001461029053 }, { "auxiliary_loss_clip": 0.01165377, "auxiliary_loss_mlp": 0.01030057, "balance_loss_clip": 1.05839515, "balance_loss_mlp": 1.02074659, "epoch": 0.3117898154271629, "flos": 21177600134400.0, "grad_norm": 2.562712622849338, "language_loss": 0.88272548, "learning_rate": 3.223436860636211e-06, "loss": 0.90467983, "num_input_tokens_seen": 55748685, "step": 2593, "time_per_iteration": 2.6716015338897705 }, { "auxiliary_loss_clip": 0.0119808, "auxiliary_loss_mlp": 0.01030596, "balance_loss_clip": 1.05928063, "balance_loss_mlp": 1.02150643, "epoch": 0.31191005831780194, "flos": 27272430840960.0, "grad_norm": 1.6526466089810103, "language_loss": 0.74032134, "learning_rate": 3.2228205424124403e-06, "loss": 0.76260805, "num_input_tokens_seen": 55771840, "step": 2594, "time_per_iteration": 2.6290876865386963 }, { "auxiliary_loss_clip": 0.01151759, "auxiliary_loss_mlp": 0.01038019, "balance_loss_clip": 1.05158484, "balance_loss_mlp": 1.02829194, "epoch": 0.31203030120844105, "flos": 12963501043200.0, "grad_norm": 3.0239091876914603, "language_loss": 0.74623525, "learning_rate": 3.222204038687765e-06, "loss": 0.76813304, "num_input_tokens_seen": 55784975, "step": 2595, "time_per_iteration": 2.6696200370788574 }, { "auxiliary_loss_clip": 0.01178421, "auxiliary_loss_mlp": 0.01033618, "balance_loss_clip": 1.05656815, "balance_loss_mlp": 1.02495766, "epoch": 0.31215054409908016, "flos": 27562014288000.0, "grad_norm": 1.6562240829255577, "language_loss": 0.87731969, "learning_rate": 3.221587349555709e-06, "loss": 0.89944005, "num_input_tokens_seen": 55805235, "step": 2596, "time_per_iteration": 2.7220757007598877 }, { "auxiliary_loss_clip": 0.0116902, "auxiliary_loss_mlp": 0.01033935, "balance_loss_clip": 1.05520296, "balance_loss_mlp": 1.02500594, "epoch": 0.3122707869897192, "flos": 21506326427520.0, "grad_norm": 1.6688665536705716, "language_loss": 0.69463575, "learning_rate": 3.2209704751098236e-06, "loss": 0.71666527, "num_input_tokens_seen": 55824265, "step": 2597, "time_per_iteration": 2.6953320503234863 }, { "auxiliary_loss_clip": 0.01168105, "auxiliary_loss_mlp": 0.01032716, "balance_loss_clip": 1.05578828, "balance_loss_mlp": 1.02283931, "epoch": 0.31239102988035833, "flos": 15187017674880.0, "grad_norm": 2.248993420232969, "language_loss": 0.82795715, "learning_rate": 3.2203534154436875e-06, "loss": 0.84996539, "num_input_tokens_seen": 55838620, "step": 2598, "time_per_iteration": 2.6477668285369873 }, { "auxiliary_loss_clip": 0.01114312, "auxiliary_loss_mlp": 0.01035623, "balance_loss_clip": 1.048527, "balance_loss_mlp": 1.02646208, "epoch": 0.31251127277099744, "flos": 22053712763520.0, "grad_norm": 2.0062417998776887, "language_loss": 0.75530088, "learning_rate": 3.2197361706509084e-06, "loss": 0.77680022, "num_input_tokens_seen": 55859375, "step": 2599, "time_per_iteration": 2.749378204345703 }, { "auxiliary_loss_clip": 0.01199961, "auxiliary_loss_mlp": 0.0103489, "balance_loss_clip": 1.05852354, "balance_loss_mlp": 1.0251801, "epoch": 0.3126315156616365, "flos": 15193984913280.0, "grad_norm": 2.763889873536155, "language_loss": 0.83377105, "learning_rate": 3.2191187408251228e-06, "loss": 0.85611957, "num_input_tokens_seen": 55876535, "step": 2600, "time_per_iteration": 2.552778482437134 }, { "auxiliary_loss_clip": 0.01185489, "auxiliary_loss_mlp": 0.01034127, "balance_loss_clip": 1.05482054, "balance_loss_mlp": 1.02383971, "epoch": 0.3127517585522756, "flos": 18145338831360.0, "grad_norm": 2.0018484359477364, "language_loss": 0.78430355, "learning_rate": 3.218501126059993e-06, "loss": 0.80649966, "num_input_tokens_seen": 55891930, "step": 2601, "time_per_iteration": 2.620811939239502 }, { "auxiliary_loss_clip": 0.01182159, "auxiliary_loss_mlp": 0.01035585, "balance_loss_clip": 1.05310714, "balance_loss_mlp": 1.02602983, "epoch": 0.31287200144291466, "flos": 21908633731200.0, "grad_norm": 1.9803933908132925, "language_loss": 0.81350768, "learning_rate": 3.2178833264492116e-06, "loss": 0.83568513, "num_input_tokens_seen": 55910635, "step": 2602, "time_per_iteration": 2.6314892768859863 }, { "auxiliary_loss_clip": 0.01189647, "auxiliary_loss_mlp": 0.01029964, "balance_loss_clip": 1.05813813, "balance_loss_mlp": 1.0202961, "epoch": 0.31299224433355377, "flos": 29896997800320.0, "grad_norm": 1.6559861668059137, "language_loss": 0.76161963, "learning_rate": 3.217265342086498e-06, "loss": 0.78381574, "num_input_tokens_seen": 55931125, "step": 2603, "time_per_iteration": 2.692091464996338 }, { "auxiliary_loss_clip": 0.01156524, "auxiliary_loss_mlp": 0.00713937, "balance_loss_clip": 1.05523419, "balance_loss_mlp": 1.00089765, "epoch": 0.3131124872241929, "flos": 11655886331520.0, "grad_norm": 2.236248458038336, "language_loss": 0.72905767, "learning_rate": 3.216647173065599e-06, "loss": 0.74776226, "num_input_tokens_seen": 55946590, "step": 2604, "time_per_iteration": 2.739828109741211 }, { "auxiliary_loss_clip": 0.01162351, "auxiliary_loss_mlp": 0.01034422, "balance_loss_clip": 1.05475187, "balance_loss_mlp": 1.02540386, "epoch": 0.31323273011483194, "flos": 49848785470080.0, "grad_norm": 1.7089829871461175, "language_loss": 0.73785645, "learning_rate": 3.216028819480292e-06, "loss": 0.75982416, "num_input_tokens_seen": 55967930, "step": 2605, "time_per_iteration": 2.9237494468688965 }, { "auxiliary_loss_clip": 0.01153676, "auxiliary_loss_mlp": 0.01035877, "balance_loss_clip": 1.05275452, "balance_loss_mlp": 1.02688873, "epoch": 0.31335297300547105, "flos": 22601278667520.0, "grad_norm": 11.68352299285384, "language_loss": 0.76087523, "learning_rate": 3.2154102814243793e-06, "loss": 0.78277075, "num_input_tokens_seen": 55987070, "step": 2606, "time_per_iteration": 2.684924602508545 }, { "auxiliary_loss_clip": 0.01155654, "auxiliary_loss_mlp": 0.01036202, "balance_loss_clip": 1.05611205, "balance_loss_mlp": 1.02649868, "epoch": 0.31347321589611016, "flos": 34710858708480.0, "grad_norm": 2.1848105660649066, "language_loss": 0.67328131, "learning_rate": 3.2147915589916937e-06, "loss": 0.69519985, "num_input_tokens_seen": 56008630, "step": 2607, "time_per_iteration": 2.788149118423462 }, { "auxiliary_loss_clip": 0.01159222, "auxiliary_loss_mlp": 0.01035611, "balance_loss_clip": 1.05219197, "balance_loss_mlp": 1.02551997, "epoch": 0.3135934587867492, "flos": 19755789108480.0, "grad_norm": 1.9846214360778953, "language_loss": 0.82918072, "learning_rate": 3.2141726522760938e-06, "loss": 0.85112906, "num_input_tokens_seen": 56026690, "step": 2608, "time_per_iteration": 2.6832215785980225 }, { "auxiliary_loss_clip": 0.01096735, "auxiliary_loss_mlp": 0.01006036, "balance_loss_clip": 1.03144789, "balance_loss_mlp": 1.00377131, "epoch": 0.3137137016773883, "flos": 65815535583360.0, "grad_norm": 0.8375537411276609, "language_loss": 0.52653098, "learning_rate": 3.213553561371469e-06, "loss": 0.54755872, "num_input_tokens_seen": 56090425, "step": 2609, "time_per_iteration": 3.2780346870422363 }, { "auxiliary_loss_clip": 0.0113391, "auxiliary_loss_mlp": 0.01034577, "balance_loss_clip": 1.05340576, "balance_loss_mlp": 1.02598178, "epoch": 0.31383394456802743, "flos": 16252739222400.0, "grad_norm": 2.0622873164287014, "language_loss": 0.95708811, "learning_rate": 3.212934286371733e-06, "loss": 0.978773, "num_input_tokens_seen": 56107135, "step": 2610, "time_per_iteration": 2.711177110671997 }, { "auxiliary_loss_clip": 0.01184574, "auxiliary_loss_mlp": 0.01033573, "balance_loss_clip": 1.05992723, "balance_loss_mlp": 1.02368498, "epoch": 0.3139541874586665, "flos": 38795517613440.0, "grad_norm": 2.5268745896978126, "language_loss": 0.83650011, "learning_rate": 3.2123148273708304e-06, "loss": 0.85868156, "num_input_tokens_seen": 56127325, "step": 2611, "time_per_iteration": 2.796170949935913 }, { "auxiliary_loss_clip": 0.01195637, "auxiliary_loss_mlp": 0.01028087, "balance_loss_clip": 1.05670285, "balance_loss_mlp": 1.01839542, "epoch": 0.3140744303493056, "flos": 25046328430080.0, "grad_norm": 2.051014992219818, "language_loss": 0.76953089, "learning_rate": 3.211695184462733e-06, "loss": 0.79176819, "num_input_tokens_seen": 56148500, "step": 2612, "time_per_iteration": 3.5417497158050537 }, { "auxiliary_loss_clip": 0.01075413, "auxiliary_loss_mlp": 0.0100387, "balance_loss_clip": 1.0311631, "balance_loss_mlp": 1.00197411, "epoch": 0.3141946732399447, "flos": 72504254782080.0, "grad_norm": 0.8720164716232162, "language_loss": 0.60436887, "learning_rate": 3.2110753577414383e-06, "loss": 0.62516165, "num_input_tokens_seen": 56210080, "step": 2613, "time_per_iteration": 4.16603684425354 }, { "auxiliary_loss_clip": 0.01167685, "auxiliary_loss_mlp": 0.01035546, "balance_loss_clip": 1.05213404, "balance_loss_mlp": 1.02557421, "epoch": 0.31431491613058377, "flos": 19239788280960.0, "grad_norm": 2.521464383939193, "language_loss": 0.7916438, "learning_rate": 3.2104553473009757e-06, "loss": 0.81367606, "num_input_tokens_seen": 56228200, "step": 2614, "time_per_iteration": 2.6506686210632324 }, { "auxiliary_loss_clip": 0.01129131, "auxiliary_loss_mlp": 0.01030036, "balance_loss_clip": 1.04974222, "balance_loss_mlp": 1.02065969, "epoch": 0.3144351590212229, "flos": 36210596290560.0, "grad_norm": 1.9830822253598261, "language_loss": 0.67902696, "learning_rate": 3.209835153235399e-06, "loss": 0.70061862, "num_input_tokens_seen": 56249755, "step": 2615, "time_per_iteration": 3.7747907638549805 }, { "auxiliary_loss_clip": 0.01139299, "auxiliary_loss_mlp": 0.01034348, "balance_loss_clip": 1.05047178, "balance_loss_mlp": 1.02510941, "epoch": 0.314555401911862, "flos": 18551740285440.0, "grad_norm": 1.9168674162946393, "language_loss": 0.67925656, "learning_rate": 3.2092147756387916e-06, "loss": 0.700993, "num_input_tokens_seen": 56270080, "step": 2616, "time_per_iteration": 2.6975948810577393 }, { "auxiliary_loss_clip": 0.01157209, "auxiliary_loss_mlp": 0.01029413, "balance_loss_clip": 1.05016661, "balance_loss_mlp": 1.01984692, "epoch": 0.31467564480250104, "flos": 16362877299840.0, "grad_norm": 1.9021551633487643, "language_loss": 0.83220446, "learning_rate": 3.208594214605264e-06, "loss": 0.85407072, "num_input_tokens_seen": 56288625, "step": 2617, "time_per_iteration": 2.6234817504882812 }, { "auxiliary_loss_clip": 0.01152686, "auxiliary_loss_mlp": 0.01029655, "balance_loss_clip": 1.05070305, "balance_loss_mlp": 1.02035069, "epoch": 0.31479588769314015, "flos": 21652375127040.0, "grad_norm": 2.0812965754521793, "language_loss": 0.77360034, "learning_rate": 3.2079734702289553e-06, "loss": 0.79542369, "num_input_tokens_seen": 56307520, "step": 2618, "time_per_iteration": 3.601824998855591 }, { "auxiliary_loss_clip": 0.01094071, "auxiliary_loss_mlp": 0.00704106, "balance_loss_clip": 1.02848101, "balance_loss_mlp": 1.00036907, "epoch": 0.3149161305837792, "flos": 66051072040320.0, "grad_norm": 0.8062548281536688, "language_loss": 0.60403812, "learning_rate": 3.207352542604031e-06, "loss": 0.62201989, "num_input_tokens_seen": 56369855, "step": 2619, "time_per_iteration": 3.3081183433532715 }, { "auxiliary_loss_clip": 0.01137327, "auxiliary_loss_mlp": 0.01030645, "balance_loss_clip": 1.04920387, "balance_loss_mlp": 1.02147162, "epoch": 0.3150363734744183, "flos": 28987201192320.0, "grad_norm": 2.4078390699272454, "language_loss": 0.78840375, "learning_rate": 3.2067314318246864e-06, "loss": 0.81008351, "num_input_tokens_seen": 56390570, "step": 2620, "time_per_iteration": 2.7248451709747314 }, { "auxiliary_loss_clip": 0.01152441, "auxiliary_loss_mlp": 0.01030695, "balance_loss_clip": 1.05442548, "balance_loss_mlp": 1.02115798, "epoch": 0.31515661636505743, "flos": 27636600879360.0, "grad_norm": 1.778676499031753, "language_loss": 0.77938735, "learning_rate": 3.206110137985143e-06, "loss": 0.80121863, "num_input_tokens_seen": 56410775, "step": 2621, "time_per_iteration": 2.7724313735961914 }, { "auxiliary_loss_clip": 0.01138981, "auxiliary_loss_mlp": 0.01033297, "balance_loss_clip": 1.05142522, "balance_loss_mlp": 1.02361739, "epoch": 0.3152768592556965, "flos": 24605632465920.0, "grad_norm": 2.1101444793143007, "language_loss": 0.9215166, "learning_rate": 3.2054886611796505e-06, "loss": 0.94323933, "num_input_tokens_seen": 56429770, "step": 2622, "time_per_iteration": 2.7098803520202637 }, { "auxiliary_loss_clip": 0.01108734, "auxiliary_loss_mlp": 0.01001764, "balance_loss_clip": 1.0290494, "balance_loss_mlp": 0.99967748, "epoch": 0.3153971021463356, "flos": 68476908026880.0, "grad_norm": 0.8800016946338323, "language_loss": 0.63496768, "learning_rate": 3.204867001502487e-06, "loss": 0.65607274, "num_input_tokens_seen": 56488425, "step": 2623, "time_per_iteration": 3.1723475456237793 }, { "auxiliary_loss_clip": 0.01197692, "auxiliary_loss_mlp": 0.01036765, "balance_loss_clip": 1.05718124, "balance_loss_mlp": 1.02592933, "epoch": 0.3155173450369747, "flos": 25593714766080.0, "grad_norm": 1.9715900532873938, "language_loss": 0.80855453, "learning_rate": 3.2042451590479567e-06, "loss": 0.83089912, "num_input_tokens_seen": 56508940, "step": 2624, "time_per_iteration": 2.6256160736083984 }, { "auxiliary_loss_clip": 0.01192258, "auxiliary_loss_mlp": 0.01029969, "balance_loss_clip": 1.05483031, "balance_loss_mlp": 1.0204258, "epoch": 0.31563758792761376, "flos": 24309333175680.0, "grad_norm": 1.8834216452811372, "language_loss": 0.86899436, "learning_rate": 3.203623133910394e-06, "loss": 0.89121664, "num_input_tokens_seen": 56527245, "step": 2625, "time_per_iteration": 2.6286873817443848 }, { "auxiliary_loss_clip": 0.01120245, "auxiliary_loss_mlp": 0.01027065, "balance_loss_clip": 1.05032635, "balance_loss_mlp": 1.01800513, "epoch": 0.31575783081825287, "flos": 31903865550720.0, "grad_norm": 2.7587225680454512, "language_loss": 0.77135205, "learning_rate": 3.203000926184158e-06, "loss": 0.79282522, "num_input_tokens_seen": 56546170, "step": 2626, "time_per_iteration": 2.8095641136169434 }, { "auxiliary_loss_clip": 0.01196835, "auxiliary_loss_mlp": 0.0103163, "balance_loss_clip": 1.05798125, "balance_loss_mlp": 1.02235579, "epoch": 0.315878073708892, "flos": 30810960385920.0, "grad_norm": 1.6421287594862704, "language_loss": 0.77315962, "learning_rate": 3.202378535963639e-06, "loss": 0.79544425, "num_input_tokens_seen": 56567085, "step": 2627, "time_per_iteration": 2.6549630165100098 }, { "auxiliary_loss_clip": 0.0115442, "auxiliary_loss_mlp": 0.00714482, "balance_loss_clip": 1.05034387, "balance_loss_mlp": 1.00074816, "epoch": 0.31599831659953104, "flos": 22200264253440.0, "grad_norm": 1.6065563546245716, "language_loss": 0.83920991, "learning_rate": 3.2017559633432516e-06, "loss": 0.85789895, "num_input_tokens_seen": 56586715, "step": 2628, "time_per_iteration": 2.6719536781311035 }, { "auxiliary_loss_clip": 0.01173505, "auxiliary_loss_mlp": 0.01034743, "balance_loss_clip": 1.05489349, "balance_loss_mlp": 1.02565289, "epoch": 0.31611855949017015, "flos": 25593463370880.0, "grad_norm": 1.9919469101382967, "language_loss": 0.66591865, "learning_rate": 3.2011332084174398e-06, "loss": 0.6880011, "num_input_tokens_seen": 56607585, "step": 2629, "time_per_iteration": 2.685878276824951 }, { "auxiliary_loss_clip": 0.01177741, "auxiliary_loss_mlp": 0.01025616, "balance_loss_clip": 1.05328608, "balance_loss_mlp": 1.01605511, "epoch": 0.31623880238080926, "flos": 20594087694720.0, "grad_norm": 1.5936538718537088, "language_loss": 0.8922227, "learning_rate": 3.2005102712806756e-06, "loss": 0.91425622, "num_input_tokens_seen": 56626415, "step": 2630, "time_per_iteration": 2.6271142959594727 }, { "auxiliary_loss_clip": 0.01183508, "auxiliary_loss_mlp": 0.01035107, "balance_loss_clip": 1.05442131, "balance_loss_mlp": 1.02574313, "epoch": 0.3163590452714483, "flos": 12784917600000.0, "grad_norm": 2.08681932959789, "language_loss": 0.73399293, "learning_rate": 3.1998871520274575e-06, "loss": 0.75617909, "num_input_tokens_seen": 56641750, "step": 2631, "time_per_iteration": 2.5802643299102783 }, { "auxiliary_loss_clip": 0.01162116, "auxiliary_loss_mlp": 0.01035196, "balance_loss_clip": 1.04949832, "balance_loss_mlp": 1.02548647, "epoch": 0.3164792881620874, "flos": 23041292273280.0, "grad_norm": 1.6895805927113026, "language_loss": 0.85081679, "learning_rate": 3.199263850752312e-06, "loss": 0.87278986, "num_input_tokens_seen": 56662585, "step": 2632, "time_per_iteration": 2.684962034225464 }, { "auxiliary_loss_clip": 0.01182949, "auxiliary_loss_mlp": 0.01028679, "balance_loss_clip": 1.05543149, "balance_loss_mlp": 1.01924968, "epoch": 0.31659953105272653, "flos": 18296271780480.0, "grad_norm": 2.090987946510145, "language_loss": 0.86028749, "learning_rate": 3.198640367549795e-06, "loss": 0.88240379, "num_input_tokens_seen": 56681480, "step": 2633, "time_per_iteration": 2.593080759048462 }, { "auxiliary_loss_clip": 0.01181735, "auxiliary_loss_mlp": 0.00713024, "balance_loss_clip": 1.05443656, "balance_loss_mlp": 1.00059175, "epoch": 0.3167197739433656, "flos": 25703421880320.0, "grad_norm": 1.7378409484119557, "language_loss": 0.85706538, "learning_rate": 3.198016702514487e-06, "loss": 0.87601298, "num_input_tokens_seen": 56701760, "step": 2634, "time_per_iteration": 2.681157112121582 }, { "auxiliary_loss_clip": 0.01196447, "auxiliary_loss_mlp": 0.01031655, "balance_loss_clip": 1.05766046, "balance_loss_mlp": 1.02236307, "epoch": 0.3168400168340047, "flos": 23546016230400.0, "grad_norm": 2.0506054514525105, "language_loss": 0.8463043, "learning_rate": 3.1973928557409972e-06, "loss": 0.86858523, "num_input_tokens_seen": 56719800, "step": 2635, "time_per_iteration": 2.5892999172210693 }, { "auxiliary_loss_clip": 0.01197982, "auxiliary_loss_mlp": 0.01035451, "balance_loss_clip": 1.05805218, "balance_loss_mlp": 1.02593815, "epoch": 0.31696025972464376, "flos": 28366449327360.0, "grad_norm": 1.8810344439295774, "language_loss": 0.71076179, "learning_rate": 3.1967688273239636e-06, "loss": 0.73309612, "num_input_tokens_seen": 56739605, "step": 2636, "time_per_iteration": 2.6219708919525146 }, { "auxiliary_loss_clip": 0.01147897, "auxiliary_loss_mlp": 0.01033585, "balance_loss_clip": 1.05292046, "balance_loss_mlp": 1.02423334, "epoch": 0.31708050261528287, "flos": 16399111144320.0, "grad_norm": 1.6657303675512416, "language_loss": 0.82039845, "learning_rate": 3.1961446173580503e-06, "loss": 0.84221327, "num_input_tokens_seen": 56756545, "step": 2637, "time_per_iteration": 2.6584675312042236 }, { "auxiliary_loss_clip": 0.01162552, "auxiliary_loss_mlp": 0.01034281, "balance_loss_clip": 1.05579007, "balance_loss_mlp": 1.02453578, "epoch": 0.317200745505922, "flos": 26212347728640.0, "grad_norm": 2.569029146986301, "language_loss": 0.77115214, "learning_rate": 3.1955202259379502e-06, "loss": 0.79312038, "num_input_tokens_seen": 56778275, "step": 2638, "time_per_iteration": 3.558779239654541 }, { "auxiliary_loss_clip": 0.01177069, "auxiliary_loss_mlp": 0.01029729, "balance_loss_clip": 1.05365086, "balance_loss_mlp": 1.02050781, "epoch": 0.31732098839656103, "flos": 31350876693120.0, "grad_norm": 1.7301003390996665, "language_loss": 0.82780421, "learning_rate": 3.194895653158381e-06, "loss": 0.84987223, "num_input_tokens_seen": 56797215, "step": 2639, "time_per_iteration": 3.5869123935699463 }, { "auxiliary_loss_clip": 0.01112271, "auxiliary_loss_mlp": 0.01004458, "balance_loss_clip": 1.03152287, "balance_loss_mlp": 1.00218153, "epoch": 0.31744123128720014, "flos": 58989024835200.0, "grad_norm": 0.7674557067581028, "language_loss": 0.55505455, "learning_rate": 3.194270899114093e-06, "loss": 0.57622182, "num_input_tokens_seen": 56863010, "step": 2640, "time_per_iteration": 3.2431108951568604 }, { "auxiliary_loss_clip": 0.01189145, "auxiliary_loss_mlp": 0.01030903, "balance_loss_clip": 1.05748272, "balance_loss_mlp": 1.02152729, "epoch": 0.31756147417783925, "flos": 17417573372160.0, "grad_norm": 1.848307266062572, "language_loss": 0.82595205, "learning_rate": 3.193645963899858e-06, "loss": 0.84815258, "num_input_tokens_seen": 56880625, "step": 2641, "time_per_iteration": 3.4901797771453857 }, { "auxiliary_loss_clip": 0.01162148, "auxiliary_loss_mlp": 0.01033996, "balance_loss_clip": 1.05516076, "balance_loss_mlp": 1.02413774, "epoch": 0.3176817170684783, "flos": 25481673267840.0, "grad_norm": 1.7872146850304715, "language_loss": 0.83842367, "learning_rate": 3.193020847610479e-06, "loss": 0.86038518, "num_input_tokens_seen": 56900945, "step": 2642, "time_per_iteration": 2.657487630844116 }, { "auxiliary_loss_clip": 0.01159054, "auxiliary_loss_mlp": 0.01034613, "balance_loss_clip": 1.05402446, "balance_loss_mlp": 1.02475441, "epoch": 0.3178019599591174, "flos": 24972603765120.0, "grad_norm": 3.047319618763939, "language_loss": 0.71084118, "learning_rate": 3.192395550340787e-06, "loss": 0.73277783, "num_input_tokens_seen": 56918895, "step": 2643, "time_per_iteration": 2.6473536491394043 }, { "auxiliary_loss_clip": 0.01179835, "auxiliary_loss_mlp": 0.01036915, "balance_loss_clip": 1.05794311, "balance_loss_mlp": 1.02802801, "epoch": 0.31792220284975653, "flos": 12422220019200.0, "grad_norm": 1.9799739092259678, "language_loss": 0.77144432, "learning_rate": 3.191770072185638e-06, "loss": 0.79361188, "num_input_tokens_seen": 56935890, "step": 2644, "time_per_iteration": 3.585787057876587 }, { "auxiliary_loss_clip": 0.01180177, "auxiliary_loss_mlp": 0.01028065, "balance_loss_clip": 1.05808544, "balance_loss_mlp": 1.01876044, "epoch": 0.3180424457403956, "flos": 15485759089920.0, "grad_norm": 2.419252700146642, "language_loss": 0.72714078, "learning_rate": 3.191144413239916e-06, "loss": 0.74922317, "num_input_tokens_seen": 56952460, "step": 2645, "time_per_iteration": 2.5766077041625977 }, { "auxiliary_loss_clip": 0.01164912, "auxiliary_loss_mlp": 0.01030676, "balance_loss_clip": 1.05441082, "balance_loss_mlp": 1.0210917, "epoch": 0.3181626886310347, "flos": 26174964648960.0, "grad_norm": 2.435314028144155, "language_loss": 0.88522995, "learning_rate": 3.190518573598534e-06, "loss": 0.90718585, "num_input_tokens_seen": 56969065, "step": 2646, "time_per_iteration": 2.6941604614257812 }, { "auxiliary_loss_clip": 0.01156259, "auxiliary_loss_mlp": 0.01030839, "balance_loss_clip": 1.05268514, "balance_loss_mlp": 1.02129662, "epoch": 0.3182829315216738, "flos": 25483109811840.0, "grad_norm": 1.778515349735552, "language_loss": 0.77282608, "learning_rate": 3.1898925533564308e-06, "loss": 0.79469699, "num_input_tokens_seen": 56990535, "step": 2647, "time_per_iteration": 2.740488052368164 }, { "auxiliary_loss_clip": 0.01137047, "auxiliary_loss_mlp": 0.01033324, "balance_loss_clip": 1.05031896, "balance_loss_mlp": 1.02413893, "epoch": 0.31840317441231286, "flos": 18113701927680.0, "grad_norm": 2.155997503706367, "language_loss": 0.64053553, "learning_rate": 3.1892663526085733e-06, "loss": 0.66223931, "num_input_tokens_seen": 57008910, "step": 2648, "time_per_iteration": 2.66396164894104 }, { "auxiliary_loss_clip": 0.01110727, "auxiliary_loss_mlp": 0.01005329, "balance_loss_clip": 1.03056419, "balance_loss_mlp": 1.00305259, "epoch": 0.31852341730295197, "flos": 64741948957440.0, "grad_norm": 0.7908737671221548, "language_loss": 0.56934786, "learning_rate": 3.188639971449956e-06, "loss": 0.59050846, "num_input_tokens_seen": 57074960, "step": 2649, "time_per_iteration": 3.1723313331604004 }, { "auxiliary_loss_clip": 0.01197075, "auxiliary_loss_mlp": 0.01036902, "balance_loss_clip": 1.05644965, "balance_loss_mlp": 1.02694166, "epoch": 0.318643660193591, "flos": 20668135582080.0, "grad_norm": 2.54421166223567, "language_loss": 0.72287971, "learning_rate": 3.1880134099756e-06, "loss": 0.74521947, "num_input_tokens_seen": 57094595, "step": 2650, "time_per_iteration": 2.628854274749756 }, { "auxiliary_loss_clip": 0.01179466, "auxiliary_loss_mlp": 0.0102918, "balance_loss_clip": 1.0553118, "balance_loss_mlp": 1.02012014, "epoch": 0.31876390308423014, "flos": 26943345411840.0, "grad_norm": 1.8960939759219995, "language_loss": 0.69581777, "learning_rate": 3.1873866682805535e-06, "loss": 0.71790421, "num_input_tokens_seen": 57115290, "step": 2651, "time_per_iteration": 2.6827104091644287 }, { "auxiliary_loss_clip": 0.01168512, "auxiliary_loss_mlp": 0.0103068, "balance_loss_clip": 1.05601597, "balance_loss_mlp": 1.02139997, "epoch": 0.31888414597486925, "flos": 18041916597120.0, "grad_norm": 1.792940499176234, "language_loss": 0.88339549, "learning_rate": 3.186759746459894e-06, "loss": 0.90538746, "num_input_tokens_seen": 57134400, "step": 2652, "time_per_iteration": 2.6249687671661377 }, { "auxiliary_loss_clip": 0.01164001, "auxiliary_loss_mlp": 0.01034093, "balance_loss_clip": 1.05503035, "balance_loss_mlp": 1.02452624, "epoch": 0.3190043888655083, "flos": 25149319701120.0, "grad_norm": 4.132387891453955, "language_loss": 0.80084062, "learning_rate": 3.1861326446087246e-06, "loss": 0.8228215, "num_input_tokens_seen": 57153140, "step": 2653, "time_per_iteration": 2.729020833969116 }, { "auxiliary_loss_clip": 0.0118416, "auxiliary_loss_mlp": 0.01033604, "balance_loss_clip": 1.05711007, "balance_loss_mlp": 1.02522957, "epoch": 0.3191246317561474, "flos": 22053892331520.0, "grad_norm": 2.1860130174569443, "language_loss": 0.72079015, "learning_rate": 3.1855053628221763e-06, "loss": 0.74296778, "num_input_tokens_seen": 57172395, "step": 2654, "time_per_iteration": 2.641824960708618 }, { "auxiliary_loss_clip": 0.0113867, "auxiliary_loss_mlp": 0.01031661, "balance_loss_clip": 1.04857767, "balance_loss_mlp": 1.02146244, "epoch": 0.3192448746467865, "flos": 14901815687040.0, "grad_norm": 3.291745455751138, "language_loss": 0.89896452, "learning_rate": 3.184877901195407e-06, "loss": 0.92066783, "num_input_tokens_seen": 57189090, "step": 2655, "time_per_iteration": 2.6325998306274414 }, { "auxiliary_loss_clip": 0.01091734, "auxiliary_loss_mlp": 0.01015064, "balance_loss_clip": 1.04939246, "balance_loss_mlp": 1.0132159, "epoch": 0.3193651175374256, "flos": 67234832657280.0, "grad_norm": 0.7981206364150326, "language_loss": 0.62816954, "learning_rate": 3.184250259823602e-06, "loss": 0.64923751, "num_input_tokens_seen": 57251620, "step": 2656, "time_per_iteration": 3.3193624019622803 }, { "auxiliary_loss_clip": 0.01149921, "auxiliary_loss_mlp": 0.01033997, "balance_loss_clip": 1.05229044, "balance_loss_mlp": 1.02396536, "epoch": 0.3194853604280647, "flos": 12233077977600.0, "grad_norm": 2.388035934186836, "language_loss": 0.81892908, "learning_rate": 3.183622438801974e-06, "loss": 0.84076828, "num_input_tokens_seen": 57266910, "step": 2657, "time_per_iteration": 2.6556997299194336 }, { "auxiliary_loss_clip": 0.01200456, "auxiliary_loss_mlp": 0.01034237, "balance_loss_clip": 1.05951464, "balance_loss_mlp": 1.02434802, "epoch": 0.3196056033187038, "flos": 14939917038720.0, "grad_norm": 2.041265887690002, "language_loss": 0.75541496, "learning_rate": 3.1829944382257637e-06, "loss": 0.77776194, "num_input_tokens_seen": 57285040, "step": 2658, "time_per_iteration": 2.631307363510132 }, { "auxiliary_loss_clip": 0.01177832, "auxiliary_loss_mlp": 0.01030632, "balance_loss_clip": 1.055969, "balance_loss_mlp": 1.0216496, "epoch": 0.31972584620934286, "flos": 23768878164480.0, "grad_norm": 2.3737648541215517, "language_loss": 0.81589687, "learning_rate": 3.1823662581902373e-06, "loss": 0.83798152, "num_input_tokens_seen": 57302725, "step": 2659, "time_per_iteration": 2.660102367401123 }, { "auxiliary_loss_clip": 0.01131748, "auxiliary_loss_mlp": 0.01036197, "balance_loss_clip": 1.04497862, "balance_loss_mlp": 1.02683258, "epoch": 0.31984608909998197, "flos": 21251540280960.0, "grad_norm": 2.0525523348612924, "language_loss": 0.7515043, "learning_rate": 3.1817378987906896e-06, "loss": 0.77318376, "num_input_tokens_seen": 57322230, "step": 2660, "time_per_iteration": 2.725520133972168 }, { "auxiliary_loss_clip": 0.01131075, "auxiliary_loss_mlp": 0.01031809, "balance_loss_clip": 1.05531991, "balance_loss_mlp": 1.02230799, "epoch": 0.3199663319906211, "flos": 18296235866880.0, "grad_norm": 1.9634469729988864, "language_loss": 0.80086869, "learning_rate": 3.181109360122442e-06, "loss": 0.82249749, "num_input_tokens_seen": 57339820, "step": 2661, "time_per_iteration": 2.7021892070770264 }, { "auxiliary_loss_clip": 0.01147739, "auxiliary_loss_mlp": 0.01033331, "balance_loss_clip": 1.05261385, "balance_loss_mlp": 1.0230732, "epoch": 0.32008657488126013, "flos": 18733627779840.0, "grad_norm": 2.0329173910439393, "language_loss": 0.78320956, "learning_rate": 3.1804806422808445e-06, "loss": 0.80502021, "num_input_tokens_seen": 57356955, "step": 2662, "time_per_iteration": 2.6393001079559326 }, { "auxiliary_loss_clip": 0.01155716, "auxiliary_loss_mlp": 0.01039716, "balance_loss_clip": 1.05383396, "balance_loss_mlp": 1.03029835, "epoch": 0.32020681777189924, "flos": 20595344670720.0, "grad_norm": 1.6277914031141572, "language_loss": 0.72916621, "learning_rate": 3.1798517453612714e-06, "loss": 0.75112057, "num_input_tokens_seen": 57376760, "step": 2663, "time_per_iteration": 2.605053663253784 }, { "auxiliary_loss_clip": 0.01181927, "auxiliary_loss_mlp": 0.01026557, "balance_loss_clip": 1.06079388, "balance_loss_mlp": 1.01741385, "epoch": 0.32032706066253835, "flos": 35261692750080.0, "grad_norm": 1.8097885376165201, "language_loss": 0.75619566, "learning_rate": 3.1792226694591265e-06, "loss": 0.7782805, "num_input_tokens_seen": 57398145, "step": 2664, "time_per_iteration": 3.699167013168335 }, { "auxiliary_loss_clip": 0.01148706, "auxiliary_loss_mlp": 0.01041037, "balance_loss_clip": 1.05494893, "balance_loss_mlp": 1.03134525, "epoch": 0.3204473035531774, "flos": 15304230731520.0, "grad_norm": 2.279330509863167, "language_loss": 0.80530208, "learning_rate": 3.178593414669841e-06, "loss": 0.82719952, "num_input_tokens_seen": 57416730, "step": 2665, "time_per_iteration": 3.599191188812256 }, { "auxiliary_loss_clip": 0.01183349, "auxiliary_loss_mlp": 0.01033318, "balance_loss_clip": 1.05730605, "balance_loss_mlp": 1.02227306, "epoch": 0.3205675464438165, "flos": 24462564595200.0, "grad_norm": 2.011295838227847, "language_loss": 0.70870769, "learning_rate": 3.1779639810888707e-06, "loss": 0.7308743, "num_input_tokens_seen": 57436325, "step": 2666, "time_per_iteration": 3.6792027950286865 }, { "auxiliary_loss_clip": 0.01176121, "auxiliary_loss_mlp": 0.01032434, "balance_loss_clip": 1.05496407, "balance_loss_mlp": 1.02322459, "epoch": 0.3206877893344556, "flos": 22456235548800.0, "grad_norm": 1.9333628689162616, "language_loss": 0.76091927, "learning_rate": 3.1773343688117013e-06, "loss": 0.78300476, "num_input_tokens_seen": 57457235, "step": 2667, "time_per_iteration": 2.6657567024230957 }, { "auxiliary_loss_clip": 0.01168378, "auxiliary_loss_mlp": 0.00713564, "balance_loss_clip": 1.05485368, "balance_loss_mlp": 1.00060201, "epoch": 0.3208080322250947, "flos": 20412236113920.0, "grad_norm": 4.174916903935464, "language_loss": 0.83920491, "learning_rate": 3.1767045779338445e-06, "loss": 0.8580243, "num_input_tokens_seen": 57474895, "step": 2668, "time_per_iteration": 2.723332643508911 }, { "auxiliary_loss_clip": 0.01176051, "auxiliary_loss_mlp": 0.01030595, "balance_loss_clip": 1.05101347, "balance_loss_mlp": 1.02211297, "epoch": 0.3209282751157338, "flos": 21762118154880.0, "grad_norm": 2.398853321741799, "language_loss": 0.91387808, "learning_rate": 3.176074608550839e-06, "loss": 0.93594456, "num_input_tokens_seen": 57490715, "step": 2669, "time_per_iteration": 2.6086761951446533 }, { "auxiliary_loss_clip": 0.01115158, "auxiliary_loss_mlp": 0.01034176, "balance_loss_clip": 1.04872322, "balance_loss_mlp": 1.02487195, "epoch": 0.32104851800637285, "flos": 22055041566720.0, "grad_norm": 2.4699575958755893, "language_loss": 0.82283235, "learning_rate": 3.17544446075825e-06, "loss": 0.84432566, "num_input_tokens_seen": 57509880, "step": 2670, "time_per_iteration": 3.6774656772613525 }, { "auxiliary_loss_clip": 0.01170294, "auxiliary_loss_mlp": 0.01034178, "balance_loss_clip": 1.05470645, "balance_loss_mlp": 1.02527261, "epoch": 0.32116876089701196, "flos": 37012301896320.0, "grad_norm": 1.5337028502924792, "language_loss": 0.70571399, "learning_rate": 3.174814134651671e-06, "loss": 0.72775877, "num_input_tokens_seen": 57532430, "step": 2671, "time_per_iteration": 2.8082916736602783 }, { "auxiliary_loss_clip": 0.01191824, "auxiliary_loss_mlp": 0.01033869, "balance_loss_clip": 1.05605555, "balance_loss_mlp": 1.02450502, "epoch": 0.3212890037876511, "flos": 21979233912960.0, "grad_norm": 1.711403304954451, "language_loss": 0.80395842, "learning_rate": 3.1741836303267215e-06, "loss": 0.82621539, "num_input_tokens_seen": 57551965, "step": 2672, "time_per_iteration": 2.5820670127868652 }, { "auxiliary_loss_clip": 0.01196628, "auxiliary_loss_mlp": 0.01030639, "balance_loss_clip": 1.05863786, "balance_loss_mlp": 1.0215497, "epoch": 0.32140924667829013, "flos": 10342345875840.0, "grad_norm": 1.9425200488263477, "language_loss": 0.75477469, "learning_rate": 3.1735529478790496e-06, "loss": 0.7770474, "num_input_tokens_seen": 57569955, "step": 2673, "time_per_iteration": 2.608419895172119 }, { "auxiliary_loss_clip": 0.01182237, "auxiliary_loss_mlp": 0.01037477, "balance_loss_clip": 1.05669439, "balance_loss_mlp": 1.02794003, "epoch": 0.32152948956892924, "flos": 50798910072960.0, "grad_norm": 1.9131382381061686, "language_loss": 0.79925585, "learning_rate": 3.172922087404328e-06, "loss": 0.82145298, "num_input_tokens_seen": 57592215, "step": 2674, "time_per_iteration": 2.8735973834991455 }, { "auxiliary_loss_clip": 0.01116234, "auxiliary_loss_mlp": 0.01015881, "balance_loss_clip": 1.03792024, "balance_loss_mlp": 1.0135802, "epoch": 0.32164973245956835, "flos": 63863250549120.0, "grad_norm": 0.7719868103587827, "language_loss": 0.55206281, "learning_rate": 3.1722910489982586e-06, "loss": 0.57338393, "num_input_tokens_seen": 57652575, "step": 2675, "time_per_iteration": 3.268033742904663 }, { "auxiliary_loss_clip": 0.01160543, "auxiliary_loss_mlp": 0.01035223, "balance_loss_clip": 1.05281138, "balance_loss_mlp": 1.02569246, "epoch": 0.3217699753502074, "flos": 23513948363520.0, "grad_norm": 1.7027999150508621, "language_loss": 0.80100036, "learning_rate": 3.1716598327565694e-06, "loss": 0.82295799, "num_input_tokens_seen": 57672215, "step": 2676, "time_per_iteration": 2.6745853424072266 }, { "auxiliary_loss_clip": 0.01195293, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 1.05841672, "balance_loss_mlp": 1.01674974, "epoch": 0.3218902182408465, "flos": 19062533640960.0, "grad_norm": 1.5456310960773088, "language_loss": 0.84081537, "learning_rate": 3.171028438775015e-06, "loss": 0.86302614, "num_input_tokens_seen": 57691410, "step": 2677, "time_per_iteration": 2.5851597785949707 }, { "auxiliary_loss_clip": 0.01195338, "auxiliary_loss_mlp": 0.01033078, "balance_loss_clip": 1.05685461, "balance_loss_mlp": 1.02342165, "epoch": 0.3220104611314856, "flos": 20375571306240.0, "grad_norm": 2.2565190481874042, "language_loss": 0.84439147, "learning_rate": 3.170396867149377e-06, "loss": 0.86667562, "num_input_tokens_seen": 57709415, "step": 2678, "time_per_iteration": 2.5587713718414307 }, { "auxiliary_loss_clip": 0.01126366, "auxiliary_loss_mlp": 0.01037355, "balance_loss_clip": 1.05155587, "balance_loss_mlp": 1.0282414, "epoch": 0.3221307040221247, "flos": 20117014231680.0, "grad_norm": 2.3557622871508697, "language_loss": 0.86388171, "learning_rate": 3.1697651179754653e-06, "loss": 0.88551891, "num_input_tokens_seen": 57728075, "step": 2679, "time_per_iteration": 2.7074153423309326 }, { "auxiliary_loss_clip": 0.01155587, "auxiliary_loss_mlp": 0.01035845, "balance_loss_clip": 1.06055522, "balance_loss_mlp": 1.02593303, "epoch": 0.3222509469127638, "flos": 23987789602560.0, "grad_norm": 1.7126558287312645, "language_loss": 0.73066604, "learning_rate": 3.1691331913491153e-06, "loss": 0.7525804, "num_input_tokens_seen": 57750645, "step": 2680, "time_per_iteration": 2.755711793899536 }, { "auxiliary_loss_clip": 0.01194686, "auxiliary_loss_mlp": 0.01028836, "balance_loss_clip": 1.05635881, "balance_loss_mlp": 1.01913834, "epoch": 0.32237118980340285, "flos": 17675735397120.0, "grad_norm": 2.015532892620882, "language_loss": 0.84704125, "learning_rate": 3.1685010873661898e-06, "loss": 0.86927646, "num_input_tokens_seen": 57769820, "step": 2681, "time_per_iteration": 2.5564377307891846 }, { "auxiliary_loss_clip": 0.0117454, "auxiliary_loss_mlp": 0.01032563, "balance_loss_clip": 1.05403459, "balance_loss_mlp": 1.0229069, "epoch": 0.32249143269404196, "flos": 23147982645120.0, "grad_norm": 1.9352304618493807, "language_loss": 0.79471123, "learning_rate": 3.167868806122578e-06, "loss": 0.81678224, "num_input_tokens_seen": 57788870, "step": 2682, "time_per_iteration": 2.6852691173553467 }, { "auxiliary_loss_clip": 0.01166927, "auxiliary_loss_mlp": 0.01035442, "balance_loss_clip": 1.05422342, "balance_loss_mlp": 1.02579808, "epoch": 0.32261167558468107, "flos": 24422308427520.0, "grad_norm": 2.1192384768554673, "language_loss": 0.66062719, "learning_rate": 3.1672363477141968e-06, "loss": 0.68265086, "num_input_tokens_seen": 57808165, "step": 2683, "time_per_iteration": 2.6965928077697754 }, { "auxiliary_loss_clip": 0.01167363, "auxiliary_loss_mlp": 0.01030787, "balance_loss_clip": 1.05349493, "balance_loss_mlp": 1.02059495, "epoch": 0.3227319184753201, "flos": 30367175852160.0, "grad_norm": 1.9132011300581877, "language_loss": 0.84819877, "learning_rate": 3.1666037122369903e-06, "loss": 0.87018025, "num_input_tokens_seen": 57828825, "step": 2684, "time_per_iteration": 2.721184492111206 }, { "auxiliary_loss_clip": 0.01174078, "auxiliary_loss_mlp": 0.0103585, "balance_loss_clip": 1.05109239, "balance_loss_mlp": 1.02682638, "epoch": 0.32285216136595923, "flos": 16946174257920.0, "grad_norm": 2.421811194436069, "language_loss": 0.86619747, "learning_rate": 3.165970899786928e-06, "loss": 0.88829672, "num_input_tokens_seen": 57846740, "step": 2685, "time_per_iteration": 2.6265878677368164 }, { "auxiliary_loss_clip": 0.01154077, "auxiliary_loss_mlp": 0.0104028, "balance_loss_clip": 1.05482543, "balance_loss_mlp": 1.03055227, "epoch": 0.32297240425659834, "flos": 21981532383360.0, "grad_norm": 2.560132100740429, "language_loss": 0.75227928, "learning_rate": 3.1653379104600067e-06, "loss": 0.77422285, "num_input_tokens_seen": 57866885, "step": 2686, "time_per_iteration": 2.7245614528656006 }, { "auxiliary_loss_clip": 0.01178978, "auxiliary_loss_mlp": 0.01037467, "balance_loss_clip": 1.05496085, "balance_loss_mlp": 1.02835965, "epoch": 0.3230926471472374, "flos": 22748045639040.0, "grad_norm": 1.4976263030868313, "language_loss": 0.6931957, "learning_rate": 3.164704744352251e-06, "loss": 0.71536016, "num_input_tokens_seen": 57887690, "step": 2687, "time_per_iteration": 2.6274161338806152 }, { "auxiliary_loss_clip": 0.01176208, "auxiliary_loss_mlp": 0.01029471, "balance_loss_clip": 1.05459642, "balance_loss_mlp": 1.0202558, "epoch": 0.3232128900378765, "flos": 16942977947520.0, "grad_norm": 1.8691321434303023, "language_loss": 0.80647206, "learning_rate": 3.164071401559713e-06, "loss": 0.82852888, "num_input_tokens_seen": 57905090, "step": 2688, "time_per_iteration": 2.5949959754943848 }, { "auxiliary_loss_clip": 0.01167344, "auxiliary_loss_mlp": 0.01033981, "balance_loss_clip": 1.05575657, "balance_loss_mlp": 1.02462935, "epoch": 0.3233331329285156, "flos": 24023736138240.0, "grad_norm": 1.6119249116097045, "language_loss": 0.7131291, "learning_rate": 3.1634378821784674e-06, "loss": 0.73514235, "num_input_tokens_seen": 57925305, "step": 2689, "time_per_iteration": 2.6538989543914795 }, { "auxiliary_loss_clip": 0.01148776, "auxiliary_loss_mlp": 0.01029274, "balance_loss_clip": 1.05425048, "balance_loss_mlp": 1.0204643, "epoch": 0.3234533758191547, "flos": 18113845582080.0, "grad_norm": 2.2510721298239087, "language_loss": 0.74531984, "learning_rate": 3.1628041863046208e-06, "loss": 0.76710033, "num_input_tokens_seen": 57942720, "step": 2690, "time_per_iteration": 3.57318115234375 }, { "auxiliary_loss_clip": 0.01199321, "auxiliary_loss_mlp": 0.01031771, "balance_loss_clip": 1.05722022, "balance_loss_mlp": 1.02154255, "epoch": 0.3235736187097938, "flos": 16946138344320.0, "grad_norm": 2.139814102990125, "language_loss": 0.91476154, "learning_rate": 3.162170314034304e-06, "loss": 0.93707252, "num_input_tokens_seen": 57960135, "step": 2691, "time_per_iteration": 3.495560646057129 }, { "auxiliary_loss_clip": 0.01199778, "auxiliary_loss_mlp": 0.0102765, "balance_loss_clip": 1.05894637, "balance_loss_mlp": 1.01857185, "epoch": 0.3236938616004329, "flos": 22127150119680.0, "grad_norm": 1.5660053840166415, "language_loss": 0.80953586, "learning_rate": 3.1615362654636738e-06, "loss": 0.83181012, "num_input_tokens_seen": 57980875, "step": 2692, "time_per_iteration": 2.574756383895874 }, { "auxiliary_loss_clip": 0.01142687, "auxiliary_loss_mlp": 0.0103727, "balance_loss_clip": 1.0557754, "balance_loss_mlp": 1.02884769, "epoch": 0.32381410449107195, "flos": 17164618819200.0, "grad_norm": 1.8729082995468551, "language_loss": 0.87290519, "learning_rate": 3.1609020406889163e-06, "loss": 0.89470482, "num_input_tokens_seen": 57998310, "step": 2693, "time_per_iteration": 3.5900681018829346 }, { "auxiliary_loss_clip": 0.01165816, "auxiliary_loss_mlp": 0.01031253, "balance_loss_clip": 1.05293155, "balance_loss_mlp": 1.02287245, "epoch": 0.32393434738171106, "flos": 16578125550720.0, "grad_norm": 1.702149150086206, "language_loss": 0.8557927, "learning_rate": 3.1602676398062416e-06, "loss": 0.87776339, "num_input_tokens_seen": 58017220, "step": 2694, "time_per_iteration": 2.6026203632354736 }, { "auxiliary_loss_clip": 0.01178277, "auxiliary_loss_mlp": 0.01031245, "balance_loss_clip": 1.0557462, "balance_loss_mlp": 1.02185154, "epoch": 0.3240545902723502, "flos": 25483612602240.0, "grad_norm": 2.249505366145429, "language_loss": 0.62143797, "learning_rate": 3.1596330629118886e-06, "loss": 0.64353317, "num_input_tokens_seen": 58037190, "step": 2695, "time_per_iteration": 2.6424686908721924 }, { "auxiliary_loss_clip": 0.01125942, "auxiliary_loss_mlp": 0.01039317, "balance_loss_clip": 1.05144143, "balance_loss_mlp": 1.02969646, "epoch": 0.32417483316298923, "flos": 35845851634560.0, "grad_norm": 2.256220140167565, "language_loss": 0.72963953, "learning_rate": 3.1589983101021223e-06, "loss": 0.75129211, "num_input_tokens_seen": 58055820, "step": 2696, "time_per_iteration": 3.8942010402679443 }, { "auxiliary_loss_clip": 0.01164555, "auxiliary_loss_mlp": 0.01037013, "balance_loss_clip": 1.05378366, "balance_loss_mlp": 1.02797699, "epoch": 0.32429507605362834, "flos": 30080501406720.0, "grad_norm": 2.5469074191015615, "language_loss": 0.85182738, "learning_rate": 3.1583633814732337e-06, "loss": 0.87384313, "num_input_tokens_seen": 58075340, "step": 2697, "time_per_iteration": 2.6692075729370117 }, { "auxiliary_loss_clip": 0.01193769, "auxiliary_loss_mlp": 0.01033087, "balance_loss_clip": 1.05433452, "balance_loss_mlp": 1.02327609, "epoch": 0.3244153189442674, "flos": 18223265387520.0, "grad_norm": 3.5346850950002193, "language_loss": 0.72238117, "learning_rate": 3.157728277121541e-06, "loss": 0.74464977, "num_input_tokens_seen": 58093515, "step": 2698, "time_per_iteration": 2.5972321033477783 }, { "auxiliary_loss_clip": 0.01194663, "auxiliary_loss_mlp": 0.0103328, "balance_loss_clip": 1.05480134, "balance_loss_mlp": 1.02314758, "epoch": 0.3245355618349065, "flos": 17710317216000.0, "grad_norm": 3.2936845065701728, "language_loss": 0.78642017, "learning_rate": 3.1570929971433897e-06, "loss": 0.80869961, "num_input_tokens_seen": 58109300, "step": 2699, "time_per_iteration": 2.533177137374878 }, { "auxiliary_loss_clip": 0.01178407, "auxiliary_loss_mlp": 0.01032558, "balance_loss_clip": 1.05612469, "balance_loss_mlp": 1.02229977, "epoch": 0.3246558047255456, "flos": 23440798316160.0, "grad_norm": 1.8660330691186138, "language_loss": 0.83881545, "learning_rate": 3.1564575416351504e-06, "loss": 0.86092508, "num_input_tokens_seen": 58128000, "step": 2700, "time_per_iteration": 2.670745372772217 }, { "auxiliary_loss_clip": 0.01199096, "auxiliary_loss_mlp": 0.01037909, "balance_loss_clip": 1.06010485, "balance_loss_mlp": 1.02823496, "epoch": 0.32477604761618467, "flos": 21760861178880.0, "grad_norm": 2.037609620563687, "language_loss": 0.74467289, "learning_rate": 3.155821910693221e-06, "loss": 0.76704293, "num_input_tokens_seen": 58147415, "step": 2701, "time_per_iteration": 2.5842604637145996 }, { "auxiliary_loss_clip": 0.01161212, "auxiliary_loss_mlp": 0.01029823, "balance_loss_clip": 1.05345929, "balance_loss_mlp": 1.02048302, "epoch": 0.3248962905068238, "flos": 19828328624640.0, "grad_norm": 1.5775360580026632, "language_loss": 0.86243129, "learning_rate": 3.1551861044140275e-06, "loss": 0.8843416, "num_input_tokens_seen": 58167050, "step": 2702, "time_per_iteration": 2.6804018020629883 }, { "auxiliary_loss_clip": 0.01124727, "auxiliary_loss_mlp": 0.01036207, "balance_loss_clip": 1.0522573, "balance_loss_mlp": 1.02679539, "epoch": 0.3250165333974629, "flos": 23948215793280.0, "grad_norm": 1.6931635140170422, "language_loss": 0.77386332, "learning_rate": 3.15455012289402e-06, "loss": 0.79547274, "num_input_tokens_seen": 58186695, "step": 2703, "time_per_iteration": 2.8213000297546387 }, { "auxiliary_loss_clip": 0.01184948, "auxiliary_loss_mlp": 0.01029626, "balance_loss_clip": 1.06113267, "balance_loss_mlp": 1.02063751, "epoch": 0.32513677628810195, "flos": 23989333887360.0, "grad_norm": 1.9085006025460858, "language_loss": 0.83776742, "learning_rate": 3.153913966229677e-06, "loss": 0.85991317, "num_input_tokens_seen": 58205815, "step": 2704, "time_per_iteration": 2.6147425174713135 }, { "auxiliary_loss_clip": 0.01101584, "auxiliary_loss_mlp": 0.01003318, "balance_loss_clip": 1.03489578, "balance_loss_mlp": 1.0010767, "epoch": 0.32525701917874106, "flos": 70655790009600.0, "grad_norm": 0.6396653941046455, "language_loss": 0.50284719, "learning_rate": 3.1532776345175027e-06, "loss": 0.52389622, "num_input_tokens_seen": 58270960, "step": 2705, "time_per_iteration": 3.2314085960388184 }, { "auxiliary_loss_clip": 0.01197471, "auxiliary_loss_mlp": 0.01030448, "balance_loss_clip": 1.05991828, "balance_loss_mlp": 1.02186489, "epoch": 0.32537726206938017, "flos": 19682639061120.0, "grad_norm": 1.8052017325285128, "language_loss": 0.78800142, "learning_rate": 3.1526411278540285e-06, "loss": 0.81028068, "num_input_tokens_seen": 58289390, "step": 2706, "time_per_iteration": 2.5624704360961914 }, { "auxiliary_loss_clip": 0.01169531, "auxiliary_loss_mlp": 0.01030367, "balance_loss_clip": 1.05361879, "balance_loss_mlp": 1.02103865, "epoch": 0.3254975049600192, "flos": 28760999293440.0, "grad_norm": 2.051233738816413, "language_loss": 0.81293827, "learning_rate": 3.1520044463358116e-06, "loss": 0.83493721, "num_input_tokens_seen": 58306120, "step": 2707, "time_per_iteration": 2.708256483078003 }, { "auxiliary_loss_clip": 0.01177659, "auxiliary_loss_mlp": 0.01031161, "balance_loss_clip": 1.05645561, "balance_loss_mlp": 1.02229834, "epoch": 0.32561774785065833, "flos": 18877378008960.0, "grad_norm": 1.4673032363112999, "language_loss": 0.80305517, "learning_rate": 3.151367590059436e-06, "loss": 0.82514334, "num_input_tokens_seen": 58324545, "step": 2708, "time_per_iteration": 2.60387921333313 }, { "auxiliary_loss_clip": 0.01195667, "auxiliary_loss_mlp": 0.00713255, "balance_loss_clip": 1.05757141, "balance_loss_mlp": 1.00069845, "epoch": 0.32573799074129745, "flos": 23112107936640.0, "grad_norm": 1.878241826645491, "language_loss": 0.86855507, "learning_rate": 3.1507305591215117e-06, "loss": 0.88764423, "num_input_tokens_seen": 58342455, "step": 2709, "time_per_iteration": 2.593348503112793 }, { "auxiliary_loss_clip": 0.01099685, "auxiliary_loss_mlp": 0.01001248, "balance_loss_clip": 1.0340178, "balance_loss_mlp": 0.99894696, "epoch": 0.3258582336319365, "flos": 71237650423680.0, "grad_norm": 0.6679859028402084, "language_loss": 0.5574075, "learning_rate": 3.150093353618677e-06, "loss": 0.57841671, "num_input_tokens_seen": 58407185, "step": 2710, "time_per_iteration": 3.23982310295105 }, { "auxiliary_loss_clip": 0.01184787, "auxiliary_loss_mlp": 0.010294, "balance_loss_clip": 1.05563939, "balance_loss_mlp": 1.02012515, "epoch": 0.3259784765225756, "flos": 22456020067200.0, "grad_norm": 2.4428373879493717, "language_loss": 0.88538975, "learning_rate": 3.149455973647596e-06, "loss": 0.90753162, "num_input_tokens_seen": 58425245, "step": 2711, "time_per_iteration": 2.614678144454956 }, { "auxiliary_loss_clip": 0.01138017, "auxiliary_loss_mlp": 0.01029427, "balance_loss_clip": 1.0491091, "balance_loss_mlp": 1.01985455, "epoch": 0.32609871941321467, "flos": 20484811543680.0, "grad_norm": 1.8977220835627235, "language_loss": 0.77191436, "learning_rate": 3.1488184193049563e-06, "loss": 0.79358888, "num_input_tokens_seen": 58444780, "step": 2712, "time_per_iteration": 2.7059483528137207 }, { "auxiliary_loss_clip": 0.01196983, "auxiliary_loss_mlp": 0.01031944, "balance_loss_clip": 1.06065989, "balance_loss_mlp": 1.02292562, "epoch": 0.3262189623038538, "flos": 22416805393920.0, "grad_norm": 1.6918261782433073, "language_loss": 0.72048146, "learning_rate": 3.1481806906874767e-06, "loss": 0.74277073, "num_input_tokens_seen": 58466090, "step": 2713, "time_per_iteration": 2.63840389251709 }, { "auxiliary_loss_clip": 0.01193447, "auxiliary_loss_mlp": 0.01030194, "balance_loss_clip": 1.05668128, "balance_loss_mlp": 1.02157474, "epoch": 0.3263392051944929, "flos": 20923496346240.0, "grad_norm": 1.6834415464980483, "language_loss": 0.8758136, "learning_rate": 3.147542787891899e-06, "loss": 0.89805007, "num_input_tokens_seen": 58485435, "step": 2714, "time_per_iteration": 2.6414668560028076 }, { "auxiliary_loss_clip": 0.01167478, "auxiliary_loss_mlp": 0.01033407, "balance_loss_clip": 1.05905259, "balance_loss_mlp": 1.02433503, "epoch": 0.32645944808513194, "flos": 24025172682240.0, "grad_norm": 4.159213078601547, "language_loss": 0.75181174, "learning_rate": 3.1469047110149926e-06, "loss": 0.77382064, "num_input_tokens_seen": 58504175, "step": 2715, "time_per_iteration": 2.710602283477783 }, { "auxiliary_loss_clip": 0.01126307, "auxiliary_loss_mlp": 0.01032874, "balance_loss_clip": 1.05259931, "balance_loss_mlp": 1.02376604, "epoch": 0.32657969097577105, "flos": 21032413361280.0, "grad_norm": 1.8380466617526512, "language_loss": 0.85197908, "learning_rate": 3.146266460153554e-06, "loss": 0.87357092, "num_input_tokens_seen": 58523885, "step": 2716, "time_per_iteration": 3.6253530979156494 }, { "auxiliary_loss_clip": 0.01162535, "auxiliary_loss_mlp": 0.00713802, "balance_loss_clip": 1.05559659, "balance_loss_mlp": 1.00053072, "epoch": 0.32669993386641016, "flos": 22710267509760.0, "grad_norm": 1.6991441932214186, "language_loss": 0.79911649, "learning_rate": 3.145628035404404e-06, "loss": 0.8178798, "num_input_tokens_seen": 58543085, "step": 2717, "time_per_iteration": 3.7310807704925537 }, { "auxiliary_loss_clip": 0.01094542, "auxiliary_loss_mlp": 0.01002896, "balance_loss_clip": 1.03070831, "balance_loss_mlp": 1.00048757, "epoch": 0.3268201767570492, "flos": 72105718406400.0, "grad_norm": 0.8821237390024028, "language_loss": 0.5754534, "learning_rate": 3.1449894368643922e-06, "loss": 0.5964278, "num_input_tokens_seen": 58605400, "step": 2718, "time_per_iteration": 3.294029712677002 }, { "auxiliary_loss_clip": 0.01148352, "auxiliary_loss_mlp": 0.01026487, "balance_loss_clip": 1.05770588, "balance_loss_mlp": 1.0178802, "epoch": 0.32694041964768833, "flos": 24535175938560.0, "grad_norm": 1.4805723245499984, "language_loss": 0.71232271, "learning_rate": 3.1443506646303934e-06, "loss": 0.73407114, "num_input_tokens_seen": 58626700, "step": 2719, "time_per_iteration": 3.7314491271972656 }, { "auxiliary_loss_clip": 0.01180714, "auxiliary_loss_mlp": 0.01028571, "balance_loss_clip": 1.0559727, "balance_loss_mlp": 1.01949275, "epoch": 0.32706066253832744, "flos": 33183003755520.0, "grad_norm": 3.059160016457673, "language_loss": 0.66949934, "learning_rate": 3.1437117187993086e-06, "loss": 0.69159216, "num_input_tokens_seen": 58649020, "step": 2720, "time_per_iteration": 2.662818670272827 }, { "auxiliary_loss_clip": 0.01142637, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.05247343, "balance_loss_mlp": 1.02704942, "epoch": 0.3271809054289665, "flos": 24061622008320.0, "grad_norm": 1.6460227875093887, "language_loss": 0.80132514, "learning_rate": 3.143072599468065e-06, "loss": 0.82311237, "num_input_tokens_seen": 58668845, "step": 2721, "time_per_iteration": 2.7147834300994873 }, { "auxiliary_loss_clip": 0.01165508, "auxiliary_loss_mlp": 0.01033487, "balance_loss_clip": 1.05766928, "balance_loss_mlp": 1.02433717, "epoch": 0.3273011483196056, "flos": 38253769712640.0, "grad_norm": 1.506880104725669, "language_loss": 0.75913662, "learning_rate": 3.1424333067336174e-06, "loss": 0.78112656, "num_input_tokens_seen": 58691610, "step": 2722, "time_per_iteration": 3.745246648788452 }, { "auxiliary_loss_clip": 0.0118494, "auxiliary_loss_mlp": 0.01030151, "balance_loss_clip": 1.05721259, "balance_loss_mlp": 1.02137709, "epoch": 0.3274213912102447, "flos": 29054389582080.0, "grad_norm": 1.660843011840758, "language_loss": 0.78460407, "learning_rate": 3.141793840692945e-06, "loss": 0.80675495, "num_input_tokens_seen": 58712360, "step": 2723, "time_per_iteration": 2.668184757232666 }, { "auxiliary_loss_clip": 0.01154886, "auxiliary_loss_mlp": 0.01031081, "balance_loss_clip": 1.05257404, "balance_loss_mlp": 1.02178299, "epoch": 0.32754163410088377, "flos": 29133249891840.0, "grad_norm": 2.175518178377112, "language_loss": 0.61368775, "learning_rate": 3.1411542014430553e-06, "loss": 0.6355474, "num_input_tokens_seen": 58733440, "step": 2724, "time_per_iteration": 2.723083257675171 }, { "auxiliary_loss_clip": 0.01144176, "auxiliary_loss_mlp": 0.01032245, "balance_loss_clip": 1.04955196, "balance_loss_mlp": 1.02335751, "epoch": 0.3276618769915229, "flos": 20631075724800.0, "grad_norm": 1.9066300108448901, "language_loss": 0.8177352, "learning_rate": 3.1405143890809804e-06, "loss": 0.83949947, "num_input_tokens_seen": 58752735, "step": 2725, "time_per_iteration": 2.6659481525421143 }, { "auxiliary_loss_clip": 0.01159286, "auxiliary_loss_mlp": 0.01033505, "balance_loss_clip": 1.05602312, "balance_loss_mlp": 1.02427852, "epoch": 0.327782119882162, "flos": 18657425076480.0, "grad_norm": 1.7399982924115365, "language_loss": 0.69871926, "learning_rate": 3.1398744037037796e-06, "loss": 0.72064722, "num_input_tokens_seen": 58772070, "step": 2726, "time_per_iteration": 2.735630750656128 }, { "auxiliary_loss_clip": 0.01163832, "auxiliary_loss_mlp": 0.01035492, "balance_loss_clip": 1.05575621, "balance_loss_mlp": 1.0268079, "epoch": 0.32790236277280105, "flos": 21795802133760.0, "grad_norm": 2.0616817174049116, "language_loss": 0.84383023, "learning_rate": 3.139234245408538e-06, "loss": 0.86582351, "num_input_tokens_seen": 58790950, "step": 2727, "time_per_iteration": 2.6118881702423096 }, { "auxiliary_loss_clip": 0.01146211, "auxiliary_loss_mlp": 0.00712344, "balance_loss_clip": 1.05316341, "balance_loss_mlp": 1.00063586, "epoch": 0.32802260566344016, "flos": 23331414424320.0, "grad_norm": 1.3248658521369712, "language_loss": 0.76117468, "learning_rate": 3.1385939142923666e-06, "loss": 0.77976024, "num_input_tokens_seen": 58813340, "step": 2728, "time_per_iteration": 2.687922477722168 }, { "auxiliary_loss_clip": 0.01160986, "auxiliary_loss_mlp": 0.01035117, "balance_loss_clip": 1.05152678, "balance_loss_mlp": 1.02590251, "epoch": 0.3281428485540792, "flos": 24206988349440.0, "grad_norm": 2.2290651669461297, "language_loss": 0.78254449, "learning_rate": 3.137953410452405e-06, "loss": 0.80450559, "num_input_tokens_seen": 58833610, "step": 2729, "time_per_iteration": 2.6777493953704834 }, { "auxiliary_loss_clip": 0.0115941, "auxiliary_loss_mlp": 0.01034402, "balance_loss_clip": 1.0524298, "balance_loss_mlp": 1.02589607, "epoch": 0.3282630914447183, "flos": 34128962380800.0, "grad_norm": 1.921735818791526, "language_loss": 0.74553514, "learning_rate": 3.1373127339858146e-06, "loss": 0.76747334, "num_input_tokens_seen": 58856210, "step": 2730, "time_per_iteration": 2.766324996948242 }, { "auxiliary_loss_clip": 0.01139189, "auxiliary_loss_mlp": 0.01035545, "balance_loss_clip": 1.05048418, "balance_loss_mlp": 1.02725446, "epoch": 0.32838333433535744, "flos": 27600726170880.0, "grad_norm": 1.7905933177465279, "language_loss": 0.74722534, "learning_rate": 3.136671884989787e-06, "loss": 0.76897269, "num_input_tokens_seen": 58876120, "step": 2731, "time_per_iteration": 2.7464704513549805 }, { "auxiliary_loss_clip": 0.01116626, "auxiliary_loss_mlp": 0.01033646, "balance_loss_clip": 1.0505631, "balance_loss_mlp": 1.02416909, "epoch": 0.3285035772259965, "flos": 12349500935040.0, "grad_norm": 2.3760056299711745, "language_loss": 0.87402844, "learning_rate": 3.1360308635615383e-06, "loss": 0.89553118, "num_input_tokens_seen": 58894660, "step": 2732, "time_per_iteration": 2.686105728149414 }, { "auxiliary_loss_clip": 0.01169292, "auxiliary_loss_mlp": 0.0103113, "balance_loss_clip": 1.05472314, "balance_loss_mlp": 1.02155733, "epoch": 0.3286238201166356, "flos": 24316084932480.0, "grad_norm": 1.9512298952736218, "language_loss": 0.78721124, "learning_rate": 3.135389669798311e-06, "loss": 0.80921543, "num_input_tokens_seen": 58912720, "step": 2733, "time_per_iteration": 2.6725780963897705 }, { "auxiliary_loss_clip": 0.01176932, "auxiliary_loss_mlp": 0.00712348, "balance_loss_clip": 1.05462146, "balance_loss_mlp": 1.00063419, "epoch": 0.3287440630072747, "flos": 21392812471680.0, "grad_norm": 1.8002014402703668, "language_loss": 0.79937065, "learning_rate": 3.134748303797373e-06, "loss": 0.81826347, "num_input_tokens_seen": 58930090, "step": 2734, "time_per_iteration": 2.5994582176208496 }, { "auxiliary_loss_clip": 0.01127068, "auxiliary_loss_mlp": 0.01034712, "balance_loss_clip": 1.04899514, "balance_loss_mlp": 1.02497876, "epoch": 0.32886430589791377, "flos": 23732536579200.0, "grad_norm": 1.8714973591740398, "language_loss": 0.81422555, "learning_rate": 3.1341067656560203e-06, "loss": 0.83584344, "num_input_tokens_seen": 58947935, "step": 2735, "time_per_iteration": 2.7223174571990967 }, { "auxiliary_loss_clip": 0.01169272, "auxiliary_loss_mlp": 0.01029476, "balance_loss_clip": 1.05217159, "balance_loss_mlp": 1.02097607, "epoch": 0.3289845487885529, "flos": 22418708814720.0, "grad_norm": 1.8656839043797189, "language_loss": 0.86421955, "learning_rate": 3.133465055471572e-06, "loss": 0.88620698, "num_input_tokens_seen": 58967720, "step": 2736, "time_per_iteration": 2.62334942817688 }, { "auxiliary_loss_clip": 0.01137957, "auxiliary_loss_mlp": 0.01032621, "balance_loss_clip": 1.05149078, "balance_loss_mlp": 1.02366853, "epoch": 0.329104791679192, "flos": 19682603147520.0, "grad_norm": 2.293948778275564, "language_loss": 0.66045392, "learning_rate": 3.1328231733413767e-06, "loss": 0.68215966, "num_input_tokens_seen": 58984360, "step": 2737, "time_per_iteration": 2.6359407901763916 }, { "auxiliary_loss_clip": 0.01170317, "auxiliary_loss_mlp": 0.01027108, "balance_loss_clip": 1.05290914, "balance_loss_mlp": 1.01859045, "epoch": 0.32922503456983104, "flos": 15997234803840.0, "grad_norm": 2.2918850673870015, "language_loss": 0.90826738, "learning_rate": 3.1321811193628067e-06, "loss": 0.93024158, "num_input_tokens_seen": 59002505, "step": 2738, "time_per_iteration": 2.641779899597168 }, { "auxiliary_loss_clip": 0.01174698, "auxiliary_loss_mlp": 0.00713143, "balance_loss_clip": 1.05457342, "balance_loss_mlp": 1.00053394, "epoch": 0.32934527746047015, "flos": 26834069260800.0, "grad_norm": 2.230374342337599, "language_loss": 0.69793868, "learning_rate": 3.131538893633261e-06, "loss": 0.71681714, "num_input_tokens_seen": 59022065, "step": 2739, "time_per_iteration": 2.6866986751556396 }, { "auxiliary_loss_clip": 0.01195796, "auxiliary_loss_mlp": 0.01033974, "balance_loss_clip": 1.05895817, "balance_loss_mlp": 1.02493191, "epoch": 0.32946552035110926, "flos": 23403774372480.0, "grad_norm": 1.9627561175509154, "language_loss": 0.77929163, "learning_rate": 3.130896496250165e-06, "loss": 0.80158925, "num_input_tokens_seen": 59041890, "step": 2740, "time_per_iteration": 2.5751404762268066 }, { "auxiliary_loss_clip": 0.01191148, "auxiliary_loss_mlp": 0.01034026, "balance_loss_clip": 1.05369365, "balance_loss_mlp": 1.02535915, "epoch": 0.3295857632417483, "flos": 14172470029440.0, "grad_norm": 1.9067572298790476, "language_loss": 0.86878443, "learning_rate": 3.1302539273109693e-06, "loss": 0.89103615, "num_input_tokens_seen": 59058715, "step": 2741, "time_per_iteration": 2.538579225540161 }, { "auxiliary_loss_clip": 0.01154609, "auxiliary_loss_mlp": 0.01032385, "balance_loss_clip": 1.05343246, "balance_loss_mlp": 1.02308035, "epoch": 0.32970600613238743, "flos": 22196708807040.0, "grad_norm": 2.700008661041109, "language_loss": 0.80496365, "learning_rate": 3.1296111869131513e-06, "loss": 0.82683355, "num_input_tokens_seen": 59076140, "step": 2742, "time_per_iteration": 4.437559604644775 }, { "auxiliary_loss_clip": 0.01190966, "auxiliary_loss_mlp": 0.01029924, "balance_loss_clip": 1.05516934, "balance_loss_mlp": 1.02091742, "epoch": 0.32982624902302654, "flos": 22053784590720.0, "grad_norm": 1.8345637979290816, "language_loss": 0.85895431, "learning_rate": 3.1289682751542153e-06, "loss": 0.88116318, "num_input_tokens_seen": 59095700, "step": 2743, "time_per_iteration": 2.5991146564483643 }, { "auxiliary_loss_clip": 0.01172956, "auxiliary_loss_mlp": 0.0102706, "balance_loss_clip": 1.0529604, "balance_loss_mlp": 1.01855421, "epoch": 0.3299464919136656, "flos": 18661626967680.0, "grad_norm": 2.1709079759518253, "language_loss": 0.71117866, "learning_rate": 3.1283251921316883e-06, "loss": 0.73317885, "num_input_tokens_seen": 59113445, "step": 2744, "time_per_iteration": 2.6093528270721436 }, { "auxiliary_loss_clip": 0.01125983, "auxiliary_loss_mlp": 0.01028682, "balance_loss_clip": 1.0507381, "balance_loss_mlp": 1.01951492, "epoch": 0.3300667348043047, "flos": 13407357404160.0, "grad_norm": 3.1399562844464604, "language_loss": 0.81081414, "learning_rate": 3.1276819379431277e-06, "loss": 0.8323608, "num_input_tokens_seen": 59131535, "step": 2745, "time_per_iteration": 3.568270206451416 }, { "auxiliary_loss_clip": 0.01169443, "auxiliary_loss_mlp": 0.00713101, "balance_loss_clip": 1.05520058, "balance_loss_mlp": 1.00055647, "epoch": 0.33018697769494376, "flos": 15742556398080.0, "grad_norm": 2.0549086201946354, "language_loss": 0.75305176, "learning_rate": 3.1270385126861134e-06, "loss": 0.77187717, "num_input_tokens_seen": 59149520, "step": 2746, "time_per_iteration": 2.6256160736083984 }, { "auxiliary_loss_clip": 0.01193106, "auxiliary_loss_mlp": 0.01030281, "balance_loss_clip": 1.05550385, "balance_loss_mlp": 1.02140546, "epoch": 0.3303072205855829, "flos": 18258601392000.0, "grad_norm": 1.897141364135139, "language_loss": 0.8198415, "learning_rate": 3.1263949164582533e-06, "loss": 0.84207535, "num_input_tokens_seen": 59169170, "step": 2747, "time_per_iteration": 2.5696218013763428 }, { "auxiliary_loss_clip": 0.01188227, "auxiliary_loss_mlp": 0.0102701, "balance_loss_clip": 1.05079865, "balance_loss_mlp": 1.01811123, "epoch": 0.330427463476222, "flos": 17749424148480.0, "grad_norm": 1.9744012744349917, "language_loss": 0.78549546, "learning_rate": 3.1257511493571797e-06, "loss": 0.80764782, "num_input_tokens_seen": 59187675, "step": 2748, "time_per_iteration": 3.4716174602508545 }, { "auxiliary_loss_clip": 0.01143381, "auxiliary_loss_mlp": 0.0102697, "balance_loss_clip": 1.05166912, "balance_loss_mlp": 1.01858926, "epoch": 0.33054770636686104, "flos": 27162580072320.0, "grad_norm": 1.8908940239362975, "language_loss": 0.78683758, "learning_rate": 3.125107211480552e-06, "loss": 0.80854118, "num_input_tokens_seen": 59207610, "step": 2749, "time_per_iteration": 2.7861435413360596 }, { "auxiliary_loss_clip": 0.011086, "auxiliary_loss_mlp": 0.0102887, "balance_loss_clip": 1.04634643, "balance_loss_mlp": 1.02037024, "epoch": 0.33066794925750015, "flos": 20117193799680.0, "grad_norm": 1.5904328533729872, "language_loss": 0.79696715, "learning_rate": 3.124463102926054e-06, "loss": 0.81834185, "num_input_tokens_seen": 59226945, "step": 2750, "time_per_iteration": 2.7118921279907227 }, { "auxiliary_loss_clip": 0.01108517, "auxiliary_loss_mlp": 0.01009031, "balance_loss_clip": 1.04410934, "balance_loss_mlp": 1.00696886, "epoch": 0.33078819214813926, "flos": 70642609718400.0, "grad_norm": 0.7653339697769017, "language_loss": 0.61582077, "learning_rate": 3.1238188237913984e-06, "loss": 0.63699621, "num_input_tokens_seen": 59291485, "step": 2751, "time_per_iteration": 3.3095548152923584 }, { "auxiliary_loss_clip": 0.01198484, "auxiliary_loss_mlp": 0.01033067, "balance_loss_clip": 1.05831182, "balance_loss_mlp": 1.02334547, "epoch": 0.3309084350387783, "flos": 21141940907520.0, "grad_norm": 2.905807143262487, "language_loss": 0.76346642, "learning_rate": 3.1231743741743202e-06, "loss": 0.78578198, "num_input_tokens_seen": 59310990, "step": 2752, "time_per_iteration": 2.567779064178467 }, { "auxiliary_loss_clip": 0.01171234, "auxiliary_loss_mlp": 0.0103142, "balance_loss_clip": 1.05115807, "balance_loss_mlp": 1.02228236, "epoch": 0.3310286779294174, "flos": 14209350318720.0, "grad_norm": 2.6097506945928717, "language_loss": 0.83561975, "learning_rate": 3.122529754172582e-06, "loss": 0.85764629, "num_input_tokens_seen": 59327875, "step": 2753, "time_per_iteration": 2.591878890991211 }, { "auxiliary_loss_clip": 0.01176292, "auxiliary_loss_mlp": 0.01025471, "balance_loss_clip": 1.05586207, "balance_loss_mlp": 1.01648879, "epoch": 0.33114892082005654, "flos": 20778130005120.0, "grad_norm": 2.0814561007332064, "language_loss": 0.72672808, "learning_rate": 3.1218849638839736e-06, "loss": 0.74874574, "num_input_tokens_seen": 59347135, "step": 2754, "time_per_iteration": 2.6375670433044434 }, { "auxiliary_loss_clip": 0.01129194, "auxiliary_loss_mlp": 0.01032707, "balance_loss_clip": 1.04462075, "balance_loss_mlp": 1.02264619, "epoch": 0.3312691637106956, "flos": 17090750499840.0, "grad_norm": 7.026487529598584, "language_loss": 0.78671503, "learning_rate": 3.121240003406307e-06, "loss": 0.80833405, "num_input_tokens_seen": 59365985, "step": 2755, "time_per_iteration": 2.633578062057495 }, { "auxiliary_loss_clip": 0.01151011, "auxiliary_loss_mlp": 0.01033918, "balance_loss_clip": 1.05500507, "balance_loss_mlp": 1.0243212, "epoch": 0.3313894066013347, "flos": 29456230008960.0, "grad_norm": 1.9846108367168507, "language_loss": 0.72682214, "learning_rate": 3.120594872837425e-06, "loss": 0.74867141, "num_input_tokens_seen": 59384655, "step": 2756, "time_per_iteration": 2.747927188873291 }, { "auxiliary_loss_clip": 0.0109376, "auxiliary_loss_mlp": 0.00704408, "balance_loss_clip": 1.02957606, "balance_loss_mlp": 1.00043297, "epoch": 0.3315096494919738, "flos": 61419242280960.0, "grad_norm": 0.832057767257918, "language_loss": 0.62368774, "learning_rate": 3.1199495722751906e-06, "loss": 0.64166945, "num_input_tokens_seen": 59444185, "step": 2757, "time_per_iteration": 3.211254835128784 }, { "auxiliary_loss_clip": 0.01129319, "auxiliary_loss_mlp": 0.01031545, "balance_loss_clip": 1.04844582, "balance_loss_mlp": 1.02259803, "epoch": 0.33162989238261287, "flos": 21653057485440.0, "grad_norm": 1.7033831726068525, "language_loss": 0.84016556, "learning_rate": 3.1193041018174972e-06, "loss": 0.86177421, "num_input_tokens_seen": 59464900, "step": 2758, "time_per_iteration": 2.7208142280578613 }, { "auxiliary_loss_clip": 0.0118025, "auxiliary_loss_mlp": 0.0102746, "balance_loss_clip": 1.05622292, "balance_loss_mlp": 1.01858461, "epoch": 0.331750135273252, "flos": 22674787850880.0, "grad_norm": 2.496891061359442, "language_loss": 0.94638288, "learning_rate": 3.118658461562261e-06, "loss": 0.96846002, "num_input_tokens_seen": 59481000, "step": 2759, "time_per_iteration": 2.6148922443389893 }, { "auxiliary_loss_clip": 0.01159987, "auxiliary_loss_mlp": 0.01033269, "balance_loss_clip": 1.05578482, "balance_loss_mlp": 1.02454913, "epoch": 0.33187037816389103, "flos": 22746896403840.0, "grad_norm": 1.8088377350490104, "language_loss": 0.84938484, "learning_rate": 3.118012651607426e-06, "loss": 0.87131739, "num_input_tokens_seen": 59502605, "step": 2760, "time_per_iteration": 2.755305051803589 }, { "auxiliary_loss_clip": 0.01193966, "auxiliary_loss_mlp": 0.01034146, "balance_loss_clip": 1.05761361, "balance_loss_mlp": 1.02470469, "epoch": 0.33199062105453014, "flos": 19203769918080.0, "grad_norm": 2.252876051882947, "language_loss": 0.83371627, "learning_rate": 3.1173666720509603e-06, "loss": 0.85599732, "num_input_tokens_seen": 59519540, "step": 2761, "time_per_iteration": 2.548795461654663 }, { "auxiliary_loss_clip": 0.01163066, "auxiliary_loss_mlp": 0.01033765, "balance_loss_clip": 1.05204666, "balance_loss_mlp": 1.02453244, "epoch": 0.33211086394516925, "flos": 31577006764800.0, "grad_norm": 1.7770517853177923, "language_loss": 0.68143475, "learning_rate": 3.116720522990859e-06, "loss": 0.70340312, "num_input_tokens_seen": 59540415, "step": 2762, "time_per_iteration": 2.702495813369751 }, { "auxiliary_loss_clip": 0.01112599, "auxiliary_loss_mlp": 0.0103167, "balance_loss_clip": 1.04746974, "balance_loss_mlp": 1.02251434, "epoch": 0.3322311068358083, "flos": 17932496791680.0, "grad_norm": 2.272167445024398, "language_loss": 0.62245524, "learning_rate": 3.116074204525142e-06, "loss": 0.64389789, "num_input_tokens_seen": 59558590, "step": 2763, "time_per_iteration": 2.698265552520752 }, { "auxiliary_loss_clip": 0.01167237, "auxiliary_loss_mlp": 0.01034778, "balance_loss_clip": 1.05349815, "balance_loss_mlp": 1.02515817, "epoch": 0.3323513497264474, "flos": 32269831269120.0, "grad_norm": 1.5732471067197964, "language_loss": 0.83439493, "learning_rate": 3.1154277167518553e-06, "loss": 0.85641503, "num_input_tokens_seen": 59580205, "step": 2764, "time_per_iteration": 2.7138402462005615 }, { "auxiliary_loss_clip": 0.01074968, "auxiliary_loss_mlp": 0.0101183, "balance_loss_clip": 1.0249697, "balance_loss_mlp": 1.00994623, "epoch": 0.33247159261708653, "flos": 52668674588160.0, "grad_norm": 0.7772756945368426, "language_loss": 0.59499097, "learning_rate": 3.114781059769072e-06, "loss": 0.61585891, "num_input_tokens_seen": 59631530, "step": 2765, "time_per_iteration": 3.0902037620544434 }, { "auxiliary_loss_clip": 0.01162854, "auxiliary_loss_mlp": 0.01026815, "balance_loss_clip": 1.05506933, "balance_loss_mlp": 1.0181005, "epoch": 0.3325918355077256, "flos": 27125232906240.0, "grad_norm": 3.7679469780283728, "language_loss": 0.68163848, "learning_rate": 3.1141342336748874e-06, "loss": 0.7035352, "num_input_tokens_seen": 59651090, "step": 2766, "time_per_iteration": 2.6769471168518066 }, { "auxiliary_loss_clip": 0.01170235, "auxiliary_loss_mlp": 0.01030591, "balance_loss_clip": 1.05241942, "balance_loss_mlp": 1.0219543, "epoch": 0.3327120783983647, "flos": 23664414435840.0, "grad_norm": 1.5007820779871568, "language_loss": 0.81964481, "learning_rate": 3.1134872385674253e-06, "loss": 0.84165311, "num_input_tokens_seen": 59675245, "step": 2767, "time_per_iteration": 2.684967041015625 }, { "auxiliary_loss_clip": 0.01162524, "auxiliary_loss_mlp": 0.01029186, "balance_loss_clip": 1.04968953, "balance_loss_mlp": 1.02051377, "epoch": 0.3328323212890038, "flos": 19171378828800.0, "grad_norm": 1.736560132753415, "language_loss": 0.85358465, "learning_rate": 3.1128400745448353e-06, "loss": 0.87550175, "num_input_tokens_seen": 59694625, "step": 2768, "time_per_iteration": 3.5561888217926025 }, { "auxiliary_loss_clip": 0.01179721, "auxiliary_loss_mlp": 0.01040831, "balance_loss_clip": 1.0561415, "balance_loss_mlp": 1.03201008, "epoch": 0.33295256417964286, "flos": 37706347463040.0, "grad_norm": 2.943806670332198, "language_loss": 0.62563419, "learning_rate": 3.11219274170529e-06, "loss": 0.64783973, "num_input_tokens_seen": 59716435, "step": 2769, "time_per_iteration": 2.7472903728485107 }, { "auxiliary_loss_clip": 0.01152672, "auxiliary_loss_mlp": 0.01031977, "balance_loss_clip": 1.05177438, "balance_loss_mlp": 1.02319682, "epoch": 0.333072807070282, "flos": 26505989412480.0, "grad_norm": 1.893375081289593, "language_loss": 0.81840992, "learning_rate": 3.1115452401469903e-06, "loss": 0.84025639, "num_input_tokens_seen": 59736835, "step": 2770, "time_per_iteration": 2.673506021499634 }, { "auxiliary_loss_clip": 0.01120646, "auxiliary_loss_mlp": 0.01031393, "balance_loss_clip": 1.0475812, "balance_loss_mlp": 1.02273285, "epoch": 0.3331930499609211, "flos": 21430913823360.0, "grad_norm": 2.1532344463103166, "language_loss": 0.86559844, "learning_rate": 3.1108975699681613e-06, "loss": 0.88711888, "num_input_tokens_seen": 59754230, "step": 2771, "time_per_iteration": 3.5858545303344727 }, { "auxiliary_loss_clip": 0.01142053, "auxiliary_loss_mlp": 0.01028662, "balance_loss_clip": 1.05044675, "balance_loss_mlp": 1.0203948, "epoch": 0.33331329285156014, "flos": 20659947281280.0, "grad_norm": 2.0768473342332125, "language_loss": 0.71333444, "learning_rate": 3.1102497312670542e-06, "loss": 0.73504162, "num_input_tokens_seen": 59772235, "step": 2772, "time_per_iteration": 2.6837587356567383 }, { "auxiliary_loss_clip": 0.01150806, "auxiliary_loss_mlp": 0.01024359, "balance_loss_clip": 1.05237818, "balance_loss_mlp": 1.01572275, "epoch": 0.33343353574219925, "flos": 28001596930560.0, "grad_norm": 2.2696769630912486, "language_loss": 0.80573171, "learning_rate": 3.109601724141946e-06, "loss": 0.8274833, "num_input_tokens_seen": 59791230, "step": 2773, "time_per_iteration": 2.7097671031951904 }, { "auxiliary_loss_clip": 0.01154622, "auxiliary_loss_mlp": 0.01033006, "balance_loss_clip": 1.05113757, "balance_loss_mlp": 1.02445912, "epoch": 0.33355377863283836, "flos": 23764963582080.0, "grad_norm": 7.31144467982341, "language_loss": 0.68687004, "learning_rate": 3.108953548691138e-06, "loss": 0.70874631, "num_input_tokens_seen": 59811315, "step": 2774, "time_per_iteration": 3.594446897506714 }, { "auxiliary_loss_clip": 0.01193444, "auxiliary_loss_mlp": 0.01038553, "balance_loss_clip": 1.05612171, "balance_loss_mlp": 1.02965379, "epoch": 0.3336740215234774, "flos": 37779677078400.0, "grad_norm": 2.161136969989972, "language_loss": 0.7299698, "learning_rate": 3.108305205012959e-06, "loss": 0.75228977, "num_input_tokens_seen": 59832010, "step": 2775, "time_per_iteration": 2.7717349529266357 }, { "auxiliary_loss_clip": 0.01161999, "auxiliary_loss_mlp": 0.01027432, "balance_loss_clip": 1.05487788, "balance_loss_mlp": 1.01918292, "epoch": 0.3337942644141165, "flos": 25519056347520.0, "grad_norm": 2.017728254267232, "language_loss": 0.87609684, "learning_rate": 3.107656693205761e-06, "loss": 0.89799112, "num_input_tokens_seen": 59851450, "step": 2776, "time_per_iteration": 2.655661106109619 }, { "auxiliary_loss_clip": 0.01196472, "auxiliary_loss_mlp": 0.01034513, "balance_loss_clip": 1.05599296, "balance_loss_mlp": 1.02479184, "epoch": 0.3339145073047556, "flos": 25989844930560.0, "grad_norm": 2.4020752599992097, "language_loss": 0.70406651, "learning_rate": 3.107008013367924e-06, "loss": 0.72637635, "num_input_tokens_seen": 59870245, "step": 2777, "time_per_iteration": 2.5904088020324707 }, { "auxiliary_loss_clip": 0.01141084, "auxiliary_loss_mlp": 0.0103091, "balance_loss_clip": 1.04985631, "balance_loss_mlp": 1.02201152, "epoch": 0.3340347501953947, "flos": 19062569554560.0, "grad_norm": 2.9703134722339435, "language_loss": 0.87001085, "learning_rate": 3.1063591655978507e-06, "loss": 0.89173079, "num_input_tokens_seen": 59886195, "step": 2778, "time_per_iteration": 2.610067367553711 }, { "auxiliary_loss_clip": 0.01114361, "auxiliary_loss_mlp": 0.01032167, "balance_loss_clip": 1.04374051, "balance_loss_mlp": 1.02310658, "epoch": 0.3341549930860338, "flos": 18109715518080.0, "grad_norm": 2.3696069866158322, "language_loss": 0.79629785, "learning_rate": 3.105710149993972e-06, "loss": 0.81776321, "num_input_tokens_seen": 59905525, "step": 2779, "time_per_iteration": 2.7533206939697266 }, { "auxiliary_loss_clip": 0.01193687, "auxiliary_loss_mlp": 0.01033521, "balance_loss_clip": 1.05603361, "balance_loss_mlp": 1.02540886, "epoch": 0.33427523597667286, "flos": 22674967418880.0, "grad_norm": 1.728023902981011, "language_loss": 0.84903288, "learning_rate": 3.1050609666547427e-06, "loss": 0.87130487, "num_input_tokens_seen": 59925085, "step": 2780, "time_per_iteration": 2.5710484981536865 }, { "auxiliary_loss_clip": 0.01151881, "auxiliary_loss_mlp": 0.01033125, "balance_loss_clip": 1.05407023, "balance_loss_mlp": 1.0241127, "epoch": 0.33439547886731197, "flos": 22638338524800.0, "grad_norm": 1.7895469127595298, "language_loss": 0.77399343, "learning_rate": 3.104411615678644e-06, "loss": 0.79584354, "num_input_tokens_seen": 59943935, "step": 2781, "time_per_iteration": 2.7983908653259277 }, { "auxiliary_loss_clip": 0.01160627, "auxiliary_loss_mlp": 0.01038521, "balance_loss_clip": 1.05600262, "balance_loss_mlp": 1.02884758, "epoch": 0.3345157217579511, "flos": 24096383395200.0, "grad_norm": 2.7985676530958603, "language_loss": 0.73438179, "learning_rate": 3.1037620971641803e-06, "loss": 0.75637329, "num_input_tokens_seen": 59963725, "step": 2782, "time_per_iteration": 2.817903757095337 }, { "auxiliary_loss_clip": 0.01196655, "auxiliary_loss_mlp": 0.0103245, "balance_loss_clip": 1.05735266, "balance_loss_mlp": 1.02308607, "epoch": 0.33463596464859013, "flos": 18989491334400.0, "grad_norm": 5.993142195942359, "language_loss": 0.64827365, "learning_rate": 3.1031124112098844e-06, "loss": 0.67056477, "num_input_tokens_seen": 59981935, "step": 2783, "time_per_iteration": 2.6213648319244385 }, { "auxiliary_loss_clip": 0.01162642, "auxiliary_loss_mlp": 0.01036598, "balance_loss_clip": 1.05430341, "balance_loss_mlp": 1.0277884, "epoch": 0.33475620753922924, "flos": 20375607219840.0, "grad_norm": 1.9700691725597899, "language_loss": 0.72296035, "learning_rate": 3.1024625579143127e-06, "loss": 0.74495268, "num_input_tokens_seen": 59999455, "step": 2784, "time_per_iteration": 2.587024450302124 }, { "auxiliary_loss_clip": 0.01193098, "auxiliary_loss_mlp": 0.0104045, "balance_loss_clip": 1.05650353, "balance_loss_mlp": 1.0311811, "epoch": 0.33487645042986836, "flos": 18182578256640.0, "grad_norm": 1.8726382154663925, "language_loss": 0.73172802, "learning_rate": 3.101812537376048e-06, "loss": 0.75406349, "num_input_tokens_seen": 60018475, "step": 2785, "time_per_iteration": 2.6349823474884033 }, { "auxiliary_loss_clip": 0.01148962, "auxiliary_loss_mlp": 0.00713344, "balance_loss_clip": 1.04983163, "balance_loss_mlp": 1.00067222, "epoch": 0.3349966933205074, "flos": 25848824135040.0, "grad_norm": 2.1756852656217975, "language_loss": 0.84322172, "learning_rate": 3.1011623496936973e-06, "loss": 0.86184478, "num_input_tokens_seen": 60036770, "step": 2786, "time_per_iteration": 2.6740779876708984 }, { "auxiliary_loss_clip": 0.011942, "auxiliary_loss_mlp": 0.01032583, "balance_loss_clip": 1.05971766, "balance_loss_mlp": 1.0236125, "epoch": 0.3351169362111465, "flos": 28111447699200.0, "grad_norm": 1.8010133969379922, "language_loss": 0.69844347, "learning_rate": 3.100511994965893e-06, "loss": 0.72071135, "num_input_tokens_seen": 60056725, "step": 2787, "time_per_iteration": 2.6217174530029297 }, { "auxiliary_loss_clip": 0.01172408, "auxiliary_loss_mlp": 0.01029286, "balance_loss_clip": 1.05525637, "balance_loss_mlp": 1.02057171, "epoch": 0.33523717910178563, "flos": 22673315393280.0, "grad_norm": 1.8827551007962802, "language_loss": 0.844531, "learning_rate": 3.0998614732912947e-06, "loss": 0.866548, "num_input_tokens_seen": 60076100, "step": 2788, "time_per_iteration": 2.5934436321258545 }, { "auxiliary_loss_clip": 0.01177154, "auxiliary_loss_mlp": 0.01029236, "balance_loss_clip": 1.05744863, "balance_loss_mlp": 1.01983631, "epoch": 0.3353574219924247, "flos": 15669801400320.0, "grad_norm": 2.0651724546073074, "language_loss": 0.68138099, "learning_rate": 3.0992107847685855e-06, "loss": 0.7034449, "num_input_tokens_seen": 60093815, "step": 2789, "time_per_iteration": 2.6725776195526123 }, { "auxiliary_loss_clip": 0.01162899, "auxiliary_loss_mlp": 0.01042196, "balance_loss_clip": 1.05740547, "balance_loss_mlp": 1.03231359, "epoch": 0.3354776648830638, "flos": 24790644443520.0, "grad_norm": 1.997522102382942, "language_loss": 0.79022211, "learning_rate": 3.0985599294964736e-06, "loss": 0.81227303, "num_input_tokens_seen": 60113370, "step": 2790, "time_per_iteration": 2.701082706451416 }, { "auxiliary_loss_clip": 0.0115802, "auxiliary_loss_mlp": 0.01034638, "balance_loss_clip": 1.0547843, "balance_loss_mlp": 1.02527976, "epoch": 0.33559790777370285, "flos": 28694852398080.0, "grad_norm": 1.8211543678951752, "language_loss": 0.70174313, "learning_rate": 3.097908907573695e-06, "loss": 0.72366971, "num_input_tokens_seen": 60131350, "step": 2791, "time_per_iteration": 2.741532564163208 }, { "auxiliary_loss_clip": 0.01112807, "auxiliary_loss_mlp": 0.01029673, "balance_loss_clip": 1.04930568, "balance_loss_mlp": 1.02036285, "epoch": 0.33571815066434196, "flos": 22235779825920.0, "grad_norm": 2.058371101798733, "language_loss": 0.89531422, "learning_rate": 3.0972577190990067e-06, "loss": 0.91673905, "num_input_tokens_seen": 60149830, "step": 2792, "time_per_iteration": 2.740934133529663 }, { "auxiliary_loss_clip": 0.01148897, "auxiliary_loss_mlp": 0.01033643, "balance_loss_clip": 1.05247152, "balance_loss_mlp": 1.02443373, "epoch": 0.3358383935549811, "flos": 23842279607040.0, "grad_norm": 1.9611236053568943, "language_loss": 0.79992807, "learning_rate": 3.096606364171196e-06, "loss": 0.82175344, "num_input_tokens_seen": 60169620, "step": 2793, "time_per_iteration": 2.6892619132995605 }, { "auxiliary_loss_clip": 0.01129027, "auxiliary_loss_mlp": 0.01031193, "balance_loss_clip": 1.05018806, "balance_loss_mlp": 1.02232409, "epoch": 0.33595863644562013, "flos": 22267308988800.0, "grad_norm": 1.9058832486062192, "language_loss": 0.85023248, "learning_rate": 3.0959548428890703e-06, "loss": 0.8718347, "num_input_tokens_seen": 60188490, "step": 2794, "time_per_iteration": 3.6687698364257812 }, { "auxiliary_loss_clip": 0.0117658, "auxiliary_loss_mlp": 0.01027902, "balance_loss_clip": 1.05852497, "balance_loss_mlp": 1.01939106, "epoch": 0.33607887933625924, "flos": 20119779578880.0, "grad_norm": 2.27043001198003, "language_loss": 0.84022725, "learning_rate": 3.095303155351468e-06, "loss": 0.86227202, "num_input_tokens_seen": 60208695, "step": 2795, "time_per_iteration": 3.495406150817871 }, { "auxiliary_loss_clip": 0.01118367, "auxiliary_loss_mlp": 0.01029546, "balance_loss_clip": 1.04985046, "balance_loss_mlp": 1.02081382, "epoch": 0.33619912222689835, "flos": 19318109886720.0, "grad_norm": 2.3252500583222284, "language_loss": 0.79119116, "learning_rate": 3.0946513016572464e-06, "loss": 0.81267023, "num_input_tokens_seen": 60227600, "step": 2796, "time_per_iteration": 2.6679468154907227 }, { "auxiliary_loss_clip": 0.01177457, "auxiliary_loss_mlp": 0.01033239, "balance_loss_clip": 1.05427182, "balance_loss_mlp": 1.02402425, "epoch": 0.3363193651175374, "flos": 16800664262400.0, "grad_norm": 1.8604269001484586, "language_loss": 0.77367783, "learning_rate": 3.0939992819052938e-06, "loss": 0.79578483, "num_input_tokens_seen": 60245110, "step": 2797, "time_per_iteration": 3.3679702281951904 }, { "auxiliary_loss_clip": 0.01162045, "auxiliary_loss_mlp": 0.01033801, "balance_loss_clip": 1.05500841, "balance_loss_mlp": 1.02528906, "epoch": 0.3364396080081765, "flos": 23550289948800.0, "grad_norm": 1.9819031285937143, "language_loss": 0.80918771, "learning_rate": 3.0933470961945193e-06, "loss": 0.83114612, "num_input_tokens_seen": 60263405, "step": 2798, "time_per_iteration": 2.6402294635772705 }, { "auxiliary_loss_clip": 0.01158183, "auxiliary_loss_mlp": 0.01029326, "balance_loss_clip": 1.05583358, "balance_loss_mlp": 1.02086854, "epoch": 0.3365598508988156, "flos": 28037902602240.0, "grad_norm": 2.161536842314242, "language_loss": 0.68510389, "learning_rate": 3.0926947446238597e-06, "loss": 0.70697892, "num_input_tokens_seen": 60282975, "step": 2799, "time_per_iteration": 2.698850631713867 }, { "auxiliary_loss_clip": 0.01180738, "auxiliary_loss_mlp": 0.0103402, "balance_loss_clip": 1.0528059, "balance_loss_mlp": 1.0244596, "epoch": 0.3366800937894547, "flos": 16982767238400.0, "grad_norm": 2.13402621156649, "language_loss": 0.82796419, "learning_rate": 3.092042227292276e-06, "loss": 0.85011172, "num_input_tokens_seen": 60299810, "step": 2800, "time_per_iteration": 3.5242059230804443 }, { "auxiliary_loss_clip": 0.01190777, "auxiliary_loss_mlp": 0.01029721, "balance_loss_clip": 1.05746651, "balance_loss_mlp": 1.02123332, "epoch": 0.3368003366800938, "flos": 23915321913600.0, "grad_norm": 1.6134212761801745, "language_loss": 0.88111484, "learning_rate": 3.0913895442987557e-06, "loss": 0.90331978, "num_input_tokens_seen": 60320775, "step": 2801, "time_per_iteration": 2.562303304672241 }, { "auxiliary_loss_clip": 0.01144715, "auxiliary_loss_mlp": 0.00713463, "balance_loss_clip": 1.05401802, "balance_loss_mlp": 1.0007658, "epoch": 0.3369205795707329, "flos": 24791219061120.0, "grad_norm": 5.821260261146965, "language_loss": 0.85749525, "learning_rate": 3.090736695742308e-06, "loss": 0.87607706, "num_input_tokens_seen": 60341905, "step": 2802, "time_per_iteration": 2.756779193878174 }, { "auxiliary_loss_clip": 0.01122021, "auxiliary_loss_mlp": 0.01029816, "balance_loss_clip": 1.04950249, "balance_loss_mlp": 1.02082133, "epoch": 0.33704082246137196, "flos": 17931096161280.0, "grad_norm": 2.3959309519446035, "language_loss": 0.52257609, "learning_rate": 3.0900836817219713e-06, "loss": 0.54409444, "num_input_tokens_seen": 60358335, "step": 2803, "time_per_iteration": 2.670344352722168 }, { "auxiliary_loss_clip": 0.0119219, "auxiliary_loss_mlp": 0.01031153, "balance_loss_clip": 1.05673814, "balance_loss_mlp": 1.0219028, "epoch": 0.33716106535201107, "flos": 21286517149440.0, "grad_norm": 1.985520500487879, "language_loss": 0.8362264, "learning_rate": 3.089430502336807e-06, "loss": 0.85845977, "num_input_tokens_seen": 60378305, "step": 2804, "time_per_iteration": 2.595470666885376 }, { "auxiliary_loss_clip": 0.0118229, "auxiliary_loss_mlp": 0.01030228, "balance_loss_clip": 1.05734646, "balance_loss_mlp": 1.02061987, "epoch": 0.3372813082426502, "flos": 18402962152320.0, "grad_norm": 3.0528568932059557, "language_loss": 0.90248919, "learning_rate": 3.088777157685902e-06, "loss": 0.92461437, "num_input_tokens_seen": 60393895, "step": 2805, "time_per_iteration": 2.5852348804473877 }, { "auxiliary_loss_clip": 0.01157253, "auxiliary_loss_mlp": 0.01027656, "balance_loss_clip": 1.05414295, "balance_loss_mlp": 1.01932955, "epoch": 0.33740155113328923, "flos": 17201391367680.0, "grad_norm": 2.1378278229122154, "language_loss": 0.85718191, "learning_rate": 3.088123647868367e-06, "loss": 0.87903094, "num_input_tokens_seen": 60410445, "step": 2806, "time_per_iteration": 2.688530921936035 }, { "auxiliary_loss_clip": 0.01176853, "auxiliary_loss_mlp": 0.01036568, "balance_loss_clip": 1.05233121, "balance_loss_mlp": 1.02793074, "epoch": 0.33752179402392835, "flos": 29058950609280.0, "grad_norm": 1.822506859975697, "language_loss": 0.81039679, "learning_rate": 3.0874699729833405e-06, "loss": 0.83253098, "num_input_tokens_seen": 60431815, "step": 2807, "time_per_iteration": 2.6529245376586914 }, { "auxiliary_loss_clip": 0.01156271, "auxiliary_loss_mlp": 0.01030309, "balance_loss_clip": 1.05385399, "balance_loss_mlp": 1.02135026, "epoch": 0.3376420369145674, "flos": 25080730680960.0, "grad_norm": 1.6441668523351551, "language_loss": 0.80165708, "learning_rate": 3.086816133129983e-06, "loss": 0.82352293, "num_input_tokens_seen": 60452075, "step": 2808, "time_per_iteration": 2.6991665363311768 }, { "auxiliary_loss_clip": 0.0119477, "auxiliary_loss_mlp": 0.01036498, "balance_loss_clip": 1.05944347, "balance_loss_mlp": 1.02779579, "epoch": 0.3377622798052065, "flos": 27490624007040.0, "grad_norm": 1.8697891479016648, "language_loss": 0.76132059, "learning_rate": 3.0861621284074826e-06, "loss": 0.78363323, "num_input_tokens_seen": 60472600, "step": 2809, "time_per_iteration": 2.612515449523926 }, { "auxiliary_loss_clip": 0.01169842, "auxiliary_loss_mlp": 0.01031185, "balance_loss_clip": 1.05701709, "balance_loss_mlp": 1.02209532, "epoch": 0.3378825226958456, "flos": 21975211589760.0, "grad_norm": 1.5047118931724857, "language_loss": 0.73049122, "learning_rate": 3.085507958915051e-06, "loss": 0.75250149, "num_input_tokens_seen": 60491030, "step": 2810, "time_per_iteration": 2.669374465942383 }, { "auxiliary_loss_clip": 0.01159827, "auxiliary_loss_mlp": 0.01030328, "balance_loss_clip": 1.05567837, "balance_loss_mlp": 1.02053571, "epoch": 0.3380027655864847, "flos": 42523189200000.0, "grad_norm": 7.560979543196061, "language_loss": 0.71142626, "learning_rate": 3.084853624751925e-06, "loss": 0.73332775, "num_input_tokens_seen": 60512615, "step": 2811, "time_per_iteration": 2.815229654312134 }, { "auxiliary_loss_clip": 0.01145049, "auxiliary_loss_mlp": 0.01037395, "balance_loss_clip": 1.05252039, "balance_loss_mlp": 1.02788234, "epoch": 0.3381230084771238, "flos": 26725080418560.0, "grad_norm": 2.200856227976782, "language_loss": 0.85758805, "learning_rate": 3.0841991260173668e-06, "loss": 0.87941247, "num_input_tokens_seen": 60532520, "step": 2812, "time_per_iteration": 2.6892147064208984 }, { "auxiliary_loss_clip": 0.01194528, "auxiliary_loss_mlp": 0.01036391, "balance_loss_clip": 1.05784106, "balance_loss_mlp": 1.02712226, "epoch": 0.3382432513677629, "flos": 22710375250560.0, "grad_norm": 2.0874883010814114, "language_loss": 0.80532819, "learning_rate": 3.0835444628106634e-06, "loss": 0.82763737, "num_input_tokens_seen": 60551500, "step": 2813, "time_per_iteration": 2.5974152088165283 }, { "auxiliary_loss_clip": 0.01195183, "auxiliary_loss_mlp": 0.00713246, "balance_loss_clip": 1.05886984, "balance_loss_mlp": 1.00060868, "epoch": 0.33836349425840195, "flos": 22122409524480.0, "grad_norm": 1.7101275808348806, "language_loss": 0.83018029, "learning_rate": 3.082889635231126e-06, "loss": 0.84926456, "num_input_tokens_seen": 60570160, "step": 2814, "time_per_iteration": 2.5730268955230713 }, { "auxiliary_loss_clip": 0.01163018, "auxiliary_loss_mlp": 0.01030679, "balance_loss_clip": 1.05259764, "balance_loss_mlp": 1.02067125, "epoch": 0.33848373714904106, "flos": 27308090067840.0, "grad_norm": 2.322493660486229, "language_loss": 0.77003276, "learning_rate": 3.0822346433780925e-06, "loss": 0.79196978, "num_input_tokens_seen": 60590885, "step": 2815, "time_per_iteration": 2.701610803604126 }, { "auxiliary_loss_clip": 0.01177006, "auxiliary_loss_mlp": 0.0103533, "balance_loss_clip": 1.05285954, "balance_loss_mlp": 1.02591288, "epoch": 0.3386039800396802, "flos": 25848716394240.0, "grad_norm": 2.1726823648584213, "language_loss": 0.87299716, "learning_rate": 3.0815794873509237e-06, "loss": 0.8951205, "num_input_tokens_seen": 60609170, "step": 2816, "time_per_iteration": 2.6366310119628906 }, { "auxiliary_loss_clip": 0.0119203, "auxiliary_loss_mlp": 0.01028298, "balance_loss_clip": 1.05637765, "balance_loss_mlp": 1.01924431, "epoch": 0.33872422293031923, "flos": 18880646146560.0, "grad_norm": 2.4130757398017546, "language_loss": 0.73275363, "learning_rate": 3.0809241672490066e-06, "loss": 0.75495684, "num_input_tokens_seen": 60627340, "step": 2817, "time_per_iteration": 2.5878243446350098 }, { "auxiliary_loss_clip": 0.01162371, "auxiliary_loss_mlp": 0.01030196, "balance_loss_clip": 1.05483174, "balance_loss_mlp": 1.02169085, "epoch": 0.33884446582095834, "flos": 23146977064320.0, "grad_norm": 1.7420726289732908, "language_loss": 0.85036659, "learning_rate": 3.080268683171753e-06, "loss": 0.87229228, "num_input_tokens_seen": 60647630, "step": 2818, "time_per_iteration": 2.6754424571990967 }, { "auxiliary_loss_clip": 0.01175756, "auxiliary_loss_mlp": 0.01037573, "balance_loss_clip": 1.05358756, "balance_loss_mlp": 1.02930641, "epoch": 0.33896470871159745, "flos": 15997342544640.0, "grad_norm": 2.0610587354892758, "language_loss": 0.89607227, "learning_rate": 3.0796130352185985e-06, "loss": 0.91820562, "num_input_tokens_seen": 60664485, "step": 2819, "time_per_iteration": 2.6036884784698486 }, { "auxiliary_loss_clip": 0.01147167, "auxiliary_loss_mlp": 0.00713702, "balance_loss_clip": 1.04874611, "balance_loss_mlp": 1.00057781, "epoch": 0.3390849516022365, "flos": 34495754112000.0, "grad_norm": 3.0630867416256318, "language_loss": 0.66597503, "learning_rate": 3.0789572234890057e-06, "loss": 0.68458372, "num_input_tokens_seen": 60686125, "step": 2820, "time_per_iteration": 4.56087589263916 }, { "auxiliary_loss_clip": 0.01161307, "auxiliary_loss_mlp": 0.0103712, "balance_loss_clip": 1.0556252, "balance_loss_mlp": 1.02732646, "epoch": 0.3392051944928756, "flos": 16180307447040.0, "grad_norm": 2.0307623193963944, "language_loss": 0.77491099, "learning_rate": 3.0783012480824596e-06, "loss": 0.79689527, "num_input_tokens_seen": 60705270, "step": 2821, "time_per_iteration": 2.6035146713256836 }, { "auxiliary_loss_clip": 0.01194858, "auxiliary_loss_mlp": 0.01028788, "balance_loss_clip": 1.057863, "balance_loss_mlp": 1.01943016, "epoch": 0.33932543738351467, "flos": 17086656349440.0, "grad_norm": 2.2169660296537415, "language_loss": 0.74475729, "learning_rate": 3.077645109098471e-06, "loss": 0.76699376, "num_input_tokens_seen": 60721540, "step": 2822, "time_per_iteration": 2.5785200595855713 }, { "auxiliary_loss_clip": 0.01133512, "auxiliary_loss_mlp": 0.01031909, "balance_loss_clip": 1.05127358, "balance_loss_mlp": 1.02318883, "epoch": 0.3394456802741538, "flos": 22126970551680.0, "grad_norm": 1.8289651831521454, "language_loss": 0.72061265, "learning_rate": 3.076988806636577e-06, "loss": 0.74226683, "num_input_tokens_seen": 60739300, "step": 2823, "time_per_iteration": 3.541961669921875 }, { "auxiliary_loss_clip": 0.0116349, "auxiliary_loss_mlp": 0.00714478, "balance_loss_clip": 1.05591476, "balance_loss_mlp": 1.00059891, "epoch": 0.3395659231647929, "flos": 25226887121280.0, "grad_norm": 1.9067037796080681, "language_loss": 0.88748276, "learning_rate": 3.0763323407963377e-06, "loss": 0.9062624, "num_input_tokens_seen": 60758910, "step": 2824, "time_per_iteration": 2.6909310817718506 }, { "auxiliary_loss_clip": 0.01175424, "auxiliary_loss_mlp": 0.0103132, "balance_loss_clip": 1.05419934, "balance_loss_mlp": 1.02267694, "epoch": 0.33968616605543195, "flos": 29096477343360.0, "grad_norm": 2.337247300914237, "language_loss": 0.80067623, "learning_rate": 3.075675711677337e-06, "loss": 0.82274365, "num_input_tokens_seen": 60779005, "step": 2825, "time_per_iteration": 3.6178457736968994 }, { "auxiliary_loss_clip": 0.01156984, "auxiliary_loss_mlp": 0.01026184, "balance_loss_clip": 1.05541384, "balance_loss_mlp": 1.01772046, "epoch": 0.33980640894607106, "flos": 21433966479360.0, "grad_norm": 2.424688224174429, "language_loss": 0.77790201, "learning_rate": 3.0750189193791865e-06, "loss": 0.7997337, "num_input_tokens_seen": 60798590, "step": 2826, "time_per_iteration": 2.6022427082061768 }, { "auxiliary_loss_clip": 0.01174331, "auxiliary_loss_mlp": 0.0102485, "balance_loss_clip": 1.05487335, "balance_loss_mlp": 1.01566505, "epoch": 0.33992665183671017, "flos": 32490035596800.0, "grad_norm": 2.862848934596538, "language_loss": 0.70374179, "learning_rate": 3.0743619640015203e-06, "loss": 0.72573352, "num_input_tokens_seen": 60818840, "step": 2827, "time_per_iteration": 2.711721181869507 }, { "auxiliary_loss_clip": 0.01164915, "auxiliary_loss_mlp": 0.01035471, "balance_loss_clip": 1.05239451, "balance_loss_mlp": 1.02619076, "epoch": 0.3400468947273492, "flos": 17055414495360.0, "grad_norm": 2.00004423403662, "language_loss": 0.92590755, "learning_rate": 3.073704845643999e-06, "loss": 0.94791144, "num_input_tokens_seen": 60835965, "step": 2828, "time_per_iteration": 2.607398509979248 }, { "auxiliary_loss_clip": 0.01178808, "auxiliary_loss_mlp": 0.01033196, "balance_loss_clip": 1.05291951, "balance_loss_mlp": 1.02342653, "epoch": 0.34016713761798834, "flos": 16872988296960.0, "grad_norm": 3.006431285033957, "language_loss": 0.77322173, "learning_rate": 3.0730475644063063e-06, "loss": 0.79534185, "num_input_tokens_seen": 60851065, "step": 2829, "time_per_iteration": 2.6401596069335938 }, { "auxiliary_loss_clip": 0.01153407, "auxiliary_loss_mlp": 0.00712872, "balance_loss_clip": 1.05110347, "balance_loss_mlp": 1.00056434, "epoch": 0.34028738050862745, "flos": 21907161273600.0, "grad_norm": 2.2335781202887635, "language_loss": 0.64729536, "learning_rate": 3.072390120388151e-06, "loss": 0.66595817, "num_input_tokens_seen": 60869390, "step": 2830, "time_per_iteration": 2.6563398838043213 }, { "auxiliary_loss_clip": 0.01176827, "auxiliary_loss_mlp": 0.0102811, "balance_loss_clip": 1.05541849, "balance_loss_mlp": 1.0183531, "epoch": 0.3404076233992665, "flos": 22746034477440.0, "grad_norm": 2.0293507827414476, "language_loss": 0.71290958, "learning_rate": 3.071732513689267e-06, "loss": 0.73495901, "num_input_tokens_seen": 60887925, "step": 2831, "time_per_iteration": 2.617974042892456 }, { "auxiliary_loss_clip": 0.01181092, "auxiliary_loss_mlp": 0.01038129, "balance_loss_clip": 1.05908549, "balance_loss_mlp": 1.02848458, "epoch": 0.3405278662899056, "flos": 17052361839360.0, "grad_norm": 2.3411760824838446, "language_loss": 0.6721617, "learning_rate": 3.0710747444094134e-06, "loss": 0.69435382, "num_input_tokens_seen": 60905955, "step": 2832, "time_per_iteration": 2.609628915786743 }, { "auxiliary_loss_clip": 0.01162406, "auxiliary_loss_mlp": 0.01031331, "balance_loss_clip": 1.05397189, "balance_loss_mlp": 1.02195525, "epoch": 0.3406481091805447, "flos": 42813131783040.0, "grad_norm": 2.6042969815280443, "language_loss": 0.65552127, "learning_rate": 3.070416812648372e-06, "loss": 0.67745864, "num_input_tokens_seen": 60929405, "step": 2833, "time_per_iteration": 2.8259403705596924 }, { "auxiliary_loss_clip": 0.01141266, "auxiliary_loss_mlp": 0.01031634, "balance_loss_clip": 1.04895365, "balance_loss_mlp": 1.02322936, "epoch": 0.3407683520711838, "flos": 26761457917440.0, "grad_norm": 2.0409100191848446, "language_loss": 0.65339768, "learning_rate": 3.069758718505951e-06, "loss": 0.67512673, "num_input_tokens_seen": 60951145, "step": 2834, "time_per_iteration": 2.6897737979888916 }, { "auxiliary_loss_clip": 0.01192812, "auxiliary_loss_mlp": 0.01031482, "balance_loss_clip": 1.05802441, "balance_loss_mlp": 1.02251792, "epoch": 0.3408885949618229, "flos": 28767643309440.0, "grad_norm": 1.73968131672346, "language_loss": 0.79962605, "learning_rate": 3.0691004620819836e-06, "loss": 0.82186902, "num_input_tokens_seen": 60971275, "step": 2835, "time_per_iteration": 2.639873743057251 }, { "auxiliary_loss_clip": 0.01048781, "auxiliary_loss_mlp": 0.01005649, "balance_loss_clip": 1.02694392, "balance_loss_mlp": 1.00341976, "epoch": 0.341008837852462, "flos": 63576252881280.0, "grad_norm": 0.820195728354895, "language_loss": 0.60274059, "learning_rate": 3.0684420434763254e-06, "loss": 0.62328488, "num_input_tokens_seen": 61037460, "step": 2836, "time_per_iteration": 3.3693697452545166 }, { "auxiliary_loss_clip": 0.01137712, "auxiliary_loss_mlp": 0.01030817, "balance_loss_clip": 1.05337358, "balance_loss_mlp": 1.02210236, "epoch": 0.34112908074310105, "flos": 20812173120000.0, "grad_norm": 1.7709932212031945, "language_loss": 0.76846159, "learning_rate": 3.06778346278886e-06, "loss": 0.79014689, "num_input_tokens_seen": 61056295, "step": 2837, "time_per_iteration": 2.768252372741699 }, { "auxiliary_loss_clip": 0.01194017, "auxiliary_loss_mlp": 0.01033015, "balance_loss_clip": 1.05792475, "balance_loss_mlp": 1.02337742, "epoch": 0.34124932363374016, "flos": 24976446520320.0, "grad_norm": 1.9977858517837281, "language_loss": 0.79353082, "learning_rate": 3.0671247201194906e-06, "loss": 0.81580114, "num_input_tokens_seen": 61078430, "step": 2838, "time_per_iteration": 2.6500444412231445 }, { "auxiliary_loss_clip": 0.01147207, "auxiliary_loss_mlp": 0.01036776, "balance_loss_clip": 1.05246687, "balance_loss_mlp": 1.02732229, "epoch": 0.3413695665243792, "flos": 28402970480640.0, "grad_norm": 1.8136142166988163, "language_loss": 0.75323009, "learning_rate": 3.066465815568151e-06, "loss": 0.77506983, "num_input_tokens_seen": 61099260, "step": 2839, "time_per_iteration": 2.755519151687622 }, { "auxiliary_loss_clip": 0.01179671, "auxiliary_loss_mlp": 0.01032188, "balance_loss_clip": 1.05547094, "balance_loss_mlp": 1.02306902, "epoch": 0.34148980941501833, "flos": 25302012416640.0, "grad_norm": 1.619214800634247, "language_loss": 0.68678784, "learning_rate": 3.0658067492347947e-06, "loss": 0.70890641, "num_input_tokens_seen": 61121900, "step": 2840, "time_per_iteration": 2.7478203773498535 }, { "auxiliary_loss_clip": 0.01088809, "auxiliary_loss_mlp": 0.01028954, "balance_loss_clip": 1.0459199, "balance_loss_mlp": 1.02017403, "epoch": 0.34161005230565744, "flos": 17530081747200.0, "grad_norm": 1.9449940711649392, "language_loss": 0.66219461, "learning_rate": 3.065147521219402e-06, "loss": 0.6833722, "num_input_tokens_seen": 61141155, "step": 2841, "time_per_iteration": 2.752239942550659 }, { "auxiliary_loss_clip": 0.01149612, "auxiliary_loss_mlp": 0.0103225, "balance_loss_clip": 1.05227244, "balance_loss_mlp": 1.02363682, "epoch": 0.3417302951962965, "flos": 43650101566080.0, "grad_norm": 1.5428141263994044, "language_loss": 0.74332798, "learning_rate": 3.064488131621977e-06, "loss": 0.76514661, "num_input_tokens_seen": 61164480, "step": 2842, "time_per_iteration": 2.8792977333068848 }, { "auxiliary_loss_clip": 0.0116891, "auxiliary_loss_mlp": 0.01033509, "balance_loss_clip": 1.05242157, "balance_loss_mlp": 1.02436602, "epoch": 0.3418505380869356, "flos": 30882207012480.0, "grad_norm": 1.7692214019939363, "language_loss": 0.73906046, "learning_rate": 3.063828580542549e-06, "loss": 0.76108468, "num_input_tokens_seen": 61185675, "step": 2843, "time_per_iteration": 2.6866772174835205 }, { "auxiliary_loss_clip": 0.0116131, "auxiliary_loss_mlp": 0.01028248, "balance_loss_clip": 1.0540899, "balance_loss_mlp": 1.02007031, "epoch": 0.3419707809775747, "flos": 19463871277440.0, "grad_norm": 1.9433747859558164, "language_loss": 0.73118287, "learning_rate": 3.0631688680811706e-06, "loss": 0.75307846, "num_input_tokens_seen": 61205300, "step": 2844, "time_per_iteration": 2.6613783836364746 }, { "auxiliary_loss_clip": 0.01192258, "auxiliary_loss_mlp": 0.01037311, "balance_loss_clip": 1.05586362, "balance_loss_mlp": 1.02848363, "epoch": 0.3420910238682138, "flos": 28727818104960.0, "grad_norm": 2.0193451135077223, "language_loss": 0.75692707, "learning_rate": 3.062508994337921e-06, "loss": 0.77922273, "num_input_tokens_seen": 61224905, "step": 2845, "time_per_iteration": 2.623955488204956 }, { "auxiliary_loss_clip": 0.01174285, "auxiliary_loss_mlp": 0.01030392, "balance_loss_clip": 1.05301523, "balance_loss_mlp": 1.02106965, "epoch": 0.3422112667588529, "flos": 21397265758080.0, "grad_norm": 2.0817306801961712, "language_loss": 0.79008734, "learning_rate": 3.0618489594129013e-06, "loss": 0.81213415, "num_input_tokens_seen": 61243045, "step": 2846, "time_per_iteration": 4.5037713050842285 }, { "auxiliary_loss_clip": 0.01151447, "auxiliary_loss_mlp": 0.01031666, "balance_loss_clip": 1.05472326, "balance_loss_mlp": 1.02233148, "epoch": 0.342331509649492, "flos": 13881450038400.0, "grad_norm": 2.2834388507041408, "language_loss": 0.71291471, "learning_rate": 3.061188763406239e-06, "loss": 0.7347458, "num_input_tokens_seen": 61259190, "step": 2847, "time_per_iteration": 2.671294689178467 }, { "auxiliary_loss_clip": 0.01154584, "auxiliary_loss_mlp": 0.01032738, "balance_loss_clip": 1.05118179, "balance_loss_mlp": 1.02323747, "epoch": 0.34245175254013105, "flos": 28621450955520.0, "grad_norm": 2.9528359041230843, "language_loss": 0.82441527, "learning_rate": 3.060528406418085e-06, "loss": 0.84628844, "num_input_tokens_seen": 61279040, "step": 2848, "time_per_iteration": 3.7680859565734863 }, { "auxiliary_loss_clip": 0.0115632, "auxiliary_loss_mlp": 0.01031096, "balance_loss_clip": 1.05408192, "balance_loss_mlp": 1.02219129, "epoch": 0.34257199543077016, "flos": 34127058960000.0, "grad_norm": 2.1704846253968806, "language_loss": 0.61766458, "learning_rate": 3.0598678885486145e-06, "loss": 0.63953876, "num_input_tokens_seen": 61301580, "step": 2849, "time_per_iteration": 2.7140440940856934 }, { "auxiliary_loss_clip": 0.01143876, "auxiliary_loss_mlp": 0.0071292, "balance_loss_clip": 1.04857147, "balance_loss_mlp": 1.00051713, "epoch": 0.34269223832140927, "flos": 19974018188160.0, "grad_norm": 1.6736829604332983, "language_loss": 0.74402404, "learning_rate": 3.0592072098980282e-06, "loss": 0.76259202, "num_input_tokens_seen": 61321240, "step": 2850, "time_per_iteration": 2.6852195262908936 }, { "auxiliary_loss_clip": 0.01156537, "auxiliary_loss_mlp": 0.01041952, "balance_loss_clip": 1.0542891, "balance_loss_mlp": 1.0333209, "epoch": 0.3428124812120483, "flos": 27235658292480.0, "grad_norm": 11.948141455894767, "language_loss": 0.72713917, "learning_rate": 3.0585463705665514e-06, "loss": 0.74912405, "num_input_tokens_seen": 61341615, "step": 2851, "time_per_iteration": 3.6145594120025635 }, { "auxiliary_loss_clip": 0.01143834, "auxiliary_loss_mlp": 0.01027342, "balance_loss_clip": 1.05080509, "balance_loss_mlp": 1.01806152, "epoch": 0.34293272410268744, "flos": 24570871079040.0, "grad_norm": 2.5663011219401866, "language_loss": 0.70815217, "learning_rate": 3.0578853706544304e-06, "loss": 0.729864, "num_input_tokens_seen": 61359005, "step": 2852, "time_per_iteration": 2.7641761302948 }, { "auxiliary_loss_clip": 0.01148982, "auxiliary_loss_mlp": 0.00712848, "balance_loss_clip": 1.05351186, "balance_loss_mlp": 1.00048852, "epoch": 0.34305296699332655, "flos": 21506865131520.0, "grad_norm": 2.2978974734602873, "language_loss": 0.6510272, "learning_rate": 3.0572242102619404e-06, "loss": 0.66964549, "num_input_tokens_seen": 61376160, "step": 2853, "time_per_iteration": 2.7155697345733643 }, { "auxiliary_loss_clip": 0.01163663, "auxiliary_loss_mlp": 0.01032127, "balance_loss_clip": 1.05667102, "balance_loss_mlp": 1.02306104, "epoch": 0.3431732098839656, "flos": 24056665931520.0, "grad_norm": 1.853749350855653, "language_loss": 0.80487704, "learning_rate": 3.0565628894893784e-06, "loss": 0.82683492, "num_input_tokens_seen": 61396795, "step": 2854, "time_per_iteration": 2.66473126411438 }, { "auxiliary_loss_clip": 0.01170852, "auxiliary_loss_mlp": 0.01030488, "balance_loss_clip": 1.0549674, "balance_loss_mlp": 1.02183938, "epoch": 0.3432934527746047, "flos": 16800879744000.0, "grad_norm": 1.6956420781377788, "language_loss": 0.74687332, "learning_rate": 3.0559014084370655e-06, "loss": 0.76888674, "num_input_tokens_seen": 61415320, "step": 2855, "time_per_iteration": 2.6146364212036133 }, { "auxiliary_loss_clip": 0.01168467, "auxiliary_loss_mlp": 0.010333, "balance_loss_clip": 1.05489421, "balance_loss_mlp": 1.02426362, "epoch": 0.34341369566524377, "flos": 23439720908160.0, "grad_norm": 2.1632460261954765, "language_loss": 0.7868771, "learning_rate": 3.055239767205349e-06, "loss": 0.80889475, "num_input_tokens_seen": 61437070, "step": 2856, "time_per_iteration": 2.6553399562835693 }, { "auxiliary_loss_clip": 0.01174376, "auxiliary_loss_mlp": 0.01037307, "balance_loss_clip": 1.05793285, "balance_loss_mlp": 1.02873564, "epoch": 0.3435339385558829, "flos": 17267466435840.0, "grad_norm": 1.762874250856924, "language_loss": 0.78433377, "learning_rate": 3.054577965894599e-06, "loss": 0.80645061, "num_input_tokens_seen": 61453215, "step": 2857, "time_per_iteration": 2.609287977218628 }, { "auxiliary_loss_clip": 0.01166362, "auxiliary_loss_mlp": 0.0103996, "balance_loss_clip": 1.05636775, "balance_loss_mlp": 1.03075671, "epoch": 0.343654181446522, "flos": 22199366413440.0, "grad_norm": 1.673017494859501, "language_loss": 0.7031275, "learning_rate": 3.0539160046052094e-06, "loss": 0.7251907, "num_input_tokens_seen": 61472915, "step": 2858, "time_per_iteration": 2.6141319274902344 }, { "auxiliary_loss_clip": 0.01152853, "auxiliary_loss_mlp": 0.01037499, "balance_loss_clip": 1.0508002, "balance_loss_mlp": 1.02747393, "epoch": 0.34377442433716104, "flos": 19901801894400.0, "grad_norm": 2.3227809282975356, "language_loss": 0.70422453, "learning_rate": 3.0532538834376003e-06, "loss": 0.72612804, "num_input_tokens_seen": 61492475, "step": 2859, "time_per_iteration": 2.692326784133911 }, { "auxiliary_loss_clip": 0.01182085, "auxiliary_loss_mlp": 0.0103541, "balance_loss_clip": 1.05573058, "balance_loss_mlp": 1.02658296, "epoch": 0.34389466722780015, "flos": 22197678474240.0, "grad_norm": 1.8386335053358964, "language_loss": 0.78185177, "learning_rate": 3.0525916024922143e-06, "loss": 0.80402672, "num_input_tokens_seen": 61511660, "step": 2860, "time_per_iteration": 2.6109132766723633 }, { "auxiliary_loss_clip": 0.011562, "auxiliary_loss_mlp": 0.01037475, "balance_loss_clip": 1.05150843, "balance_loss_mlp": 1.02856445, "epoch": 0.34401491011843927, "flos": 18624567110400.0, "grad_norm": 2.878270927538967, "language_loss": 0.83638442, "learning_rate": 3.0519291618695193e-06, "loss": 0.85832119, "num_input_tokens_seen": 61529060, "step": 2861, "time_per_iteration": 2.6451783180236816 }, { "auxiliary_loss_clip": 0.01133182, "auxiliary_loss_mlp": 0.01039092, "balance_loss_clip": 1.04727483, "balance_loss_mlp": 1.03031266, "epoch": 0.3441351530090783, "flos": 17858197509120.0, "grad_norm": 2.1456401213153455, "language_loss": 0.76001334, "learning_rate": 3.0512665616700065e-06, "loss": 0.78173614, "num_input_tokens_seen": 61548125, "step": 2862, "time_per_iteration": 2.6986005306243896 }, { "auxiliary_loss_clip": 0.01119996, "auxiliary_loss_mlp": 0.01029001, "balance_loss_clip": 1.04736114, "balance_loss_mlp": 1.01977432, "epoch": 0.34425539589971743, "flos": 23112754381440.0, "grad_norm": 2.2451531916073133, "language_loss": 0.89136851, "learning_rate": 3.0506038019941933e-06, "loss": 0.91285849, "num_input_tokens_seen": 61568135, "step": 2863, "time_per_iteration": 2.7565300464630127 }, { "auxiliary_loss_clip": 0.01146085, "auxiliary_loss_mlp": 0.0103275, "balance_loss_clip": 1.05443966, "balance_loss_mlp": 1.0226475, "epoch": 0.34437563879035654, "flos": 21907699977600.0, "grad_norm": 2.588359503858114, "language_loss": 0.67261839, "learning_rate": 3.049940882942617e-06, "loss": 0.69440675, "num_input_tokens_seen": 61586920, "step": 2864, "time_per_iteration": 2.697021961212158 }, { "auxiliary_loss_clip": 0.01195174, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.0561502, "balance_loss_mlp": 1.02383852, "epoch": 0.3444958816809956, "flos": 23076915586560.0, "grad_norm": 1.9681081177713642, "language_loss": 0.80640125, "learning_rate": 3.0492778046158448e-06, "loss": 0.82867968, "num_input_tokens_seen": 61608340, "step": 2865, "time_per_iteration": 2.6053645610809326 }, { "auxiliary_loss_clip": 0.01175549, "auxiliary_loss_mlp": 0.01028811, "balance_loss_clip": 1.05733538, "balance_loss_mlp": 1.02018595, "epoch": 0.3446161245716347, "flos": 21908633731200.0, "grad_norm": 2.301359474748001, "language_loss": 0.76819825, "learning_rate": 3.0486145671144633e-06, "loss": 0.79024184, "num_input_tokens_seen": 61628130, "step": 2866, "time_per_iteration": 2.582942485809326 }, { "auxiliary_loss_clip": 0.01098256, "auxiliary_loss_mlp": 0.01038033, "balance_loss_clip": 1.04761755, "balance_loss_mlp": 1.02956867, "epoch": 0.3447363674622738, "flos": 25112834461440.0, "grad_norm": 2.1874404196199224, "language_loss": 0.76998562, "learning_rate": 3.047951170539086e-06, "loss": 0.79134846, "num_input_tokens_seen": 61647755, "step": 2867, "time_per_iteration": 2.795300006866455 }, { "auxiliary_loss_clip": 0.01148239, "auxiliary_loss_mlp": 0.01038078, "balance_loss_clip": 1.05670965, "balance_loss_mlp": 1.02988863, "epoch": 0.3448566103529129, "flos": 11984684451840.0, "grad_norm": 2.069669974673915, "language_loss": 0.84661973, "learning_rate": 3.047287614990349e-06, "loss": 0.86848283, "num_input_tokens_seen": 61665675, "step": 2868, "time_per_iteration": 2.6798226833343506 }, { "auxiliary_loss_clip": 0.01154291, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 1.05364668, "balance_loss_mlp": 1.02354431, "epoch": 0.344976853243552, "flos": 40187882465280.0, "grad_norm": 2.7495963299659927, "language_loss": 0.62437618, "learning_rate": 3.046623900568914e-06, "loss": 0.64625448, "num_input_tokens_seen": 61688240, "step": 2869, "time_per_iteration": 2.7777199745178223 }, { "auxiliary_loss_clip": 0.01159708, "auxiliary_loss_mlp": 0.01049431, "balance_loss_clip": 1.05445933, "balance_loss_mlp": 1.03939998, "epoch": 0.34509709613419104, "flos": 28723652127360.0, "grad_norm": 2.3233431212645472, "language_loss": 0.69968724, "learning_rate": 3.045960027375465e-06, "loss": 0.72177863, "num_input_tokens_seen": 61706075, "step": 2870, "time_per_iteration": 2.7382078170776367 }, { "auxiliary_loss_clip": 0.01179628, "auxiliary_loss_mlp": 0.01033069, "balance_loss_clip": 1.05351257, "balance_loss_mlp": 1.02377033, "epoch": 0.34521733902483015, "flos": 29967597982080.0, "grad_norm": 2.8753297808735607, "language_loss": 0.82876611, "learning_rate": 3.045295995510711e-06, "loss": 0.85089314, "num_input_tokens_seen": 61723045, "step": 2871, "time_per_iteration": 2.6577396392822266 }, { "auxiliary_loss_clip": 0.01160271, "auxiliary_loss_mlp": 0.01036739, "balance_loss_clip": 1.05578303, "balance_loss_mlp": 1.02847815, "epoch": 0.34533758191546926, "flos": 27923059843200.0, "grad_norm": 1.881937974726451, "language_loss": 0.73918045, "learning_rate": 3.0446318050753865e-06, "loss": 0.7611506, "num_input_tokens_seen": 61743525, "step": 2872, "time_per_iteration": 3.658132314682007 }, { "auxiliary_loss_clip": 0.01167351, "auxiliary_loss_mlp": 0.01030907, "balance_loss_clip": 1.05121303, "balance_loss_mlp": 1.02241886, "epoch": 0.3454578248061083, "flos": 27125879351040.0, "grad_norm": 4.1298964356262475, "language_loss": 0.77733231, "learning_rate": 3.0439674561702474e-06, "loss": 0.7993148, "num_input_tokens_seen": 61763025, "step": 2873, "time_per_iteration": 3.61841082572937 }, { "auxiliary_loss_clip": 0.0117415, "auxiliary_loss_mlp": 0.01034657, "balance_loss_clip": 1.05578375, "balance_loss_mlp": 1.02593637, "epoch": 0.3455780676967474, "flos": 19024899166080.0, "grad_norm": 5.235148271340766, "language_loss": 0.88269782, "learning_rate": 3.043302948896076e-06, "loss": 0.90478587, "num_input_tokens_seen": 61781630, "step": 2874, "time_per_iteration": 2.6148602962493896 }, { "auxiliary_loss_clip": 0.01118245, "auxiliary_loss_mlp": 0.01032768, "balance_loss_clip": 1.04849398, "balance_loss_mlp": 1.02375555, "epoch": 0.34569831058738654, "flos": 34496005507200.0, "grad_norm": 3.1904797504525213, "language_loss": 0.60080451, "learning_rate": 3.0426382833536756e-06, "loss": 0.62231457, "num_input_tokens_seen": 61804985, "step": 2875, "time_per_iteration": 3.641254186630249 }, { "auxiliary_loss_clip": 0.01134927, "auxiliary_loss_mlp": 0.01033718, "balance_loss_clip": 1.04705846, "balance_loss_mlp": 1.02519441, "epoch": 0.3458185534780256, "flos": 31138681098240.0, "grad_norm": 2.0774494656368474, "language_loss": 0.77728844, "learning_rate": 3.041973459643877e-06, "loss": 0.79897487, "num_input_tokens_seen": 61824440, "step": 2876, "time_per_iteration": 2.7229554653167725 }, { "auxiliary_loss_clip": 0.01119854, "auxiliary_loss_mlp": 0.01030264, "balance_loss_clip": 1.04553473, "balance_loss_mlp": 1.02113867, "epoch": 0.3459387963686647, "flos": 32452508862720.0, "grad_norm": 1.9603806001259627, "language_loss": 0.67286599, "learning_rate": 3.0413084778675334e-06, "loss": 0.69436717, "num_input_tokens_seen": 61845690, "step": 2877, "time_per_iteration": 3.7104110717773438 }, { "auxiliary_loss_clip": 0.01151668, "auxiliary_loss_mlp": 0.00712217, "balance_loss_clip": 1.0495007, "balance_loss_mlp": 1.00048971, "epoch": 0.3460590392593038, "flos": 24675658030080.0, "grad_norm": 3.1415692696137714, "language_loss": 0.83933455, "learning_rate": 3.0406433381255214e-06, "loss": 0.85797334, "num_input_tokens_seen": 61863725, "step": 2878, "time_per_iteration": 2.725551128387451 }, { "auxiliary_loss_clip": 0.01175973, "auxiliary_loss_mlp": 0.01026442, "balance_loss_clip": 1.05729163, "balance_loss_mlp": 1.01812148, "epoch": 0.34617928214994287, "flos": 18807316531200.0, "grad_norm": 2.3479894156436583, "language_loss": 0.82393003, "learning_rate": 3.0399780405187425e-06, "loss": 0.84595418, "num_input_tokens_seen": 61882720, "step": 2879, "time_per_iteration": 2.608553409576416 }, { "auxiliary_loss_clip": 0.01169035, "auxiliary_loss_mlp": 0.01021108, "balance_loss_clip": 1.05175292, "balance_loss_mlp": 1.01269794, "epoch": 0.346299525040582, "flos": 24857653265280.0, "grad_norm": 2.2105127071838906, "language_loss": 0.78748816, "learning_rate": 3.0393125851481216e-06, "loss": 0.80938959, "num_input_tokens_seen": 61902595, "step": 2880, "time_per_iteration": 2.701826572418213 }, { "auxiliary_loss_clip": 0.01136735, "auxiliary_loss_mlp": 0.01028959, "balance_loss_clip": 1.04980469, "balance_loss_mlp": 1.02079296, "epoch": 0.3464197679312211, "flos": 16434914025600.0, "grad_norm": 2.527427329851191, "language_loss": 0.86961234, "learning_rate": 3.038646972114608e-06, "loss": 0.89126927, "num_input_tokens_seen": 61918920, "step": 2881, "time_per_iteration": 2.6631603240966797 }, { "auxiliary_loss_clip": 0.01137499, "auxiliary_loss_mlp": 0.01028828, "balance_loss_clip": 1.05149055, "balance_loss_mlp": 1.02030444, "epoch": 0.34654001082186014, "flos": 22382474970240.0, "grad_norm": 1.933784782098699, "language_loss": 0.67276478, "learning_rate": 3.037981201519174e-06, "loss": 0.69442809, "num_input_tokens_seen": 61939520, "step": 2882, "time_per_iteration": 2.7572312355041504 }, { "auxiliary_loss_clip": 0.01176032, "auxiliary_loss_mlp": 0.01036706, "balance_loss_clip": 1.05758119, "balance_loss_mlp": 1.02848029, "epoch": 0.34666025371249926, "flos": 19573901614080.0, "grad_norm": 2.1371084211856135, "language_loss": 0.71291888, "learning_rate": 3.0373152734628175e-06, "loss": 0.73504627, "num_input_tokens_seen": 61957800, "step": 2883, "time_per_iteration": 2.6032872200012207 }, { "auxiliary_loss_clip": 0.01166119, "auxiliary_loss_mlp": 0.01031947, "balance_loss_clip": 1.0509001, "balance_loss_mlp": 1.02328014, "epoch": 0.34678049660313837, "flos": 15267637751040.0, "grad_norm": 3.8031866726481414, "language_loss": 0.75755167, "learning_rate": 3.0366491880465584e-06, "loss": 0.77953237, "num_input_tokens_seen": 61975820, "step": 2884, "time_per_iteration": 2.641864776611328 }, { "auxiliary_loss_clip": 0.01195337, "auxiliary_loss_mlp": 0.01031663, "balance_loss_clip": 1.05894947, "balance_loss_mlp": 1.02279949, "epoch": 0.3469007394937774, "flos": 21181550630400.0, "grad_norm": 1.604931716187294, "language_loss": 0.81844062, "learning_rate": 3.035982945371443e-06, "loss": 0.84071064, "num_input_tokens_seen": 61997515, "step": 2885, "time_per_iteration": 2.6205544471740723 }, { "auxiliary_loss_clip": 0.01167076, "auxiliary_loss_mlp": 0.01030259, "balance_loss_clip": 1.0551672, "balance_loss_mlp": 1.02164054, "epoch": 0.34702098238441653, "flos": 22375471818240.0, "grad_norm": 2.0512730519128035, "language_loss": 0.85513622, "learning_rate": 3.035316545538537e-06, "loss": 0.87710959, "num_input_tokens_seen": 62016310, "step": 2886, "time_per_iteration": 2.6383285522460938 }, { "auxiliary_loss_clip": 0.01158489, "auxiliary_loss_mlp": 0.01039418, "balance_loss_clip": 1.05908287, "balance_loss_mlp": 1.03097177, "epoch": 0.3471412252750556, "flos": 22929430343040.0, "grad_norm": 1.9843092843514105, "language_loss": 0.79174966, "learning_rate": 3.034649988648935e-06, "loss": 0.81372875, "num_input_tokens_seen": 62036075, "step": 2887, "time_per_iteration": 2.7403616905212402 }, { "auxiliary_loss_clip": 0.01157297, "auxiliary_loss_mlp": 0.01027296, "balance_loss_clip": 1.05047154, "balance_loss_mlp": 1.01887393, "epoch": 0.3472614681656947, "flos": 21324259365120.0, "grad_norm": 1.8563773343049836, "language_loss": 0.80856323, "learning_rate": 3.033983274803752e-06, "loss": 0.83040923, "num_input_tokens_seen": 62055865, "step": 2888, "time_per_iteration": 2.9289209842681885 }, { "auxiliary_loss_clip": 0.01153672, "auxiliary_loss_mlp": 0.01030542, "balance_loss_clip": 1.05123949, "balance_loss_mlp": 1.02182162, "epoch": 0.3473817110563338, "flos": 23475739271040.0, "grad_norm": 2.504752699386765, "language_loss": 0.72692871, "learning_rate": 3.0333164041041283e-06, "loss": 0.74877083, "num_input_tokens_seen": 62072180, "step": 2889, "time_per_iteration": 2.809319019317627 }, { "auxiliary_loss_clip": 0.011139, "auxiliary_loss_mlp": 0.01023156, "balance_loss_clip": 1.0482415, "balance_loss_mlp": 1.01508582, "epoch": 0.34750195394697286, "flos": 22346025644160.0, "grad_norm": 1.764357421274896, "language_loss": 0.72133964, "learning_rate": 3.032649376651228e-06, "loss": 0.74271023, "num_input_tokens_seen": 62091600, "step": 2890, "time_per_iteration": 2.745084047317505 }, { "auxiliary_loss_clip": 0.01144036, "auxiliary_loss_mlp": 0.01026829, "balance_loss_clip": 1.05268633, "balance_loss_mlp": 1.01838887, "epoch": 0.347622196837612, "flos": 29095004885760.0, "grad_norm": 1.9246648254368197, "language_loss": 0.75981003, "learning_rate": 3.031982192546238e-06, "loss": 0.7815187, "num_input_tokens_seen": 62114695, "step": 2891, "time_per_iteration": 2.7637813091278076 }, { "auxiliary_loss_clip": 0.01176037, "auxiliary_loss_mlp": 0.01030159, "balance_loss_clip": 1.05359745, "balance_loss_mlp": 1.02229071, "epoch": 0.3477424397282511, "flos": 22455732758400.0, "grad_norm": 2.758817199720636, "language_loss": 0.94441986, "learning_rate": 3.0313148518903696e-06, "loss": 0.9664818, "num_input_tokens_seen": 62134520, "step": 2892, "time_per_iteration": 2.607419967651367 }, { "auxiliary_loss_clip": 0.01165123, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.05897892, "balance_loss_mlp": 1.02591074, "epoch": 0.34786268261889014, "flos": 15778790242560.0, "grad_norm": 2.4015855003307105, "language_loss": 0.81237406, "learning_rate": 3.030647354784859e-06, "loss": 0.8343699, "num_input_tokens_seen": 62151560, "step": 2893, "time_per_iteration": 2.7113163471221924 }, { "auxiliary_loss_clip": 0.01138481, "auxiliary_loss_mlp": 0.01028219, "balance_loss_clip": 1.04925632, "balance_loss_mlp": 1.02032709, "epoch": 0.34798292550952925, "flos": 20777627214720.0, "grad_norm": 2.918170497282149, "language_loss": 0.77236778, "learning_rate": 3.029979701330964e-06, "loss": 0.79403472, "num_input_tokens_seen": 62170985, "step": 2894, "time_per_iteration": 2.6520795822143555 }, { "auxiliary_loss_clip": 0.01165043, "auxiliary_loss_mlp": 0.01029446, "balance_loss_clip": 1.05401134, "balance_loss_mlp": 1.02107775, "epoch": 0.34810316840016836, "flos": 19937820257280.0, "grad_norm": 2.2859841155224663, "language_loss": 0.80267739, "learning_rate": 3.029311891629966e-06, "loss": 0.82462227, "num_input_tokens_seen": 62189440, "step": 2895, "time_per_iteration": 2.672886848449707 }, { "auxiliary_loss_clip": 0.0115796, "auxiliary_loss_mlp": 0.0102681, "balance_loss_clip": 1.05400002, "balance_loss_mlp": 1.01866186, "epoch": 0.3482234112908074, "flos": 23623296341760.0, "grad_norm": 1.775873783323423, "language_loss": 0.74532229, "learning_rate": 3.0286439257831744e-06, "loss": 0.76716995, "num_input_tokens_seen": 62208910, "step": 2896, "time_per_iteration": 2.7039525508880615 }, { "auxiliary_loss_clip": 0.01196891, "auxiliary_loss_mlp": 0.01027844, "balance_loss_clip": 1.05749631, "balance_loss_mlp": 1.01865268, "epoch": 0.3483436541814465, "flos": 23986712194560.0, "grad_norm": 2.0262033655274494, "language_loss": 0.71530104, "learning_rate": 3.0279758038919156e-06, "loss": 0.73754841, "num_input_tokens_seen": 62227135, "step": 2897, "time_per_iteration": 2.6263065338134766 }, { "auxiliary_loss_clip": 0.01174479, "auxiliary_loss_mlp": 0.01030491, "balance_loss_clip": 1.05383801, "balance_loss_mlp": 1.02196169, "epoch": 0.34846389707208564, "flos": 22638338524800.0, "grad_norm": 2.0889713433651007, "language_loss": 0.78285086, "learning_rate": 3.0273075260575455e-06, "loss": 0.80490053, "num_input_tokens_seen": 62246035, "step": 2898, "time_per_iteration": 3.5791454315185547 }, { "auxiliary_loss_clip": 0.01161523, "auxiliary_loss_mlp": 0.0103674, "balance_loss_clip": 1.05245614, "balance_loss_mlp": 1.02774, "epoch": 0.3485841399627247, "flos": 21792857218560.0, "grad_norm": 2.0882337449614363, "language_loss": 0.80582094, "learning_rate": 3.0266390923814396e-06, "loss": 0.82780355, "num_input_tokens_seen": 62264095, "step": 2899, "time_per_iteration": 3.5820229053497314 }, { "auxiliary_loss_clip": 0.01164311, "auxiliary_loss_mlp": 0.01031609, "balance_loss_clip": 1.05817556, "balance_loss_mlp": 1.02306199, "epoch": 0.3487043828533638, "flos": 17019036996480.0, "grad_norm": 1.8992617258638578, "language_loss": 0.82022208, "learning_rate": 3.0259705029650008e-06, "loss": 0.84218132, "num_input_tokens_seen": 62282025, "step": 2900, "time_per_iteration": 3.621873378753662 }, { "auxiliary_loss_clip": 0.01174485, "auxiliary_loss_mlp": 0.0102902, "balance_loss_clip": 1.05244195, "balance_loss_mlp": 1.02080691, "epoch": 0.34882462574400286, "flos": 22601135013120.0, "grad_norm": 1.723794764094757, "language_loss": 0.72833616, "learning_rate": 3.025301757909652e-06, "loss": 0.75037122, "num_input_tokens_seen": 62302220, "step": 2901, "time_per_iteration": 2.699235677719116 }, { "auxiliary_loss_clip": 0.01145525, "auxiliary_loss_mlp": 0.00712969, "balance_loss_clip": 1.05277133, "balance_loss_mlp": 1.00049329, "epoch": 0.34894486863464197, "flos": 29861518141440.0, "grad_norm": 1.6340234880498057, "language_loss": 0.8050971, "learning_rate": 3.024632857316842e-06, "loss": 0.82368207, "num_input_tokens_seen": 62323535, "step": 2902, "time_per_iteration": 2.713686227798462 }, { "auxiliary_loss_clip": 0.01178182, "auxiliary_loss_mlp": 0.01034753, "balance_loss_clip": 1.05607939, "balance_loss_mlp": 1.0260272, "epoch": 0.3490651115252811, "flos": 22122265870080.0, "grad_norm": 1.8856768540272963, "language_loss": 0.77860856, "learning_rate": 3.0239638012880412e-06, "loss": 0.80073792, "num_input_tokens_seen": 62343430, "step": 2903, "time_per_iteration": 3.615074634552002 }, { "auxiliary_loss_clip": 0.01122198, "auxiliary_loss_mlp": 0.01027176, "balance_loss_clip": 1.04935896, "balance_loss_mlp": 1.01847982, "epoch": 0.34918535441592014, "flos": 12676682943360.0, "grad_norm": 2.866801637671114, "language_loss": 0.81679165, "learning_rate": 3.0232945899247466e-06, "loss": 0.83828545, "num_input_tokens_seen": 62360365, "step": 2904, "time_per_iteration": 2.6683075428009033 }, { "auxiliary_loss_clip": 0.01176201, "auxiliary_loss_mlp": 0.01031749, "balance_loss_clip": 1.05472958, "balance_loss_mlp": 1.02329683, "epoch": 0.34930559730655925, "flos": 23185617120000.0, "grad_norm": 2.090234037438855, "language_loss": 0.77354729, "learning_rate": 3.022625223328476e-06, "loss": 0.79562676, "num_input_tokens_seen": 62382105, "step": 2905, "time_per_iteration": 2.6632437705993652 }, { "auxiliary_loss_clip": 0.01182891, "auxiliary_loss_mlp": 0.01029312, "balance_loss_clip": 1.05666184, "balance_loss_mlp": 1.0201211, "epoch": 0.34942584019719836, "flos": 22855023319680.0, "grad_norm": 1.4976515507067032, "language_loss": 0.69109356, "learning_rate": 3.0219557016007723e-06, "loss": 0.71321559, "num_input_tokens_seen": 62402235, "step": 2906, "time_per_iteration": 2.6503703594207764 }, { "auxiliary_loss_clip": 0.01175501, "auxiliary_loss_mlp": 0.0103422, "balance_loss_clip": 1.05808604, "balance_loss_mlp": 1.02612603, "epoch": 0.3495460830878374, "flos": 24426043441920.0, "grad_norm": 1.8169915964868206, "language_loss": 0.69700557, "learning_rate": 3.021286024843202e-06, "loss": 0.71910274, "num_input_tokens_seen": 62420430, "step": 2907, "time_per_iteration": 2.6526906490325928 }, { "auxiliary_loss_clip": 0.01106963, "auxiliary_loss_mlp": 0.01006922, "balance_loss_clip": 1.03125167, "balance_loss_mlp": 1.00497901, "epoch": 0.3496663259784765, "flos": 70008749389440.0, "grad_norm": 1.0687790631972327, "language_loss": 0.64830494, "learning_rate": 3.0206161931573526e-06, "loss": 0.66944385, "num_input_tokens_seen": 62472980, "step": 2908, "time_per_iteration": 3.0942130088806152 }, { "auxiliary_loss_clip": 0.01157641, "auxiliary_loss_mlp": 0.0103622, "balance_loss_clip": 1.05249131, "balance_loss_mlp": 1.02835202, "epoch": 0.34978656886911563, "flos": 28692805322880.0, "grad_norm": 1.628326686331488, "language_loss": 0.93107533, "learning_rate": 3.0199462066448388e-06, "loss": 0.95301402, "num_input_tokens_seen": 62495175, "step": 2909, "time_per_iteration": 2.6705331802368164 }, { "auxiliary_loss_clip": 0.01179881, "auxiliary_loss_mlp": 0.01028357, "balance_loss_clip": 1.05816483, "balance_loss_mlp": 1.01979804, "epoch": 0.3499068117597547, "flos": 21142156389120.0, "grad_norm": 2.0476786166876124, "language_loss": 0.69070047, "learning_rate": 3.019276065407296e-06, "loss": 0.71278286, "num_input_tokens_seen": 62514295, "step": 2910, "time_per_iteration": 2.6519501209259033 }, { "auxiliary_loss_clip": 0.01128728, "auxiliary_loss_mlp": 0.01025084, "balance_loss_clip": 1.05019248, "balance_loss_mlp": 1.01609015, "epoch": 0.3500270546503938, "flos": 22782699285120.0, "grad_norm": 2.10530493478128, "language_loss": 0.80451232, "learning_rate": 3.018605769546385e-06, "loss": 0.82605046, "num_input_tokens_seen": 62534850, "step": 2911, "time_per_iteration": 2.726191282272339 }, { "auxiliary_loss_clip": 0.01173727, "auxiliary_loss_mlp": 0.01029898, "balance_loss_clip": 1.05367303, "balance_loss_mlp": 1.02081442, "epoch": 0.3501472975410329, "flos": 22894058424960.0, "grad_norm": 1.6821707003874045, "language_loss": 0.79839557, "learning_rate": 3.017935319163788e-06, "loss": 0.82043183, "num_input_tokens_seen": 62553810, "step": 2912, "time_per_iteration": 2.67179536819458 }, { "auxiliary_loss_clip": 0.01177548, "auxiliary_loss_mlp": 0.01028522, "balance_loss_clip": 1.05529559, "balance_loss_mlp": 1.01853251, "epoch": 0.35026754043167196, "flos": 25446588658560.0, "grad_norm": 1.699510611443601, "language_loss": 0.70727187, "learning_rate": 3.017264714361213e-06, "loss": 0.72933257, "num_input_tokens_seen": 62573460, "step": 2913, "time_per_iteration": 2.622612714767456 }, { "auxiliary_loss_clip": 0.01162956, "auxiliary_loss_mlp": 0.00712676, "balance_loss_clip": 1.0572052, "balance_loss_mlp": 1.00060117, "epoch": 0.3503877833223111, "flos": 19573757959680.0, "grad_norm": 1.927285545479119, "language_loss": 0.82265818, "learning_rate": 3.016593955240389e-06, "loss": 0.84141445, "num_input_tokens_seen": 62592150, "step": 2914, "time_per_iteration": 2.689760208129883 }, { "auxiliary_loss_clip": 0.01086952, "auxiliary_loss_mlp": 0.01002989, "balance_loss_clip": 1.02697897, "balance_loss_mlp": 1.00117731, "epoch": 0.3505080262129502, "flos": 65072075880960.0, "grad_norm": 0.8324044808253237, "language_loss": 0.63705456, "learning_rate": 3.015923041903071e-06, "loss": 0.65795398, "num_input_tokens_seen": 62658275, "step": 2915, "time_per_iteration": 3.2817137241363525 }, { "auxiliary_loss_clip": 0.01174493, "auxiliary_loss_mlp": 0.01027877, "balance_loss_clip": 1.05626428, "balance_loss_mlp": 1.01915646, "epoch": 0.35062826910358924, "flos": 29314562768640.0, "grad_norm": 1.8789968889509878, "language_loss": 0.83591378, "learning_rate": 3.0152519744510347e-06, "loss": 0.85793746, "num_input_tokens_seen": 62678075, "step": 2916, "time_per_iteration": 2.6698107719421387 }, { "auxiliary_loss_clip": 0.01144756, "auxiliary_loss_mlp": 0.01030451, "balance_loss_clip": 1.05111027, "balance_loss_mlp": 1.02157545, "epoch": 0.35074851199422835, "flos": 23987717775360.0, "grad_norm": 2.070853976872528, "language_loss": 0.82770926, "learning_rate": 3.014580752986081e-06, "loss": 0.84946132, "num_input_tokens_seen": 62696950, "step": 2917, "time_per_iteration": 2.7399752140045166 }, { "auxiliary_loss_clip": 0.01130098, "auxiliary_loss_mlp": 0.0103615, "balance_loss_clip": 1.05238652, "balance_loss_mlp": 1.02785921, "epoch": 0.3508687548848674, "flos": 15224436668160.0, "grad_norm": 2.1199304308947666, "language_loss": 0.78654075, "learning_rate": 3.0139093776100345e-06, "loss": 0.80820328, "num_input_tokens_seen": 62713540, "step": 2918, "time_per_iteration": 2.6957058906555176 }, { "auxiliary_loss_clip": 0.01190568, "auxiliary_loss_mlp": 0.01028468, "balance_loss_clip": 1.05703676, "balance_loss_mlp": 1.02073097, "epoch": 0.3509889977755065, "flos": 21361750185600.0, "grad_norm": 1.6848826248791215, "language_loss": 0.75426853, "learning_rate": 3.013237848424741e-06, "loss": 0.77645886, "num_input_tokens_seen": 62732925, "step": 2919, "time_per_iteration": 2.607522487640381 }, { "auxiliary_loss_clip": 0.01161257, "auxiliary_loss_mlp": 0.01028819, "balance_loss_clip": 1.0538882, "balance_loss_mlp": 1.01992011, "epoch": 0.35110924066614563, "flos": 19135360465920.0, "grad_norm": 2.558268588787106, "language_loss": 0.75201899, "learning_rate": 3.012566165532072e-06, "loss": 0.7739197, "num_input_tokens_seen": 62751715, "step": 2920, "time_per_iteration": 2.6169939041137695 }, { "auxiliary_loss_clip": 0.01115233, "auxiliary_loss_mlp": 0.01037359, "balance_loss_clip": 1.04900932, "balance_loss_mlp": 1.02937174, "epoch": 0.3512294835567847, "flos": 21980885938560.0, "grad_norm": 2.4157742419139083, "language_loss": 0.76442724, "learning_rate": 3.0118943290339207e-06, "loss": 0.78595316, "num_input_tokens_seen": 62771925, "step": 2921, "time_per_iteration": 2.771679639816284 }, { "auxiliary_loss_clip": 0.01130369, "auxiliary_loss_mlp": 0.01030231, "balance_loss_clip": 1.04711306, "balance_loss_mlp": 1.02171373, "epoch": 0.3513497264474238, "flos": 17817294896640.0, "grad_norm": 1.9381149207273347, "language_loss": 0.68721545, "learning_rate": 3.011222339032204e-06, "loss": 0.70882142, "num_input_tokens_seen": 62790075, "step": 2922, "time_per_iteration": 2.6377005577087402 }, { "auxiliary_loss_clip": 0.01194135, "auxiliary_loss_mlp": 0.01033122, "balance_loss_clip": 1.05924523, "balance_loss_mlp": 1.02471828, "epoch": 0.3514699693380629, "flos": 26943417239040.0, "grad_norm": 2.0993563928876053, "language_loss": 0.68978649, "learning_rate": 3.0105501956288626e-06, "loss": 0.71205902, "num_input_tokens_seen": 62810545, "step": 2923, "time_per_iteration": 2.6502466201782227 }, { "auxiliary_loss_clip": 0.01180006, "auxiliary_loss_mlp": 0.01026377, "balance_loss_clip": 1.05655932, "balance_loss_mlp": 1.01770473, "epoch": 0.35159021222870196, "flos": 15267565923840.0, "grad_norm": 2.2290837992617183, "language_loss": 0.7226212, "learning_rate": 3.0098778989258602e-06, "loss": 0.74468499, "num_input_tokens_seen": 62829155, "step": 2924, "time_per_iteration": 3.494863510131836 }, { "auxiliary_loss_clip": 0.0113862, "auxiliary_loss_mlp": 0.01033524, "balance_loss_clip": 1.05228996, "balance_loss_mlp": 1.02507186, "epoch": 0.35171045511934107, "flos": 13984154000640.0, "grad_norm": 2.676804522869772, "language_loss": 0.88691515, "learning_rate": 3.009205449025183e-06, "loss": 0.90863657, "num_input_tokens_seen": 62845350, "step": 2925, "time_per_iteration": 3.5378260612487793 }, { "auxiliary_loss_clip": 0.01138296, "auxiliary_loss_mlp": 0.01033899, "balance_loss_clip": 1.04977179, "balance_loss_mlp": 1.02504182, "epoch": 0.3518306980099802, "flos": 14283434119680.0, "grad_norm": 2.0528286919605634, "language_loss": 0.63395679, "learning_rate": 3.008532846028842e-06, "loss": 0.65567875, "num_input_tokens_seen": 62862110, "step": 2926, "time_per_iteration": 3.579315662384033 }, { "auxiliary_loss_clip": 0.01193106, "auxiliary_loss_mlp": 0.01029995, "balance_loss_clip": 1.0584662, "balance_loss_mlp": 1.02120328, "epoch": 0.35195094090061924, "flos": 27052872958080.0, "grad_norm": 2.1084377800228706, "language_loss": 0.72088933, "learning_rate": 3.0078600900388694e-06, "loss": 0.74312031, "num_input_tokens_seen": 62882415, "step": 2927, "time_per_iteration": 2.5713775157928467 }, { "auxiliary_loss_clip": 0.01134134, "auxiliary_loss_mlp": 0.0103354, "balance_loss_clip": 1.04921246, "balance_loss_mlp": 1.02521968, "epoch": 0.35207118379125835, "flos": 25629266252160.0, "grad_norm": 1.9673483330283745, "language_loss": 0.74198586, "learning_rate": 3.007187181157323e-06, "loss": 0.76366258, "num_input_tokens_seen": 62902425, "step": 2928, "time_per_iteration": 2.81311297416687 }, { "auxiliary_loss_clip": 0.01099428, "auxiliary_loss_mlp": 0.01033554, "balance_loss_clip": 1.04665637, "balance_loss_mlp": 1.02528679, "epoch": 0.35219142668189746, "flos": 18004713085440.0, "grad_norm": 2.7320922270655843, "language_loss": 0.68255925, "learning_rate": 3.006514119486282e-06, "loss": 0.70388901, "num_input_tokens_seen": 62919255, "step": 2929, "time_per_iteration": 3.694117546081543 }, { "auxiliary_loss_clip": 0.01136227, "auxiliary_loss_mlp": 0.01030653, "balance_loss_clip": 1.05072844, "balance_loss_mlp": 1.02218938, "epoch": 0.3523116695725365, "flos": 14028109269120.0, "grad_norm": 2.41200992317478, "language_loss": 0.69618058, "learning_rate": 3.005840905127849e-06, "loss": 0.71784937, "num_input_tokens_seen": 62936160, "step": 2930, "time_per_iteration": 2.6768715381622314 }, { "auxiliary_loss_clip": 0.01191668, "auxiliary_loss_mlp": 0.01031399, "balance_loss_clip": 1.0590117, "balance_loss_mlp": 1.02313173, "epoch": 0.3524319124631756, "flos": 21433966479360.0, "grad_norm": 2.41592868311879, "language_loss": 0.87037128, "learning_rate": 3.0051675381841516e-06, "loss": 0.89260197, "num_input_tokens_seen": 62953470, "step": 2931, "time_per_iteration": 2.5979809761047363 }, { "auxiliary_loss_clip": 0.01094387, "auxiliary_loss_mlp": 0.00712612, "balance_loss_clip": 1.04765701, "balance_loss_mlp": 1.00066197, "epoch": 0.3525521553538147, "flos": 26322773114880.0, "grad_norm": 1.6169881471654164, "language_loss": 0.76930845, "learning_rate": 3.0044940187573363e-06, "loss": 0.78737849, "num_input_tokens_seen": 62974480, "step": 2932, "time_per_iteration": 2.7967050075531006 }, { "auxiliary_loss_clip": 0.0117657, "auxiliary_loss_mlp": 0.01028289, "balance_loss_clip": 1.05437303, "balance_loss_mlp": 1.02018893, "epoch": 0.3526723982444538, "flos": 21543314457600.0, "grad_norm": 1.9844491714022912, "language_loss": 0.6520375, "learning_rate": 3.003820346949578e-06, "loss": 0.67408609, "num_input_tokens_seen": 62992560, "step": 2933, "time_per_iteration": 2.648362398147583 }, { "auxiliary_loss_clip": 0.01192993, "auxiliary_loss_mlp": 0.01033011, "balance_loss_clip": 1.05733681, "balance_loss_mlp": 1.02425504, "epoch": 0.3527926411350929, "flos": 23733649900800.0, "grad_norm": 2.1027676018935244, "language_loss": 0.79313552, "learning_rate": 3.003146522863071e-06, "loss": 0.81539559, "num_input_tokens_seen": 63013445, "step": 2934, "time_per_iteration": 2.584846019744873 }, { "auxiliary_loss_clip": 0.01157625, "auxiliary_loss_mlp": 0.01033307, "balance_loss_clip": 1.05605102, "balance_loss_mlp": 1.02455723, "epoch": 0.35291288402573195, "flos": 30445461544320.0, "grad_norm": 2.0532232163427184, "language_loss": 0.86064392, "learning_rate": 3.0024725466000345e-06, "loss": 0.88255322, "num_input_tokens_seen": 63033400, "step": 2935, "time_per_iteration": 2.68792986869812 }, { "auxiliary_loss_clip": 0.01174454, "auxiliary_loss_mlp": 0.01025479, "balance_loss_clip": 1.05712414, "balance_loss_mlp": 1.01702166, "epoch": 0.35303312691637107, "flos": 23112179763840.0, "grad_norm": 1.701363518009926, "language_loss": 0.78430486, "learning_rate": 3.0017984182627087e-06, "loss": 0.80630416, "num_input_tokens_seen": 63052725, "step": 2936, "time_per_iteration": 2.60302472114563 }, { "auxiliary_loss_clip": 0.0113802, "auxiliary_loss_mlp": 0.00712736, "balance_loss_clip": 1.04932845, "balance_loss_mlp": 1.00066853, "epoch": 0.3531533698070102, "flos": 21835699165440.0, "grad_norm": 1.9486734151809, "language_loss": 0.82539994, "learning_rate": 3.00112413795336e-06, "loss": 0.84390748, "num_input_tokens_seen": 63072560, "step": 2937, "time_per_iteration": 2.745802879333496 }, { "auxiliary_loss_clip": 0.01153773, "auxiliary_loss_mlp": 0.01026409, "balance_loss_clip": 1.04906344, "balance_loss_mlp": 1.01801014, "epoch": 0.35327361269764923, "flos": 15778969810560.0, "grad_norm": 2.2821493725605086, "language_loss": 0.79418516, "learning_rate": 3.000449705774275e-06, "loss": 0.81598699, "num_input_tokens_seen": 63090800, "step": 2938, "time_per_iteration": 2.647669792175293 }, { "auxiliary_loss_clip": 0.01176456, "auxiliary_loss_mlp": 0.01027512, "balance_loss_clip": 1.05729556, "balance_loss_mlp": 1.01896501, "epoch": 0.35339385558828834, "flos": 22090413484800.0, "grad_norm": 3.2218565877127086, "language_loss": 0.72202599, "learning_rate": 2.9997751218277654e-06, "loss": 0.7440657, "num_input_tokens_seen": 63108955, "step": 2939, "time_per_iteration": 2.5882210731506348 }, { "auxiliary_loss_clip": 0.01191111, "auxiliary_loss_mlp": 0.01024838, "balance_loss_clip": 1.05772495, "balance_loss_mlp": 1.01636803, "epoch": 0.35351409847892745, "flos": 24165008328960.0, "grad_norm": 2.2236830231347846, "language_loss": 0.77894783, "learning_rate": 2.999100386216166e-06, "loss": 0.80110735, "num_input_tokens_seen": 63127895, "step": 2940, "time_per_iteration": 2.62605619430542 }, { "auxiliary_loss_clip": 0.01158426, "auxiliary_loss_mlp": 0.01028232, "balance_loss_clip": 1.05363321, "balance_loss_mlp": 1.02020311, "epoch": 0.3536343413695665, "flos": 27052298340480.0, "grad_norm": 1.747887656190461, "language_loss": 0.74425399, "learning_rate": 2.998425499041831e-06, "loss": 0.76612049, "num_input_tokens_seen": 63148410, "step": 2941, "time_per_iteration": 2.686124324798584 }, { "auxiliary_loss_clip": 0.01090126, "auxiliary_loss_mlp": 0.01004039, "balance_loss_clip": 1.03026748, "balance_loss_mlp": 1.00209618, "epoch": 0.3537545842602056, "flos": 65991066370560.0, "grad_norm": 1.2398850674263138, "language_loss": 0.64537108, "learning_rate": 2.997750460407142e-06, "loss": 0.66631275, "num_input_tokens_seen": 63209765, "step": 2942, "time_per_iteration": 3.2303237915039062 }, { "auxiliary_loss_clip": 0.01146201, "auxiliary_loss_mlp": 0.01031461, "balance_loss_clip": 1.05047894, "balance_loss_mlp": 1.02292609, "epoch": 0.35387482715084473, "flos": 18436897526400.0, "grad_norm": 2.3619480133465123, "language_loss": 0.70225883, "learning_rate": 2.997075270414501e-06, "loss": 0.7240355, "num_input_tokens_seen": 63226980, "step": 2943, "time_per_iteration": 2.630803108215332 }, { "auxiliary_loss_clip": 0.01081241, "auxiliary_loss_mlp": 0.01004065, "balance_loss_clip": 1.03101349, "balance_loss_mlp": 1.00224137, "epoch": 0.3539950700414838, "flos": 65588579498880.0, "grad_norm": 0.708640958560672, "language_loss": 0.57739139, "learning_rate": 2.9963999291663347e-06, "loss": 0.59824443, "num_input_tokens_seen": 63292760, "step": 2944, "time_per_iteration": 3.2763278484344482 }, { "auxiliary_loss_clip": 0.01126434, "auxiliary_loss_mlp": 0.01029359, "balance_loss_clip": 1.05297589, "balance_loss_mlp": 1.02063251, "epoch": 0.3541153129321229, "flos": 20521655919360.0, "grad_norm": 3.0639833279735167, "language_loss": 0.74257624, "learning_rate": 2.9957244367650915e-06, "loss": 0.76413417, "num_input_tokens_seen": 63309005, "step": 2945, "time_per_iteration": 2.700906753540039 }, { "auxiliary_loss_clip": 0.01124768, "auxiliary_loss_mlp": 0.01028907, "balance_loss_clip": 1.05265605, "balance_loss_mlp": 1.01973438, "epoch": 0.354235555822762, "flos": 19573578391680.0, "grad_norm": 1.861714375715011, "language_loss": 0.83567977, "learning_rate": 2.9950487933132425e-06, "loss": 0.85721648, "num_input_tokens_seen": 63326420, "step": 2946, "time_per_iteration": 2.749818801879883 }, { "auxiliary_loss_clip": 0.01180441, "auxiliary_loss_mlp": 0.01030336, "balance_loss_clip": 1.05713129, "balance_loss_mlp": 1.02121687, "epoch": 0.35435579871340106, "flos": 20777268078720.0, "grad_norm": 3.2014024466352518, "language_loss": 0.71185625, "learning_rate": 2.994372998913283e-06, "loss": 0.73396397, "num_input_tokens_seen": 63344925, "step": 2947, "time_per_iteration": 2.6780810356140137 }, { "auxiliary_loss_clip": 0.01162609, "auxiliary_loss_mlp": 0.01031165, "balance_loss_clip": 1.05723619, "balance_loss_mlp": 1.02305913, "epoch": 0.35447604160404017, "flos": 23951807153280.0, "grad_norm": 2.211199684057753, "language_loss": 0.61812139, "learning_rate": 2.99369705366773e-06, "loss": 0.64005911, "num_input_tokens_seen": 63365170, "step": 2948, "time_per_iteration": 2.653982400894165 }, { "auxiliary_loss_clip": 0.01154676, "auxiliary_loss_mlp": 0.01030534, "balance_loss_clip": 1.05455649, "balance_loss_mlp": 1.02239251, "epoch": 0.3545962844946792, "flos": 23435662671360.0, "grad_norm": 1.9595328310910682, "language_loss": 0.8209278, "learning_rate": 2.9930209576791244e-06, "loss": 0.84277987, "num_input_tokens_seen": 63383645, "step": 2949, "time_per_iteration": 2.6652684211730957 }, { "auxiliary_loss_clip": 0.01171672, "auxiliary_loss_mlp": 0.01034359, "balance_loss_clip": 1.05523467, "balance_loss_mlp": 1.02596092, "epoch": 0.35471652738531834, "flos": 22085134185600.0, "grad_norm": 2.206340486815128, "language_loss": 0.6340822, "learning_rate": 2.9923447110500285e-06, "loss": 0.65614247, "num_input_tokens_seen": 63402390, "step": 2950, "time_per_iteration": 3.5307891368865967 }, { "auxiliary_loss_clip": 0.01165445, "auxiliary_loss_mlp": 0.01027887, "balance_loss_clip": 1.05479026, "balance_loss_mlp": 1.01990628, "epoch": 0.35483677027595745, "flos": 27341881787520.0, "grad_norm": 1.5911960271516867, "language_loss": 0.75676048, "learning_rate": 2.9916683138830295e-06, "loss": 0.7786938, "num_input_tokens_seen": 63423055, "step": 2951, "time_per_iteration": 3.616079092025757 }, { "auxiliary_loss_clip": 0.01155021, "auxiliary_loss_mlp": 0.01031633, "balance_loss_clip": 1.05190802, "balance_loss_mlp": 1.02331281, "epoch": 0.3549570131665965, "flos": 13516166678400.0, "grad_norm": 2.2865603530918066, "language_loss": 0.81186092, "learning_rate": 2.9909917662807353e-06, "loss": 0.83372748, "num_input_tokens_seen": 63440855, "step": 2952, "time_per_iteration": 3.60941743850708 }, { "auxiliary_loss_clip": 0.01174258, "auxiliary_loss_mlp": 0.01030645, "balance_loss_clip": 1.05596113, "balance_loss_mlp": 1.02256823, "epoch": 0.3550772560572356, "flos": 20887549810560.0, "grad_norm": 2.446613794195853, "language_loss": 0.69527423, "learning_rate": 2.9903150683457783e-06, "loss": 0.7173233, "num_input_tokens_seen": 63459400, "step": 2953, "time_per_iteration": 2.622084140777588 }, { "auxiliary_loss_clip": 0.011593, "auxiliary_loss_mlp": 0.01023418, "balance_loss_clip": 1.05263376, "balance_loss_mlp": 1.01468039, "epoch": 0.3551974989478747, "flos": 20194042947840.0, "grad_norm": 2.5568115868996615, "language_loss": 0.65561926, "learning_rate": 2.9896382201808126e-06, "loss": 0.67744648, "num_input_tokens_seen": 63476800, "step": 2954, "time_per_iteration": 2.6470236778259277 }, { "auxiliary_loss_clip": 0.011919, "auxiliary_loss_mlp": 0.01027958, "balance_loss_clip": 1.05678129, "balance_loss_mlp": 1.01917171, "epoch": 0.3553177418385138, "flos": 19828831415040.0, "grad_norm": 2.1455091097059067, "language_loss": 0.81045413, "learning_rate": 2.988961221888516e-06, "loss": 0.83265269, "num_input_tokens_seen": 63493475, "step": 2955, "time_per_iteration": 3.497704029083252 }, { "auxiliary_loss_clip": 0.0113422, "auxiliary_loss_mlp": 0.01030463, "balance_loss_clip": 1.05016875, "balance_loss_mlp": 1.02183187, "epoch": 0.3554379847291529, "flos": 14829132516480.0, "grad_norm": 2.5210931988565726, "language_loss": 0.78952622, "learning_rate": 2.988284073571589e-06, "loss": 0.81117302, "num_input_tokens_seen": 63509560, "step": 2956, "time_per_iteration": 2.648214340209961 }, { "auxiliary_loss_clip": 0.01176647, "auxiliary_loss_mlp": 0.0071216, "balance_loss_clip": 1.05507827, "balance_loss_mlp": 1.00061643, "epoch": 0.355558227619792, "flos": 20485350247680.0, "grad_norm": 2.409759993106217, "language_loss": 0.73129171, "learning_rate": 2.9876067753327528e-06, "loss": 0.75017977, "num_input_tokens_seen": 63527290, "step": 2957, "time_per_iteration": 2.6373047828674316 }, { "auxiliary_loss_clip": 0.01178427, "auxiliary_loss_mlp": 0.01029309, "balance_loss_clip": 1.05563807, "balance_loss_mlp": 1.02110147, "epoch": 0.35567847051043106, "flos": 37663613256960.0, "grad_norm": 2.3093794846997118, "language_loss": 0.80451959, "learning_rate": 2.986929327274754e-06, "loss": 0.82659698, "num_input_tokens_seen": 63547870, "step": 2958, "time_per_iteration": 2.7443735599517822 }, { "auxiliary_loss_clip": 0.01173079, "auxiliary_loss_mlp": 0.01028593, "balance_loss_clip": 1.05624127, "balance_loss_mlp": 1.0203433, "epoch": 0.35579871340107017, "flos": 26943058103040.0, "grad_norm": 1.5532774115530683, "language_loss": 0.7864095, "learning_rate": 2.9862517295003617e-06, "loss": 0.80842614, "num_input_tokens_seen": 63568285, "step": 2959, "time_per_iteration": 2.809400796890259 }, { "auxiliary_loss_clip": 0.01139268, "auxiliary_loss_mlp": 0.01025992, "balance_loss_clip": 1.04887295, "balance_loss_mlp": 1.01795793, "epoch": 0.3559189562917093, "flos": 28293335193600.0, "grad_norm": 1.595763398877567, "language_loss": 0.72741759, "learning_rate": 2.9855739821123654e-06, "loss": 0.74907017, "num_input_tokens_seen": 63589865, "step": 2960, "time_per_iteration": 2.785151720046997 }, { "auxiliary_loss_clip": 0.01170884, "auxiliary_loss_mlp": 0.01030531, "balance_loss_clip": 1.05517936, "balance_loss_mlp": 1.02178061, "epoch": 0.35603919918234833, "flos": 25664063552640.0, "grad_norm": 2.3449321690142666, "language_loss": 0.82321888, "learning_rate": 2.98489608521358e-06, "loss": 0.84523302, "num_input_tokens_seen": 63609805, "step": 2961, "time_per_iteration": 2.6951048374176025 }, { "auxiliary_loss_clip": 0.01177297, "auxiliary_loss_mlp": 0.00712643, "balance_loss_clip": 1.05478859, "balance_loss_mlp": 1.00073373, "epoch": 0.35615944207298744, "flos": 23000856537600.0, "grad_norm": 4.8107436603173, "language_loss": 0.79184473, "learning_rate": 2.9842180389068425e-06, "loss": 0.81074411, "num_input_tokens_seen": 63627115, "step": 2962, "time_per_iteration": 2.653331995010376 }, { "auxiliary_loss_clip": 0.01076093, "auxiliary_loss_mlp": 0.01002445, "balance_loss_clip": 1.04913402, "balance_loss_mlp": 1.00075245, "epoch": 0.35627968496362655, "flos": 68251283723520.0, "grad_norm": 0.7603310131466746, "language_loss": 0.5929991, "learning_rate": 2.98353984329501e-06, "loss": 0.61378443, "num_input_tokens_seen": 63691460, "step": 2963, "time_per_iteration": 3.292935848236084 }, { "auxiliary_loss_clip": 0.01159894, "auxiliary_loss_mlp": 0.01036524, "balance_loss_clip": 1.05541742, "balance_loss_mlp": 1.02820945, "epoch": 0.3563999278542656, "flos": 22641714403200.0, "grad_norm": 1.6954832166276421, "language_loss": 0.70589495, "learning_rate": 2.982861498480965e-06, "loss": 0.72785908, "num_input_tokens_seen": 63713840, "step": 2964, "time_per_iteration": 2.6872754096984863 }, { "auxiliary_loss_clip": 0.01134973, "auxiliary_loss_mlp": 0.01028529, "balance_loss_clip": 1.04639542, "balance_loss_mlp": 1.02055979, "epoch": 0.3565201707449047, "flos": 25952533678080.0, "grad_norm": 1.7161338635358856, "language_loss": 0.82678175, "learning_rate": 2.9821830045676122e-06, "loss": 0.84841681, "num_input_tokens_seen": 63733540, "step": 2965, "time_per_iteration": 2.720496892929077 }, { "auxiliary_loss_clip": 0.01194256, "auxiliary_loss_mlp": 0.01031496, "balance_loss_clip": 1.0588088, "balance_loss_mlp": 1.02241826, "epoch": 0.3566404136355438, "flos": 28475725478400.0, "grad_norm": 1.853829940505836, "language_loss": 0.72686851, "learning_rate": 2.9815043616578793e-06, "loss": 0.74912608, "num_input_tokens_seen": 63754335, "step": 2966, "time_per_iteration": 2.659900188446045 }, { "auxiliary_loss_clip": 0.01137633, "auxiliary_loss_mlp": 0.01032069, "balance_loss_clip": 1.04889226, "balance_loss_mlp": 1.02331269, "epoch": 0.3567606565261829, "flos": 38363117690880.0, "grad_norm": 2.6598278048010253, "language_loss": 0.77197862, "learning_rate": 2.9808255698547145e-06, "loss": 0.79367566, "num_input_tokens_seen": 63777135, "step": 2967, "time_per_iteration": 2.8172616958618164 }, { "auxiliary_loss_clip": 0.01175855, "auxiliary_loss_mlp": 0.01029389, "balance_loss_clip": 1.05900562, "balance_loss_mlp": 1.02093101, "epoch": 0.356880899416822, "flos": 21981029592960.0, "grad_norm": 2.1220036352511684, "language_loss": 0.79457849, "learning_rate": 2.9801466292610913e-06, "loss": 0.81663096, "num_input_tokens_seen": 63797020, "step": 2968, "time_per_iteration": 2.6692144870758057 }, { "auxiliary_loss_clip": 0.01174445, "auxiliary_loss_mlp": 0.01027518, "balance_loss_clip": 1.05483532, "balance_loss_mlp": 1.01925123, "epoch": 0.35700114230746105, "flos": 18989132198400.0, "grad_norm": 3.412857952402332, "language_loss": 0.80802786, "learning_rate": 2.979467539980003e-06, "loss": 0.83004749, "num_input_tokens_seen": 63813810, "step": 2969, "time_per_iteration": 2.5759165287017822 }, { "auxiliary_loss_clip": 0.01176993, "auxiliary_loss_mlp": 0.01035064, "balance_loss_clip": 1.05509031, "balance_loss_mlp": 1.026564, "epoch": 0.35712138519810016, "flos": 19756112330880.0, "grad_norm": 1.7522353656075818, "language_loss": 0.76868016, "learning_rate": 2.978788302114468e-06, "loss": 0.79080069, "num_input_tokens_seen": 63830925, "step": 2970, "time_per_iteration": 2.654663324356079 }, { "auxiliary_loss_clip": 0.01173008, "auxiliary_loss_mlp": 0.01029239, "balance_loss_clip": 1.05451596, "balance_loss_mlp": 1.0203166, "epoch": 0.35724162808873927, "flos": 35183012008320.0, "grad_norm": 2.115914306057845, "language_loss": 0.8355633, "learning_rate": 2.9781089157675255e-06, "loss": 0.85758579, "num_input_tokens_seen": 63849385, "step": 2971, "time_per_iteration": 2.7179782390594482 }, { "auxiliary_loss_clip": 0.01175205, "auxiliary_loss_mlp": 0.0103459, "balance_loss_clip": 1.05865979, "balance_loss_mlp": 1.02588761, "epoch": 0.3573618709793783, "flos": 25556726736000.0, "grad_norm": 1.509665705618252, "language_loss": 0.88277322, "learning_rate": 2.977429381042238e-06, "loss": 0.90487117, "num_input_tokens_seen": 63870060, "step": 2972, "time_per_iteration": 2.7475035190582275 }, { "auxiliary_loss_clip": 0.01158778, "auxiliary_loss_mlp": 0.01028713, "balance_loss_clip": 1.05245185, "balance_loss_mlp": 1.02090502, "epoch": 0.35748211387001744, "flos": 29132352051840.0, "grad_norm": 2.303459468924728, "language_loss": 0.89239269, "learning_rate": 2.9767496980416913e-06, "loss": 0.9142676, "num_input_tokens_seen": 63889355, "step": 2973, "time_per_iteration": 2.6845734119415283 }, { "auxiliary_loss_clip": 0.01151617, "auxiliary_loss_mlp": 0.01029902, "balance_loss_clip": 1.05037093, "balance_loss_mlp": 1.02106309, "epoch": 0.35760235676065655, "flos": 13954169122560.0, "grad_norm": 2.371431185980555, "language_loss": 0.80697858, "learning_rate": 2.9760698668689914e-06, "loss": 0.82879376, "num_input_tokens_seen": 63905580, "step": 2974, "time_per_iteration": 2.7290468215942383 }, { "auxiliary_loss_clip": 0.01176533, "auxiliary_loss_mlp": 0.01028812, "balance_loss_clip": 1.05447078, "balance_loss_mlp": 1.01998496, "epoch": 0.3577225996512956, "flos": 44018688977280.0, "grad_norm": 1.954404842997319, "language_loss": 0.7167086, "learning_rate": 2.975389887627269e-06, "loss": 0.73876214, "num_input_tokens_seen": 63928180, "step": 2975, "time_per_iteration": 2.8342065811157227 }, { "auxiliary_loss_clip": 0.01146794, "auxiliary_loss_mlp": 0.01030844, "balance_loss_clip": 1.05198646, "balance_loss_mlp": 1.02215981, "epoch": 0.3578428425419347, "flos": 17055199013760.0, "grad_norm": 2.0131914275655105, "language_loss": 0.90308523, "learning_rate": 2.9747097604196764e-06, "loss": 0.92486161, "num_input_tokens_seen": 63944825, "step": 2976, "time_per_iteration": 3.6051135063171387 }, { "auxiliary_loss_clip": 0.01047629, "auxiliary_loss_mlp": 0.01008823, "balance_loss_clip": 1.02591956, "balance_loss_mlp": 1.00680864, "epoch": 0.3579630854325738, "flos": 71676550707840.0, "grad_norm": 0.6706446825483336, "language_loss": 0.56617016, "learning_rate": 2.9740294853493875e-06, "loss": 0.58673471, "num_input_tokens_seen": 64016385, "step": 2977, "time_per_iteration": 4.431067943572998 }, { "auxiliary_loss_clip": 0.01131034, "auxiliary_loss_mlp": 0.01027123, "balance_loss_clip": 1.04904056, "balance_loss_mlp": 1.01852238, "epoch": 0.3580833283232129, "flos": 25046651652480.0, "grad_norm": 2.2934217355670383, "language_loss": 0.67438853, "learning_rate": 2.9733490625196008e-06, "loss": 0.69597006, "num_input_tokens_seen": 64036245, "step": 2978, "time_per_iteration": 3.8043994903564453 }, { "auxiliary_loss_clip": 0.01132659, "auxiliary_loss_mlp": 0.01031365, "balance_loss_clip": 1.04975915, "balance_loss_mlp": 1.02281785, "epoch": 0.358203571213852, "flos": 13953127628160.0, "grad_norm": 3.0055269995451073, "language_loss": 0.76111245, "learning_rate": 2.9726684920335353e-06, "loss": 0.78275275, "num_input_tokens_seen": 64054110, "step": 2979, "time_per_iteration": 2.666316509246826 }, { "auxiliary_loss_clip": 0.01192501, "auxiliary_loss_mlp": 0.00713138, "balance_loss_clip": 1.05575192, "balance_loss_mlp": 1.00066829, "epoch": 0.35832381410449105, "flos": 20302457172480.0, "grad_norm": 2.453196662209321, "language_loss": 0.81723523, "learning_rate": 2.971987773994432e-06, "loss": 0.83629161, "num_input_tokens_seen": 64070295, "step": 2980, "time_per_iteration": 2.5920331478118896 }, { "auxiliary_loss_clip": 0.0116605, "auxiliary_loss_mlp": 0.0103114, "balance_loss_clip": 1.05193043, "balance_loss_mlp": 1.02292037, "epoch": 0.35844405699513016, "flos": 16983234115200.0, "grad_norm": 2.321810427378974, "language_loss": 0.82840759, "learning_rate": 2.9713069085055566e-06, "loss": 0.85037947, "num_input_tokens_seen": 64088605, "step": 2981, "time_per_iteration": 3.535994291305542 }, { "auxiliary_loss_clip": 0.01143075, "auxiliary_loss_mlp": 0.01039014, "balance_loss_clip": 1.0522275, "balance_loss_mlp": 1.0301981, "epoch": 0.35856429988576927, "flos": 23216858974080.0, "grad_norm": 1.5906384967223364, "language_loss": 0.79339427, "learning_rate": 2.9706258956701958e-06, "loss": 0.81521517, "num_input_tokens_seen": 64108595, "step": 2982, "time_per_iteration": 2.6707844734191895 }, { "auxiliary_loss_clip": 0.01177455, "auxiliary_loss_mlp": 0.01026847, "balance_loss_clip": 1.05436611, "balance_loss_mlp": 1.01807356, "epoch": 0.3586845427764083, "flos": 23034576430080.0, "grad_norm": 2.225410065068615, "language_loss": 0.77478659, "learning_rate": 2.9699447355916575e-06, "loss": 0.79682964, "num_input_tokens_seen": 64127405, "step": 2983, "time_per_iteration": 2.623661756515503 }, { "auxiliary_loss_clip": 0.01189881, "auxiliary_loss_mlp": 0.00712293, "balance_loss_clip": 1.05676293, "balance_loss_mlp": 1.00066304, "epoch": 0.35880478566704743, "flos": 20010682995840.0, "grad_norm": 1.8945555171003237, "language_loss": 0.73968852, "learning_rate": 2.969263428373275e-06, "loss": 0.75871027, "num_input_tokens_seen": 64145755, "step": 2984, "time_per_iteration": 2.608891248703003 }, { "auxiliary_loss_clip": 0.01162722, "auxiliary_loss_mlp": 0.01029106, "balance_loss_clip": 1.05450177, "balance_loss_mlp": 1.02066636, "epoch": 0.35892502855768654, "flos": 13699095667200.0, "grad_norm": 2.5033075682687103, "language_loss": 0.79195964, "learning_rate": 2.9685819741184007e-06, "loss": 0.81387794, "num_input_tokens_seen": 64164195, "step": 2985, "time_per_iteration": 2.6226646900177 }, { "auxiliary_loss_clip": 0.01139032, "auxiliary_loss_mlp": 0.0102463, "balance_loss_clip": 1.05324697, "balance_loss_mlp": 1.01631474, "epoch": 0.3590452714483256, "flos": 18114096977280.0, "grad_norm": 2.5065343993193148, "language_loss": 0.69241339, "learning_rate": 2.967900372930411e-06, "loss": 0.71404999, "num_input_tokens_seen": 64182705, "step": 2986, "time_per_iteration": 2.664155960083008 }, { "auxiliary_loss_clip": 0.01151182, "auxiliary_loss_mlp": 0.01034233, "balance_loss_clip": 1.04976869, "balance_loss_mlp": 1.02457118, "epoch": 0.3591655143389647, "flos": 17749352321280.0, "grad_norm": 2.413927076387563, "language_loss": 0.7934314, "learning_rate": 2.9672186249127046e-06, "loss": 0.8152855, "num_input_tokens_seen": 64202170, "step": 2987, "time_per_iteration": 2.6337993144989014 }, { "auxiliary_loss_clip": 0.01156531, "auxiliary_loss_mlp": 0.0102496, "balance_loss_clip": 1.05268502, "balance_loss_mlp": 1.01703215, "epoch": 0.3592857572296038, "flos": 25224409082880.0, "grad_norm": 2.201313568853785, "language_loss": 0.78821826, "learning_rate": 2.9665367301687014e-06, "loss": 0.8100332, "num_input_tokens_seen": 64220415, "step": 2988, "time_per_iteration": 2.6832661628723145 }, { "auxiliary_loss_clip": 0.01149442, "auxiliary_loss_mlp": 0.01032132, "balance_loss_clip": 1.04944706, "balance_loss_mlp": 1.02329254, "epoch": 0.3594060001202429, "flos": 29384408764800.0, "grad_norm": 2.7118054121333284, "language_loss": 0.76901782, "learning_rate": 2.965854688801845e-06, "loss": 0.79083359, "num_input_tokens_seen": 64242475, "step": 2989, "time_per_iteration": 2.722367286682129 }, { "auxiliary_loss_clip": 0.01170082, "auxiliary_loss_mlp": 0.01025219, "balance_loss_clip": 1.04926622, "balance_loss_mlp": 1.01679707, "epoch": 0.359526243010882, "flos": 17052900543360.0, "grad_norm": 1.8531844739577934, "language_loss": 0.763565, "learning_rate": 2.9651725009156005e-06, "loss": 0.78551805, "num_input_tokens_seen": 64260220, "step": 2990, "time_per_iteration": 2.6039106845855713 }, { "auxiliary_loss_clip": 0.01149276, "auxiliary_loss_mlp": 0.01030415, "balance_loss_clip": 1.0499804, "balance_loss_mlp": 1.02245164, "epoch": 0.3596464859015211, "flos": 22965089569920.0, "grad_norm": 1.6599167833908832, "language_loss": 0.7440775, "learning_rate": 2.964490166613454e-06, "loss": 0.76587439, "num_input_tokens_seen": 64280145, "step": 2991, "time_per_iteration": 2.682826519012451 }, { "auxiliary_loss_clip": 0.01097594, "auxiliary_loss_mlp": 0.01004336, "balance_loss_clip": 1.0256536, "balance_loss_mlp": 1.00248814, "epoch": 0.35976672879216015, "flos": 54739462590720.0, "grad_norm": 0.7496641138705717, "language_loss": 0.57691294, "learning_rate": 2.963807685998917e-06, "loss": 0.59793228, "num_input_tokens_seen": 64336010, "step": 2992, "time_per_iteration": 3.0330424308776855 }, { "auxiliary_loss_clip": 0.0112604, "auxiliary_loss_mlp": 0.01022663, "balance_loss_clip": 1.04629755, "balance_loss_mlp": 1.01461041, "epoch": 0.35988697168279926, "flos": 43139020901760.0, "grad_norm": 2.5236275808085167, "language_loss": 0.78011274, "learning_rate": 2.9631250591755196e-06, "loss": 0.80159974, "num_input_tokens_seen": 64358725, "step": 2993, "time_per_iteration": 2.8866422176361084 }, { "auxiliary_loss_clip": 0.01155133, "auxiliary_loss_mlp": 0.01026367, "balance_loss_clip": 1.05460966, "balance_loss_mlp": 1.01780176, "epoch": 0.36000721457343837, "flos": 35845600239360.0, "grad_norm": 1.7591268574180583, "language_loss": 0.57681882, "learning_rate": 2.962442286246817e-06, "loss": 0.59863389, "num_input_tokens_seen": 64381555, "step": 2994, "time_per_iteration": 2.7734806537628174 }, { "auxiliary_loss_clip": 0.01160806, "auxiliary_loss_mlp": 0.0102758, "balance_loss_clip": 1.05160999, "balance_loss_mlp": 1.01943779, "epoch": 0.3601274574640774, "flos": 18291100222080.0, "grad_norm": 1.551675627562998, "language_loss": 0.69761968, "learning_rate": 2.9617593673163853e-06, "loss": 0.71950352, "num_input_tokens_seen": 64400375, "step": 2995, "time_per_iteration": 2.6104371547698975 }, { "auxiliary_loss_clip": 0.01160285, "auxiliary_loss_mlp": 0.01030185, "balance_loss_clip": 1.05157971, "balance_loss_mlp": 1.02207279, "epoch": 0.36024770035471654, "flos": 13333955961600.0, "grad_norm": 2.0681747824484864, "language_loss": 0.76924199, "learning_rate": 2.9610763024878216e-06, "loss": 0.79114664, "num_input_tokens_seen": 64415880, "step": 2996, "time_per_iteration": 2.6479134559631348 }, { "auxiliary_loss_clip": 0.01150845, "auxiliary_loss_mlp": 0.01027023, "balance_loss_clip": 1.04926562, "balance_loss_mlp": 1.01839232, "epoch": 0.3603679432453556, "flos": 20267013427200.0, "grad_norm": 1.742947132850368, "language_loss": 0.91805041, "learning_rate": 2.960393091864747e-06, "loss": 0.93982911, "num_input_tokens_seen": 64434260, "step": 2997, "time_per_iteration": 2.6576011180877686 }, { "auxiliary_loss_clip": 0.0115849, "auxiliary_loss_mlp": 0.01031767, "balance_loss_clip": 1.05315578, "balance_loss_mlp": 1.02376842, "epoch": 0.3604881861359947, "flos": 22451135817600.0, "grad_norm": 1.75100312445812, "language_loss": 0.74945891, "learning_rate": 2.959709735550804e-06, "loss": 0.77136153, "num_input_tokens_seen": 64453855, "step": 2998, "time_per_iteration": 2.6813058853149414 }, { "auxiliary_loss_clip": 0.01131002, "auxiliary_loss_mlp": 0.01027564, "balance_loss_clip": 1.04971719, "balance_loss_mlp": 1.01904655, "epoch": 0.3606084290266338, "flos": 22054251467520.0, "grad_norm": 2.0868212421239667, "language_loss": 0.75279635, "learning_rate": 2.9590262336496575e-06, "loss": 0.77438205, "num_input_tokens_seen": 64473585, "step": 2999, "time_per_iteration": 2.7119929790496826 }, { "auxiliary_loss_clip": 0.01144003, "auxiliary_loss_mlp": 0.01027407, "balance_loss_clip": 1.05584908, "balance_loss_mlp": 1.01868641, "epoch": 0.36072867191727287, "flos": 15632921111040.0, "grad_norm": 2.078843978592156, "language_loss": 0.85606438, "learning_rate": 2.9583425862649936e-06, "loss": 0.87777847, "num_input_tokens_seen": 64491720, "step": 3000, "time_per_iteration": 2.6609346866607666 }, { "auxiliary_loss_clip": 0.01192196, "auxiliary_loss_mlp": 0.0103243, "balance_loss_clip": 1.05664253, "balance_loss_mlp": 1.02395463, "epoch": 0.360848914807912, "flos": 19677000625920.0, "grad_norm": 3.7206545887148477, "language_loss": 0.73753089, "learning_rate": 2.9576587935005215e-06, "loss": 0.75977713, "num_input_tokens_seen": 64509800, "step": 3001, "time_per_iteration": 2.528409004211426 }, { "auxiliary_loss_clip": 0.01176473, "auxiliary_loss_mlp": 0.0102388, "balance_loss_clip": 1.05389428, "balance_loss_mlp": 1.01504672, "epoch": 0.3609691576985511, "flos": 18877808972160.0, "grad_norm": 2.5410006691212637, "language_loss": 0.72294664, "learning_rate": 2.9569748554599713e-06, "loss": 0.74495018, "num_input_tokens_seen": 64525410, "step": 3002, "time_per_iteration": 3.5179853439331055 }, { "auxiliary_loss_clip": 0.01157709, "auxiliary_loss_mlp": 0.01029275, "balance_loss_clip": 1.05384588, "balance_loss_mlp": 1.02082884, "epoch": 0.36108940058919015, "flos": 42224088648960.0, "grad_norm": 2.175330195878242, "language_loss": 0.73575991, "learning_rate": 2.956290772247097e-06, "loss": 0.75762975, "num_input_tokens_seen": 64544085, "step": 3003, "time_per_iteration": 3.728898286819458 }, { "auxiliary_loss_clip": 0.01119771, "auxiliary_loss_mlp": 0.01028588, "balance_loss_clip": 1.0509814, "balance_loss_mlp": 1.02045846, "epoch": 0.36120964347982926, "flos": 23185150243200.0, "grad_norm": 1.6863654850152523, "language_loss": 0.73239648, "learning_rate": 2.9556065439656724e-06, "loss": 0.75388008, "num_input_tokens_seen": 64563135, "step": 3004, "time_per_iteration": 3.659346342086792 }, { "auxiliary_loss_clip": 0.01101022, "auxiliary_loss_mlp": 0.01028563, "balance_loss_clip": 1.04490757, "balance_loss_mlp": 1.02040911, "epoch": 0.36132988637046837, "flos": 18113055482880.0, "grad_norm": 2.736519920146095, "language_loss": 0.82067728, "learning_rate": 2.9549221707194952e-06, "loss": 0.84197307, "num_input_tokens_seen": 64581985, "step": 3005, "time_per_iteration": 2.6814353466033936 }, { "auxiliary_loss_clip": 0.01177268, "auxiliary_loss_mlp": 0.01023643, "balance_loss_clip": 1.05605686, "balance_loss_mlp": 1.01574504, "epoch": 0.3614501292611074, "flos": 27813101333760.0, "grad_norm": 2.2276044404781006, "language_loss": 0.72438753, "learning_rate": 2.954237652612384e-06, "loss": 0.74639666, "num_input_tokens_seen": 64601035, "step": 3006, "time_per_iteration": 2.6761887073516846 }, { "auxiliary_loss_clip": 0.01155794, "auxiliary_loss_mlp": 0.01023389, "balance_loss_clip": 1.05390406, "balance_loss_mlp": 1.01557529, "epoch": 0.36157037215174653, "flos": 22634926732800.0, "grad_norm": 1.9281427975205787, "language_loss": 0.84220612, "learning_rate": 2.9535529897481796e-06, "loss": 0.86399794, "num_input_tokens_seen": 64618580, "step": 3007, "time_per_iteration": 3.606830358505249 }, { "auxiliary_loss_clip": 0.01190048, "auxiliary_loss_mlp": 0.0103381, "balance_loss_clip": 1.05593872, "balance_loss_mlp": 1.0256741, "epoch": 0.36169061504238564, "flos": 12600839376000.0, "grad_norm": 2.1662833972552087, "language_loss": 0.76931512, "learning_rate": 2.9528681822307446e-06, "loss": 0.79155368, "num_input_tokens_seen": 64635430, "step": 3008, "time_per_iteration": 2.557032823562622 }, { "auxiliary_loss_clip": 0.0117169, "auxiliary_loss_mlp": 0.00711905, "balance_loss_clip": 1.0564115, "balance_loss_mlp": 1.00083148, "epoch": 0.3618108579330247, "flos": 26684644682880.0, "grad_norm": 2.1664220495468847, "language_loss": 0.82387447, "learning_rate": 2.952183230163964e-06, "loss": 0.84271038, "num_input_tokens_seen": 64655005, "step": 3009, "time_per_iteration": 2.646101951599121 }, { "auxiliary_loss_clip": 0.01134503, "auxiliary_loss_mlp": 0.0102308, "balance_loss_clip": 1.05075645, "balance_loss_mlp": 1.01526546, "epoch": 0.3619311008236638, "flos": 22817029708800.0, "grad_norm": 1.9996809167369136, "language_loss": 0.72904325, "learning_rate": 2.9514981336517448e-06, "loss": 0.75061905, "num_input_tokens_seen": 64674775, "step": 3010, "time_per_iteration": 2.6672561168670654 }, { "auxiliary_loss_clip": 0.01175826, "auxiliary_loss_mlp": 0.0102564, "balance_loss_clip": 1.05851281, "balance_loss_mlp": 1.01790309, "epoch": 0.36205134371430286, "flos": 25919603884800.0, "grad_norm": 12.024155597995035, "language_loss": 0.81024146, "learning_rate": 2.950812892798015e-06, "loss": 0.83225614, "num_input_tokens_seen": 64695670, "step": 3011, "time_per_iteration": 2.65116810798645 }, { "auxiliary_loss_clip": 0.01118121, "auxiliary_loss_mlp": 0.00712359, "balance_loss_clip": 1.05055892, "balance_loss_mlp": 1.00078261, "epoch": 0.362171586604942, "flos": 26139592730880.0, "grad_norm": 5.020668780932636, "language_loss": 0.87440562, "learning_rate": 2.9501275077067256e-06, "loss": 0.89271045, "num_input_tokens_seen": 64716290, "step": 3012, "time_per_iteration": 2.727813720703125 }, { "auxiliary_loss_clip": 0.01090983, "auxiliary_loss_mlp": 0.0102752, "balance_loss_clip": 1.0438695, "balance_loss_mlp": 1.02027178, "epoch": 0.3622918294955811, "flos": 28074208273920.0, "grad_norm": 1.4769843327333763, "language_loss": 0.8853147, "learning_rate": 2.949441978481848e-06, "loss": 0.90649968, "num_input_tokens_seen": 64737190, "step": 3013, "time_per_iteration": 2.7782750129699707 }, { "auxiliary_loss_clip": 0.01145532, "auxiliary_loss_mlp": 0.01033, "balance_loss_clip": 1.05208421, "balance_loss_mlp": 1.02480984, "epoch": 0.36241207238622014, "flos": 19828005402240.0, "grad_norm": 3.498143855336562, "language_loss": 0.80744028, "learning_rate": 2.9487563052273778e-06, "loss": 0.82922566, "num_input_tokens_seen": 64753950, "step": 3014, "time_per_iteration": 2.6845126152038574 }, { "auxiliary_loss_clip": 0.01174706, "auxiliary_loss_mlp": 0.01031227, "balance_loss_clip": 1.05914307, "balance_loss_mlp": 1.02236426, "epoch": 0.36253231527685925, "flos": 21397158017280.0, "grad_norm": 1.8396145670998802, "language_loss": 0.85363841, "learning_rate": 2.94807048804733e-06, "loss": 0.87569773, "num_input_tokens_seen": 64773570, "step": 3015, "time_per_iteration": 2.5888924598693848 }, { "auxiliary_loss_clip": 0.0114525, "auxiliary_loss_mlp": 0.01031958, "balance_loss_clip": 1.05077791, "balance_loss_mlp": 1.02358055, "epoch": 0.36265255816749836, "flos": 18362885552640.0, "grad_norm": 2.2897730687946876, "language_loss": 0.90268749, "learning_rate": 2.9473845270457434e-06, "loss": 0.92445958, "num_input_tokens_seen": 64790385, "step": 3016, "time_per_iteration": 2.6475417613983154 }, { "auxiliary_loss_clip": 0.01151331, "auxiliary_loss_mlp": 0.01019765, "balance_loss_clip": 1.05187654, "balance_loss_mlp": 1.01182008, "epoch": 0.3627728010581374, "flos": 18660046769280.0, "grad_norm": 2.218853835409877, "language_loss": 0.69871938, "learning_rate": 2.946698422326677e-06, "loss": 0.72043031, "num_input_tokens_seen": 64807845, "step": 3017, "time_per_iteration": 2.6301796436309814 }, { "auxiliary_loss_clip": 0.01124699, "auxiliary_loss_mlp": 0.01024102, "balance_loss_clip": 1.04885674, "balance_loss_mlp": 1.01615715, "epoch": 0.36289304394877653, "flos": 27524272072320.0, "grad_norm": 3.497733913032511, "language_loss": 0.79275203, "learning_rate": 2.946012173994213e-06, "loss": 0.8142401, "num_input_tokens_seen": 64827630, "step": 3018, "time_per_iteration": 2.7604188919067383 }, { "auxiliary_loss_clip": 0.01170123, "auxiliary_loss_mlp": 0.0102505, "balance_loss_clip": 1.05760717, "balance_loss_mlp": 1.01691985, "epoch": 0.36301328683941564, "flos": 34533244932480.0, "grad_norm": 1.4055370249440708, "language_loss": 0.67797995, "learning_rate": 2.945325782152454e-06, "loss": 0.69993168, "num_input_tokens_seen": 64850665, "step": 3019, "time_per_iteration": 2.7637779712677 }, { "auxiliary_loss_clip": 0.01157734, "auxiliary_loss_mlp": 0.0103195, "balance_loss_clip": 1.05161548, "balance_loss_mlp": 1.02398705, "epoch": 0.3631335297300547, "flos": 19025976574080.0, "grad_norm": 2.9272086379043953, "language_loss": 0.78707671, "learning_rate": 2.9446392469055257e-06, "loss": 0.80897355, "num_input_tokens_seen": 64868700, "step": 3020, "time_per_iteration": 2.764878749847412 }, { "auxiliary_loss_clip": 0.01141359, "auxiliary_loss_mlp": 0.01030284, "balance_loss_clip": 1.05704629, "balance_loss_mlp": 1.02257669, "epoch": 0.3632537726206938, "flos": 19536769929600.0, "grad_norm": 1.7394486514184657, "language_loss": 0.80046701, "learning_rate": 2.9439525683575745e-06, "loss": 0.82218355, "num_input_tokens_seen": 64887620, "step": 3021, "time_per_iteration": 2.6409637928009033 }, { "auxiliary_loss_clip": 0.01193625, "auxiliary_loss_mlp": 0.01030929, "balance_loss_clip": 1.05768263, "balance_loss_mlp": 1.02266216, "epoch": 0.3633740155113329, "flos": 21068611292160.0, "grad_norm": 3.6448176178551073, "language_loss": 0.7533673, "learning_rate": 2.9432657466127694e-06, "loss": 0.77561283, "num_input_tokens_seen": 64907190, "step": 3022, "time_per_iteration": 2.6228573322296143 }, { "auxiliary_loss_clip": 0.01127308, "auxiliary_loss_mlp": 0.01029325, "balance_loss_clip": 1.05445123, "balance_loss_mlp": 1.02069986, "epoch": 0.36349425840197197, "flos": 20298722158080.0, "grad_norm": 4.155504109229336, "language_loss": 0.76859516, "learning_rate": 2.9425787817753007e-06, "loss": 0.79016149, "num_input_tokens_seen": 64925850, "step": 3023, "time_per_iteration": 2.7177376747131348 }, { "auxiliary_loss_clip": 0.01142106, "auxiliary_loss_mlp": 0.01034644, "balance_loss_clip": 1.05244589, "balance_loss_mlp": 1.02614427, "epoch": 0.3636145012926111, "flos": 29716762331520.0, "grad_norm": 1.6515687347753714, "language_loss": 0.71678293, "learning_rate": 2.94189167394938e-06, "loss": 0.73855036, "num_input_tokens_seen": 64948285, "step": 3024, "time_per_iteration": 2.749567747116089 }, { "auxiliary_loss_clip": 0.01193348, "auxiliary_loss_mlp": 0.01027185, "balance_loss_clip": 1.05998552, "balance_loss_mlp": 1.01940691, "epoch": 0.3637347441832502, "flos": 21431847576960.0, "grad_norm": 1.9278652484287766, "language_loss": 0.806063, "learning_rate": 2.941204423239241e-06, "loss": 0.82826835, "num_input_tokens_seen": 64967160, "step": 3025, "time_per_iteration": 2.649449348449707 }, { "auxiliary_loss_clip": 0.01170551, "auxiliary_loss_mlp": 0.01027376, "balance_loss_clip": 1.05469573, "balance_loss_mlp": 1.01972294, "epoch": 0.36385498707388925, "flos": 29533941083520.0, "grad_norm": 2.115564406115235, "language_loss": 0.76239157, "learning_rate": 2.9405170297491395e-06, "loss": 0.78437084, "num_input_tokens_seen": 64987155, "step": 3026, "time_per_iteration": 2.696345806121826 }, { "auxiliary_loss_clip": 0.01103703, "auxiliary_loss_mlp": 0.00712343, "balance_loss_clip": 1.0528599, "balance_loss_mlp": 1.00074553, "epoch": 0.36397522996452836, "flos": 22236569925120.0, "grad_norm": 2.0917568053434756, "language_loss": 0.80682397, "learning_rate": 2.939829493583353e-06, "loss": 0.82498443, "num_input_tokens_seen": 65003800, "step": 3027, "time_per_iteration": 2.730888843536377 }, { "auxiliary_loss_clip": 0.01132932, "auxiliary_loss_mlp": 0.01026743, "balance_loss_clip": 1.04664648, "balance_loss_mlp": 1.0186075, "epoch": 0.3640954728551674, "flos": 21506505995520.0, "grad_norm": 2.5050741106253502, "language_loss": 0.83167088, "learning_rate": 2.939141814846179e-06, "loss": 0.85326761, "num_input_tokens_seen": 65021215, "step": 3028, "time_per_iteration": 3.5576679706573486 }, { "auxiliary_loss_clip": 0.01157106, "auxiliary_loss_mlp": 0.01027279, "balance_loss_clip": 1.0532012, "balance_loss_mlp": 1.01914334, "epoch": 0.3642157157458065, "flos": 17712867081600.0, "grad_norm": 1.7587166543163943, "language_loss": 0.8255623, "learning_rate": 2.938453993641938e-06, "loss": 0.84740615, "num_input_tokens_seen": 65039590, "step": 3029, "time_per_iteration": 3.6306357383728027 }, { "auxiliary_loss_clip": 0.01156142, "auxiliary_loss_mlp": 0.01029695, "balance_loss_clip": 1.05573201, "balance_loss_mlp": 1.02161884, "epoch": 0.36433595863644563, "flos": 17639537466240.0, "grad_norm": 2.1296745401367563, "language_loss": 0.70104009, "learning_rate": 2.937766030074973e-06, "loss": 0.72289848, "num_input_tokens_seen": 65056845, "step": 3030, "time_per_iteration": 3.566763401031494 }, { "auxiliary_loss_clip": 0.01147861, "auxiliary_loss_mlp": 0.01028625, "balance_loss_clip": 1.05311215, "balance_loss_mlp": 1.02033961, "epoch": 0.3644562015270847, "flos": 26833279161600.0, "grad_norm": 1.8888020317987968, "language_loss": 0.82839084, "learning_rate": 2.937077924249646e-06, "loss": 0.85015571, "num_input_tokens_seen": 65079435, "step": 3031, "time_per_iteration": 2.7250640392303467 }, { "auxiliary_loss_clip": 0.01166614, "auxiliary_loss_mlp": 0.01028167, "balance_loss_clip": 1.05492282, "balance_loss_mlp": 1.01976871, "epoch": 0.3645764444177238, "flos": 14282715847680.0, "grad_norm": 1.9883826502064654, "language_loss": 0.7616775, "learning_rate": 2.9363896762703443e-06, "loss": 0.78362525, "num_input_tokens_seen": 65096500, "step": 3032, "time_per_iteration": 3.5440118312835693 }, { "auxiliary_loss_clip": 0.01189461, "auxiliary_loss_mlp": 0.01028307, "balance_loss_clip": 1.05627275, "balance_loss_mlp": 1.01997995, "epoch": 0.3646966873083629, "flos": 20667489137280.0, "grad_norm": 2.312333161307083, "language_loss": 0.84102464, "learning_rate": 2.9357012862414725e-06, "loss": 0.86320233, "num_input_tokens_seen": 65115860, "step": 3033, "time_per_iteration": 2.591538667678833 }, { "auxiliary_loss_clip": 0.0117649, "auxiliary_loss_mlp": 0.01024225, "balance_loss_clip": 1.05780554, "balance_loss_mlp": 1.01644683, "epoch": 0.36481693019900197, "flos": 27782613665280.0, "grad_norm": 2.0735652860985883, "language_loss": 0.71784014, "learning_rate": 2.9350127542674593e-06, "loss": 0.73984724, "num_input_tokens_seen": 65138070, "step": 3034, "time_per_iteration": 2.6977598667144775 }, { "auxiliary_loss_clip": 0.01163807, "auxiliary_loss_mlp": 0.01034084, "balance_loss_clip": 1.05621862, "balance_loss_mlp": 1.02598333, "epoch": 0.3649371730896411, "flos": 19712588025600.0, "grad_norm": 2.318882548224442, "language_loss": 0.76381862, "learning_rate": 2.934324080452755e-06, "loss": 0.78579754, "num_input_tokens_seen": 65155860, "step": 3035, "time_per_iteration": 2.6272716522216797 }, { "auxiliary_loss_clip": 0.01129586, "auxiliary_loss_mlp": 0.00712175, "balance_loss_clip": 1.04729033, "balance_loss_mlp": 1.00065088, "epoch": 0.3650574159802802, "flos": 24750496016640.0, "grad_norm": 1.6140671248614222, "language_loss": 0.78138798, "learning_rate": 2.9336352649018307e-06, "loss": 0.79980558, "num_input_tokens_seen": 65175930, "step": 3036, "time_per_iteration": 2.7120206356048584 }, { "auxiliary_loss_clip": 0.01161856, "auxiliary_loss_mlp": 0.0102966, "balance_loss_clip": 1.05542243, "balance_loss_mlp": 1.02121449, "epoch": 0.36517765887091924, "flos": 32853487363200.0, "grad_norm": 1.9043084617860642, "language_loss": 0.70049161, "learning_rate": 2.9329463077191783e-06, "loss": 0.72240674, "num_input_tokens_seen": 65199305, "step": 3037, "time_per_iteration": 2.7472493648529053 }, { "auxiliary_loss_clip": 0.01124347, "auxiliary_loss_mlp": 0.0102963, "balance_loss_clip": 1.05046153, "balance_loss_mlp": 1.02180934, "epoch": 0.36529790176155835, "flos": 20120318282880.0, "grad_norm": 2.082779886001824, "language_loss": 0.64881927, "learning_rate": 2.9322572090093135e-06, "loss": 0.67035908, "num_input_tokens_seen": 65218010, "step": 3038, "time_per_iteration": 2.72562837600708 }, { "auxiliary_loss_clip": 0.01124351, "auxiliary_loss_mlp": 0.01027662, "balance_loss_clip": 1.04813719, "balance_loss_mlp": 1.01898932, "epoch": 0.36541814465219746, "flos": 17639573379840.0, "grad_norm": 4.909463717865321, "language_loss": 0.76557755, "learning_rate": 2.9315679688767713e-06, "loss": 0.78709769, "num_input_tokens_seen": 65236020, "step": 3039, "time_per_iteration": 2.702448844909668 }, { "auxiliary_loss_clip": 0.01149809, "auxiliary_loss_mlp": 0.01027502, "balance_loss_clip": 1.04942298, "balance_loss_mlp": 1.01970601, "epoch": 0.3655383875428365, "flos": 22674356887680.0, "grad_norm": 1.7476178009647738, "language_loss": 0.66590059, "learning_rate": 2.9308785874261085e-06, "loss": 0.68767381, "num_input_tokens_seen": 65256210, "step": 3040, "time_per_iteration": 2.7192575931549072 }, { "auxiliary_loss_clip": 0.01191712, "auxiliary_loss_mlp": 0.01029697, "balance_loss_clip": 1.05870426, "balance_loss_mlp": 1.02144194, "epoch": 0.36565863043347563, "flos": 21981173247360.0, "grad_norm": 1.676170025358832, "language_loss": 0.81733143, "learning_rate": 2.9301890647619045e-06, "loss": 0.83954549, "num_input_tokens_seen": 65275505, "step": 3041, "time_per_iteration": 2.5719168186187744 }, { "auxiliary_loss_clip": 0.01167178, "auxiliary_loss_mlp": 0.01028929, "balance_loss_clip": 1.05706263, "balance_loss_mlp": 1.01989341, "epoch": 0.36577887332411474, "flos": 24827632473600.0, "grad_norm": 2.9159664132529812, "language_loss": 0.8011272, "learning_rate": 2.929499400988759e-06, "loss": 0.82308829, "num_input_tokens_seen": 65296665, "step": 3042, "time_per_iteration": 2.6724579334259033 }, { "auxiliary_loss_clip": 0.01174031, "auxiliary_loss_mlp": 0.01027153, "balance_loss_clip": 1.05682635, "balance_loss_mlp": 1.0187192, "epoch": 0.3658991162147538, "flos": 28293191539200.0, "grad_norm": 2.0447433367916306, "language_loss": 0.6499815, "learning_rate": 2.9288095962112927e-06, "loss": 0.67199337, "num_input_tokens_seen": 65317370, "step": 3043, "time_per_iteration": 2.646223306655884 }, { "auxiliary_loss_clip": 0.01191755, "auxiliary_loss_mlp": 0.01030531, "balance_loss_clip": 1.05913651, "balance_loss_mlp": 1.02234125, "epoch": 0.3660193591053929, "flos": 17785550252160.0, "grad_norm": 1.8811732335846179, "language_loss": 0.84960115, "learning_rate": 2.9281196505341503e-06, "loss": 0.87182403, "num_input_tokens_seen": 65334540, "step": 3044, "time_per_iteration": 2.59385347366333 }, { "auxiliary_loss_clip": 0.01122426, "auxiliary_loss_mlp": 0.00711611, "balance_loss_clip": 1.05185115, "balance_loss_mlp": 1.00062287, "epoch": 0.36613960199603196, "flos": 10342776839040.0, "grad_norm": 2.003851392401658, "language_loss": 0.78342456, "learning_rate": 2.9274295640619946e-06, "loss": 0.80176497, "num_input_tokens_seen": 65351670, "step": 3045, "time_per_iteration": 2.6584105491638184 }, { "auxiliary_loss_clip": 0.01138909, "auxiliary_loss_mlp": 0.0102649, "balance_loss_clip": 1.04861557, "balance_loss_mlp": 1.01834798, "epoch": 0.36625984488667107, "flos": 19755609540480.0, "grad_norm": 2.166231381791196, "language_loss": 0.78622156, "learning_rate": 2.9267393368995103e-06, "loss": 0.80787551, "num_input_tokens_seen": 65370900, "step": 3046, "time_per_iteration": 2.6641652584075928 }, { "auxiliary_loss_clip": 0.01192525, "auxiliary_loss_mlp": 0.01028887, "balance_loss_clip": 1.05785108, "balance_loss_mlp": 1.02048278, "epoch": 0.3663800877773102, "flos": 17674262939520.0, "grad_norm": 2.309404010270142, "language_loss": 0.74737465, "learning_rate": 2.926048969151407e-06, "loss": 0.76958883, "num_input_tokens_seen": 65388185, "step": 3047, "time_per_iteration": 2.517191171646118 }, { "auxiliary_loss_clip": 0.01122385, "auxiliary_loss_mlp": 0.01024482, "balance_loss_clip": 1.05469322, "balance_loss_mlp": 1.01544034, "epoch": 0.36650033066794924, "flos": 20303606407680.0, "grad_norm": 2.0345294155126115, "language_loss": 0.68600231, "learning_rate": 2.92535846092241e-06, "loss": 0.70747095, "num_input_tokens_seen": 65407200, "step": 3048, "time_per_iteration": 2.6857290267944336 }, { "auxiliary_loss_clip": 0.01160781, "auxiliary_loss_mlp": 0.01027393, "balance_loss_clip": 1.05428529, "balance_loss_mlp": 1.01950121, "epoch": 0.36662057355858835, "flos": 24716237420160.0, "grad_norm": 1.5576928537942714, "language_loss": 0.82793701, "learning_rate": 2.9246678123172704e-06, "loss": 0.84981871, "num_input_tokens_seen": 65427290, "step": 3049, "time_per_iteration": 2.664111614227295 }, { "auxiliary_loss_clip": 0.01191885, "auxiliary_loss_mlp": 0.01027335, "balance_loss_clip": 1.05796576, "balance_loss_mlp": 1.01823354, "epoch": 0.36674081644922746, "flos": 12385267902720.0, "grad_norm": 2.4124601914662986, "language_loss": 0.73925191, "learning_rate": 2.9239770234407596e-06, "loss": 0.76144409, "num_input_tokens_seen": 65445595, "step": 3050, "time_per_iteration": 2.5745809078216553 }, { "auxiliary_loss_clip": 0.0117464, "auxiliary_loss_mlp": 0.01027722, "balance_loss_clip": 1.05450737, "balance_loss_mlp": 1.01916862, "epoch": 0.3668610593398665, "flos": 21105922544640.0, "grad_norm": 1.6193125366825398, "language_loss": 0.68098354, "learning_rate": 2.9232860943976686e-06, "loss": 0.7030071, "num_input_tokens_seen": 65466330, "step": 3051, "time_per_iteration": 2.613581895828247 }, { "auxiliary_loss_clip": 0.01156436, "auxiliary_loss_mlp": 0.01028505, "balance_loss_clip": 1.0537653, "balance_loss_mlp": 1.02070284, "epoch": 0.3669813022305056, "flos": 26758082039040.0, "grad_norm": 1.656037773777531, "language_loss": 0.84162223, "learning_rate": 2.9225950252928115e-06, "loss": 0.86347163, "num_input_tokens_seen": 65487180, "step": 3052, "time_per_iteration": 2.695495367050171 }, { "auxiliary_loss_clip": 0.01175515, "auxiliary_loss_mlp": 0.01026648, "balance_loss_clip": 1.05790424, "balance_loss_mlp": 1.01818395, "epoch": 0.36710154512114473, "flos": 19099521671040.0, "grad_norm": 4.015712091571271, "language_loss": 0.81765223, "learning_rate": 2.9219038162310217e-06, "loss": 0.83967388, "num_input_tokens_seen": 65505380, "step": 3053, "time_per_iteration": 2.6883187294006348 }, { "auxiliary_loss_clip": 0.01099511, "auxiliary_loss_mlp": 0.00712895, "balance_loss_clip": 1.0505085, "balance_loss_mlp": 1.00070107, "epoch": 0.3672217880117838, "flos": 20812029465600.0, "grad_norm": 1.8748952337721356, "language_loss": 0.82468945, "learning_rate": 2.921212467317157e-06, "loss": 0.84281349, "num_input_tokens_seen": 65524825, "step": 3054, "time_per_iteration": 3.709200859069824 }, { "auxiliary_loss_clip": 0.01144283, "auxiliary_loss_mlp": 0.01032822, "balance_loss_clip": 1.05042732, "balance_loss_mlp": 1.02367902, "epoch": 0.3673420309024229, "flos": 13590394133760.0, "grad_norm": 2.0332527924422186, "language_loss": 0.79982495, "learning_rate": 2.920520978656093e-06, "loss": 0.82159603, "num_input_tokens_seen": 65541790, "step": 3055, "time_per_iteration": 3.560563325881958 }, { "auxiliary_loss_clip": 0.01190584, "auxiliary_loss_mlp": 0.00712477, "balance_loss_clip": 1.05842996, "balance_loss_mlp": 1.00067019, "epoch": 0.367462273793062, "flos": 28986877969920.0, "grad_norm": 1.9386096200346272, "language_loss": 0.76632953, "learning_rate": 2.919829350352729e-06, "loss": 0.7853601, "num_input_tokens_seen": 65563395, "step": 3056, "time_per_iteration": 3.6056811809539795 }, { "auxiliary_loss_clip": 0.01110052, "auxiliary_loss_mlp": 0.01003004, "balance_loss_clip": 1.03567958, "balance_loss_mlp": 1.00101316, "epoch": 0.36758251668370107, "flos": 62643148346880.0, "grad_norm": 0.7554730973815339, "language_loss": 0.599828, "learning_rate": 2.919137582511983e-06, "loss": 0.62095857, "num_input_tokens_seen": 65619835, "step": 3057, "time_per_iteration": 3.093973398208618 }, { "auxiliary_loss_clip": 0.01154396, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.05880439, "balance_loss_mlp": 1.02616358, "epoch": 0.3677027595743402, "flos": 12713886455040.0, "grad_norm": 2.2348958418423224, "language_loss": 0.64058065, "learning_rate": 2.918445675238797e-06, "loss": 0.66247487, "num_input_tokens_seen": 65636760, "step": 3058, "time_per_iteration": 3.6049447059631348 }, { "auxiliary_loss_clip": 0.01191404, "auxiliary_loss_mlp": 0.01025384, "balance_loss_clip": 1.05684245, "balance_loss_mlp": 1.0171169, "epoch": 0.36782300246497923, "flos": 25046579825280.0, "grad_norm": 1.6855188611226557, "language_loss": 0.69773972, "learning_rate": 2.917753628638132e-06, "loss": 0.71990764, "num_input_tokens_seen": 65657065, "step": 3059, "time_per_iteration": 2.620924234390259 }, { "auxiliary_loss_clip": 0.01159737, "auxiliary_loss_mlp": 0.01031255, "balance_loss_clip": 1.05552244, "balance_loss_mlp": 1.0222609, "epoch": 0.36794324535561834, "flos": 17419512706560.0, "grad_norm": 2.58915208495644, "language_loss": 0.70477509, "learning_rate": 2.9170614428149716e-06, "loss": 0.72668493, "num_input_tokens_seen": 65675400, "step": 3060, "time_per_iteration": 2.6291558742523193 }, { "auxiliary_loss_clip": 0.01138712, "auxiliary_loss_mlp": 0.01028313, "balance_loss_clip": 1.05367827, "balance_loss_mlp": 1.01897299, "epoch": 0.36806348824625745, "flos": 24089128848000.0, "grad_norm": 3.155065886647056, "language_loss": 0.87127888, "learning_rate": 2.9163691178743195e-06, "loss": 0.8929491, "num_input_tokens_seen": 65694050, "step": 3061, "time_per_iteration": 2.8158681392669678 }, { "auxiliary_loss_clip": 0.01170944, "auxiliary_loss_mlp": 0.01025687, "balance_loss_clip": 1.05551171, "balance_loss_mlp": 1.01761627, "epoch": 0.3681837311368965, "flos": 20521871400960.0, "grad_norm": 1.8949631665562023, "language_loss": 0.77451837, "learning_rate": 2.9156766539212006e-06, "loss": 0.79648465, "num_input_tokens_seen": 65711695, "step": 3062, "time_per_iteration": 2.6576778888702393 }, { "auxiliary_loss_clip": 0.01177158, "auxiliary_loss_mlp": 0.01028502, "balance_loss_clip": 1.05382025, "balance_loss_mlp": 1.01973438, "epoch": 0.3683039740275356, "flos": 21466644877440.0, "grad_norm": 2.7614442765205864, "language_loss": 0.71810973, "learning_rate": 2.9149840510606614e-06, "loss": 0.74016631, "num_input_tokens_seen": 65730350, "step": 3063, "time_per_iteration": 2.646411180496216 }, { "auxiliary_loss_clip": 0.01091713, "auxiliary_loss_mlp": 0.00703768, "balance_loss_clip": 1.03214586, "balance_loss_mlp": 1.00104451, "epoch": 0.36842421691817473, "flos": 70380999987840.0, "grad_norm": 1.0217202926172, "language_loss": 0.64146358, "learning_rate": 2.914291309397769e-06, "loss": 0.6594184, "num_input_tokens_seen": 65787820, "step": 3064, "time_per_iteration": 3.2845301628112793 }, { "auxiliary_loss_clip": 0.01097181, "auxiliary_loss_mlp": 0.0102426, "balance_loss_clip": 1.04542708, "balance_loss_mlp": 1.01538515, "epoch": 0.3685444598088138, "flos": 23331378510720.0, "grad_norm": 2.459147333622027, "language_loss": 0.78012329, "learning_rate": 2.9135984290376117e-06, "loss": 0.80133772, "num_input_tokens_seen": 65806685, "step": 3065, "time_per_iteration": 2.762345314025879 }, { "auxiliary_loss_clip": 0.01107163, "auxiliary_loss_mlp": 0.01031412, "balance_loss_clip": 1.04807329, "balance_loss_mlp": 1.02347887, "epoch": 0.3686647026994529, "flos": 23070271570560.0, "grad_norm": 2.713695747700567, "language_loss": 0.82956505, "learning_rate": 2.9129054100853e-06, "loss": 0.85095084, "num_input_tokens_seen": 65825525, "step": 3066, "time_per_iteration": 2.830648183822632 }, { "auxiliary_loss_clip": 0.01161227, "auxiliary_loss_mlp": 0.01031836, "balance_loss_clip": 1.05320132, "balance_loss_mlp": 1.02232289, "epoch": 0.368784945590092, "flos": 25119909440640.0, "grad_norm": 1.822793129207491, "language_loss": 0.75733006, "learning_rate": 2.912212252645963e-06, "loss": 0.77926064, "num_input_tokens_seen": 65848110, "step": 3067, "time_per_iteration": 2.649411201477051 }, { "auxiliary_loss_clip": 0.01181598, "auxiliary_loss_mlp": 0.01029039, "balance_loss_clip": 1.0558821, "balance_loss_mlp": 1.02013421, "epoch": 0.36890518848073106, "flos": 18442284566400.0, "grad_norm": 2.240120431476763, "language_loss": 0.76744604, "learning_rate": 2.9115189568247523e-06, "loss": 0.78955239, "num_input_tokens_seen": 65865670, "step": 3068, "time_per_iteration": 2.6189990043640137 }, { "auxiliary_loss_clip": 0.01122624, "auxiliary_loss_mlp": 0.01029396, "balance_loss_clip": 1.05716681, "balance_loss_mlp": 1.02083731, "epoch": 0.36902543137137017, "flos": 16362446336640.0, "grad_norm": 2.162406533532617, "language_loss": 0.92423069, "learning_rate": 2.910825522726841e-06, "loss": 0.94575083, "num_input_tokens_seen": 65883195, "step": 3069, "time_per_iteration": 2.7333905696868896 }, { "auxiliary_loss_clip": 0.011206, "auxiliary_loss_mlp": 0.01030334, "balance_loss_clip": 1.04788613, "balance_loss_mlp": 1.02191234, "epoch": 0.3691456742620093, "flos": 12275596702080.0, "grad_norm": 2.1003299218709564, "language_loss": 0.7678259, "learning_rate": 2.9101319504574215e-06, "loss": 0.78933519, "num_input_tokens_seen": 65899635, "step": 3070, "time_per_iteration": 2.7155075073242188 }, { "auxiliary_loss_clip": 0.01165054, "auxiliary_loss_mlp": 0.01030544, "balance_loss_clip": 1.05108905, "balance_loss_mlp": 1.02166295, "epoch": 0.36926591715264834, "flos": 17786412178560.0, "grad_norm": 3.7453612244079877, "language_loss": 0.76537371, "learning_rate": 2.909438240121709e-06, "loss": 0.78732979, "num_input_tokens_seen": 65919910, "step": 3071, "time_per_iteration": 2.6594135761260986 }, { "auxiliary_loss_clip": 0.01154939, "auxiliary_loss_mlp": 0.01029481, "balance_loss_clip": 1.05514371, "balance_loss_mlp": 1.02074885, "epoch": 0.36938616004328745, "flos": 28948309741440.0, "grad_norm": 2.7788156132844213, "language_loss": 0.69984925, "learning_rate": 2.908744391824939e-06, "loss": 0.7216934, "num_input_tokens_seen": 65940930, "step": 3072, "time_per_iteration": 2.7466835975646973 }, { "auxiliary_loss_clip": 0.01115183, "auxiliary_loss_mlp": 0.01031282, "balance_loss_clip": 1.04909205, "balance_loss_mlp": 1.02314579, "epoch": 0.36950640293392656, "flos": 29205394358400.0, "grad_norm": 3.007328997622974, "language_loss": 0.79000103, "learning_rate": 2.908050405672367e-06, "loss": 0.81146562, "num_input_tokens_seen": 65960475, "step": 3073, "time_per_iteration": 2.8128180503845215 }, { "auxiliary_loss_clip": 0.01162455, "auxiliary_loss_mlp": 0.01028551, "balance_loss_clip": 1.0493176, "balance_loss_mlp": 1.01996183, "epoch": 0.3696266458245656, "flos": 24827776128000.0, "grad_norm": 2.5271632960113255, "language_loss": 0.79467189, "learning_rate": 2.9073562817692703e-06, "loss": 0.81658196, "num_input_tokens_seen": 65979160, "step": 3074, "time_per_iteration": 2.6938955783843994 }, { "auxiliary_loss_clip": 0.01053575, "auxiliary_loss_mlp": 0.01010516, "balance_loss_clip": 1.02613175, "balance_loss_mlp": 1.00834656, "epoch": 0.3697468887152047, "flos": 59887257264000.0, "grad_norm": 0.7212522363313644, "language_loss": 0.56520683, "learning_rate": 2.9066620202209468e-06, "loss": 0.58584774, "num_input_tokens_seen": 66041650, "step": 3075, "time_per_iteration": 3.233861207962036 }, { "auxiliary_loss_clip": 0.01130516, "auxiliary_loss_mlp": 0.01028167, "balance_loss_clip": 1.04842162, "balance_loss_mlp": 1.01953065, "epoch": 0.3698671316058438, "flos": 26137581569280.0, "grad_norm": 1.9444120720796991, "language_loss": 0.77646798, "learning_rate": 2.905967621132716e-06, "loss": 0.79805481, "num_input_tokens_seen": 66059260, "step": 3076, "time_per_iteration": 2.6938765048980713 }, { "auxiliary_loss_clip": 0.01160883, "auxiliary_loss_mlp": 0.01029632, "balance_loss_clip": 1.0519731, "balance_loss_mlp": 1.0207094, "epoch": 0.3699873744964829, "flos": 24607464059520.0, "grad_norm": 1.8506424548486902, "language_loss": 0.7483809, "learning_rate": 2.9052730846099172e-06, "loss": 0.77028602, "num_input_tokens_seen": 66080605, "step": 3077, "time_per_iteration": 2.6712779998779297 }, { "auxiliary_loss_clip": 0.01076561, "auxiliary_loss_mlp": 0.01004301, "balance_loss_clip": 1.0306114, "balance_loss_mlp": 1.00257206, "epoch": 0.370107617387122, "flos": 64885340050560.0, "grad_norm": 0.8550357477471752, "language_loss": 0.60883951, "learning_rate": 2.9045784107579123e-06, "loss": 0.62964809, "num_input_tokens_seen": 66140710, "step": 3078, "time_per_iteration": 3.237579345703125 }, { "auxiliary_loss_clip": 0.01188198, "auxiliary_loss_mlp": 0.01031958, "balance_loss_clip": 1.05513906, "balance_loss_mlp": 1.02334535, "epoch": 0.37022786027776106, "flos": 15961683317760.0, "grad_norm": 1.8595686787963932, "language_loss": 0.67045635, "learning_rate": 2.9038835996820807e-06, "loss": 0.69265795, "num_input_tokens_seen": 66158320, "step": 3079, "time_per_iteration": 2.581883192062378 }, { "auxiliary_loss_clip": 0.01143626, "auxiliary_loss_mlp": 0.01034149, "balance_loss_clip": 1.04743266, "balance_loss_mlp": 1.02552414, "epoch": 0.37034810316840017, "flos": 18546927863040.0, "grad_norm": 1.848741829189184, "language_loss": 0.79400998, "learning_rate": 2.903188651487826e-06, "loss": 0.81578767, "num_input_tokens_seen": 66176875, "step": 3080, "time_per_iteration": 3.633876085281372 }, { "auxiliary_loss_clip": 0.01176917, "auxiliary_loss_mlp": 0.01038559, "balance_loss_clip": 1.05492246, "balance_loss_mlp": 1.02973175, "epoch": 0.3704683460590393, "flos": 17821927751040.0, "grad_norm": 2.439475197535939, "language_loss": 0.8634994, "learning_rate": 2.902493566280571e-06, "loss": 0.88565421, "num_input_tokens_seen": 66194980, "step": 3081, "time_per_iteration": 3.4817750453948975 }, { "auxiliary_loss_clip": 0.01157442, "auxiliary_loss_mlp": 0.01030352, "balance_loss_clip": 1.0541594, "balance_loss_mlp": 1.02135134, "epoch": 0.37058858894967833, "flos": 14134081368960.0, "grad_norm": 2.189756493945843, "language_loss": 0.81669915, "learning_rate": 2.9017983441657595e-06, "loss": 0.83857703, "num_input_tokens_seen": 66212310, "step": 3082, "time_per_iteration": 3.565948486328125 }, { "auxiliary_loss_clip": 0.01125556, "auxiliary_loss_mlp": 0.01029896, "balance_loss_clip": 1.04706526, "balance_loss_mlp": 1.02132499, "epoch": 0.37070883184031744, "flos": 13954492344960.0, "grad_norm": 2.1141230269211797, "language_loss": 0.75708586, "learning_rate": 2.9011029852488564e-06, "loss": 0.77864039, "num_input_tokens_seen": 66229545, "step": 3083, "time_per_iteration": 3.6867053508758545 }, { "auxiliary_loss_clip": 0.01099981, "auxiliary_loss_mlp": 0.01001765, "balance_loss_clip": 1.02901316, "balance_loss_mlp": 0.99996549, "epoch": 0.37082907473095655, "flos": 52315419306240.0, "grad_norm": 0.9953645983421067, "language_loss": 0.62476158, "learning_rate": 2.9004074896353465e-06, "loss": 0.64577901, "num_input_tokens_seen": 66283545, "step": 3084, "time_per_iteration": 3.159785270690918 }, { "auxiliary_loss_clip": 0.01189717, "auxiliary_loss_mlp": 0.01030332, "balance_loss_clip": 1.05873954, "balance_loss_mlp": 1.02206159, "epoch": 0.3709493176215956, "flos": 15998096730240.0, "grad_norm": 2.3021300997247076, "language_loss": 0.81495917, "learning_rate": 2.8997118574307362e-06, "loss": 0.83715963, "num_input_tokens_seen": 66300500, "step": 3085, "time_per_iteration": 2.597669839859009 }, { "auxiliary_loss_clip": 0.01150198, "auxiliary_loss_mlp": 0.01027501, "balance_loss_clip": 1.05414438, "balance_loss_mlp": 1.01909721, "epoch": 0.3710695605122347, "flos": 20959837931520.0, "grad_norm": 2.0952094426039145, "language_loss": 0.74305856, "learning_rate": 2.899016088740553e-06, "loss": 0.7648356, "num_input_tokens_seen": 66318610, "step": 3086, "time_per_iteration": 2.679699420928955 }, { "auxiliary_loss_clip": 0.01125655, "auxiliary_loss_mlp": 0.01028321, "balance_loss_clip": 1.05003476, "balance_loss_mlp": 1.01987493, "epoch": 0.37118980340287383, "flos": 14355578586240.0, "grad_norm": 2.778882366760685, "language_loss": 0.79286242, "learning_rate": 2.898320183670344e-06, "loss": 0.81440222, "num_input_tokens_seen": 66336025, "step": 3087, "time_per_iteration": 2.714003801345825 }, { "auxiliary_loss_clip": 0.01126895, "auxiliary_loss_mlp": 0.01031778, "balance_loss_clip": 1.05450106, "balance_loss_mlp": 1.02310586, "epoch": 0.3713100462935129, "flos": 25885381201920.0, "grad_norm": 4.152167528198494, "language_loss": 0.88912046, "learning_rate": 2.8976241423256767e-06, "loss": 0.91070724, "num_input_tokens_seen": 66356120, "step": 3088, "time_per_iteration": 2.8201260566711426 }, { "auxiliary_loss_clip": 0.01151025, "auxiliary_loss_mlp": 0.01027692, "balance_loss_clip": 1.05104136, "balance_loss_mlp": 1.01935327, "epoch": 0.371430289184152, "flos": 30518934814080.0, "grad_norm": 2.8085432828654318, "language_loss": 0.68085223, "learning_rate": 2.896927964812142e-06, "loss": 0.70263946, "num_input_tokens_seen": 66376685, "step": 3089, "time_per_iteration": 2.7263481616973877 }, { "auxiliary_loss_clip": 0.01162829, "auxiliary_loss_mlp": 0.01027086, "balance_loss_clip": 1.05863345, "balance_loss_mlp": 1.01844954, "epoch": 0.37155053207479105, "flos": 15742233175680.0, "grad_norm": 2.3082452847328634, "language_loss": 0.74659395, "learning_rate": 2.8962316512353465e-06, "loss": 0.76849318, "num_input_tokens_seen": 66394230, "step": 3090, "time_per_iteration": 2.6737148761749268 }, { "auxiliary_loss_clip": 0.01104113, "auxiliary_loss_mlp": 0.01029365, "balance_loss_clip": 1.04656005, "balance_loss_mlp": 1.02108002, "epoch": 0.37167077496543016, "flos": 23404061681280.0, "grad_norm": 1.5063155097122922, "language_loss": 0.74967003, "learning_rate": 2.8955352017009233e-06, "loss": 0.7710048, "num_input_tokens_seen": 66413475, "step": 3091, "time_per_iteration": 2.7723007202148438 }, { "auxiliary_loss_clip": 0.01159084, "auxiliary_loss_mlp": 0.01033993, "balance_loss_clip": 1.05702758, "balance_loss_mlp": 1.02550495, "epoch": 0.3717910178560693, "flos": 22088653718400.0, "grad_norm": 3.208973878725219, "language_loss": 0.77305877, "learning_rate": 2.8948386163145212e-06, "loss": 0.79498953, "num_input_tokens_seen": 66432685, "step": 3092, "time_per_iteration": 2.6979775428771973 }, { "auxiliary_loss_clip": 0.0117664, "auxiliary_loss_mlp": 0.01028234, "balance_loss_clip": 1.05535507, "balance_loss_mlp": 1.01985323, "epoch": 0.3719112607467083, "flos": 26939969533440.0, "grad_norm": 1.9575377448840703, "language_loss": 0.79355061, "learning_rate": 2.8941418951818135e-06, "loss": 0.81559932, "num_input_tokens_seen": 66452245, "step": 3093, "time_per_iteration": 2.6248528957366943 }, { "auxiliary_loss_clip": 0.01141481, "auxiliary_loss_mlp": 0.01026993, "balance_loss_clip": 1.04959238, "balance_loss_mlp": 1.01886845, "epoch": 0.37203150363734744, "flos": 12166500119040.0, "grad_norm": 3.16810787410192, "language_loss": 0.70816803, "learning_rate": 2.8934450384084903e-06, "loss": 0.72985274, "num_input_tokens_seen": 66469760, "step": 3094, "time_per_iteration": 2.727132558822632 }, { "auxiliary_loss_clip": 0.01149196, "auxiliary_loss_mlp": 0.01027978, "balance_loss_clip": 1.05041242, "balance_loss_mlp": 1.01943052, "epoch": 0.37215174652798655, "flos": 23697595624320.0, "grad_norm": 2.024427394221537, "language_loss": 0.69364738, "learning_rate": 2.8927480461002653e-06, "loss": 0.71541911, "num_input_tokens_seen": 66489730, "step": 3095, "time_per_iteration": 2.7058284282684326 }, { "auxiliary_loss_clip": 0.0115444, "auxiliary_loss_mlp": 0.01034339, "balance_loss_clip": 1.05035162, "balance_loss_mlp": 1.02575564, "epoch": 0.3722719894186256, "flos": 17887751424000.0, "grad_norm": 2.662377779207516, "language_loss": 0.85855746, "learning_rate": 2.892050918362872e-06, "loss": 0.88044524, "num_input_tokens_seen": 66504785, "step": 3096, "time_per_iteration": 2.651454210281372 }, { "auxiliary_loss_clip": 0.01017847, "auxiliary_loss_mlp": 0.01004864, "balance_loss_clip": 1.02513933, "balance_loss_mlp": 1.00286102, "epoch": 0.3723922323092647, "flos": 62419891363200.0, "grad_norm": 1.4199087103164834, "language_loss": 0.55993998, "learning_rate": 2.8913536553020626e-06, "loss": 0.58016706, "num_input_tokens_seen": 66558840, "step": 3097, "time_per_iteration": 3.370059013366699 }, { "auxiliary_loss_clip": 0.01115458, "auxiliary_loss_mlp": 0.01027606, "balance_loss_clip": 1.04728293, "balance_loss_mlp": 1.01904726, "epoch": 0.3725124751999038, "flos": 23039747988480.0, "grad_norm": 1.918575819895311, "language_loss": 0.84577364, "learning_rate": 2.8906562570236137e-06, "loss": 0.86720425, "num_input_tokens_seen": 66576750, "step": 3098, "time_per_iteration": 2.868943452835083 }, { "auxiliary_loss_clip": 0.01107713, "auxiliary_loss_mlp": 0.01022711, "balance_loss_clip": 1.046, "balance_loss_mlp": 1.01461124, "epoch": 0.3726327180905429, "flos": 20920551431040.0, "grad_norm": 1.7386835466595598, "language_loss": 0.76402092, "learning_rate": 2.889958723633318e-06, "loss": 0.78532517, "num_input_tokens_seen": 66595690, "step": 3099, "time_per_iteration": 2.8032431602478027 }, { "auxiliary_loss_clip": 0.01140133, "auxiliary_loss_mlp": 0.01029969, "balance_loss_clip": 1.05101895, "balance_loss_mlp": 1.02227378, "epoch": 0.372752960981182, "flos": 30592156688640.0, "grad_norm": 1.746736217902641, "language_loss": 0.73958111, "learning_rate": 2.889261055236992e-06, "loss": 0.76128215, "num_input_tokens_seen": 66617905, "step": 3100, "time_per_iteration": 2.7452011108398438 }, { "auxiliary_loss_clip": 0.011561, "auxiliary_loss_mlp": 0.0103329, "balance_loss_clip": 1.0549345, "balance_loss_mlp": 1.02483225, "epoch": 0.3728732038718211, "flos": 25116749043840.0, "grad_norm": 1.9276662055399867, "language_loss": 0.82494152, "learning_rate": 2.8885632519404704e-06, "loss": 0.84683537, "num_input_tokens_seen": 66638175, "step": 3101, "time_per_iteration": 2.7217278480529785 }, { "auxiliary_loss_clip": 0.01154978, "auxiliary_loss_mlp": 0.01032237, "balance_loss_clip": 1.0531683, "balance_loss_mlp": 1.02376699, "epoch": 0.37299344676246016, "flos": 25302048330240.0, "grad_norm": 2.2115395255093926, "language_loss": 0.75491053, "learning_rate": 2.8878653138496107e-06, "loss": 0.77678269, "num_input_tokens_seen": 66658670, "step": 3102, "time_per_iteration": 2.683567523956299 }, { "auxiliary_loss_clip": 0.01106317, "auxiliary_loss_mlp": 0.01028036, "balance_loss_clip": 1.04541183, "balance_loss_mlp": 1.01985836, "epoch": 0.37311368965309927, "flos": 23842531002240.0, "grad_norm": 2.6647251683852886, "language_loss": 0.76572949, "learning_rate": 2.8871672410702878e-06, "loss": 0.78707302, "num_input_tokens_seen": 66676030, "step": 3103, "time_per_iteration": 2.885307788848877 }, { "auxiliary_loss_clip": 0.01150124, "auxiliary_loss_mlp": 0.01026331, "balance_loss_clip": 1.05127227, "balance_loss_mlp": 1.01761723, "epoch": 0.3732339325437384, "flos": 25811943845760.0, "grad_norm": 1.856583736333337, "language_loss": 0.82653725, "learning_rate": 2.8864690337084008e-06, "loss": 0.84830177, "num_input_tokens_seen": 66695305, "step": 3104, "time_per_iteration": 2.7724757194519043 }, { "auxiliary_loss_clip": 0.01167081, "auxiliary_loss_mlp": 0.01026055, "balance_loss_clip": 1.05165863, "balance_loss_mlp": 1.01797867, "epoch": 0.37335417543437743, "flos": 26208433146240.0, "grad_norm": 1.8476770143922652, "language_loss": 0.7813623, "learning_rate": 2.885770691869866e-06, "loss": 0.8032937, "num_input_tokens_seen": 66716185, "step": 3105, "time_per_iteration": 2.688523769378662 }, { "auxiliary_loss_clip": 0.01170471, "auxiliary_loss_mlp": 0.01028377, "balance_loss_clip": 1.0544796, "balance_loss_mlp": 1.02000833, "epoch": 0.37347441832501654, "flos": 24023879792640.0, "grad_norm": 3.0043264436273223, "language_loss": 0.74311721, "learning_rate": 2.8850722156606207e-06, "loss": 0.76510572, "num_input_tokens_seen": 66734575, "step": 3106, "time_per_iteration": 3.685331106185913 }, { "auxiliary_loss_clip": 0.01164807, "auxiliary_loss_mlp": 0.0102888, "balance_loss_clip": 1.05222964, "balance_loss_mlp": 1.02033901, "epoch": 0.3735946612156556, "flos": 19714922409600.0, "grad_norm": 1.5665808343817207, "language_loss": 0.67045921, "learning_rate": 2.8843736051866252e-06, "loss": 0.69239616, "num_input_tokens_seen": 66753500, "step": 3107, "time_per_iteration": 3.5912632942199707 }, { "auxiliary_loss_clip": 0.01121407, "auxiliary_loss_mlp": 0.00713311, "balance_loss_clip": 1.0481683, "balance_loss_mlp": 1.00081348, "epoch": 0.3737149041062947, "flos": 23039604334080.0, "grad_norm": 2.123075876553819, "language_loss": 0.69236267, "learning_rate": 2.8836748605538557e-06, "loss": 0.71070987, "num_input_tokens_seen": 66775140, "step": 3108, "time_per_iteration": 3.8259267807006836 }, { "auxiliary_loss_clip": 0.01161381, "auxiliary_loss_mlp": 0.01030635, "balance_loss_clip": 1.05227828, "balance_loss_mlp": 1.02237332, "epoch": 0.3738351469969338, "flos": 34678108483200.0, "grad_norm": 4.790602123073358, "language_loss": 0.63267905, "learning_rate": 2.882975981868313e-06, "loss": 0.65459919, "num_input_tokens_seen": 66795525, "step": 3109, "time_per_iteration": 3.7009029388427734 }, { "auxiliary_loss_clip": 0.01172489, "auxiliary_loss_mlp": 0.01032003, "balance_loss_clip": 1.05570984, "balance_loss_mlp": 1.02277017, "epoch": 0.3739553898875729, "flos": 43507967448960.0, "grad_norm": 2.744830460563377, "language_loss": 0.6858983, "learning_rate": 2.882276969236016e-06, "loss": 0.7079432, "num_input_tokens_seen": 66816885, "step": 3110, "time_per_iteration": 2.8644216060638428 }, { "auxiliary_loss_clip": 0.01151948, "auxiliary_loss_mlp": 0.01038855, "balance_loss_clip": 1.05064654, "balance_loss_mlp": 1.03030145, "epoch": 0.374075632778212, "flos": 12856487448960.0, "grad_norm": 2.287909777742296, "language_loss": 0.76416671, "learning_rate": 2.881577822763005e-06, "loss": 0.78607476, "num_input_tokens_seen": 66834835, "step": 3111, "time_per_iteration": 2.6240971088409424 }, { "auxiliary_loss_clip": 0.01170875, "auxiliary_loss_mlp": 0.01028319, "balance_loss_clip": 1.05295146, "balance_loss_mlp": 1.02019501, "epoch": 0.3741958756688511, "flos": 26024031699840.0, "grad_norm": 1.8201834576477647, "language_loss": 0.87842703, "learning_rate": 2.880878542555338e-06, "loss": 0.900419, "num_input_tokens_seen": 66852600, "step": 3112, "time_per_iteration": 2.629387855529785 }, { "auxiliary_loss_clip": 0.01191618, "auxiliary_loss_mlp": 0.01030828, "balance_loss_clip": 1.05670547, "balance_loss_mlp": 1.02269185, "epoch": 0.37431611855949015, "flos": 21433894652160.0, "grad_norm": 2.8824344732319642, "language_loss": 0.80394369, "learning_rate": 2.8801791287190976e-06, "loss": 0.82616812, "num_input_tokens_seen": 66870595, "step": 3113, "time_per_iteration": 2.5961415767669678 }, { "auxiliary_loss_clip": 0.01175798, "auxiliary_loss_mlp": 0.01030875, "balance_loss_clip": 1.05419564, "balance_loss_mlp": 1.02196443, "epoch": 0.37443636145012926, "flos": 24207096090240.0, "grad_norm": 3.040025147259029, "language_loss": 0.86005104, "learning_rate": 2.8794795813603817e-06, "loss": 0.88211775, "num_input_tokens_seen": 66886060, "step": 3114, "time_per_iteration": 2.6489810943603516 }, { "auxiliary_loss_clip": 0.01174697, "auxiliary_loss_mlp": 0.01033605, "balance_loss_clip": 1.05175114, "balance_loss_mlp": 1.02525425, "epoch": 0.3745566043407684, "flos": 15378601841280.0, "grad_norm": 2.3433124967395464, "language_loss": 0.81461614, "learning_rate": 2.878779900585314e-06, "loss": 0.83669913, "num_input_tokens_seen": 66903900, "step": 3115, "time_per_iteration": 2.647555351257324 }, { "auxiliary_loss_clip": 0.01163243, "auxiliary_loss_mlp": 0.01030816, "balance_loss_clip": 1.05331945, "balance_loss_mlp": 1.02270365, "epoch": 0.37467684723140743, "flos": 24608218245120.0, "grad_norm": 1.7647439379612844, "language_loss": 0.75385165, "learning_rate": 2.8780800865000336e-06, "loss": 0.77579224, "num_input_tokens_seen": 66925210, "step": 3116, "time_per_iteration": 2.7706809043884277 }, { "auxiliary_loss_clip": 0.01086599, "auxiliary_loss_mlp": 0.01011796, "balance_loss_clip": 1.02653825, "balance_loss_mlp": 1.00997198, "epoch": 0.37479709012204654, "flos": 64377491610240.0, "grad_norm": 0.976136138146838, "language_loss": 0.59136868, "learning_rate": 2.877380139210702e-06, "loss": 0.61235261, "num_input_tokens_seen": 66983880, "step": 3117, "time_per_iteration": 3.1428215503692627 }, { "auxiliary_loss_clip": 0.0114286, "auxiliary_loss_mlp": 0.01034452, "balance_loss_clip": 1.05061388, "balance_loss_mlp": 1.02527332, "epoch": 0.37491733301268565, "flos": 23803962773760.0, "grad_norm": 1.60821603116303, "language_loss": 0.76285011, "learning_rate": 2.876680058823501e-06, "loss": 0.78462327, "num_input_tokens_seen": 67004280, "step": 3118, "time_per_iteration": 2.7412052154541016 }, { "auxiliary_loss_clip": 0.0114815, "auxiliary_loss_mlp": 0.01025509, "balance_loss_clip": 1.05148304, "balance_loss_mlp": 1.0169971, "epoch": 0.3750375759033247, "flos": 32160950167680.0, "grad_norm": 1.7990664475459606, "language_loss": 0.66126871, "learning_rate": 2.8759798454446314e-06, "loss": 0.68300527, "num_input_tokens_seen": 67027445, "step": 3119, "time_per_iteration": 2.7047605514526367 }, { "auxiliary_loss_clip": 0.01177063, "auxiliary_loss_mlp": 0.01031657, "balance_loss_clip": 1.05594647, "balance_loss_mlp": 1.02286541, "epoch": 0.3751578187939638, "flos": 23367791923200.0, "grad_norm": 2.108996195736057, "language_loss": 0.81333244, "learning_rate": 2.8752794991803173e-06, "loss": 0.83541965, "num_input_tokens_seen": 67045130, "step": 3120, "time_per_iteration": 2.692551374435425 }, { "auxiliary_loss_clip": 0.01152663, "auxiliary_loss_mlp": 0.0102953, "balance_loss_clip": 1.05225897, "balance_loss_mlp": 1.02138209, "epoch": 0.37527806168460287, "flos": 14605731878400.0, "grad_norm": 4.8598351080476645, "language_loss": 0.75410724, "learning_rate": 2.8745790201367976e-06, "loss": 0.77592921, "num_input_tokens_seen": 67060885, "step": 3121, "time_per_iteration": 2.6004817485809326 }, { "auxiliary_loss_clip": 0.01193657, "auxiliary_loss_mlp": 0.01026285, "balance_loss_clip": 1.05748999, "balance_loss_mlp": 1.01809573, "epoch": 0.375398304575242, "flos": 26390823431040.0, "grad_norm": 8.689669711431781, "language_loss": 0.84105778, "learning_rate": 2.8738784084203373e-06, "loss": 0.86325723, "num_input_tokens_seen": 67080960, "step": 3122, "time_per_iteration": 2.6232948303222656 }, { "auxiliary_loss_clip": 0.01148979, "auxiliary_loss_mlp": 0.01031406, "balance_loss_clip": 1.04946232, "balance_loss_mlp": 1.02377629, "epoch": 0.3755185474658811, "flos": 22236605838720.0, "grad_norm": 1.678126482681455, "language_loss": 0.78782797, "learning_rate": 2.873177664137216e-06, "loss": 0.80963176, "num_input_tokens_seen": 67101890, "step": 3123, "time_per_iteration": 2.655071973800659 }, { "auxiliary_loss_clip": 0.01140526, "auxiliary_loss_mlp": 0.0102809, "balance_loss_clip": 1.05355263, "balance_loss_mlp": 1.01981723, "epoch": 0.37563879035652015, "flos": 30812935633920.0, "grad_norm": 2.96380118548156, "language_loss": 0.6898278, "learning_rate": 2.8724767873937384e-06, "loss": 0.711514, "num_input_tokens_seen": 67126010, "step": 3124, "time_per_iteration": 2.7801506519317627 }, { "auxiliary_loss_clip": 0.01160212, "auxiliary_loss_mlp": 0.01028, "balance_loss_clip": 1.05434811, "balance_loss_mlp": 1.01980448, "epoch": 0.37575903324715926, "flos": 20773533064320.0, "grad_norm": 2.4717660182763432, "language_loss": 0.87581658, "learning_rate": 2.871775778296225e-06, "loss": 0.89769864, "num_input_tokens_seen": 67143100, "step": 3125, "time_per_iteration": 2.6241438388824463 }, { "auxiliary_loss_clip": 0.01176883, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.05758238, "balance_loss_mlp": 1.02247262, "epoch": 0.37587927613779837, "flos": 18697681244160.0, "grad_norm": 2.373751978214288, "language_loss": 0.78287208, "learning_rate": 2.8710746369510196e-06, "loss": 0.80495703, "num_input_tokens_seen": 67161085, "step": 3126, "time_per_iteration": 2.7338309288024902 }, { "auxiliary_loss_clip": 0.01149654, "auxiliary_loss_mlp": 0.01026982, "balance_loss_clip": 1.05015779, "balance_loss_mlp": 1.01858401, "epoch": 0.3759995190284374, "flos": 13624796384640.0, "grad_norm": 2.4160013027197302, "language_loss": 0.83428925, "learning_rate": 2.8703733634644846e-06, "loss": 0.85605562, "num_input_tokens_seen": 67175840, "step": 3127, "time_per_iteration": 2.636295795440674 }, { "auxiliary_loss_clip": 0.01189306, "auxiliary_loss_mlp": 0.01027667, "balance_loss_clip": 1.05717504, "balance_loss_mlp": 1.01974893, "epoch": 0.37611976191907653, "flos": 20484847457280.0, "grad_norm": 1.693918887364713, "language_loss": 0.79223943, "learning_rate": 2.869671957943002e-06, "loss": 0.8144092, "num_input_tokens_seen": 67194995, "step": 3128, "time_per_iteration": 2.5805752277374268 }, { "auxiliary_loss_clip": 0.01153573, "auxiliary_loss_mlp": 0.01030404, "balance_loss_clip": 1.0569768, "balance_loss_mlp": 1.02180886, "epoch": 0.37624000480971564, "flos": 21141797253120.0, "grad_norm": 1.9162288947854753, "language_loss": 0.74117535, "learning_rate": 2.8689704204929747e-06, "loss": 0.76301515, "num_input_tokens_seen": 67214175, "step": 3129, "time_per_iteration": 2.6837077140808105 }, { "auxiliary_loss_clip": 0.01189372, "auxiliary_loss_mlp": 0.01026594, "balance_loss_clip": 1.05614567, "balance_loss_mlp": 1.01821375, "epoch": 0.3763602477003547, "flos": 22564470205440.0, "grad_norm": 1.9190853664446121, "language_loss": 0.80658722, "learning_rate": 2.8682687512208253e-06, "loss": 0.82874691, "num_input_tokens_seen": 67233185, "step": 3130, "time_per_iteration": 2.561455011367798 }, { "auxiliary_loss_clip": 0.01179009, "auxiliary_loss_mlp": 0.01036874, "balance_loss_clip": 1.05476654, "balance_loss_mlp": 1.02869654, "epoch": 0.3764804905909938, "flos": 27526857851520.0, "grad_norm": 2.6326476064979296, "language_loss": 0.80323112, "learning_rate": 2.8675669502329972e-06, "loss": 0.82538986, "num_input_tokens_seen": 67254715, "step": 3131, "time_per_iteration": 2.656081199645996 }, { "auxiliary_loss_clip": 0.01170797, "auxiliary_loss_mlp": 0.00712282, "balance_loss_clip": 1.0517019, "balance_loss_mlp": 1.00067997, "epoch": 0.3766007334816329, "flos": 22528092706560.0, "grad_norm": 2.505406922485489, "language_loss": 0.85663384, "learning_rate": 2.866865017635952e-06, "loss": 0.87546456, "num_input_tokens_seen": 67272535, "step": 3132, "time_per_iteration": 3.480705499649048 }, { "auxiliary_loss_clip": 0.01144276, "auxiliary_loss_mlp": 0.01035415, "balance_loss_clip": 1.05637515, "balance_loss_mlp": 1.0265398, "epoch": 0.376720976372272, "flos": 25957166532480.0, "grad_norm": 4.630830822527213, "language_loss": 0.794025, "learning_rate": 2.866162953536174e-06, "loss": 0.81582189, "num_input_tokens_seen": 67293505, "step": 3133, "time_per_iteration": 3.6362690925598145 }, { "auxiliary_loss_clip": 0.01157092, "auxiliary_loss_mlp": 0.00711808, "balance_loss_clip": 1.05309343, "balance_loss_mlp": 1.0008204, "epoch": 0.3768412192629111, "flos": 18041162411520.0, "grad_norm": 1.7112012148113733, "language_loss": 0.745951, "learning_rate": 2.8654607580401634e-06, "loss": 0.76463997, "num_input_tokens_seen": 67313240, "step": 3134, "time_per_iteration": 3.5843441486358643 }, { "auxiliary_loss_clip": 0.01084538, "auxiliary_loss_mlp": 0.01001579, "balance_loss_clip": 1.02760386, "balance_loss_mlp": 0.9997552, "epoch": 0.3769614621535502, "flos": 62989472304000.0, "grad_norm": 0.8779342726046869, "language_loss": 0.65133381, "learning_rate": 2.8647584312544446e-06, "loss": 0.67219496, "num_input_tokens_seen": 67378445, "step": 3135, "time_per_iteration": 3.1911232471466064 }, { "auxiliary_loss_clip": 0.01135236, "auxiliary_loss_mlp": 0.00712058, "balance_loss_clip": 1.04924655, "balance_loss_mlp": 1.00070953, "epoch": 0.37708170504418925, "flos": 23661685002240.0, "grad_norm": 1.4772510766443026, "language_loss": 0.85046303, "learning_rate": 2.864055973285559e-06, "loss": 0.86893594, "num_input_tokens_seen": 67400445, "step": 3136, "time_per_iteration": 3.6656665802001953 }, { "auxiliary_loss_clip": 0.01145054, "auxiliary_loss_mlp": 0.0102385, "balance_loss_clip": 1.0507021, "balance_loss_mlp": 1.01615477, "epoch": 0.37720194793482836, "flos": 24423170353920.0, "grad_norm": 1.928637039168916, "language_loss": 0.86600542, "learning_rate": 2.8633533842400698e-06, "loss": 0.88769448, "num_input_tokens_seen": 67420645, "step": 3137, "time_per_iteration": 2.680948257446289 }, { "auxiliary_loss_clip": 0.01173702, "auxiliary_loss_mlp": 0.00712787, "balance_loss_clip": 1.05505955, "balance_loss_mlp": 1.00077856, "epoch": 0.3773221908254674, "flos": 20996502739200.0, "grad_norm": 1.8085750835698096, "language_loss": 0.7759912, "learning_rate": 2.862650664224558e-06, "loss": 0.79485607, "num_input_tokens_seen": 67439495, "step": 3138, "time_per_iteration": 2.6605753898620605 }, { "auxiliary_loss_clip": 0.01169103, "auxiliary_loss_mlp": 0.01027799, "balance_loss_clip": 1.05573893, "balance_loss_mlp": 1.02043152, "epoch": 0.37744243371610653, "flos": 37631724958080.0, "grad_norm": 1.604496194760241, "language_loss": 0.69670427, "learning_rate": 2.861947813345627e-06, "loss": 0.71867329, "num_input_tokens_seen": 67462195, "step": 3139, "time_per_iteration": 2.832796573638916 }, { "auxiliary_loss_clip": 0.01193188, "auxiliary_loss_mlp": 0.0071236, "balance_loss_clip": 1.05956054, "balance_loss_mlp": 1.00075328, "epoch": 0.37756267660674564, "flos": 26140526484480.0, "grad_norm": 1.9252923014681704, "language_loss": 0.72681034, "learning_rate": 2.8612448317098974e-06, "loss": 0.74586582, "num_input_tokens_seen": 67482530, "step": 3140, "time_per_iteration": 2.6298420429229736 }, { "auxiliary_loss_clip": 0.01140858, "auxiliary_loss_mlp": 0.00713141, "balance_loss_clip": 1.05046606, "balance_loss_mlp": 1.00072718, "epoch": 0.3776829194973847, "flos": 19427888828160.0, "grad_norm": 1.9968737856236363, "language_loss": 0.8316468, "learning_rate": 2.8605417194240114e-06, "loss": 0.85018682, "num_input_tokens_seen": 67500890, "step": 3141, "time_per_iteration": 2.681957721710205 }, { "auxiliary_loss_clip": 0.01164742, "auxiliary_loss_mlp": 0.01026042, "balance_loss_clip": 1.05128646, "balance_loss_mlp": 1.01809621, "epoch": 0.3778031623880238, "flos": 17382309194880.0, "grad_norm": 2.024113602272879, "language_loss": 0.78725809, "learning_rate": 2.8598384765946315e-06, "loss": 0.80916589, "num_input_tokens_seen": 67519545, "step": 3142, "time_per_iteration": 2.5801889896392822 }, { "auxiliary_loss_clip": 0.01186693, "auxiliary_loss_mlp": 0.01028261, "balance_loss_clip": 1.05487072, "balance_loss_mlp": 1.02011883, "epoch": 0.3779234052786629, "flos": 27125843437440.0, "grad_norm": 5.715240845188757, "language_loss": 0.7176649, "learning_rate": 2.8591351033284377e-06, "loss": 0.7398144, "num_input_tokens_seen": 67539275, "step": 3143, "time_per_iteration": 2.605360507965088 }, { "auxiliary_loss_clip": 0.01173477, "auxiliary_loss_mlp": 0.01030583, "balance_loss_clip": 1.05304849, "balance_loss_mlp": 1.02252996, "epoch": 0.37804364816930197, "flos": 19682639061120.0, "grad_norm": 2.2151278603438582, "language_loss": 0.83629668, "learning_rate": 2.8584315997321325e-06, "loss": 0.85833728, "num_input_tokens_seen": 67558280, "step": 3144, "time_per_iteration": 2.5571751594543457 }, { "auxiliary_loss_clip": 0.01188137, "auxiliary_loss_mlp": 0.01031372, "balance_loss_clip": 1.05485892, "balance_loss_mlp": 1.02321172, "epoch": 0.3781638910599411, "flos": 22702905221760.0, "grad_norm": 2.6321616211693546, "language_loss": 0.78117752, "learning_rate": 2.8577279659124356e-06, "loss": 0.80337262, "num_input_tokens_seen": 67575955, "step": 3145, "time_per_iteration": 2.5783448219299316 }, { "auxiliary_loss_clip": 0.01166328, "auxiliary_loss_mlp": 0.01027107, "balance_loss_clip": 1.05165458, "balance_loss_mlp": 1.01927495, "epoch": 0.3782841339505802, "flos": 14647604158080.0, "grad_norm": 1.8896503700632865, "language_loss": 0.83523476, "learning_rate": 2.857024201976089e-06, "loss": 0.85716915, "num_input_tokens_seen": 67593515, "step": 3146, "time_per_iteration": 2.579197645187378 }, { "auxiliary_loss_clip": 0.01152597, "auxiliary_loss_mlp": 0.01027668, "balance_loss_clip": 1.05229986, "balance_loss_mlp": 1.01926994, "epoch": 0.37840437684121925, "flos": 32818223185920.0, "grad_norm": 3.2224704829794533, "language_loss": 0.73223466, "learning_rate": 2.8563203080298516e-06, "loss": 0.75403726, "num_input_tokens_seen": 67614290, "step": 3147, "time_per_iteration": 2.7482924461364746 }, { "auxiliary_loss_clip": 0.01155626, "auxiliary_loss_mlp": 0.00712178, "balance_loss_clip": 1.05466712, "balance_loss_mlp": 1.00071335, "epoch": 0.37852461973185836, "flos": 18369206346240.0, "grad_norm": 2.4052993406898535, "language_loss": 0.88898849, "learning_rate": 2.855616284180505e-06, "loss": 0.9076665, "num_input_tokens_seen": 67631340, "step": 3148, "time_per_iteration": 2.697938919067383 }, { "auxiliary_loss_clip": 0.01090343, "auxiliary_loss_mlp": 0.01003263, "balance_loss_clip": 1.03111386, "balance_loss_mlp": 1.00166595, "epoch": 0.37864486262249747, "flos": 59500680117120.0, "grad_norm": 0.8753624965443081, "language_loss": 0.66188264, "learning_rate": 2.8549121305348477e-06, "loss": 0.68281877, "num_input_tokens_seen": 67691125, "step": 3149, "time_per_iteration": 3.1778643131256104 }, { "auxiliary_loss_clip": 0.01170774, "auxiliary_loss_mlp": 0.01028718, "balance_loss_clip": 1.05295992, "balance_loss_mlp": 1.02121353, "epoch": 0.3787651055131365, "flos": 23363015414400.0, "grad_norm": 4.117109996101501, "language_loss": 0.83488685, "learning_rate": 2.8542078471997006e-06, "loss": 0.85688174, "num_input_tokens_seen": 67708740, "step": 3150, "time_per_iteration": 2.722123146057129 }, { "auxiliary_loss_clip": 0.01170324, "auxiliary_loss_mlp": 0.0102283, "balance_loss_clip": 1.05230486, "balance_loss_mlp": 1.01518917, "epoch": 0.37888534840377563, "flos": 24601394661120.0, "grad_norm": 1.6013292257964054, "language_loss": 0.75827414, "learning_rate": 2.8535034342819013e-06, "loss": 0.78020573, "num_input_tokens_seen": 67726150, "step": 3151, "time_per_iteration": 2.607893705368042 }, { "auxiliary_loss_clip": 0.01185034, "auxiliary_loss_mlp": 0.01030093, "balance_loss_clip": 1.0545547, "balance_loss_mlp": 1.02171838, "epoch": 0.37900559129441475, "flos": 23986891762560.0, "grad_norm": 2.879472651423328, "language_loss": 0.72627985, "learning_rate": 2.85279889188831e-06, "loss": 0.74843109, "num_input_tokens_seen": 67746525, "step": 3152, "time_per_iteration": 2.60547137260437 }, { "auxiliary_loss_clip": 0.01139, "auxiliary_loss_mlp": 0.0102524, "balance_loss_clip": 1.04903579, "balance_loss_mlp": 1.01725864, "epoch": 0.3791258341850538, "flos": 24644667571200.0, "grad_norm": 3.02788380793733, "language_loss": 0.81280959, "learning_rate": 2.852094220125805e-06, "loss": 0.83445197, "num_input_tokens_seen": 67766035, "step": 3153, "time_per_iteration": 2.6783998012542725 }, { "auxiliary_loss_clip": 0.01172263, "auxiliary_loss_mlp": 0.01030149, "balance_loss_clip": 1.0555439, "balance_loss_mlp": 1.02133977, "epoch": 0.3792460770756929, "flos": 17420841509760.0, "grad_norm": 2.110916446235482, "language_loss": 0.71284902, "learning_rate": 2.8513894191012846e-06, "loss": 0.73487318, "num_input_tokens_seen": 67785015, "step": 3154, "time_per_iteration": 2.575165271759033 }, { "auxiliary_loss_clip": 0.01186968, "auxiliary_loss_mlp": 0.01031016, "balance_loss_clip": 1.05452502, "balance_loss_mlp": 1.0232259, "epoch": 0.37936631996633197, "flos": 24206557386240.0, "grad_norm": 1.5109661375417267, "language_loss": 0.78756881, "learning_rate": 2.8506844889216664e-06, "loss": 0.80974865, "num_input_tokens_seen": 67804400, "step": 3155, "time_per_iteration": 2.519202470779419 }, { "auxiliary_loss_clip": 0.0109465, "auxiliary_loss_mlp": 0.01006161, "balance_loss_clip": 1.04081249, "balance_loss_mlp": 1.00423014, "epoch": 0.3794865628569711, "flos": 70297114752000.0, "grad_norm": 0.8650001770360981, "language_loss": 0.62863016, "learning_rate": 2.849979429693887e-06, "loss": 0.6496383, "num_input_tokens_seen": 67865385, "step": 3156, "time_per_iteration": 3.1713216304779053 }, { "auxiliary_loss_clip": 0.01183586, "auxiliary_loss_mlp": 0.01026341, "balance_loss_clip": 1.0532068, "balance_loss_mlp": 1.01786518, "epoch": 0.3796068057476102, "flos": 15779364860160.0, "grad_norm": 2.48255713680655, "language_loss": 0.74096495, "learning_rate": 2.8492742415249042e-06, "loss": 0.76306427, "num_input_tokens_seen": 67883030, "step": 3157, "time_per_iteration": 2.4569079875946045 }, { "auxiliary_loss_clip": 0.01185974, "auxiliary_loss_mlp": 0.0102869, "balance_loss_clip": 1.05486894, "balance_loss_mlp": 1.02063179, "epoch": 0.37972704863824924, "flos": 25191694771200.0, "grad_norm": 1.9085763877196644, "language_loss": 0.76202744, "learning_rate": 2.848568924521694e-06, "loss": 0.78417408, "num_input_tokens_seen": 67903810, "step": 3158, "time_per_iteration": 3.394674062728882 }, { "auxiliary_loss_clip": 0.01162301, "auxiliary_loss_mlp": 0.01025174, "balance_loss_clip": 1.0493499, "balance_loss_mlp": 1.0164423, "epoch": 0.37984729152888835, "flos": 26210372480640.0, "grad_norm": 1.8014596731025492, "language_loss": 0.73825651, "learning_rate": 2.8478634787912526e-06, "loss": 0.76013124, "num_input_tokens_seen": 67921865, "step": 3159, "time_per_iteration": 3.5216054916381836 }, { "auxiliary_loss_clip": 0.01170936, "auxiliary_loss_mlp": 0.01025737, "balance_loss_clip": 1.05320835, "balance_loss_mlp": 1.01778531, "epoch": 0.37996753441952746, "flos": 25629302165760.0, "grad_norm": 2.2267908963771106, "language_loss": 0.76618195, "learning_rate": 2.847157904440596e-06, "loss": 0.78814864, "num_input_tokens_seen": 67941595, "step": 3160, "time_per_iteration": 3.57285213470459 }, { "auxiliary_loss_clip": 0.01169871, "auxiliary_loss_mlp": 0.01026745, "balance_loss_clip": 1.05552006, "balance_loss_mlp": 1.01892495, "epoch": 0.3800877773101665, "flos": 20118414862080.0, "grad_norm": 1.8468586191821013, "language_loss": 0.74076426, "learning_rate": 2.846452201576759e-06, "loss": 0.76273042, "num_input_tokens_seen": 67960970, "step": 3161, "time_per_iteration": 3.520612955093384 }, { "auxiliary_loss_clip": 0.01079759, "auxiliary_loss_mlp": 0.01004001, "balance_loss_clip": 1.03026879, "balance_loss_mlp": 1.00226104, "epoch": 0.38020802020080563, "flos": 63053608037760.0, "grad_norm": 0.8415741793811059, "language_loss": 0.62774158, "learning_rate": 2.845746370306795e-06, "loss": 0.64857912, "num_input_tokens_seen": 68026160, "step": 3162, "time_per_iteration": 3.3036065101623535 }, { "auxiliary_loss_clip": 0.01170761, "auxiliary_loss_mlp": 0.01033018, "balance_loss_clip": 1.0526377, "balance_loss_mlp": 1.02488208, "epoch": 0.38032826309144474, "flos": 21288420570240.0, "grad_norm": 2.0335449120513474, "language_loss": 0.78570855, "learning_rate": 2.84504041073778e-06, "loss": 0.80774641, "num_input_tokens_seen": 68044575, "step": 3163, "time_per_iteration": 2.6001060009002686 }, { "auxiliary_loss_clip": 0.0115073, "auxiliary_loss_mlp": 0.01031092, "balance_loss_clip": 1.05533361, "balance_loss_mlp": 1.02284229, "epoch": 0.3804485059820838, "flos": 18954119416320.0, "grad_norm": 1.710539940653463, "language_loss": 0.79105949, "learning_rate": 2.844334322976806e-06, "loss": 0.81287766, "num_input_tokens_seen": 68064790, "step": 3164, "time_per_iteration": 2.642960786819458 }, { "auxiliary_loss_clip": 0.01130303, "auxiliary_loss_mlp": 0.01028271, "balance_loss_clip": 1.04977417, "balance_loss_mlp": 1.01971149, "epoch": 0.3805687488727229, "flos": 21833759831040.0, "grad_norm": 2.4251276010470013, "language_loss": 0.8324762, "learning_rate": 2.8436281071309866e-06, "loss": 0.85406196, "num_input_tokens_seen": 68083330, "step": 3165, "time_per_iteration": 2.8325791358947754 }, { "auxiliary_loss_clip": 0.01049338, "auxiliary_loss_mlp": 0.0100371, "balance_loss_clip": 1.02616906, "balance_loss_mlp": 1.00204122, "epoch": 0.380688991763362, "flos": 58546209968640.0, "grad_norm": 0.7244246567229339, "language_loss": 0.52966225, "learning_rate": 2.842921763307455e-06, "loss": 0.55019271, "num_input_tokens_seen": 68146140, "step": 3166, "time_per_iteration": 3.2391037940979004 }, { "auxiliary_loss_clip": 0.01150843, "auxiliary_loss_mlp": 0.01019809, "balance_loss_clip": 1.05200267, "balance_loss_mlp": 1.01206613, "epoch": 0.38080923465400107, "flos": 23799509487360.0, "grad_norm": 2.1428944014851905, "language_loss": 0.82338363, "learning_rate": 2.842215291613361e-06, "loss": 0.84509015, "num_input_tokens_seen": 68164520, "step": 3167, "time_per_iteration": 2.69267201423645 }, { "auxiliary_loss_clip": 0.01016314, "auxiliary_loss_mlp": 0.01002441, "balance_loss_clip": 1.02797604, "balance_loss_mlp": 1.00086784, "epoch": 0.3809294775446402, "flos": 54969866380800.0, "grad_norm": 0.7806168141464089, "language_loss": 0.59192306, "learning_rate": 2.8415086921558774e-06, "loss": 0.61211061, "num_input_tokens_seen": 68227945, "step": 3168, "time_per_iteration": 3.4136881828308105 }, { "auxiliary_loss_clip": 0.0114113, "auxiliary_loss_mlp": 0.01025998, "balance_loss_clip": 1.04620314, "balance_loss_mlp": 1.01849699, "epoch": 0.38104972043527924, "flos": 24643697904000.0, "grad_norm": 1.9059235077756578, "language_loss": 0.78466892, "learning_rate": 2.840801965042194e-06, "loss": 0.80634022, "num_input_tokens_seen": 68247405, "step": 3169, "time_per_iteration": 2.8095741271972656 }, { "auxiliary_loss_clip": 0.01144, "auxiliary_loss_mlp": 0.01033355, "balance_loss_clip": 1.04729986, "balance_loss_mlp": 1.02524245, "epoch": 0.38116996332591835, "flos": 22856783086080.0, "grad_norm": 1.8866207186002184, "language_loss": 0.83779979, "learning_rate": 2.840095110379521e-06, "loss": 0.85957336, "num_input_tokens_seen": 68266925, "step": 3170, "time_per_iteration": 2.6251015663146973 }, { "auxiliary_loss_clip": 0.01042224, "auxiliary_loss_mlp": 0.01002573, "balance_loss_clip": 1.02750969, "balance_loss_mlp": 1.00086832, "epoch": 0.38129020621655746, "flos": 60836160804480.0, "grad_norm": 0.7292685748435184, "language_loss": 0.53844631, "learning_rate": 2.8393881282750884e-06, "loss": 0.55889428, "num_input_tokens_seen": 68329755, "step": 3171, "time_per_iteration": 3.250868797302246 }, { "auxiliary_loss_clip": 0.01156652, "auxiliary_loss_mlp": 0.01029074, "balance_loss_clip": 1.05334783, "balance_loss_mlp": 1.02015078, "epoch": 0.3814104491071965, "flos": 21648101408640.0, "grad_norm": 1.9338787905465737, "language_loss": 0.78757817, "learning_rate": 2.838681018836144e-06, "loss": 0.80943543, "num_input_tokens_seen": 68347075, "step": 3172, "time_per_iteration": 2.6840453147888184 }, { "auxiliary_loss_clip": 0.01141304, "auxiliary_loss_mlp": 0.00711979, "balance_loss_clip": 1.04955149, "balance_loss_mlp": 1.00097442, "epoch": 0.3815306919978356, "flos": 19099090707840.0, "grad_norm": 4.490715710249874, "language_loss": 0.78182149, "learning_rate": 2.837973782169955e-06, "loss": 0.80035436, "num_input_tokens_seen": 68365450, "step": 3173, "time_per_iteration": 2.6406280994415283 }, { "auxiliary_loss_clip": 0.01098823, "auxiliary_loss_mlp": 0.0100349, "balance_loss_clip": 1.02928472, "balance_loss_mlp": 1.00159454, "epoch": 0.38165093488847474, "flos": 67067918156160.0, "grad_norm": 0.9061054163133612, "language_loss": 0.59240514, "learning_rate": 2.8372664183838096e-06, "loss": 0.61342829, "num_input_tokens_seen": 68428470, "step": 3174, "time_per_iteration": 3.2456040382385254 }, { "auxiliary_loss_clip": 0.01188114, "auxiliary_loss_mlp": 0.0102483, "balance_loss_clip": 1.05723321, "balance_loss_mlp": 1.01693845, "epoch": 0.3817711777791138, "flos": 22341105480960.0, "grad_norm": 2.060506341000264, "language_loss": 0.6810348, "learning_rate": 2.836558927585015e-06, "loss": 0.70316428, "num_input_tokens_seen": 68445440, "step": 3175, "time_per_iteration": 2.557997703552246 }, { "auxiliary_loss_clip": 0.01174163, "auxiliary_loss_mlp": 0.01026594, "balance_loss_clip": 1.05390382, "balance_loss_mlp": 1.01913691, "epoch": 0.3818914206697529, "flos": 22820621068800.0, "grad_norm": 1.8783898775749788, "language_loss": 0.81875443, "learning_rate": 2.8358513098808957e-06, "loss": 0.8407619, "num_input_tokens_seen": 68465755, "step": 3176, "time_per_iteration": 2.646181583404541 }, { "auxiliary_loss_clip": 0.0111678, "auxiliary_loss_mlp": 0.01029823, "balance_loss_clip": 1.0509007, "balance_loss_mlp": 1.02072096, "epoch": 0.382011663560392, "flos": 24386074583040.0, "grad_norm": 1.8782509044925282, "language_loss": 0.7668817, "learning_rate": 2.835143565378798e-06, "loss": 0.78834772, "num_input_tokens_seen": 68486220, "step": 3177, "time_per_iteration": 2.722625255584717 }, { "auxiliary_loss_clip": 0.01101163, "auxiliary_loss_mlp": 0.01025312, "balance_loss_clip": 1.04643154, "balance_loss_mlp": 1.01717019, "epoch": 0.38213190645103107, "flos": 21981568296960.0, "grad_norm": 2.00162098845872, "language_loss": 0.7812342, "learning_rate": 2.8344356941860847e-06, "loss": 0.80249894, "num_input_tokens_seen": 68505850, "step": 3178, "time_per_iteration": 2.727200746536255 }, { "auxiliary_loss_clip": 0.01137141, "auxiliary_loss_mlp": 0.010287, "balance_loss_clip": 1.05320573, "balance_loss_mlp": 1.0202359, "epoch": 0.3822521493416702, "flos": 35516945773440.0, "grad_norm": 2.7954400600200935, "language_loss": 0.66943163, "learning_rate": 2.8337276964101403e-06, "loss": 0.69108999, "num_input_tokens_seen": 68526290, "step": 3179, "time_per_iteration": 2.8126869201660156 }, { "auxiliary_loss_clip": 0.01174054, "auxiliary_loss_mlp": 0.01027118, "balance_loss_clip": 1.05468357, "balance_loss_mlp": 1.01927423, "epoch": 0.3823723922323093, "flos": 21069904181760.0, "grad_norm": 2.6014381014938968, "language_loss": 0.76148379, "learning_rate": 2.833019572158367e-06, "loss": 0.78349555, "num_input_tokens_seen": 68544725, "step": 3180, "time_per_iteration": 2.5918283462524414 }, { "auxiliary_loss_clip": 0.01155693, "auxiliary_loss_mlp": 0.01025569, "balance_loss_clip": 1.05383468, "balance_loss_mlp": 1.01714683, "epoch": 0.38249263512294834, "flos": 19789149864960.0, "grad_norm": 2.381195887136688, "language_loss": 0.80140054, "learning_rate": 2.8323113215381872e-06, "loss": 0.82321322, "num_input_tokens_seen": 68563070, "step": 3181, "time_per_iteration": 2.729886770248413 }, { "auxiliary_loss_clip": 0.01138752, "auxiliary_loss_mlp": 0.01034001, "balance_loss_clip": 1.05113935, "balance_loss_mlp": 1.02471471, "epoch": 0.38261287801358745, "flos": 21433930565760.0, "grad_norm": 1.9132507020868559, "language_loss": 0.76284909, "learning_rate": 2.831602944657042e-06, "loss": 0.78457659, "num_input_tokens_seen": 68581150, "step": 3182, "time_per_iteration": 2.676578998565674 }, { "auxiliary_loss_clip": 0.01161669, "auxiliary_loss_mlp": 0.01025077, "balance_loss_clip": 1.05198777, "balance_loss_mlp": 1.01669693, "epoch": 0.38273312090422656, "flos": 21981568296960.0, "grad_norm": 2.145718697021353, "language_loss": 0.74339056, "learning_rate": 2.830894441622391e-06, "loss": 0.76525801, "num_input_tokens_seen": 68597800, "step": 3183, "time_per_iteration": 2.655980348587036 }, { "auxiliary_loss_clip": 0.01136789, "auxiliary_loss_mlp": 0.00712416, "balance_loss_clip": 1.0480597, "balance_loss_mlp": 1.00103617, "epoch": 0.3828533637948656, "flos": 24790895838720.0, "grad_norm": 1.8490209605434609, "language_loss": 0.80030465, "learning_rate": 2.8301858125417134e-06, "loss": 0.81879669, "num_input_tokens_seen": 68617640, "step": 3184, "time_per_iteration": 3.875680685043335 }, { "auxiliary_loss_clip": 0.01159998, "auxiliary_loss_mlp": 0.01024144, "balance_loss_clip": 1.05679107, "balance_loss_mlp": 1.01658654, "epoch": 0.38297360668550473, "flos": 22455445449600.0, "grad_norm": 1.8452117304741258, "language_loss": 0.73867297, "learning_rate": 2.8294770575225082e-06, "loss": 0.76051438, "num_input_tokens_seen": 68637770, "step": 3185, "time_per_iteration": 3.659843683242798 }, { "auxiliary_loss_clip": 0.01171492, "auxiliary_loss_mlp": 0.01029313, "balance_loss_clip": 1.05629408, "balance_loss_mlp": 1.02123046, "epoch": 0.3830938495761438, "flos": 24896903852160.0, "grad_norm": 1.7501167122391188, "language_loss": 0.84017682, "learning_rate": 2.828768176672293e-06, "loss": 0.86218482, "num_input_tokens_seen": 68656885, "step": 3186, "time_per_iteration": 3.649854898452759 }, { "auxiliary_loss_clip": 0.01138873, "auxiliary_loss_mlp": 0.01028471, "balance_loss_clip": 1.0495646, "balance_loss_mlp": 1.01974499, "epoch": 0.3832140924667829, "flos": 33036236784000.0, "grad_norm": 1.9099051936148725, "language_loss": 0.70971036, "learning_rate": 2.8280591700986044e-06, "loss": 0.7313838, "num_input_tokens_seen": 68678750, "step": 3187, "time_per_iteration": 3.71964955329895 }, { "auxiliary_loss_clip": 0.01157464, "auxiliary_loss_mlp": 0.01023043, "balance_loss_clip": 1.04982591, "balance_loss_mlp": 1.0153538, "epoch": 0.383334335357422, "flos": 31903721896320.0, "grad_norm": 1.68477661956032, "language_loss": 0.75292438, "learning_rate": 2.827350037908999e-06, "loss": 0.77472949, "num_input_tokens_seen": 68698190, "step": 3188, "time_per_iteration": 2.7296090126037598 }, { "auxiliary_loss_clip": 0.01148358, "auxiliary_loss_mlp": 0.01028981, "balance_loss_clip": 1.05246246, "balance_loss_mlp": 1.02019525, "epoch": 0.38345457824806106, "flos": 19791915212160.0, "grad_norm": 2.214346095588715, "language_loss": 0.79040098, "learning_rate": 2.8266407802110496e-06, "loss": 0.81217444, "num_input_tokens_seen": 68716445, "step": 3189, "time_per_iteration": 2.692612886428833 }, { "auxiliary_loss_clip": 0.01099582, "auxiliary_loss_mlp": 0.01032107, "balance_loss_clip": 1.04698551, "balance_loss_mlp": 1.02323771, "epoch": 0.3835748211387002, "flos": 22419391173120.0, "grad_norm": 2.1626189251260763, "language_loss": 0.75825369, "learning_rate": 2.8259313971123515e-06, "loss": 0.77957052, "num_input_tokens_seen": 68737565, "step": 3190, "time_per_iteration": 2.8163208961486816 }, { "auxiliary_loss_clip": 0.01167254, "auxiliary_loss_mlp": 0.01026632, "balance_loss_clip": 1.05424368, "balance_loss_mlp": 1.01885355, "epoch": 0.3836950640293393, "flos": 25118436983040.0, "grad_norm": 1.5337293078084229, "language_loss": 0.78219789, "learning_rate": 2.8252218887205166e-06, "loss": 0.80413675, "num_input_tokens_seen": 68758255, "step": 3191, "time_per_iteration": 2.6521353721618652 }, { "auxiliary_loss_clip": 0.01110831, "auxiliary_loss_mlp": 0.01024179, "balance_loss_clip": 1.04884696, "balance_loss_mlp": 1.01616776, "epoch": 0.38381530691997834, "flos": 21799213925760.0, "grad_norm": 1.8633348169409605, "language_loss": 0.80711949, "learning_rate": 2.824512255143178e-06, "loss": 0.82846951, "num_input_tokens_seen": 68777490, "step": 3192, "time_per_iteration": 2.7303502559661865 }, { "auxiliary_loss_clip": 0.01144163, "auxiliary_loss_mlp": 0.01028981, "balance_loss_clip": 1.05216742, "balance_loss_mlp": 1.02113461, "epoch": 0.38393554981061745, "flos": 21252689516160.0, "grad_norm": 1.795757518221699, "language_loss": 0.79229069, "learning_rate": 2.8238024964879855e-06, "loss": 0.81402206, "num_input_tokens_seen": 68798385, "step": 3193, "time_per_iteration": 2.696401596069336 }, { "auxiliary_loss_clip": 0.0119215, "auxiliary_loss_mlp": 0.01026867, "balance_loss_clip": 1.05749381, "balance_loss_mlp": 1.01850426, "epoch": 0.38405579270125656, "flos": 17019360218880.0, "grad_norm": 2.3825059581683465, "language_loss": 0.76986492, "learning_rate": 2.8230926128626095e-06, "loss": 0.79205501, "num_input_tokens_seen": 68816880, "step": 3194, "time_per_iteration": 2.5689985752105713 }, { "auxiliary_loss_clip": 0.01148537, "auxiliary_loss_mlp": 0.01025995, "balance_loss_clip": 1.05043125, "balance_loss_mlp": 1.01731038, "epoch": 0.3841760355918956, "flos": 21835375943040.0, "grad_norm": 2.0846047641838803, "language_loss": 0.78825283, "learning_rate": 2.822382604374738e-06, "loss": 0.80999815, "num_input_tokens_seen": 68835805, "step": 3195, "time_per_iteration": 2.6932191848754883 }, { "auxiliary_loss_clip": 0.0115777, "auxiliary_loss_mlp": 0.01039164, "balance_loss_clip": 1.05665219, "balance_loss_mlp": 1.03018749, "epoch": 0.3842962784825347, "flos": 25915114684800.0, "grad_norm": 2.257328182942212, "language_loss": 0.65688407, "learning_rate": 2.8216724711320793e-06, "loss": 0.67885339, "num_input_tokens_seen": 68854930, "step": 3196, "time_per_iteration": 2.708150625228882 }, { "auxiliary_loss_clip": 0.01186119, "auxiliary_loss_mlp": 0.0071249, "balance_loss_clip": 1.05534768, "balance_loss_mlp": 1.00099218, "epoch": 0.38441652137317384, "flos": 25337492075520.0, "grad_norm": 1.6645332168001603, "language_loss": 0.79844701, "learning_rate": 2.820962213242361e-06, "loss": 0.81743312, "num_input_tokens_seen": 68874260, "step": 3197, "time_per_iteration": 2.5981557369232178 }, { "auxiliary_loss_clip": 0.01171074, "auxiliary_loss_mlp": 0.01025156, "balance_loss_clip": 1.0575726, "balance_loss_mlp": 1.01727653, "epoch": 0.3845367642638129, "flos": 18113486446080.0, "grad_norm": 4.229452506955188, "language_loss": 0.8416152, "learning_rate": 2.8202518308133264e-06, "loss": 0.86357749, "num_input_tokens_seen": 68891535, "step": 3198, "time_per_iteration": 2.5936086177825928 }, { "auxiliary_loss_clip": 0.01190271, "auxiliary_loss_mlp": 0.01023384, "balance_loss_clip": 1.05607152, "balance_loss_mlp": 1.0151763, "epoch": 0.384657007154452, "flos": 25228395492480.0, "grad_norm": 2.2346331752071493, "language_loss": 0.73539507, "learning_rate": 2.8195413239527426e-06, "loss": 0.75753164, "num_input_tokens_seen": 68911275, "step": 3199, "time_per_iteration": 2.576615571975708 }, { "auxiliary_loss_clip": 0.01164878, "auxiliary_loss_mlp": 0.01026954, "balance_loss_clip": 1.05146098, "balance_loss_mlp": 1.01855898, "epoch": 0.38477725004509106, "flos": 19865855358720.0, "grad_norm": 2.2875697694162476, "language_loss": 0.80154383, "learning_rate": 2.8188306927683906e-06, "loss": 0.82346213, "num_input_tokens_seen": 68930745, "step": 3200, "time_per_iteration": 2.597019672393799 }, { "auxiliary_loss_clip": 0.01156347, "auxiliary_loss_mlp": 0.01028041, "balance_loss_clip": 1.05409789, "balance_loss_mlp": 1.02019691, "epoch": 0.38489749293573017, "flos": 18259391491200.0, "grad_norm": 2.565514090374619, "language_loss": 0.75060296, "learning_rate": 2.818119937368074e-06, "loss": 0.77244687, "num_input_tokens_seen": 68949380, "step": 3201, "time_per_iteration": 2.6381912231445312 }, { "auxiliary_loss_clip": 0.01177037, "auxiliary_loss_mlp": 0.01030218, "balance_loss_clip": 1.05337822, "balance_loss_mlp": 1.02128899, "epoch": 0.3850177358263693, "flos": 24389163152640.0, "grad_norm": 4.299440749159815, "language_loss": 0.65622419, "learning_rate": 2.817409057859613e-06, "loss": 0.67829674, "num_input_tokens_seen": 68968370, "step": 3202, "time_per_iteration": 2.6711599826812744 }, { "auxiliary_loss_clip": 0.01120298, "auxiliary_loss_mlp": 0.01025837, "balance_loss_clip": 1.0506891, "balance_loss_mlp": 1.0175283, "epoch": 0.38513797871700833, "flos": 17671533505920.0, "grad_norm": 2.942338965193402, "language_loss": 0.79330349, "learning_rate": 2.8166980543508482e-06, "loss": 0.81476486, "num_input_tokens_seen": 68984260, "step": 3203, "time_per_iteration": 2.7148261070251465 }, { "auxiliary_loss_clip": 0.01191663, "auxiliary_loss_mlp": 0.01029075, "balance_loss_clip": 1.05885971, "balance_loss_mlp": 1.02034879, "epoch": 0.38525822160764744, "flos": 25739583897600.0, "grad_norm": 1.766388825016305, "language_loss": 0.80057156, "learning_rate": 2.815986926949638e-06, "loss": 0.82277894, "num_input_tokens_seen": 69002760, "step": 3204, "time_per_iteration": 2.675985097885132 }, { "auxiliary_loss_clip": 0.01171333, "auxiliary_loss_mlp": 0.01026049, "balance_loss_clip": 1.05677509, "balance_loss_mlp": 1.01769233, "epoch": 0.38537846449828655, "flos": 20193647898240.0, "grad_norm": 2.875210059475869, "language_loss": 0.80379868, "learning_rate": 2.8152756757638597e-06, "loss": 0.82577252, "num_input_tokens_seen": 69021260, "step": 3205, "time_per_iteration": 2.587883234024048 }, { "auxiliary_loss_clip": 0.01174049, "auxiliary_loss_mlp": 0.0103156, "balance_loss_clip": 1.05785191, "balance_loss_mlp": 1.02308393, "epoch": 0.3854987073889256, "flos": 23039352938880.0, "grad_norm": 2.6894764753413654, "language_loss": 0.84620643, "learning_rate": 2.8145643009014093e-06, "loss": 0.86826259, "num_input_tokens_seen": 69039755, "step": 3206, "time_per_iteration": 2.6415600776672363 }, { "auxiliary_loss_clip": 0.01174485, "auxiliary_loss_mlp": 0.01025569, "balance_loss_clip": 1.05581498, "balance_loss_mlp": 1.01820791, "epoch": 0.3856189502795647, "flos": 20190631155840.0, "grad_norm": 1.8328351622396701, "language_loss": 0.79062641, "learning_rate": 2.813852802470202e-06, "loss": 0.8126269, "num_input_tokens_seen": 69057650, "step": 3207, "time_per_iteration": 2.598708152770996 }, { "auxiliary_loss_clip": 0.01154106, "auxiliary_loss_mlp": 0.01029012, "balance_loss_clip": 1.05515575, "balance_loss_mlp": 1.02057242, "epoch": 0.38573919317020383, "flos": 25702631781120.0, "grad_norm": 1.988316984897444, "language_loss": 0.72681499, "learning_rate": 2.8131411805781717e-06, "loss": 0.7486462, "num_input_tokens_seen": 69077775, "step": 3208, "time_per_iteration": 2.660428285598755 }, { "auxiliary_loss_clip": 0.01165821, "auxiliary_loss_mlp": 0.0103165, "balance_loss_clip": 1.06033182, "balance_loss_mlp": 1.02305472, "epoch": 0.3858594360608429, "flos": 29821405628160.0, "grad_norm": 2.4221437140306064, "language_loss": 0.64295125, "learning_rate": 2.8124294353332707e-06, "loss": 0.66492593, "num_input_tokens_seen": 69096450, "step": 3209, "time_per_iteration": 2.644263744354248 }, { "auxiliary_loss_clip": 0.01147149, "auxiliary_loss_mlp": 0.01028147, "balance_loss_clip": 1.05324185, "balance_loss_mlp": 1.02047014, "epoch": 0.385979678951482, "flos": 24790428961920.0, "grad_norm": 1.632777384428412, "language_loss": 0.7738831, "learning_rate": 2.8117175668434713e-06, "loss": 0.79563606, "num_input_tokens_seen": 69116110, "step": 3210, "time_per_iteration": 3.654984712600708 }, { "auxiliary_loss_clip": 0.01190966, "auxiliary_loss_mlp": 0.01025811, "balance_loss_clip": 1.05744553, "balance_loss_mlp": 1.01789594, "epoch": 0.3860999218421211, "flos": 21287881866240.0, "grad_norm": 2.2845398535768493, "language_loss": 0.70161194, "learning_rate": 2.811005575216762e-06, "loss": 0.72377968, "num_input_tokens_seen": 69134825, "step": 3211, "time_per_iteration": 3.469102144241333 }, { "auxiliary_loss_clip": 0.01138724, "auxiliary_loss_mlp": 0.01026764, "balance_loss_clip": 1.05332685, "balance_loss_mlp": 1.01902151, "epoch": 0.38622016473276016, "flos": 24536720223360.0, "grad_norm": 1.5339819685376368, "language_loss": 0.79098874, "learning_rate": 2.8102934605611513e-06, "loss": 0.81264365, "num_input_tokens_seen": 69156460, "step": 3212, "time_per_iteration": 3.6598727703094482 }, { "auxiliary_loss_clip": 0.01164269, "auxiliary_loss_mlp": 0.01026285, "balance_loss_clip": 1.05715561, "balance_loss_mlp": 1.01790476, "epoch": 0.3863404076233993, "flos": 20558212986240.0, "grad_norm": 2.4407148785879635, "language_loss": 0.67190325, "learning_rate": 2.8095812229846665e-06, "loss": 0.69380879, "num_input_tokens_seen": 69176420, "step": 3213, "time_per_iteration": 3.560413360595703 }, { "auxiliary_loss_clip": 0.01161641, "auxiliary_loss_mlp": 0.0103272, "balance_loss_clip": 1.05621028, "balance_loss_mlp": 1.02367234, "epoch": 0.3864606505140384, "flos": 22346277039360.0, "grad_norm": 2.363279167958978, "language_loss": 0.69090366, "learning_rate": 2.808868862595355e-06, "loss": 0.71284735, "num_input_tokens_seen": 69196665, "step": 3214, "time_per_iteration": 2.686325788497925 }, { "auxiliary_loss_clip": 0.01179573, "auxiliary_loss_mlp": 0.01028313, "balance_loss_clip": 1.05766511, "balance_loss_mlp": 1.01966429, "epoch": 0.38658089340467744, "flos": 25703601448320.0, "grad_norm": 2.082041779838581, "language_loss": 0.79751068, "learning_rate": 2.8081563795012795e-06, "loss": 0.8195895, "num_input_tokens_seen": 69216290, "step": 3215, "time_per_iteration": 2.6445953845977783 }, { "auxiliary_loss_clip": 0.01167838, "auxiliary_loss_mlp": 0.01024801, "balance_loss_clip": 1.05625105, "balance_loss_mlp": 1.01649237, "epoch": 0.38670113629531655, "flos": 33802534558080.0, "grad_norm": 1.7336429986516955, "language_loss": 0.73649597, "learning_rate": 2.807443773810524e-06, "loss": 0.75842243, "num_input_tokens_seen": 69237550, "step": 3216, "time_per_iteration": 2.7494473457336426 }, { "auxiliary_loss_clip": 0.01145878, "auxiliary_loss_mlp": 0.0103208, "balance_loss_clip": 1.05651021, "balance_loss_mlp": 1.023664, "epoch": 0.3868213791859556, "flos": 23331522165120.0, "grad_norm": 1.8403688163666012, "language_loss": 0.89622015, "learning_rate": 2.80673104563119e-06, "loss": 0.91799974, "num_input_tokens_seen": 69258175, "step": 3217, "time_per_iteration": 2.6969618797302246 }, { "auxiliary_loss_clip": 0.01172152, "auxiliary_loss_mlp": 0.01023873, "balance_loss_clip": 1.05810356, "balance_loss_mlp": 1.01592147, "epoch": 0.3869416220765947, "flos": 18441530380800.0, "grad_norm": 2.544316502438844, "language_loss": 0.78844678, "learning_rate": 2.8060181950713976e-06, "loss": 0.81040704, "num_input_tokens_seen": 69274965, "step": 3218, "time_per_iteration": 2.5972251892089844 }, { "auxiliary_loss_clip": 0.01141179, "auxiliary_loss_mlp": 0.0102801, "balance_loss_clip": 1.0530436, "balance_loss_mlp": 1.01905155, "epoch": 0.3870618649672338, "flos": 15632992938240.0, "grad_norm": 2.449611086223843, "language_loss": 0.80982572, "learning_rate": 2.805305222239286e-06, "loss": 0.83151758, "num_input_tokens_seen": 69292220, "step": 3219, "time_per_iteration": 2.6370558738708496 }, { "auxiliary_loss_clip": 0.01160105, "auxiliary_loss_mlp": 0.01030489, "balance_loss_clip": 1.05877829, "balance_loss_mlp": 1.02283549, "epoch": 0.3871821078578729, "flos": 23513804709120.0, "grad_norm": 2.2403880257145974, "language_loss": 0.74168015, "learning_rate": 2.8045921272430118e-06, "loss": 0.7635861, "num_input_tokens_seen": 69311900, "step": 3220, "time_per_iteration": 2.7516024112701416 }, { "auxiliary_loss_clip": 0.01183119, "auxiliary_loss_mlp": 0.01033829, "balance_loss_clip": 1.05782318, "balance_loss_mlp": 1.02491212, "epoch": 0.387302350748512, "flos": 17778259791360.0, "grad_norm": 2.259529103984059, "language_loss": 0.76254624, "learning_rate": 2.803878910190753e-06, "loss": 0.78471577, "num_input_tokens_seen": 69328820, "step": 3221, "time_per_iteration": 2.594838857650757 }, { "auxiliary_loss_clip": 0.01177449, "auxiliary_loss_mlp": 0.01030873, "balance_loss_clip": 1.05515087, "balance_loss_mlp": 1.02293992, "epoch": 0.3874225936391511, "flos": 11503409097600.0, "grad_norm": 2.599071007766846, "language_loss": 0.82587099, "learning_rate": 2.8031655711907017e-06, "loss": 0.84795421, "num_input_tokens_seen": 69342525, "step": 3222, "time_per_iteration": 2.557600736618042 }, { "auxiliary_loss_clip": 0.0117924, "auxiliary_loss_mlp": 0.01022919, "balance_loss_clip": 1.05989718, "balance_loss_mlp": 1.01485395, "epoch": 0.38754283652979016, "flos": 21945154884480.0, "grad_norm": 2.0088759247361265, "language_loss": 0.80743372, "learning_rate": 2.8024521103510723e-06, "loss": 0.82945538, "num_input_tokens_seen": 69359295, "step": 3223, "time_per_iteration": 2.6378378868103027 }, { "auxiliary_loss_clip": 0.01172524, "auxiliary_loss_mlp": 0.01027857, "balance_loss_clip": 1.05299497, "balance_loss_mlp": 1.02015638, "epoch": 0.38766307942042927, "flos": 21175984022400.0, "grad_norm": 1.9513151650525828, "language_loss": 0.75721312, "learning_rate": 2.8017385277800952e-06, "loss": 0.77921695, "num_input_tokens_seen": 69377650, "step": 3224, "time_per_iteration": 2.6175875663757324 }, { "auxiliary_loss_clip": 0.01147665, "auxiliary_loss_mlp": 0.01030699, "balance_loss_clip": 1.05602288, "balance_loss_mlp": 1.02166259, "epoch": 0.3877833223110684, "flos": 27417294391680.0, "grad_norm": 2.0419563089659696, "language_loss": 0.74801683, "learning_rate": 2.8010248235860213e-06, "loss": 0.76980048, "num_input_tokens_seen": 69397765, "step": 3225, "time_per_iteration": 2.7707924842834473 }, { "auxiliary_loss_clip": 0.01083398, "auxiliary_loss_mlp": 0.0070289, "balance_loss_clip": 1.03504705, "balance_loss_mlp": 1.00099874, "epoch": 0.38790356520170743, "flos": 64500019879680.0, "grad_norm": 0.8397748695140739, "language_loss": 0.62792313, "learning_rate": 2.8003109978771192e-06, "loss": 0.64578599, "num_input_tokens_seen": 69458930, "step": 3226, "time_per_iteration": 3.2911391258239746 }, { "auxiliary_loss_clip": 0.01134367, "auxiliary_loss_mlp": 0.01024926, "balance_loss_clip": 1.04886436, "balance_loss_mlp": 1.01689124, "epoch": 0.38802380809234654, "flos": 22345415112960.0, "grad_norm": 2.0255798950859067, "language_loss": 0.78724706, "learning_rate": 2.799597050761674e-06, "loss": 0.80883998, "num_input_tokens_seen": 69475135, "step": 3227, "time_per_iteration": 2.704834222793579 }, { "auxiliary_loss_clip": 0.01195812, "auxiliary_loss_mlp": 0.01032562, "balance_loss_clip": 1.06177711, "balance_loss_mlp": 1.02464008, "epoch": 0.38814405098298566, "flos": 25261361199360.0, "grad_norm": 8.159071043265511, "language_loss": 0.79070401, "learning_rate": 2.7988829823479924e-06, "loss": 0.81298769, "num_input_tokens_seen": 69493525, "step": 3228, "time_per_iteration": 2.6000571250915527 }, { "auxiliary_loss_clip": 0.01153251, "auxiliary_loss_mlp": 0.01030395, "balance_loss_clip": 1.05273473, "balance_loss_mlp": 1.02213371, "epoch": 0.3882642938736247, "flos": 18841180078080.0, "grad_norm": 1.8047609568485141, "language_loss": 0.64116871, "learning_rate": 2.7981687927443976e-06, "loss": 0.66300517, "num_input_tokens_seen": 69510325, "step": 3229, "time_per_iteration": 2.623168468475342 }, { "auxiliary_loss_clip": 0.01173755, "auxiliary_loss_mlp": 0.01025642, "balance_loss_clip": 1.05464303, "balance_loss_mlp": 1.01762521, "epoch": 0.3883845367642638, "flos": 21652806090240.0, "grad_norm": 2.024352265473409, "language_loss": 0.85542274, "learning_rate": 2.797454482059231e-06, "loss": 0.87741673, "num_input_tokens_seen": 69530480, "step": 3230, "time_per_iteration": 2.604511022567749 }, { "auxiliary_loss_clip": 0.01196857, "auxiliary_loss_mlp": 0.01026685, "balance_loss_clip": 1.0611043, "balance_loss_mlp": 1.01828706, "epoch": 0.3885047796549029, "flos": 20557530627840.0, "grad_norm": 2.3565540936848905, "language_loss": 0.84701437, "learning_rate": 2.7967400504008537e-06, "loss": 0.86924982, "num_input_tokens_seen": 69549780, "step": 3231, "time_per_iteration": 2.5922460556030273 }, { "auxiliary_loss_clip": 0.01053435, "auxiliary_loss_mlp": 0.01021299, "balance_loss_clip": 1.03492188, "balance_loss_mlp": 1.01934397, "epoch": 0.388625022545542, "flos": 64325491695360.0, "grad_norm": 0.810917721446951, "language_loss": 0.57472932, "learning_rate": 2.7960254978776456e-06, "loss": 0.59547669, "num_input_tokens_seen": 69611870, "step": 3232, "time_per_iteration": 3.297614574432373 }, { "auxiliary_loss_clip": 0.01196571, "auxiliary_loss_mlp": 0.01029139, "balance_loss_clip": 1.06123388, "balance_loss_mlp": 1.02090168, "epoch": 0.3887452654361811, "flos": 18113881495680.0, "grad_norm": 2.044938529274284, "language_loss": 0.82011086, "learning_rate": 2.7953108245980006e-06, "loss": 0.84236795, "num_input_tokens_seen": 69630385, "step": 3233, "time_per_iteration": 2.543884038925171 }, { "auxiliary_loss_clip": 0.01159977, "auxiliary_loss_mlp": 0.0102701, "balance_loss_clip": 1.06102145, "balance_loss_mlp": 1.01890969, "epoch": 0.38886550832682015, "flos": 24975261371520.0, "grad_norm": 1.5580522264628451, "language_loss": 0.73751909, "learning_rate": 2.7945960306703365e-06, "loss": 0.75938898, "num_input_tokens_seen": 69653370, "step": 3234, "time_per_iteration": 2.6863045692443848 }, { "auxiliary_loss_clip": 0.01182102, "auxiliary_loss_mlp": 0.01026034, "balance_loss_clip": 1.05832267, "balance_loss_mlp": 1.0179162, "epoch": 0.38898575121745926, "flos": 27199496275200.0, "grad_norm": 1.5839402383223244, "language_loss": 0.65696251, "learning_rate": 2.7938811162030865e-06, "loss": 0.67904383, "num_input_tokens_seen": 69673635, "step": 3235, "time_per_iteration": 2.6810545921325684 }, { "auxiliary_loss_clip": 0.01174616, "auxiliary_loss_mlp": 0.01027418, "balance_loss_clip": 1.05800676, "balance_loss_mlp": 1.01984215, "epoch": 0.3891059941080984, "flos": 28763728727040.0, "grad_norm": 2.4022689065620346, "language_loss": 0.82266569, "learning_rate": 2.793166081304702e-06, "loss": 0.84468603, "num_input_tokens_seen": 69694130, "step": 3236, "time_per_iteration": 3.5566911697387695 }, { "auxiliary_loss_clip": 0.01148141, "auxiliary_loss_mlp": 0.01028924, "balance_loss_clip": 1.05361915, "balance_loss_mlp": 1.02027583, "epoch": 0.38922623699873743, "flos": 22893447893760.0, "grad_norm": 1.8504913574772437, "language_loss": 0.82660198, "learning_rate": 2.7924509260836543e-06, "loss": 0.8483727, "num_input_tokens_seen": 69713255, "step": 3237, "time_per_iteration": 3.6382946968078613 }, { "auxiliary_loss_clip": 0.01140974, "auxiliary_loss_mlp": 0.01030044, "balance_loss_clip": 1.05210042, "balance_loss_mlp": 1.02168131, "epoch": 0.38934647988937654, "flos": 19792418002560.0, "grad_norm": 1.4934456944761867, "language_loss": 0.68253011, "learning_rate": 2.791735650648431e-06, "loss": 0.70424032, "num_input_tokens_seen": 69732375, "step": 3238, "time_per_iteration": 3.568133592605591 }, { "auxiliary_loss_clip": 0.01157761, "auxiliary_loss_mlp": 0.01023936, "balance_loss_clip": 1.05589211, "balance_loss_mlp": 1.01547825, "epoch": 0.38946672278001565, "flos": 19202081978880.0, "grad_norm": 2.1372802661048547, "language_loss": 0.7428295, "learning_rate": 2.791020255107538e-06, "loss": 0.76464653, "num_input_tokens_seen": 69749745, "step": 3239, "time_per_iteration": 3.533369302749634 }, { "auxiliary_loss_clip": 0.01137332, "auxiliary_loss_mlp": 0.01027697, "balance_loss_clip": 1.04941058, "balance_loss_mlp": 1.0188576, "epoch": 0.3895869656706547, "flos": 24936477661440.0, "grad_norm": 1.7039495595500072, "language_loss": 0.80649102, "learning_rate": 2.7903047395695023e-06, "loss": 0.82814133, "num_input_tokens_seen": 69769645, "step": 3240, "time_per_iteration": 2.7621896266937256 }, { "auxiliary_loss_clip": 0.01175703, "auxiliary_loss_mlp": 0.0071264, "balance_loss_clip": 1.05961668, "balance_loss_mlp": 1.00063968, "epoch": 0.3897072085612938, "flos": 24133622820480.0, "grad_norm": 2.140488136563491, "language_loss": 0.90292841, "learning_rate": 2.789589104142865e-06, "loss": 0.92181182, "num_input_tokens_seen": 69787270, "step": 3241, "time_per_iteration": 2.6354072093963623 }, { "auxiliary_loss_clip": 0.01145903, "auxiliary_loss_mlp": 0.01027491, "balance_loss_clip": 1.05422866, "balance_loss_mlp": 1.01964688, "epoch": 0.3898274514519329, "flos": 17166342672000.0, "grad_norm": 11.639612282615087, "language_loss": 0.7667436, "learning_rate": 2.7888733489361895e-06, "loss": 0.78847754, "num_input_tokens_seen": 69805685, "step": 3242, "time_per_iteration": 2.6941494941711426 }, { "auxiliary_loss_clip": 0.01106464, "auxiliary_loss_mlp": 0.01007834, "balance_loss_clip": 1.03613234, "balance_loss_mlp": 1.00589132, "epoch": 0.389947694342572, "flos": 66074807952000.0, "grad_norm": 0.7340403266798425, "language_loss": 0.58733594, "learning_rate": 2.788157474058054e-06, "loss": 0.60847902, "num_input_tokens_seen": 69867960, "step": 3243, "time_per_iteration": 3.243403434753418 }, { "auxiliary_loss_clip": 0.01188609, "auxiliary_loss_mlp": 0.01031393, "balance_loss_clip": 1.05729699, "balance_loss_mlp": 1.02368629, "epoch": 0.3900679372332111, "flos": 25740912700800.0, "grad_norm": 1.6180514666913444, "language_loss": 0.70090324, "learning_rate": 2.7874414796170555e-06, "loss": 0.72310328, "num_input_tokens_seen": 69889450, "step": 3244, "time_per_iteration": 2.716701030731201 }, { "auxiliary_loss_clip": 0.01169696, "auxiliary_loss_mlp": 0.01033415, "balance_loss_clip": 1.05429482, "balance_loss_mlp": 1.02464747, "epoch": 0.3901881801238502, "flos": 11801611808640.0, "grad_norm": 3.6276545437627594, "language_loss": 0.8407985, "learning_rate": 2.7867253657218113e-06, "loss": 0.86282957, "num_input_tokens_seen": 69903340, "step": 3245, "time_per_iteration": 2.5909337997436523 }, { "auxiliary_loss_clip": 0.01157307, "auxiliary_loss_mlp": 0.00712325, "balance_loss_clip": 1.05311728, "balance_loss_mlp": 1.00057292, "epoch": 0.39030842301448926, "flos": 27308951994240.0, "grad_norm": 2.403905309014829, "language_loss": 0.73094529, "learning_rate": 2.7860091324809544e-06, "loss": 0.74964154, "num_input_tokens_seen": 69924400, "step": 3246, "time_per_iteration": 2.722156286239624 }, { "auxiliary_loss_clip": 0.0117394, "auxiliary_loss_mlp": 0.0102661, "balance_loss_clip": 1.0590204, "balance_loss_mlp": 1.01880193, "epoch": 0.39042866590512837, "flos": 27163334257920.0, "grad_norm": 1.8543387037021863, "language_loss": 0.80929297, "learning_rate": 2.7852927800031377e-06, "loss": 0.83129847, "num_input_tokens_seen": 69944565, "step": 3247, "time_per_iteration": 2.638279914855957 }, { "auxiliary_loss_clip": 0.01158242, "auxiliary_loss_mlp": 0.01025548, "balance_loss_clip": 1.05528831, "balance_loss_mlp": 1.01778769, "epoch": 0.3905489087957674, "flos": 29716115886720.0, "grad_norm": 11.8786138600118, "language_loss": 0.82806361, "learning_rate": 2.7845763083970298e-06, "loss": 0.84990144, "num_input_tokens_seen": 69964965, "step": 3248, "time_per_iteration": 2.773859977722168 }, { "auxiliary_loss_clip": 0.01166938, "auxiliary_loss_mlp": 0.01029399, "balance_loss_clip": 1.05371141, "balance_loss_mlp": 1.02049994, "epoch": 0.39066915168640653, "flos": 24498618871680.0, "grad_norm": 1.9288483093231459, "language_loss": 0.82141626, "learning_rate": 2.7838597177713205e-06, "loss": 0.84337962, "num_input_tokens_seen": 69986055, "step": 3249, "time_per_iteration": 2.685803174972534 }, { "auxiliary_loss_clip": 0.01104616, "auxiliary_loss_mlp": 0.01032457, "balance_loss_clip": 1.05371702, "balance_loss_mlp": 1.02398133, "epoch": 0.39078939457704565, "flos": 20558572122240.0, "grad_norm": 1.8257817376054744, "language_loss": 0.73687893, "learning_rate": 2.7831430082347143e-06, "loss": 0.75824964, "num_input_tokens_seen": 70005260, "step": 3250, "time_per_iteration": 2.7459940910339355 }, { "auxiliary_loss_clip": 0.01178042, "auxiliary_loss_mlp": 0.00711775, "balance_loss_clip": 1.05821085, "balance_loss_mlp": 1.00068951, "epoch": 0.3909096374676847, "flos": 22783417557120.0, "grad_norm": 5.358230512447936, "language_loss": 0.82339084, "learning_rate": 2.7824261798959373e-06, "loss": 0.84228903, "num_input_tokens_seen": 70023440, "step": 3251, "time_per_iteration": 2.65285587310791 }, { "auxiliary_loss_clip": 0.01159615, "auxiliary_loss_mlp": 0.01040244, "balance_loss_clip": 1.05227268, "balance_loss_mlp": 1.03221583, "epoch": 0.3910298803583238, "flos": 23003119094400.0, "grad_norm": 1.809458255628944, "language_loss": 0.79634368, "learning_rate": 2.78170923286373e-06, "loss": 0.81834227, "num_input_tokens_seen": 70043040, "step": 3252, "time_per_iteration": 2.6414308547973633 }, { "auxiliary_loss_clip": 0.01091153, "auxiliary_loss_mlp": 0.01035694, "balance_loss_clip": 1.05009973, "balance_loss_mlp": 1.02661586, "epoch": 0.3911501232489629, "flos": 24316264500480.0, "grad_norm": 2.376337305967102, "language_loss": 0.84144253, "learning_rate": 2.780992167246854e-06, "loss": 0.86271101, "num_input_tokens_seen": 70060565, "step": 3253, "time_per_iteration": 2.8090474605560303 }, { "auxiliary_loss_clip": 0.01082902, "auxiliary_loss_mlp": 0.01004189, "balance_loss_clip": 1.03515148, "balance_loss_mlp": 1.00228143, "epoch": 0.391270366139602, "flos": 60869054684160.0, "grad_norm": 0.976453779176755, "language_loss": 0.7214874, "learning_rate": 2.7802749831540883e-06, "loss": 0.74235833, "num_input_tokens_seen": 70119465, "step": 3254, "time_per_iteration": 3.24760365486145 }, { "auxiliary_loss_clip": 0.01130697, "auxiliary_loss_mlp": 0.01031693, "balance_loss_clip": 1.05127668, "balance_loss_mlp": 1.02444255, "epoch": 0.3913906090302411, "flos": 21543494025600.0, "grad_norm": 2.065228376530988, "language_loss": 0.81413531, "learning_rate": 2.7795576806942268e-06, "loss": 0.83575916, "num_input_tokens_seen": 70138270, "step": 3255, "time_per_iteration": 2.696350336074829 }, { "auxiliary_loss_clip": 0.01090277, "auxiliary_loss_mlp": 0.0100234, "balance_loss_clip": 1.05427384, "balance_loss_mlp": 1.00050449, "epoch": 0.3915108519208802, "flos": 49839953702400.0, "grad_norm": 0.7593447437290978, "language_loss": 0.54911137, "learning_rate": 2.778840259976085e-06, "loss": 0.57003748, "num_input_tokens_seen": 70193500, "step": 3256, "time_per_iteration": 3.1889491081237793 }, { "auxiliary_loss_clip": 0.01179696, "auxiliary_loss_mlp": 0.01028556, "balance_loss_clip": 1.05908597, "balance_loss_mlp": 1.02026451, "epoch": 0.39163109481151925, "flos": 16506447960960.0, "grad_norm": 1.969275170421358, "language_loss": 0.77211684, "learning_rate": 2.778122721108495e-06, "loss": 0.79419935, "num_input_tokens_seen": 70211730, "step": 3257, "time_per_iteration": 2.6282801628112793 }, { "auxiliary_loss_clip": 0.01170262, "auxiliary_loss_mlp": 0.01039189, "balance_loss_clip": 1.05563593, "balance_loss_mlp": 1.03092206, "epoch": 0.39175133770215836, "flos": 26067484177920.0, "grad_norm": 8.404244527769958, "language_loss": 0.88331956, "learning_rate": 2.7774050642003076e-06, "loss": 0.9054141, "num_input_tokens_seen": 70232540, "step": 3258, "time_per_iteration": 2.6701056957244873 }, { "auxiliary_loss_clip": 0.01196643, "auxiliary_loss_mlp": 0.01032044, "balance_loss_clip": 1.0603081, "balance_loss_mlp": 1.02351475, "epoch": 0.3918715805927975, "flos": 21872076664320.0, "grad_norm": 3.4536240917339933, "language_loss": 0.93175125, "learning_rate": 2.7766872893603896e-06, "loss": 0.95403808, "num_input_tokens_seen": 70252515, "step": 3259, "time_per_iteration": 2.633366346359253 }, { "auxiliary_loss_clip": 0.01174758, "auxiliary_loss_mlp": 0.01028137, "balance_loss_clip": 1.05563807, "balance_loss_mlp": 1.02029324, "epoch": 0.39199182348343653, "flos": 20376181837440.0, "grad_norm": 2.808680157415979, "language_loss": 0.73362446, "learning_rate": 2.7759693966976275e-06, "loss": 0.75565338, "num_input_tokens_seen": 70271020, "step": 3260, "time_per_iteration": 2.6211040019989014 }, { "auxiliary_loss_clip": 0.01141262, "auxiliary_loss_mlp": 0.01027546, "balance_loss_clip": 1.05300391, "balance_loss_mlp": 1.01901078, "epoch": 0.39211206637407564, "flos": 21683545153920.0, "grad_norm": 2.36378699659736, "language_loss": 0.85410845, "learning_rate": 2.7752513863209242e-06, "loss": 0.87579656, "num_input_tokens_seen": 70289600, "step": 3261, "time_per_iteration": 2.649378776550293 }, { "auxiliary_loss_clip": 0.01156762, "auxiliary_loss_mlp": 0.00711459, "balance_loss_clip": 1.05767763, "balance_loss_mlp": 1.00047445, "epoch": 0.39223230926471475, "flos": 21066276908160.0, "grad_norm": 1.7944685992908191, "language_loss": 0.84467542, "learning_rate": 2.774533258339203e-06, "loss": 0.86335766, "num_input_tokens_seen": 70307060, "step": 3262, "time_per_iteration": 3.5531349182128906 }, { "auxiliary_loss_clip": 0.01124641, "auxiliary_loss_mlp": 0.01032936, "balance_loss_clip": 1.04583764, "balance_loss_mlp": 1.02460921, "epoch": 0.3923525521553538, "flos": 17603016312960.0, "grad_norm": 2.559791728862209, "language_loss": 0.80120873, "learning_rate": 2.7738150128614014e-06, "loss": 0.82278448, "num_input_tokens_seen": 70324465, "step": 3263, "time_per_iteration": 3.6104471683502197 }, { "auxiliary_loss_clip": 0.01131773, "auxiliary_loss_mlp": 0.01034648, "balance_loss_clip": 1.05276155, "balance_loss_mlp": 1.02641654, "epoch": 0.3924727950459929, "flos": 20558284813440.0, "grad_norm": 1.9327058490134952, "language_loss": 0.90069056, "learning_rate": 2.7730966499964777e-06, "loss": 0.92235476, "num_input_tokens_seen": 70341415, "step": 3264, "time_per_iteration": 3.5881638526916504 }, { "auxiliary_loss_clip": 0.01192151, "auxiliary_loss_mlp": 0.01031954, "balance_loss_clip": 1.05711746, "balance_loss_mlp": 1.0233531, "epoch": 0.39259303793663197, "flos": 16216110328320.0, "grad_norm": 2.7697347571587105, "language_loss": 0.80833507, "learning_rate": 2.772378169853408e-06, "loss": 0.83057612, "num_input_tokens_seen": 70358985, "step": 3265, "time_per_iteration": 3.5268101692199707 }, { "auxiliary_loss_clip": 0.01147214, "auxiliary_loss_mlp": 0.01035653, "balance_loss_clip": 1.05756021, "balance_loss_mlp": 1.02742136, "epoch": 0.3927132808272711, "flos": 16797001075200.0, "grad_norm": 2.037513422677225, "language_loss": 0.74315685, "learning_rate": 2.771659572541183e-06, "loss": 0.76498556, "num_input_tokens_seen": 70376915, "step": 3266, "time_per_iteration": 2.679615020751953 }, { "auxiliary_loss_clip": 0.01180134, "auxiliary_loss_mlp": 0.01031159, "balance_loss_clip": 1.05988359, "balance_loss_mlp": 1.02384579, "epoch": 0.3928335237179102, "flos": 20267228908800.0, "grad_norm": 2.6610098139108227, "language_loss": 0.8703683, "learning_rate": 2.7709408581688143e-06, "loss": 0.89248121, "num_input_tokens_seen": 70396900, "step": 3267, "time_per_iteration": 2.6160061359405518 }, { "auxiliary_loss_clip": 0.0114799, "auxiliary_loss_mlp": 0.01034687, "balance_loss_clip": 1.05440497, "balance_loss_mlp": 1.02689719, "epoch": 0.39295376660854925, "flos": 24973250209920.0, "grad_norm": 1.74411026886662, "language_loss": 0.88027191, "learning_rate": 2.7702220268453307e-06, "loss": 0.90209866, "num_input_tokens_seen": 70417260, "step": 3268, "time_per_iteration": 2.683347225189209 }, { "auxiliary_loss_clip": 0.01161292, "auxiliary_loss_mlp": 0.01028157, "balance_loss_clip": 1.05388236, "balance_loss_mlp": 1.02008653, "epoch": 0.39307400949918836, "flos": 18697788984960.0, "grad_norm": 2.504067859683392, "language_loss": 0.84870613, "learning_rate": 2.7695030786797785e-06, "loss": 0.87060064, "num_input_tokens_seen": 70433155, "step": 3269, "time_per_iteration": 2.6170389652252197 }, { "auxiliary_loss_clip": 0.01121811, "auxiliary_loss_mlp": 0.01028037, "balance_loss_clip": 1.05054533, "balance_loss_mlp": 1.02031851, "epoch": 0.39319425238982747, "flos": 22415476590720.0, "grad_norm": 9.020119278939674, "language_loss": 0.74374872, "learning_rate": 2.7687840137812206e-06, "loss": 0.76524723, "num_input_tokens_seen": 70451240, "step": 3270, "time_per_iteration": 2.761094570159912 }, { "auxiliary_loss_clip": 0.01082392, "auxiliary_loss_mlp": 0.01003121, "balance_loss_clip": 1.02952838, "balance_loss_mlp": 1.00144005, "epoch": 0.3933144952804665, "flos": 66192954762240.0, "grad_norm": 0.8024554541735851, "language_loss": 0.62074006, "learning_rate": 2.7680648322587395e-06, "loss": 0.64159524, "num_input_tokens_seen": 70516115, "step": 3271, "time_per_iteration": 3.22892427444458 }, { "auxiliary_loss_clip": 0.01189738, "auxiliary_loss_mlp": 0.01032523, "balance_loss_clip": 1.05760658, "balance_loss_mlp": 1.02435756, "epoch": 0.39343473817110564, "flos": 15487159720320.0, "grad_norm": 2.193615581562159, "language_loss": 0.81003129, "learning_rate": 2.7673455342214334e-06, "loss": 0.83225387, "num_input_tokens_seen": 70533105, "step": 3272, "time_per_iteration": 2.5893964767456055 }, { "auxiliary_loss_clip": 0.01176566, "auxiliary_loss_mlp": 0.01026869, "balance_loss_clip": 1.05765665, "balance_loss_mlp": 1.0187571, "epoch": 0.39355498106174475, "flos": 21324905809920.0, "grad_norm": 8.878717534167116, "language_loss": 0.76045847, "learning_rate": 2.7666261197784198e-06, "loss": 0.78249288, "num_input_tokens_seen": 70551920, "step": 3273, "time_per_iteration": 2.5993857383728027 }, { "auxiliary_loss_clip": 0.01154367, "auxiliary_loss_mlp": 0.0102505, "balance_loss_clip": 1.05640984, "balance_loss_mlp": 1.0172832, "epoch": 0.3936752239523838, "flos": 13296357400320.0, "grad_norm": 2.3391051891160846, "language_loss": 0.76909292, "learning_rate": 2.7659065890388336e-06, "loss": 0.79088706, "num_input_tokens_seen": 70567920, "step": 3274, "time_per_iteration": 2.6719133853912354 }, { "auxiliary_loss_clip": 0.01157593, "auxiliary_loss_mlp": 0.01027611, "balance_loss_clip": 1.05330586, "balance_loss_mlp": 1.0193435, "epoch": 0.3937954668430229, "flos": 16800161472000.0, "grad_norm": 2.1083766942306013, "language_loss": 0.84877479, "learning_rate": 2.7651869421118266e-06, "loss": 0.87062681, "num_input_tokens_seen": 70584530, "step": 3275, "time_per_iteration": 2.604069471359253 }, { "auxiliary_loss_clip": 0.01178235, "auxiliary_loss_mlp": 0.010282, "balance_loss_clip": 1.05913186, "balance_loss_mlp": 1.02014744, "epoch": 0.393915709733662, "flos": 21064229832960.0, "grad_norm": 1.9295377754704415, "language_loss": 0.82989204, "learning_rate": 2.76446717910657e-06, "loss": 0.85195649, "num_input_tokens_seen": 70605235, "step": 3276, "time_per_iteration": 2.656005620956421 }, { "auxiliary_loss_clip": 0.01170816, "auxiliary_loss_mlp": 0.0103136, "balance_loss_clip": 1.05578208, "balance_loss_mlp": 1.02325344, "epoch": 0.3940359526243011, "flos": 17165265264000.0, "grad_norm": 2.1594891660018973, "language_loss": 0.76420397, "learning_rate": 2.763747300132249e-06, "loss": 0.78622574, "num_input_tokens_seen": 70622675, "step": 3277, "time_per_iteration": 2.592308521270752 }, { "auxiliary_loss_clip": 0.01191848, "auxiliary_loss_mlp": 0.01026625, "balance_loss_clip": 1.06034219, "balance_loss_mlp": 1.01875114, "epoch": 0.3941561955149402, "flos": 20995856294400.0, "grad_norm": 2.1507122482278045, "language_loss": 0.86173052, "learning_rate": 2.7630273052980704e-06, "loss": 0.88391525, "num_input_tokens_seen": 70643265, "step": 3278, "time_per_iteration": 2.6423685550689697 }, { "auxiliary_loss_clip": 0.01147391, "auxiliary_loss_mlp": 0.01031421, "balance_loss_clip": 1.05338705, "balance_loss_mlp": 1.02348137, "epoch": 0.39427643840557924, "flos": 18843406721280.0, "grad_norm": 2.1394778107224246, "language_loss": 0.67480755, "learning_rate": 2.7623071947132554e-06, "loss": 0.69659573, "num_input_tokens_seen": 70660295, "step": 3279, "time_per_iteration": 2.6647214889526367 }, { "auxiliary_loss_clip": 0.01162602, "auxiliary_loss_mlp": 0.01026318, "balance_loss_clip": 1.05328631, "balance_loss_mlp": 1.01806319, "epoch": 0.39439668129621835, "flos": 23258659426560.0, "grad_norm": 4.050192713550704, "language_loss": 0.78526103, "learning_rate": 2.7615869684870458e-06, "loss": 0.80715024, "num_input_tokens_seen": 70679605, "step": 3280, "time_per_iteration": 2.6659181118011475 }, { "auxiliary_loss_clip": 0.01174081, "auxiliary_loss_mlp": 0.01026864, "balance_loss_clip": 1.05886674, "balance_loss_mlp": 1.018785, "epoch": 0.39451692418685746, "flos": 26652289507200.0, "grad_norm": 1.8444199605621034, "language_loss": 0.84512579, "learning_rate": 2.7608666267286986e-06, "loss": 0.86713523, "num_input_tokens_seen": 70699835, "step": 3281, "time_per_iteration": 2.720146656036377 }, { "auxiliary_loss_clip": 0.01102548, "auxiliary_loss_mlp": 0.01036692, "balance_loss_clip": 1.04604459, "balance_loss_mlp": 1.02816856, "epoch": 0.3946371670774965, "flos": 18258709132800.0, "grad_norm": 2.5528809418356406, "language_loss": 0.86843681, "learning_rate": 2.760146169547489e-06, "loss": 0.88982922, "num_input_tokens_seen": 70716600, "step": 3282, "time_per_iteration": 2.750408172607422 }, { "auxiliary_loss_clip": 0.01162403, "auxiliary_loss_mlp": 0.01026111, "balance_loss_clip": 1.05780005, "balance_loss_mlp": 1.01786208, "epoch": 0.39475740996813563, "flos": 24206126423040.0, "grad_norm": 1.7115047710611462, "language_loss": 0.7645613, "learning_rate": 2.75942559705271e-06, "loss": 0.78644645, "num_input_tokens_seen": 70736335, "step": 3283, "time_per_iteration": 2.7106096744537354 }, { "auxiliary_loss_clip": 0.01171555, "auxiliary_loss_mlp": 0.01032393, "balance_loss_clip": 1.0552361, "balance_loss_mlp": 1.02363718, "epoch": 0.39487765285877474, "flos": 19317858491520.0, "grad_norm": 5.2219978927008075, "language_loss": 0.89403182, "learning_rate": 2.7587049093536713e-06, "loss": 0.9160713, "num_input_tokens_seen": 70752665, "step": 3284, "time_per_iteration": 2.736431837081909 }, { "auxiliary_loss_clip": 0.01175353, "auxiliary_loss_mlp": 0.01029677, "balance_loss_clip": 1.05424917, "balance_loss_mlp": 1.02171957, "epoch": 0.3949978957494138, "flos": 17311744926720.0, "grad_norm": 1.971993479057338, "language_loss": 0.80432361, "learning_rate": 2.757984106559701e-06, "loss": 0.82637393, "num_input_tokens_seen": 70771650, "step": 3285, "time_per_iteration": 2.6980364322662354 }, { "auxiliary_loss_clip": 0.0115011, "auxiliary_loss_mlp": 0.01033449, "balance_loss_clip": 1.05443561, "balance_loss_mlp": 1.02509236, "epoch": 0.3951181386400529, "flos": 36317861280000.0, "grad_norm": 2.41799140846852, "language_loss": 0.71243304, "learning_rate": 2.7572631887801446e-06, "loss": 0.73426867, "num_input_tokens_seen": 70793275, "step": 3286, "time_per_iteration": 2.8443336486816406 }, { "auxiliary_loss_clip": 0.01174227, "auxiliary_loss_mlp": 0.01025898, "balance_loss_clip": 1.05733347, "balance_loss_mlp": 1.01768994, "epoch": 0.395238381530692, "flos": 23110348170240.0, "grad_norm": 1.679170526344795, "language_loss": 0.76404518, "learning_rate": 2.7565421561243654e-06, "loss": 0.78604645, "num_input_tokens_seen": 70811440, "step": 3287, "time_per_iteration": 2.6618943214416504 }, { "auxiliary_loss_clip": 0.01136463, "auxiliary_loss_mlp": 0.01031625, "balance_loss_clip": 1.05198407, "balance_loss_mlp": 1.02348304, "epoch": 0.3953586244213311, "flos": 24347614095360.0, "grad_norm": 6.867656734554428, "language_loss": 0.81840849, "learning_rate": 2.7558210087017413e-06, "loss": 0.84008938, "num_input_tokens_seen": 70831375, "step": 3288, "time_per_iteration": 3.6330127716064453 }, { "auxiliary_loss_clip": 0.01137933, "auxiliary_loss_mlp": 0.01032718, "balance_loss_clip": 1.0554111, "balance_loss_mlp": 1.02450418, "epoch": 0.3954788673119702, "flos": 23440080044160.0, "grad_norm": 1.7949450618380391, "language_loss": 0.73159528, "learning_rate": 2.7550997466216724e-06, "loss": 0.75330174, "num_input_tokens_seen": 70849170, "step": 3289, "time_per_iteration": 3.581728219985962 }, { "auxiliary_loss_clip": 0.01156333, "auxiliary_loss_mlp": 0.01033139, "balance_loss_clip": 1.05705523, "balance_loss_mlp": 1.02518153, "epoch": 0.3955991102026093, "flos": 17494063384320.0, "grad_norm": 1.953481296310458, "language_loss": 0.81151581, "learning_rate": 2.7543783699935714e-06, "loss": 0.8334105, "num_input_tokens_seen": 70867200, "step": 3290, "time_per_iteration": 2.77272367477417 }, { "auxiliary_loss_clip": 0.01172603, "auxiliary_loss_mlp": 0.01030599, "balance_loss_clip": 1.05835724, "balance_loss_mlp": 1.02192092, "epoch": 0.39571935309324835, "flos": 18221326053120.0, "grad_norm": 3.7516438292846916, "language_loss": 0.85907423, "learning_rate": 2.753656878926872e-06, "loss": 0.88110626, "num_input_tokens_seen": 70883080, "step": 3291, "time_per_iteration": 3.7437636852264404 }, { "auxiliary_loss_clip": 0.01147986, "auxiliary_loss_mlp": 0.0102878, "balance_loss_clip": 1.05245423, "balance_loss_mlp": 1.02082872, "epoch": 0.39583959598388746, "flos": 17748813617280.0, "grad_norm": 1.8276680613311316, "language_loss": 0.74109173, "learning_rate": 2.752935273531023e-06, "loss": 0.7628594, "num_input_tokens_seen": 70901230, "step": 3292, "time_per_iteration": 2.673067569732666 }, { "auxiliary_loss_clip": 0.01175939, "auxiliary_loss_mlp": 0.01025948, "balance_loss_clip": 1.05687582, "balance_loss_mlp": 1.01714468, "epoch": 0.39595983887452657, "flos": 19352368483200.0, "grad_norm": 2.154645431812841, "language_loss": 0.78519827, "learning_rate": 2.752213553915492e-06, "loss": 0.80721712, "num_input_tokens_seen": 70919585, "step": 3293, "time_per_iteration": 2.6374073028564453 }, { "auxiliary_loss_clip": 0.010759, "auxiliary_loss_mlp": 0.01004893, "balance_loss_clip": 1.03517556, "balance_loss_mlp": 1.00302148, "epoch": 0.3960800817651656, "flos": 60682282940160.0, "grad_norm": 0.81295610428401, "language_loss": 0.66066295, "learning_rate": 2.751491720189762e-06, "loss": 0.68147087, "num_input_tokens_seen": 70977695, "step": 3294, "time_per_iteration": 3.1876025199890137 }, { "auxiliary_loss_clip": 0.01160286, "auxiliary_loss_mlp": 0.00711813, "balance_loss_clip": 1.0572226, "balance_loss_mlp": 1.00060034, "epoch": 0.39620032465580474, "flos": 16836718538880.0, "grad_norm": 2.1521059429962, "language_loss": 0.92240679, "learning_rate": 2.7507697724633364e-06, "loss": 0.94112778, "num_input_tokens_seen": 70994455, "step": 3295, "time_per_iteration": 2.6824519634246826 }, { "auxiliary_loss_clip": 0.01068823, "auxiliary_loss_mlp": 0.01005698, "balance_loss_clip": 1.04560411, "balance_loss_mlp": 1.00389767, "epoch": 0.3963205675464438, "flos": 69071445941760.0, "grad_norm": 0.7785185070900764, "language_loss": 0.54744744, "learning_rate": 2.7500477108457327e-06, "loss": 0.5681926, "num_input_tokens_seen": 71046465, "step": 3296, "time_per_iteration": 3.062999725341797 }, { "auxiliary_loss_clip": 0.01175442, "auxiliary_loss_mlp": 0.01027462, "balance_loss_clip": 1.05836439, "balance_loss_mlp": 1.01899803, "epoch": 0.3964408104370829, "flos": 25667439431040.0, "grad_norm": 1.7746094291290029, "language_loss": 0.80762362, "learning_rate": 2.7493255354464877e-06, "loss": 0.82965267, "num_input_tokens_seen": 71064275, "step": 3297, "time_per_iteration": 2.6819539070129395 }, { "auxiliary_loss_clip": 0.01050244, "auxiliary_loss_mlp": 0.01031378, "balance_loss_clip": 1.04362726, "balance_loss_mlp": 1.02353942, "epoch": 0.396561053327722, "flos": 24277480790400.0, "grad_norm": 2.2145254177542792, "language_loss": 0.76208687, "learning_rate": 2.748603246375156e-06, "loss": 0.78290308, "num_input_tokens_seen": 71082290, "step": 3298, "time_per_iteration": 3.01218581199646 }, { "auxiliary_loss_clip": 0.01192006, "auxiliary_loss_mlp": 0.01035801, "balance_loss_clip": 1.05960846, "balance_loss_mlp": 1.02802896, "epoch": 0.39668129621836107, "flos": 20522302364160.0, "grad_norm": 2.3066933036185926, "language_loss": 0.70490611, "learning_rate": 2.7478808437413055e-06, "loss": 0.72718424, "num_input_tokens_seen": 71101700, "step": 3299, "time_per_iteration": 3.018944025039673 }, { "auxiliary_loss_clip": 0.01127563, "auxiliary_loss_mlp": 0.0102908, "balance_loss_clip": 1.05525279, "balance_loss_mlp": 1.02083039, "epoch": 0.3968015391090002, "flos": 27052585649280.0, "grad_norm": 2.2410480361505516, "language_loss": 0.66066706, "learning_rate": 2.7471583276545263e-06, "loss": 0.68223351, "num_input_tokens_seen": 71122360, "step": 3300, "time_per_iteration": 2.812809467315674 }, { "auxiliary_loss_clip": 0.01159253, "auxiliary_loss_mlp": 0.01027516, "balance_loss_clip": 1.0548892, "balance_loss_mlp": 1.01911759, "epoch": 0.3969217819996393, "flos": 12531819392640.0, "grad_norm": 2.011949532779798, "language_loss": 0.70168889, "learning_rate": 2.7464356982244224e-06, "loss": 0.72355658, "num_input_tokens_seen": 71140360, "step": 3301, "time_per_iteration": 2.6420998573303223 }, { "auxiliary_loss_clip": 0.01100819, "auxiliary_loss_mlp": 0.01002751, "balance_loss_clip": 1.05122828, "balance_loss_mlp": 1.00104678, "epoch": 0.39704202489027834, "flos": 66241399230720.0, "grad_norm": 0.7763242820773675, "language_loss": 0.61689866, "learning_rate": 2.745712955560617e-06, "loss": 0.63793445, "num_input_tokens_seen": 71196565, "step": 3302, "time_per_iteration": 3.1672160625457764 }, { "auxiliary_loss_clip": 0.0111079, "auxiliary_loss_mlp": 0.01029571, "balance_loss_clip": 1.05210567, "balance_loss_mlp": 1.02101767, "epoch": 0.39716226778091746, "flos": 16982982720000.0, "grad_norm": 2.071793308225696, "language_loss": 0.76830715, "learning_rate": 2.7449900997727496e-06, "loss": 0.78971076, "num_input_tokens_seen": 71214675, "step": 3303, "time_per_iteration": 2.8116166591644287 }, { "auxiliary_loss_clip": 0.01156533, "auxiliary_loss_mlp": 0.0103346, "balance_loss_clip": 1.05689538, "balance_loss_mlp": 1.02553296, "epoch": 0.39728251067155657, "flos": 23477139901440.0, "grad_norm": 2.0781347847851723, "language_loss": 0.84341842, "learning_rate": 2.744267130970476e-06, "loss": 0.8653183, "num_input_tokens_seen": 71234400, "step": 3304, "time_per_iteration": 2.699995994567871 }, { "auxiliary_loss_clip": 0.01153358, "auxiliary_loss_mlp": 0.01031565, "balance_loss_clip": 1.05650353, "balance_loss_mlp": 1.02327383, "epoch": 0.3974027535621956, "flos": 20704441253760.0, "grad_norm": 2.6628128997442064, "language_loss": 0.77214479, "learning_rate": 2.7435440492634697e-06, "loss": 0.79399407, "num_input_tokens_seen": 71253725, "step": 3305, "time_per_iteration": 2.7226414680480957 }, { "auxiliary_loss_clip": 0.01159683, "auxiliary_loss_mlp": 0.01033257, "balance_loss_clip": 1.05704856, "balance_loss_mlp": 1.02388716, "epoch": 0.39752299645283473, "flos": 21543278544000.0, "grad_norm": 2.0320543791547436, "language_loss": 0.67789215, "learning_rate": 2.7428208547614228e-06, "loss": 0.69982159, "num_input_tokens_seen": 71273220, "step": 3306, "time_per_iteration": 2.671464681625366 }, { "auxiliary_loss_clip": 0.01173878, "auxiliary_loss_mlp": 0.01030538, "balance_loss_clip": 1.0583446, "balance_loss_mlp": 1.02178836, "epoch": 0.39764323934347384, "flos": 19208295031680.0, "grad_norm": 4.122127251373048, "language_loss": 0.77495134, "learning_rate": 2.742097547574043e-06, "loss": 0.79699546, "num_input_tokens_seen": 71291445, "step": 3307, "time_per_iteration": 2.6254777908325195 }, { "auxiliary_loss_clip": 0.01165232, "auxiliary_loss_mlp": 0.00712098, "balance_loss_clip": 1.05467176, "balance_loss_mlp": 1.00056911, "epoch": 0.3977634822341129, "flos": 20850202644480.0, "grad_norm": 1.7668081985125286, "language_loss": 0.77711582, "learning_rate": 2.7413741278110544e-06, "loss": 0.79588914, "num_input_tokens_seen": 71310135, "step": 3308, "time_per_iteration": 2.679715394973755 }, { "auxiliary_loss_clip": 0.01162684, "auxiliary_loss_mlp": 0.01030369, "balance_loss_clip": 1.05712354, "balance_loss_mlp": 1.02139854, "epoch": 0.397883725124752, "flos": 39786042038400.0, "grad_norm": 29.556647464643223, "language_loss": 0.69004464, "learning_rate": 2.7406505955822016e-06, "loss": 0.71197522, "num_input_tokens_seen": 71331160, "step": 3309, "time_per_iteration": 2.8441452980041504 }, { "auxiliary_loss_clip": 0.01156292, "auxiliary_loss_mlp": 0.01034102, "balance_loss_clip": 1.05323851, "balance_loss_mlp": 1.02596283, "epoch": 0.39800396801539106, "flos": 17379507934080.0, "grad_norm": 2.361328402114889, "language_loss": 0.66167092, "learning_rate": 2.7399269509972415e-06, "loss": 0.6835748, "num_input_tokens_seen": 71345315, "step": 3310, "time_per_iteration": 2.6094858646392822 }, { "auxiliary_loss_clip": 0.01153211, "auxiliary_loss_mlp": 0.01025762, "balance_loss_clip": 1.05141592, "balance_loss_mlp": 1.01668406, "epoch": 0.3981242109060302, "flos": 19202764337280.0, "grad_norm": 2.289035140646171, "language_loss": 0.85442543, "learning_rate": 2.7392031941659514e-06, "loss": 0.87621522, "num_input_tokens_seen": 71363160, "step": 3311, "time_per_iteration": 2.677921772003174 }, { "auxiliary_loss_clip": 0.01160019, "auxiliary_loss_mlp": 0.01030644, "balance_loss_clip": 1.05947423, "balance_loss_mlp": 1.02267528, "epoch": 0.3982444537966693, "flos": 24565124903040.0, "grad_norm": 1.9097685386555592, "language_loss": 0.86099875, "learning_rate": 2.7384793251981244e-06, "loss": 0.88290536, "num_input_tokens_seen": 71382145, "step": 3312, "time_per_iteration": 2.709534168243408 }, { "auxiliary_loss_clip": 0.01181585, "auxiliary_loss_mlp": 0.01031797, "balance_loss_clip": 1.05751419, "balance_loss_mlp": 1.0234704, "epoch": 0.39836469668730834, "flos": 26213856099840.0, "grad_norm": 2.3463856425699796, "language_loss": 0.80933368, "learning_rate": 2.737755344203571e-06, "loss": 0.83146751, "num_input_tokens_seen": 71402095, "step": 3313, "time_per_iteration": 4.03944206237793 }, { "auxiliary_loss_clip": 0.01181252, "auxiliary_loss_mlp": 0.01030363, "balance_loss_clip": 1.06181073, "balance_loss_mlp": 1.02208376, "epoch": 0.39848493957794745, "flos": 27636134002560.0, "grad_norm": 1.6791579729607375, "language_loss": 0.79665041, "learning_rate": 2.7370312512921186e-06, "loss": 0.81876653, "num_input_tokens_seen": 71423875, "step": 3314, "time_per_iteration": 3.716949462890625 }, { "auxiliary_loss_clip": 0.01160312, "auxiliary_loss_mlp": 0.01024704, "balance_loss_clip": 1.05361462, "balance_loss_mlp": 1.01666355, "epoch": 0.39860518246858656, "flos": 12239326944000.0, "grad_norm": 5.936668142119486, "language_loss": 0.77004808, "learning_rate": 2.736307046573611e-06, "loss": 0.79189825, "num_input_tokens_seen": 71439745, "step": 3315, "time_per_iteration": 2.5982744693756104 }, { "auxiliary_loss_clip": 0.01189225, "auxiliary_loss_mlp": 0.01029735, "balance_loss_clip": 1.05807543, "balance_loss_mlp": 1.02137864, "epoch": 0.3987254253592256, "flos": 22379135005440.0, "grad_norm": 3.961186430396137, "language_loss": 0.81534576, "learning_rate": 2.73558273015791e-06, "loss": 0.83753538, "num_input_tokens_seen": 71459575, "step": 3316, "time_per_iteration": 4.472063779830933 }, { "auxiliary_loss_clip": 0.01195966, "auxiliary_loss_mlp": 0.01028682, "balance_loss_clip": 1.06176829, "balance_loss_mlp": 1.0199914, "epoch": 0.3988456682498647, "flos": 23514020190720.0, "grad_norm": 2.4856296671173337, "language_loss": 0.69966906, "learning_rate": 2.734858302154894e-06, "loss": 0.72191554, "num_input_tokens_seen": 71481075, "step": 3317, "time_per_iteration": 2.6465446949005127 }, { "auxiliary_loss_clip": 0.0115529, "auxiliary_loss_mlp": 0.01036612, "balance_loss_clip": 1.0554111, "balance_loss_mlp": 1.02835119, "epoch": 0.39896591114050384, "flos": 19208761908480.0, "grad_norm": 2.096080085610661, "language_loss": 0.76276553, "learning_rate": 2.734133762674457e-06, "loss": 0.78468454, "num_input_tokens_seen": 71500665, "step": 3318, "time_per_iteration": 2.6149704456329346 }, { "auxiliary_loss_clip": 0.01157673, "auxiliary_loss_mlp": 0.01029841, "balance_loss_clip": 1.05367684, "balance_loss_mlp": 1.02101958, "epoch": 0.3990861540311429, "flos": 28401031146240.0, "grad_norm": 1.934067049649986, "language_loss": 0.70827246, "learning_rate": 2.7334091118265124e-06, "loss": 0.7301476, "num_input_tokens_seen": 71522560, "step": 3319, "time_per_iteration": 2.7089695930480957 }, { "auxiliary_loss_clip": 0.01096677, "auxiliary_loss_mlp": 0.01013845, "balance_loss_clip": 1.03860474, "balance_loss_mlp": 1.01212871, "epoch": 0.399206396921782, "flos": 61758563086080.0, "grad_norm": 0.6771933109731914, "language_loss": 0.57819438, "learning_rate": 2.732684349720989e-06, "loss": 0.59929967, "num_input_tokens_seen": 71590520, "step": 3320, "time_per_iteration": 3.2601184844970703 }, { "auxiliary_loss_clip": 0.01143466, "auxiliary_loss_mlp": 0.01026813, "balance_loss_clip": 1.05164886, "balance_loss_mlp": 1.01873422, "epoch": 0.3993266398124211, "flos": 28074567409920.0, "grad_norm": 1.7430988515607901, "language_loss": 0.75546396, "learning_rate": 2.7319594764678318e-06, "loss": 0.77716672, "num_input_tokens_seen": 71612620, "step": 3321, "time_per_iteration": 2.736701250076294 }, { "auxiliary_loss_clip": 0.01127677, "auxiliary_loss_mlp": 0.01030011, "balance_loss_clip": 1.05244911, "balance_loss_mlp": 1.02142775, "epoch": 0.39944688270306017, "flos": 23225083188480.0, "grad_norm": 1.8089036403187946, "language_loss": 0.83175766, "learning_rate": 2.7312344921770044e-06, "loss": 0.85333449, "num_input_tokens_seen": 71634320, "step": 3322, "time_per_iteration": 2.7513175010681152 }, { "auxiliary_loss_clip": 0.01153586, "auxiliary_loss_mlp": 0.01027548, "balance_loss_clip": 1.04961979, "balance_loss_mlp": 1.01922131, "epoch": 0.3995671255936993, "flos": 19390433921280.0, "grad_norm": 1.9953265126285733, "language_loss": 0.78674299, "learning_rate": 2.7305093969584857e-06, "loss": 0.80855435, "num_input_tokens_seen": 71653145, "step": 3323, "time_per_iteration": 2.7078957557678223 }, { "auxiliary_loss_clip": 0.01167282, "auxiliary_loss_mlp": 0.01023587, "balance_loss_clip": 1.05421233, "balance_loss_mlp": 1.01555228, "epoch": 0.3996873684843384, "flos": 23842638743040.0, "grad_norm": 1.7485081542206093, "language_loss": 0.79846257, "learning_rate": 2.729784190922272e-06, "loss": 0.82037127, "num_input_tokens_seen": 71674580, "step": 3324, "time_per_iteration": 2.7217459678649902 }, { "auxiliary_loss_clip": 0.01078258, "auxiliary_loss_mlp": 0.01005571, "balance_loss_clip": 1.03318071, "balance_loss_mlp": 1.00381887, "epoch": 0.39980761137497745, "flos": 66576877280640.0, "grad_norm": 0.9645486948265724, "language_loss": 0.57257462, "learning_rate": 2.729058874178378e-06, "loss": 0.59341294, "num_input_tokens_seen": 71745260, "step": 3325, "time_per_iteration": 3.2779781818389893 }, { "auxiliary_loss_clip": 0.01163628, "auxiliary_loss_mlp": 0.01035731, "balance_loss_clip": 1.05567336, "balance_loss_mlp": 1.0269928, "epoch": 0.39992785426561656, "flos": 28549162834560.0, "grad_norm": 2.4518865586606915, "language_loss": 0.69050789, "learning_rate": 2.7283334468368315e-06, "loss": 0.71250147, "num_input_tokens_seen": 71766540, "step": 3326, "time_per_iteration": 2.7191832065582275 }, { "auxiliary_loss_clip": 0.01073486, "auxiliary_loss_mlp": 0.01032495, "balance_loss_clip": 1.04118013, "balance_loss_mlp": 1.02421033, "epoch": 0.4000480971562556, "flos": 15049408671360.0, "grad_norm": 1.8512711969030247, "language_loss": 0.727314, "learning_rate": 2.72760790900768e-06, "loss": 0.74837387, "num_input_tokens_seen": 71783125, "step": 3327, "time_per_iteration": 2.9360671043395996 }, { "auxiliary_loss_clip": 0.01191873, "auxiliary_loss_mlp": 0.01027857, "balance_loss_clip": 1.05852103, "balance_loss_mlp": 1.01982832, "epoch": 0.4001683400468947, "flos": 23915609222400.0, "grad_norm": 1.885362416262673, "language_loss": 0.78495115, "learning_rate": 2.7268822608009875e-06, "loss": 0.80714846, "num_input_tokens_seen": 71802500, "step": 3328, "time_per_iteration": 2.859874963760376 }, { "auxiliary_loss_clip": 0.01147097, "auxiliary_loss_mlp": 0.01027001, "balance_loss_clip": 1.05288672, "balance_loss_mlp": 1.01845407, "epoch": 0.40028858293753383, "flos": 24352677912960.0, "grad_norm": 2.2183805277150155, "language_loss": 0.78589356, "learning_rate": 2.726156502326834e-06, "loss": 0.80763453, "num_input_tokens_seen": 71823800, "step": 3329, "time_per_iteration": 230.98104286193848 }, { "auxiliary_loss_clip": 0.01049771, "auxiliary_loss_mlp": 0.01002825, "balance_loss_clip": 1.0432235, "balance_loss_mlp": 1.00137067, "epoch": 0.4004088258281729, "flos": 66787025800320.0, "grad_norm": 0.6981086644741762, "language_loss": 0.60257864, "learning_rate": 2.725430633695316e-06, "loss": 0.62310457, "num_input_tokens_seen": 71886880, "step": 3330, "time_per_iteration": 3.4273669719696045 }, { "auxiliary_loss_clip": 0.01104442, "auxiliary_loss_mlp": 0.01002844, "balance_loss_clip": 1.0350244, "balance_loss_mlp": 1.00117481, "epoch": 0.400529068718812, "flos": 58598386473600.0, "grad_norm": 0.8827611478868914, "language_loss": 0.5792563, "learning_rate": 2.7247046550165485e-06, "loss": 0.60032916, "num_input_tokens_seen": 71939005, "step": 3331, "time_per_iteration": 3.3945581912994385 }, { "auxiliary_loss_clip": 0.01191126, "auxiliary_loss_mlp": 0.01035959, "balance_loss_clip": 1.0589745, "balance_loss_mlp": 1.02729201, "epoch": 0.4006493116094511, "flos": 25377460934400.0, "grad_norm": 1.4713034170757646, "language_loss": 0.758892, "learning_rate": 2.7239785664006606e-06, "loss": 0.78116286, "num_input_tokens_seen": 71962545, "step": 3332, "time_per_iteration": 2.780625104904175 }, { "auxiliary_loss_clip": 0.01093731, "auxiliary_loss_mlp": 0.01003509, "balance_loss_clip": 1.03473043, "balance_loss_mlp": 1.00188816, "epoch": 0.40076955450009016, "flos": 60280729822080.0, "grad_norm": 0.7666427883203237, "language_loss": 0.61786395, "learning_rate": 2.7232523679578002e-06, "loss": 0.63883638, "num_input_tokens_seen": 72025625, "step": 3333, "time_per_iteration": 3.4000766277313232 }, { "auxiliary_loss_clip": 0.0117416, "auxiliary_loss_mlp": 0.01024791, "balance_loss_clip": 1.05825377, "balance_loss_mlp": 1.01673293, "epoch": 0.4008897973907293, "flos": 16617268396800.0, "grad_norm": 2.479083230594827, "language_loss": 0.7960999, "learning_rate": 2.7225260597981295e-06, "loss": 0.81808949, "num_input_tokens_seen": 72043330, "step": 3334, "time_per_iteration": 2.7158925533294678 }, { "auxiliary_loss_clip": 0.01140937, "auxiliary_loss_mlp": 0.00713707, "balance_loss_clip": 1.05418348, "balance_loss_mlp": 1.00092685, "epoch": 0.4010100402813684, "flos": 15377344865280.0, "grad_norm": 2.5346346449733392, "language_loss": 0.78429008, "learning_rate": 2.721799642031831e-06, "loss": 0.8028366, "num_input_tokens_seen": 72059500, "step": 3335, "time_per_iteration": 2.771851062774658 }, { "auxiliary_loss_clip": 0.01160781, "auxiliary_loss_mlp": 0.01027275, "balance_loss_clip": 1.05135369, "balance_loss_mlp": 1.01972961, "epoch": 0.40113028317200744, "flos": 13298835438720.0, "grad_norm": 2.191997095365995, "language_loss": 0.77570421, "learning_rate": 2.721073114769101e-06, "loss": 0.79758477, "num_input_tokens_seen": 72077175, "step": 3336, "time_per_iteration": 2.7493486404418945 }, { "auxiliary_loss_clip": 0.01138869, "auxiliary_loss_mlp": 0.0102732, "balance_loss_clip": 1.05380225, "balance_loss_mlp": 1.019238, "epoch": 0.40125052606264655, "flos": 20668027841280.0, "grad_norm": 2.176620740295396, "language_loss": 0.75216794, "learning_rate": 2.7203464781201523e-06, "loss": 0.77382982, "num_input_tokens_seen": 72096490, "step": 3337, "time_per_iteration": 2.765479564666748 }, { "auxiliary_loss_clip": 0.01192813, "auxiliary_loss_mlp": 0.01032776, "balance_loss_clip": 1.05878615, "balance_loss_mlp": 1.02438414, "epoch": 0.40137076895328566, "flos": 24607679541120.0, "grad_norm": 2.262014663368254, "language_loss": 0.78082108, "learning_rate": 2.719619732195215e-06, "loss": 0.80307704, "num_input_tokens_seen": 72118130, "step": 3338, "time_per_iteration": 2.7673933506011963 }, { "auxiliary_loss_clip": 0.01139193, "auxiliary_loss_mlp": 0.01030276, "balance_loss_clip": 1.05089974, "balance_loss_mlp": 1.02168679, "epoch": 0.4014910118439247, "flos": 24206593299840.0, "grad_norm": 3.7286912349233776, "language_loss": 0.72394657, "learning_rate": 2.7188928771045377e-06, "loss": 0.74564129, "num_input_tokens_seen": 72139450, "step": 3339, "time_per_iteration": 3.918506145477295 }, { "auxiliary_loss_clip": 0.01131785, "auxiliary_loss_mlp": 0.01030657, "balance_loss_clip": 1.0478301, "balance_loss_mlp": 1.02233613, "epoch": 0.4016112547345638, "flos": 26725080418560.0, "grad_norm": 2.12592424912745, "language_loss": 0.80030918, "learning_rate": 2.7181659129583815e-06, "loss": 0.82193363, "num_input_tokens_seen": 72159040, "step": 3340, "time_per_iteration": 3.7611794471740723 }, { "auxiliary_loss_clip": 0.01147387, "auxiliary_loss_mlp": 0.01029319, "balance_loss_clip": 1.0482899, "balance_loss_mlp": 1.02134359, "epoch": 0.4017314976252029, "flos": 21288025520640.0, "grad_norm": 3.0787288178354704, "language_loss": 0.76186585, "learning_rate": 2.7174388398670276e-06, "loss": 0.78363293, "num_input_tokens_seen": 72178220, "step": 3341, "time_per_iteration": 3.731858015060425 }, { "auxiliary_loss_clip": 0.01188939, "auxiliary_loss_mlp": 0.01024165, "balance_loss_clip": 1.05377722, "balance_loss_mlp": 1.01592779, "epoch": 0.401851740515842, "flos": 25484690010240.0, "grad_norm": 2.4376305695515117, "language_loss": 0.92216861, "learning_rate": 2.716711657940773e-06, "loss": 0.94429964, "num_input_tokens_seen": 72199230, "step": 3342, "time_per_iteration": 3.6905617713928223 }, { "auxiliary_loss_clip": 0.01058948, "auxiliary_loss_mlp": 0.01003236, "balance_loss_clip": 1.02788162, "balance_loss_mlp": 1.00160313, "epoch": 0.4019719834064811, "flos": 55395334978560.0, "grad_norm": 0.8163176518558984, "language_loss": 0.56519461, "learning_rate": 2.7159843672899284e-06, "loss": 0.58581644, "num_input_tokens_seen": 72263430, "step": 3343, "time_per_iteration": 3.4504616260528564 }, { "auxiliary_loss_clip": 0.01175428, "auxiliary_loss_mlp": 0.01033769, "balance_loss_clip": 1.05719852, "balance_loss_mlp": 1.02518022, "epoch": 0.40209222629712016, "flos": 18180100218240.0, "grad_norm": 2.1879800090685944, "language_loss": 0.8107394, "learning_rate": 2.715256968024825e-06, "loss": 0.83283132, "num_input_tokens_seen": 72280505, "step": 3344, "time_per_iteration": 2.6781177520751953 }, { "auxiliary_loss_clip": 0.01165671, "auxiliary_loss_mlp": 0.01029005, "balance_loss_clip": 1.05559742, "balance_loss_mlp": 1.020172, "epoch": 0.40221246918775927, "flos": 25961009287680.0, "grad_norm": 1.615921824731357, "language_loss": 0.82335615, "learning_rate": 2.7145294602558083e-06, "loss": 0.84530294, "num_input_tokens_seen": 72301215, "step": 3345, "time_per_iteration": 2.7370166778564453 }, { "auxiliary_loss_clip": 0.01174361, "auxiliary_loss_mlp": 0.01027729, "balance_loss_clip": 1.05622411, "balance_loss_mlp": 1.01905632, "epoch": 0.4023327120783984, "flos": 33838912056960.0, "grad_norm": 3.1050960955995093, "language_loss": 0.7047646, "learning_rate": 2.713801844093241e-06, "loss": 0.72678554, "num_input_tokens_seen": 72322365, "step": 3346, "time_per_iteration": 2.7519707679748535 }, { "auxiliary_loss_clip": 0.01174576, "auxiliary_loss_mlp": 0.01028896, "balance_loss_clip": 1.05551314, "balance_loss_mlp": 1.0213443, "epoch": 0.40245295496903744, "flos": 26900252069760.0, "grad_norm": 2.260428234224503, "language_loss": 0.88131154, "learning_rate": 2.7130741196475014e-06, "loss": 0.90334624, "num_input_tokens_seen": 72340495, "step": 3347, "time_per_iteration": 2.707531690597534 }, { "auxiliary_loss_clip": 0.01160069, "auxiliary_loss_mlp": 0.01040023, "balance_loss_clip": 1.05584753, "balance_loss_mlp": 1.03093958, "epoch": 0.40257319785967655, "flos": 36902738436480.0, "grad_norm": 2.1355792258828794, "language_loss": 0.79301769, "learning_rate": 2.7123462870289848e-06, "loss": 0.81501865, "num_input_tokens_seen": 72360545, "step": 3348, "time_per_iteration": 2.7594752311706543 }, { "auxiliary_loss_clip": 0.01158541, "auxiliary_loss_mlp": 0.01024359, "balance_loss_clip": 1.05216551, "balance_loss_mlp": 1.01600838, "epoch": 0.40269344075031566, "flos": 24353180703360.0, "grad_norm": 1.9227761760163664, "language_loss": 0.80857074, "learning_rate": 2.711618346348102e-06, "loss": 0.83039969, "num_input_tokens_seen": 72381070, "step": 3349, "time_per_iteration": 2.7271366119384766 }, { "auxiliary_loss_clip": 0.01149354, "auxiliary_loss_mlp": 0.01033754, "balance_loss_clip": 1.05269289, "balance_loss_mlp": 1.02590442, "epoch": 0.4028136836409547, "flos": 14389657614720.0, "grad_norm": 4.127618370145456, "language_loss": 0.63474298, "learning_rate": 2.7108902977152825e-06, "loss": 0.65657401, "num_input_tokens_seen": 72398970, "step": 3350, "time_per_iteration": 2.6565279960632324 }, { "auxiliary_loss_clip": 0.01170536, "auxiliary_loss_mlp": 0.01027395, "balance_loss_clip": 1.05410409, "balance_loss_mlp": 1.01909792, "epoch": 0.4029339265315938, "flos": 26136037284480.0, "grad_norm": 2.3601257022211413, "language_loss": 0.7529189, "learning_rate": 2.7101621412409704e-06, "loss": 0.77489823, "num_input_tokens_seen": 72418455, "step": 3351, "time_per_iteration": 2.687051296234131 }, { "auxiliary_loss_clip": 0.01189715, "auxiliary_loss_mlp": 0.01027691, "balance_loss_clip": 1.05619824, "balance_loss_mlp": 1.0191859, "epoch": 0.40305416942223293, "flos": 23256325042560.0, "grad_norm": 7.272156405192773, "language_loss": 0.86014402, "learning_rate": 2.7094338770356256e-06, "loss": 0.88231814, "num_input_tokens_seen": 72437540, "step": 3352, "time_per_iteration": 2.6006832122802734 }, { "auxiliary_loss_clip": 0.01152896, "auxiliary_loss_mlp": 0.01030363, "balance_loss_clip": 1.05272722, "balance_loss_mlp": 1.0227989, "epoch": 0.403174412312872, "flos": 27089645506560.0, "grad_norm": 2.005414504943715, "language_loss": 0.64298195, "learning_rate": 2.708705505209726e-06, "loss": 0.66481453, "num_input_tokens_seen": 72458315, "step": 3353, "time_per_iteration": 2.7129533290863037 }, { "auxiliary_loss_clip": 0.01116922, "auxiliary_loss_mlp": 0.01031411, "balance_loss_clip": 1.04701877, "balance_loss_mlp": 1.0233767, "epoch": 0.4032946552035111, "flos": 21756336065280.0, "grad_norm": 2.6644052619806216, "language_loss": 0.91535926, "learning_rate": 2.7079770258737646e-06, "loss": 0.93684256, "num_input_tokens_seen": 72476225, "step": 3354, "time_per_iteration": 2.6893656253814697 }, { "auxiliary_loss_clip": 0.01134615, "auxiliary_loss_mlp": 0.01031998, "balance_loss_clip": 1.04846609, "balance_loss_mlp": 1.02340913, "epoch": 0.4034148980941502, "flos": 17343956448000.0, "grad_norm": 2.1511882779636786, "language_loss": 0.75606728, "learning_rate": 2.707248439138251e-06, "loss": 0.77773339, "num_input_tokens_seen": 72492460, "step": 3355, "time_per_iteration": 2.7033681869506836 }, { "auxiliary_loss_clip": 0.01150195, "auxiliary_loss_mlp": 0.01029106, "balance_loss_clip": 1.05376494, "balance_loss_mlp": 1.02146482, "epoch": 0.40353514098478926, "flos": 22017838055040.0, "grad_norm": 1.778868579913651, "language_loss": 0.65207684, "learning_rate": 2.7065197451137114e-06, "loss": 0.67386985, "num_input_tokens_seen": 72513840, "step": 3356, "time_per_iteration": 2.6773266792297363 }, { "auxiliary_loss_clip": 0.01157332, "auxiliary_loss_mlp": 0.01023648, "balance_loss_clip": 1.05664635, "balance_loss_mlp": 1.01572704, "epoch": 0.4036553838754284, "flos": 14246446089600.0, "grad_norm": 2.279953459973089, "language_loss": 0.67457545, "learning_rate": 2.7057909439106894e-06, "loss": 0.69638526, "num_input_tokens_seen": 72531695, "step": 3357, "time_per_iteration": 2.6342456340789795 }, { "auxiliary_loss_clip": 0.01166831, "auxiliary_loss_mlp": 0.00712853, "balance_loss_clip": 1.05452192, "balance_loss_mlp": 1.00087714, "epoch": 0.40377562676606743, "flos": 24790644443520.0, "grad_norm": 2.050879272978249, "language_loss": 0.78788519, "learning_rate": 2.7050620356397417e-06, "loss": 0.80668205, "num_input_tokens_seen": 72550645, "step": 3358, "time_per_iteration": 2.632236957550049 }, { "auxiliary_loss_clip": 0.01184686, "auxiliary_loss_mlp": 0.01031994, "balance_loss_clip": 1.05649805, "balance_loss_mlp": 1.02443039, "epoch": 0.40389586965670654, "flos": 24061226958720.0, "grad_norm": 1.8053719067993388, "language_loss": 0.72477627, "learning_rate": 2.7043330204114437e-06, "loss": 0.74694312, "num_input_tokens_seen": 72569355, "step": 3359, "time_per_iteration": 2.6106929779052734 }, { "auxiliary_loss_clip": 0.01181369, "auxiliary_loss_mlp": 0.01028476, "balance_loss_clip": 1.0526315, "balance_loss_mlp": 1.02042985, "epoch": 0.40401611254734565, "flos": 16399613934720.0, "grad_norm": 2.102416586423846, "language_loss": 0.86222792, "learning_rate": 2.7036038983363862e-06, "loss": 0.88432646, "num_input_tokens_seen": 72585960, "step": 3360, "time_per_iteration": 2.5358550548553467 }, { "auxiliary_loss_clip": 0.01166353, "auxiliary_loss_mlp": 0.01028792, "balance_loss_clip": 1.0538348, "balance_loss_mlp": 1.02112079, "epoch": 0.4041363554379847, "flos": 23988220565760.0, "grad_norm": 1.7812556670025759, "language_loss": 0.8434124, "learning_rate": 2.702874669525177e-06, "loss": 0.86536384, "num_input_tokens_seen": 72604440, "step": 3361, "time_per_iteration": 2.6575889587402344 }, { "auxiliary_loss_clip": 0.011439, "auxiliary_loss_mlp": 0.01030201, "balance_loss_clip": 1.05478466, "balance_loss_mlp": 1.02292585, "epoch": 0.4042565983286238, "flos": 28401964899840.0, "grad_norm": 2.034355155217013, "language_loss": 0.69473666, "learning_rate": 2.7021453340884394e-06, "loss": 0.71647763, "num_input_tokens_seen": 72622165, "step": 3362, "time_per_iteration": 2.7791430950164795 }, { "auxiliary_loss_clip": 0.01145895, "auxiliary_loss_mlp": 0.00711483, "balance_loss_clip": 1.05244899, "balance_loss_mlp": 1.00086069, "epoch": 0.40437684121926293, "flos": 17710963660800.0, "grad_norm": 2.7210896037428838, "language_loss": 0.72963488, "learning_rate": 2.7014158921368125e-06, "loss": 0.74820876, "num_input_tokens_seen": 72640490, "step": 3363, "time_per_iteration": 2.6457440853118896 }, { "auxiliary_loss_clip": 0.01185938, "auxiliary_loss_mlp": 0.01027361, "balance_loss_clip": 1.05492878, "balance_loss_mlp": 1.01948762, "epoch": 0.404497084109902, "flos": 24018959629440.0, "grad_norm": 1.9883923346667651, "language_loss": 0.85558474, "learning_rate": 2.700686343780953e-06, "loss": 0.87771773, "num_input_tokens_seen": 72660360, "step": 3364, "time_per_iteration": 2.654855489730835 }, { "auxiliary_loss_clip": 0.01153738, "auxiliary_loss_mlp": 0.01024944, "balance_loss_clip": 1.05137467, "balance_loss_mlp": 1.01702213, "epoch": 0.4046173270005411, "flos": 22929861306240.0, "grad_norm": 1.6901580805148977, "language_loss": 0.88038057, "learning_rate": 2.699956689131532e-06, "loss": 0.90216732, "num_input_tokens_seen": 72680345, "step": 3365, "time_per_iteration": 3.6439266204833984 }, { "auxiliary_loss_clip": 0.01156122, "auxiliary_loss_mlp": 0.01028142, "balance_loss_clip": 1.05216146, "balance_loss_mlp": 1.02036393, "epoch": 0.4047375698911802, "flos": 20668135582080.0, "grad_norm": 2.981803176432371, "language_loss": 0.85258079, "learning_rate": 2.699226928299238e-06, "loss": 0.87442338, "num_input_tokens_seen": 72698365, "step": 3366, "time_per_iteration": 3.5931272506713867 }, { "auxiliary_loss_clip": 0.01172258, "auxiliary_loss_mlp": 0.01024827, "balance_loss_clip": 1.05445218, "balance_loss_mlp": 1.01707852, "epoch": 0.40485781278181926, "flos": 28912865996160.0, "grad_norm": 2.5450142634576007, "language_loss": 0.78871298, "learning_rate": 2.698497061394774e-06, "loss": 0.81068385, "num_input_tokens_seen": 72716850, "step": 3367, "time_per_iteration": 3.658963918685913 }, { "auxiliary_loss_clip": 0.01146269, "auxiliary_loss_mlp": 0.00711912, "balance_loss_clip": 1.05284357, "balance_loss_mlp": 1.00091529, "epoch": 0.40497805567245837, "flos": 23148377694720.0, "grad_norm": 1.934692163760627, "language_loss": 0.80838418, "learning_rate": 2.6977670885288627e-06, "loss": 0.82696593, "num_input_tokens_seen": 72738250, "step": 3368, "time_per_iteration": 3.6898748874664307 }, { "auxiliary_loss_clip": 0.0114326, "auxiliary_loss_mlp": 0.01027392, "balance_loss_clip": 1.04889226, "balance_loss_mlp": 1.0193994, "epoch": 0.4050982985630975, "flos": 16289404030080.0, "grad_norm": 2.265763716050859, "language_loss": 0.75251472, "learning_rate": 2.6970370098122378e-06, "loss": 0.7742213, "num_input_tokens_seen": 72755235, "step": 3369, "time_per_iteration": 2.6118123531341553 }, { "auxiliary_loss_clip": 0.01188105, "auxiliary_loss_mlp": 0.01031683, "balance_loss_clip": 1.05627573, "balance_loss_mlp": 1.02344012, "epoch": 0.40521854145373654, "flos": 34459484353920.0, "grad_norm": 1.7642505126441714, "language_loss": 0.86832976, "learning_rate": 2.6963068253556535e-06, "loss": 0.89052773, "num_input_tokens_seen": 72776620, "step": 3370, "time_per_iteration": 2.7411482334136963 }, { "auxiliary_loss_clip": 0.01177849, "auxiliary_loss_mlp": 0.01027673, "balance_loss_clip": 1.0543189, "balance_loss_mlp": 1.0190928, "epoch": 0.40533878434437565, "flos": 25331099454720.0, "grad_norm": 2.1394071804353416, "language_loss": 0.85804331, "learning_rate": 2.6955765352698763e-06, "loss": 0.88009858, "num_input_tokens_seen": 72796765, "step": 3371, "time_per_iteration": 2.658809185028076 }, { "auxiliary_loss_clip": 0.01188707, "auxiliary_loss_mlp": 0.01026742, "balance_loss_clip": 1.05519366, "balance_loss_mlp": 1.01820064, "epoch": 0.40545902723501476, "flos": 15012061505280.0, "grad_norm": 2.5555057009778195, "language_loss": 0.73684019, "learning_rate": 2.6948461396656923e-06, "loss": 0.7589947, "num_input_tokens_seen": 72814175, "step": 3372, "time_per_iteration": 2.6519031524658203 }, { "auxiliary_loss_clip": 0.01179198, "auxiliary_loss_mlp": 0.01032552, "balance_loss_clip": 1.05791652, "balance_loss_mlp": 1.02451694, "epoch": 0.4055792701256538, "flos": 25521103422720.0, "grad_norm": 2.6719229601272674, "language_loss": 0.74395692, "learning_rate": 2.6941156386539013e-06, "loss": 0.76607442, "num_input_tokens_seen": 72834125, "step": 3373, "time_per_iteration": 2.6405766010284424 }, { "auxiliary_loss_clip": 0.01153384, "auxiliary_loss_mlp": 0.01028554, "balance_loss_clip": 1.05728078, "balance_loss_mlp": 1.02052546, "epoch": 0.4056995130162929, "flos": 19574583972480.0, "grad_norm": 1.9352219142408986, "language_loss": 0.81318724, "learning_rate": 2.6933850323453203e-06, "loss": 0.83500659, "num_input_tokens_seen": 72852570, "step": 3374, "time_per_iteration": 2.6565561294555664 }, { "auxiliary_loss_clip": 0.01189105, "auxiliary_loss_mlp": 0.01028049, "balance_loss_clip": 1.05848575, "balance_loss_mlp": 1.02006817, "epoch": 0.405819755906932, "flos": 15413794191360.0, "grad_norm": 1.9718385255394386, "language_loss": 0.74403179, "learning_rate": 2.6926543208507806e-06, "loss": 0.76620334, "num_input_tokens_seen": 72871250, "step": 3375, "time_per_iteration": 2.5766749382019043 }, { "auxiliary_loss_clip": 0.0117112, "auxiliary_loss_mlp": 0.01034946, "balance_loss_clip": 1.05512071, "balance_loss_mlp": 1.02629113, "epoch": 0.4059399987975711, "flos": 21433930565760.0, "grad_norm": 2.4089279399080725, "language_loss": 0.8036449, "learning_rate": 2.6919235042811316e-06, "loss": 0.82570553, "num_input_tokens_seen": 72890035, "step": 3376, "time_per_iteration": 2.659364700317383 }, { "auxiliary_loss_clip": 0.01139761, "auxiliary_loss_mlp": 0.01031117, "balance_loss_clip": 1.05287528, "balance_loss_mlp": 1.02277839, "epoch": 0.4060602416882102, "flos": 25556942217600.0, "grad_norm": 2.358395657060911, "language_loss": 0.76316905, "learning_rate": 2.691192582747237e-06, "loss": 0.78487778, "num_input_tokens_seen": 72909665, "step": 3377, "time_per_iteration": 2.8297462463378906 }, { "auxiliary_loss_clip": 0.01187888, "auxiliary_loss_mlp": 0.01028559, "balance_loss_clip": 1.05692601, "balance_loss_mlp": 1.02003002, "epoch": 0.40618048457884925, "flos": 23766759262080.0, "grad_norm": 1.7540004158712947, "language_loss": 0.74201977, "learning_rate": 2.6904615563599765e-06, "loss": 0.76418424, "num_input_tokens_seen": 72929465, "step": 3378, "time_per_iteration": 2.717782735824585 }, { "auxiliary_loss_clip": 0.01136278, "auxiliary_loss_mlp": 0.01025476, "balance_loss_clip": 1.05100131, "balance_loss_mlp": 1.01723266, "epoch": 0.40630072746948837, "flos": 17639681120640.0, "grad_norm": 1.9827664852468845, "language_loss": 0.8344363, "learning_rate": 2.6897304252302477e-06, "loss": 0.85605389, "num_input_tokens_seen": 72946785, "step": 3379, "time_per_iteration": 2.727135419845581 }, { "auxiliary_loss_clip": 0.01063182, "auxiliary_loss_mlp": 0.01002672, "balance_loss_clip": 1.03399622, "balance_loss_mlp": 1.00076473, "epoch": 0.4064209703601275, "flos": 60836053063680.0, "grad_norm": 0.7834116440126675, "language_loss": 0.54821283, "learning_rate": 2.688999189468962e-06, "loss": 0.56887138, "num_input_tokens_seen": 73003215, "step": 3380, "time_per_iteration": 3.150918483734131 }, { "auxiliary_loss_clip": 0.01173962, "auxiliary_loss_mlp": 0.01028829, "balance_loss_clip": 1.05887377, "balance_loss_mlp": 1.02088928, "epoch": 0.40654121325076653, "flos": 24024346669440.0, "grad_norm": 2.651029848771, "language_loss": 0.75721765, "learning_rate": 2.6882678491870464e-06, "loss": 0.77924556, "num_input_tokens_seen": 73023650, "step": 3381, "time_per_iteration": 2.713738441467285 }, { "auxiliary_loss_clip": 0.01176633, "auxiliary_loss_mlp": 0.01031777, "balance_loss_clip": 1.05494809, "balance_loss_mlp": 1.02343261, "epoch": 0.40666145614140564, "flos": 27344252085120.0, "grad_norm": 1.758024748119846, "language_loss": 0.71441996, "learning_rate": 2.6875364044954453e-06, "loss": 0.73650414, "num_input_tokens_seen": 73043880, "step": 3382, "time_per_iteration": 2.6524658203125 }, { "auxiliary_loss_clip": 0.01152093, "auxiliary_loss_mlp": 0.01025954, "balance_loss_clip": 1.04917228, "balance_loss_mlp": 1.01782942, "epoch": 0.40678169903204475, "flos": 26176724415360.0, "grad_norm": 1.8613692992724165, "language_loss": 0.82276207, "learning_rate": 2.6868048555051185e-06, "loss": 0.8445425, "num_input_tokens_seen": 73065410, "step": 3383, "time_per_iteration": 2.7673768997192383 }, { "auxiliary_loss_clip": 0.01162281, "auxiliary_loss_mlp": 0.01022892, "balance_loss_clip": 1.05202198, "balance_loss_mlp": 1.01478541, "epoch": 0.4069019419226838, "flos": 28622420622720.0, "grad_norm": 3.624794270933033, "language_loss": 0.85308701, "learning_rate": 2.686073202327041e-06, "loss": 0.87493873, "num_input_tokens_seen": 73084410, "step": 3384, "time_per_iteration": 2.6756136417388916 }, { "auxiliary_loss_clip": 0.01144972, "auxiliary_loss_mlp": 0.01029351, "balance_loss_clip": 1.04849076, "balance_loss_mlp": 1.02163768, "epoch": 0.4070221848133229, "flos": 25229006023680.0, "grad_norm": 2.168253120392537, "language_loss": 0.73079765, "learning_rate": 2.6853414450722043e-06, "loss": 0.75254083, "num_input_tokens_seen": 73104075, "step": 3385, "time_per_iteration": 2.740118980407715 }, { "auxiliary_loss_clip": 0.0117138, "auxiliary_loss_mlp": 0.01027252, "balance_loss_clip": 1.05597293, "balance_loss_mlp": 1.01952732, "epoch": 0.40714242770396203, "flos": 18405224709120.0, "grad_norm": 2.2056968381998767, "language_loss": 0.85366499, "learning_rate": 2.684609583851616e-06, "loss": 0.87565136, "num_input_tokens_seen": 73122250, "step": 3386, "time_per_iteration": 2.6121559143066406 }, { "auxiliary_loss_clip": 0.01125037, "auxiliary_loss_mlp": 0.01026907, "balance_loss_clip": 1.05196655, "balance_loss_mlp": 1.01918221, "epoch": 0.4072626705946011, "flos": 30228920403840.0, "grad_norm": 1.6575333302994872, "language_loss": 0.80766559, "learning_rate": 2.683877618776297e-06, "loss": 0.82918501, "num_input_tokens_seen": 73144505, "step": 3387, "time_per_iteration": 2.817262649536133 }, { "auxiliary_loss_clip": 0.011494, "auxiliary_loss_mlp": 0.01032218, "balance_loss_clip": 1.04886723, "balance_loss_mlp": 1.02304506, "epoch": 0.4073829134852402, "flos": 21834549930240.0, "grad_norm": 5.715637699720826, "language_loss": 0.7432003, "learning_rate": 2.6831455499572876e-06, "loss": 0.76501656, "num_input_tokens_seen": 73162440, "step": 3388, "time_per_iteration": 2.671430826187134 }, { "auxiliary_loss_clip": 0.01189421, "auxiliary_loss_mlp": 0.01031584, "balance_loss_clip": 1.05750549, "balance_loss_mlp": 1.02389455, "epoch": 0.40750315637587925, "flos": 25260211964160.0, "grad_norm": 2.0457957704887764, "language_loss": 0.78025132, "learning_rate": 2.682413377505641e-06, "loss": 0.80246139, "num_input_tokens_seen": 73181245, "step": 3389, "time_per_iteration": 2.6415345668792725 }, { "auxiliary_loss_clip": 0.01172184, "auxiliary_loss_mlp": 0.01028113, "balance_loss_clip": 1.05309248, "balance_loss_mlp": 1.01995969, "epoch": 0.40762339926651836, "flos": 19712767593600.0, "grad_norm": 2.0693359941459586, "language_loss": 0.76592314, "learning_rate": 2.6816811015324284e-06, "loss": 0.7879262, "num_input_tokens_seen": 73199295, "step": 3390, "time_per_iteration": 2.6346983909606934 }, { "auxiliary_loss_clip": 0.01107746, "auxiliary_loss_mlp": 0.01004136, "balance_loss_clip": 1.03796148, "balance_loss_mlp": 1.00214541, "epoch": 0.40774364215715747, "flos": 71449307314560.0, "grad_norm": 0.7953633713544596, "language_loss": 0.56710535, "learning_rate": 2.6809487221487343e-06, "loss": 0.58822417, "num_input_tokens_seen": 73258780, "step": 3391, "time_per_iteration": 3.156928777694702 }, { "auxiliary_loss_clip": 0.01164472, "auxiliary_loss_mlp": 0.01025574, "balance_loss_clip": 1.0523138, "balance_loss_mlp": 1.01764655, "epoch": 0.4078638850477965, "flos": 15084134144640.0, "grad_norm": 2.930151352033457, "language_loss": 0.81515247, "learning_rate": 2.6802162394656605e-06, "loss": 0.83705288, "num_input_tokens_seen": 73275490, "step": 3392, "time_per_iteration": 4.479551553726196 }, { "auxiliary_loss_clip": 0.01149855, "auxiliary_loss_mlp": 0.01026511, "balance_loss_clip": 1.04987168, "balance_loss_mlp": 1.01891756, "epoch": 0.40798412793843564, "flos": 23842890138240.0, "grad_norm": 1.9771245585450803, "language_loss": 0.71934354, "learning_rate": 2.679483653594324e-06, "loss": 0.74110723, "num_input_tokens_seen": 73297260, "step": 3393, "time_per_iteration": 3.637281894683838 }, { "auxiliary_loss_clip": 0.0117576, "auxiliary_loss_mlp": 0.01023332, "balance_loss_clip": 1.05528426, "balance_loss_mlp": 1.01560712, "epoch": 0.40810437082907475, "flos": 21065774117760.0, "grad_norm": 2.3515008745916606, "language_loss": 0.76816189, "learning_rate": 2.678750964645857e-06, "loss": 0.79015279, "num_input_tokens_seen": 73316340, "step": 3394, "time_per_iteration": 3.540358543395996 }, { "auxiliary_loss_clip": 0.01174139, "auxiliary_loss_mlp": 0.01031262, "balance_loss_clip": 1.05868316, "balance_loss_mlp": 1.02261949, "epoch": 0.4082246137197138, "flos": 11321377948800.0, "grad_norm": 2.454379148385163, "language_loss": 0.83602607, "learning_rate": 2.6780181727314094e-06, "loss": 0.85808003, "num_input_tokens_seen": 73331245, "step": 3395, "time_per_iteration": 2.6489758491516113 }, { "auxiliary_loss_clip": 0.01143815, "auxiliary_loss_mlp": 0.0071208, "balance_loss_clip": 1.05290091, "balance_loss_mlp": 1.00102901, "epoch": 0.4083448566103529, "flos": 19062569554560.0, "grad_norm": 1.888403136001796, "language_loss": 0.77766418, "learning_rate": 2.6772852779621435e-06, "loss": 0.79622322, "num_input_tokens_seen": 73349105, "step": 3396, "time_per_iteration": 2.6745033264160156 }, { "auxiliary_loss_clip": 0.01170404, "auxiliary_loss_mlp": 0.00711404, "balance_loss_clip": 1.05892837, "balance_loss_mlp": 1.00096238, "epoch": 0.408465099500992, "flos": 23550254035200.0, "grad_norm": 8.039054457155132, "language_loss": 0.86763072, "learning_rate": 2.676552280449239e-06, "loss": 0.88644886, "num_input_tokens_seen": 73368990, "step": 3397, "time_per_iteration": 2.6886773109436035 }, { "auxiliary_loss_clip": 0.01162071, "auxiliary_loss_mlp": 0.01030278, "balance_loss_clip": 1.05244803, "balance_loss_mlp": 1.0222317, "epoch": 0.4085853423916311, "flos": 12750012558720.0, "grad_norm": 3.2064718918471304, "language_loss": 0.75903875, "learning_rate": 2.6758191803038917e-06, "loss": 0.78096223, "num_input_tokens_seen": 73387485, "step": 3398, "time_per_iteration": 2.6073496341705322 }, { "auxiliary_loss_clip": 0.01101272, "auxiliary_loss_mlp": 0.01036261, "balance_loss_clip": 1.04986024, "balance_loss_mlp": 1.02817833, "epoch": 0.4087055852822702, "flos": 24353072962560.0, "grad_norm": 1.9876342467112487, "language_loss": 0.83054316, "learning_rate": 2.6750859776373125e-06, "loss": 0.85191846, "num_input_tokens_seen": 73406940, "step": 3399, "time_per_iteration": 2.801513195037842 }, { "auxiliary_loss_clip": 0.0105565, "auxiliary_loss_mlp": 0.01000819, "balance_loss_clip": 1.0460161, "balance_loss_mlp": 0.99878067, "epoch": 0.4088258281729093, "flos": 66387950720640.0, "grad_norm": 0.764759108551324, "language_loss": 0.60419029, "learning_rate": 2.674352672560727e-06, "loss": 0.62475502, "num_input_tokens_seen": 73468385, "step": 3400, "time_per_iteration": 3.2837212085723877 }, { "auxiliary_loss_clip": 0.01137597, "auxiliary_loss_mlp": 0.01027851, "balance_loss_clip": 1.05098414, "balance_loss_mlp": 1.01988149, "epoch": 0.40894607106354836, "flos": 20449260057600.0, "grad_norm": 2.104697615035381, "language_loss": 0.76928079, "learning_rate": 2.673619265185377e-06, "loss": 0.79093528, "num_input_tokens_seen": 73488225, "step": 3401, "time_per_iteration": 2.7128233909606934 }, { "auxiliary_loss_clip": 0.01172906, "auxiliary_loss_mlp": 0.01029175, "balance_loss_clip": 1.05365539, "balance_loss_mlp": 1.02082491, "epoch": 0.40906631395418747, "flos": 27053627143680.0, "grad_norm": 2.4885287181557794, "language_loss": 0.78250813, "learning_rate": 2.672885755622521e-06, "loss": 0.80452889, "num_input_tokens_seen": 73510640, "step": 3402, "time_per_iteration": 2.722822904586792 }, { "auxiliary_loss_clip": 0.01119071, "auxiliary_loss_mlp": 0.01026681, "balance_loss_clip": 1.04805768, "balance_loss_mlp": 1.01868808, "epoch": 0.4091865568448266, "flos": 25484151306240.0, "grad_norm": 2.4956924997288708, "language_loss": 0.70627064, "learning_rate": 2.67215214398343e-06, "loss": 0.72772813, "num_input_tokens_seen": 73530655, "step": 3403, "time_per_iteration": 2.7491908073425293 }, { "auxiliary_loss_clip": 0.0112549, "auxiliary_loss_mlp": 0.01027442, "balance_loss_clip": 1.04855251, "balance_loss_mlp": 1.0192647, "epoch": 0.40930679973546563, "flos": 28657864368000.0, "grad_norm": 2.159810458836975, "language_loss": 0.78225428, "learning_rate": 2.671418430379393e-06, "loss": 0.80378366, "num_input_tokens_seen": 73549340, "step": 3404, "time_per_iteration": 2.7775189876556396 }, { "auxiliary_loss_clip": 0.0118688, "auxiliary_loss_mlp": 0.01029906, "balance_loss_clip": 1.05572546, "balance_loss_mlp": 1.02194262, "epoch": 0.40942704262610474, "flos": 20886292834560.0, "grad_norm": 1.9135771721820263, "language_loss": 0.8323099, "learning_rate": 2.670684614921715e-06, "loss": 0.85447776, "num_input_tokens_seen": 73568315, "step": 3405, "time_per_iteration": 2.5765466690063477 }, { "auxiliary_loss_clip": 0.01159438, "auxiliary_loss_mlp": 0.01029958, "balance_loss_clip": 1.05464196, "balance_loss_mlp": 1.02192354, "epoch": 0.4095472855167438, "flos": 21618080616960.0, "grad_norm": 2.118781465146001, "language_loss": 0.68870896, "learning_rate": 2.6699506977217128e-06, "loss": 0.71060288, "num_input_tokens_seen": 73588490, "step": 3406, "time_per_iteration": 2.7277188301086426 }, { "auxiliary_loss_clip": 0.01167689, "auxiliary_loss_mlp": 0.0103148, "balance_loss_clip": 1.05625582, "balance_loss_mlp": 1.02383876, "epoch": 0.4096675284073829, "flos": 27926112499200.0, "grad_norm": 2.370679728787362, "language_loss": 0.70330471, "learning_rate": 2.6692166788907233e-06, "loss": 0.72529644, "num_input_tokens_seen": 73608685, "step": 3407, "time_per_iteration": 2.6864736080169678 }, { "auxiliary_loss_clip": 0.01161018, "auxiliary_loss_mlp": 0.01028414, "balance_loss_clip": 1.05692816, "balance_loss_mlp": 1.0200876, "epoch": 0.409787771298022, "flos": 19206607092480.0, "grad_norm": 3.0205312883967252, "language_loss": 0.77045691, "learning_rate": 2.6684825585400957e-06, "loss": 0.79235119, "num_input_tokens_seen": 73627630, "step": 3408, "time_per_iteration": 2.703380823135376 }, { "auxiliary_loss_clip": 0.01073342, "auxiliary_loss_mlp": 0.01010277, "balance_loss_clip": 1.03386927, "balance_loss_mlp": 1.00829816, "epoch": 0.4099080141886611, "flos": 59269234832640.0, "grad_norm": 0.8182688372853815, "language_loss": 0.65136766, "learning_rate": 2.6677483367811947e-06, "loss": 0.67220378, "num_input_tokens_seen": 73687670, "step": 3409, "time_per_iteration": 3.323773145675659 }, { "auxiliary_loss_clip": 0.01172658, "auxiliary_loss_mlp": 0.01035409, "balance_loss_clip": 1.05373931, "balance_loss_mlp": 1.02765453, "epoch": 0.4100282570793002, "flos": 21906443001600.0, "grad_norm": 1.7605007247003694, "language_loss": 0.75162154, "learning_rate": 2.6670140137254028e-06, "loss": 0.77370214, "num_input_tokens_seen": 73707145, "step": 3410, "time_per_iteration": 2.6204593181610107 }, { "auxiliary_loss_clip": 0.01117935, "auxiliary_loss_mlp": 0.01024121, "balance_loss_clip": 1.04890454, "balance_loss_mlp": 1.01639652, "epoch": 0.4101484999699393, "flos": 18551596631040.0, "grad_norm": 2.8307432225163747, "language_loss": 0.89211965, "learning_rate": 2.666279589484115e-06, "loss": 0.91354012, "num_input_tokens_seen": 73725045, "step": 3411, "time_per_iteration": 2.7084827423095703 }, { "auxiliary_loss_clip": 0.01121743, "auxiliary_loss_mlp": 0.01024939, "balance_loss_clip": 1.04850852, "balance_loss_mlp": 1.01749468, "epoch": 0.41026874286057835, "flos": 19094529680640.0, "grad_norm": 2.643686951861745, "language_loss": 0.81433809, "learning_rate": 2.6655450641687435e-06, "loss": 0.83580494, "num_input_tokens_seen": 73742610, "step": 3412, "time_per_iteration": 2.7025375366210938 }, { "auxiliary_loss_clip": 0.01186546, "auxiliary_loss_mlp": 0.01023704, "balance_loss_clip": 1.05879021, "balance_loss_mlp": 1.01594627, "epoch": 0.41038898575121746, "flos": 31209568588800.0, "grad_norm": 1.8658475433480675, "language_loss": 0.69367039, "learning_rate": 2.664810437890715e-06, "loss": 0.71577287, "num_input_tokens_seen": 73764280, "step": 3413, "time_per_iteration": 2.7072951793670654 }, { "auxiliary_loss_clip": 0.01099446, "auxiliary_loss_mlp": 0.01029054, "balance_loss_clip": 1.05372643, "balance_loss_mlp": 1.02157962, "epoch": 0.41050922864185657, "flos": 14355865895040.0, "grad_norm": 2.3070040174232265, "language_loss": 0.79688072, "learning_rate": 2.6640757107614714e-06, "loss": 0.81816566, "num_input_tokens_seen": 73782375, "step": 3414, "time_per_iteration": 2.748333215713501 }, { "auxiliary_loss_clip": 0.01138321, "auxiliary_loss_mlp": 0.01027101, "balance_loss_clip": 1.05601966, "balance_loss_mlp": 1.01877999, "epoch": 0.4106294715324956, "flos": 30956290813440.0, "grad_norm": 2.7554857125984604, "language_loss": 0.69061184, "learning_rate": 2.6633408828924697e-06, "loss": 0.71226609, "num_input_tokens_seen": 73801240, "step": 3415, "time_per_iteration": 2.7828757762908936 }, { "auxiliary_loss_clip": 0.01148808, "auxiliary_loss_mlp": 0.01029541, "balance_loss_clip": 1.05336714, "balance_loss_mlp": 1.02142882, "epoch": 0.41074971442313474, "flos": 24457321209600.0, "grad_norm": 1.701127702143439, "language_loss": 0.69757324, "learning_rate": 2.662605954395185e-06, "loss": 0.71935678, "num_input_tokens_seen": 73821200, "step": 3416, "time_per_iteration": 2.718953847885132 }, { "auxiliary_loss_clip": 0.01170878, "auxiliary_loss_mlp": 0.01025647, "balance_loss_clip": 1.05406451, "balance_loss_mlp": 1.01779675, "epoch": 0.41086995731377385, "flos": 21542991235200.0, "grad_norm": 2.115781831125338, "language_loss": 0.83839524, "learning_rate": 2.6618709253811027e-06, "loss": 0.8603605, "num_input_tokens_seen": 73840655, "step": 3417, "time_per_iteration": 3.487112283706665 }, { "auxiliary_loss_clip": 0.01183298, "auxiliary_loss_mlp": 0.0102205, "balance_loss_clip": 1.05707192, "balance_loss_mlp": 1.01483798, "epoch": 0.4109902002044129, "flos": 20702753314560.0, "grad_norm": 2.046404084579151, "language_loss": 0.87615246, "learning_rate": 2.6611357959617277e-06, "loss": 0.89820594, "num_input_tokens_seen": 73860275, "step": 3418, "time_per_iteration": 3.6761178970336914 }, { "auxiliary_loss_clip": 0.0113277, "auxiliary_loss_mlp": 0.01026493, "balance_loss_clip": 1.05159044, "balance_loss_mlp": 1.01790988, "epoch": 0.411110443095052, "flos": 18179992477440.0, "grad_norm": 1.8900532560463383, "language_loss": 0.91232657, "learning_rate": 2.660400566248578e-06, "loss": 0.93391925, "num_input_tokens_seen": 73878400, "step": 3419, "time_per_iteration": 3.6593151092529297 }, { "auxiliary_loss_clip": 0.01136777, "auxiliary_loss_mlp": 0.01024414, "balance_loss_clip": 1.05077982, "balance_loss_mlp": 1.01590824, "epoch": 0.41123068598569107, "flos": 14575244209920.0, "grad_norm": 2.344090534604166, "language_loss": 0.67187285, "learning_rate": 2.6596652363531876e-06, "loss": 0.69348478, "num_input_tokens_seen": 73894275, "step": 3420, "time_per_iteration": 3.614286184310913 }, { "auxiliary_loss_clip": 0.01186501, "auxiliary_loss_mlp": 0.01025133, "balance_loss_clip": 1.05880857, "balance_loss_mlp": 1.01783133, "epoch": 0.4113509288763302, "flos": 21177995184000.0, "grad_norm": 1.5851277332756795, "language_loss": 0.78342652, "learning_rate": 2.6589298063871055e-06, "loss": 0.80554289, "num_input_tokens_seen": 73914450, "step": 3421, "time_per_iteration": 2.839782238006592 }, { "auxiliary_loss_clip": 0.01190001, "auxiliary_loss_mlp": 0.01024905, "balance_loss_clip": 1.061553, "balance_loss_mlp": 1.01678681, "epoch": 0.4114711717669693, "flos": 18442212739200.0, "grad_norm": 3.0009404368513843, "language_loss": 0.69447821, "learning_rate": 2.658194276461895e-06, "loss": 0.71662724, "num_input_tokens_seen": 73932375, "step": 3422, "time_per_iteration": 2.6227304935455322 }, { "auxiliary_loss_clip": 0.01154669, "auxiliary_loss_mlp": 0.01027886, "balance_loss_clip": 1.05214322, "balance_loss_mlp": 1.01916015, "epoch": 0.41159141465760835, "flos": 27233395735680.0, "grad_norm": 2.3412913289490667, "language_loss": 0.67199904, "learning_rate": 2.6574586466891368e-06, "loss": 0.69382453, "num_input_tokens_seen": 73952850, "step": 3423, "time_per_iteration": 2.7822647094726562 }, { "auxiliary_loss_clip": 0.0115624, "auxiliary_loss_mlp": 0.00711034, "balance_loss_clip": 1.05469799, "balance_loss_mlp": 1.00091934, "epoch": 0.41171165754824746, "flos": 20006876154240.0, "grad_norm": 2.105053843977782, "language_loss": 0.64783686, "learning_rate": 2.6567229171804247e-06, "loss": 0.66650963, "num_input_tokens_seen": 73970735, "step": 3424, "time_per_iteration": 2.773070812225342 }, { "auxiliary_loss_clip": 0.01147202, "auxiliary_loss_mlp": 0.0102518, "balance_loss_clip": 1.05087495, "balance_loss_mlp": 1.01668632, "epoch": 0.41183190043888657, "flos": 18004318035840.0, "grad_norm": 2.4068918568925306, "language_loss": 0.88146579, "learning_rate": 2.655987088047368e-06, "loss": 0.90318966, "num_input_tokens_seen": 73989080, "step": 3425, "time_per_iteration": 2.6961631774902344 }, { "auxiliary_loss_clip": 0.01153089, "auxiliary_loss_mlp": 0.01024827, "balance_loss_clip": 1.05367458, "balance_loss_mlp": 1.01691747, "epoch": 0.4119521433295256, "flos": 27163370171520.0, "grad_norm": 1.9362411923076615, "language_loss": 0.78620458, "learning_rate": 2.6552511594015912e-06, "loss": 0.80798376, "num_input_tokens_seen": 74009470, "step": 3426, "time_per_iteration": 2.8054864406585693 }, { "auxiliary_loss_clip": 0.01150753, "auxiliary_loss_mlp": 0.01025089, "balance_loss_clip": 1.050318, "balance_loss_mlp": 1.01697683, "epoch": 0.41207238622016473, "flos": 15122020014720.0, "grad_norm": 2.269186301957088, "language_loss": 0.85214365, "learning_rate": 2.654515131354735e-06, "loss": 0.87390202, "num_input_tokens_seen": 74027735, "step": 3427, "time_per_iteration": 2.7200891971588135 }, { "auxiliary_loss_clip": 0.0114012, "auxiliary_loss_mlp": 0.01026604, "balance_loss_clip": 1.05409682, "balance_loss_mlp": 1.01867676, "epoch": 0.41219262911080384, "flos": 27052872958080.0, "grad_norm": 2.264485136403836, "language_loss": 0.8428843, "learning_rate": 2.653779004018453e-06, "loss": 0.86455154, "num_input_tokens_seen": 74048300, "step": 3428, "time_per_iteration": 2.8097000122070312 }, { "auxiliary_loss_clip": 0.01149157, "auxiliary_loss_mlp": 0.01022262, "balance_loss_clip": 1.05258167, "balance_loss_mlp": 1.01446569, "epoch": 0.4123128720014429, "flos": 24686360282880.0, "grad_norm": 2.1918464627578214, "language_loss": 0.82402885, "learning_rate": 2.653042777504417e-06, "loss": 0.84574306, "num_input_tokens_seen": 74070890, "step": 3429, "time_per_iteration": 2.7862207889556885 }, { "auxiliary_loss_clip": 0.01162871, "auxiliary_loss_mlp": 0.01023903, "balance_loss_clip": 1.05506182, "balance_loss_mlp": 1.01592183, "epoch": 0.412433114892082, "flos": 26244774731520.0, "grad_norm": 1.891224385386458, "language_loss": 0.79996324, "learning_rate": 2.6523064519243105e-06, "loss": 0.82183093, "num_input_tokens_seen": 74090460, "step": 3430, "time_per_iteration": 2.82605242729187 }, { "auxiliary_loss_clip": 0.01171635, "auxiliary_loss_mlp": 0.01024407, "balance_loss_clip": 1.05804563, "balance_loss_mlp": 1.01628339, "epoch": 0.4125533577827211, "flos": 21361031913600.0, "grad_norm": 2.889040765051659, "language_loss": 0.79451317, "learning_rate": 2.6515700273898333e-06, "loss": 0.8164736, "num_input_tokens_seen": 74108335, "step": 3431, "time_per_iteration": 2.793617010116577 }, { "auxiliary_loss_clip": 0.01147634, "auxiliary_loss_mlp": 0.01024093, "balance_loss_clip": 1.05403686, "balance_loss_mlp": 1.01664221, "epoch": 0.4126736006733602, "flos": 26067556005120.0, "grad_norm": 2.845930423022378, "language_loss": 0.69066799, "learning_rate": 2.6508335040127018e-06, "loss": 0.7123853, "num_input_tokens_seen": 74128030, "step": 3432, "time_per_iteration": 2.8114352226257324 }, { "auxiliary_loss_clip": 0.01176052, "auxiliary_loss_mlp": 0.01027623, "balance_loss_clip": 1.05697036, "balance_loss_mlp": 1.01985621, "epoch": 0.4127938435639993, "flos": 25666146541440.0, "grad_norm": 2.071431009913368, "language_loss": 0.77209032, "learning_rate": 2.6500968819046446e-06, "loss": 0.79412705, "num_input_tokens_seen": 74148330, "step": 3433, "time_per_iteration": 2.8093960285186768 }, { "auxiliary_loss_clip": 0.0112733, "auxiliary_loss_mlp": 0.01026699, "balance_loss_clip": 1.0484283, "balance_loss_mlp": 1.01936769, "epoch": 0.4129140864546384, "flos": 17995914253440.0, "grad_norm": 2.702090656830342, "language_loss": 0.59039176, "learning_rate": 2.649360161177408e-06, "loss": 0.61193204, "num_input_tokens_seen": 74163390, "step": 3434, "time_per_iteration": 2.718594789505005 }, { "auxiliary_loss_clip": 0.01178575, "auxiliary_loss_mlp": 0.01033738, "balance_loss_clip": 1.05666661, "balance_loss_mlp": 1.02545881, "epoch": 0.41303432934527745, "flos": 23732895715200.0, "grad_norm": 2.236993488587779, "language_loss": 0.73453677, "learning_rate": 2.6486233419427504e-06, "loss": 0.75665987, "num_input_tokens_seen": 74183205, "step": 3435, "time_per_iteration": 2.7520551681518555 }, { "auxiliary_loss_clip": 0.01130299, "auxiliary_loss_mlp": 0.01028534, "balance_loss_clip": 1.05299497, "balance_loss_mlp": 1.02010584, "epoch": 0.41315457223591656, "flos": 19755286318080.0, "grad_norm": 8.197503223610388, "language_loss": 0.75118291, "learning_rate": 2.6478864243124484e-06, "loss": 0.77277124, "num_input_tokens_seen": 74202870, "step": 3436, "time_per_iteration": 2.829798698425293 }, { "auxiliary_loss_clip": 0.01170216, "auxiliary_loss_mlp": 0.0102716, "balance_loss_clip": 1.05234003, "balance_loss_mlp": 1.01958477, "epoch": 0.4132748151265556, "flos": 20923316778240.0, "grad_norm": 1.7546184113885883, "language_loss": 0.85337865, "learning_rate": 2.6471494083982903e-06, "loss": 0.87535238, "num_input_tokens_seen": 74222255, "step": 3437, "time_per_iteration": 2.7339980602264404 }, { "auxiliary_loss_clip": 0.01141779, "auxiliary_loss_mlp": 0.01026557, "balance_loss_clip": 1.05177069, "balance_loss_mlp": 1.01850402, "epoch": 0.4133950580171947, "flos": 32232520016640.0, "grad_norm": 2.555106606839425, "language_loss": 0.74870181, "learning_rate": 2.6464122943120818e-06, "loss": 0.77038515, "num_input_tokens_seen": 74242480, "step": 3438, "time_per_iteration": 2.795606851577759 }, { "auxiliary_loss_clip": 0.01138213, "auxiliary_loss_mlp": 0.01025191, "balance_loss_clip": 1.05414629, "balance_loss_mlp": 1.01699507, "epoch": 0.41351530090783384, "flos": 23292487059840.0, "grad_norm": 3.1171782844113336, "language_loss": 0.82402986, "learning_rate": 2.645675082165642e-06, "loss": 0.84566391, "num_input_tokens_seen": 74258690, "step": 3439, "time_per_iteration": 2.765629768371582 }, { "auxiliary_loss_clip": 0.01156853, "auxiliary_loss_mlp": 0.01029005, "balance_loss_clip": 1.05582798, "balance_loss_mlp": 1.02091336, "epoch": 0.4136355437984729, "flos": 25593571111680.0, "grad_norm": 2.8506439379036004, "language_loss": 0.75159943, "learning_rate": 2.644937772070806e-06, "loss": 0.773458, "num_input_tokens_seen": 74277135, "step": 3440, "time_per_iteration": 2.7145276069641113 }, { "auxiliary_loss_clip": 0.01189472, "auxiliary_loss_mlp": 0.01025222, "balance_loss_clip": 1.05924022, "balance_loss_mlp": 1.01750886, "epoch": 0.413755786689112, "flos": 19828615933440.0, "grad_norm": 2.7014481920330873, "language_loss": 0.83193231, "learning_rate": 2.6442003641394225e-06, "loss": 0.85407925, "num_input_tokens_seen": 74294730, "step": 3441, "time_per_iteration": 2.6814682483673096 }, { "auxiliary_loss_clip": 0.01152086, "auxiliary_loss_mlp": 0.01029267, "balance_loss_clip": 1.05187488, "balance_loss_mlp": 1.02158999, "epoch": 0.4138760295797511, "flos": 26870446759680.0, "grad_norm": 1.4822704476299766, "language_loss": 0.8375231, "learning_rate": 2.643462858483356e-06, "loss": 0.85933667, "num_input_tokens_seen": 74315015, "step": 3442, "time_per_iteration": 2.7844619750976562 }, { "auxiliary_loss_clip": 0.0112065, "auxiliary_loss_mlp": 0.01028289, "balance_loss_clip": 1.0505439, "balance_loss_mlp": 1.01970601, "epoch": 0.41399627247039017, "flos": 16399254798720.0, "grad_norm": 1.908791539493007, "language_loss": 0.72818244, "learning_rate": 2.6427252552144856e-06, "loss": 0.74967182, "num_input_tokens_seen": 74333665, "step": 3443, "time_per_iteration": 2.7701919078826904 }, { "auxiliary_loss_clip": 0.01185527, "auxiliary_loss_mlp": 0.01032852, "balance_loss_clip": 1.05570066, "balance_loss_mlp": 1.02456665, "epoch": 0.4141165153610293, "flos": 22930220442240.0, "grad_norm": 6.4551992663971, "language_loss": 0.75268883, "learning_rate": 2.6419875544447044e-06, "loss": 0.77487254, "num_input_tokens_seen": 74355065, "step": 3444, "time_per_iteration": 4.577687740325928 }, { "auxiliary_loss_clip": 0.01189329, "auxiliary_loss_mlp": 0.01029934, "balance_loss_clip": 1.05783963, "balance_loss_mlp": 1.02210176, "epoch": 0.4142367582516684, "flos": 25192556697600.0, "grad_norm": 1.6944170991657939, "language_loss": 0.71497154, "learning_rate": 2.6412497562859218e-06, "loss": 0.73716414, "num_input_tokens_seen": 74376345, "step": 3445, "time_per_iteration": 3.6613471508026123 }, { "auxiliary_loss_clip": 0.01177552, "auxiliary_loss_mlp": 0.01032781, "balance_loss_clip": 1.05666888, "balance_loss_mlp": 1.02469885, "epoch": 0.41435700114230745, "flos": 21690476478720.0, "grad_norm": 4.92203488861428, "language_loss": 0.75993729, "learning_rate": 2.6405118608500617e-06, "loss": 0.7820406, "num_input_tokens_seen": 74395170, "step": 3446, "time_per_iteration": 3.6368303298950195 }, { "auxiliary_loss_clip": 0.01132726, "auxiliary_loss_mlp": 0.01024859, "balance_loss_clip": 1.05351138, "balance_loss_mlp": 1.01748931, "epoch": 0.41447724403294656, "flos": 25995160143360.0, "grad_norm": 1.8416819429466924, "language_loss": 0.81463927, "learning_rate": 2.6397738682490613e-06, "loss": 0.83621508, "num_input_tokens_seen": 74416070, "step": 3447, "time_per_iteration": 2.6834752559661865 }, { "auxiliary_loss_clip": 0.01188442, "auxiliary_loss_mlp": 0.01025617, "balance_loss_clip": 1.05821896, "balance_loss_mlp": 1.01777339, "epoch": 0.41459748692358567, "flos": 18259678800000.0, "grad_norm": 1.8743543760441712, "language_loss": 0.75515962, "learning_rate": 2.6390357785948734e-06, "loss": 0.77730024, "num_input_tokens_seen": 74433185, "step": 3448, "time_per_iteration": 2.579084634780884 }, { "auxiliary_loss_clip": 0.01171607, "auxiliary_loss_mlp": 0.01026545, "balance_loss_clip": 1.05687416, "balance_loss_mlp": 1.01833153, "epoch": 0.4147177298142247, "flos": 24168456034560.0, "grad_norm": 1.7411055404387823, "language_loss": 0.79769611, "learning_rate": 2.6382975919994667e-06, "loss": 0.81967759, "num_input_tokens_seen": 74453760, "step": 3449, "time_per_iteration": 2.6457207202911377 }, { "auxiliary_loss_clip": 0.01157485, "auxiliary_loss_mlp": 0.01030246, "balance_loss_clip": 1.05409992, "balance_loss_mlp": 1.02328086, "epoch": 0.41483797270486383, "flos": 20084659056000.0, "grad_norm": 1.7545729930054188, "language_loss": 0.72889316, "learning_rate": 2.637559308574822e-06, "loss": 0.75077045, "num_input_tokens_seen": 74473505, "step": 3450, "time_per_iteration": 2.659238576889038 }, { "auxiliary_loss_clip": 0.01188807, "auxiliary_loss_mlp": 0.01029217, "balance_loss_clip": 1.05865502, "balance_loss_mlp": 1.02161157, "epoch": 0.4149582155955029, "flos": 30081040110720.0, "grad_norm": 27.658963136897047, "language_loss": 0.71010554, "learning_rate": 2.6368209284329376e-06, "loss": 0.73228586, "num_input_tokens_seen": 74494135, "step": 3451, "time_per_iteration": 2.6634576320648193 }, { "auxiliary_loss_clip": 0.01171888, "auxiliary_loss_mlp": 0.01032603, "balance_loss_clip": 1.05564809, "balance_loss_mlp": 1.02475858, "epoch": 0.415078458486142, "flos": 16764394504320.0, "grad_norm": 2.75322695163948, "language_loss": 0.75075394, "learning_rate": 2.636082451685825e-06, "loss": 0.77279884, "num_input_tokens_seen": 74512335, "step": 3452, "time_per_iteration": 2.694155693054199 }, { "auxiliary_loss_clip": 0.01161098, "auxiliary_loss_mlp": 0.01026591, "balance_loss_clip": 1.05710101, "balance_loss_mlp": 1.01893508, "epoch": 0.4151987013767811, "flos": 26033692458240.0, "grad_norm": 1.761343263471496, "language_loss": 0.86603892, "learning_rate": 2.6353438784455094e-06, "loss": 0.88791579, "num_input_tokens_seen": 74535620, "step": 3453, "time_per_iteration": 2.689771890640259 }, { "auxiliary_loss_clip": 0.01151654, "auxiliary_loss_mlp": 0.01031986, "balance_loss_clip": 1.05445147, "balance_loss_mlp": 1.02387428, "epoch": 0.41531894426742016, "flos": 24608002763520.0, "grad_norm": 2.112723208665275, "language_loss": 0.71320474, "learning_rate": 2.6346052088240326e-06, "loss": 0.73504114, "num_input_tokens_seen": 74555140, "step": 3454, "time_per_iteration": 2.722808361053467 }, { "auxiliary_loss_clip": 0.0115524, "auxiliary_loss_mlp": 0.0103193, "balance_loss_clip": 1.05365324, "balance_loss_mlp": 1.02399707, "epoch": 0.4154391871580593, "flos": 14975791747200.0, "grad_norm": 1.9850008357181295, "language_loss": 0.77338696, "learning_rate": 2.63386644293345e-06, "loss": 0.79525858, "num_input_tokens_seen": 74571485, "step": 3455, "time_per_iteration": 2.6218464374542236 }, { "auxiliary_loss_clip": 0.01133557, "auxiliary_loss_mlp": 0.01024233, "balance_loss_clip": 1.0485127, "balance_loss_mlp": 1.01701736, "epoch": 0.4155594300486984, "flos": 14647173194880.0, "grad_norm": 2.487059343991328, "language_loss": 0.82910794, "learning_rate": 2.633127580885833e-06, "loss": 0.85068583, "num_input_tokens_seen": 74585985, "step": 3456, "time_per_iteration": 2.651275873184204 }, { "auxiliary_loss_clip": 0.01188429, "auxiliary_loss_mlp": 0.01027413, "balance_loss_clip": 1.05996418, "balance_loss_mlp": 1.01921177, "epoch": 0.41567967293933744, "flos": 29497276275840.0, "grad_norm": 2.085252081888909, "language_loss": 0.64793575, "learning_rate": 2.632388622793265e-06, "loss": 0.67009413, "num_input_tokens_seen": 74605140, "step": 3457, "time_per_iteration": 2.617628335952759 }, { "auxiliary_loss_clip": 0.01170622, "auxiliary_loss_mlp": 0.01028874, "balance_loss_clip": 1.05693316, "balance_loss_mlp": 1.02117932, "epoch": 0.41579991582997655, "flos": 19238387650560.0, "grad_norm": 2.158830950997215, "language_loss": 0.67876601, "learning_rate": 2.6316495687678457e-06, "loss": 0.70076096, "num_input_tokens_seen": 74623790, "step": 3458, "time_per_iteration": 2.646672010421753 }, { "auxiliary_loss_clip": 0.01116457, "auxiliary_loss_mlp": 0.0102938, "balance_loss_clip": 1.04849315, "balance_loss_mlp": 1.02232003, "epoch": 0.41592015872061566, "flos": 24462061804800.0, "grad_norm": 7.959551084142448, "language_loss": 0.76532823, "learning_rate": 2.6309104189216887e-06, "loss": 0.78678662, "num_input_tokens_seen": 74641355, "step": 3459, "time_per_iteration": 2.7886438369750977 }, { "auxiliary_loss_clip": 0.01125202, "auxiliary_loss_mlp": 0.00711986, "balance_loss_clip": 1.04866171, "balance_loss_mlp": 1.00074565, "epoch": 0.4160404016112547, "flos": 20775651966720.0, "grad_norm": 2.4244703154455745, "language_loss": 0.74859881, "learning_rate": 2.630171173366923e-06, "loss": 0.76697063, "num_input_tokens_seen": 74657155, "step": 3460, "time_per_iteration": 2.6706857681274414 }, { "auxiliary_loss_clip": 0.0112206, "auxiliary_loss_mlp": 0.01031482, "balance_loss_clip": 1.05018961, "balance_loss_mlp": 1.02334583, "epoch": 0.41616064450189383, "flos": 13916462820480.0, "grad_norm": 2.4698461014111595, "language_loss": 0.74935842, "learning_rate": 2.629431832215691e-06, "loss": 0.77089381, "num_input_tokens_seen": 74671960, "step": 3461, "time_per_iteration": 2.665259838104248 }, { "auxiliary_loss_clip": 0.01149162, "auxiliary_loss_mlp": 0.01025918, "balance_loss_clip": 1.05231547, "balance_loss_mlp": 1.01811028, "epoch": 0.41628088739253294, "flos": 20010826650240.0, "grad_norm": 1.6064248287372354, "language_loss": 0.87182945, "learning_rate": 2.628692395580151e-06, "loss": 0.89358026, "num_input_tokens_seen": 74692050, "step": 3462, "time_per_iteration": 2.6491012573242188 }, { "auxiliary_loss_clip": 0.01087046, "auxiliary_loss_mlp": 0.0102636, "balance_loss_clip": 1.04372454, "balance_loss_mlp": 1.018641, "epoch": 0.416401130283172, "flos": 29168801377920.0, "grad_norm": 1.8502565926004038, "language_loss": 0.79229134, "learning_rate": 2.6279528635724747e-06, "loss": 0.81342542, "num_input_tokens_seen": 74712205, "step": 3463, "time_per_iteration": 2.8952038288116455 }, { "auxiliary_loss_clip": 0.01171286, "auxiliary_loss_mlp": 0.01028006, "balance_loss_clip": 1.05656242, "balance_loss_mlp": 1.01920831, "epoch": 0.4165213731738111, "flos": 16246813478400.0, "grad_norm": 4.4530070358321145, "language_loss": 0.7820307, "learning_rate": 2.627213236304848e-06, "loss": 0.80402362, "num_input_tokens_seen": 74729005, "step": 3464, "time_per_iteration": 2.6221864223480225 }, { "auxiliary_loss_clip": 0.01174548, "auxiliary_loss_mlp": 0.01025127, "balance_loss_clip": 1.05606556, "balance_loss_mlp": 1.01761723, "epoch": 0.4166416160644502, "flos": 33765438787200.0, "grad_norm": 1.7912729337333853, "language_loss": 0.70897818, "learning_rate": 2.626473513889472e-06, "loss": 0.73097497, "num_input_tokens_seen": 74751385, "step": 3465, "time_per_iteration": 2.766184091567993 }, { "auxiliary_loss_clip": 0.01163387, "auxiliary_loss_mlp": 0.01027161, "balance_loss_clip": 1.05467021, "balance_loss_mlp": 1.0196178, "epoch": 0.41676185895508927, "flos": 20917498775040.0, "grad_norm": 1.9122820097907984, "language_loss": 0.82768762, "learning_rate": 2.625733696438562e-06, "loss": 0.84959316, "num_input_tokens_seen": 74768890, "step": 3466, "time_per_iteration": 2.652785301208496 }, { "auxiliary_loss_clip": 0.0115187, "auxiliary_loss_mlp": 0.01025986, "balance_loss_clip": 1.05328989, "balance_loss_mlp": 1.01807046, "epoch": 0.4168821018457284, "flos": 18406122549120.0, "grad_norm": 2.168169156039986, "language_loss": 0.74890912, "learning_rate": 2.6249937840643476e-06, "loss": 0.7706877, "num_input_tokens_seen": 74787195, "step": 3467, "time_per_iteration": 2.6627261638641357 }, { "auxiliary_loss_clip": 0.01187369, "auxiliary_loss_mlp": 0.00711643, "balance_loss_clip": 1.05885983, "balance_loss_mlp": 1.00082397, "epoch": 0.41700234473636744, "flos": 18698399516160.0, "grad_norm": 1.766070135922566, "language_loss": 0.66548705, "learning_rate": 2.6242537768790733e-06, "loss": 0.68447721, "num_input_tokens_seen": 74806350, "step": 3468, "time_per_iteration": 2.6268069744110107 }, { "auxiliary_loss_clip": 0.01168966, "auxiliary_loss_mlp": 0.0102791, "balance_loss_clip": 1.05464065, "balance_loss_mlp": 1.01972067, "epoch": 0.41712258762700655, "flos": 31033283616000.0, "grad_norm": 1.9961933514938448, "language_loss": 0.68898511, "learning_rate": 2.6235136749949975e-06, "loss": 0.71095383, "num_input_tokens_seen": 74829800, "step": 3469, "time_per_iteration": 2.7493607997894287 }, { "auxiliary_loss_clip": 0.01183587, "auxiliary_loss_mlp": 0.01026296, "balance_loss_clip": 1.05543542, "balance_loss_mlp": 1.01855886, "epoch": 0.41724283051764566, "flos": 35914763877120.0, "grad_norm": 3.182542812716345, "language_loss": 0.61688071, "learning_rate": 2.6227734785243924e-06, "loss": 0.63897955, "num_input_tokens_seen": 74849760, "step": 3470, "time_per_iteration": 4.582306146621704 }, { "auxiliary_loss_clip": 0.01098106, "auxiliary_loss_mlp": 0.01023789, "balance_loss_clip": 1.04607296, "balance_loss_mlp": 1.01642811, "epoch": 0.4173630734082847, "flos": 25333649320320.0, "grad_norm": 1.9157843094491909, "language_loss": 0.79317039, "learning_rate": 2.6220331875795466e-06, "loss": 0.81438935, "num_input_tokens_seen": 74869110, "step": 3471, "time_per_iteration": 3.7012221813201904 }, { "auxiliary_loss_clip": 0.01168122, "auxiliary_loss_mlp": 0.01029606, "balance_loss_clip": 1.05695295, "balance_loss_mlp": 1.02173245, "epoch": 0.4174833162989238, "flos": 26685398868480.0, "grad_norm": 1.629642989916144, "language_loss": 0.74958515, "learning_rate": 2.62129280227276e-06, "loss": 0.77156246, "num_input_tokens_seen": 74889110, "step": 3472, "time_per_iteration": 3.6730401515960693 }, { "auxiliary_loss_clip": 0.01175873, "auxiliary_loss_mlp": 0.01029927, "balance_loss_clip": 1.05639219, "balance_loss_mlp": 1.02224374, "epoch": 0.41760355918956293, "flos": 74739584010240.0, "grad_norm": 1.8990847986421515, "language_loss": 0.68845987, "learning_rate": 2.62055232271635e-06, "loss": 0.71051788, "num_input_tokens_seen": 74916260, "step": 3473, "time_per_iteration": 3.0832769870758057 }, { "auxiliary_loss_clip": 0.01126842, "auxiliary_loss_mlp": 0.01029011, "balance_loss_clip": 1.04704082, "balance_loss_mlp": 1.02101827, "epoch": 0.417723802080202, "flos": 14317513148160.0, "grad_norm": 2.2981462655157086, "language_loss": 0.87799668, "learning_rate": 2.619811749022646e-06, "loss": 0.89955521, "num_input_tokens_seen": 74931570, "step": 3474, "time_per_iteration": 2.7304160594940186 }, { "auxiliary_loss_clip": 0.01172182, "auxiliary_loss_mlp": 0.01026379, "balance_loss_clip": 1.05707598, "balance_loss_mlp": 1.01734924, "epoch": 0.4178440449708411, "flos": 14643797316480.0, "grad_norm": 2.973501150753605, "language_loss": 0.71000069, "learning_rate": 2.6190710813039917e-06, "loss": 0.73198628, "num_input_tokens_seen": 74944695, "step": 3475, "time_per_iteration": 2.6943399906158447 }, { "auxiliary_loss_clip": 0.01115837, "auxiliary_loss_mlp": 0.00712107, "balance_loss_clip": 1.04545903, "balance_loss_mlp": 1.00076032, "epoch": 0.4179642878614802, "flos": 21507296094720.0, "grad_norm": 2.183514169555396, "language_loss": 0.83488214, "learning_rate": 2.618330319672747e-06, "loss": 0.85316163, "num_input_tokens_seen": 74964115, "step": 3476, "time_per_iteration": 2.7513420581817627 }, { "auxiliary_loss_clip": 0.01183569, "auxiliary_loss_mlp": 0.01026429, "balance_loss_clip": 1.05492318, "balance_loss_mlp": 1.01912713, "epoch": 0.41808453075211927, "flos": 18441997257600.0, "grad_norm": 2.9967467869306774, "language_loss": 0.92178375, "learning_rate": 2.617589464241284e-06, "loss": 0.94388366, "num_input_tokens_seen": 74978515, "step": 3477, "time_per_iteration": 2.5773234367370605 }, { "auxiliary_loss_clip": 0.0114104, "auxiliary_loss_mlp": 0.01029787, "balance_loss_clip": 1.05215049, "balance_loss_mlp": 1.02200902, "epoch": 0.4182047736427584, "flos": 20301020628480.0, "grad_norm": 1.9420494115364844, "language_loss": 0.74571931, "learning_rate": 2.6168485151219914e-06, "loss": 0.76742756, "num_input_tokens_seen": 74998135, "step": 3478, "time_per_iteration": 2.709616184234619 }, { "auxiliary_loss_clip": 0.01171223, "auxiliary_loss_mlp": 0.01028819, "balance_loss_clip": 1.05730438, "balance_loss_mlp": 1.02084947, "epoch": 0.4183250165333975, "flos": 18876623823360.0, "grad_norm": 2.962483445289872, "language_loss": 0.71639425, "learning_rate": 2.616107472427269e-06, "loss": 0.73839468, "num_input_tokens_seen": 75012830, "step": 3479, "time_per_iteration": 2.5814106464385986 }, { "auxiliary_loss_clip": 0.01176311, "auxiliary_loss_mlp": 0.01025824, "balance_loss_clip": 1.05589914, "balance_loss_mlp": 1.01809335, "epoch": 0.41844525942403654, "flos": 17740050698880.0, "grad_norm": 2.5074195825935988, "language_loss": 0.7600255, "learning_rate": 2.615366336269533e-06, "loss": 0.78204685, "num_input_tokens_seen": 75026495, "step": 3480, "time_per_iteration": 2.6337051391601562 }, { "auxiliary_loss_clip": 0.01189596, "auxiliary_loss_mlp": 0.01027558, "balance_loss_clip": 1.0576067, "balance_loss_mlp": 1.01913607, "epoch": 0.41856550231467565, "flos": 18361377181440.0, "grad_norm": 2.603435211031999, "language_loss": 0.80391371, "learning_rate": 2.6146251067612126e-06, "loss": 0.82608527, "num_input_tokens_seen": 75041970, "step": 3481, "time_per_iteration": 2.5805368423461914 }, { "auxiliary_loss_clip": 0.01171272, "auxiliary_loss_mlp": 0.01024091, "balance_loss_clip": 1.05837274, "balance_loss_mlp": 1.01597905, "epoch": 0.41868574520531476, "flos": 22781801445120.0, "grad_norm": 2.5502896503987325, "language_loss": 0.82772458, "learning_rate": 2.6138837840147525e-06, "loss": 0.84967822, "num_input_tokens_seen": 75061005, "step": 3482, "time_per_iteration": 2.6888928413391113 }, { "auxiliary_loss_clip": 0.0113349, "auxiliary_loss_mlp": 0.01023678, "balance_loss_clip": 1.05004025, "balance_loss_mlp": 1.01608467, "epoch": 0.4188059880959538, "flos": 13699167494400.0, "grad_norm": 3.746036513659577, "language_loss": 0.76408297, "learning_rate": 2.6131423681426103e-06, "loss": 0.78565466, "num_input_tokens_seen": 75076920, "step": 3483, "time_per_iteration": 2.638364791870117 }, { "auxiliary_loss_clip": 0.01187138, "auxiliary_loss_mlp": 0.01021514, "balance_loss_clip": 1.05860233, "balance_loss_mlp": 1.01389122, "epoch": 0.41892623098659293, "flos": 37818281220480.0, "grad_norm": 1.6447868824060885, "language_loss": 0.73048544, "learning_rate": 2.6124008592572587e-06, "loss": 0.75257194, "num_input_tokens_seen": 75100905, "step": 3484, "time_per_iteration": 2.75152325630188 }, { "auxiliary_loss_clip": 0.01190051, "auxiliary_loss_mlp": 0.01027159, "balance_loss_clip": 1.05760825, "balance_loss_mlp": 1.01909471, "epoch": 0.419046473877232, "flos": 23258874908160.0, "grad_norm": 2.59837767440697, "language_loss": 0.82041574, "learning_rate": 2.6116592574711835e-06, "loss": 0.84258783, "num_input_tokens_seen": 75119205, "step": 3485, "time_per_iteration": 2.5931942462921143 }, { "auxiliary_loss_clip": 0.01189025, "auxiliary_loss_mlp": 0.010339, "balance_loss_clip": 1.05854547, "balance_loss_mlp": 1.02504849, "epoch": 0.4191667167678711, "flos": 20741034234240.0, "grad_norm": 1.9360382670563612, "language_loss": 0.844239, "learning_rate": 2.6109175628968853e-06, "loss": 0.86646825, "num_input_tokens_seen": 75138970, "step": 3486, "time_per_iteration": 2.6640005111694336 }, { "auxiliary_loss_clip": 0.01159335, "auxiliary_loss_mlp": 0.01020229, "balance_loss_clip": 1.05246091, "balance_loss_mlp": 1.01333261, "epoch": 0.4192869596585102, "flos": 23586416052480.0, "grad_norm": 2.351752393658586, "language_loss": 0.82974041, "learning_rate": 2.610175775646878e-06, "loss": 0.85153604, "num_input_tokens_seen": 75157550, "step": 3487, "time_per_iteration": 2.6151363849639893 }, { "auxiliary_loss_clip": 0.01150338, "auxiliary_loss_mlp": 0.01027046, "balance_loss_clip": 1.05073547, "balance_loss_mlp": 1.01887727, "epoch": 0.41940720254914926, "flos": 25081269384960.0, "grad_norm": 2.1123186558627722, "language_loss": 0.72778016, "learning_rate": 2.6094338958336907e-06, "loss": 0.74955404, "num_input_tokens_seen": 75176220, "step": 3488, "time_per_iteration": 2.701979398727417 }, { "auxiliary_loss_clip": 0.01155384, "auxiliary_loss_mlp": 0.01028907, "balance_loss_clip": 1.0559231, "balance_loss_mlp": 1.0211885, "epoch": 0.41952744543978837, "flos": 15554132628480.0, "grad_norm": 2.4401151139885853, "language_loss": 0.82350993, "learning_rate": 2.608691923569867e-06, "loss": 0.84535289, "num_input_tokens_seen": 75193095, "step": 3489, "time_per_iteration": 2.6311194896698 }, { "auxiliary_loss_clip": 0.01175435, "auxiliary_loss_mlp": 0.01026634, "balance_loss_clip": 1.0593822, "balance_loss_mlp": 1.01908195, "epoch": 0.4196476883304275, "flos": 24644775312000.0, "grad_norm": 1.9840925596843033, "language_loss": 0.75630265, "learning_rate": 2.6079498589679616e-06, "loss": 0.77832335, "num_input_tokens_seen": 75214185, "step": 3490, "time_per_iteration": 2.69476318359375 }, { "auxiliary_loss_clip": 0.01103982, "auxiliary_loss_mlp": 0.01033864, "balance_loss_clip": 1.04821253, "balance_loss_mlp": 1.02489948, "epoch": 0.41976793122106654, "flos": 24531333183360.0, "grad_norm": 1.6911807692004108, "language_loss": 0.76100326, "learning_rate": 2.6072077021405465e-06, "loss": 0.78238171, "num_input_tokens_seen": 75233020, "step": 3491, "time_per_iteration": 2.7627804279327393 }, { "auxiliary_loss_clip": 0.01143438, "auxiliary_loss_mlp": 0.01027572, "balance_loss_clip": 1.05059648, "balance_loss_mlp": 1.01993012, "epoch": 0.41988817411170565, "flos": 21175301664000.0, "grad_norm": 1.7580733239475441, "language_loss": 0.6964705, "learning_rate": 2.6064654532002054e-06, "loss": 0.71818054, "num_input_tokens_seen": 75252030, "step": 3492, "time_per_iteration": 2.7379226684570312 }, { "auxiliary_loss_clip": 0.01185573, "auxiliary_loss_mlp": 0.01029205, "balance_loss_clip": 1.05782795, "balance_loss_mlp": 1.02208495, "epoch": 0.42000841700234476, "flos": 31649402626560.0, "grad_norm": 1.4499050636536877, "language_loss": 0.75856984, "learning_rate": 2.6057231122595375e-06, "loss": 0.78071761, "num_input_tokens_seen": 75273340, "step": 3493, "time_per_iteration": 2.6884374618530273 }, { "auxiliary_loss_clip": 0.01153997, "auxiliary_loss_mlp": 0.01028848, "balance_loss_clip": 1.05132723, "balance_loss_mlp": 1.02139759, "epoch": 0.4201286598929838, "flos": 21281525159040.0, "grad_norm": 1.8784424928492798, "language_loss": 0.72746146, "learning_rate": 2.604980679431154e-06, "loss": 0.74928999, "num_input_tokens_seen": 75291580, "step": 3494, "time_per_iteration": 2.683349847793579 }, { "auxiliary_loss_clip": 0.0117179, "auxiliary_loss_mlp": 0.01025893, "balance_loss_clip": 1.05409086, "balance_loss_mlp": 1.01798058, "epoch": 0.4202489027836229, "flos": 18546532813440.0, "grad_norm": 2.459566990486723, "language_loss": 0.74562824, "learning_rate": 2.604238154827684e-06, "loss": 0.76760507, "num_input_tokens_seen": 75308205, "step": 3495, "time_per_iteration": 2.7187211513519287 }, { "auxiliary_loss_clip": 0.01171015, "auxiliary_loss_mlp": 0.01024163, "balance_loss_clip": 1.05607796, "balance_loss_mlp": 1.01656365, "epoch": 0.42036914567426203, "flos": 19317643009920.0, "grad_norm": 2.0366075043264855, "language_loss": 0.72756934, "learning_rate": 2.6034955385617656e-06, "loss": 0.7495212, "num_input_tokens_seen": 75326535, "step": 3496, "time_per_iteration": 4.6957619190216064 }, { "auxiliary_loss_clip": 0.01084884, "auxiliary_loss_mlp": 0.01011679, "balance_loss_clip": 1.05096245, "balance_loss_mlp": 1.00991464, "epoch": 0.4204893885649011, "flos": 67842942935040.0, "grad_norm": 0.7256424503126127, "language_loss": 0.6174109, "learning_rate": 2.6027528307460544e-06, "loss": 0.63837653, "num_input_tokens_seen": 75390540, "step": 3497, "time_per_iteration": 4.3891870975494385 }, { "auxiliary_loss_clip": 0.01184229, "auxiliary_loss_mlp": 0.01023229, "balance_loss_clip": 1.05632305, "balance_loss_mlp": 1.01558805, "epoch": 0.4206096314555402, "flos": 21908777385600.0, "grad_norm": 2.0984783081365017, "language_loss": 0.86548424, "learning_rate": 2.602010031493217e-06, "loss": 0.88755882, "num_input_tokens_seen": 75408770, "step": 3498, "time_per_iteration": 3.6727237701416016 }, { "auxiliary_loss_clip": 0.01134203, "auxiliary_loss_mlp": 0.0102966, "balance_loss_clip": 1.05198741, "balance_loss_mlp": 1.02219772, "epoch": 0.42072987434617926, "flos": 29278185269760.0, "grad_norm": 6.078133581787771, "language_loss": 0.86685628, "learning_rate": 2.6012671409159367e-06, "loss": 0.88849497, "num_input_tokens_seen": 75430105, "step": 3499, "time_per_iteration": 2.782172203063965 }, { "auxiliary_loss_clip": 0.01147091, "auxiliary_loss_mlp": 0.010281, "balance_loss_clip": 1.05212379, "balance_loss_mlp": 1.02043474, "epoch": 0.42085011723681837, "flos": 27600726170880.0, "grad_norm": 1.999383079498289, "language_loss": 0.81957507, "learning_rate": 2.6005241591269097e-06, "loss": 0.84132695, "num_input_tokens_seen": 75449475, "step": 3500, "time_per_iteration": 2.7365901470184326 }, { "auxiliary_loss_clip": 0.0113582, "auxiliary_loss_mlp": 0.01027815, "balance_loss_clip": 1.05585241, "balance_loss_mlp": 1.02092171, "epoch": 0.4209703601274575, "flos": 27818632028160.0, "grad_norm": 2.3387156326641496, "language_loss": 0.79895586, "learning_rate": 2.5997810862388454e-06, "loss": 0.82059222, "num_input_tokens_seen": 75469315, "step": 3501, "time_per_iteration": 2.8111307621002197 }, { "auxiliary_loss_clip": 0.01153273, "auxiliary_loss_mlp": 0.01029543, "balance_loss_clip": 1.05139685, "balance_loss_mlp": 1.02202058, "epoch": 0.42109060301809653, "flos": 27525529048320.0, "grad_norm": 2.4983768886845716, "language_loss": 0.75725162, "learning_rate": 2.599037922364467e-06, "loss": 0.77907979, "num_input_tokens_seen": 75488215, "step": 3502, "time_per_iteration": 2.7472434043884277 }, { "auxiliary_loss_clip": 0.01134504, "auxiliary_loss_mlp": 0.01028234, "balance_loss_clip": 1.05439925, "balance_loss_mlp": 1.02083695, "epoch": 0.42121084590873564, "flos": 29314275459840.0, "grad_norm": 2.305284076472486, "language_loss": 0.7588619, "learning_rate": 2.5982946676165112e-06, "loss": 0.78048927, "num_input_tokens_seen": 75507985, "step": 3503, "time_per_iteration": 2.7420847415924072 }, { "auxiliary_loss_clip": 0.01086531, "auxiliary_loss_mlp": 0.01012834, "balance_loss_clip": 1.06173933, "balance_loss_mlp": 1.01127267, "epoch": 0.42133108879937475, "flos": 67398835178880.0, "grad_norm": 0.7287026247896257, "language_loss": 0.5758487, "learning_rate": 2.5975513221077313e-06, "loss": 0.59684235, "num_input_tokens_seen": 75571955, "step": 3504, "time_per_iteration": 3.406538724899292 }, { "auxiliary_loss_clip": 0.01144962, "auxiliary_loss_mlp": 0.01025458, "balance_loss_clip": 1.05201423, "balance_loss_mlp": 1.0183351, "epoch": 0.4214513316900138, "flos": 23106038538240.0, "grad_norm": 3.1071827349386214, "language_loss": 0.88473308, "learning_rate": 2.5968078859508897e-06, "loss": 0.90643722, "num_input_tokens_seen": 75589155, "step": 3505, "time_per_iteration": 2.638103485107422 }, { "auxiliary_loss_clip": 0.01169303, "auxiliary_loss_mlp": 0.01023486, "balance_loss_clip": 1.05569696, "balance_loss_mlp": 1.0158087, "epoch": 0.4215715745806529, "flos": 15336190857600.0, "grad_norm": 8.159652132895095, "language_loss": 0.79739904, "learning_rate": 2.5960643592587673e-06, "loss": 0.81932694, "num_input_tokens_seen": 75606565, "step": 3506, "time_per_iteration": 2.684910297393799 }, { "auxiliary_loss_clip": 0.01136336, "auxiliary_loss_mlp": 0.01029894, "balance_loss_clip": 1.05140257, "balance_loss_mlp": 1.02232432, "epoch": 0.42169181747129203, "flos": 22127257860480.0, "grad_norm": 5.2478582473217195, "language_loss": 0.81432176, "learning_rate": 2.5953207421441553e-06, "loss": 0.83598405, "num_input_tokens_seen": 75625165, "step": 3507, "time_per_iteration": 2.7198853492736816 }, { "auxiliary_loss_clip": 0.0114171, "auxiliary_loss_mlp": 0.01031771, "balance_loss_clip": 1.05467999, "balance_loss_mlp": 1.02429605, "epoch": 0.4218120603619311, "flos": 22630724841600.0, "grad_norm": 2.4174757841615686, "language_loss": 0.75289083, "learning_rate": 2.5945770347198603e-06, "loss": 0.7746256, "num_input_tokens_seen": 75643320, "step": 3508, "time_per_iteration": 2.7946972846984863 }, { "auxiliary_loss_clip": 0.01150986, "auxiliary_loss_mlp": 0.01021022, "balance_loss_clip": 1.05369282, "balance_loss_mlp": 1.01390493, "epoch": 0.4219323032525702, "flos": 19682818629120.0, "grad_norm": 2.8530842738526885, "language_loss": 0.8192668, "learning_rate": 2.593833237098701e-06, "loss": 0.84098691, "num_input_tokens_seen": 75660920, "step": 3509, "time_per_iteration": 2.772216320037842 }, { "auxiliary_loss_clip": 0.01167386, "auxiliary_loss_mlp": 0.01025539, "balance_loss_clip": 1.05299938, "balance_loss_mlp": 1.01798105, "epoch": 0.4220525461432093, "flos": 30190747224960.0, "grad_norm": 4.1125737452118765, "language_loss": 0.62430686, "learning_rate": 2.593089349393512e-06, "loss": 0.64623612, "num_input_tokens_seen": 75681410, "step": 3510, "time_per_iteration": 2.7251579761505127 }, { "auxiliary_loss_clip": 0.01170013, "auxiliary_loss_mlp": 0.01028342, "balance_loss_clip": 1.05690432, "balance_loss_mlp": 1.02081347, "epoch": 0.42217278903384836, "flos": 24315941278080.0, "grad_norm": 6.297006529834029, "language_loss": 0.83422124, "learning_rate": 2.592345371717141e-06, "loss": 0.85620487, "num_input_tokens_seen": 75700940, "step": 3511, "time_per_iteration": 2.6441092491149902 }, { "auxiliary_loss_clip": 0.01169215, "auxiliary_loss_mlp": 0.01024598, "balance_loss_clip": 1.05936718, "balance_loss_mlp": 1.01671267, "epoch": 0.42229303192448747, "flos": 17092474352640.0, "grad_norm": 2.3925449689928597, "language_loss": 0.72469437, "learning_rate": 2.591601304182448e-06, "loss": 0.74663246, "num_input_tokens_seen": 75718910, "step": 3512, "time_per_iteration": 2.653825044631958 }, { "auxiliary_loss_clip": 0.01154006, "auxiliary_loss_mlp": 0.01033947, "balance_loss_clip": 1.05641198, "balance_loss_mlp": 1.02629995, "epoch": 0.4224132748151266, "flos": 22784530878720.0, "grad_norm": 1.7929391050257673, "language_loss": 0.79320294, "learning_rate": 2.5908571469023067e-06, "loss": 0.81508243, "num_input_tokens_seen": 75738395, "step": 3513, "time_per_iteration": 2.6263768672943115 }, { "auxiliary_loss_clip": 0.01183429, "auxiliary_loss_mlp": 0.01025857, "balance_loss_clip": 1.0573324, "balance_loss_mlp": 1.01836514, "epoch": 0.42253351770576564, "flos": 17819090576640.0, "grad_norm": 3.5843351808170376, "language_loss": 0.75912756, "learning_rate": 2.5901128999896067e-06, "loss": 0.78122044, "num_input_tokens_seen": 75753825, "step": 3514, "time_per_iteration": 2.5994794368743896 }, { "auxiliary_loss_clip": 0.01165817, "auxiliary_loss_mlp": 0.01025913, "balance_loss_clip": 1.05580211, "balance_loss_mlp": 1.01885569, "epoch": 0.42265376059640475, "flos": 28512390286080.0, "grad_norm": 12.515396651050057, "language_loss": 0.67821741, "learning_rate": 2.5893685635572487e-06, "loss": 0.70013469, "num_input_tokens_seen": 75774675, "step": 3515, "time_per_iteration": 2.7446367740631104 }, { "auxiliary_loss_clip": 0.01150685, "auxiliary_loss_mlp": 0.01024393, "balance_loss_clip": 1.05498219, "balance_loss_mlp": 1.01700771, "epoch": 0.4227740034870438, "flos": 16253349753600.0, "grad_norm": 4.203763302169974, "language_loss": 0.68931615, "learning_rate": 2.5886241377181483e-06, "loss": 0.71106696, "num_input_tokens_seen": 75793545, "step": 3516, "time_per_iteration": 2.6739299297332764 }, { "auxiliary_loss_clip": 0.0117055, "auxiliary_loss_mlp": 0.01027831, "balance_loss_clip": 1.05584025, "balance_loss_mlp": 1.01939702, "epoch": 0.4228942463776829, "flos": 25295691623040.0, "grad_norm": 2.675807054964017, "language_loss": 0.8156426, "learning_rate": 2.587879622585234e-06, "loss": 0.8376264, "num_input_tokens_seen": 75812145, "step": 3517, "time_per_iteration": 2.709069013595581 }, { "auxiliary_loss_clip": 0.01170136, "auxiliary_loss_mlp": 0.01029355, "balance_loss_clip": 1.0595032, "balance_loss_mlp": 1.02130795, "epoch": 0.423014489268322, "flos": 26395779507840.0, "grad_norm": 5.16831099549837, "language_loss": 0.76216263, "learning_rate": 2.5871350182714486e-06, "loss": 0.78415751, "num_input_tokens_seen": 75833025, "step": 3518, "time_per_iteration": 2.6927390098571777 }, { "auxiliary_loss_clip": 0.01182274, "auxiliary_loss_mlp": 0.01028259, "balance_loss_clip": 1.05557346, "balance_loss_mlp": 1.0208199, "epoch": 0.4231347321589611, "flos": 17274002711040.0, "grad_norm": 2.147512233617259, "language_loss": 0.80538833, "learning_rate": 2.586390324889748e-06, "loss": 0.82749367, "num_input_tokens_seen": 75848925, "step": 3519, "time_per_iteration": 2.6429574489593506 }, { "auxiliary_loss_clip": 0.011692, "auxiliary_loss_mlp": 0.01028598, "balance_loss_clip": 1.05758357, "balance_loss_mlp": 1.02145708, "epoch": 0.4232549750496002, "flos": 22999635475200.0, "grad_norm": 1.993991510619036, "language_loss": 0.67700994, "learning_rate": 2.5856455425531003e-06, "loss": 0.69898796, "num_input_tokens_seen": 75870400, "step": 3520, "time_per_iteration": 2.6789355278015137 }, { "auxiliary_loss_clip": 0.01166469, "auxiliary_loss_mlp": 0.01028255, "balance_loss_clip": 1.05586958, "balance_loss_mlp": 1.02101362, "epoch": 0.4233752179402393, "flos": 21248343970560.0, "grad_norm": 2.034242161119198, "language_loss": 0.80985272, "learning_rate": 2.5849006713744902e-06, "loss": 0.83179992, "num_input_tokens_seen": 75889195, "step": 3521, "time_per_iteration": 2.638953924179077 }, { "auxiliary_loss_clip": 0.01150215, "auxiliary_loss_mlp": 0.01025139, "balance_loss_clip": 1.05359411, "balance_loss_mlp": 1.01755118, "epoch": 0.42349546083087836, "flos": 20704297599360.0, "grad_norm": 2.920535749688769, "language_loss": 0.73384237, "learning_rate": 2.5841557114669135e-06, "loss": 0.75559586, "num_input_tokens_seen": 75906055, "step": 3522, "time_per_iteration": 4.544079303741455 }, { "auxiliary_loss_clip": 0.01187488, "auxiliary_loss_mlp": 0.01027113, "balance_loss_clip": 1.05610466, "balance_loss_mlp": 1.0192095, "epoch": 0.42361570372151747, "flos": 18585065128320.0, "grad_norm": 3.0143189980605203, "language_loss": 0.67232591, "learning_rate": 2.58341066294338e-06, "loss": 0.69447196, "num_input_tokens_seen": 75922720, "step": 3523, "time_per_iteration": 3.5252180099487305 }, { "auxiliary_loss_clip": 0.01121431, "auxiliary_loss_mlp": 0.00710723, "balance_loss_clip": 1.04855251, "balance_loss_mlp": 1.00074244, "epoch": 0.4237359466121566, "flos": 20959478795520.0, "grad_norm": 2.084426684123736, "language_loss": 0.84944177, "learning_rate": 2.5826655259169124e-06, "loss": 0.86776328, "num_input_tokens_seen": 75941375, "step": 3524, "time_per_iteration": 3.6045870780944824 }, { "auxiliary_loss_clip": 0.01187525, "auxiliary_loss_mlp": 0.01027543, "balance_loss_clip": 1.06020343, "balance_loss_mlp": 1.02011585, "epoch": 0.42385618950279563, "flos": 18038181582720.0, "grad_norm": 2.4205949305827663, "language_loss": 0.90537596, "learning_rate": 2.5819203005005475e-06, "loss": 0.92752659, "num_input_tokens_seen": 75958710, "step": 3525, "time_per_iteration": 2.5906624794006348 }, { "auxiliary_loss_clip": 0.01144909, "auxiliary_loss_mlp": 0.01028933, "balance_loss_clip": 1.05223894, "balance_loss_mlp": 1.02170038, "epoch": 0.42397643239343474, "flos": 23769129559680.0, "grad_norm": 1.5657406766393447, "language_loss": 0.78698653, "learning_rate": 2.581174986807336e-06, "loss": 0.808725, "num_input_tokens_seen": 75978945, "step": 3526, "time_per_iteration": 2.696953058242798 }, { "auxiliary_loss_clip": 0.01160204, "auxiliary_loss_mlp": 0.0071136, "balance_loss_clip": 1.05370569, "balance_loss_mlp": 1.00077891, "epoch": 0.42409667528407385, "flos": 16545088016640.0, "grad_norm": 2.7505717222096218, "language_loss": 0.91203678, "learning_rate": 2.580429584950341e-06, "loss": 0.9307524, "num_input_tokens_seen": 75994695, "step": 3527, "time_per_iteration": 2.7505805492401123 }, { "auxiliary_loss_clip": 0.01142048, "auxiliary_loss_mlp": 0.01026928, "balance_loss_clip": 1.05233693, "balance_loss_mlp": 1.01886392, "epoch": 0.4242169181747129, "flos": 16034186920320.0, "grad_norm": 2.0980760623671295, "language_loss": 0.66455859, "learning_rate": 2.5796840950426397e-06, "loss": 0.68624836, "num_input_tokens_seen": 76011780, "step": 3528, "time_per_iteration": 2.635525703430176 }, { "auxiliary_loss_clip": 0.01160964, "auxiliary_loss_mlp": 0.0102964, "balance_loss_clip": 1.05330682, "balance_loss_mlp": 1.02256465, "epoch": 0.424337161065352, "flos": 20084012611200.0, "grad_norm": 1.6794361709212389, "language_loss": 0.65924895, "learning_rate": 2.578938517197322e-06, "loss": 0.68115503, "num_input_tokens_seen": 76029875, "step": 3529, "time_per_iteration": 2.6842544078826904 }, { "auxiliary_loss_clip": 0.01144814, "auxiliary_loss_mlp": 0.0102913, "balance_loss_clip": 1.05205035, "balance_loss_mlp": 1.02170944, "epoch": 0.4244574039559911, "flos": 23878369797120.0, "grad_norm": 2.3154370330741703, "language_loss": 0.62670696, "learning_rate": 2.5781928515274916e-06, "loss": 0.64844644, "num_input_tokens_seen": 76048595, "step": 3530, "time_per_iteration": 2.693995714187622 }, { "auxiliary_loss_clip": 0.01172825, "auxiliary_loss_mlp": 0.01025734, "balance_loss_clip": 1.05691683, "balance_loss_mlp": 1.01781249, "epoch": 0.4245776468466302, "flos": 17565920542080.0, "grad_norm": 2.7484562626554343, "language_loss": 0.67736769, "learning_rate": 2.577447098146265e-06, "loss": 0.69935322, "num_input_tokens_seen": 76065770, "step": 3531, "time_per_iteration": 2.6242640018463135 }, { "auxiliary_loss_clip": 0.01136075, "auxiliary_loss_mlp": 0.01027946, "balance_loss_clip": 1.05037665, "balance_loss_mlp": 1.02054942, "epoch": 0.4246978897372693, "flos": 27776256958080.0, "grad_norm": 1.9293027048097464, "language_loss": 0.79275054, "learning_rate": 2.5767012571667724e-06, "loss": 0.81439078, "num_input_tokens_seen": 76085250, "step": 3532, "time_per_iteration": 2.7463762760162354 }, { "auxiliary_loss_clip": 0.01168996, "auxiliary_loss_mlp": 0.01025866, "balance_loss_clip": 1.05319941, "balance_loss_mlp": 1.01684153, "epoch": 0.42481813262790835, "flos": 15596615439360.0, "grad_norm": 2.141690659990256, "language_loss": 0.69130063, "learning_rate": 2.5759553287021587e-06, "loss": 0.71324921, "num_input_tokens_seen": 76103580, "step": 3533, "time_per_iteration": 2.6101434230804443 }, { "auxiliary_loss_clip": 0.01149984, "auxiliary_loss_mlp": 0.01027263, "balance_loss_clip": 1.05288899, "balance_loss_mlp": 1.0195446, "epoch": 0.42493837551854746, "flos": 23951088881280.0, "grad_norm": 1.8842517244922612, "language_loss": 0.77704352, "learning_rate": 2.5752093128655786e-06, "loss": 0.79881597, "num_input_tokens_seen": 76121825, "step": 3534, "time_per_iteration": 2.66762638092041 }, { "auxiliary_loss_clip": 0.01145358, "auxiliary_loss_mlp": 0.01026246, "balance_loss_clip": 1.05202985, "balance_loss_mlp": 1.01852727, "epoch": 0.4250586184091866, "flos": 20813466009600.0, "grad_norm": 2.080531405596557, "language_loss": 0.73698193, "learning_rate": 2.574463209770204e-06, "loss": 0.75869799, "num_input_tokens_seen": 76141140, "step": 3535, "time_per_iteration": 2.679311513900757 }, { "auxiliary_loss_clip": 0.01132729, "auxiliary_loss_mlp": 0.01025877, "balance_loss_clip": 1.04836965, "balance_loss_mlp": 1.01763403, "epoch": 0.42517886129982563, "flos": 30371018607360.0, "grad_norm": 1.9791141460308521, "language_loss": 0.79857028, "learning_rate": 2.5737170195292165e-06, "loss": 0.82015634, "num_input_tokens_seen": 76164475, "step": 3536, "time_per_iteration": 2.747129440307617 }, { "auxiliary_loss_clip": 0.01136111, "auxiliary_loss_mlp": 0.01024574, "balance_loss_clip": 1.05059731, "balance_loss_mlp": 1.01698685, "epoch": 0.42529910419046474, "flos": 20080636732800.0, "grad_norm": 2.0885789881374444, "language_loss": 0.78296256, "learning_rate": 2.572970742255814e-06, "loss": 0.80456942, "num_input_tokens_seen": 76182965, "step": 3537, "time_per_iteration": 2.744114875793457 }, { "auxiliary_loss_clip": 0.0116657, "auxiliary_loss_mlp": 0.01026266, "balance_loss_clip": 1.0559144, "balance_loss_mlp": 1.01944411, "epoch": 0.42541934708110385, "flos": 22632448694400.0, "grad_norm": 1.826030060616974, "language_loss": 0.81482208, "learning_rate": 2.5722243780632046e-06, "loss": 0.83675045, "num_input_tokens_seen": 76201230, "step": 3538, "time_per_iteration": 2.662548780441284 }, { "auxiliary_loss_clip": 0.01068462, "auxiliary_loss_mlp": 0.01007851, "balance_loss_clip": 1.04741108, "balance_loss_mlp": 1.00624132, "epoch": 0.4255395899717429, "flos": 66200676186240.0, "grad_norm": 0.7508248353294679, "language_loss": 0.6047833, "learning_rate": 2.5714779270646125e-06, "loss": 0.62554634, "num_input_tokens_seen": 76262000, "step": 3539, "time_per_iteration": 3.223353147506714 }, { "auxiliary_loss_clip": 0.01154857, "auxiliary_loss_mlp": 0.00711895, "balance_loss_clip": 1.05615997, "balance_loss_mlp": 1.00079477, "epoch": 0.425659832862382, "flos": 17931814433280.0, "grad_norm": 14.301371986479054, "language_loss": 0.7772823, "learning_rate": 2.5707313893732735e-06, "loss": 0.79594988, "num_input_tokens_seen": 76280540, "step": 3540, "time_per_iteration": 2.674060583114624 }, { "auxiliary_loss_clip": 0.01083185, "auxiliary_loss_mlp": 0.01027248, "balance_loss_clip": 1.04311872, "balance_loss_mlp": 1.01939225, "epoch": 0.4257800757530211, "flos": 24022550989440.0, "grad_norm": 2.405004669034587, "language_loss": 0.77283132, "learning_rate": 2.5699847651024364e-06, "loss": 0.79393566, "num_input_tokens_seen": 76301180, "step": 3541, "time_per_iteration": 2.823683500289917 }, { "auxiliary_loss_clip": 0.01165014, "auxiliary_loss_mlp": 0.0102374, "balance_loss_clip": 1.05565083, "balance_loss_mlp": 1.01674819, "epoch": 0.4259003186436602, "flos": 23696015425920.0, "grad_norm": 4.933641805907061, "language_loss": 0.769786, "learning_rate": 2.5692380543653627e-06, "loss": 0.79167354, "num_input_tokens_seen": 76319335, "step": 3542, "time_per_iteration": 2.688343048095703 }, { "auxiliary_loss_clip": 0.01175941, "auxiliary_loss_mlp": 0.00710922, "balance_loss_clip": 1.05866647, "balance_loss_mlp": 1.00064969, "epoch": 0.4260205615342993, "flos": 15259772672640.0, "grad_norm": 1.870427068870709, "language_loss": 0.69854403, "learning_rate": 2.5684912572753293e-06, "loss": 0.71741259, "num_input_tokens_seen": 76335010, "step": 3543, "time_per_iteration": 2.5876879692077637 }, { "auxiliary_loss_clip": 0.01184496, "auxiliary_loss_mlp": 0.01027053, "balance_loss_clip": 1.05893087, "balance_loss_mlp": 1.02010894, "epoch": 0.4261408044249384, "flos": 30665306736000.0, "grad_norm": 1.823871702169719, "language_loss": 0.84095585, "learning_rate": 2.5677443739456245e-06, "loss": 0.86307132, "num_input_tokens_seen": 76356670, "step": 3544, "time_per_iteration": 2.650726556777954 }, { "auxiliary_loss_clip": 0.01155144, "auxiliary_loss_mlp": 0.01023644, "balance_loss_clip": 1.05382884, "balance_loss_mlp": 1.01575828, "epoch": 0.42626104731557746, "flos": 23257905240960.0, "grad_norm": 2.3940425726487824, "language_loss": 0.79436326, "learning_rate": 2.5669974044895495e-06, "loss": 0.81615114, "num_input_tokens_seen": 76373065, "step": 3545, "time_per_iteration": 2.6935949325561523 }, { "auxiliary_loss_clip": 0.01141267, "auxiliary_loss_mlp": 0.01024776, "balance_loss_clip": 1.05100429, "balance_loss_mlp": 1.01747131, "epoch": 0.42638129020621657, "flos": 25884770670720.0, "grad_norm": 2.973088370354957, "language_loss": 0.79692757, "learning_rate": 2.5662503490204187e-06, "loss": 0.81858808, "num_input_tokens_seen": 76393230, "step": 3546, "time_per_iteration": 2.768605947494507 }, { "auxiliary_loss_clip": 0.01151706, "auxiliary_loss_mlp": 0.01027084, "balance_loss_clip": 1.05312908, "balance_loss_mlp": 1.01993132, "epoch": 0.4265015330968556, "flos": 26502362138880.0, "grad_norm": 1.9807511507442712, "language_loss": 0.76076943, "learning_rate": 2.5655032076515603e-06, "loss": 0.78255737, "num_input_tokens_seen": 76412555, "step": 3547, "time_per_iteration": 2.7159600257873535 }, { "auxiliary_loss_clip": 0.0115364, "auxiliary_loss_mlp": 0.01025545, "balance_loss_clip": 1.05546141, "balance_loss_mlp": 1.01762915, "epoch": 0.42662177598749473, "flos": 24389522288640.0, "grad_norm": 2.1973212119973202, "language_loss": 0.82348788, "learning_rate": 2.5647559804963155e-06, "loss": 0.84527969, "num_input_tokens_seen": 76432485, "step": 3548, "time_per_iteration": 3.61559796333313 }, { "auxiliary_loss_clip": 0.01127617, "auxiliary_loss_mlp": 0.01029678, "balance_loss_clip": 1.05247629, "balance_loss_mlp": 1.02178621, "epoch": 0.42674201887813384, "flos": 23148629089920.0, "grad_norm": 1.9974381274645612, "language_loss": 0.78933525, "learning_rate": 2.5640086676680364e-06, "loss": 0.8109082, "num_input_tokens_seen": 76453980, "step": 3549, "time_per_iteration": 3.7340211868286133 }, { "auxiliary_loss_clip": 0.01168318, "auxiliary_loss_mlp": 0.01029135, "balance_loss_clip": 1.05537307, "balance_loss_mlp": 1.02163076, "epoch": 0.4268622617687729, "flos": 21689614552320.0, "grad_norm": 2.373710407702981, "language_loss": 0.81002152, "learning_rate": 2.5632612692800923e-06, "loss": 0.83199608, "num_input_tokens_seen": 76473045, "step": 3550, "time_per_iteration": 3.6129703521728516 }, { "auxiliary_loss_clip": 0.01138785, "auxiliary_loss_mlp": 0.01028907, "balance_loss_clip": 1.05370712, "balance_loss_mlp": 1.02059186, "epoch": 0.426982504659412, "flos": 23440151871360.0, "grad_norm": 1.9857042878058675, "language_loss": 0.75464195, "learning_rate": 2.5625137854458603e-06, "loss": 0.77631885, "num_input_tokens_seen": 76492060, "step": 3551, "time_per_iteration": 2.73359751701355 }, { "auxiliary_loss_clip": 0.01153875, "auxiliary_loss_mlp": 0.01020841, "balance_loss_clip": 1.0525105, "balance_loss_mlp": 1.01409066, "epoch": 0.4271027475500511, "flos": 18916556768640.0, "grad_norm": 4.275597166400312, "language_loss": 0.80283797, "learning_rate": 2.561766216278735e-06, "loss": 0.82458508, "num_input_tokens_seen": 76509655, "step": 3552, "time_per_iteration": 2.698004961013794 }, { "auxiliary_loss_clip": 0.01118599, "auxiliary_loss_mlp": 0.01024092, "balance_loss_clip": 1.0500046, "balance_loss_mlp": 1.01624179, "epoch": 0.4272229904406902, "flos": 26870554500480.0, "grad_norm": 1.9221330067912261, "language_loss": 0.81271124, "learning_rate": 2.561018561892121e-06, "loss": 0.83413815, "num_input_tokens_seen": 76528795, "step": 3553, "time_per_iteration": 2.7316315174102783 }, { "auxiliary_loss_clip": 0.0114771, "auxiliary_loss_mlp": 0.01024889, "balance_loss_clip": 1.05060244, "balance_loss_mlp": 1.01785541, "epoch": 0.4273432333313293, "flos": 23951376190080.0, "grad_norm": 3.251332175881807, "language_loss": 0.76850367, "learning_rate": 2.5602708223994363e-06, "loss": 0.79022968, "num_input_tokens_seen": 76550660, "step": 3554, "time_per_iteration": 2.701843500137329 }, { "auxiliary_loss_clip": 0.01135587, "auxiliary_loss_mlp": 0.01025247, "balance_loss_clip": 1.04769766, "balance_loss_mlp": 1.0180738, "epoch": 0.4274634762219684, "flos": 29570354496000.0, "grad_norm": 2.368632712959917, "language_loss": 0.67783225, "learning_rate": 2.559522997914115e-06, "loss": 0.6994406, "num_input_tokens_seen": 76570240, "step": 3555, "time_per_iteration": 2.748548984527588 }, { "auxiliary_loss_clip": 0.01184567, "auxiliary_loss_mlp": 0.0102427, "balance_loss_clip": 1.05935001, "balance_loss_mlp": 1.01766574, "epoch": 0.42758371911260745, "flos": 21434146047360.0, "grad_norm": 2.441398697480029, "language_loss": 0.84763378, "learning_rate": 2.558775088549599e-06, "loss": 0.86972213, "num_input_tokens_seen": 76589820, "step": 3556, "time_per_iteration": 2.6419219970703125 }, { "auxiliary_loss_clip": 0.0117629, "auxiliary_loss_mlp": 0.01022045, "balance_loss_clip": 1.0572927, "balance_loss_mlp": 1.01412344, "epoch": 0.42770396200324656, "flos": 14752822072320.0, "grad_norm": 3.0808989266478104, "language_loss": 0.66272265, "learning_rate": 2.5580270944193467e-06, "loss": 0.68470597, "num_input_tokens_seen": 76606640, "step": 3557, "time_per_iteration": 2.57704758644104 }, { "auxiliary_loss_clip": 0.01118667, "auxiliary_loss_mlp": 0.01002667, "balance_loss_clip": 1.05147362, "balance_loss_mlp": 1.00127244, "epoch": 0.4278242048938857, "flos": 70654712601600.0, "grad_norm": 0.7382080704481814, "language_loss": 0.55492485, "learning_rate": 2.557279015636827e-06, "loss": 0.5761382, "num_input_tokens_seen": 76667050, "step": 3558, "time_per_iteration": 3.175847053527832 }, { "auxiliary_loss_clip": 0.01088328, "auxiliary_loss_mlp": 0.01001633, "balance_loss_clip": 1.03578138, "balance_loss_mlp": 1.00017858, "epoch": 0.42794444778452473, "flos": 69366165033600.0, "grad_norm": 0.763920906378452, "language_loss": 0.6120612, "learning_rate": 2.5565308523155245e-06, "loss": 0.6329608, "num_input_tokens_seen": 76726650, "step": 3559, "time_per_iteration": 3.15075421333313 }, { "auxiliary_loss_clip": 0.0111537, "auxiliary_loss_mlp": 0.01029108, "balance_loss_clip": 1.05172431, "balance_loss_mlp": 1.02203882, "epoch": 0.42806469067516384, "flos": 18215328481920.0, "grad_norm": 2.727626274695074, "language_loss": 0.81703293, "learning_rate": 2.5557826045689336e-06, "loss": 0.83847773, "num_input_tokens_seen": 76742890, "step": 3560, "time_per_iteration": 2.8061342239379883 }, { "auxiliary_loss_clip": 0.01084115, "auxiliary_loss_mlp": 0.01005334, "balance_loss_clip": 1.05842066, "balance_loss_mlp": 1.00380802, "epoch": 0.4281849335658029, "flos": 54535814432640.0, "grad_norm": 0.8243925795843381, "language_loss": 0.58808273, "learning_rate": 2.5550342725105643e-06, "loss": 0.6089772, "num_input_tokens_seen": 76801055, "step": 3561, "time_per_iteration": 3.193441152572632 }, { "auxiliary_loss_clip": 0.01169949, "auxiliary_loss_mlp": 0.0102518, "balance_loss_clip": 1.05899858, "balance_loss_mlp": 1.01719856, "epoch": 0.428305176456442, "flos": 17274828723840.0, "grad_norm": 2.1051646174189775, "language_loss": 0.8067019, "learning_rate": 2.554285856253937e-06, "loss": 0.82865322, "num_input_tokens_seen": 76819890, "step": 3562, "time_per_iteration": 2.698765277862549 }, { "auxiliary_loss_clip": 0.01152392, "auxiliary_loss_mlp": 0.01027815, "balance_loss_clip": 1.05672503, "balance_loss_mlp": 1.02033758, "epoch": 0.4284254193470811, "flos": 26359509749760.0, "grad_norm": 1.7286101600244121, "language_loss": 0.7727114, "learning_rate": 2.5535373559125855e-06, "loss": 0.79451346, "num_input_tokens_seen": 76840255, "step": 3563, "time_per_iteration": 2.7021844387054443 }, { "auxiliary_loss_clip": 0.01089008, "auxiliary_loss_mlp": 0.01029548, "balance_loss_clip": 1.04597235, "balance_loss_mlp": 1.02139378, "epoch": 0.42854566223772017, "flos": 29714248379520.0, "grad_norm": 3.6784685239666803, "language_loss": 0.82028735, "learning_rate": 2.552788771600057e-06, "loss": 0.84147292, "num_input_tokens_seen": 76860565, "step": 3564, "time_per_iteration": 2.9384772777557373 }, { "auxiliary_loss_clip": 0.01139921, "auxiliary_loss_mlp": 0.01032358, "balance_loss_clip": 1.05481935, "balance_loss_mlp": 1.0250144, "epoch": 0.4286659051283593, "flos": 22018161277440.0, "grad_norm": 1.9124524433548724, "language_loss": 0.82376003, "learning_rate": 2.5520401034299118e-06, "loss": 0.84548283, "num_input_tokens_seen": 76878325, "step": 3565, "time_per_iteration": 2.81343150138855 }, { "auxiliary_loss_clip": 0.01170787, "auxiliary_loss_mlp": 0.01025841, "balance_loss_clip": 1.05640018, "balance_loss_mlp": 1.0179317, "epoch": 0.4287861480189984, "flos": 13334422838400.0, "grad_norm": 2.613011424086957, "language_loss": 0.87797153, "learning_rate": 2.551291351515722e-06, "loss": 0.89993781, "num_input_tokens_seen": 76895340, "step": 3566, "time_per_iteration": 2.674694299697876 }, { "auxiliary_loss_clip": 0.0113161, "auxiliary_loss_mlp": 0.00710963, "balance_loss_clip": 1.04768372, "balance_loss_mlp": 1.0006727, "epoch": 0.42890639090963745, "flos": 26651535321600.0, "grad_norm": 6.084063988276391, "language_loss": 0.85419995, "learning_rate": 2.5505425159710726e-06, "loss": 0.87262571, "num_input_tokens_seen": 76915150, "step": 3567, "time_per_iteration": 2.7198891639709473 }, { "auxiliary_loss_clip": 0.01156762, "auxiliary_loss_mlp": 0.00711467, "balance_loss_clip": 1.05216849, "balance_loss_mlp": 1.00077975, "epoch": 0.42902663380027656, "flos": 24055768091520.0, "grad_norm": 3.6382571444793417, "language_loss": 0.83296621, "learning_rate": 2.549793596909561e-06, "loss": 0.85164851, "num_input_tokens_seen": 76933770, "step": 3568, "time_per_iteration": 2.743722915649414 }, { "auxiliary_loss_clip": 0.01149235, "auxiliary_loss_mlp": 0.01024355, "balance_loss_clip": 1.05467749, "balance_loss_mlp": 1.01729214, "epoch": 0.42914687669091567, "flos": 15632561975040.0, "grad_norm": 2.528394615498961, "language_loss": 0.66555279, "learning_rate": 2.5490445944447976e-06, "loss": 0.68728876, "num_input_tokens_seen": 76952265, "step": 3569, "time_per_iteration": 2.603545665740967 }, { "auxiliary_loss_clip": 0.01167764, "auxiliary_loss_mlp": 0.01029657, "balance_loss_clip": 1.05512369, "balance_loss_mlp": 1.02223074, "epoch": 0.4292671195815547, "flos": 31467802440960.0, "grad_norm": 2.1813366766989093, "language_loss": 0.65112484, "learning_rate": 2.548295508690406e-06, "loss": 0.67309916, "num_input_tokens_seen": 76973560, "step": 3570, "time_per_iteration": 2.7543649673461914 }, { "auxiliary_loss_clip": 0.01171076, "auxiliary_loss_mlp": 0.01024888, "balance_loss_clip": 1.05495262, "balance_loss_mlp": 1.01782775, "epoch": 0.42938736247219383, "flos": 30257756046720.0, "grad_norm": 2.7955646367177334, "language_loss": 0.76740789, "learning_rate": 2.5475463397600217e-06, "loss": 0.78936756, "num_input_tokens_seen": 76993640, "step": 3571, "time_per_iteration": 2.672008752822876 }, { "auxiliary_loss_clip": 0.01185007, "auxiliary_loss_mlp": 0.01034134, "balance_loss_clip": 1.05653131, "balance_loss_mlp": 1.0259856, "epoch": 0.42950760536283294, "flos": 29349683291520.0, "grad_norm": 5.565536306719502, "language_loss": 0.7786516, "learning_rate": 2.546797087767293e-06, "loss": 0.800843, "num_input_tokens_seen": 77013765, "step": 3572, "time_per_iteration": 2.6874122619628906 }, { "auxiliary_loss_clip": 0.01119194, "auxiliary_loss_mlp": 0.01024984, "balance_loss_clip": 1.0514071, "balance_loss_mlp": 1.01725888, "epoch": 0.429627848253472, "flos": 26869943969280.0, "grad_norm": 1.6743621717556754, "language_loss": 0.86946607, "learning_rate": 2.546047752825881e-06, "loss": 0.89090776, "num_input_tokens_seen": 77034370, "step": 3573, "time_per_iteration": 3.8784923553466797 }, { "auxiliary_loss_clip": 0.01122853, "auxiliary_loss_mlp": 0.0102759, "balance_loss_clip": 1.04800212, "balance_loss_mlp": 1.02022326, "epoch": 0.4297480911441111, "flos": 13881270470400.0, "grad_norm": 2.132473832508164, "language_loss": 0.93505371, "learning_rate": 2.5452983350494595e-06, "loss": 0.95655817, "num_input_tokens_seen": 77049925, "step": 3574, "time_per_iteration": 4.583038091659546 }, { "auxiliary_loss_clip": 0.01170727, "auxiliary_loss_mlp": 0.00711119, "balance_loss_clip": 1.05662477, "balance_loss_mlp": 1.00079107, "epoch": 0.4298683340347502, "flos": 20741141975040.0, "grad_norm": 2.01760109830687, "language_loss": 0.64651811, "learning_rate": 2.544548834551713e-06, "loss": 0.66533661, "num_input_tokens_seen": 77068930, "step": 3575, "time_per_iteration": 2.6207714080810547 }, { "auxiliary_loss_clip": 0.01131574, "auxiliary_loss_mlp": 0.00711217, "balance_loss_clip": 1.05130816, "balance_loss_mlp": 1.00071454, "epoch": 0.4299885769253893, "flos": 20882126856960.0, "grad_norm": 2.2992458213788436, "language_loss": 0.94593835, "learning_rate": 2.5437992514463424e-06, "loss": 0.96436632, "num_input_tokens_seen": 77082255, "step": 3576, "time_per_iteration": 3.5724408626556396 }, { "auxiliary_loss_clip": 0.01163769, "auxiliary_loss_mlp": 0.01027885, "balance_loss_clip": 1.05169845, "balance_loss_mlp": 1.02068472, "epoch": 0.4301088198160284, "flos": 25484618183040.0, "grad_norm": 2.027504839545944, "language_loss": 0.88233346, "learning_rate": 2.5430495858470565e-06, "loss": 0.90425003, "num_input_tokens_seen": 77101725, "step": 3577, "time_per_iteration": 2.633152484893799 }, { "auxiliary_loss_clip": 0.01163673, "auxiliary_loss_mlp": 0.0102353, "balance_loss_clip": 1.0532968, "balance_loss_mlp": 1.01595449, "epoch": 0.43022906270666744, "flos": 18259427404800.0, "grad_norm": 2.337801005772396, "language_loss": 0.77366984, "learning_rate": 2.54229983786758e-06, "loss": 0.79554188, "num_input_tokens_seen": 77119670, "step": 3578, "time_per_iteration": 2.607337236404419 }, { "auxiliary_loss_clip": 0.01152267, "auxiliary_loss_mlp": 0.01032726, "balance_loss_clip": 1.05132091, "balance_loss_mlp": 1.02529299, "epoch": 0.43034930559730655, "flos": 23399536567680.0, "grad_norm": 2.3252889637899785, "language_loss": 0.8548665, "learning_rate": 2.541550007621651e-06, "loss": 0.87671649, "num_input_tokens_seen": 77138160, "step": 3579, "time_per_iteration": 2.6483092308044434 }, { "auxiliary_loss_clip": 0.01167596, "auxiliary_loss_mlp": 0.01029716, "balance_loss_clip": 1.05769491, "balance_loss_mlp": 1.0225873, "epoch": 0.43046954848794566, "flos": 28184382264960.0, "grad_norm": 1.9007908817730284, "language_loss": 0.8007834, "learning_rate": 2.5408000952230156e-06, "loss": 0.82275653, "num_input_tokens_seen": 77156950, "step": 3580, "time_per_iteration": 2.643311023712158 }, { "auxiliary_loss_clip": 0.01147654, "auxiliary_loss_mlp": 0.01029485, "balance_loss_clip": 1.05262375, "balance_loss_mlp": 1.0216527, "epoch": 0.4305897913785847, "flos": 28580476515840.0, "grad_norm": 7.059865247682878, "language_loss": 0.90679121, "learning_rate": 2.5400501007854357e-06, "loss": 0.92856264, "num_input_tokens_seen": 77176395, "step": 3581, "time_per_iteration": 2.7397003173828125 }, { "auxiliary_loss_clip": 0.0111718, "auxiliary_loss_mlp": 0.01026723, "balance_loss_clip": 1.04588544, "balance_loss_mlp": 1.0193615, "epoch": 0.43071003426922383, "flos": 20448721353600.0, "grad_norm": 2.3284979578630907, "language_loss": 0.75738657, "learning_rate": 2.539300024422685e-06, "loss": 0.77882564, "num_input_tokens_seen": 77194340, "step": 3582, "time_per_iteration": 2.713865280151367 }, { "auxiliary_loss_clip": 0.01058449, "auxiliary_loss_mlp": 0.01001017, "balance_loss_clip": 1.02735639, "balance_loss_mlp": 0.99966383, "epoch": 0.43083027715986294, "flos": 51997969883520.0, "grad_norm": 0.7895955950500557, "language_loss": 0.60902315, "learning_rate": 2.538549866248549e-06, "loss": 0.62961781, "num_input_tokens_seen": 77249320, "step": 3583, "time_per_iteration": 3.1127278804779053 }, { "auxiliary_loss_clip": 0.01170268, "auxiliary_loss_mlp": 0.01030251, "balance_loss_clip": 1.05464363, "balance_loss_mlp": 1.02230263, "epoch": 0.430950520050502, "flos": 16690885320960.0, "grad_norm": 5.010843425668659, "language_loss": 0.81189048, "learning_rate": 2.5377996263768274e-06, "loss": 0.83389568, "num_input_tokens_seen": 77267400, "step": 3584, "time_per_iteration": 2.624480724334717 }, { "auxiliary_loss_clip": 0.01163002, "auxiliary_loss_mlp": 0.01024849, "balance_loss_clip": 1.05252433, "balance_loss_mlp": 1.01733279, "epoch": 0.4310707629411411, "flos": 24608433726720.0, "grad_norm": 2.3221108390641816, "language_loss": 0.68738616, "learning_rate": 2.5370493049213293e-06, "loss": 0.70926464, "num_input_tokens_seen": 77287045, "step": 3585, "time_per_iteration": 2.685520887374878 }, { "auxiliary_loss_clip": 0.01062446, "auxiliary_loss_mlp": 0.01025635, "balance_loss_clip": 1.04103255, "balance_loss_mlp": 1.0176177, "epoch": 0.4311910058317802, "flos": 26432983019520.0, "grad_norm": 2.3117064495925437, "language_loss": 0.80111569, "learning_rate": 2.536298901995878e-06, "loss": 0.82199645, "num_input_tokens_seen": 77306255, "step": 3586, "time_per_iteration": 3.093594551086426 }, { "auxiliary_loss_clip": 0.01150297, "auxiliary_loss_mlp": 0.01027764, "balance_loss_clip": 1.05043221, "balance_loss_mlp": 1.01968765, "epoch": 0.43131124872241927, "flos": 25155891889920.0, "grad_norm": 2.3477787999379047, "language_loss": 0.80167568, "learning_rate": 2.535548417714311e-06, "loss": 0.82345629, "num_input_tokens_seen": 77325555, "step": 3587, "time_per_iteration": 2.948709011077881 }, { "auxiliary_loss_clip": 0.01171177, "auxiliary_loss_mlp": 0.0102827, "balance_loss_clip": 1.05191672, "balance_loss_mlp": 1.02057517, "epoch": 0.4314314916130584, "flos": 21614812479360.0, "grad_norm": 1.6507803829715255, "language_loss": 0.87339568, "learning_rate": 2.534797852190474e-06, "loss": 0.89539015, "num_input_tokens_seen": 77345735, "step": 3588, "time_per_iteration": 2.8240966796875 }, { "auxiliary_loss_clip": 0.01164494, "auxiliary_loss_mlp": 0.01027464, "balance_loss_clip": 1.05163324, "balance_loss_mlp": 1.01984918, "epoch": 0.4315517345036975, "flos": 19275016544640.0, "grad_norm": 2.079338343173776, "language_loss": 0.81388515, "learning_rate": 2.5340472055382283e-06, "loss": 0.8358047, "num_input_tokens_seen": 77361765, "step": 3589, "time_per_iteration": 2.6339173316955566 }, { "auxiliary_loss_clip": 0.01132742, "auxiliary_loss_mlp": 0.010243, "balance_loss_clip": 1.04774415, "balance_loss_mlp": 1.01666462, "epoch": 0.43167197739433655, "flos": 24273853516800.0, "grad_norm": 2.6200946042428703, "language_loss": 0.81204391, "learning_rate": 2.5332964778714468e-06, "loss": 0.83361429, "num_input_tokens_seen": 77378950, "step": 3590, "time_per_iteration": 2.866649627685547 }, { "auxiliary_loss_clip": 0.01134611, "auxiliary_loss_mlp": 0.01026508, "balance_loss_clip": 1.05206108, "balance_loss_mlp": 1.01891422, "epoch": 0.43179222028497566, "flos": 16867816738560.0, "grad_norm": 1.6318506091203417, "language_loss": 0.66389084, "learning_rate": 2.5325456693040123e-06, "loss": 0.68550205, "num_input_tokens_seen": 77396145, "step": 3591, "time_per_iteration": 2.7533130645751953 }, { "auxiliary_loss_clip": 0.01172279, "auxiliary_loss_mlp": 0.01024712, "balance_loss_clip": 1.0528971, "balance_loss_mlp": 1.01608098, "epoch": 0.43191246317561477, "flos": 17639214243840.0, "grad_norm": 2.469221080317052, "language_loss": 0.74447536, "learning_rate": 2.531794779949824e-06, "loss": 0.76644528, "num_input_tokens_seen": 77414045, "step": 3592, "time_per_iteration": 2.674365997314453 }, { "auxiliary_loss_clip": 0.01126582, "auxiliary_loss_mlp": 0.0102525, "balance_loss_clip": 1.04868889, "balance_loss_mlp": 1.01781762, "epoch": 0.4320327060662538, "flos": 23878800760320.0, "grad_norm": 7.195383061754711, "language_loss": 0.88026726, "learning_rate": 2.5310438099227903e-06, "loss": 0.90178555, "num_input_tokens_seen": 77431310, "step": 3593, "time_per_iteration": 2.7854466438293457 }, { "auxiliary_loss_clip": 0.01091189, "auxiliary_loss_mlp": 0.01002716, "balance_loss_clip": 1.03701615, "balance_loss_mlp": 1.00144672, "epoch": 0.43215294895689293, "flos": 66394917959040.0, "grad_norm": 0.8149954267055806, "language_loss": 0.53394461, "learning_rate": 2.530292759336833e-06, "loss": 0.55488366, "num_input_tokens_seen": 77492045, "step": 3594, "time_per_iteration": 3.2878451347351074 }, { "auxiliary_loss_clip": 0.01149716, "auxiliary_loss_mlp": 0.01023445, "balance_loss_clip": 1.05286026, "balance_loss_mlp": 1.01566672, "epoch": 0.432273191847532, "flos": 20594267262720.0, "grad_norm": 3.0383987395080507, "language_loss": 0.69665676, "learning_rate": 2.5295416283058855e-06, "loss": 0.71838838, "num_input_tokens_seen": 77510910, "step": 3595, "time_per_iteration": 2.704149007797241 }, { "auxiliary_loss_clip": 0.01146332, "auxiliary_loss_mlp": 0.00711153, "balance_loss_clip": 1.0503788, "balance_loss_mlp": 1.00064027, "epoch": 0.4323934347381711, "flos": 19282127437440.0, "grad_norm": 2.0366119979425026, "language_loss": 0.65856993, "learning_rate": 2.5287904169438943e-06, "loss": 0.67714477, "num_input_tokens_seen": 77530115, "step": 3596, "time_per_iteration": 2.7689239978790283 }, { "auxiliary_loss_clip": 0.01095108, "auxiliary_loss_mlp": 0.01027933, "balance_loss_clip": 1.04727459, "balance_loss_mlp": 1.01988029, "epoch": 0.4325136776288102, "flos": 21726315273600.0, "grad_norm": 3.235579729107891, "language_loss": 0.63864505, "learning_rate": 2.528039125364817e-06, "loss": 0.65987545, "num_input_tokens_seen": 77548920, "step": 3597, "time_per_iteration": 2.873253107070923 }, { "auxiliary_loss_clip": 0.01139056, "auxiliary_loss_mlp": 0.01023911, "balance_loss_clip": 1.05123127, "balance_loss_mlp": 1.01634121, "epoch": 0.43263392051944927, "flos": 22340746344960.0, "grad_norm": 2.2957105170309697, "language_loss": 0.75856006, "learning_rate": 2.5272877536826246e-06, "loss": 0.78018975, "num_input_tokens_seen": 77567715, "step": 3598, "time_per_iteration": 2.73635196685791 }, { "auxiliary_loss_clip": 0.0111999, "auxiliary_loss_mlp": 0.01032433, "balance_loss_clip": 1.04543114, "balance_loss_mlp": 1.02446389, "epoch": 0.4327541634100884, "flos": 29168406328320.0, "grad_norm": 3.603306139772889, "language_loss": 0.71421421, "learning_rate": 2.5265363020112986e-06, "loss": 0.7357384, "num_input_tokens_seen": 77588035, "step": 3599, "time_per_iteration": 3.765897274017334 }, { "auxiliary_loss_clip": 0.01165913, "auxiliary_loss_mlp": 0.01026765, "balance_loss_clip": 1.05399776, "balance_loss_mlp": 1.0190165, "epoch": 0.4328744063007275, "flos": 26067448264320.0, "grad_norm": 1.8817844256943246, "language_loss": 0.8394978, "learning_rate": 2.5257847704648344e-06, "loss": 0.86142457, "num_input_tokens_seen": 77609265, "step": 3600, "time_per_iteration": 4.539721727371216 }, { "auxiliary_loss_clip": 0.01182159, "auxiliary_loss_mlp": 0.01033857, "balance_loss_clip": 1.05534208, "balance_loss_mlp": 1.02595305, "epoch": 0.43299464919136654, "flos": 16581357774720.0, "grad_norm": 2.062681037113589, "language_loss": 0.75434363, "learning_rate": 2.525033159157239e-06, "loss": 0.7765038, "num_input_tokens_seen": 77625580, "step": 3601, "time_per_iteration": 3.537393569946289 }, { "auxiliary_loss_clip": 0.01162857, "auxiliary_loss_mlp": 0.01031822, "balance_loss_clip": 1.05209231, "balance_loss_mlp": 1.024055, "epoch": 0.43311489208200565, "flos": 16107265140480.0, "grad_norm": 2.081088257404711, "language_loss": 0.77148628, "learning_rate": 2.52428146820253e-06, "loss": 0.79343307, "num_input_tokens_seen": 77643835, "step": 3602, "time_per_iteration": 2.6677961349487305 }, { "auxiliary_loss_clip": 0.01139744, "auxiliary_loss_mlp": 0.01032009, "balance_loss_clip": 1.05368114, "balance_loss_mlp": 1.02445745, "epoch": 0.43323513497264476, "flos": 22930220442240.0, "grad_norm": 1.8409667234421987, "language_loss": 0.81568122, "learning_rate": 2.52352969771474e-06, "loss": 0.83739871, "num_input_tokens_seen": 77663060, "step": 3603, "time_per_iteration": 2.850914716720581 }, { "auxiliary_loss_clip": 0.01150684, "auxiliary_loss_mlp": 0.01026599, "balance_loss_clip": 1.05058694, "balance_loss_mlp": 1.01935363, "epoch": 0.4333553778632838, "flos": 25299031587840.0, "grad_norm": 2.290858033447635, "language_loss": 0.88384843, "learning_rate": 2.5227778478079106e-06, "loss": 0.90562129, "num_input_tokens_seen": 77682470, "step": 3604, "time_per_iteration": 2.6598706245422363 }, { "auxiliary_loss_clip": 0.01162312, "auxiliary_loss_mlp": 0.01028196, "balance_loss_clip": 1.05193508, "balance_loss_mlp": 1.02047133, "epoch": 0.43347562075392293, "flos": 19387165783680.0, "grad_norm": 1.948201041343656, "language_loss": 0.76860631, "learning_rate": 2.522025918596098e-06, "loss": 0.79051137, "num_input_tokens_seen": 77700770, "step": 3605, "time_per_iteration": 2.663785934448242 }, { "auxiliary_loss_clip": 0.01168439, "auxiliary_loss_mlp": 0.01024065, "balance_loss_clip": 1.05416131, "balance_loss_mlp": 1.01710868, "epoch": 0.43359586364456204, "flos": 26325969425280.0, "grad_norm": 1.9634292808997564, "language_loss": 0.6526531, "learning_rate": 2.521273910193368e-06, "loss": 0.67457807, "num_input_tokens_seen": 77723950, "step": 3606, "time_per_iteration": 2.707498788833618 }, { "auxiliary_loss_clip": 0.01173507, "auxiliary_loss_mlp": 0.01029615, "balance_loss_clip": 1.05493104, "balance_loss_mlp": 1.02194726, "epoch": 0.4337161065352011, "flos": 15989261984640.0, "grad_norm": 3.1511179876286644, "language_loss": 0.87231815, "learning_rate": 2.5205218227138006e-06, "loss": 0.89434934, "num_input_tokens_seen": 77736905, "step": 3607, "time_per_iteration": 2.635547399520874 }, { "auxiliary_loss_clip": 0.01184478, "auxiliary_loss_mlp": 0.01027315, "balance_loss_clip": 1.05541134, "balance_loss_mlp": 1.01973891, "epoch": 0.4338363494258402, "flos": 20224710184320.0, "grad_norm": 2.320288163310381, "language_loss": 0.79001606, "learning_rate": 2.519769656271486e-06, "loss": 0.81213403, "num_input_tokens_seen": 77754325, "step": 3608, "time_per_iteration": 2.5672173500061035 }, { "auxiliary_loss_clip": 0.0110868, "auxiliary_loss_mlp": 0.01028671, "balance_loss_clip": 1.04746544, "balance_loss_mlp": 1.0208869, "epoch": 0.43395659231647926, "flos": 20083904870400.0, "grad_norm": 2.5452705349801596, "language_loss": 0.6731863, "learning_rate": 2.5190174109805285e-06, "loss": 0.69455981, "num_input_tokens_seen": 77774150, "step": 3609, "time_per_iteration": 2.703411817550659 }, { "auxiliary_loss_clip": 0.01142683, "auxiliary_loss_mlp": 0.01031019, "balance_loss_clip": 1.05002165, "balance_loss_mlp": 1.02301383, "epoch": 0.43407683520711837, "flos": 19901801894400.0, "grad_norm": 2.37929811212804, "language_loss": 0.64115286, "learning_rate": 2.518265086955042e-06, "loss": 0.6628899, "num_input_tokens_seen": 77791870, "step": 3610, "time_per_iteration": 2.5973637104034424 }, { "auxiliary_loss_clip": 0.01181232, "auxiliary_loss_mlp": 0.0102388, "balance_loss_clip": 1.05367231, "balance_loss_mlp": 1.016096, "epoch": 0.4341970780977575, "flos": 23108732058240.0, "grad_norm": 1.813432896598142, "language_loss": 0.83689839, "learning_rate": 2.5175126843091534e-06, "loss": 0.85894954, "num_input_tokens_seen": 77811240, "step": 3611, "time_per_iteration": 2.588491916656494 }, { "auxiliary_loss_clip": 0.01153491, "auxiliary_loss_mlp": 0.01026223, "balance_loss_clip": 1.05337763, "balance_loss_mlp": 1.01887357, "epoch": 0.43431732098839654, "flos": 37408288406400.0, "grad_norm": 2.252034970798004, "language_loss": 0.75263333, "learning_rate": 2.5167602031570034e-06, "loss": 0.77443045, "num_input_tokens_seen": 77831425, "step": 3612, "time_per_iteration": 2.72175931930542 }, { "auxiliary_loss_clip": 0.011833, "auxiliary_loss_mlp": 0.01024233, "balance_loss_clip": 1.05618012, "balance_loss_mlp": 1.01662731, "epoch": 0.43443756387903565, "flos": 31868206323840.0, "grad_norm": 1.7856642101335432, "language_loss": 0.73417151, "learning_rate": 2.51600764361274e-06, "loss": 0.75624681, "num_input_tokens_seen": 77852950, "step": 3613, "time_per_iteration": 2.6678526401519775 }, { "auxiliary_loss_clip": 0.01185592, "auxiliary_loss_mlp": 0.01031438, "balance_loss_clip": 1.05825329, "balance_loss_mlp": 1.023368, "epoch": 0.43455780676967476, "flos": 23477139901440.0, "grad_norm": 2.7434557694815425, "language_loss": 0.78890669, "learning_rate": 2.5152550057905283e-06, "loss": 0.811077, "num_input_tokens_seen": 77872840, "step": 3614, "time_per_iteration": 2.589038133621216 }, { "auxiliary_loss_clip": 0.01170405, "auxiliary_loss_mlp": 0.00712005, "balance_loss_clip": 1.05701876, "balance_loss_mlp": 1.000664, "epoch": 0.4346780496603138, "flos": 24207060176640.0, "grad_norm": 3.7682098729385824, "language_loss": 0.77080458, "learning_rate": 2.5145022898045415e-06, "loss": 0.78962868, "num_input_tokens_seen": 77892025, "step": 3615, "time_per_iteration": 2.6616992950439453 }, { "auxiliary_loss_clip": 0.01154088, "auxiliary_loss_mlp": 0.01021958, "balance_loss_clip": 1.05169964, "balance_loss_mlp": 1.01388788, "epoch": 0.4347982925509529, "flos": 17092366611840.0, "grad_norm": 5.421107537645588, "language_loss": 0.89907074, "learning_rate": 2.5137494957689664e-06, "loss": 0.9208312, "num_input_tokens_seen": 77907635, "step": 3616, "time_per_iteration": 2.636263370513916 }, { "auxiliary_loss_clip": 0.01075764, "auxiliary_loss_mlp": 0.01000715, "balance_loss_clip": 1.03733873, "balance_loss_mlp": 0.99943966, "epoch": 0.43491853544159204, "flos": 60945544696320.0, "grad_norm": 0.7672119545842975, "language_loss": 0.57310331, "learning_rate": 2.5129966237980016e-06, "loss": 0.59386802, "num_input_tokens_seen": 77970630, "step": 3617, "time_per_iteration": 3.260298490524292 }, { "auxiliary_loss_clip": 0.01138698, "auxiliary_loss_mlp": 0.01026637, "balance_loss_clip": 1.05303407, "balance_loss_mlp": 1.01882291, "epoch": 0.4350387783322311, "flos": 21944652094080.0, "grad_norm": 3.878627516536017, "language_loss": 0.78017241, "learning_rate": 2.512243674005857e-06, "loss": 0.80182576, "num_input_tokens_seen": 77989995, "step": 3618, "time_per_iteration": 2.697664260864258 }, { "auxiliary_loss_clip": 0.01099997, "auxiliary_loss_mlp": 0.01031776, "balance_loss_clip": 1.04598284, "balance_loss_mlp": 1.02396536, "epoch": 0.4351590212228702, "flos": 25082705928960.0, "grad_norm": 1.8499373590462322, "language_loss": 0.86362493, "learning_rate": 2.5114906465067537e-06, "loss": 0.88494265, "num_input_tokens_seen": 78010980, "step": 3619, "time_per_iteration": 2.7817628383636475 }, { "auxiliary_loss_clip": 0.01168367, "auxiliary_loss_mlp": 0.0102787, "balance_loss_clip": 1.05340862, "balance_loss_mlp": 1.02089608, "epoch": 0.4352792641135093, "flos": 21506541909120.0, "grad_norm": 2.533620093858914, "language_loss": 0.75474572, "learning_rate": 2.5107375414149264e-06, "loss": 0.77670813, "num_input_tokens_seen": 78030225, "step": 3620, "time_per_iteration": 2.667172908782959 }, { "auxiliary_loss_clip": 0.01113971, "auxiliary_loss_mlp": 0.01027053, "balance_loss_clip": 1.04641318, "balance_loss_mlp": 1.01896501, "epoch": 0.43539950700414837, "flos": 16253457494400.0, "grad_norm": 4.481873514438856, "language_loss": 0.7141999, "learning_rate": 2.5099843588446197e-06, "loss": 0.73561019, "num_input_tokens_seen": 78048545, "step": 3621, "time_per_iteration": 2.6822457313537598 }, { "auxiliary_loss_clip": 0.01124655, "auxiliary_loss_mlp": 0.01026879, "balance_loss_clip": 1.05155396, "balance_loss_mlp": 1.01973784, "epoch": 0.4355197498947875, "flos": 16691819074560.0, "grad_norm": 2.284511310917252, "language_loss": 0.61676514, "learning_rate": 2.509231098910091e-06, "loss": 0.63828051, "num_input_tokens_seen": 78068415, "step": 3622, "time_per_iteration": 2.7382709980010986 }, { "auxiliary_loss_clip": 0.01154029, "auxiliary_loss_mlp": 0.0102548, "balance_loss_clip": 1.05890548, "balance_loss_mlp": 1.0183599, "epoch": 0.4356399927854266, "flos": 16362733645440.0, "grad_norm": 2.57699007056035, "language_loss": 0.7501291, "learning_rate": 2.508477761725611e-06, "loss": 0.77192426, "num_input_tokens_seen": 78086690, "step": 3623, "time_per_iteration": 2.609834909439087 }, { "auxiliary_loss_clip": 0.01169665, "auxiliary_loss_mlp": 0.01023952, "balance_loss_clip": 1.05334866, "balance_loss_mlp": 1.01625681, "epoch": 0.43576023567606564, "flos": 17202037812480.0, "grad_norm": 2.149832871056429, "language_loss": 0.80645072, "learning_rate": 2.507724347405458e-06, "loss": 0.8283869, "num_input_tokens_seen": 78104640, "step": 3624, "time_per_iteration": 2.625225067138672 }, { "auxiliary_loss_clip": 0.01114991, "auxiliary_loss_mlp": 0.01025099, "balance_loss_clip": 1.04695177, "balance_loss_mlp": 1.01782775, "epoch": 0.43588047856670475, "flos": 15917656222080.0, "grad_norm": 3.4206352073949273, "language_loss": 0.82054436, "learning_rate": 2.5069708560639243e-06, "loss": 0.84194529, "num_input_tokens_seen": 78122550, "step": 3625, "time_per_iteration": 2.648651361465454 }, { "auxiliary_loss_clip": 0.01137191, "auxiliary_loss_mlp": 0.01022493, "balance_loss_clip": 1.05171657, "balance_loss_mlp": 1.01487529, "epoch": 0.4360007214573438, "flos": 23659566099840.0, "grad_norm": 2.0389582693955624, "language_loss": 0.61472964, "learning_rate": 2.5062172878153158e-06, "loss": 0.63632649, "num_input_tokens_seen": 78141825, "step": 3626, "time_per_iteration": 5.487696170806885 }, { "auxiliary_loss_clip": 0.01109581, "auxiliary_loss_mlp": 0.01022702, "balance_loss_clip": 1.04821789, "balance_loss_mlp": 1.01544857, "epoch": 0.4361209643479829, "flos": 21978767036160.0, "grad_norm": 2.1999240801943003, "language_loss": 0.87617058, "learning_rate": 2.505463642773947e-06, "loss": 0.89749348, "num_input_tokens_seen": 78161790, "step": 3627, "time_per_iteration": 3.7064785957336426 }, { "auxiliary_loss_clip": 0.01136342, "auxiliary_loss_mlp": 0.00711633, "balance_loss_clip": 1.05047715, "balance_loss_mlp": 1.00052202, "epoch": 0.43624120723862203, "flos": 17420159151360.0, "grad_norm": 7.934138707021812, "language_loss": 0.74774545, "learning_rate": 2.504709921054146e-06, "loss": 0.76622522, "num_input_tokens_seen": 78178605, "step": 3628, "time_per_iteration": 2.70645809173584 }, { "auxiliary_loss_clip": 0.01132207, "auxiliary_loss_mlp": 0.01029233, "balance_loss_clip": 1.04929376, "balance_loss_mlp": 1.0214901, "epoch": 0.4363614501292611, "flos": 17895293280000.0, "grad_norm": 2.2647587842226646, "language_loss": 0.83978629, "learning_rate": 2.50395612277025e-06, "loss": 0.86140066, "num_input_tokens_seen": 78194460, "step": 3629, "time_per_iteration": 2.652071475982666 }, { "auxiliary_loss_clip": 0.01154902, "auxiliary_loss_mlp": 0.0102561, "balance_loss_clip": 1.05148029, "balance_loss_mlp": 1.01829696, "epoch": 0.4364816930199002, "flos": 20302888135680.0, "grad_norm": 2.155998035505842, "language_loss": 0.72872221, "learning_rate": 2.503202248036612e-06, "loss": 0.75052738, "num_input_tokens_seen": 78213315, "step": 3630, "time_per_iteration": 2.6771743297576904 }, { "auxiliary_loss_clip": 0.01183582, "auxiliary_loss_mlp": 0.01025901, "balance_loss_clip": 1.05654383, "balance_loss_mlp": 1.01770556, "epoch": 0.4366019359105393, "flos": 24061334699520.0, "grad_norm": 3.4166276683742756, "language_loss": 0.73395181, "learning_rate": 2.5024482969675927e-06, "loss": 0.75604665, "num_input_tokens_seen": 78233270, "step": 3631, "time_per_iteration": 2.6174721717834473 }, { "auxiliary_loss_clip": 0.01124387, "auxiliary_loss_mlp": 0.01019844, "balance_loss_clip": 1.0486747, "balance_loss_mlp": 1.01255095, "epoch": 0.43672217880117836, "flos": 21754109422080.0, "grad_norm": 3.7211942391647757, "language_loss": 0.84550482, "learning_rate": 2.501694269677566e-06, "loss": 0.86694711, "num_input_tokens_seen": 78251040, "step": 3632, "time_per_iteration": 2.7222859859466553 }, { "auxiliary_loss_clip": 0.01171178, "auxiliary_loss_mlp": 0.0102247, "balance_loss_clip": 1.05376887, "balance_loss_mlp": 1.01485264, "epoch": 0.4368424216918175, "flos": 18035200753920.0, "grad_norm": 2.2986656972638317, "language_loss": 0.80620265, "learning_rate": 2.500940166280918e-06, "loss": 0.82813907, "num_input_tokens_seen": 78269470, "step": 3633, "time_per_iteration": 2.6303932666778564 }, { "auxiliary_loss_clip": 0.01161909, "auxiliary_loss_mlp": 0.01024038, "balance_loss_clip": 1.05148458, "balance_loss_mlp": 1.01671219, "epoch": 0.4369626645824566, "flos": 25447127362560.0, "grad_norm": 1.8229862205410405, "language_loss": 0.79065526, "learning_rate": 2.500185986892045e-06, "loss": 0.81251478, "num_input_tokens_seen": 78288955, "step": 3634, "time_per_iteration": 2.634521245956421 }, { "auxiliary_loss_clip": 0.01160113, "auxiliary_loss_mlp": 0.01019642, "balance_loss_clip": 1.05057156, "balance_loss_mlp": 1.0120188, "epoch": 0.43708290747309564, "flos": 25302694775040.0, "grad_norm": 3.664942439498866, "language_loss": 0.77165174, "learning_rate": 2.499431731625355e-06, "loss": 0.79344928, "num_input_tokens_seen": 78307980, "step": 3635, "time_per_iteration": 2.662137746810913 }, { "auxiliary_loss_clip": 0.01184014, "auxiliary_loss_mlp": 0.01022211, "balance_loss_clip": 1.05417895, "balance_loss_mlp": 1.01454616, "epoch": 0.43720315036373475, "flos": 31575103344000.0, "grad_norm": 3.544086535634771, "language_loss": 0.79670799, "learning_rate": 2.4986774005952686e-06, "loss": 0.81877029, "num_input_tokens_seen": 78330355, "step": 3636, "time_per_iteration": 2.6884212493896484 }, { "auxiliary_loss_clip": 0.01167325, "auxiliary_loss_mlp": 0.01024815, "balance_loss_clip": 1.05642366, "balance_loss_mlp": 1.01723909, "epoch": 0.43732339325437386, "flos": 23112000195840.0, "grad_norm": 2.6058160597656506, "language_loss": 0.84599817, "learning_rate": 2.4979229939162166e-06, "loss": 0.8679195, "num_input_tokens_seen": 78349135, "step": 3637, "time_per_iteration": 2.655519485473633 }, { "auxiliary_loss_clip": 0.01162479, "auxiliary_loss_mlp": 0.01027443, "balance_loss_clip": 1.0532136, "balance_loss_mlp": 1.0198493, "epoch": 0.4374436361450129, "flos": 27746272080000.0, "grad_norm": 1.8789430245247976, "language_loss": 0.80572689, "learning_rate": 2.4971685117026433e-06, "loss": 0.82762605, "num_input_tokens_seen": 78368900, "step": 3638, "time_per_iteration": 2.6660730838775635 }, { "auxiliary_loss_clip": 0.01169636, "auxiliary_loss_mlp": 0.01020728, "balance_loss_clip": 1.05411744, "balance_loss_mlp": 1.01350451, "epoch": 0.437563879035652, "flos": 24172370616960.0, "grad_norm": 1.716962640853469, "language_loss": 0.76770025, "learning_rate": 2.4964139540690018e-06, "loss": 0.78960389, "num_input_tokens_seen": 78392235, "step": 3639, "time_per_iteration": 2.6959080696105957 }, { "auxiliary_loss_clip": 0.01137128, "auxiliary_loss_mlp": 0.01021751, "balance_loss_clip": 1.05198216, "balance_loss_mlp": 1.01394916, "epoch": 0.4376841219262911, "flos": 23477211728640.0, "grad_norm": 2.456469771336192, "language_loss": 0.72616351, "learning_rate": 2.495659321129758e-06, "loss": 0.74775237, "num_input_tokens_seen": 78409980, "step": 3640, "time_per_iteration": 2.692526340484619 }, { "auxiliary_loss_clip": 0.01161595, "auxiliary_loss_mlp": 0.01025519, "balance_loss_clip": 1.05122399, "balance_loss_mlp": 1.01825345, "epoch": 0.4378043648169302, "flos": 25447809720960.0, "grad_norm": 1.8939315656455202, "language_loss": 0.75201118, "learning_rate": 2.494904612999389e-06, "loss": 0.77388227, "num_input_tokens_seen": 78428690, "step": 3641, "time_per_iteration": 2.6647117137908936 }, { "auxiliary_loss_clip": 0.01081928, "auxiliary_loss_mlp": 0.01000908, "balance_loss_clip": 1.02820158, "balance_loss_mlp": 0.99965632, "epoch": 0.4379246077075693, "flos": 53914056986880.0, "grad_norm": 0.7460225900787829, "language_loss": 0.56479108, "learning_rate": 2.4941498297923843e-06, "loss": 0.58561933, "num_input_tokens_seen": 78489260, "step": 3642, "time_per_iteration": 3.1943533420562744 }, { "auxiliary_loss_clip": 0.01163026, "auxiliary_loss_mlp": 0.01026034, "balance_loss_clip": 1.05298305, "balance_loss_mlp": 1.01844072, "epoch": 0.43804485059820836, "flos": 20588305605120.0, "grad_norm": 1.7103964159339247, "language_loss": 0.69921994, "learning_rate": 2.4933949716232424e-06, "loss": 0.72111052, "num_input_tokens_seen": 78506785, "step": 3643, "time_per_iteration": 2.577495574951172 }, { "auxiliary_loss_clip": 0.01138401, "auxiliary_loss_mlp": 0.01021148, "balance_loss_clip": 1.05476344, "balance_loss_mlp": 1.01430535, "epoch": 0.43816509348884747, "flos": 23876214981120.0, "grad_norm": 2.56945131293948, "language_loss": 0.73749936, "learning_rate": 2.492640038606476e-06, "loss": 0.75909483, "num_input_tokens_seen": 78525150, "step": 3644, "time_per_iteration": 2.679520606994629 }, { "auxiliary_loss_clip": 0.01165041, "auxiliary_loss_mlp": 0.01027508, "balance_loss_clip": 1.05192423, "balance_loss_mlp": 1.01989031, "epoch": 0.4382853363794866, "flos": 14684448533760.0, "grad_norm": 2.0867596189895625, "language_loss": 0.78729612, "learning_rate": 2.491885030856608e-06, "loss": 0.80922163, "num_input_tokens_seen": 78543245, "step": 3645, "time_per_iteration": 2.5932905673980713 }, { "auxiliary_loss_clip": 0.01151284, "auxiliary_loss_mlp": 0.01024785, "balance_loss_clip": 1.05187809, "balance_loss_mlp": 1.01666045, "epoch": 0.43840557927012563, "flos": 17165301177600.0, "grad_norm": 2.1515351680237185, "language_loss": 0.82452863, "learning_rate": 2.4911299484881713e-06, "loss": 0.84628934, "num_input_tokens_seen": 78560775, "step": 3646, "time_per_iteration": 2.7894082069396973 }, { "auxiliary_loss_clip": 0.01144523, "auxiliary_loss_mlp": 0.01023675, "balance_loss_clip": 1.04840374, "balance_loss_mlp": 1.01622987, "epoch": 0.43852582216076474, "flos": 19390685316480.0, "grad_norm": 1.907911151103297, "language_loss": 0.81205904, "learning_rate": 2.490374791615712e-06, "loss": 0.83374107, "num_input_tokens_seen": 78580800, "step": 3647, "time_per_iteration": 2.6686601638793945 }, { "auxiliary_loss_clip": 0.01187547, "auxiliary_loss_mlp": 0.0071206, "balance_loss_clip": 1.05658925, "balance_loss_mlp": 1.00072408, "epoch": 0.43864606505140386, "flos": 18075133699200.0, "grad_norm": 2.93127853998214, "language_loss": 0.77415287, "learning_rate": 2.4896195603537867e-06, "loss": 0.79314899, "num_input_tokens_seen": 78595410, "step": 3648, "time_per_iteration": 2.585923194885254 }, { "auxiliary_loss_clip": 0.01113474, "auxiliary_loss_mlp": 0.01024744, "balance_loss_clip": 1.05172658, "balance_loss_mlp": 1.01648855, "epoch": 0.4387663079420429, "flos": 19644896845440.0, "grad_norm": 2.042842453938719, "language_loss": 0.73893154, "learning_rate": 2.488864254816964e-06, "loss": 0.76031375, "num_input_tokens_seen": 78614100, "step": 3649, "time_per_iteration": 2.6749773025512695 }, { "auxiliary_loss_clip": 0.01169401, "auxiliary_loss_mlp": 0.01021819, "balance_loss_clip": 1.05524683, "balance_loss_mlp": 1.01372433, "epoch": 0.438886550832682, "flos": 19719339782400.0, "grad_norm": 3.885355191847315, "language_loss": 0.68015981, "learning_rate": 2.4881088751198218e-06, "loss": 0.70207202, "num_input_tokens_seen": 78632260, "step": 3650, "time_per_iteration": 2.655170440673828 }, { "auxiliary_loss_clip": 0.01150481, "auxiliary_loss_mlp": 0.01030621, "balance_loss_clip": 1.05012178, "balance_loss_mlp": 1.0225271, "epoch": 0.43900679372332113, "flos": 14536675981440.0, "grad_norm": 3.201468257510913, "language_loss": 0.6487754, "learning_rate": 2.4873534213769517e-06, "loss": 0.67058647, "num_input_tokens_seen": 78647490, "step": 3651, "time_per_iteration": 3.5738072395324707 }, { "auxiliary_loss_clip": 0.01131353, "auxiliary_loss_mlp": 0.01028718, "balance_loss_clip": 1.05222988, "balance_loss_mlp": 1.02158928, "epoch": 0.4391270366139602, "flos": 24056234968320.0, "grad_norm": 2.7235376338483097, "language_loss": 0.71879584, "learning_rate": 2.4865978937029547e-06, "loss": 0.7403965, "num_input_tokens_seen": 78666470, "step": 3652, "time_per_iteration": 3.964426040649414 }, { "auxiliary_loss_clip": 0.01108388, "auxiliary_loss_mlp": 0.01026757, "balance_loss_clip": 1.04834127, "balance_loss_mlp": 1.01917553, "epoch": 0.4392472795045993, "flos": 31538510363520.0, "grad_norm": 1.8857965555685983, "language_loss": 0.66264302, "learning_rate": 2.485842292212445e-06, "loss": 0.68399441, "num_input_tokens_seen": 78687685, "step": 3653, "time_per_iteration": 3.692941427230835 }, { "auxiliary_loss_clip": 0.01186984, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 1.05710733, "balance_loss_mlp": 1.01805258, "epoch": 0.4393675223952384, "flos": 14866300114560.0, "grad_norm": 3.9871138761299783, "language_loss": 0.80261481, "learning_rate": 2.485086617020045e-06, "loss": 0.82474262, "num_input_tokens_seen": 78706180, "step": 3654, "time_per_iteration": 2.592902421951294 }, { "auxiliary_loss_clip": 0.01144161, "auxiliary_loss_mlp": 0.01026222, "balance_loss_clip": 1.04876423, "balance_loss_mlp": 1.01773429, "epoch": 0.43948776528587746, "flos": 14825900292480.0, "grad_norm": 5.862197439396001, "language_loss": 0.81840724, "learning_rate": 2.4843308682403903e-06, "loss": 0.84011108, "num_input_tokens_seen": 78723095, "step": 3655, "time_per_iteration": 2.632610321044922 }, { "auxiliary_loss_clip": 0.01181491, "auxiliary_loss_mlp": 0.01022114, "balance_loss_clip": 1.05397201, "balance_loss_mlp": 1.01520014, "epoch": 0.4396080081765166, "flos": 13914523486080.0, "grad_norm": 4.068420971356822, "language_loss": 0.82525873, "learning_rate": 2.4835750459881294e-06, "loss": 0.84729481, "num_input_tokens_seen": 78739720, "step": 3656, "time_per_iteration": 2.5173768997192383 }, { "auxiliary_loss_clip": 0.01143305, "auxiliary_loss_mlp": 0.01030317, "balance_loss_clip": 1.04910445, "balance_loss_mlp": 1.02292585, "epoch": 0.43972825106715563, "flos": 18222978078720.0, "grad_norm": 1.8038301292148424, "language_loss": 0.82127082, "learning_rate": 2.4828191503779177e-06, "loss": 0.84300697, "num_input_tokens_seen": 78757820, "step": 3657, "time_per_iteration": 2.650526523590088 }, { "auxiliary_loss_clip": 0.0113438, "auxiliary_loss_mlp": 0.0102941, "balance_loss_clip": 1.05004609, "balance_loss_mlp": 1.0221231, "epoch": 0.43984849395779474, "flos": 16873239692160.0, "grad_norm": 2.388613518690145, "language_loss": 0.89510238, "learning_rate": 2.482063181524425e-06, "loss": 0.9167403, "num_input_tokens_seen": 78773720, "step": 3658, "time_per_iteration": 2.6951780319213867 }, { "auxiliary_loss_clip": 0.01185101, "auxiliary_loss_mlp": 0.01027492, "balance_loss_clip": 1.05584407, "balance_loss_mlp": 1.01883101, "epoch": 0.43996873684843385, "flos": 18691504104960.0, "grad_norm": 2.3856034098776315, "language_loss": 0.81098962, "learning_rate": 2.4813071395423307e-06, "loss": 0.83311558, "num_input_tokens_seen": 78791285, "step": 3659, "time_per_iteration": 2.571680784225464 }, { "auxiliary_loss_clip": 0.01169622, "auxiliary_loss_mlp": 0.010288, "balance_loss_clip": 1.05394006, "balance_loss_mlp": 1.02070594, "epoch": 0.4400889797390729, "flos": 23653460787840.0, "grad_norm": 2.8122273296492364, "language_loss": 0.64725429, "learning_rate": 2.4805510245463263e-06, "loss": 0.66923851, "num_input_tokens_seen": 78811440, "step": 3660, "time_per_iteration": 2.6986567974090576 }, { "auxiliary_loss_clip": 0.0116608, "auxiliary_loss_mlp": 0.01027556, "balance_loss_clip": 1.05027723, "balance_loss_mlp": 1.02005768, "epoch": 0.440209222629712, "flos": 23149203707520.0, "grad_norm": 2.2023369587251485, "language_loss": 0.59965062, "learning_rate": 2.4797948366511137e-06, "loss": 0.62158692, "num_input_tokens_seen": 78831150, "step": 3661, "time_per_iteration": 2.63004994392395 }, { "auxiliary_loss_clip": 0.01131521, "auxiliary_loss_mlp": 0.01022947, "balance_loss_clip": 1.04719424, "balance_loss_mlp": 1.01515079, "epoch": 0.4403294655203511, "flos": 24823394668800.0, "grad_norm": 21.1066444977324, "language_loss": 0.75935471, "learning_rate": 2.4790385759714055e-06, "loss": 0.78089941, "num_input_tokens_seen": 78850215, "step": 3662, "time_per_iteration": 2.755250930786133 }, { "auxiliary_loss_clip": 0.0116662, "auxiliary_loss_mlp": 0.01027406, "balance_loss_clip": 1.05511141, "balance_loss_mlp": 1.01981831, "epoch": 0.4404497084109902, "flos": 22565080736640.0, "grad_norm": 1.8416767687784725, "language_loss": 0.71258563, "learning_rate": 2.478282242621926e-06, "loss": 0.73452592, "num_input_tokens_seen": 78870675, "step": 3663, "time_per_iteration": 2.5985352993011475 }, { "auxiliary_loss_clip": 0.0105253, "auxiliary_loss_mlp": 0.01004076, "balance_loss_clip": 1.02279329, "balance_loss_mlp": 1.00251436, "epoch": 0.4405699513016293, "flos": 64967073448320.0, "grad_norm": 0.8462477638411199, "language_loss": 0.59510642, "learning_rate": 2.477525836717411e-06, "loss": 0.61567247, "num_input_tokens_seen": 78938440, "step": 3664, "time_per_iteration": 3.40606427192688 }, { "auxiliary_loss_clip": 0.01168227, "auxiliary_loss_mlp": 0.01027578, "balance_loss_clip": 1.05304873, "balance_loss_mlp": 1.01959121, "epoch": 0.4406901941922684, "flos": 35661952978560.0, "grad_norm": 2.7384669673061737, "language_loss": 0.79228175, "learning_rate": 2.476769358372606e-06, "loss": 0.81423986, "num_input_tokens_seen": 78960090, "step": 3665, "time_per_iteration": 2.7063801288604736 }, { "auxiliary_loss_clip": 0.0113329, "auxiliary_loss_mlp": 0.01029384, "balance_loss_clip": 1.0518384, "balance_loss_mlp": 1.0217669, "epoch": 0.44081043708290746, "flos": 18040767361920.0, "grad_norm": 2.109890470623054, "language_loss": 0.74896824, "learning_rate": 2.4760128077022683e-06, "loss": 0.77059501, "num_input_tokens_seen": 78978225, "step": 3666, "time_per_iteration": 2.6824071407318115 }, { "auxiliary_loss_clip": 0.01112679, "auxiliary_loss_mlp": 0.01024962, "balance_loss_clip": 1.04944456, "balance_loss_mlp": 1.01699328, "epoch": 0.44093067997354657, "flos": 30153507799680.0, "grad_norm": 1.5470658720758852, "language_loss": 0.68441117, "learning_rate": 2.4752561848211672e-06, "loss": 0.70578754, "num_input_tokens_seen": 79000625, "step": 3667, "time_per_iteration": 2.755591630935669 }, { "auxiliary_loss_clip": 0.01165614, "auxiliary_loss_mlp": 0.01028503, "balance_loss_clip": 1.0560658, "balance_loss_mlp": 1.02083814, "epoch": 0.4410509228641857, "flos": 23255068066560.0, "grad_norm": 2.5697045957869054, "language_loss": 0.71272051, "learning_rate": 2.4744994898440797e-06, "loss": 0.7346617, "num_input_tokens_seen": 79019415, "step": 3668, "time_per_iteration": 2.6165056228637695 }, { "auxiliary_loss_clip": 0.01140684, "auxiliary_loss_mlp": 0.01024294, "balance_loss_clip": 1.05178809, "balance_loss_mlp": 1.01653624, "epoch": 0.44117116575482473, "flos": 19500571998720.0, "grad_norm": 3.3953983523381748, "language_loss": 0.83520317, "learning_rate": 2.473742722885797e-06, "loss": 0.85685295, "num_input_tokens_seen": 79038435, "step": 3669, "time_per_iteration": 2.632378339767456 }, { "auxiliary_loss_clip": 0.01171615, "auxiliary_loss_mlp": 0.00712137, "balance_loss_clip": 1.05757618, "balance_loss_mlp": 1.00087166, "epoch": 0.44129140864546385, "flos": 27053124353280.0, "grad_norm": 2.5066014491921997, "language_loss": 0.6535449, "learning_rate": 2.4729858840611197e-06, "loss": 0.67238241, "num_input_tokens_seen": 79057345, "step": 3670, "time_per_iteration": 2.6822776794433594 }, { "auxiliary_loss_clip": 0.01182083, "auxiliary_loss_mlp": 0.01026732, "balance_loss_clip": 1.05534744, "balance_loss_mlp": 1.01903081, "epoch": 0.4414116515361029, "flos": 26102101910400.0, "grad_norm": 2.0606488002725407, "language_loss": 0.72465062, "learning_rate": 2.4722289734848605e-06, "loss": 0.74673879, "num_input_tokens_seen": 79077810, "step": 3671, "time_per_iteration": 2.663663387298584 }, { "auxiliary_loss_clip": 0.01134201, "auxiliary_loss_mlp": 0.01025592, "balance_loss_clip": 1.05236459, "balance_loss_mlp": 1.0183413, "epoch": 0.441531894426742, "flos": 21906083865600.0, "grad_norm": 2.2715480121042066, "language_loss": 0.77673137, "learning_rate": 2.471471991271841e-06, "loss": 0.79832929, "num_input_tokens_seen": 79094935, "step": 3672, "time_per_iteration": 2.6674089431762695 }, { "auxiliary_loss_clip": 0.0115697, "auxiliary_loss_mlp": 0.01026325, "balance_loss_clip": 1.04886413, "balance_loss_mlp": 1.01884437, "epoch": 0.4416521373173811, "flos": 23437099215360.0, "grad_norm": 2.0715753564857513, "language_loss": 0.7934531, "learning_rate": 2.470714937536896e-06, "loss": 0.81528604, "num_input_tokens_seen": 79113660, "step": 3673, "time_per_iteration": 2.674198865890503 }, { "auxiliary_loss_clip": 0.01117452, "auxiliary_loss_mlp": 0.01028855, "balance_loss_clip": 1.04835939, "balance_loss_mlp": 1.02067101, "epoch": 0.4417723802080202, "flos": 20334345471360.0, "grad_norm": 2.0666955903432775, "language_loss": 0.70467746, "learning_rate": 2.469957812394868e-06, "loss": 0.7261405, "num_input_tokens_seen": 79132470, "step": 3674, "time_per_iteration": 2.710331916809082 }, { "auxiliary_loss_clip": 0.01184223, "auxiliary_loss_mlp": 0.0102329, "balance_loss_clip": 1.05775726, "balance_loss_mlp": 1.01592839, "epoch": 0.4418926230986593, "flos": 18880682060160.0, "grad_norm": 2.3033890645627286, "language_loss": 0.76011223, "learning_rate": 2.4692006159606148e-06, "loss": 0.78218734, "num_input_tokens_seen": 79150000, "step": 3675, "time_per_iteration": 2.624272584915161 }, { "auxiliary_loss_clip": 0.01182924, "auxiliary_loss_mlp": 0.01026206, "balance_loss_clip": 1.05469859, "balance_loss_mlp": 1.01915443, "epoch": 0.4420128659892984, "flos": 19464409981440.0, "grad_norm": 2.1750311291021607, "language_loss": 0.78658772, "learning_rate": 2.468443348349e-06, "loss": 0.80867904, "num_input_tokens_seen": 79167875, "step": 3676, "time_per_iteration": 2.5950300693511963 }, { "auxiliary_loss_clip": 0.01117618, "auxiliary_loss_mlp": 0.01027817, "balance_loss_clip": 1.04725266, "balance_loss_mlp": 1.01949, "epoch": 0.44213310887993745, "flos": 17894359526400.0, "grad_norm": 32.87851122734553, "language_loss": 0.82982296, "learning_rate": 2.467686009674902e-06, "loss": 0.85127729, "num_input_tokens_seen": 79182325, "step": 3677, "time_per_iteration": 3.542322874069214 }, { "auxiliary_loss_clip": 0.01160357, "auxiliary_loss_mlp": 0.01025578, "balance_loss_clip": 1.05069041, "balance_loss_mlp": 1.01739466, "epoch": 0.44225335177057656, "flos": 19204667758080.0, "grad_norm": 2.873990373008936, "language_loss": 0.85247624, "learning_rate": 2.466928600053209e-06, "loss": 0.87433559, "num_input_tokens_seen": 79197630, "step": 3678, "time_per_iteration": 3.7746808528900146 }, { "auxiliary_loss_clip": 0.01148259, "auxiliary_loss_mlp": 0.01029243, "balance_loss_clip": 1.04859638, "balance_loss_mlp": 1.02108932, "epoch": 0.4423735946612157, "flos": 23471321898240.0, "grad_norm": 6.374542535746944, "language_loss": 0.71337509, "learning_rate": 2.466171119598818e-06, "loss": 0.7351501, "num_input_tokens_seen": 79217600, "step": 3679, "time_per_iteration": 3.7546863555908203 }, { "auxiliary_loss_clip": 0.01173355, "auxiliary_loss_mlp": 0.01028362, "balance_loss_clip": 1.05260766, "balance_loss_mlp": 1.01968324, "epoch": 0.44249383755185473, "flos": 26685398868480.0, "grad_norm": 1.9717074358792435, "language_loss": 0.77310669, "learning_rate": 2.465413568426639e-06, "loss": 0.79512388, "num_input_tokens_seen": 79238550, "step": 3680, "time_per_iteration": 2.721750020980835 }, { "auxiliary_loss_clip": 0.01159465, "auxiliary_loss_mlp": 0.01025379, "balance_loss_clip": 1.05112016, "balance_loss_mlp": 1.01807702, "epoch": 0.44261408044249384, "flos": 23147659422720.0, "grad_norm": 1.7057553658449272, "language_loss": 0.81256473, "learning_rate": 2.464655946651591e-06, "loss": 0.83441311, "num_input_tokens_seen": 79257555, "step": 3681, "time_per_iteration": 2.6321165561676025 }, { "auxiliary_loss_clip": 0.01169039, "auxiliary_loss_mlp": 0.01026863, "balance_loss_clip": 1.05518651, "balance_loss_mlp": 1.01897144, "epoch": 0.44273432333313295, "flos": 24462564595200.0, "grad_norm": 2.6118879739502097, "language_loss": 0.81107092, "learning_rate": 2.4638982543886065e-06, "loss": 0.83302999, "num_input_tokens_seen": 79277595, "step": 3682, "time_per_iteration": 2.620351791381836 }, { "auxiliary_loss_clip": 0.01172643, "auxiliary_loss_mlp": 0.01029799, "balance_loss_clip": 1.05669355, "balance_loss_mlp": 1.02197289, "epoch": 0.442854566223772, "flos": 17528932512000.0, "grad_norm": 2.548488406788172, "language_loss": 0.87385672, "learning_rate": 2.4631404917526254e-06, "loss": 0.89588118, "num_input_tokens_seen": 79294550, "step": 3683, "time_per_iteration": 2.653843641281128 }, { "auxiliary_loss_clip": 0.01161378, "auxiliary_loss_mlp": 0.01026344, "balance_loss_clip": 1.05175877, "balance_loss_mlp": 1.01889396, "epoch": 0.4429748091144111, "flos": 24896293320960.0, "grad_norm": 2.134162505754537, "language_loss": 0.79387325, "learning_rate": 2.4623826588586e-06, "loss": 0.81575048, "num_input_tokens_seen": 79314820, "step": 3684, "time_per_iteration": 2.658881902694702 }, { "auxiliary_loss_clip": 0.01146097, "auxiliary_loss_mlp": 0.01029446, "balance_loss_clip": 1.04796314, "balance_loss_mlp": 1.02122676, "epoch": 0.4430950520050502, "flos": 21614704738560.0, "grad_norm": 1.553613566902073, "language_loss": 0.82666653, "learning_rate": 2.461624755821492e-06, "loss": 0.84842199, "num_input_tokens_seen": 79334300, "step": 3685, "time_per_iteration": 2.7282822132110596 }, { "auxiliary_loss_clip": 0.01139249, "auxiliary_loss_mlp": 0.01027731, "balance_loss_clip": 1.05332613, "balance_loss_mlp": 1.01996779, "epoch": 0.4432152948956893, "flos": 24572271709440.0, "grad_norm": 1.8230787757877835, "language_loss": 0.76575685, "learning_rate": 2.4608667827562763e-06, "loss": 0.78742665, "num_input_tokens_seen": 79353630, "step": 3686, "time_per_iteration": 2.715639591217041 }, { "auxiliary_loss_clip": 0.01170648, "auxiliary_loss_mlp": 0.01028051, "balance_loss_clip": 1.05307615, "balance_loss_mlp": 1.02012348, "epoch": 0.4433355377863284, "flos": 21762261809280.0, "grad_norm": 1.988964189812962, "language_loss": 0.89911711, "learning_rate": 2.460108739777936e-06, "loss": 0.92110413, "num_input_tokens_seen": 79372765, "step": 3687, "time_per_iteration": 2.6308987140655518 }, { "auxiliary_loss_clip": 0.01151002, "auxiliary_loss_mlp": 0.01024481, "balance_loss_clip": 1.0527693, "balance_loss_mlp": 1.01716161, "epoch": 0.44345578067696745, "flos": 20084479488000.0, "grad_norm": 1.666837541033021, "language_loss": 0.76664984, "learning_rate": 2.4593506270014656e-06, "loss": 0.7884047, "num_input_tokens_seen": 79391735, "step": 3688, "time_per_iteration": 2.7155098915100098 }, { "auxiliary_loss_clip": 0.01152618, "auxiliary_loss_mlp": 0.01026443, "balance_loss_clip": 1.05011189, "balance_loss_mlp": 1.01871812, "epoch": 0.44357602356760656, "flos": 24169497528960.0, "grad_norm": 1.7597997611649887, "language_loss": 0.82323396, "learning_rate": 2.45859244454187e-06, "loss": 0.84502459, "num_input_tokens_seen": 79411525, "step": 3689, "time_per_iteration": 2.7383766174316406 }, { "auxiliary_loss_clip": 0.01160814, "auxiliary_loss_mlp": 0.01024503, "balance_loss_clip": 1.0513736, "balance_loss_mlp": 1.01704001, "epoch": 0.44369626645824567, "flos": 22707717644160.0, "grad_norm": 1.7908507353184202, "language_loss": 0.6634714, "learning_rate": 2.4578341925141655e-06, "loss": 0.68532455, "num_input_tokens_seen": 79430740, "step": 3690, "time_per_iteration": 2.596675157546997 }, { "auxiliary_loss_clip": 0.01171914, "auxiliary_loss_mlp": 0.01025156, "balance_loss_clip": 1.05205703, "balance_loss_mlp": 1.01696658, "epoch": 0.4438165093488847, "flos": 38030225420160.0, "grad_norm": 3.019598491168433, "language_loss": 0.72322989, "learning_rate": 2.457075871033378e-06, "loss": 0.74520057, "num_input_tokens_seen": 79452615, "step": 3691, "time_per_iteration": 2.767101287841797 }, { "auxiliary_loss_clip": 0.01131603, "auxiliary_loss_mlp": 0.01029363, "balance_loss_clip": 1.04960525, "balance_loss_mlp": 1.0222106, "epoch": 0.44393675223952384, "flos": 15523213996800.0, "grad_norm": 2.3833113661405445, "language_loss": 0.88567686, "learning_rate": 2.4563174802145445e-06, "loss": 0.90728652, "num_input_tokens_seen": 79469865, "step": 3692, "time_per_iteration": 2.601783037185669 }, { "auxiliary_loss_clip": 0.01079748, "auxiliary_loss_mlp": 0.01011955, "balance_loss_clip": 1.03861141, "balance_loss_mlp": 1.01044655, "epoch": 0.44405699513016295, "flos": 64574893779840.0, "grad_norm": 0.6401203155511841, "language_loss": 0.48611391, "learning_rate": 2.455559020172712e-06, "loss": 0.50703096, "num_input_tokens_seen": 79537220, "step": 3693, "time_per_iteration": 3.3534176349639893 }, { "auxiliary_loss_clip": 0.01124441, "auxiliary_loss_mlp": 0.01033255, "balance_loss_clip": 1.0526706, "balance_loss_mlp": 1.02566171, "epoch": 0.444177238020802, "flos": 23987394552960.0, "grad_norm": 2.0762490142874066, "language_loss": 0.89687312, "learning_rate": 2.4548004910229385e-06, "loss": 0.91845012, "num_input_tokens_seen": 79554795, "step": 3694, "time_per_iteration": 2.75765323638916 }, { "auxiliary_loss_clip": 0.01169893, "auxiliary_loss_mlp": 0.00711495, "balance_loss_clip": 1.05475879, "balance_loss_mlp": 1.00071895, "epoch": 0.4442974809114411, "flos": 22563069575040.0, "grad_norm": 2.2186716877785915, "language_loss": 0.86923575, "learning_rate": 2.4540418928802913e-06, "loss": 0.88804966, "num_input_tokens_seen": 79573530, "step": 3695, "time_per_iteration": 2.608802080154419 }, { "auxiliary_loss_clip": 0.01149949, "auxiliary_loss_mlp": 0.01027936, "balance_loss_clip": 1.05059338, "balance_loss_mlp": 1.01957369, "epoch": 0.4444177238020802, "flos": 17675699483520.0, "grad_norm": 2.0090956274443377, "language_loss": 0.65572786, "learning_rate": 2.4532832258598506e-06, "loss": 0.67750669, "num_input_tokens_seen": 79591360, "step": 3696, "time_per_iteration": 2.7754998207092285 }, { "auxiliary_loss_clip": 0.01179659, "auxiliary_loss_mlp": 0.01027886, "balance_loss_clip": 1.05474305, "balance_loss_mlp": 1.02026892, "epoch": 0.4445379666927193, "flos": 28621594609920.0, "grad_norm": 2.3912930929316025, "language_loss": 0.80774313, "learning_rate": 2.4525244900767047e-06, "loss": 0.82981861, "num_input_tokens_seen": 79612175, "step": 3697, "time_per_iteration": 2.6069116592407227 }, { "auxiliary_loss_clip": 0.01089361, "auxiliary_loss_mlp": 0.01001228, "balance_loss_clip": 1.03959763, "balance_loss_mlp": 0.99975538, "epoch": 0.4446582095833584, "flos": 70487370115200.0, "grad_norm": 0.7660455135699723, "language_loss": 0.60449576, "learning_rate": 2.4517656856459536e-06, "loss": 0.62540162, "num_input_tokens_seen": 79678020, "step": 3698, "time_per_iteration": 3.3306801319122314 }, { "auxiliary_loss_clip": 0.01167438, "auxiliary_loss_mlp": 0.01024343, "balance_loss_clip": 1.05333161, "balance_loss_mlp": 1.01683283, "epoch": 0.4447784524739975, "flos": 26505199313280.0, "grad_norm": 2.040454840823341, "language_loss": 0.68092018, "learning_rate": 2.4510068126827073e-06, "loss": 0.70283806, "num_input_tokens_seen": 79699020, "step": 3699, "time_per_iteration": 2.6574339866638184 }, { "auxiliary_loss_clip": 0.01150891, "auxiliary_loss_mlp": 0.01028217, "balance_loss_clip": 1.05336416, "balance_loss_mlp": 1.02029574, "epoch": 0.44489869536463655, "flos": 11656209553920.0, "grad_norm": 2.4655132690917707, "language_loss": 0.81986934, "learning_rate": 2.450247871302086e-06, "loss": 0.84166038, "num_input_tokens_seen": 79716795, "step": 3700, "time_per_iteration": 2.649972915649414 }, { "auxiliary_loss_clip": 0.01168908, "auxiliary_loss_mlp": 0.01028152, "balance_loss_clip": 1.05319107, "balance_loss_mlp": 1.02092505, "epoch": 0.44501893825527566, "flos": 20448469958400.0, "grad_norm": 13.297408717923737, "language_loss": 0.83555144, "learning_rate": 2.44948886161922e-06, "loss": 0.85752201, "num_input_tokens_seen": 79735810, "step": 3701, "time_per_iteration": 2.624152660369873 }, { "auxiliary_loss_clip": 0.01166754, "auxiliary_loss_mlp": 0.01025591, "balance_loss_clip": 1.05237365, "balance_loss_mlp": 1.01814651, "epoch": 0.4451391811459148, "flos": 18261079430400.0, "grad_norm": 1.7537027212832716, "language_loss": 0.84670883, "learning_rate": 2.4487297837492524e-06, "loss": 0.86863232, "num_input_tokens_seen": 79754975, "step": 3702, "time_per_iteration": 2.6284797191619873 }, { "auxiliary_loss_clip": 0.01134739, "auxiliary_loss_mlp": 0.01026087, "balance_loss_clip": 1.05094814, "balance_loss_mlp": 1.01876485, "epoch": 0.44525942403655383, "flos": 16910155895040.0, "grad_norm": 2.3360069771559555, "language_loss": 0.61887205, "learning_rate": 2.4479706378073323e-06, "loss": 0.64048028, "num_input_tokens_seen": 79773515, "step": 3703, "time_per_iteration": 5.462984561920166 }, { "auxiliary_loss_clip": 0.01125292, "auxiliary_loss_mlp": 0.01028963, "balance_loss_clip": 1.04779911, "balance_loss_mlp": 1.02129221, "epoch": 0.44537966692719294, "flos": 23258838994560.0, "grad_norm": 1.5993257924973652, "language_loss": 0.83771271, "learning_rate": 2.447211423908623e-06, "loss": 0.85925531, "num_input_tokens_seen": 79793560, "step": 3704, "time_per_iteration": 2.745067596435547 }, { "auxiliary_loss_clip": 0.01166506, "auxiliary_loss_mlp": 0.01025598, "balance_loss_clip": 1.05241978, "balance_loss_mlp": 1.01862168, "epoch": 0.445499909817832, "flos": 21724160457600.0, "grad_norm": 1.9512575072239466, "language_loss": 0.75009048, "learning_rate": 2.4464521421682966e-06, "loss": 0.77201152, "num_input_tokens_seen": 79811150, "step": 3705, "time_per_iteration": 3.529644012451172 }, { "auxiliary_loss_clip": 0.01161157, "auxiliary_loss_mlp": 0.01026809, "balance_loss_clip": 1.05426049, "balance_loss_mlp": 1.02001703, "epoch": 0.4456201527084711, "flos": 23987969170560.0, "grad_norm": 1.360547241109377, "language_loss": 0.87722015, "learning_rate": 2.4456927927015345e-06, "loss": 0.89909983, "num_input_tokens_seen": 79832190, "step": 3706, "time_per_iteration": 2.714022397994995 }, { "auxiliary_loss_clip": 0.01151134, "auxiliary_loss_mlp": 0.01026351, "balance_loss_clip": 1.05190349, "balance_loss_mlp": 1.01836395, "epoch": 0.4457403955991102, "flos": 18807065136000.0, "grad_norm": 2.182761383978414, "language_loss": 0.76178133, "learning_rate": 2.4449333756235307e-06, "loss": 0.7835561, "num_input_tokens_seen": 79848905, "step": 3707, "time_per_iteration": 2.6254467964172363 }, { "auxiliary_loss_clip": 0.01170603, "auxiliary_loss_mlp": 0.01029012, "balance_loss_clip": 1.05608511, "balance_loss_mlp": 1.02126324, "epoch": 0.4458606384897493, "flos": 19207756327680.0, "grad_norm": 2.670037016409771, "language_loss": 0.7866742, "learning_rate": 2.4441738910494876e-06, "loss": 0.80867028, "num_input_tokens_seen": 79863640, "step": 3708, "time_per_iteration": 2.59584903717041 }, { "auxiliary_loss_clip": 0.01153882, "auxiliary_loss_mlp": 0.01024114, "balance_loss_clip": 1.04934692, "balance_loss_mlp": 1.01638937, "epoch": 0.4459808813803884, "flos": 21361283308800.0, "grad_norm": 2.0782297457135024, "language_loss": 0.82152045, "learning_rate": 2.4434143390946176e-06, "loss": 0.84330046, "num_input_tokens_seen": 79882450, "step": 3709, "time_per_iteration": 2.66776704788208 }, { "auxiliary_loss_clip": 0.01129198, "auxiliary_loss_mlp": 0.01025749, "balance_loss_clip": 1.04793942, "balance_loss_mlp": 1.01865339, "epoch": 0.4461011242710275, "flos": 23288967527040.0, "grad_norm": 3.7643449719140274, "language_loss": 0.85761368, "learning_rate": 2.4426547198741457e-06, "loss": 0.87916321, "num_input_tokens_seen": 79900655, "step": 3710, "time_per_iteration": 2.7126858234405518 }, { "auxiliary_loss_clip": 0.01119184, "auxiliary_loss_mlp": 0.01030436, "balance_loss_clip": 1.05067325, "balance_loss_mlp": 1.02229989, "epoch": 0.44622136716166655, "flos": 20193001453440.0, "grad_norm": 2.4212539404686844, "language_loss": 0.74676037, "learning_rate": 2.441895033503305e-06, "loss": 0.7682566, "num_input_tokens_seen": 79918575, "step": 3711, "time_per_iteration": 2.7261404991149902 }, { "auxiliary_loss_clip": 0.01165867, "auxiliary_loss_mlp": 0.01028035, "balance_loss_clip": 1.05407047, "balance_loss_mlp": 1.02033973, "epoch": 0.44634161005230566, "flos": 21283033530240.0, "grad_norm": 1.8920476016101564, "language_loss": 0.82386303, "learning_rate": 2.4411352800973375e-06, "loss": 0.84580201, "num_input_tokens_seen": 79937010, "step": 3712, "time_per_iteration": 2.6079108715057373 }, { "auxiliary_loss_clip": 0.01125564, "auxiliary_loss_mlp": 0.01026302, "balance_loss_clip": 1.0459466, "balance_loss_mlp": 1.01852965, "epoch": 0.44646185294294477, "flos": 22929358515840.0, "grad_norm": 2.7082094266170027, "language_loss": 0.74476922, "learning_rate": 2.4403754597715005e-06, "loss": 0.7662878, "num_input_tokens_seen": 79956455, "step": 3713, "time_per_iteration": 2.7370505332946777 }, { "auxiliary_loss_clip": 0.01149993, "auxiliary_loss_mlp": 0.01024134, "balance_loss_clip": 1.04779303, "balance_loss_mlp": 1.01602149, "epoch": 0.4465820958335838, "flos": 22637692080000.0, "grad_norm": 3.504923801101499, "language_loss": 0.92887741, "learning_rate": 2.4396155726410553e-06, "loss": 0.95061862, "num_input_tokens_seen": 79975065, "step": 3714, "time_per_iteration": 2.6697375774383545 }, { "auxiliary_loss_clip": 0.01168716, "auxiliary_loss_mlp": 0.01024788, "balance_loss_clip": 1.05282688, "balance_loss_mlp": 1.01708138, "epoch": 0.44670233872422294, "flos": 22672525294080.0, "grad_norm": 2.830134444833075, "language_loss": 0.90804529, "learning_rate": 2.438855618821278e-06, "loss": 0.92998028, "num_input_tokens_seen": 79990865, "step": 3715, "time_per_iteration": 2.650912284851074 }, { "auxiliary_loss_clip": 0.01155983, "auxiliary_loss_mlp": 0.01026331, "balance_loss_clip": 1.04882967, "balance_loss_mlp": 1.01893377, "epoch": 0.44682258161486205, "flos": 23582178247680.0, "grad_norm": 5.216050985766172, "language_loss": 0.67496741, "learning_rate": 2.4380955984274517e-06, "loss": 0.69679058, "num_input_tokens_seen": 80009520, "step": 3716, "time_per_iteration": 2.614037036895752 }, { "auxiliary_loss_clip": 0.0116165, "auxiliary_loss_mlp": 0.01037395, "balance_loss_clip": 1.04958534, "balance_loss_mlp": 1.02938986, "epoch": 0.4469428245055011, "flos": 26501356558080.0, "grad_norm": 2.250379838388173, "language_loss": 0.77252007, "learning_rate": 2.4373355115748716e-06, "loss": 0.79451048, "num_input_tokens_seen": 80030350, "step": 3717, "time_per_iteration": 2.7463536262512207 }, { "auxiliary_loss_clip": 0.01142494, "auxiliary_loss_mlp": 0.01026171, "balance_loss_clip": 1.05133247, "balance_loss_mlp": 1.01920342, "epoch": 0.4470630673961402, "flos": 21504925797120.0, "grad_norm": 1.6727114608932814, "language_loss": 0.72317255, "learning_rate": 2.436575358378842e-06, "loss": 0.74485922, "num_input_tokens_seen": 80049840, "step": 3718, "time_per_iteration": 2.6763970851898193 }, { "auxiliary_loss_clip": 0.01154609, "auxiliary_loss_mlp": 0.01036349, "balance_loss_clip": 1.05272937, "balance_loss_mlp": 1.02862477, "epoch": 0.44718331028677927, "flos": 16173986653440.0, "grad_norm": 2.8301933102067727, "language_loss": 0.82589173, "learning_rate": 2.4358151389546782e-06, "loss": 0.84780133, "num_input_tokens_seen": 80066525, "step": 3719, "time_per_iteration": 2.6835498809814453 }, { "auxiliary_loss_clip": 0.01182858, "auxiliary_loss_mlp": 0.01031457, "balance_loss_clip": 1.05589056, "balance_loss_mlp": 1.02438521, "epoch": 0.4473035531774184, "flos": 19681238430720.0, "grad_norm": 2.602667308511563, "language_loss": 0.76485938, "learning_rate": 2.4350548534177035e-06, "loss": 0.78700256, "num_input_tokens_seen": 80083355, "step": 3720, "time_per_iteration": 2.569938898086548 }, { "auxiliary_loss_clip": 0.01136539, "auxiliary_loss_mlp": 0.01025445, "balance_loss_clip": 1.05197668, "balance_loss_mlp": 1.01808739, "epoch": 0.4474237960680575, "flos": 41427590515200.0, "grad_norm": 3.139468018644687, "language_loss": 0.66671467, "learning_rate": 2.434294501883254e-06, "loss": 0.68833452, "num_input_tokens_seen": 80106450, "step": 3721, "time_per_iteration": 2.863049268722534 }, { "auxiliary_loss_clip": 0.01138842, "auxiliary_loss_mlp": 0.01029153, "balance_loss_clip": 1.04778624, "balance_loss_mlp": 1.02152967, "epoch": 0.44754403895869654, "flos": 22891328991360.0, "grad_norm": 2.3316550100233306, "language_loss": 0.66019118, "learning_rate": 2.433534084466674e-06, "loss": 0.68187112, "num_input_tokens_seen": 80125670, "step": 3722, "time_per_iteration": 2.6378729343414307 }, { "auxiliary_loss_clip": 0.01175138, "auxiliary_loss_mlp": 0.01021385, "balance_loss_clip": 1.05239129, "balance_loss_mlp": 1.01419032, "epoch": 0.44766428184933565, "flos": 25630271832960.0, "grad_norm": 1.7083446911026363, "language_loss": 0.71268713, "learning_rate": 2.4327736012833178e-06, "loss": 0.73465234, "num_input_tokens_seen": 80147390, "step": 3723, "time_per_iteration": 2.671983003616333 }, { "auxiliary_loss_clip": 0.01163922, "auxiliary_loss_mlp": 0.01029274, "balance_loss_clip": 1.05244792, "balance_loss_mlp": 1.02176666, "epoch": 0.44778452473997477, "flos": 20448972748800.0, "grad_norm": 2.1986728230093617, "language_loss": 0.76402926, "learning_rate": 2.4320130524485506e-06, "loss": 0.78596115, "num_input_tokens_seen": 80166185, "step": 3724, "time_per_iteration": 2.611098051071167 }, { "auxiliary_loss_clip": 0.0114331, "auxiliary_loss_mlp": 0.01021229, "balance_loss_clip": 1.05387175, "balance_loss_mlp": 1.0141356, "epoch": 0.4479047676306138, "flos": 21975462984960.0, "grad_norm": 2.226996954924941, "language_loss": 0.7945255, "learning_rate": 2.431252438077746e-06, "loss": 0.81617087, "num_input_tokens_seen": 80185685, "step": 3725, "time_per_iteration": 2.6953318119049072 }, { "auxiliary_loss_clip": 0.01167106, "auxiliary_loss_mlp": 0.0071104, "balance_loss_clip": 1.05260694, "balance_loss_mlp": 1.0007658, "epoch": 0.44802501052125293, "flos": 21467219495040.0, "grad_norm": 2.3568644376160246, "language_loss": 0.76967096, "learning_rate": 2.4304917582862906e-06, "loss": 0.78845251, "num_input_tokens_seen": 80204865, "step": 3726, "time_per_iteration": 2.674412250518799 }, { "auxiliary_loss_clip": 0.01180554, "auxiliary_loss_mlp": 0.01026588, "balance_loss_clip": 1.05507731, "balance_loss_mlp": 1.01905727, "epoch": 0.44814525341189204, "flos": 22126970551680.0, "grad_norm": 2.2249553225150427, "language_loss": 0.8758629, "learning_rate": 2.4297310131895774e-06, "loss": 0.89793432, "num_input_tokens_seen": 80223410, "step": 3727, "time_per_iteration": 2.5627944469451904 }, { "auxiliary_loss_clip": 0.01162562, "auxiliary_loss_mlp": 0.01026776, "balance_loss_clip": 1.05151248, "balance_loss_mlp": 1.01920629, "epoch": 0.4482654963025311, "flos": 16653933204480.0, "grad_norm": 2.491505856185266, "language_loss": 0.74372762, "learning_rate": 2.4289702029030113e-06, "loss": 0.76562095, "num_input_tokens_seen": 80240880, "step": 3728, "time_per_iteration": 2.6538095474243164 }, { "auxiliary_loss_clip": 0.01165454, "auxiliary_loss_mlp": 0.01030341, "balance_loss_clip": 1.05470753, "balance_loss_mlp": 1.02316451, "epoch": 0.4483857391931702, "flos": 18841251905280.0, "grad_norm": 2.9950730427235324, "language_loss": 0.83371222, "learning_rate": 2.4282093275420057e-06, "loss": 0.85567021, "num_input_tokens_seen": 80259910, "step": 3729, "time_per_iteration": 3.572448253631592 }, { "auxiliary_loss_clip": 0.01168621, "auxiliary_loss_mlp": 0.01026503, "balance_loss_clip": 1.05455589, "balance_loss_mlp": 1.01912045, "epoch": 0.4485059820838093, "flos": 20372590477440.0, "grad_norm": 2.166602281699954, "language_loss": 0.70858645, "learning_rate": 2.4274483872219863e-06, "loss": 0.73053771, "num_input_tokens_seen": 80277270, "step": 3730, "time_per_iteration": 2.687981605529785 }, { "auxiliary_loss_clip": 0.01160122, "auxiliary_loss_mlp": 0.01023052, "balance_loss_clip": 1.05077684, "balance_loss_mlp": 1.01557136, "epoch": 0.4486262249744484, "flos": 20047742853120.0, "grad_norm": 5.3441423584642775, "language_loss": 0.93585539, "learning_rate": 2.426687382058386e-06, "loss": 0.95768714, "num_input_tokens_seen": 80295550, "step": 3731, "time_per_iteration": 3.560642957687378 }, { "auxiliary_loss_clip": 0.01088132, "auxiliary_loss_mlp": 0.01001511, "balance_loss_clip": 1.03853488, "balance_loss_mlp": 1.00020528, "epoch": 0.4487464678650875, "flos": 64595684776320.0, "grad_norm": 0.8610585761486761, "language_loss": 0.59796369, "learning_rate": 2.425926312166649e-06, "loss": 0.61886013, "num_input_tokens_seen": 80348425, "step": 3732, "time_per_iteration": 3.0395357608795166 }, { "auxiliary_loss_clip": 0.01152246, "auxiliary_loss_mlp": 0.01024175, "balance_loss_clip": 1.05187964, "balance_loss_mlp": 1.01600885, "epoch": 0.4488667107557266, "flos": 20769798049920.0, "grad_norm": 2.18598939602607, "language_loss": 0.73244935, "learning_rate": 2.42516517766223e-06, "loss": 0.75421357, "num_input_tokens_seen": 80366505, "step": 3733, "time_per_iteration": 2.6712539196014404 }, { "auxiliary_loss_clip": 0.01180951, "auxiliary_loss_mlp": 0.0102955, "balance_loss_clip": 1.0565815, "balance_loss_mlp": 1.02230787, "epoch": 0.44898695364636565, "flos": 23951735326080.0, "grad_norm": 2.2342440782366895, "language_loss": 0.68165147, "learning_rate": 2.4244039786605907e-06, "loss": 0.70375651, "num_input_tokens_seen": 80387510, "step": 3734, "time_per_iteration": 2.599043369293213 }, { "auxiliary_loss_clip": 0.01117495, "auxiliary_loss_mlp": 0.01023869, "balance_loss_clip": 1.04564929, "balance_loss_mlp": 1.0166924, "epoch": 0.44910719653700476, "flos": 18624351628800.0, "grad_norm": 2.4767893179790446, "language_loss": 0.82817823, "learning_rate": 2.4236427152772055e-06, "loss": 0.84959179, "num_input_tokens_seen": 80405915, "step": 3735, "time_per_iteration": 2.7670984268188477 }, { "auxiliary_loss_clip": 0.01045938, "auxiliary_loss_mlp": 0.01002002, "balance_loss_clip": 1.02540004, "balance_loss_mlp": 1.0007441, "epoch": 0.4492274394276438, "flos": 57033435749760.0, "grad_norm": 0.8308470294119887, "language_loss": 0.57345653, "learning_rate": 2.422881387627557e-06, "loss": 0.59393591, "num_input_tokens_seen": 80458365, "step": 3736, "time_per_iteration": 2.9617388248443604 }, { "auxiliary_loss_clip": 0.01150464, "auxiliary_loss_mlp": 0.01022634, "balance_loss_clip": 1.05238008, "balance_loss_mlp": 1.01521373, "epoch": 0.4493476823182829, "flos": 23254888498560.0, "grad_norm": 1.6267405006270201, "language_loss": 0.77353144, "learning_rate": 2.422119995827139e-06, "loss": 0.7952624, "num_input_tokens_seen": 80478490, "step": 3737, "time_per_iteration": 2.6931891441345215 }, { "auxiliary_loss_clip": 0.01169989, "auxiliary_loss_mlp": 0.01032751, "balance_loss_clip": 1.05654597, "balance_loss_mlp": 1.02527356, "epoch": 0.44946792520892204, "flos": 15815131827840.0, "grad_norm": 3.0519022038242727, "language_loss": 0.74231422, "learning_rate": 2.4213585399914528e-06, "loss": 0.76434159, "num_input_tokens_seen": 80495695, "step": 3738, "time_per_iteration": 2.6233019828796387 }, { "auxiliary_loss_clip": 0.01161493, "auxiliary_loss_mlp": 0.01030507, "balance_loss_clip": 1.05289698, "balance_loss_mlp": 1.02340853, "epoch": 0.4495881680995611, "flos": 19610063631360.0, "grad_norm": 1.7702014281767149, "language_loss": 0.85184038, "learning_rate": 2.4205970202360113e-06, "loss": 0.87376046, "num_input_tokens_seen": 80515260, "step": 3739, "time_per_iteration": 2.68174147605896 }, { "auxiliary_loss_clip": 0.01106496, "auxiliary_loss_mlp": 0.01022627, "balance_loss_clip": 1.04655159, "balance_loss_mlp": 1.01556945, "epoch": 0.4497084109902002, "flos": 26031465815040.0, "grad_norm": 2.952872993481104, "language_loss": 0.78129101, "learning_rate": 2.4198354366763354e-06, "loss": 0.80258226, "num_input_tokens_seen": 80533900, "step": 3740, "time_per_iteration": 2.7266125679016113 }, { "auxiliary_loss_clip": 0.0115021, "auxiliary_loss_mlp": 0.01025932, "balance_loss_clip": 1.05079257, "balance_loss_mlp": 1.01877046, "epoch": 0.4498286538808393, "flos": 14793688771200.0, "grad_norm": 1.9763943078147252, "language_loss": 0.78545737, "learning_rate": 2.4190737894279587e-06, "loss": 0.80721879, "num_input_tokens_seen": 80551270, "step": 3741, "time_per_iteration": 2.644784450531006 }, { "auxiliary_loss_clip": 0.01117593, "auxiliary_loss_mlp": 0.01023399, "balance_loss_clip": 1.04245234, "balance_loss_mlp": 1.01655674, "epoch": 0.44994889677147837, "flos": 15450171690240.0, "grad_norm": 2.2927326918300515, "language_loss": 0.81185937, "learning_rate": 2.4183120786064203e-06, "loss": 0.83326936, "num_input_tokens_seen": 80568145, "step": 3742, "time_per_iteration": 2.681561231613159 }, { "auxiliary_loss_clip": 0.01163975, "auxiliary_loss_mlp": 0.00711151, "balance_loss_clip": 1.05378461, "balance_loss_mlp": 1.00075412, "epoch": 0.4500691396621175, "flos": 21798316085760.0, "grad_norm": 5.236176660165282, "language_loss": 0.86102259, "learning_rate": 2.417550304327273e-06, "loss": 0.87977386, "num_input_tokens_seen": 80586185, "step": 3743, "time_per_iteration": 2.6060845851898193 }, { "auxiliary_loss_clip": 0.01182529, "auxiliary_loss_mlp": 0.01022855, "balance_loss_clip": 1.05446434, "balance_loss_mlp": 1.01503515, "epoch": 0.4501893825527566, "flos": 32382016421760.0, "grad_norm": 1.6800325786131622, "language_loss": 0.75691426, "learning_rate": 2.4167884667060763e-06, "loss": 0.7789681, "num_input_tokens_seen": 80608895, "step": 3744, "time_per_iteration": 2.6918280124664307 }, { "auxiliary_loss_clip": 0.01147008, "auxiliary_loss_mlp": 0.01026296, "balance_loss_clip": 1.04918158, "balance_loss_mlp": 1.01836598, "epoch": 0.45030962544339564, "flos": 16544944362240.0, "grad_norm": 2.172041714322595, "language_loss": 0.87829906, "learning_rate": 2.4160265658584e-06, "loss": 0.9000321, "num_input_tokens_seen": 80623785, "step": 3745, "time_per_iteration": 2.622363567352295 }, { "auxiliary_loss_clip": 0.01167821, "auxiliary_loss_mlp": 0.01026262, "balance_loss_clip": 1.05454326, "balance_loss_mlp": 1.018996, "epoch": 0.45042986833403476, "flos": 19573039687680.0, "grad_norm": 2.0111444172024027, "language_loss": 0.68747318, "learning_rate": 2.4152646018998253e-06, "loss": 0.70941401, "num_input_tokens_seen": 80642735, "step": 3746, "time_per_iteration": 2.6173763275146484 }, { "auxiliary_loss_clip": 0.01161231, "auxiliary_loss_mlp": 0.01025503, "balance_loss_clip": 1.05199301, "balance_loss_mlp": 1.01775157, "epoch": 0.45055011122467387, "flos": 23112467072640.0, "grad_norm": 1.9060275295206113, "language_loss": 0.71814489, "learning_rate": 2.4145025749459403e-06, "loss": 0.74001217, "num_input_tokens_seen": 80663760, "step": 3747, "time_per_iteration": 2.639820098876953 }, { "auxiliary_loss_clip": 0.01085073, "auxiliary_loss_mlp": 0.01029656, "balance_loss_clip": 1.04640532, "balance_loss_mlp": 1.02164555, "epoch": 0.4506703541153129, "flos": 19934623946880.0, "grad_norm": 2.1171185997110076, "language_loss": 0.70333743, "learning_rate": 2.413740485112344e-06, "loss": 0.72448468, "num_input_tokens_seen": 80682100, "step": 3748, "time_per_iteration": 2.785604953765869 }, { "auxiliary_loss_clip": 0.01144745, "auxiliary_loss_mlp": 0.01028421, "balance_loss_clip": 1.0526073, "balance_loss_mlp": 1.02131033, "epoch": 0.45079059700595203, "flos": 19499530504320.0, "grad_norm": 1.6225165547836373, "language_loss": 0.82318521, "learning_rate": 2.412978332514646e-06, "loss": 0.84491694, "num_input_tokens_seen": 80700880, "step": 3749, "time_per_iteration": 2.587010383605957 }, { "auxiliary_loss_clip": 0.01148814, "auxiliary_loss_mlp": 0.01025062, "balance_loss_clip": 1.05077529, "balance_loss_mlp": 1.01794219, "epoch": 0.4509108398965911, "flos": 27636313570560.0, "grad_norm": 2.878748121145458, "language_loss": 0.72413784, "learning_rate": 2.4122161172684623e-06, "loss": 0.74587661, "num_input_tokens_seen": 80721675, "step": 3750, "time_per_iteration": 2.6410610675811768 }, { "auxiliary_loss_clip": 0.01149186, "auxiliary_loss_mlp": 0.01025712, "balance_loss_clip": 1.05107963, "balance_loss_mlp": 1.01811862, "epoch": 0.4510310827872302, "flos": 20995712640000.0, "grad_norm": 9.093987009266154, "language_loss": 0.84321254, "learning_rate": 2.4114538394894216e-06, "loss": 0.8649615, "num_input_tokens_seen": 80739315, "step": 3751, "time_per_iteration": 2.533473014831543 }, { "auxiliary_loss_clip": 0.01144402, "auxiliary_loss_mlp": 0.01024823, "balance_loss_clip": 1.04704189, "balance_loss_mlp": 1.01759291, "epoch": 0.4511513256778693, "flos": 16216684945920.0, "grad_norm": 1.820902565782978, "language_loss": 0.83344012, "learning_rate": 2.410691499293161e-06, "loss": 0.85513234, "num_input_tokens_seen": 80757470, "step": 3752, "time_per_iteration": 2.5356433391571045 }, { "auxiliary_loss_clip": 0.01164258, "auxiliary_loss_mlp": 0.01025175, "balance_loss_clip": 1.0526247, "balance_loss_mlp": 1.01786709, "epoch": 0.45127156856850836, "flos": 25186702780800.0, "grad_norm": 1.7587231200613984, "language_loss": 0.73891687, "learning_rate": 2.409929096795326e-06, "loss": 0.76081127, "num_input_tokens_seen": 80777840, "step": 3753, "time_per_iteration": 2.575021743774414 }, { "auxiliary_loss_clip": 0.01164582, "auxiliary_loss_mlp": 0.0102628, "balance_loss_clip": 1.05169427, "balance_loss_mlp": 1.01910698, "epoch": 0.4513918114591475, "flos": 20412523422720.0, "grad_norm": 1.9216755408862194, "language_loss": 0.79167557, "learning_rate": 2.409166632111573e-06, "loss": 0.81358421, "num_input_tokens_seen": 80795975, "step": 3754, "time_per_iteration": 2.608405113220215 }, { "auxiliary_loss_clip": 0.0117102, "auxiliary_loss_mlp": 0.01021132, "balance_loss_clip": 1.05302715, "balance_loss_mlp": 1.0135529, "epoch": 0.4515120543497866, "flos": 26648482665600.0, "grad_norm": 2.4594932859076177, "language_loss": 0.80815971, "learning_rate": 2.4084041053575674e-06, "loss": 0.83008116, "num_input_tokens_seen": 80815395, "step": 3755, "time_per_iteration": 5.442884922027588 }, { "auxiliary_loss_clip": 0.01154024, "auxiliary_loss_mlp": 0.01030282, "balance_loss_clip": 1.05435252, "balance_loss_mlp": 1.02238464, "epoch": 0.45163229724042564, "flos": 20595093275520.0, "grad_norm": 2.0350984156700895, "language_loss": 0.72783124, "learning_rate": 2.4076415166489834e-06, "loss": 0.74967426, "num_input_tokens_seen": 80834805, "step": 3756, "time_per_iteration": 3.5799331665039062 }, { "auxiliary_loss_clip": 0.01122995, "auxiliary_loss_mlp": 0.01020878, "balance_loss_clip": 1.04863119, "balance_loss_mlp": 1.01402938, "epoch": 0.45175254013106475, "flos": 21689004021120.0, "grad_norm": 1.67523680871833, "language_loss": 0.79136145, "learning_rate": 2.406878866101506e-06, "loss": 0.81280017, "num_input_tokens_seen": 80853770, "step": 3757, "time_per_iteration": 2.6858487129211426 }, { "auxiliary_loss_clip": 0.01182088, "auxiliary_loss_mlp": 0.01027548, "balance_loss_clip": 1.05767453, "balance_loss_mlp": 1.0202558, "epoch": 0.45187278302170386, "flos": 18878850466560.0, "grad_norm": 2.937556022900389, "language_loss": 0.78597033, "learning_rate": 2.4061161538308273e-06, "loss": 0.80806673, "num_input_tokens_seen": 80870615, "step": 3758, "time_per_iteration": 2.6145288944244385 }, { "auxiliary_loss_clip": 0.01164281, "auxiliary_loss_mlp": 0.01025917, "balance_loss_clip": 1.05393696, "balance_loss_mlp": 1.01810312, "epoch": 0.4519930259123429, "flos": 18582479349120.0, "grad_norm": 3.140684985545743, "language_loss": 0.89225769, "learning_rate": 2.4053533799526523e-06, "loss": 0.91415966, "num_input_tokens_seen": 80886335, "step": 3759, "time_per_iteration": 2.5743706226348877 }, { "auxiliary_loss_clip": 0.0114226, "auxiliary_loss_mlp": 0.01021861, "balance_loss_clip": 1.05151427, "balance_loss_mlp": 1.01495898, "epoch": 0.452113268802982, "flos": 25192377129600.0, "grad_norm": 1.762999572373544, "language_loss": 0.8633076, "learning_rate": 2.404590544582691e-06, "loss": 0.88494885, "num_input_tokens_seen": 80904570, "step": 3760, "time_per_iteration": 2.7276971340179443 }, { "auxiliary_loss_clip": 0.01118982, "auxiliary_loss_mlp": 0.0102706, "balance_loss_clip": 1.04572487, "balance_loss_mlp": 1.0192188, "epoch": 0.45223351169362114, "flos": 39378922312320.0, "grad_norm": 2.156479023922719, "language_loss": 0.81137764, "learning_rate": 2.403827647836666e-06, "loss": 0.83283806, "num_input_tokens_seen": 80925125, "step": 3761, "time_per_iteration": 2.8431360721588135 }, { "auxiliary_loss_clip": 0.01179101, "auxiliary_loss_mlp": 0.01027579, "balance_loss_clip": 1.05242538, "balance_loss_mlp": 1.01981223, "epoch": 0.4523537545842602, "flos": 21582169994880.0, "grad_norm": 2.3566240531301563, "language_loss": 0.6942873, "learning_rate": 2.4030646898303075e-06, "loss": 0.71635407, "num_input_tokens_seen": 80946615, "step": 3762, "time_per_iteration": 2.674048900604248 }, { "auxiliary_loss_clip": 0.01153811, "auxiliary_loss_mlp": 0.01025974, "balance_loss_clip": 1.05206704, "balance_loss_mlp": 1.01886654, "epoch": 0.4524739974748993, "flos": 28439527547520.0, "grad_norm": 2.131283517039849, "language_loss": 0.82248342, "learning_rate": 2.4023016706793566e-06, "loss": 0.84428132, "num_input_tokens_seen": 80966410, "step": 3763, "time_per_iteration": 2.7127232551574707 }, { "auxiliary_loss_clip": 0.01075623, "auxiliary_loss_mlp": 0.01014806, "balance_loss_clip": 1.04225802, "balance_loss_mlp": 1.01342928, "epoch": 0.4525942403655384, "flos": 61556492148480.0, "grad_norm": 0.7790051530351834, "language_loss": 0.56866491, "learning_rate": 2.401538590499561e-06, "loss": 0.58956921, "num_input_tokens_seen": 81026865, "step": 3764, "time_per_iteration": 3.369040012359619 }, { "auxiliary_loss_clip": 0.01168261, "auxiliary_loss_mlp": 0.00711388, "balance_loss_clip": 1.05416846, "balance_loss_mlp": 1.00069451, "epoch": 0.45271448325617747, "flos": 27529838680320.0, "grad_norm": 1.9610455142883512, "language_loss": 0.71646833, "learning_rate": 2.400775449406682e-06, "loss": 0.73526478, "num_input_tokens_seen": 81050060, "step": 3765, "time_per_iteration": 2.7230756282806396 }, { "auxiliary_loss_clip": 0.01161379, "auxiliary_loss_mlp": 0.01021168, "balance_loss_clip": 1.05063987, "balance_loss_mlp": 1.01429546, "epoch": 0.4528347261468166, "flos": 22452608275200.0, "grad_norm": 2.12136669830026, "language_loss": 0.72969681, "learning_rate": 2.400012247516485e-06, "loss": 0.7515223, "num_input_tokens_seen": 81070625, "step": 3766, "time_per_iteration": 2.630669355392456 }, { "auxiliary_loss_clip": 0.01135178, "auxiliary_loss_mlp": 0.01027055, "balance_loss_clip": 1.05044258, "balance_loss_mlp": 1.01973248, "epoch": 0.45295496903745563, "flos": 21103875469440.0, "grad_norm": 1.9375359454896683, "language_loss": 0.90399313, "learning_rate": 2.3992489849447484e-06, "loss": 0.92561543, "num_input_tokens_seen": 81089080, "step": 3767, "time_per_iteration": 2.7341442108154297 }, { "auxiliary_loss_clip": 0.01139581, "auxiliary_loss_mlp": 0.01028568, "balance_loss_clip": 1.05168378, "balance_loss_mlp": 1.02143931, "epoch": 0.45307521192809475, "flos": 23221168606080.0, "grad_norm": 1.873940986908636, "language_loss": 0.79026055, "learning_rate": 2.3984856618072584e-06, "loss": 0.8119421, "num_input_tokens_seen": 81109115, "step": 3768, "time_per_iteration": 2.6668946743011475 }, { "auxiliary_loss_clip": 0.01137383, "auxiliary_loss_mlp": 0.01024686, "balance_loss_clip": 1.05000627, "balance_loss_mlp": 1.0180341, "epoch": 0.45319545481873386, "flos": 15560094286080.0, "grad_norm": 2.066887706705572, "language_loss": 0.74071443, "learning_rate": 2.3977222782198098e-06, "loss": 0.76233512, "num_input_tokens_seen": 81127750, "step": 3769, "time_per_iteration": 2.7156498432159424 }, { "auxiliary_loss_clip": 0.01124229, "auxiliary_loss_mlp": 0.01029994, "balance_loss_clip": 1.04670858, "balance_loss_mlp": 1.02217984, "epoch": 0.4533156977093729, "flos": 21944759834880.0, "grad_norm": 2.080870083615059, "language_loss": 0.75511003, "learning_rate": 2.3969588342982077e-06, "loss": 0.77665234, "num_input_tokens_seen": 81147125, "step": 3770, "time_per_iteration": 2.697392225265503 }, { "auxiliary_loss_clip": 0.011604, "auxiliary_loss_mlp": 0.01021537, "balance_loss_clip": 1.05285716, "balance_loss_mlp": 1.01419926, "epoch": 0.453435940600012, "flos": 24242180699520.0, "grad_norm": 1.503209733977893, "language_loss": 0.72288942, "learning_rate": 2.396195330158267e-06, "loss": 0.74470878, "num_input_tokens_seen": 81167015, "step": 3771, "time_per_iteration": 2.6514275074005127 }, { "auxiliary_loss_clip": 0.01178743, "auxiliary_loss_mlp": 0.01025828, "balance_loss_clip": 1.05306756, "balance_loss_mlp": 1.01843715, "epoch": 0.45355618349065113, "flos": 23440367352960.0, "grad_norm": 2.6931784442951323, "language_loss": 0.7923277, "learning_rate": 2.3954317659158094e-06, "loss": 0.81437337, "num_input_tokens_seen": 81187350, "step": 3772, "time_per_iteration": 2.6318788528442383 }, { "auxiliary_loss_clip": 0.01110384, "auxiliary_loss_mlp": 0.01004287, "balance_loss_clip": 1.04417861, "balance_loss_mlp": 1.00289202, "epoch": 0.4536764263812902, "flos": 66903161448960.0, "grad_norm": 0.9191971170446231, "language_loss": 0.56951725, "learning_rate": 2.394668141686667e-06, "loss": 0.59066403, "num_input_tokens_seen": 81249315, "step": 3773, "time_per_iteration": 3.2300426959991455 }, { "auxiliary_loss_clip": 0.01159253, "auxiliary_loss_mlp": 0.01025789, "balance_loss_clip": 1.05116761, "balance_loss_mlp": 1.01852298, "epoch": 0.4537966692719293, "flos": 42739766254080.0, "grad_norm": 1.8988469106763328, "language_loss": 0.69382513, "learning_rate": 2.3939044575866813e-06, "loss": 0.71567553, "num_input_tokens_seen": 81272065, "step": 3774, "time_per_iteration": 2.834911346435547 }, { "auxiliary_loss_clip": 0.01145339, "auxiliary_loss_mlp": 0.00711565, "balance_loss_clip": 1.05066156, "balance_loss_mlp": 1.00067759, "epoch": 0.4539169121625684, "flos": 35549480517120.0, "grad_norm": 2.409989995873832, "language_loss": 0.75405002, "learning_rate": 2.3931407137317024e-06, "loss": 0.77261907, "num_input_tokens_seen": 81292220, "step": 3775, "time_per_iteration": 2.8054821491241455 }, { "auxiliary_loss_clip": 0.01130623, "auxiliary_loss_mlp": 0.0102704, "balance_loss_clip": 1.04704797, "balance_loss_mlp": 1.01972008, "epoch": 0.45403715505320746, "flos": 18514716341760.0, "grad_norm": 2.15718146753176, "language_loss": 0.84697294, "learning_rate": 2.3923769102375907e-06, "loss": 0.86854959, "num_input_tokens_seen": 81311085, "step": 3776, "time_per_iteration": 2.7215960025787354 }, { "auxiliary_loss_clip": 0.01132412, "auxiliary_loss_mlp": 0.01028936, "balance_loss_clip": 1.04747367, "balance_loss_mlp": 1.02146196, "epoch": 0.4541573979438466, "flos": 25045825639680.0, "grad_norm": 2.6899677413734144, "language_loss": 0.78553349, "learning_rate": 2.391613047220213e-06, "loss": 0.80714697, "num_input_tokens_seen": 81330985, "step": 3777, "time_per_iteration": 2.6930055618286133 }, { "auxiliary_loss_clip": 0.01123388, "auxiliary_loss_mlp": 0.01026059, "balance_loss_clip": 1.04779303, "balance_loss_mlp": 1.01848316, "epoch": 0.4542776408344857, "flos": 18332397884160.0, "grad_norm": 2.478899551257643, "language_loss": 0.79104811, "learning_rate": 2.390849124795447e-06, "loss": 0.81254262, "num_input_tokens_seen": 81346985, "step": 3778, "time_per_iteration": 2.6841089725494385 }, { "auxiliary_loss_clip": 0.01181446, "auxiliary_loss_mlp": 0.01026428, "balance_loss_clip": 1.05532205, "balance_loss_mlp": 1.01878047, "epoch": 0.45439788372512474, "flos": 20701173116160.0, "grad_norm": 2.0679480325449697, "language_loss": 0.84239584, "learning_rate": 2.3900851430791804e-06, "loss": 0.86447459, "num_input_tokens_seen": 81365005, "step": 3779, "time_per_iteration": 2.5590760707855225 }, { "auxiliary_loss_clip": 0.01184674, "auxiliary_loss_mlp": 0.01029221, "balance_loss_clip": 1.05646932, "balance_loss_mlp": 1.02105498, "epoch": 0.45451812661576385, "flos": 22309432663680.0, "grad_norm": 4.72040432461376, "language_loss": 0.84475023, "learning_rate": 2.389321102187307e-06, "loss": 0.86688918, "num_input_tokens_seen": 81383785, "step": 3780, "time_per_iteration": 2.6435587406158447 }, { "auxiliary_loss_clip": 0.01150028, "auxiliary_loss_mlp": 0.00712213, "balance_loss_clip": 1.05407739, "balance_loss_mlp": 1.00063539, "epoch": 0.4546383695064029, "flos": 21763303303680.0, "grad_norm": 2.1800969290271, "language_loss": 0.81474966, "learning_rate": 2.3885570022357326e-06, "loss": 0.83337212, "num_input_tokens_seen": 81402915, "step": 3781, "time_per_iteration": 4.613722324371338 }, { "auxiliary_loss_clip": 0.01078353, "auxiliary_loss_mlp": 0.01006709, "balance_loss_clip": 1.0452981, "balance_loss_mlp": 1.00520122, "epoch": 0.454758612397042, "flos": 64242755694720.0, "grad_norm": 0.8058325358761584, "language_loss": 0.60886472, "learning_rate": 2.38779284334037e-06, "loss": 0.62971538, "num_input_tokens_seen": 81467890, "step": 3782, "time_per_iteration": 4.195195436477661 }, { "auxiliary_loss_clip": 0.01101922, "auxiliary_loss_mlp": 0.01026543, "balance_loss_clip": 1.0437839, "balance_loss_mlp": 1.01915753, "epoch": 0.4548788552876811, "flos": 27304175485440.0, "grad_norm": 2.1038268104179467, "language_loss": 0.79119098, "learning_rate": 2.387028625617141e-06, "loss": 0.81247568, "num_input_tokens_seen": 81487105, "step": 3783, "time_per_iteration": 2.6974525451660156 }, { "auxiliary_loss_clip": 0.01136796, "auxiliary_loss_mlp": 0.0102359, "balance_loss_clip": 1.04809022, "balance_loss_mlp": 1.01664567, "epoch": 0.4549990981783202, "flos": 22857142222080.0, "grad_norm": 4.492853081313329, "language_loss": 0.84845638, "learning_rate": 2.3862643491819766e-06, "loss": 0.87006027, "num_input_tokens_seen": 81505670, "step": 3784, "time_per_iteration": 2.669712781906128 }, { "auxiliary_loss_clip": 0.0116215, "auxiliary_loss_mlp": 0.01027175, "balance_loss_clip": 1.05149615, "balance_loss_mlp": 1.01949787, "epoch": 0.4551193410689593, "flos": 23258587599360.0, "grad_norm": 1.7643055171355788, "language_loss": 0.84595484, "learning_rate": 2.3855000141508186e-06, "loss": 0.86784816, "num_input_tokens_seen": 81525825, "step": 3785, "time_per_iteration": 2.704719305038452 }, { "auxiliary_loss_clip": 0.01155925, "auxiliary_loss_mlp": 0.0103517, "balance_loss_clip": 1.05513799, "balance_loss_mlp": 1.02757668, "epoch": 0.4552395839595984, "flos": 20777519473920.0, "grad_norm": 2.2632669970211285, "language_loss": 0.84249318, "learning_rate": 2.3847356206396143e-06, "loss": 0.8644042, "num_input_tokens_seen": 81543135, "step": 3786, "time_per_iteration": 2.6817984580993652 }, { "auxiliary_loss_clip": 0.01181429, "auxiliary_loss_mlp": 0.01027423, "balance_loss_clip": 1.05686283, "balance_loss_mlp": 1.01988852, "epoch": 0.45535982685023746, "flos": 23257510191360.0, "grad_norm": 1.5292873746143676, "language_loss": 0.78708446, "learning_rate": 2.3839711687643227e-06, "loss": 0.80917299, "num_input_tokens_seen": 81564360, "step": 3787, "time_per_iteration": 2.5656166076660156 }, { "auxiliary_loss_clip": 0.01164591, "auxiliary_loss_mlp": 0.01028869, "balance_loss_clip": 1.05370903, "balance_loss_mlp": 1.02088773, "epoch": 0.45548006974087657, "flos": 19646117907840.0, "grad_norm": 2.625746271397017, "language_loss": 0.73773193, "learning_rate": 2.38320665864091e-06, "loss": 0.75966644, "num_input_tokens_seen": 81583710, "step": 3788, "time_per_iteration": 2.630930185317993 }, { "auxiliary_loss_clip": 0.01102262, "auxiliary_loss_mlp": 0.01029576, "balance_loss_clip": 1.04496336, "balance_loss_mlp": 1.02130866, "epoch": 0.4556003126315157, "flos": 20047778766720.0, "grad_norm": 1.7997618574790126, "language_loss": 0.81736761, "learning_rate": 2.3824420903853516e-06, "loss": 0.83868593, "num_input_tokens_seen": 81602175, "step": 3789, "time_per_iteration": 2.721111536026001 }, { "auxiliary_loss_clip": 0.01164123, "auxiliary_loss_mlp": 0.01026586, "balance_loss_clip": 1.05511117, "balance_loss_mlp": 1.01879585, "epoch": 0.45572055552215474, "flos": 22959738443520.0, "grad_norm": 2.620860801134461, "language_loss": 0.82197082, "learning_rate": 2.3816774641136324e-06, "loss": 0.84387791, "num_input_tokens_seen": 81619430, "step": 3790, "time_per_iteration": 2.6100661754608154 }, { "auxiliary_loss_clip": 0.0116469, "auxiliary_loss_mlp": 0.00711474, "balance_loss_clip": 1.05476832, "balance_loss_mlp": 1.00065148, "epoch": 0.45584079841279385, "flos": 33109925535360.0, "grad_norm": 1.9283984810221262, "language_loss": 0.71535671, "learning_rate": 2.380912779941745e-06, "loss": 0.7341184, "num_input_tokens_seen": 81642550, "step": 3791, "time_per_iteration": 2.7128117084503174 }, { "auxiliary_loss_clip": 0.01166726, "auxiliary_loss_mlp": 0.01026336, "balance_loss_clip": 1.05099702, "balance_loss_mlp": 1.01820636, "epoch": 0.45596104130343296, "flos": 27272179445760.0, "grad_norm": 2.277394536264046, "language_loss": 0.83573234, "learning_rate": 2.3801480379856918e-06, "loss": 0.85766298, "num_input_tokens_seen": 81664260, "step": 3792, "time_per_iteration": 2.6446123123168945 }, { "auxiliary_loss_clip": 0.01152396, "auxiliary_loss_mlp": 0.01027068, "balance_loss_clip": 1.05406141, "balance_loss_mlp": 1.01958728, "epoch": 0.456081284194072, "flos": 21579799697280.0, "grad_norm": 3.230341752793307, "language_loss": 0.83846009, "learning_rate": 2.379383238361484e-06, "loss": 0.86025476, "num_input_tokens_seen": 81683620, "step": 3793, "time_per_iteration": 2.6098861694335938 }, { "auxiliary_loss_clip": 0.01160451, "auxiliary_loss_mlp": 0.01022987, "balance_loss_clip": 1.05047083, "balance_loss_mlp": 1.01581001, "epoch": 0.4562015270847111, "flos": 35918822113920.0, "grad_norm": 2.5590335199308516, "language_loss": 0.7939924, "learning_rate": 2.3786183811851407e-06, "loss": 0.81582677, "num_input_tokens_seen": 81704325, "step": 3794, "time_per_iteration": 2.75274920463562 }, { "auxiliary_loss_clip": 0.01181937, "auxiliary_loss_mlp": 0.01026895, "balance_loss_clip": 1.05557835, "balance_loss_mlp": 1.01939642, "epoch": 0.45632176997535023, "flos": 13589783602560.0, "grad_norm": 1.9141193207128049, "language_loss": 0.80350918, "learning_rate": 2.3778534665726892e-06, "loss": 0.82559752, "num_input_tokens_seen": 81721155, "step": 3795, "time_per_iteration": 2.6450047492980957 }, { "auxiliary_loss_clip": 0.01157159, "auxiliary_loss_mlp": 0.01022454, "balance_loss_clip": 1.05280089, "balance_loss_mlp": 1.01568913, "epoch": 0.4564420128659893, "flos": 32635401937920.0, "grad_norm": 1.8183929983423717, "language_loss": 0.72465944, "learning_rate": 2.377088494640168e-06, "loss": 0.74645561, "num_input_tokens_seen": 81742905, "step": 3796, "time_per_iteration": 2.773730993270874 }, { "auxiliary_loss_clip": 0.01163373, "auxiliary_loss_mlp": 0.01022877, "balance_loss_clip": 1.05609965, "balance_loss_mlp": 1.0149914, "epoch": 0.4565622557566284, "flos": 20377690208640.0, "grad_norm": 1.9838044437342228, "language_loss": 0.78474003, "learning_rate": 2.3763234655036216e-06, "loss": 0.80660254, "num_input_tokens_seen": 81762105, "step": 3797, "time_per_iteration": 2.6669762134552 }, { "auxiliary_loss_clip": 0.01130785, "auxiliary_loss_mlp": 0.01027683, "balance_loss_clip": 1.04649365, "balance_loss_mlp": 1.02056015, "epoch": 0.45668249864726745, "flos": 25374372364800.0, "grad_norm": 2.045158380681259, "language_loss": 0.87212557, "learning_rate": 2.3755583792791046e-06, "loss": 0.89371026, "num_input_tokens_seen": 81781975, "step": 3798, "time_per_iteration": 2.680251359939575 }, { "auxiliary_loss_clip": 0.01164781, "auxiliary_loss_mlp": 0.01025081, "balance_loss_clip": 1.05203319, "balance_loss_mlp": 1.01759493, "epoch": 0.45680274153790656, "flos": 15559806977280.0, "grad_norm": 4.283094437384374, "language_loss": 0.74998564, "learning_rate": 2.3747932360826803e-06, "loss": 0.7718842, "num_input_tokens_seen": 81798905, "step": 3799, "time_per_iteration": 2.614345073699951 }, { "auxiliary_loss_clip": 0.01162499, "auxiliary_loss_mlp": 0.01022873, "balance_loss_clip": 1.05258632, "balance_loss_mlp": 1.01513958, "epoch": 0.4569229844285457, "flos": 19792884879360.0, "grad_norm": 2.1780834152885564, "language_loss": 0.8240121, "learning_rate": 2.3740280360304205e-06, "loss": 0.84586585, "num_input_tokens_seen": 81816630, "step": 3800, "time_per_iteration": 2.606808662414551 }, { "auxiliary_loss_clip": 0.01129812, "auxiliary_loss_mlp": 0.01027326, "balance_loss_clip": 1.0482831, "balance_loss_mlp": 1.0200876, "epoch": 0.45704322731918473, "flos": 24093941270400.0, "grad_norm": 2.154214237598063, "language_loss": 0.68165445, "learning_rate": 2.3732627792384038e-06, "loss": 0.70322585, "num_input_tokens_seen": 81837700, "step": 3801, "time_per_iteration": 2.6749300956726074 }, { "auxiliary_loss_clip": 0.01181732, "auxiliary_loss_mlp": 0.0102426, "balance_loss_clip": 1.05489969, "balance_loss_mlp": 1.01710773, "epoch": 0.45716347020982384, "flos": 31317803245440.0, "grad_norm": 1.899247573609975, "language_loss": 0.75541091, "learning_rate": 2.3724974658227207e-06, "loss": 0.77747083, "num_input_tokens_seen": 81858490, "step": 3802, "time_per_iteration": 2.6347737312316895 }, { "auxiliary_loss_clip": 0.0114666, "auxiliary_loss_mlp": 0.00711425, "balance_loss_clip": 1.05270481, "balance_loss_mlp": 1.00055885, "epoch": 0.45728371310046295, "flos": 26501392471680.0, "grad_norm": 1.9896892175826808, "language_loss": 0.71442389, "learning_rate": 2.3717320958994687e-06, "loss": 0.73300481, "num_input_tokens_seen": 81876050, "step": 3803, "time_per_iteration": 2.7228007316589355 }, { "auxiliary_loss_clip": 0.01131639, "auxiliary_loss_mlp": 0.01025957, "balance_loss_clip": 1.04518533, "balance_loss_mlp": 1.01856351, "epoch": 0.457403955991102, "flos": 17929408222080.0, "grad_norm": 5.3453130332606635, "language_loss": 0.70607233, "learning_rate": 2.3709666695847534e-06, "loss": 0.72764826, "num_input_tokens_seen": 81894230, "step": 3804, "time_per_iteration": 2.631410598754883 }, { "auxiliary_loss_clip": 0.01111901, "auxiliary_loss_mlp": 0.01022742, "balance_loss_clip": 1.04758644, "balance_loss_mlp": 1.0153389, "epoch": 0.4575241988817411, "flos": 42230660837760.0, "grad_norm": 2.1717699750546373, "language_loss": 0.70427489, "learning_rate": 2.370201186994689e-06, "loss": 0.72562128, "num_input_tokens_seen": 81917915, "step": 3805, "time_per_iteration": 2.900660276412964 }, { "auxiliary_loss_clip": 0.01142115, "auxiliary_loss_mlp": 0.01022152, "balance_loss_clip": 1.05107105, "balance_loss_mlp": 1.01455283, "epoch": 0.45764444177238023, "flos": 30117309868800.0, "grad_norm": 2.076595792021217, "language_loss": 0.70068896, "learning_rate": 2.369435648245399e-06, "loss": 0.72233164, "num_input_tokens_seen": 81938130, "step": 3806, "time_per_iteration": 2.7036707401275635 }, { "auxiliary_loss_clip": 0.01147114, "auxiliary_loss_mlp": 0.01023415, "balance_loss_clip": 1.05226922, "balance_loss_mlp": 1.01616955, "epoch": 0.4577646846630193, "flos": 24060293205120.0, "grad_norm": 2.0917210285558814, "language_loss": 0.84941304, "learning_rate": 2.368670053453015e-06, "loss": 0.87111837, "num_input_tokens_seen": 81959820, "step": 3807, "time_per_iteration": 4.602996349334717 }, { "auxiliary_loss_clip": 0.01170246, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 1.05537534, "balance_loss_mlp": 1.02363443, "epoch": 0.4578849275536584, "flos": 17418578952960.0, "grad_norm": 2.91206030829986, "language_loss": 0.74768651, "learning_rate": 2.3679044027336757e-06, "loss": 0.76970857, "num_input_tokens_seen": 81975710, "step": 3808, "time_per_iteration": 3.49434494972229 }, { "auxiliary_loss_clip": 0.0118153, "auxiliary_loss_mlp": 0.01026439, "balance_loss_clip": 1.05443132, "balance_loss_mlp": 1.01899421, "epoch": 0.4580051704442975, "flos": 13510169107200.0, "grad_norm": 3.07754322258644, "language_loss": 0.69207346, "learning_rate": 2.3671386962035326e-06, "loss": 0.71415317, "num_input_tokens_seen": 81993180, "step": 3809, "time_per_iteration": 2.564845085144043 }, { "auxiliary_loss_clip": 0.01166957, "auxiliary_loss_mlp": 0.0102613, "balance_loss_clip": 1.05394542, "balance_loss_mlp": 1.01845026, "epoch": 0.45812541333493656, "flos": 18037606965120.0, "grad_norm": 2.3204644273292567, "language_loss": 0.68751383, "learning_rate": 2.3663729339787405e-06, "loss": 0.70944476, "num_input_tokens_seen": 82010115, "step": 3810, "time_per_iteration": 2.58154034614563 }, { "auxiliary_loss_clip": 0.01180476, "auxiliary_loss_mlp": 0.01023374, "balance_loss_clip": 1.05478668, "balance_loss_mlp": 1.01559007, "epoch": 0.45824565622557567, "flos": 20222196232320.0, "grad_norm": 3.0550830490641117, "language_loss": 0.73863482, "learning_rate": 2.365607116175466e-06, "loss": 0.76067328, "num_input_tokens_seen": 82025540, "step": 3811, "time_per_iteration": 2.613718271255493 }, { "auxiliary_loss_clip": 0.01179708, "auxiliary_loss_mlp": 0.01023426, "balance_loss_clip": 1.05452967, "balance_loss_mlp": 1.01588273, "epoch": 0.4583658991162148, "flos": 19864885691520.0, "grad_norm": 4.124081805748985, "language_loss": 0.67020619, "learning_rate": 2.3648412429098825e-06, "loss": 0.6922375, "num_input_tokens_seen": 82043890, "step": 3812, "time_per_iteration": 2.5760140419006348 }, { "auxiliary_loss_clip": 0.01128353, "auxiliary_loss_mlp": 0.01028559, "balance_loss_clip": 1.05032969, "balance_loss_mlp": 1.02036381, "epoch": 0.45848614200685384, "flos": 21029935322880.0, "grad_norm": 2.4514774720208248, "language_loss": 0.81906331, "learning_rate": 2.364075314298172e-06, "loss": 0.84063238, "num_input_tokens_seen": 82061345, "step": 3813, "time_per_iteration": 2.7061660289764404 }, { "auxiliary_loss_clip": 0.01167208, "auxiliary_loss_mlp": 0.00711463, "balance_loss_clip": 1.05333042, "balance_loss_mlp": 1.00052881, "epoch": 0.45860638489749295, "flos": 21069293650560.0, "grad_norm": 2.1043694255269743, "language_loss": 0.7040543, "learning_rate": 2.3633093304565267e-06, "loss": 0.72284102, "num_input_tokens_seen": 82080400, "step": 3814, "time_per_iteration": 2.5854032039642334 }, { "auxiliary_loss_clip": 0.01185036, "auxiliary_loss_mlp": 0.01027775, "balance_loss_clip": 1.05607128, "balance_loss_mlp": 1.0201515, "epoch": 0.458726627788132, "flos": 26833889692800.0, "grad_norm": 2.5969948197501784, "language_loss": 0.63379478, "learning_rate": 2.3625432915011443e-06, "loss": 0.65592295, "num_input_tokens_seen": 82102310, "step": 3815, "time_per_iteration": 2.6291682720184326 }, { "auxiliary_loss_clip": 0.0114138, "auxiliary_loss_mlp": 0.01027892, "balance_loss_clip": 1.048877, "balance_loss_mlp": 1.02026236, "epoch": 0.4588468706787711, "flos": 24097927680000.0, "grad_norm": 1.7459713193511208, "language_loss": 0.65445805, "learning_rate": 2.3617771975482334e-06, "loss": 0.6761508, "num_input_tokens_seen": 82121140, "step": 3816, "time_per_iteration": 2.629992961883545 }, { "auxiliary_loss_clip": 0.01115069, "auxiliary_loss_mlp": 0.01026194, "balance_loss_clip": 1.04780924, "balance_loss_mlp": 1.0185889, "epoch": 0.4589671135694102, "flos": 17889331622400.0, "grad_norm": 1.86545895578675, "language_loss": 0.7483865, "learning_rate": 2.3610110487140083e-06, "loss": 0.76979917, "num_input_tokens_seen": 82139575, "step": 3817, "time_per_iteration": 2.720956325531006 }, { "auxiliary_loss_clip": 0.01150418, "auxiliary_loss_mlp": 0.01027453, "balance_loss_clip": 1.05230403, "balance_loss_mlp": 1.0200913, "epoch": 0.4590873564600493, "flos": 25626967781760.0, "grad_norm": 1.9315688246393419, "language_loss": 0.80882853, "learning_rate": 2.360244845114695e-06, "loss": 0.83060724, "num_input_tokens_seen": 82159195, "step": 3818, "time_per_iteration": 2.6792685985565186 }, { "auxiliary_loss_clip": 0.01144036, "auxiliary_loss_mlp": 0.01024238, "balance_loss_clip": 1.05033088, "balance_loss_mlp": 1.01660228, "epoch": 0.4592075993506884, "flos": 18514788168960.0, "grad_norm": 2.8604343849115557, "language_loss": 0.68398428, "learning_rate": 2.3594785868665245e-06, "loss": 0.70566696, "num_input_tokens_seen": 82175500, "step": 3819, "time_per_iteration": 2.6203339099884033 }, { "auxiliary_loss_clip": 0.01132436, "auxiliary_loss_mlp": 0.0071171, "balance_loss_clip": 1.04807377, "balance_loss_mlp": 1.00061703, "epoch": 0.4593278422413275, "flos": 20631111638400.0, "grad_norm": 3.586965696781632, "language_loss": 0.80470204, "learning_rate": 2.3587122740857386e-06, "loss": 0.82314348, "num_input_tokens_seen": 82192600, "step": 3820, "time_per_iteration": 2.670715093612671 }, { "auxiliary_loss_clip": 0.01163994, "auxiliary_loss_mlp": 0.01020869, "balance_loss_clip": 1.05241942, "balance_loss_mlp": 1.01336467, "epoch": 0.45944808513196655, "flos": 21358517961600.0, "grad_norm": 1.7396829914729919, "language_loss": 0.77916527, "learning_rate": 2.357945906888586e-06, "loss": 0.80101383, "num_input_tokens_seen": 82212040, "step": 3821, "time_per_iteration": 2.6676228046417236 }, { "auxiliary_loss_clip": 0.01165316, "auxiliary_loss_mlp": 0.01027058, "balance_loss_clip": 1.05271363, "balance_loss_mlp": 1.01911902, "epoch": 0.45956832802260567, "flos": 21427789340160.0, "grad_norm": 3.9811960256478014, "language_loss": 0.80048454, "learning_rate": 2.357179485391324e-06, "loss": 0.82240832, "num_input_tokens_seen": 82229895, "step": 3822, "time_per_iteration": 2.578986644744873 }, { "auxiliary_loss_clip": 0.01177535, "auxiliary_loss_mlp": 0.01025726, "balance_loss_clip": 1.053967, "balance_loss_mlp": 1.01874065, "epoch": 0.4596885709132448, "flos": 22382654538240.0, "grad_norm": 1.86102833040062, "language_loss": 0.86029875, "learning_rate": 2.3564130097102173e-06, "loss": 0.88233137, "num_input_tokens_seen": 82249550, "step": 3823, "time_per_iteration": 2.6035258769989014 }, { "auxiliary_loss_clip": 0.01140217, "auxiliary_loss_mlp": 0.01029383, "balance_loss_clip": 1.05075431, "balance_loss_mlp": 1.02202225, "epoch": 0.45980881380388383, "flos": 28981957806720.0, "grad_norm": 2.0918381551605694, "language_loss": 0.75317299, "learning_rate": 2.355646479961541e-06, "loss": 0.77486897, "num_input_tokens_seen": 82268860, "step": 3824, "time_per_iteration": 2.789717674255371 }, { "auxiliary_loss_clip": 0.01177618, "auxiliary_loss_mlp": 0.01025127, "balance_loss_clip": 1.05072331, "balance_loss_mlp": 1.01775956, "epoch": 0.45992905669452294, "flos": 33396599980800.0, "grad_norm": 2.0514286513552324, "language_loss": 0.71696424, "learning_rate": 2.354879896261576e-06, "loss": 0.73899162, "num_input_tokens_seen": 82289070, "step": 3825, "time_per_iteration": 2.664646863937378 }, { "auxiliary_loss_clip": 0.01127617, "auxiliary_loss_mlp": 0.01024827, "balance_loss_clip": 1.04950273, "balance_loss_mlp": 1.01724255, "epoch": 0.46004929958516205, "flos": 36318184502400.0, "grad_norm": 1.929045087431728, "language_loss": 0.56722301, "learning_rate": 2.3541132587266133e-06, "loss": 0.58874744, "num_input_tokens_seen": 82311790, "step": 3826, "time_per_iteration": 2.795649528503418 }, { "auxiliary_loss_clip": 0.01135414, "auxiliary_loss_mlp": 0.01023278, "balance_loss_clip": 1.04803896, "balance_loss_mlp": 1.01536858, "epoch": 0.4601695424758011, "flos": 17238451224960.0, "grad_norm": 2.0438950866141603, "language_loss": 0.68963361, "learning_rate": 2.3533465674729515e-06, "loss": 0.71122056, "num_input_tokens_seen": 82329020, "step": 3827, "time_per_iteration": 2.659651756286621 }, { "auxiliary_loss_clip": 0.01178321, "auxiliary_loss_mlp": 0.01031422, "balance_loss_clip": 1.05323768, "balance_loss_mlp": 1.02283287, "epoch": 0.4602897853664402, "flos": 15888425529600.0, "grad_norm": 2.2381458262410523, "language_loss": 0.72916049, "learning_rate": 2.352579822616895e-06, "loss": 0.7512579, "num_input_tokens_seen": 82346455, "step": 3828, "time_per_iteration": 2.5843570232391357 }, { "auxiliary_loss_clip": 0.01147993, "auxiliary_loss_mlp": 0.01026096, "balance_loss_clip": 1.05029833, "balance_loss_mlp": 1.01862717, "epoch": 0.4604100282570793, "flos": 25412617370880.0, "grad_norm": 1.7590640711490992, "language_loss": 0.77514195, "learning_rate": 2.351813024274761e-06, "loss": 0.79688281, "num_input_tokens_seen": 82367810, "step": 3829, "time_per_iteration": 2.7201693058013916 }, { "auxiliary_loss_clip": 0.01138413, "auxiliary_loss_mlp": 0.01026858, "balance_loss_clip": 1.05232406, "balance_loss_mlp": 1.01982486, "epoch": 0.4605302711477184, "flos": 27630711048960.0, "grad_norm": 2.290058085437996, "language_loss": 0.7351743, "learning_rate": 2.3510461725628693e-06, "loss": 0.756827, "num_input_tokens_seen": 82388275, "step": 3830, "time_per_iteration": 2.7778308391571045 }, { "auxiliary_loss_clip": 0.01134377, "auxiliary_loss_mlp": 0.01024457, "balance_loss_clip": 1.05023444, "balance_loss_mlp": 1.01694083, "epoch": 0.4606505140383575, "flos": 23839657914240.0, "grad_norm": 1.7924469777771521, "language_loss": 0.70899081, "learning_rate": 2.350279267597554e-06, "loss": 0.7305792, "num_input_tokens_seen": 82408915, "step": 3831, "time_per_iteration": 2.747358560562134 }, { "auxiliary_loss_clip": 0.01164986, "auxiliary_loss_mlp": 0.01025944, "balance_loss_clip": 1.05316675, "balance_loss_mlp": 1.01759303, "epoch": 0.46077075692899655, "flos": 16107013745280.0, "grad_norm": 2.7027464139226804, "language_loss": 0.8263365, "learning_rate": 2.3495123094951515e-06, "loss": 0.84824574, "num_input_tokens_seen": 82427260, "step": 3832, "time_per_iteration": 2.649301052093506 }, { "auxiliary_loss_clip": 0.01141114, "auxiliary_loss_mlp": 0.01027945, "balance_loss_clip": 1.05077088, "balance_loss_mlp": 1.02028561, "epoch": 0.46089099981963566, "flos": 48798147634560.0, "grad_norm": 2.4588672818042046, "language_loss": 0.75574088, "learning_rate": 2.34874529837201e-06, "loss": 0.77743143, "num_input_tokens_seen": 82450805, "step": 3833, "time_per_iteration": 4.7427356243133545 }, { "auxiliary_loss_clip": 0.01095759, "auxiliary_loss_mlp": 0.01027853, "balance_loss_clip": 1.04502082, "balance_loss_mlp": 1.02058101, "epoch": 0.46101124271027477, "flos": 19099234362240.0, "grad_norm": 1.863728516210877, "language_loss": 0.78882849, "learning_rate": 2.347978234344483e-06, "loss": 0.81006461, "num_input_tokens_seen": 82467010, "step": 3834, "time_per_iteration": 3.6741387844085693 }, { "auxiliary_loss_clip": 0.01171663, "auxiliary_loss_mlp": 0.01027155, "balance_loss_clip": 1.05623794, "balance_loss_mlp": 1.01941276, "epoch": 0.4611314856009138, "flos": 39347931853440.0, "grad_norm": 1.934343015874534, "language_loss": 0.69095314, "learning_rate": 2.347211117528935e-06, "loss": 0.71294135, "num_input_tokens_seen": 82489310, "step": 3835, "time_per_iteration": 2.770033597946167 }, { "auxiliary_loss_clip": 0.01143574, "auxiliary_loss_mlp": 0.01026925, "balance_loss_clip": 1.0554322, "balance_loss_mlp": 1.01948607, "epoch": 0.46125172849155294, "flos": 20810772489600.0, "grad_norm": 1.6848690236416208, "language_loss": 0.71618581, "learning_rate": 2.3464439480417374e-06, "loss": 0.73789072, "num_input_tokens_seen": 82508830, "step": 3836, "time_per_iteration": 2.722201108932495 }, { "auxiliary_loss_clip": 0.01170548, "auxiliary_loss_mlp": 0.01028095, "balance_loss_clip": 1.05591524, "balance_loss_mlp": 1.0201143, "epoch": 0.46137197138219205, "flos": 17930808852480.0, "grad_norm": 3.624222799126823, "language_loss": 0.77293038, "learning_rate": 2.3456767259992676e-06, "loss": 0.79491681, "num_input_tokens_seen": 82526475, "step": 3837, "time_per_iteration": 2.5605292320251465 }, { "auxiliary_loss_clip": 0.01182949, "auxiliary_loss_mlp": 0.00711872, "balance_loss_clip": 1.05427647, "balance_loss_mlp": 1.00060654, "epoch": 0.4614922142728311, "flos": 16836610798080.0, "grad_norm": 2.853308121730173, "language_loss": 0.88935143, "learning_rate": 2.3449094515179135e-06, "loss": 0.90829962, "num_input_tokens_seen": 82543935, "step": 3838, "time_per_iteration": 2.5621771812438965 }, { "auxiliary_loss_clip": 0.01155314, "auxiliary_loss_mlp": 0.01025855, "balance_loss_clip": 1.05131721, "balance_loss_mlp": 1.01886392, "epoch": 0.4616124571634702, "flos": 26614906427520.0, "grad_norm": 1.8363728731282858, "language_loss": 0.82032311, "learning_rate": 2.34414212471407e-06, "loss": 0.84213483, "num_input_tokens_seen": 82563730, "step": 3839, "time_per_iteration": 2.641094923019409 }, { "auxiliary_loss_clip": 0.01170981, "auxiliary_loss_mlp": 0.01024787, "balance_loss_clip": 1.0533967, "balance_loss_mlp": 1.0167253, "epoch": 0.4617327000541093, "flos": 20340127560960.0, "grad_norm": 1.8755854115564186, "language_loss": 0.72816771, "learning_rate": 2.3433747457041394e-06, "loss": 0.75012541, "num_input_tokens_seen": 82582435, "step": 3840, "time_per_iteration": 2.6441750526428223 }, { "auxiliary_loss_clip": 0.01135199, "auxiliary_loss_mlp": 0.01026508, "balance_loss_clip": 1.05140924, "balance_loss_mlp": 1.01822317, "epoch": 0.4618529429447484, "flos": 29570749545600.0, "grad_norm": 3.147356430087298, "language_loss": 0.85174036, "learning_rate": 2.342607314604533e-06, "loss": 0.87335742, "num_input_tokens_seen": 82602185, "step": 3841, "time_per_iteration": 2.7440741062164307 }, { "auxiliary_loss_clip": 0.01166748, "auxiliary_loss_mlp": 0.01030945, "balance_loss_clip": 1.05422592, "balance_loss_mlp": 1.0227133, "epoch": 0.4619731858353875, "flos": 19787030962560.0, "grad_norm": 4.019585680704881, "language_loss": 0.83636051, "learning_rate": 2.3418398315316694e-06, "loss": 0.85833746, "num_input_tokens_seen": 82620005, "step": 3842, "time_per_iteration": 2.6372969150543213 }, { "auxiliary_loss_clip": 0.01180864, "auxiliary_loss_mlp": 0.0103197, "balance_loss_clip": 1.0547291, "balance_loss_mlp": 1.02441835, "epoch": 0.4620934287260266, "flos": 18951138587520.0, "grad_norm": 3.2343396597916603, "language_loss": 0.78436381, "learning_rate": 2.3410722966019755e-06, "loss": 0.80649215, "num_input_tokens_seen": 82635120, "step": 3843, "time_per_iteration": 2.5980756282806396 }, { "auxiliary_loss_clip": 0.01163936, "auxiliary_loss_mlp": 0.0102638, "balance_loss_clip": 1.05221117, "balance_loss_mlp": 1.01833904, "epoch": 0.46221367161666566, "flos": 37341674634240.0, "grad_norm": 1.9329074456560456, "language_loss": 0.65686095, "learning_rate": 2.3403047099318848e-06, "loss": 0.6787641, "num_input_tokens_seen": 82659190, "step": 3844, "time_per_iteration": 2.779881477355957 }, { "auxiliary_loss_clip": 0.01108924, "auxiliary_loss_mlp": 0.01026472, "balance_loss_clip": 1.04468858, "balance_loss_mlp": 1.01886678, "epoch": 0.46233391450730477, "flos": 14428549065600.0, "grad_norm": 2.4821181974778272, "language_loss": 0.75245345, "learning_rate": 2.3395370716378405e-06, "loss": 0.77380747, "num_input_tokens_seen": 82676635, "step": 3845, "time_per_iteration": 2.6693027019500732 }, { "auxiliary_loss_clip": 0.01167872, "auxiliary_loss_mlp": 0.01026822, "balance_loss_clip": 1.05232096, "balance_loss_mlp": 1.01894832, "epoch": 0.4624541573979438, "flos": 22493044010880.0, "grad_norm": 2.6055280693282037, "language_loss": 0.72609985, "learning_rate": 2.338769381836292e-06, "loss": 0.74804676, "num_input_tokens_seen": 82696245, "step": 3846, "time_per_iteration": 2.63283109664917 }, { "auxiliary_loss_clip": 0.01137042, "auxiliary_loss_mlp": 0.01024676, "balance_loss_clip": 1.05420423, "balance_loss_mlp": 1.01707029, "epoch": 0.46257440028858293, "flos": 14465070218880.0, "grad_norm": 1.931324675113935, "language_loss": 0.73077166, "learning_rate": 2.3380016406436984e-06, "loss": 0.75238883, "num_input_tokens_seen": 82713725, "step": 3847, "time_per_iteration": 2.627354860305786 }, { "auxiliary_loss_clip": 0.01119142, "auxiliary_loss_mlp": 0.01030395, "balance_loss_clip": 1.05288434, "balance_loss_mlp": 1.02168632, "epoch": 0.46269464317922204, "flos": 23332204523520.0, "grad_norm": 2.264474453297908, "language_loss": 0.8157028, "learning_rate": 2.337233848176524e-06, "loss": 0.83719814, "num_input_tokens_seen": 82731495, "step": 3848, "time_per_iteration": 2.767322063446045 }, { "auxiliary_loss_clip": 0.01110225, "auxiliary_loss_mlp": 0.0102461, "balance_loss_clip": 1.0470767, "balance_loss_mlp": 1.01635516, "epoch": 0.4628148860698611, "flos": 18552027594240.0, "grad_norm": 1.9506087438089124, "language_loss": 0.83305252, "learning_rate": 2.3364660045512435e-06, "loss": 0.85440087, "num_input_tokens_seen": 82750255, "step": 3849, "time_per_iteration": 2.7665138244628906 }, { "auxiliary_loss_clip": 0.01080578, "auxiliary_loss_mlp": 0.01006733, "balance_loss_clip": 1.04449165, "balance_loss_mlp": 1.0052371, "epoch": 0.4629351289605002, "flos": 70667569670400.0, "grad_norm": 0.7451761499642926, "language_loss": 0.58220339, "learning_rate": 2.335698109884337e-06, "loss": 0.60307658, "num_input_tokens_seen": 82815460, "step": 3850, "time_per_iteration": 3.3705530166625977 }, { "auxiliary_loss_clip": 0.01058184, "auxiliary_loss_mlp": 0.01005833, "balance_loss_clip": 1.04798388, "balance_loss_mlp": 1.00456321, "epoch": 0.4630553718511393, "flos": 59687200465920.0, "grad_norm": 0.7868971025108956, "language_loss": 0.59868479, "learning_rate": 2.334930164292294e-06, "loss": 0.61932498, "num_input_tokens_seen": 82878010, "step": 3851, "time_per_iteration": 3.404738187789917 }, { "auxiliary_loss_clip": 0.01107941, "auxiliary_loss_mlp": 0.01029657, "balance_loss_clip": 1.04246962, "balance_loss_mlp": 1.0215385, "epoch": 0.4631756147417784, "flos": 15960605909760.0, "grad_norm": 2.1378495310858745, "language_loss": 0.80186546, "learning_rate": 2.334162167891612e-06, "loss": 0.82324147, "num_input_tokens_seen": 82895275, "step": 3852, "time_per_iteration": 2.688904047012329 }, { "auxiliary_loss_clip": 0.01151684, "auxiliary_loss_mlp": 0.01025344, "balance_loss_clip": 1.05031276, "balance_loss_mlp": 1.01701713, "epoch": 0.4632958576324175, "flos": 16472907636480.0, "grad_norm": 2.1537622601259745, "language_loss": 0.7511102, "learning_rate": 2.333394120798795e-06, "loss": 0.77288055, "num_input_tokens_seen": 82914010, "step": 3853, "time_per_iteration": 2.687880754470825 }, { "auxiliary_loss_clip": 0.01152982, "auxiliary_loss_mlp": 0.01027574, "balance_loss_clip": 1.05269599, "balance_loss_mlp": 1.01965821, "epoch": 0.4634161005230566, "flos": 22346492520960.0, "grad_norm": 3.023489872953376, "language_loss": 0.72016782, "learning_rate": 2.3326260231303545e-06, "loss": 0.74197334, "num_input_tokens_seen": 82932610, "step": 3854, "time_per_iteration": 2.71868896484375 }, { "auxiliary_loss_clip": 0.0118197, "auxiliary_loss_mlp": 0.01026526, "balance_loss_clip": 1.05660725, "balance_loss_mlp": 1.01951396, "epoch": 0.46353634341369565, "flos": 15742233175680.0, "grad_norm": 2.027832705236883, "language_loss": 0.86379099, "learning_rate": 2.331857875002811e-06, "loss": 0.88587594, "num_input_tokens_seen": 82951210, "step": 3855, "time_per_iteration": 2.6071314811706543 }, { "auxiliary_loss_clip": 0.01152646, "auxiliary_loss_mlp": 0.01030884, "balance_loss_clip": 1.0546124, "balance_loss_mlp": 1.02281356, "epoch": 0.46365658630433476, "flos": 28329820433280.0, "grad_norm": 1.6773735868908102, "language_loss": 0.76249659, "learning_rate": 2.3310896765326916e-06, "loss": 0.78433192, "num_input_tokens_seen": 82972210, "step": 3856, "time_per_iteration": 2.7423675060272217 }, { "auxiliary_loss_clip": 0.01132408, "auxiliary_loss_mlp": 0.0103178, "balance_loss_clip": 1.05107379, "balance_loss_mlp": 1.02292919, "epoch": 0.46377682919497387, "flos": 24608074590720.0, "grad_norm": 1.728787681618865, "language_loss": 0.8409068, "learning_rate": 2.330321427836531e-06, "loss": 0.86254871, "num_input_tokens_seen": 82994080, "step": 3857, "time_per_iteration": 2.6994130611419678 }, { "auxiliary_loss_clip": 0.011674, "auxiliary_loss_mlp": 0.01027646, "balance_loss_clip": 1.05558658, "balance_loss_mlp": 1.01881862, "epoch": 0.4638970720856129, "flos": 19060953442560.0, "grad_norm": 1.836009345455603, "language_loss": 0.82856292, "learning_rate": 2.3295531290308733e-06, "loss": 0.85051334, "num_input_tokens_seen": 83012230, "step": 3858, "time_per_iteration": 2.6529393196105957 }, { "auxiliary_loss_clip": 0.01188319, "auxiliary_loss_mlp": 0.00712737, "balance_loss_clip": 1.05991483, "balance_loss_mlp": 1.00063062, "epoch": 0.46401731497625204, "flos": 18471012468480.0, "grad_norm": 2.896698351773108, "language_loss": 0.75612938, "learning_rate": 2.3287847802322678e-06, "loss": 0.77513993, "num_input_tokens_seen": 83027800, "step": 3859, "time_per_iteration": 4.410417079925537 }, { "auxiliary_loss_clip": 0.01163721, "auxiliary_loss_mlp": 0.01030182, "balance_loss_clip": 1.05755782, "balance_loss_mlp": 1.02128863, "epoch": 0.4641375578668911, "flos": 26067053214720.0, "grad_norm": 47.28001175349007, "language_loss": 0.8387996, "learning_rate": 2.3280163815572723e-06, "loss": 0.86073864, "num_input_tokens_seen": 83048395, "step": 3860, "time_per_iteration": 3.601983070373535 }, { "auxiliary_loss_clip": 0.01141738, "auxiliary_loss_mlp": 0.01025913, "balance_loss_clip": 1.04923022, "balance_loss_mlp": 1.01766372, "epoch": 0.4642578007575302, "flos": 19570382081280.0, "grad_norm": 3.815362186273649, "language_loss": 0.7764082, "learning_rate": 2.3272479331224522e-06, "loss": 0.79808474, "num_input_tokens_seen": 83065825, "step": 3861, "time_per_iteration": 2.6801671981811523 }, { "auxiliary_loss_clip": 0.0118365, "auxiliary_loss_mlp": 0.01024694, "balance_loss_clip": 1.0552808, "balance_loss_mlp": 1.01663589, "epoch": 0.4643780436481693, "flos": 28186249772160.0, "grad_norm": 1.8330754544313441, "language_loss": 0.78096569, "learning_rate": 2.3264794350443817e-06, "loss": 0.80304909, "num_input_tokens_seen": 83087920, "step": 3862, "time_per_iteration": 2.6578149795532227 }, { "auxiliary_loss_clip": 0.01165998, "auxiliary_loss_mlp": 0.01028801, "balance_loss_clip": 1.05145276, "balance_loss_mlp": 1.02012253, "epoch": 0.46449828653880837, "flos": 25375270204800.0, "grad_norm": 1.794799308601577, "language_loss": 0.7854389, "learning_rate": 2.3257108874396396e-06, "loss": 0.80738688, "num_input_tokens_seen": 83109015, "step": 3863, "time_per_iteration": 2.6531260013580322 }, { "auxiliary_loss_clip": 0.0114981, "auxiliary_loss_mlp": 0.01026919, "balance_loss_clip": 1.04971695, "balance_loss_mlp": 1.01887822, "epoch": 0.4646185294294475, "flos": 16034330574720.0, "grad_norm": 2.045935112094618, "language_loss": 0.73923004, "learning_rate": 2.3249422904248152e-06, "loss": 0.7609973, "num_input_tokens_seen": 83127450, "step": 3864, "time_per_iteration": 2.684037208557129 }, { "auxiliary_loss_clip": 0.011687, "auxiliary_loss_mlp": 0.01030219, "balance_loss_clip": 1.05282056, "balance_loss_mlp": 1.02267945, "epoch": 0.4647387723200866, "flos": 26363101109760.0, "grad_norm": 3.0625937703840487, "language_loss": 0.87258768, "learning_rate": 2.324173644116504e-06, "loss": 0.89457691, "num_input_tokens_seen": 83150300, "step": 3865, "time_per_iteration": 2.6802234649658203 }, { "auxiliary_loss_clip": 0.01162275, "auxiliary_loss_mlp": 0.01031607, "balance_loss_clip": 1.05353546, "balance_loss_mlp": 1.02370954, "epoch": 0.46485901521072565, "flos": 27160209774720.0, "grad_norm": 2.410831855281071, "language_loss": 0.81213892, "learning_rate": 2.3234049486313087e-06, "loss": 0.83407772, "num_input_tokens_seen": 83171750, "step": 3866, "time_per_iteration": 2.6490514278411865 }, { "auxiliary_loss_clip": 0.01166271, "auxiliary_loss_mlp": 0.01025221, "balance_loss_clip": 1.05326033, "balance_loss_mlp": 1.01805615, "epoch": 0.46497925810136476, "flos": 24279851088000.0, "grad_norm": 1.8036923467620145, "language_loss": 0.75833231, "learning_rate": 2.322636204085839e-06, "loss": 0.78024721, "num_input_tokens_seen": 83191820, "step": 3867, "time_per_iteration": 2.6734585762023926 }, { "auxiliary_loss_clip": 0.01142516, "auxiliary_loss_mlp": 0.01024788, "balance_loss_clip": 1.04875505, "balance_loss_mlp": 1.01739109, "epoch": 0.46509950099200387, "flos": 16253134272000.0, "grad_norm": 2.7006242874649957, "language_loss": 0.78795803, "learning_rate": 2.3218674105967143e-06, "loss": 0.80963111, "num_input_tokens_seen": 83210085, "step": 3868, "time_per_iteration": 2.6615209579467773 }, { "auxiliary_loss_clip": 0.01141862, "auxiliary_loss_mlp": 0.0103102, "balance_loss_clip": 1.04978347, "balance_loss_mlp": 1.02373946, "epoch": 0.4652197438826429, "flos": 23442270773760.0, "grad_norm": 1.8246423156763791, "language_loss": 0.83615744, "learning_rate": 2.3210985682805593e-06, "loss": 0.8578862, "num_input_tokens_seen": 83231865, "step": 3869, "time_per_iteration": 2.7078664302825928 }, { "auxiliary_loss_clip": 0.01184843, "auxiliary_loss_mlp": 0.01030835, "balance_loss_clip": 1.05852413, "balance_loss_mlp": 1.02298856, "epoch": 0.46533998677328203, "flos": 16216397637120.0, "grad_norm": 3.240318761976947, "language_loss": 0.68199545, "learning_rate": 2.320329677254007e-06, "loss": 0.70415223, "num_input_tokens_seen": 83249195, "step": 3870, "time_per_iteration": 2.5856471061706543 }, { "auxiliary_loss_clip": 0.01183794, "auxiliary_loss_mlp": 0.01023676, "balance_loss_clip": 1.0569818, "balance_loss_mlp": 1.01545024, "epoch": 0.46546022966392114, "flos": 21141869080320.0, "grad_norm": 2.6490364513978086, "language_loss": 0.72511852, "learning_rate": 2.319560737633697e-06, "loss": 0.74719322, "num_input_tokens_seen": 83267915, "step": 3871, "time_per_iteration": 2.5779836177825928 }, { "auxiliary_loss_clip": 0.01139196, "auxiliary_loss_mlp": 0.0102886, "balance_loss_clip": 1.04910779, "balance_loss_mlp": 1.02062249, "epoch": 0.4655804725545602, "flos": 41171942442240.0, "grad_norm": 1.775431418881516, "language_loss": 0.6819973, "learning_rate": 2.3187917495362775e-06, "loss": 0.70367789, "num_input_tokens_seen": 83292325, "step": 3872, "time_per_iteration": 2.880948781967163 }, { "auxiliary_loss_clip": 0.01115594, "auxiliary_loss_mlp": 0.01025715, "balance_loss_clip": 1.04883122, "balance_loss_mlp": 1.01785874, "epoch": 0.4657007154451993, "flos": 19570956698880.0, "grad_norm": 2.4305724591263678, "language_loss": 0.76712132, "learning_rate": 2.318022713078403e-06, "loss": 0.7885344, "num_input_tokens_seen": 83306905, "step": 3873, "time_per_iteration": 2.694352388381958 }, { "auxiliary_loss_clip": 0.01146593, "auxiliary_loss_mlp": 0.01023921, "balance_loss_clip": 1.04937053, "balance_loss_mlp": 1.01614904, "epoch": 0.4658209583358384, "flos": 15517826956800.0, "grad_norm": 2.3832941218114887, "language_loss": 0.8530823, "learning_rate": 2.3172536283767354e-06, "loss": 0.87478745, "num_input_tokens_seen": 83320665, "step": 3874, "time_per_iteration": 2.6194002628326416 }, { "auxiliary_loss_clip": 0.01128783, "auxiliary_loss_mlp": 0.01031161, "balance_loss_clip": 1.05107534, "balance_loss_mlp": 1.02326965, "epoch": 0.4659412012264775, "flos": 14903180403840.0, "grad_norm": 2.6988909257171723, "language_loss": 0.8097623, "learning_rate": 2.3164844955479447e-06, "loss": 0.83136177, "num_input_tokens_seen": 83336475, "step": 3875, "time_per_iteration": 2.6786534786224365 }, { "auxiliary_loss_clip": 0.01126953, "auxiliary_loss_mlp": 0.01029242, "balance_loss_clip": 1.05094075, "balance_loss_mlp": 1.02161217, "epoch": 0.4660614441171166, "flos": 24425612478720.0, "grad_norm": 2.680086718123478, "language_loss": 0.71064287, "learning_rate": 2.3157153147087082e-06, "loss": 0.73220479, "num_input_tokens_seen": 83358365, "step": 3876, "time_per_iteration": 2.8118739128112793 }, { "auxiliary_loss_clip": 0.01126465, "auxiliary_loss_mlp": 0.01028938, "balance_loss_clip": 1.05191731, "balance_loss_mlp": 1.02126098, "epoch": 0.46618168700775564, "flos": 22091095843200.0, "grad_norm": 1.899040551750076, "language_loss": 0.83194977, "learning_rate": 2.314946085975709e-06, "loss": 0.85350382, "num_input_tokens_seen": 83377345, "step": 3877, "time_per_iteration": 2.6813738346099854 }, { "auxiliary_loss_clip": 0.0112128, "auxiliary_loss_mlp": 0.01027836, "balance_loss_clip": 1.04970443, "balance_loss_mlp": 1.02066255, "epoch": 0.46630192989839475, "flos": 26176975810560.0, "grad_norm": 2.6762224422111665, "language_loss": 0.824175, "learning_rate": 2.3141768094656393e-06, "loss": 0.84566617, "num_input_tokens_seen": 83395920, "step": 3878, "time_per_iteration": 2.717089891433716 }, { "auxiliary_loss_clip": 0.01091416, "auxiliary_loss_mlp": 0.01021085, "balance_loss_clip": 1.04409266, "balance_loss_mlp": 1.01335096, "epoch": 0.46642217278903386, "flos": 11509622150400.0, "grad_norm": 3.2058254852775674, "language_loss": 0.82918918, "learning_rate": 2.3134074852951966e-06, "loss": 0.8503142, "num_input_tokens_seen": 83412510, "step": 3879, "time_per_iteration": 2.855583906173706 }, { "auxiliary_loss_clip": 0.01112592, "auxiliary_loss_mlp": 0.01024592, "balance_loss_clip": 1.04717886, "balance_loss_mlp": 1.01705456, "epoch": 0.4665424156796729, "flos": 32306819299200.0, "grad_norm": 1.7010994180726076, "language_loss": 0.7761085, "learning_rate": 2.312638113581088e-06, "loss": 0.79748034, "num_input_tokens_seen": 83432995, "step": 3880, "time_per_iteration": 2.9326729774475098 }, { "auxiliary_loss_clip": 0.01163923, "auxiliary_loss_mlp": 0.01028183, "balance_loss_clip": 1.05107725, "balance_loss_mlp": 1.02009523, "epoch": 0.46666265857031203, "flos": 18436179254400.0, "grad_norm": 3.4063078207270556, "language_loss": 0.78078151, "learning_rate": 2.311868694440027e-06, "loss": 0.80270261, "num_input_tokens_seen": 83447415, "step": 3881, "time_per_iteration": 2.628553867340088 }, { "auxiliary_loss_clip": 0.01113157, "auxiliary_loss_mlp": 0.01002771, "balance_loss_clip": 1.04678917, "balance_loss_mlp": 1.0012095, "epoch": 0.46678290146095114, "flos": 68438989221120.0, "grad_norm": 0.7332696972050605, "language_loss": 0.62429929, "learning_rate": 2.3110992279887323e-06, "loss": 0.64545858, "num_input_tokens_seen": 83519340, "step": 3882, "time_per_iteration": 3.280419111251831 }, { "auxiliary_loss_clip": 0.01140624, "auxiliary_loss_mlp": 0.01024463, "balance_loss_clip": 1.05139649, "balance_loss_mlp": 1.01642799, "epoch": 0.4669031443515902, "flos": 17712507945600.0, "grad_norm": 3.211964152520244, "language_loss": 0.85648048, "learning_rate": 2.310329714343932e-06, "loss": 0.87813133, "num_input_tokens_seen": 83535490, "step": 3883, "time_per_iteration": 2.6802544593811035 }, { "auxiliary_loss_clip": 0.01145424, "auxiliary_loss_mlp": 0.01026697, "balance_loss_clip": 1.0516808, "balance_loss_mlp": 1.01922297, "epoch": 0.4670233872422293, "flos": 23947748916480.0, "grad_norm": 3.1018030790313893, "language_loss": 0.81889051, "learning_rate": 2.309560153622361e-06, "loss": 0.8406117, "num_input_tokens_seen": 83552400, "step": 3884, "time_per_iteration": 2.6444756984710693 }, { "auxiliary_loss_clip": 0.01134737, "auxiliary_loss_mlp": 0.01025627, "balance_loss_clip": 1.05149126, "balance_loss_mlp": 1.01738334, "epoch": 0.4671436301328684, "flos": 28111268131200.0, "grad_norm": 3.081666316835856, "language_loss": 0.74745381, "learning_rate": 2.3087905459407602e-06, "loss": 0.76905745, "num_input_tokens_seen": 83571340, "step": 3885, "time_per_iteration": 5.552383184432983 }, { "auxiliary_loss_clip": 0.01100327, "auxiliary_loss_mlp": 0.01002895, "balance_loss_clip": 1.04571009, "balance_loss_mlp": 1.00154161, "epoch": 0.46726387302350747, "flos": 69369684566400.0, "grad_norm": 0.8153518939247345, "language_loss": 0.62897801, "learning_rate": 2.3080208914158795e-06, "loss": 0.65001023, "num_input_tokens_seen": 83634340, "step": 3886, "time_per_iteration": 4.211169481277466 }, { "auxiliary_loss_clip": 0.01148849, "auxiliary_loss_mlp": 0.01025035, "balance_loss_clip": 1.05391693, "balance_loss_mlp": 1.01686895, "epoch": 0.4673841159141466, "flos": 25519666878720.0, "grad_norm": 2.4849672421318556, "language_loss": 0.72452974, "learning_rate": 2.3072511901644753e-06, "loss": 0.74626857, "num_input_tokens_seen": 83653410, "step": 3887, "time_per_iteration": 2.705543279647827 }, { "auxiliary_loss_clip": 0.01181398, "auxiliary_loss_mlp": 0.01025431, "balance_loss_clip": 1.05708897, "balance_loss_mlp": 1.01842773, "epoch": 0.4675043588047857, "flos": 24499265316480.0, "grad_norm": 2.1650750925663287, "language_loss": 0.81245905, "learning_rate": 2.306481442303309e-06, "loss": 0.83452731, "num_input_tokens_seen": 83672985, "step": 3888, "time_per_iteration": 2.6109330654144287 }, { "auxiliary_loss_clip": 0.01164108, "auxiliary_loss_mlp": 0.01021421, "balance_loss_clip": 1.05086517, "balance_loss_mlp": 1.01388669, "epoch": 0.46762460169542475, "flos": 20960771685120.0, "grad_norm": 9.848377022705378, "language_loss": 0.73333603, "learning_rate": 2.3057116479491515e-06, "loss": 0.75519133, "num_input_tokens_seen": 83692395, "step": 3889, "time_per_iteration": 2.660228729248047 }, { "auxiliary_loss_clip": 0.0115971, "auxiliary_loss_mlp": 0.0102388, "balance_loss_clip": 1.04962289, "balance_loss_mlp": 1.01665545, "epoch": 0.46774484458606386, "flos": 19171666137600.0, "grad_norm": 2.220017619434115, "language_loss": 0.76275647, "learning_rate": 2.30494180721878e-06, "loss": 0.78459233, "num_input_tokens_seen": 83709735, "step": 3890, "time_per_iteration": 2.6266732215881348 }, { "auxiliary_loss_clip": 0.01163907, "auxiliary_loss_mlp": 0.01024386, "balance_loss_clip": 1.05303347, "balance_loss_mlp": 1.01720965, "epoch": 0.4678650874767029, "flos": 17967689141760.0, "grad_norm": 2.559103978487947, "language_loss": 0.90043849, "learning_rate": 2.3041719202289794e-06, "loss": 0.92232144, "num_input_tokens_seen": 83725910, "step": 3891, "time_per_iteration": 2.6393492221832275 }, { "auxiliary_loss_clip": 0.01168018, "auxiliary_loss_mlp": 0.01020067, "balance_loss_clip": 1.05601633, "balance_loss_mlp": 1.01318312, "epoch": 0.467985330367342, "flos": 21360816432000.0, "grad_norm": 1.7972242190736758, "language_loss": 0.80471551, "learning_rate": 2.30340198709654e-06, "loss": 0.82659638, "num_input_tokens_seen": 83745745, "step": 3892, "time_per_iteration": 2.621776819229126 }, { "auxiliary_loss_clip": 0.01154715, "auxiliary_loss_mlp": 0.01030123, "balance_loss_clip": 1.050753, "balance_loss_mlp": 1.0224998, "epoch": 0.46810557325798113, "flos": 20521835487360.0, "grad_norm": 2.4180711889600772, "language_loss": 0.74565458, "learning_rate": 2.3026320079382605e-06, "loss": 0.7675029, "num_input_tokens_seen": 83762680, "step": 3893, "time_per_iteration": 2.682504653930664 }, { "auxiliary_loss_clip": 0.01179908, "auxiliary_loss_mlp": 0.01030741, "balance_loss_clip": 1.05628896, "balance_loss_mlp": 1.02264106, "epoch": 0.4682258161486202, "flos": 30117848572800.0, "grad_norm": 2.213642843429387, "language_loss": 0.76445222, "learning_rate": 2.3018619828709454e-06, "loss": 0.78655875, "num_input_tokens_seen": 83784220, "step": 3894, "time_per_iteration": 2.739528179168701 }, { "auxiliary_loss_clip": 0.01162689, "auxiliary_loss_mlp": 0.00711467, "balance_loss_clip": 1.05411386, "balance_loss_mlp": 1.00048649, "epoch": 0.4683460590392593, "flos": 25293357239040.0, "grad_norm": 2.126763012481864, "language_loss": 0.82420075, "learning_rate": 2.3010919120114084e-06, "loss": 0.84294236, "num_input_tokens_seen": 83800750, "step": 3895, "time_per_iteration": 2.6630992889404297 }, { "auxiliary_loss_clip": 0.01159043, "auxiliary_loss_mlp": 0.01026842, "balance_loss_clip": 1.04810929, "balance_loss_mlp": 1.01849771, "epoch": 0.4684663019298984, "flos": 15368330551680.0, "grad_norm": 4.26113591527406, "language_loss": 0.6601696, "learning_rate": 2.3003217954764672e-06, "loss": 0.68202847, "num_input_tokens_seen": 83815455, "step": 3896, "time_per_iteration": 2.6304476261138916 }, { "auxiliary_loss_clip": 0.01166002, "auxiliary_loss_mlp": 0.01022377, "balance_loss_clip": 1.04982924, "balance_loss_mlp": 1.01482487, "epoch": 0.46858654482053747, "flos": 27778842737280.0, "grad_norm": 1.6984000232594132, "language_loss": 0.79480886, "learning_rate": 2.299551633382949e-06, "loss": 0.81669259, "num_input_tokens_seen": 83835765, "step": 3897, "time_per_iteration": 2.6904141902923584 }, { "auxiliary_loss_clip": 0.01138914, "auxiliary_loss_mlp": 0.01023707, "balance_loss_clip": 1.04740846, "balance_loss_mlp": 1.01629186, "epoch": 0.4687067877111766, "flos": 18040623707520.0, "grad_norm": 2.863166673544422, "language_loss": 0.85323167, "learning_rate": 2.2987814258476854e-06, "loss": 0.8748579, "num_input_tokens_seen": 83853565, "step": 3898, "time_per_iteration": 2.74873948097229 }, { "auxiliary_loss_clip": 0.0111981, "auxiliary_loss_mlp": 0.01022245, "balance_loss_clip": 1.04669619, "balance_loss_mlp": 1.01400757, "epoch": 0.4688270306018157, "flos": 16977380198400.0, "grad_norm": 2.836502153832017, "language_loss": 0.67791647, "learning_rate": 2.2980111729875177e-06, "loss": 0.69933701, "num_input_tokens_seen": 83869815, "step": 3899, "time_per_iteration": 2.7845370769500732 }, { "auxiliary_loss_clip": 0.01141315, "auxiliary_loss_mlp": 0.01027362, "balance_loss_clip": 1.04932559, "balance_loss_mlp": 1.02000654, "epoch": 0.46894727349245474, "flos": 17821640442240.0, "grad_norm": 2.5380471036803973, "language_loss": 0.82755756, "learning_rate": 2.2972408749192917e-06, "loss": 0.84924436, "num_input_tokens_seen": 83887545, "step": 3900, "time_per_iteration": 2.666111469268799 }, { "auxiliary_loss_clip": 0.01161203, "auxiliary_loss_mlp": 0.00711767, "balance_loss_clip": 1.05337596, "balance_loss_mlp": 1.00043797, "epoch": 0.46906751638309385, "flos": 21471349559040.0, "grad_norm": 2.3354617563316307, "language_loss": 0.67080164, "learning_rate": 2.296470531759861e-06, "loss": 0.68953133, "num_input_tokens_seen": 83905645, "step": 3901, "time_per_iteration": 2.6545608043670654 }, { "auxiliary_loss_clip": 0.0112532, "auxiliary_loss_mlp": 0.0103091, "balance_loss_clip": 1.04672313, "balance_loss_mlp": 1.02295923, "epoch": 0.46918775927373296, "flos": 20337829090560.0, "grad_norm": 1.9221212960227996, "language_loss": 0.79182982, "learning_rate": 2.2957001436260866e-06, "loss": 0.8133921, "num_input_tokens_seen": 83922705, "step": 3902, "time_per_iteration": 2.7191388607025146 }, { "auxiliary_loss_clip": 0.0114565, "auxiliary_loss_mlp": 0.01024423, "balance_loss_clip": 1.05037773, "balance_loss_mlp": 1.01646018, "epoch": 0.469308002164372, "flos": 18403249461120.0, "grad_norm": 1.714477085922817, "language_loss": 0.72858369, "learning_rate": 2.294929710634836e-06, "loss": 0.75028443, "num_input_tokens_seen": 83940795, "step": 3903, "time_per_iteration": 2.628253698348999 }, { "auxiliary_loss_clip": 0.01164167, "auxiliary_loss_mlp": 0.01025186, "balance_loss_clip": 1.05076265, "balance_loss_mlp": 1.01704443, "epoch": 0.46942824505501113, "flos": 37962067363200.0, "grad_norm": 2.0468727810093115, "language_loss": 0.61680943, "learning_rate": 2.2941592329029823e-06, "loss": 0.63870299, "num_input_tokens_seen": 83961900, "step": 3904, "time_per_iteration": 2.782015085220337 }, { "auxiliary_loss_clip": 0.01162606, "auxiliary_loss_mlp": 0.0102351, "balance_loss_clip": 1.05376852, "balance_loss_mlp": 1.01518297, "epoch": 0.46954848794565024, "flos": 21872507627520.0, "grad_norm": 1.7880708653497697, "language_loss": 0.78783977, "learning_rate": 2.2933887105474067e-06, "loss": 0.80970091, "num_input_tokens_seen": 83980075, "step": 3905, "time_per_iteration": 2.6432688236236572 }, { "auxiliary_loss_clip": 0.01159982, "auxiliary_loss_mlp": 0.01022967, "balance_loss_clip": 1.05327678, "balance_loss_mlp": 1.01603198, "epoch": 0.4696687308362893, "flos": 22016545165440.0, "grad_norm": 1.7959320053055996, "language_loss": 0.81973112, "learning_rate": 2.2926181436849974e-06, "loss": 0.84156066, "num_input_tokens_seen": 83999430, "step": 3906, "time_per_iteration": 2.658906936645508 }, { "auxiliary_loss_clip": 0.01163991, "auxiliary_loss_mlp": 0.01024272, "balance_loss_clip": 1.05353701, "balance_loss_mlp": 1.01633596, "epoch": 0.4697889737269284, "flos": 21613663244160.0, "grad_norm": 1.8397237569624483, "language_loss": 0.72632182, "learning_rate": 2.2918475324326478e-06, "loss": 0.74820447, "num_input_tokens_seen": 84019150, "step": 3907, "time_per_iteration": 2.6573081016540527 }, { "auxiliary_loss_clip": 0.0116669, "auxiliary_loss_mlp": 0.00712231, "balance_loss_clip": 1.05311131, "balance_loss_mlp": 1.0006088, "epoch": 0.46990921661756746, "flos": 25228323665280.0, "grad_norm": 4.7322624428879925, "language_loss": 0.91392791, "learning_rate": 2.2910768769072603e-06, "loss": 0.9327172, "num_input_tokens_seen": 84037930, "step": 3908, "time_per_iteration": 2.652373790740967 }, { "auxiliary_loss_clip": 0.01159309, "auxiliary_loss_mlp": 0.01023796, "balance_loss_clip": 1.0522995, "balance_loss_mlp": 1.01603532, "epoch": 0.47002945950820657, "flos": 13844031045120.0, "grad_norm": 2.0908219858054897, "language_loss": 0.75762427, "learning_rate": 2.2903061772257417e-06, "loss": 0.7794553, "num_input_tokens_seen": 84055915, "step": 3909, "time_per_iteration": 2.6813113689422607 }, { "auxiliary_loss_clip": 0.01162301, "auxiliary_loss_mlp": 0.01030316, "balance_loss_clip": 1.05245948, "balance_loss_mlp": 1.02282977, "epoch": 0.4701497023988457, "flos": 26247001374720.0, "grad_norm": 2.6596104548698243, "language_loss": 0.78740394, "learning_rate": 2.289535433505007e-06, "loss": 0.80933017, "num_input_tokens_seen": 84077270, "step": 3910, "time_per_iteration": 2.678272008895874 }, { "auxiliary_loss_clip": 0.01150392, "auxiliary_loss_mlp": 0.01026447, "balance_loss_clip": 1.05017161, "balance_loss_mlp": 1.0190742, "epoch": 0.47026994528948474, "flos": 25629517647360.0, "grad_norm": 6.082681100045374, "language_loss": 0.63796818, "learning_rate": 2.2887646458619767e-06, "loss": 0.65973657, "num_input_tokens_seen": 84098635, "step": 3911, "time_per_iteration": 3.694171667098999 }, { "auxiliary_loss_clip": 0.01137862, "auxiliary_loss_mlp": 0.01034217, "balance_loss_clip": 1.05101538, "balance_loss_mlp": 1.02621233, "epoch": 0.47039018818012385, "flos": 20554406144640.0, "grad_norm": 2.0019896708465392, "language_loss": 0.7686969, "learning_rate": 2.2879938144135797e-06, "loss": 0.79041767, "num_input_tokens_seen": 84114740, "step": 3912, "time_per_iteration": 4.550044059753418 }, { "auxiliary_loss_clip": 0.0113178, "auxiliary_loss_mlp": 0.00711255, "balance_loss_clip": 1.04897833, "balance_loss_mlp": 1.0004878, "epoch": 0.47051043107076296, "flos": 21577249831680.0, "grad_norm": 1.8045921263411824, "language_loss": 0.75151408, "learning_rate": 2.2872229392767496e-06, "loss": 0.76994443, "num_input_tokens_seen": 84134845, "step": 3913, "time_per_iteration": 2.7104690074920654 }, { "auxiliary_loss_clip": 0.01169548, "auxiliary_loss_mlp": 0.01029693, "balance_loss_clip": 1.05522656, "balance_loss_mlp": 1.02200699, "epoch": 0.470630673961402, "flos": 18953185662720.0, "grad_norm": 1.5893928141350593, "language_loss": 0.74797666, "learning_rate": 2.286452020568428e-06, "loss": 0.76996911, "num_input_tokens_seen": 84152920, "step": 3914, "time_per_iteration": 2.6065285205841064 }, { "auxiliary_loss_clip": 0.01185723, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 1.05427992, "balance_loss_mlp": 1.01773918, "epoch": 0.4707509168520411, "flos": 19938969492480.0, "grad_norm": 1.7132719662207994, "language_loss": 0.73439652, "learning_rate": 2.2856810584055637e-06, "loss": 0.75651163, "num_input_tokens_seen": 84170455, "step": 3915, "time_per_iteration": 2.5666913986206055 }, { "auxiliary_loss_clip": 0.01167869, "auxiliary_loss_mlp": 0.01028089, "balance_loss_clip": 1.05432439, "balance_loss_mlp": 1.02034044, "epoch": 0.47087115974268023, "flos": 40118754741120.0, "grad_norm": 1.5418801503018662, "language_loss": 0.6760236, "learning_rate": 2.2849100529051085e-06, "loss": 0.69798315, "num_input_tokens_seen": 84197390, "step": 3916, "time_per_iteration": 2.8357958793640137 }, { "auxiliary_loss_clip": 0.01179205, "auxiliary_loss_mlp": 0.01021081, "balance_loss_clip": 1.05453086, "balance_loss_mlp": 1.01389909, "epoch": 0.4709914026333193, "flos": 13552723745280.0, "grad_norm": 2.8472766014042614, "language_loss": 0.79901308, "learning_rate": 2.284139004184026e-06, "loss": 0.82101595, "num_input_tokens_seen": 84214620, "step": 3917, "time_per_iteration": 2.5215659141540527 }, { "auxiliary_loss_clip": 0.01182061, "auxiliary_loss_mlp": 0.01030946, "balance_loss_clip": 1.05620456, "balance_loss_mlp": 1.02325439, "epoch": 0.4711116455239584, "flos": 19974628719360.0, "grad_norm": 2.1868633171528566, "language_loss": 0.74303341, "learning_rate": 2.2833679123592814e-06, "loss": 0.76516348, "num_input_tokens_seen": 84231880, "step": 3918, "time_per_iteration": 2.605321168899536 }, { "auxiliary_loss_clip": 0.01146415, "auxiliary_loss_mlp": 0.01023066, "balance_loss_clip": 1.05016816, "balance_loss_mlp": 1.01554394, "epoch": 0.4712318884145975, "flos": 32124824064000.0, "grad_norm": 1.788199799242037, "language_loss": 0.63806236, "learning_rate": 2.2825967775478508e-06, "loss": 0.65975714, "num_input_tokens_seen": 84252980, "step": 3919, "time_per_iteration": 2.729196071624756 }, { "auxiliary_loss_clip": 0.0118237, "auxiliary_loss_mlp": 0.01022155, "balance_loss_clip": 1.05527902, "balance_loss_mlp": 1.0143702, "epoch": 0.47135213130523657, "flos": 20047850593920.0, "grad_norm": 2.0591176888412117, "language_loss": 0.83323002, "learning_rate": 2.2818255998667135e-06, "loss": 0.85527527, "num_input_tokens_seen": 84271490, "step": 3920, "time_per_iteration": 2.6248672008514404 }, { "auxiliary_loss_clip": 0.0116379, "auxiliary_loss_mlp": 0.01024167, "balance_loss_clip": 1.05477619, "balance_loss_mlp": 1.01676989, "epoch": 0.4714723741958757, "flos": 19426990988160.0, "grad_norm": 1.903003074908301, "language_loss": 0.79134941, "learning_rate": 2.2810543794328566e-06, "loss": 0.81322896, "num_input_tokens_seen": 84290525, "step": 3921, "time_per_iteration": 2.6791627407073975 }, { "auxiliary_loss_clip": 0.0116966, "auxiliary_loss_mlp": 0.01023794, "balance_loss_clip": 1.05593562, "balance_loss_mlp": 1.01627827, "epoch": 0.4715926170865148, "flos": 20373883367040.0, "grad_norm": 1.8207184176622373, "language_loss": 0.82639945, "learning_rate": 2.2802831163632735e-06, "loss": 0.84833395, "num_input_tokens_seen": 84309245, "step": 3922, "time_per_iteration": 2.629351854324341 }, { "auxiliary_loss_clip": 0.01103474, "auxiliary_loss_mlp": 0.01026335, "balance_loss_clip": 1.04723608, "balance_loss_mlp": 1.01856828, "epoch": 0.47171285997715384, "flos": 22672884430080.0, "grad_norm": 1.8473971521677095, "language_loss": 0.7451303, "learning_rate": 2.279511810774965e-06, "loss": 0.76642835, "num_input_tokens_seen": 84330775, "step": 3923, "time_per_iteration": 2.8036062717437744 }, { "auxiliary_loss_clip": 0.01181179, "auxiliary_loss_mlp": 0.01024273, "balance_loss_clip": 1.05409217, "balance_loss_mlp": 1.0161612, "epoch": 0.47183310286779295, "flos": 21105419754240.0, "grad_norm": 2.2661608798109656, "language_loss": 0.71837866, "learning_rate": 2.2787404627849364e-06, "loss": 0.74043322, "num_input_tokens_seen": 84349985, "step": 3924, "time_per_iteration": 2.5639231204986572 }, { "auxiliary_loss_clip": 0.01146371, "auxiliary_loss_mlp": 0.01027465, "balance_loss_clip": 1.0502677, "balance_loss_mlp": 1.0202651, "epoch": 0.471953345758432, "flos": 21726566668800.0, "grad_norm": 1.6944718396272154, "language_loss": 0.78898036, "learning_rate": 2.277969072510202e-06, "loss": 0.81071877, "num_input_tokens_seen": 84368965, "step": 3925, "time_per_iteration": 2.734391927719116 }, { "auxiliary_loss_clip": 0.01151959, "auxiliary_loss_mlp": 0.01025627, "balance_loss_clip": 1.05382085, "balance_loss_mlp": 1.01816106, "epoch": 0.4720735886490711, "flos": 19861078849920.0, "grad_norm": 1.551341648591927, "language_loss": 0.81386626, "learning_rate": 2.2771976400677803e-06, "loss": 0.8356421, "num_input_tokens_seen": 84387795, "step": 3926, "time_per_iteration": 2.68588924407959 }, { "auxiliary_loss_clip": 0.01107606, "auxiliary_loss_mlp": 0.01028094, "balance_loss_clip": 1.047158, "balance_loss_mlp": 1.02056336, "epoch": 0.47219383153971023, "flos": 19171809792000.0, "grad_norm": 3.6830417795162784, "language_loss": 0.7865575, "learning_rate": 2.2764261655746965e-06, "loss": 0.8079145, "num_input_tokens_seen": 84405290, "step": 3927, "time_per_iteration": 2.7457215785980225 }, { "auxiliary_loss_clip": 0.01132054, "auxiliary_loss_mlp": 0.01025588, "balance_loss_clip": 1.05064595, "balance_loss_mlp": 1.01747572, "epoch": 0.4723140744303493, "flos": 23224005780480.0, "grad_norm": 14.131255513908725, "language_loss": 0.75872254, "learning_rate": 2.2756546491479832e-06, "loss": 0.78029895, "num_input_tokens_seen": 84426205, "step": 3928, "time_per_iteration": 2.7257730960845947 }, { "auxiliary_loss_clip": 0.01181353, "auxiliary_loss_mlp": 0.00711917, "balance_loss_clip": 1.05278397, "balance_loss_mlp": 1.00057173, "epoch": 0.4724343173209884, "flos": 18223265387520.0, "grad_norm": 6.823260316057598, "language_loss": 0.80716217, "learning_rate": 2.274883090904679e-06, "loss": 0.82609487, "num_input_tokens_seen": 84443970, "step": 3929, "time_per_iteration": 2.5921452045440674 }, { "auxiliary_loss_clip": 0.01186807, "auxiliary_loss_mlp": 0.01038369, "balance_loss_clip": 1.0575335, "balance_loss_mlp": 1.03069222, "epoch": 0.4725545602116275, "flos": 21251037490560.0, "grad_norm": 2.7920820994362208, "language_loss": 0.67908061, "learning_rate": 2.2741114909618283e-06, "loss": 0.70133245, "num_input_tokens_seen": 84459865, "step": 3930, "time_per_iteration": 2.5585813522338867 }, { "auxiliary_loss_clip": 0.01135531, "auxiliary_loss_mlp": 0.01025466, "balance_loss_clip": 1.05189657, "balance_loss_mlp": 1.01777053, "epoch": 0.47267480310226656, "flos": 21434002392960.0, "grad_norm": 1.7615674474352243, "language_loss": 0.72042298, "learning_rate": 2.2733398494364828e-06, "loss": 0.74203289, "num_input_tokens_seen": 84479110, "step": 3931, "time_per_iteration": 2.6805338859558105 }, { "auxiliary_loss_clip": 0.01148757, "auxiliary_loss_mlp": 0.01028708, "balance_loss_clip": 1.05534422, "balance_loss_mlp": 1.0215044, "epoch": 0.47279504599290567, "flos": 18770508069120.0, "grad_norm": 2.149878027191046, "language_loss": 0.84658855, "learning_rate": 2.272568166445699e-06, "loss": 0.86836314, "num_input_tokens_seen": 84497675, "step": 3932, "time_per_iteration": 2.6837522983551025 }, { "auxiliary_loss_clip": 0.01169925, "auxiliary_loss_mlp": 0.01018357, "balance_loss_clip": 1.05568862, "balance_loss_mlp": 1.01088619, "epoch": 0.4729152888835448, "flos": 21105742976640.0, "grad_norm": 3.222522819073182, "language_loss": 0.64548641, "learning_rate": 2.271796442106541e-06, "loss": 0.66736925, "num_input_tokens_seen": 84517030, "step": 3933, "time_per_iteration": 2.6622495651245117 }, { "auxiliary_loss_clip": 0.01058372, "auxiliary_loss_mlp": 0.01003556, "balance_loss_clip": 1.03062987, "balance_loss_mlp": 1.00223851, "epoch": 0.47303553177418384, "flos": 70201877840640.0, "grad_norm": 0.8150704232353837, "language_loss": 0.56575191, "learning_rate": 2.271024676536079e-06, "loss": 0.58637118, "num_input_tokens_seen": 84577290, "step": 3934, "time_per_iteration": 3.222287893295288 }, { "auxiliary_loss_clip": 0.01161114, "auxiliary_loss_mlp": 0.01034412, "balance_loss_clip": 1.05852294, "balance_loss_mlp": 1.02643645, "epoch": 0.47315577466482295, "flos": 22455122227200.0, "grad_norm": 2.408320934132122, "language_loss": 0.73343188, "learning_rate": 2.2702528698513894e-06, "loss": 0.75538713, "num_input_tokens_seen": 84598415, "step": 3935, "time_per_iteration": 2.6853013038635254 }, { "auxiliary_loss_clip": 0.01150217, "auxiliary_loss_mlp": 0.01025159, "balance_loss_clip": 1.04927862, "balance_loss_mlp": 1.01686251, "epoch": 0.47327601755546206, "flos": 24352857480960.0, "grad_norm": 2.661305043515279, "language_loss": 0.78573716, "learning_rate": 2.269481022169554e-06, "loss": 0.80749094, "num_input_tokens_seen": 84617010, "step": 3936, "time_per_iteration": 2.7297956943511963 }, { "auxiliary_loss_clip": 0.0115749, "auxiliary_loss_mlp": 0.01022361, "balance_loss_clip": 1.05343127, "balance_loss_mlp": 1.01451075, "epoch": 0.4733962604461011, "flos": 22926772736640.0, "grad_norm": 1.8158979832765567, "language_loss": 0.80877382, "learning_rate": 2.2687091336076614e-06, "loss": 0.83057237, "num_input_tokens_seen": 84636350, "step": 3937, "time_per_iteration": 3.6526200771331787 }, { "auxiliary_loss_clip": 0.01164298, "auxiliary_loss_mlp": 0.01024727, "balance_loss_clip": 1.05464816, "balance_loss_mlp": 1.01685917, "epoch": 0.4735165033367402, "flos": 18327369980160.0, "grad_norm": 1.8166433321669389, "language_loss": 0.79963511, "learning_rate": 2.267937204282807e-06, "loss": 0.82152534, "num_input_tokens_seen": 84653490, "step": 3938, "time_per_iteration": 4.493053674697876 }, { "auxiliary_loss_clip": 0.01173574, "auxiliary_loss_mlp": 0.01029712, "balance_loss_clip": 1.05552649, "balance_loss_mlp": 1.02141476, "epoch": 0.4736367462273793, "flos": 23037018554880.0, "grad_norm": 1.9000498014143588, "language_loss": 0.78537333, "learning_rate": 2.2671652343120926e-06, "loss": 0.80740619, "num_input_tokens_seen": 84673965, "step": 3939, "time_per_iteration": 2.66190767288208 }, { "auxiliary_loss_clip": 0.01180438, "auxiliary_loss_mlp": 0.01025215, "balance_loss_clip": 1.05535603, "balance_loss_mlp": 1.01745176, "epoch": 0.4737569891180184, "flos": 25374336451200.0, "grad_norm": 1.7335576846385974, "language_loss": 0.8098259, "learning_rate": 2.2663932238126236e-06, "loss": 0.83188242, "num_input_tokens_seen": 84692525, "step": 3940, "time_per_iteration": 2.6003708839416504 }, { "auxiliary_loss_clip": 0.01164197, "auxiliary_loss_mlp": 0.01023585, "balance_loss_clip": 1.05209696, "balance_loss_mlp": 1.01630688, "epoch": 0.4738772320086575, "flos": 25849326925440.0, "grad_norm": 1.527115520625175, "language_loss": 0.80111194, "learning_rate": 2.265621172901515e-06, "loss": 0.82298976, "num_input_tokens_seen": 84715640, "step": 3941, "time_per_iteration": 2.6509957313537598 }, { "auxiliary_loss_clip": 0.01187459, "auxiliary_loss_mlp": 0.01030423, "balance_loss_clip": 1.05943918, "balance_loss_mlp": 1.0226655, "epoch": 0.47399747489929656, "flos": 27564420499200.0, "grad_norm": 6.854332443039241, "language_loss": 0.71275413, "learning_rate": 2.2648490816958854e-06, "loss": 0.7349329, "num_input_tokens_seen": 84736635, "step": 3942, "time_per_iteration": 2.6325676441192627 }, { "auxiliary_loss_clip": 0.01165499, "auxiliary_loss_mlp": 0.01027673, "balance_loss_clip": 1.05271602, "balance_loss_mlp": 1.01932836, "epoch": 0.47411771778993567, "flos": 24863650836480.0, "grad_norm": 3.047268677037643, "language_loss": 0.72963405, "learning_rate": 2.264076950312861e-06, "loss": 0.75156575, "num_input_tokens_seen": 84755445, "step": 3943, "time_per_iteration": 2.6836459636688232 }, { "auxiliary_loss_clip": 0.01152695, "auxiliary_loss_mlp": 0.01023874, "balance_loss_clip": 1.05122399, "balance_loss_mlp": 1.01625669, "epoch": 0.4742379606805748, "flos": 22748009725440.0, "grad_norm": 2.3549164882988705, "language_loss": 0.82291937, "learning_rate": 2.2633047788695727e-06, "loss": 0.84468508, "num_input_tokens_seen": 84775750, "step": 3944, "time_per_iteration": 2.6706435680389404 }, { "auxiliary_loss_clip": 0.01149568, "auxiliary_loss_mlp": 0.01025051, "balance_loss_clip": 1.05364299, "balance_loss_mlp": 1.01764822, "epoch": 0.47435820357121383, "flos": 19681130689920.0, "grad_norm": 1.9173860237715405, "language_loss": 0.64036351, "learning_rate": 2.262532567483159e-06, "loss": 0.66210973, "num_input_tokens_seen": 84794310, "step": 3945, "time_per_iteration": 2.6556894779205322 }, { "auxiliary_loss_clip": 0.0118496, "auxiliary_loss_mlp": 0.00712007, "balance_loss_clip": 1.05826855, "balance_loss_mlp": 1.00063038, "epoch": 0.47447844646185294, "flos": 25228718714880.0, "grad_norm": 1.9345038163620836, "language_loss": 0.8004725, "learning_rate": 2.2617603162707635e-06, "loss": 0.81944221, "num_input_tokens_seen": 84814720, "step": 3946, "time_per_iteration": 2.665210485458374 }, { "auxiliary_loss_clip": 0.01182654, "auxiliary_loss_mlp": 0.01023957, "balance_loss_clip": 1.05693138, "balance_loss_mlp": 1.01646459, "epoch": 0.47459868935249205, "flos": 24570619683840.0, "grad_norm": 3.3910785267965173, "language_loss": 0.82916939, "learning_rate": 2.2609880253495363e-06, "loss": 0.85123551, "num_input_tokens_seen": 84834355, "step": 3947, "time_per_iteration": 2.624157667160034 }, { "auxiliary_loss_clip": 0.01145323, "auxiliary_loss_mlp": 0.01024915, "balance_loss_clip": 1.05233967, "balance_loss_mlp": 1.01688588, "epoch": 0.4747189322431311, "flos": 20558500295040.0, "grad_norm": 3.5982386251373253, "language_loss": 0.86344492, "learning_rate": 2.260215694836633e-06, "loss": 0.88514727, "num_input_tokens_seen": 84853530, "step": 3948, "time_per_iteration": 2.710333824157715 }, { "auxiliary_loss_clip": 0.01120375, "auxiliary_loss_mlp": 0.00711282, "balance_loss_clip": 1.04779494, "balance_loss_mlp": 1.00041938, "epoch": 0.4748391751337702, "flos": 25995231970560.0, "grad_norm": 2.0337463052011584, "language_loss": 0.65112889, "learning_rate": 2.2594433248492157e-06, "loss": 0.66944551, "num_input_tokens_seen": 84872505, "step": 3949, "time_per_iteration": 2.739750623703003 }, { "auxiliary_loss_clip": 0.01172945, "auxiliary_loss_mlp": 0.01027971, "balance_loss_clip": 1.05572879, "balance_loss_mlp": 1.01987672, "epoch": 0.47495941802440933, "flos": 22821052032000.0, "grad_norm": 1.8295117393151574, "language_loss": 0.80207974, "learning_rate": 2.2586709155044527e-06, "loss": 0.82408893, "num_input_tokens_seen": 84893105, "step": 3950, "time_per_iteration": 2.6401326656341553 }, { "auxiliary_loss_clip": 0.01184818, "auxiliary_loss_mlp": 0.01025048, "balance_loss_clip": 1.05823207, "balance_loss_mlp": 1.0172044, "epoch": 0.4750796609150484, "flos": 27891782075520.0, "grad_norm": 1.5602765806716297, "language_loss": 0.75958407, "learning_rate": 2.2578984669195167e-06, "loss": 0.78168273, "num_input_tokens_seen": 84914070, "step": 3951, "time_per_iteration": 2.6560511589050293 }, { "auxiliary_loss_clip": 0.01162472, "auxiliary_loss_mlp": 0.01024067, "balance_loss_clip": 1.05062652, "balance_loss_mlp": 1.01694965, "epoch": 0.4751999038056875, "flos": 35660085471360.0, "grad_norm": 1.8436076626626146, "language_loss": 0.67984736, "learning_rate": 2.2571259792115887e-06, "loss": 0.70171273, "num_input_tokens_seen": 84935290, "step": 3952, "time_per_iteration": 2.767892599105835 }, { "auxiliary_loss_clip": 0.01162621, "auxiliary_loss_mlp": 0.01026954, "balance_loss_clip": 1.0540756, "balance_loss_mlp": 1.01936674, "epoch": 0.4753201466963266, "flos": 22090880361600.0, "grad_norm": 1.8519170017909103, "language_loss": 0.79342997, "learning_rate": 2.2563534524978544e-06, "loss": 0.81532574, "num_input_tokens_seen": 84952760, "step": 3953, "time_per_iteration": 2.611936092376709 }, { "auxiliary_loss_clip": 0.01135977, "auxiliary_loss_mlp": 0.01021442, "balance_loss_clip": 1.05597639, "balance_loss_mlp": 1.01411366, "epoch": 0.47544038958696566, "flos": 30190854965760.0, "grad_norm": 2.146226679618005, "language_loss": 0.70509845, "learning_rate": 2.2555808868955052e-06, "loss": 0.72667265, "num_input_tokens_seen": 84974890, "step": 3954, "time_per_iteration": 2.825704574584961 }, { "auxiliary_loss_clip": 0.01119264, "auxiliary_loss_mlp": 0.01030092, "balance_loss_clip": 1.04912019, "balance_loss_mlp": 1.02189636, "epoch": 0.47556063247760477, "flos": 23472219738240.0, "grad_norm": 3.224948377946745, "language_loss": 0.73714417, "learning_rate": 2.254808282521738e-06, "loss": 0.75863767, "num_input_tokens_seen": 84993640, "step": 3955, "time_per_iteration": 2.7340447902679443 }, { "auxiliary_loss_clip": 0.01139965, "auxiliary_loss_mlp": 0.00711637, "balance_loss_clip": 1.05234456, "balance_loss_mlp": 1.0005331, "epoch": 0.4756808753682438, "flos": 25155209531520.0, "grad_norm": 1.7634081587309165, "language_loss": 0.81007278, "learning_rate": 2.2540356394937573e-06, "loss": 0.82858884, "num_input_tokens_seen": 85012340, "step": 3956, "time_per_iteration": 2.753998041152954 }, { "auxiliary_loss_clip": 0.0113752, "auxiliary_loss_mlp": 0.01028971, "balance_loss_clip": 1.04863143, "balance_loss_mlp": 1.02041197, "epoch": 0.47580111825888294, "flos": 15669729573120.0, "grad_norm": 2.259413242040567, "language_loss": 0.83702999, "learning_rate": 2.253262957928772e-06, "loss": 0.85869491, "num_input_tokens_seen": 85029225, "step": 3957, "time_per_iteration": 2.6785619258880615 }, { "auxiliary_loss_clip": 0.01146312, "auxiliary_loss_mlp": 0.01028215, "balance_loss_clip": 1.05134106, "balance_loss_mlp": 1.02026987, "epoch": 0.47592136114952205, "flos": 17636556637440.0, "grad_norm": 1.74178849786016, "language_loss": 0.72026712, "learning_rate": 2.2524902379439976e-06, "loss": 0.74201238, "num_input_tokens_seen": 85047895, "step": 3958, "time_per_iteration": 2.6452550888061523 }, { "auxiliary_loss_clip": 0.01043139, "auxiliary_loss_mlp": 0.01014822, "balance_loss_clip": 1.04866731, "balance_loss_mlp": 1.01338506, "epoch": 0.4760416040401611, "flos": 61417159292160.0, "grad_norm": 0.7449637739670927, "language_loss": 0.63720262, "learning_rate": 2.251717479656655e-06, "loss": 0.6577822, "num_input_tokens_seen": 85112690, "step": 3959, "time_per_iteration": 3.3960227966308594 }, { "auxiliary_loss_clip": 0.01187146, "auxiliary_loss_mlp": 0.01027242, "balance_loss_clip": 1.05921948, "balance_loss_mlp": 1.01930308, "epoch": 0.4761618469308002, "flos": 18405871153920.0, "grad_norm": 1.920017085737174, "language_loss": 0.7580514, "learning_rate": 2.2509446831839704e-06, "loss": 0.78019536, "num_input_tokens_seen": 85132130, "step": 3960, "time_per_iteration": 2.7252676486968994 }, { "auxiliary_loss_clip": 0.01154608, "auxiliary_loss_mlp": 0.01031965, "balance_loss_clip": 1.05369782, "balance_loss_mlp": 1.02381659, "epoch": 0.4762820898214393, "flos": 18040911016320.0, "grad_norm": 2.262694404629515, "language_loss": 0.82407045, "learning_rate": 2.250171848643177e-06, "loss": 0.84593624, "num_input_tokens_seen": 85149420, "step": 3961, "time_per_iteration": 2.676609992980957 }, { "auxiliary_loss_clip": 0.01147494, "auxiliary_loss_mlp": 0.010234, "balance_loss_clip": 1.05418789, "balance_loss_mlp": 1.01651263, "epoch": 0.4764023327120784, "flos": 19318253541120.0, "grad_norm": 2.365937317729464, "language_loss": 0.8658427, "learning_rate": 2.249398976151513e-06, "loss": 0.88755155, "num_input_tokens_seen": 85166970, "step": 3962, "time_per_iteration": 2.638035774230957 }, { "auxiliary_loss_clip": 0.01182076, "auxiliary_loss_mlp": 0.01024451, "balance_loss_clip": 1.05660391, "balance_loss_mlp": 1.01680958, "epoch": 0.4765225756027175, "flos": 22747255539840.0, "grad_norm": 2.704539567633256, "language_loss": 0.78968602, "learning_rate": 2.248626065826223e-06, "loss": 0.81175137, "num_input_tokens_seen": 85185175, "step": 3963, "time_per_iteration": 3.533936023712158 }, { "auxiliary_loss_clip": 0.01114138, "auxiliary_loss_mlp": 0.01013322, "balance_loss_clip": 1.04956436, "balance_loss_mlp": 1.01178408, "epoch": 0.4766428184933566, "flos": 65933392106880.0, "grad_norm": 0.7653472612070314, "language_loss": 0.62534356, "learning_rate": 2.2478531177845564e-06, "loss": 0.64661813, "num_input_tokens_seen": 85246170, "step": 3964, "time_per_iteration": 4.157796621322632 }, { "auxiliary_loss_clip": 0.01158867, "auxiliary_loss_mlp": 0.01028732, "balance_loss_clip": 1.05792546, "balance_loss_mlp": 1.02069116, "epoch": 0.47676306138399566, "flos": 24136495908480.0, "grad_norm": 1.6975501006508518, "language_loss": 0.85157132, "learning_rate": 2.247080132143769e-06, "loss": 0.8734473, "num_input_tokens_seen": 85268525, "step": 3965, "time_per_iteration": 2.7281723022460938 }, { "auxiliary_loss_clip": 0.01135528, "auxiliary_loss_mlp": 0.01025374, "balance_loss_clip": 1.04984331, "balance_loss_mlp": 1.0174346, "epoch": 0.47688330427463477, "flos": 12604322995200.0, "grad_norm": 2.7262229511500107, "language_loss": 0.69318575, "learning_rate": 2.246307109021121e-06, "loss": 0.71479475, "num_input_tokens_seen": 85285930, "step": 3966, "time_per_iteration": 2.675600051879883 }, { "auxiliary_loss_clip": 0.01145984, "auxiliary_loss_mlp": 0.01025804, "balance_loss_clip": 1.05114412, "balance_loss_mlp": 1.01874089, "epoch": 0.4770035471652739, "flos": 21390585828480.0, "grad_norm": 1.638205640073282, "language_loss": 0.8245362, "learning_rate": 2.2455340485338817e-06, "loss": 0.84625411, "num_input_tokens_seen": 85303565, "step": 3967, "time_per_iteration": 2.7582554817199707 }, { "auxiliary_loss_clip": 0.01167413, "auxiliary_loss_mlp": 0.01023786, "balance_loss_clip": 1.05467594, "balance_loss_mlp": 1.01694608, "epoch": 0.47712379005591293, "flos": 25156251025920.0, "grad_norm": 2.308494902468561, "language_loss": 0.68002176, "learning_rate": 2.244760950799322e-06, "loss": 0.70193368, "num_input_tokens_seen": 85321835, "step": 3968, "time_per_iteration": 2.708583116531372 }, { "auxiliary_loss_clip": 0.01118302, "auxiliary_loss_mlp": 0.01026, "balance_loss_clip": 1.04836893, "balance_loss_mlp": 1.01881742, "epoch": 0.47724403294655204, "flos": 22054323294720.0, "grad_norm": 2.160389846860655, "language_loss": 0.72672403, "learning_rate": 2.2439878159347203e-06, "loss": 0.74816704, "num_input_tokens_seen": 85341260, "step": 3969, "time_per_iteration": 2.7668521404266357 }, { "auxiliary_loss_clip": 0.0111299, "auxiliary_loss_mlp": 0.01003059, "balance_loss_clip": 1.04859364, "balance_loss_mlp": 1.00153875, "epoch": 0.4773642758371911, "flos": 70229387658240.0, "grad_norm": 0.7304042414698871, "language_loss": 0.55280256, "learning_rate": 2.2432146440573616e-06, "loss": 0.57396305, "num_input_tokens_seen": 85407220, "step": 3970, "time_per_iteration": 3.2938077449798584 }, { "auxiliary_loss_clip": 0.01154117, "auxiliary_loss_mlp": 0.01023327, "balance_loss_clip": 1.05746615, "balance_loss_mlp": 1.01589465, "epoch": 0.4774845187278302, "flos": 23548602009600.0, "grad_norm": 2.7696056767440447, "language_loss": 0.66838205, "learning_rate": 2.242441435284534e-06, "loss": 0.69015652, "num_input_tokens_seen": 85426095, "step": 3971, "time_per_iteration": 2.7325477600097656 }, { "auxiliary_loss_clip": 0.01168022, "auxiliary_loss_mlp": 0.01025442, "balance_loss_clip": 1.05544078, "balance_loss_mlp": 1.0172348, "epoch": 0.4776047616184693, "flos": 23075371301760.0, "grad_norm": 2.246310890317124, "language_loss": 0.85194153, "learning_rate": 2.2416681897335337e-06, "loss": 0.87387621, "num_input_tokens_seen": 85444245, "step": 3972, "time_per_iteration": 2.7223494052886963 }, { "auxiliary_loss_clip": 0.01118018, "auxiliary_loss_mlp": 0.01025555, "balance_loss_clip": 1.05173135, "balance_loss_mlp": 1.01754105, "epoch": 0.4777250045091084, "flos": 31898119374720.0, "grad_norm": 4.784845111914011, "language_loss": 0.67033195, "learning_rate": 2.240894907521661e-06, "loss": 0.69176763, "num_input_tokens_seen": 85463325, "step": 3973, "time_per_iteration": 2.792841672897339 }, { "auxiliary_loss_clip": 0.01153924, "auxiliary_loss_mlp": 0.01021599, "balance_loss_clip": 1.05560422, "balance_loss_mlp": 1.01412177, "epoch": 0.4778452473997475, "flos": 24278163148800.0, "grad_norm": 1.9188023627813628, "language_loss": 0.63814235, "learning_rate": 2.240121588766223e-06, "loss": 0.65989757, "num_input_tokens_seen": 85483375, "step": 3974, "time_per_iteration": 2.6859869956970215 }, { "auxiliary_loss_clip": 0.01142225, "auxiliary_loss_mlp": 0.01023844, "balance_loss_clip": 1.04981995, "balance_loss_mlp": 1.01671267, "epoch": 0.4779654902903866, "flos": 31575031516800.0, "grad_norm": 2.157337931044969, "language_loss": 0.72065598, "learning_rate": 2.239348233584531e-06, "loss": 0.74231666, "num_input_tokens_seen": 85504230, "step": 3975, "time_per_iteration": 2.727385997772217 }, { "auxiliary_loss_clip": 0.01168401, "auxiliary_loss_mlp": 0.01024777, "balance_loss_clip": 1.05519724, "balance_loss_mlp": 1.01709962, "epoch": 0.47808573318102565, "flos": 19500428344320.0, "grad_norm": 2.1627732034976384, "language_loss": 0.8070851, "learning_rate": 2.2385748420939013e-06, "loss": 0.82901686, "num_input_tokens_seen": 85523425, "step": 3976, "time_per_iteration": 2.6393587589263916 }, { "auxiliary_loss_clip": 0.01182589, "auxiliary_loss_mlp": 0.01024559, "balance_loss_clip": 1.05935359, "balance_loss_mlp": 1.01745987, "epoch": 0.47820597607166476, "flos": 22601135013120.0, "grad_norm": 1.7768478055887515, "language_loss": 0.72638392, "learning_rate": 2.2378014144116583e-06, "loss": 0.74845535, "num_input_tokens_seen": 85542235, "step": 3977, "time_per_iteration": 2.6113080978393555 }, { "auxiliary_loss_clip": 0.01184569, "auxiliary_loss_mlp": 0.01030411, "balance_loss_clip": 1.05680203, "balance_loss_mlp": 1.02281725, "epoch": 0.4783262189623039, "flos": 23003011353600.0, "grad_norm": 1.8677911826881766, "language_loss": 0.80093896, "learning_rate": 2.23702795065513e-06, "loss": 0.82308877, "num_input_tokens_seen": 85561815, "step": 3978, "time_per_iteration": 2.6392099857330322 }, { "auxiliary_loss_clip": 0.01096874, "auxiliary_loss_mlp": 0.01004103, "balance_loss_clip": 1.04361415, "balance_loss_mlp": 1.00264287, "epoch": 0.47844646185294293, "flos": 49772801226240.0, "grad_norm": 0.9746330711731157, "language_loss": 0.67423385, "learning_rate": 2.2362544509416493e-06, "loss": 0.6952436, "num_input_tokens_seen": 85613930, "step": 3979, "time_per_iteration": 3.0664784908294678 }, { "auxiliary_loss_clip": 0.0114113, "auxiliary_loss_mlp": 0.01021515, "balance_loss_clip": 1.04885066, "balance_loss_mlp": 1.01417768, "epoch": 0.47856670474358204, "flos": 20229558520320.0, "grad_norm": 2.2338811740629203, "language_loss": 0.83182508, "learning_rate": 2.2354809153885572e-06, "loss": 0.85345149, "num_input_tokens_seen": 85631000, "step": 3980, "time_per_iteration": 2.6417641639709473 }, { "auxiliary_loss_clip": 0.01163535, "auxiliary_loss_mlp": 0.01028143, "balance_loss_clip": 1.05240571, "balance_loss_mlp": 1.02012038, "epoch": 0.47868694763422115, "flos": 20990936131200.0, "grad_norm": 4.276133164064332, "language_loss": 0.82986212, "learning_rate": 2.234707344113197e-06, "loss": 0.85177892, "num_input_tokens_seen": 85649095, "step": 3981, "time_per_iteration": 2.6228866577148438 }, { "auxiliary_loss_clip": 0.01178943, "auxiliary_loss_mlp": 0.01021211, "balance_loss_clip": 1.05481076, "balance_loss_mlp": 1.01401341, "epoch": 0.4788071905248602, "flos": 19026551191680.0, "grad_norm": 1.812998754711285, "language_loss": 0.77664018, "learning_rate": 2.233933737232919e-06, "loss": 0.79864168, "num_input_tokens_seen": 85666875, "step": 3982, "time_per_iteration": 2.615556240081787 }, { "auxiliary_loss_clip": 0.01106917, "auxiliary_loss_mlp": 0.00711472, "balance_loss_clip": 1.04472053, "balance_loss_mlp": 1.00051451, "epoch": 0.4789274334154993, "flos": 23002221254400.0, "grad_norm": 2.360186878458703, "language_loss": 0.78344822, "learning_rate": 2.2331600948650793e-06, "loss": 0.80163217, "num_input_tokens_seen": 85687020, "step": 3983, "time_per_iteration": 2.723313331604004 }, { "auxiliary_loss_clip": 0.01124956, "auxiliary_loss_mlp": 0.00713139, "balance_loss_clip": 1.05033362, "balance_loss_mlp": 1.00052702, "epoch": 0.4790476763061384, "flos": 23075586783360.0, "grad_norm": 1.555875801357667, "language_loss": 0.80234075, "learning_rate": 2.2323864171270386e-06, "loss": 0.82072175, "num_input_tokens_seen": 85708290, "step": 3984, "time_per_iteration": 2.720240354537964 }, { "auxiliary_loss_clip": 0.01139669, "auxiliary_loss_mlp": 0.01029047, "balance_loss_clip": 1.05180979, "balance_loss_mlp": 1.02111948, "epoch": 0.4791679191967775, "flos": 21179288073600.0, "grad_norm": 1.8328976160110666, "language_loss": 0.73037326, "learning_rate": 2.231612704136164e-06, "loss": 0.75206041, "num_input_tokens_seen": 85728660, "step": 3985, "time_per_iteration": 2.7279675006866455 }, { "auxiliary_loss_clip": 0.01160669, "auxiliary_loss_mlp": 0.01025245, "balance_loss_clip": 1.05335355, "balance_loss_mlp": 1.01729918, "epoch": 0.4792881620874166, "flos": 22301495758080.0, "grad_norm": 3.065289475268774, "language_loss": 0.7457248, "learning_rate": 2.2308389560098253e-06, "loss": 0.76758397, "num_input_tokens_seen": 85745035, "step": 3986, "time_per_iteration": 2.640777826309204 }, { "auxiliary_loss_clip": 0.0114131, "auxiliary_loss_mlp": 0.01022868, "balance_loss_clip": 1.05452156, "balance_loss_mlp": 1.01522708, "epoch": 0.47940840497805565, "flos": 17420877423360.0, "grad_norm": 3.6998115233631115, "language_loss": 0.77396464, "learning_rate": 2.2300651728654008e-06, "loss": 0.79560649, "num_input_tokens_seen": 85760295, "step": 3987, "time_per_iteration": 2.7228426933288574 }, { "auxiliary_loss_clip": 0.01078836, "auxiliary_loss_mlp": 0.0070201, "balance_loss_clip": 1.02970397, "balance_loss_mlp": 1.00008249, "epoch": 0.47952864786869476, "flos": 65358175708800.0, "grad_norm": 0.7330015920922379, "language_loss": 0.60153234, "learning_rate": 2.229291354820272e-06, "loss": 0.61934078, "num_input_tokens_seen": 85821305, "step": 3988, "time_per_iteration": 3.2447216510772705 }, { "auxiliary_loss_clip": 0.01166036, "auxiliary_loss_mlp": 0.01026878, "balance_loss_clip": 1.05354667, "balance_loss_mlp": 1.0189147, "epoch": 0.47964889075933387, "flos": 16799802336000.0, "grad_norm": 4.309412091777948, "language_loss": 0.7643199, "learning_rate": 2.228517501991828e-06, "loss": 0.78624904, "num_input_tokens_seen": 85840105, "step": 3989, "time_per_iteration": 4.371078014373779 }, { "auxiliary_loss_clip": 0.01066499, "auxiliary_loss_mlp": 0.01002048, "balance_loss_clip": 1.02797401, "balance_loss_mlp": 1.00066555, "epoch": 0.4797691336499729, "flos": 70079244808320.0, "grad_norm": 0.8137175891151284, "language_loss": 0.6104607, "learning_rate": 2.22774361449746e-06, "loss": 0.63114619, "num_input_tokens_seen": 85896585, "step": 3990, "time_per_iteration": 5.156203269958496 }, { "auxiliary_loss_clip": 0.0110841, "auxiliary_loss_mlp": 0.01025167, "balance_loss_clip": 1.05118704, "balance_loss_mlp": 1.01712036, "epoch": 0.47988937654061203, "flos": 18953329317120.0, "grad_norm": 22.01486282055838, "language_loss": 0.70862877, "learning_rate": 2.2269696924545668e-06, "loss": 0.72996455, "num_input_tokens_seen": 85914415, "step": 3991, "time_per_iteration": 2.7704825401306152 }, { "auxiliary_loss_clip": 0.01134828, "auxiliary_loss_mlp": 0.01032369, "balance_loss_clip": 1.05292845, "balance_loss_mlp": 1.0249784, "epoch": 0.48000961943125114, "flos": 14461981649280.0, "grad_norm": 2.1304595221693385, "language_loss": 0.78231138, "learning_rate": 2.2261957359805523e-06, "loss": 0.80398339, "num_input_tokens_seen": 85931650, "step": 3992, "time_per_iteration": 2.651568651199341 }, { "auxiliary_loss_clip": 0.01181717, "auxiliary_loss_mlp": 0.01026828, "balance_loss_clip": 1.05606604, "balance_loss_mlp": 1.01909745, "epoch": 0.4801298623218902, "flos": 27051149105280.0, "grad_norm": 2.5307642162622503, "language_loss": 0.73609853, "learning_rate": 2.225421745192823e-06, "loss": 0.75818396, "num_input_tokens_seen": 85951805, "step": 3993, "time_per_iteration": 2.6974070072174072 }, { "auxiliary_loss_clip": 0.01163549, "auxiliary_loss_mlp": 0.01024036, "balance_loss_clip": 1.05250359, "balance_loss_mlp": 1.01621032, "epoch": 0.4802501052125293, "flos": 26355236031360.0, "grad_norm": 2.223268890231703, "language_loss": 0.78460658, "learning_rate": 2.2246477202087955e-06, "loss": 0.80648255, "num_input_tokens_seen": 85972485, "step": 3994, "time_per_iteration": 2.64544415473938 }, { "auxiliary_loss_clip": 0.01148145, "auxiliary_loss_mlp": 0.01021949, "balance_loss_clip": 1.04943776, "balance_loss_mlp": 1.01452541, "epoch": 0.4803703481031684, "flos": 20993916960000.0, "grad_norm": 1.7622192745010947, "language_loss": 0.8267588, "learning_rate": 2.223873661145887e-06, "loss": 0.84845972, "num_input_tokens_seen": 85992540, "step": 3995, "time_per_iteration": 2.666632652282715 }, { "auxiliary_loss_clip": 0.01153547, "auxiliary_loss_mlp": 0.00711619, "balance_loss_clip": 1.05785847, "balance_loss_mlp": 1.00045395, "epoch": 0.4804905909938075, "flos": 20703722981760.0, "grad_norm": 1.6883014074770946, "language_loss": 0.71138442, "learning_rate": 2.2230995681215226e-06, "loss": 0.73003602, "num_input_tokens_seen": 86012065, "step": 3996, "time_per_iteration": 2.639486312866211 }, { "auxiliary_loss_clip": 0.01130877, "auxiliary_loss_mlp": 0.010239, "balance_loss_clip": 1.04992139, "balance_loss_mlp": 1.01596653, "epoch": 0.4806108338844466, "flos": 16654831044480.0, "grad_norm": 2.0229422837962123, "language_loss": 0.78301328, "learning_rate": 2.2223254412531305e-06, "loss": 0.80456102, "num_input_tokens_seen": 86029435, "step": 3997, "time_per_iteration": 2.714519739151001 }, { "auxiliary_loss_clip": 0.01137444, "auxiliary_loss_mlp": 0.01024987, "balance_loss_clip": 1.04972601, "balance_loss_mlp": 1.01731014, "epoch": 0.4807310767750857, "flos": 20011329440640.0, "grad_norm": 1.9390213488873358, "language_loss": 0.82712114, "learning_rate": 2.221551280658146e-06, "loss": 0.84874547, "num_input_tokens_seen": 86048495, "step": 3998, "time_per_iteration": 2.647228956222534 }, { "auxiliary_loss_clip": 0.01116309, "auxiliary_loss_mlp": 0.01025065, "balance_loss_clip": 1.0497787, "balance_loss_mlp": 1.01761174, "epoch": 0.48085131966572475, "flos": 23185257984000.0, "grad_norm": 2.3413465963656983, "language_loss": 0.740816, "learning_rate": 2.2207770864540085e-06, "loss": 0.7622298, "num_input_tokens_seen": 86067470, "step": 3999, "time_per_iteration": 2.748762369155884 }, { "auxiliary_loss_clip": 0.0114377, "auxiliary_loss_mlp": 0.0102388, "balance_loss_clip": 1.05105186, "balance_loss_mlp": 1.01619732, "epoch": 0.48097156255636386, "flos": 20558643949440.0, "grad_norm": 1.8423389771914016, "language_loss": 0.73103333, "learning_rate": 2.220002858758162e-06, "loss": 0.75270987, "num_input_tokens_seen": 86085460, "step": 4000, "time_per_iteration": 2.6245617866516113 }, { "auxiliary_loss_clip": 0.01088468, "auxiliary_loss_mlp": 0.01007322, "balance_loss_clip": 1.03597665, "balance_loss_mlp": 1.00587988, "epoch": 0.481091805447003, "flos": 70511608817280.0, "grad_norm": 0.8810675203222941, "language_loss": 0.60848117, "learning_rate": 2.2192285976880573e-06, "loss": 0.62943912, "num_input_tokens_seen": 86149715, "step": 4001, "time_per_iteration": 3.276183843612671 }, { "auxiliary_loss_clip": 0.01134314, "auxiliary_loss_mlp": 0.00711317, "balance_loss_clip": 1.05062842, "balance_loss_mlp": 1.00055408, "epoch": 0.48121204833764203, "flos": 36428214839040.0, "grad_norm": 1.8782493021760238, "language_loss": 0.81091416, "learning_rate": 2.2184543033611485e-06, "loss": 0.8293705, "num_input_tokens_seen": 86170795, "step": 4002, "time_per_iteration": 2.8844034671783447 }, { "auxiliary_loss_clip": 0.01165827, "auxiliary_loss_mlp": 0.01021052, "balance_loss_clip": 1.05144262, "balance_loss_mlp": 1.01405454, "epoch": 0.48133229122828114, "flos": 27490264871040.0, "grad_norm": 2.053532033047985, "language_loss": 0.82058978, "learning_rate": 2.2176799758948957e-06, "loss": 0.84245861, "num_input_tokens_seen": 86190955, "step": 4003, "time_per_iteration": 2.6417627334594727 }, { "auxiliary_loss_clip": 0.01144234, "auxiliary_loss_mlp": 0.01032323, "balance_loss_clip": 1.05098689, "balance_loss_mlp": 1.02393055, "epoch": 0.4814525341189202, "flos": 43072802179200.0, "grad_norm": 2.102256714973451, "language_loss": 0.73440444, "learning_rate": 2.2169056154067635e-06, "loss": 0.75616997, "num_input_tokens_seen": 86214875, "step": 4004, "time_per_iteration": 2.8217663764953613 }, { "auxiliary_loss_clip": 0.0116452, "auxiliary_loss_mlp": 0.0071148, "balance_loss_clip": 1.05328763, "balance_loss_mlp": 1.0004189, "epoch": 0.4815727770095593, "flos": 24236901400320.0, "grad_norm": 2.0771764329537072, "language_loss": 0.82420599, "learning_rate": 2.216131222014222e-06, "loss": 0.84296596, "num_input_tokens_seen": 86232950, "step": 4005, "time_per_iteration": 2.5986580848693848 }, { "auxiliary_loss_clip": 0.01130646, "auxiliary_loss_mlp": 0.01024563, "balance_loss_clip": 1.0535084, "balance_loss_mlp": 1.01605189, "epoch": 0.4816930199001984, "flos": 18113630100480.0, "grad_norm": 2.6263917355561897, "language_loss": 0.80823654, "learning_rate": 2.2153567958347455e-06, "loss": 0.82978863, "num_input_tokens_seen": 86249160, "step": 4006, "time_per_iteration": 2.6967010498046875 }, { "auxiliary_loss_clip": 0.01148173, "auxiliary_loss_mlp": 0.01026066, "balance_loss_clip": 1.05272603, "balance_loss_mlp": 1.01869321, "epoch": 0.48181326279083747, "flos": 17274720983040.0, "grad_norm": 2.217678479278178, "language_loss": 0.79725683, "learning_rate": 2.214582336985815e-06, "loss": 0.81899923, "num_input_tokens_seen": 86267060, "step": 4007, "time_per_iteration": 2.6055479049682617 }, { "auxiliary_loss_clip": 0.01142737, "auxiliary_loss_mlp": 0.01031636, "balance_loss_clip": 1.05137146, "balance_loss_mlp": 1.02397668, "epoch": 0.4819335056814766, "flos": 14903252231040.0, "grad_norm": 2.0484983650488844, "language_loss": 0.66262162, "learning_rate": 2.2138078455849142e-06, "loss": 0.68436539, "num_input_tokens_seen": 86285055, "step": 4008, "time_per_iteration": 2.689025640487671 }, { "auxiliary_loss_clip": 0.01169728, "auxiliary_loss_mlp": 0.01026069, "balance_loss_clip": 1.05299783, "balance_loss_mlp": 1.01884472, "epoch": 0.4820537485721157, "flos": 19244888012160.0, "grad_norm": 2.1648224348192042, "language_loss": 0.7869029, "learning_rate": 2.2130333217495334e-06, "loss": 0.80886084, "num_input_tokens_seen": 86304225, "step": 4009, "time_per_iteration": 2.752782106399536 }, { "auxiliary_loss_clip": 0.01143697, "auxiliary_loss_mlp": 0.0102473, "balance_loss_clip": 1.05000436, "balance_loss_mlp": 1.01687384, "epoch": 0.48217399146275475, "flos": 16033791870720.0, "grad_norm": 2.6515405796565283, "language_loss": 0.68025696, "learning_rate": 2.2122587655971665e-06, "loss": 0.70194113, "num_input_tokens_seen": 86319170, "step": 4010, "time_per_iteration": 2.648137092590332 }, { "auxiliary_loss_clip": 0.01148973, "auxiliary_loss_mlp": 0.01023872, "balance_loss_clip": 1.05042577, "balance_loss_mlp": 1.01649296, "epoch": 0.48229423435339386, "flos": 24134197438080.0, "grad_norm": 1.6086650961385802, "language_loss": 0.64159656, "learning_rate": 2.211484177245314e-06, "loss": 0.66332495, "num_input_tokens_seen": 86338760, "step": 4011, "time_per_iteration": 2.6698219776153564 }, { "auxiliary_loss_clip": 0.01181158, "auxiliary_loss_mlp": 0.01022794, "balance_loss_clip": 1.0543313, "balance_loss_mlp": 1.01521873, "epoch": 0.48241447724403297, "flos": 23805435231360.0, "grad_norm": 2.097042252350175, "language_loss": 0.72500795, "learning_rate": 2.21070955681148e-06, "loss": 0.74704748, "num_input_tokens_seen": 86357865, "step": 4012, "time_per_iteration": 2.6042017936706543 }, { "auxiliary_loss_clip": 0.01122066, "auxiliary_loss_mlp": 0.01028962, "balance_loss_clip": 1.04725695, "balance_loss_mlp": 1.02173197, "epoch": 0.482534720134672, "flos": 23110312256640.0, "grad_norm": 1.7748047135148646, "language_loss": 0.78246546, "learning_rate": 2.209934904413174e-06, "loss": 0.8039757, "num_input_tokens_seen": 86379470, "step": 4013, "time_per_iteration": 2.7443976402282715 }, { "auxiliary_loss_clip": 0.01097614, "auxiliary_loss_mlp": 0.0102331, "balance_loss_clip": 1.0412786, "balance_loss_mlp": 1.01562154, "epoch": 0.48265496302531113, "flos": 20923819568640.0, "grad_norm": 2.9992774162025673, "language_loss": 0.71720517, "learning_rate": 2.2091602201679095e-06, "loss": 0.73841441, "num_input_tokens_seen": 86399080, "step": 4014, "time_per_iteration": 2.8262646198272705 }, { "auxiliary_loss_clip": 0.01135762, "auxiliary_loss_mlp": 0.01022368, "balance_loss_clip": 1.04915297, "balance_loss_mlp": 1.01524854, "epoch": 0.48277520591595025, "flos": 15231152511360.0, "grad_norm": 2.154478139288988, "language_loss": 0.83402419, "learning_rate": 2.208385504193206e-06, "loss": 0.85560548, "num_input_tokens_seen": 86416580, "step": 4015, "time_per_iteration": 5.540787220001221 }, { "auxiliary_loss_clip": 0.01179798, "auxiliary_loss_mlp": 0.01025239, "balance_loss_clip": 1.05360854, "balance_loss_mlp": 1.01773739, "epoch": 0.4828954488065893, "flos": 17858664385920.0, "grad_norm": 2.0959519097512547, "language_loss": 0.80859572, "learning_rate": 2.2076107566065873e-06, "loss": 0.83064604, "num_input_tokens_seen": 86434365, "step": 4016, "time_per_iteration": 3.53135085105896 }, { "auxiliary_loss_clip": 0.01165392, "auxiliary_loss_mlp": 0.01025733, "balance_loss_clip": 1.05118155, "balance_loss_mlp": 1.01845503, "epoch": 0.4830156916972284, "flos": 32087405070720.0, "grad_norm": 5.861617616941002, "language_loss": 0.75786215, "learning_rate": 2.2068359775255816e-06, "loss": 0.77977335, "num_input_tokens_seen": 86452675, "step": 4017, "time_per_iteration": 2.756258010864258 }, { "auxiliary_loss_clip": 0.01112255, "auxiliary_loss_mlp": 0.01022656, "balance_loss_clip": 1.04554081, "balance_loss_mlp": 1.01500857, "epoch": 0.48313593458786747, "flos": 21871717528320.0, "grad_norm": 4.360967368527996, "language_loss": 0.78682834, "learning_rate": 2.206061167067723e-06, "loss": 0.80817747, "num_input_tokens_seen": 86470785, "step": 4018, "time_per_iteration": 2.754983901977539 }, { "auxiliary_loss_clip": 0.0112834, "auxiliary_loss_mlp": 0.01026217, "balance_loss_clip": 1.04595375, "balance_loss_mlp": 1.01816416, "epoch": 0.4832561774785066, "flos": 22601206840320.0, "grad_norm": 2.385651706092646, "language_loss": 0.79769558, "learning_rate": 2.205286325350549e-06, "loss": 0.81924117, "num_input_tokens_seen": 86489850, "step": 4019, "time_per_iteration": 2.7524399757385254 }, { "auxiliary_loss_clip": 0.01114337, "auxiliary_loss_mlp": 0.01031107, "balance_loss_clip": 1.04512143, "balance_loss_mlp": 1.02313185, "epoch": 0.4833764203691457, "flos": 13437342282240.0, "grad_norm": 2.2591482714290367, "language_loss": 0.72090435, "learning_rate": 2.204511452491603e-06, "loss": 0.7423588, "num_input_tokens_seen": 86506475, "step": 4020, "time_per_iteration": 2.682908773422241 }, { "auxiliary_loss_clip": 0.01176897, "auxiliary_loss_mlp": 0.01024194, "balance_loss_clip": 1.05460322, "balance_loss_mlp": 1.01691616, "epoch": 0.48349666325978474, "flos": 44128036955520.0, "grad_norm": 2.575747367583557, "language_loss": 0.75091356, "learning_rate": 2.2037365486084316e-06, "loss": 0.77292442, "num_input_tokens_seen": 86529715, "step": 4021, "time_per_iteration": 2.850135326385498 }, { "auxiliary_loss_clip": 0.01140601, "auxiliary_loss_mlp": 0.01025508, "balance_loss_clip": 1.04805756, "balance_loss_mlp": 1.0180099, "epoch": 0.48361690615042385, "flos": 26028377245440.0, "grad_norm": 2.0283542337073364, "language_loss": 0.77784371, "learning_rate": 2.2029616138185886e-06, "loss": 0.79950482, "num_input_tokens_seen": 86548715, "step": 4022, "time_per_iteration": 2.7050087451934814 }, { "auxiliary_loss_clip": 0.0113364, "auxiliary_loss_mlp": 0.01019867, "balance_loss_clip": 1.05451095, "balance_loss_mlp": 1.01232123, "epoch": 0.48373714904106296, "flos": 22273306560000.0, "grad_norm": 1.7729789029050584, "language_loss": 0.82613695, "learning_rate": 2.202186648239629e-06, "loss": 0.84767199, "num_input_tokens_seen": 86568650, "step": 4023, "time_per_iteration": 2.7527379989624023 }, { "auxiliary_loss_clip": 0.01161256, "auxiliary_loss_mlp": 0.01021391, "balance_loss_clip": 1.05260909, "balance_loss_mlp": 1.01364219, "epoch": 0.483857391931702, "flos": 28292293699200.0, "grad_norm": 1.7982526826500833, "language_loss": 0.71820331, "learning_rate": 2.201411651989117e-06, "loss": 0.74002981, "num_input_tokens_seen": 86590630, "step": 4024, "time_per_iteration": 2.6621365547180176 }, { "auxiliary_loss_clip": 0.01144892, "auxiliary_loss_mlp": 0.00711006, "balance_loss_clip": 1.05240726, "balance_loss_mlp": 1.00041163, "epoch": 0.48397763482234113, "flos": 27418048577280.0, "grad_norm": 3.3019823772983483, "language_loss": 0.78069258, "learning_rate": 2.2006366251846167e-06, "loss": 0.79925156, "num_input_tokens_seen": 86611270, "step": 4025, "time_per_iteration": 2.723597764968872 }, { "auxiliary_loss_clip": 0.01147369, "auxiliary_loss_mlp": 0.01024373, "balance_loss_clip": 1.05233908, "balance_loss_mlp": 1.01738095, "epoch": 0.48409787771298024, "flos": 16797252470400.0, "grad_norm": 2.0577642245486394, "language_loss": 0.75956732, "learning_rate": 2.1998615679436997e-06, "loss": 0.78128475, "num_input_tokens_seen": 86628810, "step": 4026, "time_per_iteration": 2.615161657333374 }, { "auxiliary_loss_clip": 0.01155453, "auxiliary_loss_mlp": 0.01029972, "balance_loss_clip": 1.0504266, "balance_loss_mlp": 1.02217007, "epoch": 0.4842181206036193, "flos": 25083496028160.0, "grad_norm": 2.419549307998138, "language_loss": 0.77064627, "learning_rate": 2.199086480383942e-06, "loss": 0.7925005, "num_input_tokens_seen": 86648185, "step": 4027, "time_per_iteration": 2.685325860977173 }, { "auxiliary_loss_clip": 0.01163968, "auxiliary_loss_mlp": 0.01031704, "balance_loss_clip": 1.05474973, "balance_loss_mlp": 1.02300787, "epoch": 0.4843383634942584, "flos": 30372311496960.0, "grad_norm": 2.70420755966929, "language_loss": 0.67983639, "learning_rate": 2.1983113626229234e-06, "loss": 0.70179307, "num_input_tokens_seen": 86667435, "step": 4028, "time_per_iteration": 2.713959217071533 }, { "auxiliary_loss_clip": 0.01124736, "auxiliary_loss_mlp": 0.00711574, "balance_loss_clip": 1.04542947, "balance_loss_mlp": 1.00053692, "epoch": 0.4844586063848975, "flos": 20413564917120.0, "grad_norm": 1.7424921262302286, "language_loss": 0.78586602, "learning_rate": 2.1975362147782293e-06, "loss": 0.80422914, "num_input_tokens_seen": 86686630, "step": 4029, "time_per_iteration": 2.7073214054107666 }, { "auxiliary_loss_clip": 0.01082472, "auxiliary_loss_mlp": 0.01002764, "balance_loss_clip": 1.04886568, "balance_loss_mlp": 1.00136971, "epoch": 0.48457884927553657, "flos": 70303722854400.0, "grad_norm": 0.6952803607856071, "language_loss": 0.54094887, "learning_rate": 2.196761036967448e-06, "loss": 0.5618012, "num_input_tokens_seen": 86754595, "step": 4030, "time_per_iteration": 3.383683204650879 }, { "auxiliary_loss_clip": 0.01156896, "auxiliary_loss_mlp": 0.01028705, "balance_loss_clip": 1.05042863, "balance_loss_mlp": 1.02151966, "epoch": 0.4846990921661757, "flos": 19934516206080.0, "grad_norm": 1.8137956536439335, "language_loss": 0.77573287, "learning_rate": 2.1959858293081743e-06, "loss": 0.79758888, "num_input_tokens_seen": 86773730, "step": 4031, "time_per_iteration": 2.6464946269989014 }, { "auxiliary_loss_clip": 0.01130824, "auxiliary_loss_mlp": 0.01024042, "balance_loss_clip": 1.05060458, "balance_loss_mlp": 1.01637375, "epoch": 0.4848193350568148, "flos": 23075945919360.0, "grad_norm": 1.6944712648007005, "language_loss": 0.7598623, "learning_rate": 2.1952105919180056e-06, "loss": 0.78141093, "num_input_tokens_seen": 86792985, "step": 4032, "time_per_iteration": 2.7287545204162598 }, { "auxiliary_loss_clip": 0.0114611, "auxiliary_loss_mlp": 0.01023377, "balance_loss_clip": 1.05302739, "balance_loss_mlp": 1.01549733, "epoch": 0.48493957794745385, "flos": 22455481363200.0, "grad_norm": 2.2716432452296322, "language_loss": 0.67953932, "learning_rate": 2.1944353249145456e-06, "loss": 0.70123422, "num_input_tokens_seen": 86812095, "step": 4033, "time_per_iteration": 2.6389153003692627 }, { "auxiliary_loss_clip": 0.01179659, "auxiliary_loss_mlp": 0.01028957, "balance_loss_clip": 1.05561471, "balance_loss_mlp": 1.02210808, "epoch": 0.48505982083809296, "flos": 25046112948480.0, "grad_norm": 2.5026549111403273, "language_loss": 0.7490772, "learning_rate": 2.193660028415401e-06, "loss": 0.77116334, "num_input_tokens_seen": 86832875, "step": 4034, "time_per_iteration": 2.676236629486084 }, { "auxiliary_loss_clip": 0.01137938, "auxiliary_loss_mlp": 0.01026244, "balance_loss_clip": 1.04770994, "balance_loss_mlp": 1.01871347, "epoch": 0.485180063728732, "flos": 26761386090240.0, "grad_norm": 2.725136816101358, "language_loss": 0.81756234, "learning_rate": 2.1928847025381852e-06, "loss": 0.83920419, "num_input_tokens_seen": 86853480, "step": 4035, "time_per_iteration": 2.661862850189209 }, { "auxiliary_loss_clip": 0.01160755, "auxiliary_loss_mlp": 0.01022477, "balance_loss_clip": 1.04881501, "balance_loss_mlp": 1.01490712, "epoch": 0.4853003066193711, "flos": 24059143969920.0, "grad_norm": 1.7047911937943643, "language_loss": 0.84019732, "learning_rate": 2.192109347400512e-06, "loss": 0.86202967, "num_input_tokens_seen": 86873695, "step": 4036, "time_per_iteration": 2.657029390335083 }, { "auxiliary_loss_clip": 0.01150587, "auxiliary_loss_mlp": 0.01023445, "balance_loss_clip": 1.05026555, "balance_loss_mlp": 1.01515436, "epoch": 0.48542054951001024, "flos": 23076376882560.0, "grad_norm": 1.7426768279389664, "language_loss": 0.79042339, "learning_rate": 2.191333963120004e-06, "loss": 0.81216371, "num_input_tokens_seen": 86892675, "step": 4037, "time_per_iteration": 2.6638004779815674 }, { "auxiliary_loss_clip": 0.01147313, "auxiliary_loss_mlp": 0.01023611, "balance_loss_clip": 1.05049729, "balance_loss_mlp": 1.01589835, "epoch": 0.4855407924006493, "flos": 25664889565440.0, "grad_norm": 2.524743383479417, "language_loss": 0.70151865, "learning_rate": 2.190558549814286e-06, "loss": 0.72322786, "num_input_tokens_seen": 86912835, "step": 4038, "time_per_iteration": 2.6685945987701416 }, { "auxiliary_loss_clip": 0.01144582, "auxiliary_loss_mlp": 0.01031232, "balance_loss_clip": 1.0485425, "balance_loss_mlp": 1.02365589, "epoch": 0.4856610352912884, "flos": 23987933256960.0, "grad_norm": 1.754759807993893, "language_loss": 0.79461855, "learning_rate": 2.1897831076009872e-06, "loss": 0.81637669, "num_input_tokens_seen": 86932475, "step": 4039, "time_per_iteration": 2.678347110748291 }, { "auxiliary_loss_clip": 0.01162766, "auxiliary_loss_mlp": 0.01025922, "balance_loss_clip": 1.05200589, "balance_loss_mlp": 1.01816726, "epoch": 0.4857812781819275, "flos": 24096814358400.0, "grad_norm": 1.8115696219609465, "language_loss": 0.7978667, "learning_rate": 2.1890076365977426e-06, "loss": 0.81975353, "num_input_tokens_seen": 86952300, "step": 4040, "time_per_iteration": 2.690664291381836 }, { "auxiliary_loss_clip": 0.01069146, "auxiliary_loss_mlp": 0.01006559, "balance_loss_clip": 1.03016686, "balance_loss_mlp": 1.0050925, "epoch": 0.48590152107256657, "flos": 56266635185280.0, "grad_norm": 0.8591336674804926, "language_loss": 0.5279631, "learning_rate": 2.188232136922189e-06, "loss": 0.54872012, "num_input_tokens_seen": 87010420, "step": 4041, "time_per_iteration": 4.9890663623809814 }, { "auxiliary_loss_clip": 0.01093564, "auxiliary_loss_mlp": 0.01022806, "balance_loss_clip": 1.04509914, "balance_loss_mlp": 1.01526868, "epoch": 0.4860217639632057, "flos": 20046988667520.0, "grad_norm": 2.699756907202141, "language_loss": 0.76287597, "learning_rate": 2.187456608691971e-06, "loss": 0.78403968, "num_input_tokens_seen": 87029295, "step": 4042, "time_per_iteration": 3.7164194583892822 }, { "auxiliary_loss_clip": 0.01138267, "auxiliary_loss_mlp": 0.01024371, "balance_loss_clip": 1.0540204, "balance_loss_mlp": 1.01733184, "epoch": 0.4861420068538448, "flos": 17822143232640.0, "grad_norm": 1.8775768491426446, "language_loss": 0.87362313, "learning_rate": 2.1866810520247334e-06, "loss": 0.89524949, "num_input_tokens_seen": 87048165, "step": 4043, "time_per_iteration": 2.683289051055908 }, { "auxiliary_loss_clip": 0.0116382, "auxiliary_loss_mlp": 0.01024856, "balance_loss_clip": 1.04861593, "balance_loss_mlp": 1.01701164, "epoch": 0.48626224974448384, "flos": 26250125857920.0, "grad_norm": 1.9452435161937551, "language_loss": 0.65175587, "learning_rate": 2.185905467038129e-06, "loss": 0.67364264, "num_input_tokens_seen": 87067070, "step": 4044, "time_per_iteration": 2.6757655143737793 }, { "auxiliary_loss_clip": 0.01178856, "auxiliary_loss_mlp": 0.0102509, "balance_loss_clip": 1.05639148, "balance_loss_mlp": 1.01803839, "epoch": 0.48638249263512295, "flos": 22054502862720.0, "grad_norm": 2.1491382828023737, "language_loss": 0.7772305, "learning_rate": 2.1851298538498127e-06, "loss": 0.79926991, "num_input_tokens_seen": 87086785, "step": 4045, "time_per_iteration": 2.5550074577331543 }, { "auxiliary_loss_clip": 0.01171639, "auxiliary_loss_mlp": 0.00712229, "balance_loss_clip": 1.05478096, "balance_loss_mlp": 1.00072193, "epoch": 0.48650273552576206, "flos": 25119945354240.0, "grad_norm": 1.943874538077178, "language_loss": 0.80270016, "learning_rate": 2.184354212577446e-06, "loss": 0.82153881, "num_input_tokens_seen": 87107090, "step": 4046, "time_per_iteration": 2.7148120403289795 }, { "auxiliary_loss_clip": 0.01182301, "auxiliary_loss_mlp": 0.01023938, "balance_loss_clip": 1.05307031, "balance_loss_mlp": 1.0160408, "epoch": 0.4866229784164011, "flos": 17456931699840.0, "grad_norm": 3.661197939072979, "language_loss": 0.63216162, "learning_rate": 2.1835785433386907e-06, "loss": 0.65422398, "num_input_tokens_seen": 87125905, "step": 4047, "time_per_iteration": 2.546997547149658 }, { "auxiliary_loss_clip": 0.01121477, "auxiliary_loss_mlp": 0.01030105, "balance_loss_clip": 1.04745579, "balance_loss_mlp": 1.02214158, "epoch": 0.48674322130704023, "flos": 23331127115520.0, "grad_norm": 1.9117318490105486, "language_loss": 0.65365022, "learning_rate": 2.182802846251216e-06, "loss": 0.67516607, "num_input_tokens_seen": 87146175, "step": 4048, "time_per_iteration": 2.73245906829834 }, { "auxiliary_loss_clip": 0.01131987, "auxiliary_loss_mlp": 0.01028757, "balance_loss_clip": 1.0463382, "balance_loss_mlp": 1.02059102, "epoch": 0.4868634641976793, "flos": 28804344030720.0, "grad_norm": 1.8659626537126845, "language_loss": 0.72615641, "learning_rate": 2.182027121432696e-06, "loss": 0.74776387, "num_input_tokens_seen": 87166800, "step": 4049, "time_per_iteration": 2.7311177253723145 }, { "auxiliary_loss_clip": 0.0118327, "auxiliary_loss_mlp": 0.01029006, "balance_loss_clip": 1.05435014, "balance_loss_mlp": 1.02100754, "epoch": 0.4869837070883184, "flos": 19025976574080.0, "grad_norm": 2.3442750394245873, "language_loss": 0.8246296, "learning_rate": 2.1812513690008054e-06, "loss": 0.84675235, "num_input_tokens_seen": 87185920, "step": 4050, "time_per_iteration": 2.6107544898986816 }, { "auxiliary_loss_clip": 0.01173309, "auxiliary_loss_mlp": 0.0102376, "balance_loss_clip": 1.05504751, "balance_loss_mlp": 1.01554668, "epoch": 0.4871039499789575, "flos": 15121409483520.0, "grad_norm": 2.962788375480231, "language_loss": 0.7995832, "learning_rate": 2.180475589073227e-06, "loss": 0.82155395, "num_input_tokens_seen": 87203620, "step": 4051, "time_per_iteration": 2.6214792728424072 }, { "auxiliary_loss_clip": 0.01152581, "auxiliary_loss_mlp": 0.01022267, "balance_loss_clip": 1.04854608, "balance_loss_mlp": 1.01486373, "epoch": 0.48722419286959656, "flos": 26174066808960.0, "grad_norm": 1.8729702720883137, "language_loss": 0.73579621, "learning_rate": 2.1796997817676456e-06, "loss": 0.75754464, "num_input_tokens_seen": 87224630, "step": 4052, "time_per_iteration": 2.656519889831543 }, { "auxiliary_loss_clip": 0.01166581, "auxiliary_loss_mlp": 0.0071119, "balance_loss_clip": 1.05338919, "balance_loss_mlp": 1.00066781, "epoch": 0.4873444357602357, "flos": 24026142349440.0, "grad_norm": 1.7748042657356444, "language_loss": 0.67577291, "learning_rate": 2.1789239472017494e-06, "loss": 0.69455063, "num_input_tokens_seen": 87246280, "step": 4053, "time_per_iteration": 2.7469279766082764 }, { "auxiliary_loss_clip": 0.01129383, "auxiliary_loss_mlp": 0.01023362, "balance_loss_clip": 1.04863811, "balance_loss_mlp": 1.01583385, "epoch": 0.4874646786508748, "flos": 22820441500800.0, "grad_norm": 2.8768440247103144, "language_loss": 0.72966683, "learning_rate": 2.1781480854932326e-06, "loss": 0.75119424, "num_input_tokens_seen": 87266045, "step": 4054, "time_per_iteration": 2.7197372913360596 }, { "auxiliary_loss_clip": 0.01112918, "auxiliary_loss_mlp": 0.01028456, "balance_loss_clip": 1.04618347, "balance_loss_mlp": 1.02089787, "epoch": 0.48758492154151384, "flos": 21287594557440.0, "grad_norm": 2.8793523449325096, "language_loss": 0.79315573, "learning_rate": 2.1773721967597933e-06, "loss": 0.81456947, "num_input_tokens_seen": 87284495, "step": 4055, "time_per_iteration": 2.731457233428955 }, { "auxiliary_loss_clip": 0.01060239, "auxiliary_loss_mlp": 0.01004568, "balance_loss_clip": 1.0252918, "balance_loss_mlp": 1.00310802, "epoch": 0.48770516443215295, "flos": 62244109180800.0, "grad_norm": 0.8452099582333743, "language_loss": 0.57430339, "learning_rate": 2.1765962811191322e-06, "loss": 0.59495151, "num_input_tokens_seen": 87338960, "step": 4056, "time_per_iteration": 3.142422914505005 }, { "auxiliary_loss_clip": 0.01044137, "auxiliary_loss_mlp": 0.01003086, "balance_loss_clip": 1.03907871, "balance_loss_mlp": 1.00160146, "epoch": 0.48782540732279206, "flos": 66133451882880.0, "grad_norm": 0.8269990354761295, "language_loss": 0.62033147, "learning_rate": 2.1758203386889566e-06, "loss": 0.64080375, "num_input_tokens_seen": 87401730, "step": 4057, "time_per_iteration": 3.30424427986145 }, { "auxiliary_loss_clip": 0.0113473, "auxiliary_loss_mlp": 0.0071174, "balance_loss_clip": 1.05163383, "balance_loss_mlp": 1.00068903, "epoch": 0.4879456502134311, "flos": 14607922608000.0, "grad_norm": 2.562784456289223, "language_loss": 0.84660447, "learning_rate": 2.1750443695869746e-06, "loss": 0.86506921, "num_input_tokens_seen": 87417300, "step": 4058, "time_per_iteration": 2.6408231258392334 }, { "auxiliary_loss_clip": 0.01165495, "auxiliary_loss_mlp": 0.01024367, "balance_loss_clip": 1.05192494, "balance_loss_mlp": 1.01661205, "epoch": 0.4880658931040702, "flos": 19500464257920.0, "grad_norm": 2.4283606303798164, "language_loss": 0.85714382, "learning_rate": 2.174268373930901e-06, "loss": 0.87904245, "num_input_tokens_seen": 87434815, "step": 4059, "time_per_iteration": 2.7046732902526855 }, { "auxiliary_loss_clip": 0.01125055, "auxiliary_loss_mlp": 0.00711952, "balance_loss_clip": 1.04982424, "balance_loss_mlp": 1.00059676, "epoch": 0.48818613599470934, "flos": 16723060928640.0, "grad_norm": 1.9828920223464346, "language_loss": 0.79654455, "learning_rate": 2.1734923518384537e-06, "loss": 0.81491458, "num_input_tokens_seen": 87451420, "step": 4060, "time_per_iteration": 2.669586658477783 }, { "auxiliary_loss_clip": 0.01118515, "auxiliary_loss_mlp": 0.0102079, "balance_loss_clip": 1.04834533, "balance_loss_mlp": 1.01343524, "epoch": 0.4883063788853484, "flos": 26756932803840.0, "grad_norm": 2.4436558137984496, "language_loss": 0.82229733, "learning_rate": 2.1727163034273547e-06, "loss": 0.8436904, "num_input_tokens_seen": 87469585, "step": 4061, "time_per_iteration": 2.7528083324432373 }, { "auxiliary_loss_clip": 0.0116471, "auxiliary_loss_mlp": 0.01022152, "balance_loss_clip": 1.05122793, "balance_loss_mlp": 1.01520181, "epoch": 0.4884266217759875, "flos": 16763388923520.0, "grad_norm": 2.4918724711697235, "language_loss": 0.78899682, "learning_rate": 2.17194022881533e-06, "loss": 0.8108654, "num_input_tokens_seen": 87485675, "step": 4062, "time_per_iteration": 2.63202166557312 }, { "auxiliary_loss_clip": 0.01152387, "auxiliary_loss_mlp": 0.01025292, "balance_loss_clip": 1.05145764, "balance_loss_mlp": 1.01704848, "epoch": 0.4885468646666266, "flos": 24207132003840.0, "grad_norm": 2.0967383179350736, "language_loss": 0.67572427, "learning_rate": 2.1711641281201092e-06, "loss": 0.69750106, "num_input_tokens_seen": 87505605, "step": 4063, "time_per_iteration": 2.6459999084472656 }, { "auxiliary_loss_clip": 0.0116423, "auxiliary_loss_mlp": 0.01028258, "balance_loss_clip": 1.05470395, "balance_loss_mlp": 1.02044415, "epoch": 0.48866710755726567, "flos": 14610795696000.0, "grad_norm": 2.510468507953748, "language_loss": 0.79225379, "learning_rate": 2.1703880014594264e-06, "loss": 0.81417871, "num_input_tokens_seen": 87523195, "step": 4064, "time_per_iteration": 2.7051901817321777 }, { "auxiliary_loss_clip": 0.01114578, "auxiliary_loss_mlp": 0.01030919, "balance_loss_clip": 1.05295312, "balance_loss_mlp": 1.02341735, "epoch": 0.4887873504479048, "flos": 28804451771520.0, "grad_norm": 1.983385075372312, "language_loss": 0.73850161, "learning_rate": 2.1696118489510182e-06, "loss": 0.7599566, "num_input_tokens_seen": 87544125, "step": 4065, "time_per_iteration": 2.8360965251922607 }, { "auxiliary_loss_clip": 0.01137502, "auxiliary_loss_mlp": 0.00711715, "balance_loss_clip": 1.05052054, "balance_loss_mlp": 1.00059557, "epoch": 0.48890759333854383, "flos": 22784387224320.0, "grad_norm": 1.8142040430014847, "language_loss": 0.72918737, "learning_rate": 2.1688356707126286e-06, "loss": 0.74767947, "num_input_tokens_seen": 87563745, "step": 4066, "time_per_iteration": 2.760899305343628 }, { "auxiliary_loss_clip": 0.01126533, "auxiliary_loss_mlp": 0.01029916, "balance_loss_clip": 1.04863131, "balance_loss_mlp": 1.0218153, "epoch": 0.48902783622918294, "flos": 17786088956160.0, "grad_norm": 2.161105781594209, "language_loss": 0.70089966, "learning_rate": 2.168059466862001e-06, "loss": 0.72246414, "num_input_tokens_seen": 87581895, "step": 4067, "time_per_iteration": 4.456729888916016 }, { "auxiliary_loss_clip": 0.01148521, "auxiliary_loss_mlp": 0.01022788, "balance_loss_clip": 1.04940033, "balance_loss_mlp": 1.01554894, "epoch": 0.48914807911982205, "flos": 22310294590080.0, "grad_norm": 2.639505407434014, "language_loss": 0.81722128, "learning_rate": 2.167283237516887e-06, "loss": 0.8389343, "num_input_tokens_seen": 87600170, "step": 4068, "time_per_iteration": 4.5337746143341064 }, { "auxiliary_loss_clip": 0.01149824, "auxiliary_loss_mlp": 0.01021038, "balance_loss_clip": 1.05084658, "balance_loss_mlp": 1.01334929, "epoch": 0.4892683220104611, "flos": 16363020954240.0, "grad_norm": 1.9708340130080333, "language_loss": 0.74863786, "learning_rate": 2.1665069827950383e-06, "loss": 0.77034652, "num_input_tokens_seen": 87617455, "step": 4069, "time_per_iteration": 2.625169277191162 }, { "auxiliary_loss_clip": 0.01145634, "auxiliary_loss_mlp": 0.01023749, "balance_loss_clip": 1.0500505, "balance_loss_mlp": 1.01660824, "epoch": 0.4893885649011002, "flos": 15739144606080.0, "grad_norm": 2.2182881670780215, "language_loss": 0.86921751, "learning_rate": 2.1657307028142126e-06, "loss": 0.89091134, "num_input_tokens_seen": 87634995, "step": 4070, "time_per_iteration": 2.6537413597106934 }, { "auxiliary_loss_clip": 0.01152428, "auxiliary_loss_mlp": 0.01030478, "balance_loss_clip": 1.0538826, "balance_loss_mlp": 1.02249098, "epoch": 0.48950880779173933, "flos": 28581984887040.0, "grad_norm": 1.9655274970203689, "language_loss": 0.67351782, "learning_rate": 2.164954397692171e-06, "loss": 0.69534695, "num_input_tokens_seen": 87654420, "step": 4071, "time_per_iteration": 2.731797218322754 }, { "auxiliary_loss_clip": 0.01080444, "auxiliary_loss_mlp": 0.01003487, "balance_loss_clip": 1.03977346, "balance_loss_mlp": 1.00210381, "epoch": 0.4896290506823784, "flos": 66186310746240.0, "grad_norm": 1.0963182597251424, "language_loss": 0.77316111, "learning_rate": 2.164178067546678e-06, "loss": 0.79400039, "num_input_tokens_seen": 87713585, "step": 4072, "time_per_iteration": 3.312124013900757 }, { "auxiliary_loss_clip": 0.01151375, "auxiliary_loss_mlp": 0.01024453, "balance_loss_clip": 1.04803061, "balance_loss_mlp": 1.01713347, "epoch": 0.4897492935730175, "flos": 12531065207040.0, "grad_norm": 2.307295081212084, "language_loss": 0.91149604, "learning_rate": 2.163401712495504e-06, "loss": 0.93325436, "num_input_tokens_seen": 87731280, "step": 4073, "time_per_iteration": 2.6799979209899902 }, { "auxiliary_loss_clip": 0.01121567, "auxiliary_loss_mlp": 0.01028911, "balance_loss_clip": 1.0499928, "balance_loss_mlp": 1.02104366, "epoch": 0.4898695364636566, "flos": 23476816679040.0, "grad_norm": 2.127215064377336, "language_loss": 0.79288489, "learning_rate": 2.1626253326564194e-06, "loss": 0.81438965, "num_input_tokens_seen": 87750230, "step": 4074, "time_per_iteration": 2.7665717601776123 }, { "auxiliary_loss_clip": 0.01146694, "auxiliary_loss_mlp": 0.01022425, "balance_loss_clip": 1.05010378, "balance_loss_mlp": 1.0148133, "epoch": 0.48998977935429566, "flos": 27160209774720.0, "grad_norm": 1.9165956107247752, "language_loss": 0.77333629, "learning_rate": 2.161848928147201e-06, "loss": 0.79502743, "num_input_tokens_seen": 87770500, "step": 4075, "time_per_iteration": 2.783604860305786 }, { "auxiliary_loss_clip": 0.01164028, "auxiliary_loss_mlp": 0.0102229, "balance_loss_clip": 1.05312157, "balance_loss_mlp": 1.01463127, "epoch": 0.4901100222449348, "flos": 20339588856960.0, "grad_norm": 2.0561299183547743, "language_loss": 0.80977458, "learning_rate": 2.161072499085629e-06, "loss": 0.83163774, "num_input_tokens_seen": 87789495, "step": 4076, "time_per_iteration": 2.658071994781494 }, { "auxiliary_loss_clip": 0.01135996, "auxiliary_loss_mlp": 0.01023755, "balance_loss_clip": 1.05065608, "balance_loss_mlp": 1.01637602, "epoch": 0.4902302651355739, "flos": 30446359384320.0, "grad_norm": 2.374120365043637, "language_loss": 0.83012986, "learning_rate": 2.160296045589487e-06, "loss": 0.85172737, "num_input_tokens_seen": 87812955, "step": 4077, "time_per_iteration": 2.809400796890259 }, { "auxiliary_loss_clip": 0.01163485, "auxiliary_loss_mlp": 0.01026865, "balance_loss_clip": 1.05334878, "balance_loss_mlp": 1.01903319, "epoch": 0.49035050802621294, "flos": 19174180089600.0, "grad_norm": 1.6758380572107305, "language_loss": 0.69942904, "learning_rate": 2.159519567776562e-06, "loss": 0.72133249, "num_input_tokens_seen": 87832605, "step": 4078, "time_per_iteration": 2.6215319633483887 }, { "auxiliary_loss_clip": 0.01119349, "auxiliary_loss_mlp": 0.01020282, "balance_loss_clip": 1.0452137, "balance_loss_mlp": 1.01250076, "epoch": 0.49047075091685205, "flos": 22228489365120.0, "grad_norm": 3.1268095207529485, "language_loss": 0.70991063, "learning_rate": 2.1587430657646463e-06, "loss": 0.73130685, "num_input_tokens_seen": 87846040, "step": 4079, "time_per_iteration": 2.74155592918396 }, { "auxiliary_loss_clip": 0.01145051, "auxiliary_loss_mlp": 0.01027897, "balance_loss_clip": 1.05122542, "balance_loss_mlp": 1.02052426, "epoch": 0.4905909938074911, "flos": 20156516213760.0, "grad_norm": 2.0219046515144505, "language_loss": 0.77766836, "learning_rate": 2.157966539671533e-06, "loss": 0.79939789, "num_input_tokens_seen": 87865680, "step": 4080, "time_per_iteration": 2.681177854537964 }, { "auxiliary_loss_clip": 0.01135275, "auxiliary_loss_mlp": 0.01029438, "balance_loss_clip": 1.0495559, "balance_loss_mlp": 1.02203512, "epoch": 0.4907112366981302, "flos": 17202217380480.0, "grad_norm": 2.1015367610253928, "language_loss": 0.67818481, "learning_rate": 2.157189989615021e-06, "loss": 0.6998319, "num_input_tokens_seen": 87884270, "step": 4081, "time_per_iteration": 2.704601526260376 }, { "auxiliary_loss_clip": 0.01166595, "auxiliary_loss_mlp": 0.00712244, "balance_loss_clip": 1.05310488, "balance_loss_mlp": 1.00052989, "epoch": 0.4908314795887693, "flos": 21688968107520.0, "grad_norm": 1.877333429902249, "language_loss": 0.74808609, "learning_rate": 2.156413415712913e-06, "loss": 0.76687455, "num_input_tokens_seen": 87906320, "step": 4082, "time_per_iteration": 2.6412100791931152 }, { "auxiliary_loss_clip": 0.01151961, "auxiliary_loss_mlp": 0.00711913, "balance_loss_clip": 1.05082035, "balance_loss_mlp": 1.00054073, "epoch": 0.4909517224794084, "flos": 26213676531840.0, "grad_norm": 4.412194912311375, "language_loss": 0.7864877, "learning_rate": 2.155636818083014e-06, "loss": 0.80512643, "num_input_tokens_seen": 87927690, "step": 4083, "time_per_iteration": 2.736954927444458 }, { "auxiliary_loss_clip": 0.01143633, "auxiliary_loss_mlp": 0.01025695, "balance_loss_clip": 1.05158496, "balance_loss_mlp": 1.01818502, "epoch": 0.4910719653700475, "flos": 23148377694720.0, "grad_norm": 2.219869398877821, "language_loss": 0.84842014, "learning_rate": 2.154860196843134e-06, "loss": 0.87011337, "num_input_tokens_seen": 87946885, "step": 4084, "time_per_iteration": 2.6338398456573486 }, { "auxiliary_loss_clip": 0.01178807, "auxiliary_loss_mlp": 0.01033009, "balance_loss_clip": 1.05207825, "balance_loss_mlp": 1.02560592, "epoch": 0.4911922082606866, "flos": 23331845387520.0, "grad_norm": 1.8109754711415957, "language_loss": 0.76778901, "learning_rate": 2.154083552111085e-06, "loss": 0.78990716, "num_input_tokens_seen": 87966055, "step": 4085, "time_per_iteration": 2.608726978302002 }, { "auxiliary_loss_clip": 0.01179807, "auxiliary_loss_mlp": 0.0102721, "balance_loss_clip": 1.05250406, "balance_loss_mlp": 1.01957512, "epoch": 0.49131245115132566, "flos": 29203239542400.0, "grad_norm": 1.795719852099356, "language_loss": 0.81819451, "learning_rate": 2.1533068840046834e-06, "loss": 0.84026468, "num_input_tokens_seen": 87986320, "step": 4086, "time_per_iteration": 2.653986692428589 }, { "auxiliary_loss_clip": 0.01140066, "auxiliary_loss_mlp": 0.00711912, "balance_loss_clip": 1.04889059, "balance_loss_mlp": 1.00063109, "epoch": 0.49143269404196477, "flos": 20147465986560.0, "grad_norm": 2.550549643332826, "language_loss": 0.61503732, "learning_rate": 2.152530192641749e-06, "loss": 0.6335572, "num_input_tokens_seen": 88001230, "step": 4087, "time_per_iteration": 2.690164566040039 }, { "auxiliary_loss_clip": 0.01168337, "auxiliary_loss_mlp": 0.01023026, "balance_loss_clip": 1.05446029, "balance_loss_mlp": 1.01575422, "epoch": 0.4915529369326039, "flos": 24389809597440.0, "grad_norm": 2.7896200596948333, "language_loss": 0.7269609, "learning_rate": 2.1517534781401068e-06, "loss": 0.74887455, "num_input_tokens_seen": 88019110, "step": 4088, "time_per_iteration": 2.586845874786377 }, { "auxiliary_loss_clip": 0.01162787, "auxiliary_loss_mlp": 0.01023347, "balance_loss_clip": 1.0520978, "balance_loss_mlp": 1.01536846, "epoch": 0.49167317982324293, "flos": 10524305197440.0, "grad_norm": 3.57909656049644, "language_loss": 0.69348812, "learning_rate": 2.150976740617581e-06, "loss": 0.71534944, "num_input_tokens_seen": 88035670, "step": 4089, "time_per_iteration": 2.6498591899871826 }, { "auxiliary_loss_clip": 0.01151592, "auxiliary_loss_mlp": 0.01023324, "balance_loss_clip": 1.05204964, "balance_loss_mlp": 1.01640689, "epoch": 0.49179342271388204, "flos": 25593427457280.0, "grad_norm": 2.199108673377899, "language_loss": 0.71033871, "learning_rate": 2.150199980192006e-06, "loss": 0.73208791, "num_input_tokens_seen": 88054790, "step": 4090, "time_per_iteration": 2.7108826637268066 }, { "auxiliary_loss_clip": 0.01141625, "auxiliary_loss_mlp": 0.01024021, "balance_loss_clip": 1.04798865, "balance_loss_mlp": 1.01661241, "epoch": 0.49191366560452116, "flos": 21102043875840.0, "grad_norm": 1.8006992692905646, "language_loss": 0.80812776, "learning_rate": 2.1494231969812114e-06, "loss": 0.82978415, "num_input_tokens_seen": 88073780, "step": 4091, "time_per_iteration": 2.6409199237823486 }, { "auxiliary_loss_clip": 0.01135741, "auxiliary_loss_mlp": 0.01032512, "balance_loss_clip": 1.04987645, "balance_loss_mlp": 1.02508211, "epoch": 0.4920339084951602, "flos": 26067520091520.0, "grad_norm": 5.684734980673886, "language_loss": 0.81045854, "learning_rate": 2.1486463911030372e-06, "loss": 0.83214104, "num_input_tokens_seen": 88094430, "step": 4092, "time_per_iteration": 2.735466480255127 }, { "auxiliary_loss_clip": 0.01144977, "auxiliary_loss_mlp": 0.01028936, "balance_loss_clip": 1.04828143, "balance_loss_mlp": 1.02124739, "epoch": 0.4921541513857993, "flos": 25081269384960.0, "grad_norm": 1.8356754515989084, "language_loss": 0.74424207, "learning_rate": 2.147869562675324e-06, "loss": 0.7659812, "num_input_tokens_seen": 88113400, "step": 4093, "time_per_iteration": 4.493034362792969 }, { "auxiliary_loss_clip": 0.01162194, "auxiliary_loss_mlp": 0.01022339, "balance_loss_clip": 1.05228198, "balance_loss_mlp": 1.0143466, "epoch": 0.49227439427643843, "flos": 24389809597440.0, "grad_norm": 2.09934236483781, "language_loss": 0.72791648, "learning_rate": 2.147092711815915e-06, "loss": 0.74976182, "num_input_tokens_seen": 88132750, "step": 4094, "time_per_iteration": 4.459111452102661 }, { "auxiliary_loss_clip": 0.01131744, "auxiliary_loss_mlp": 0.01025899, "balance_loss_clip": 1.0528847, "balance_loss_mlp": 1.01796246, "epoch": 0.4923946371670775, "flos": 11363753018880.0, "grad_norm": 4.300862239491634, "language_loss": 0.86306983, "learning_rate": 2.1463158386426593e-06, "loss": 0.88464618, "num_input_tokens_seen": 88150560, "step": 4095, "time_per_iteration": 2.6967945098876953 }, { "auxiliary_loss_clip": 0.01154445, "auxiliary_loss_mlp": 0.01025563, "balance_loss_clip": 1.05304217, "balance_loss_mlp": 1.01726055, "epoch": 0.4925148800577166, "flos": 30445964334720.0, "grad_norm": 2.105332129738295, "language_loss": 0.77621478, "learning_rate": 2.145538943273407e-06, "loss": 0.79801488, "num_input_tokens_seen": 88170835, "step": 4096, "time_per_iteration": 2.739065170288086 }, { "auxiliary_loss_clip": 0.01183863, "auxiliary_loss_mlp": 0.01024954, "balance_loss_clip": 1.05791068, "balance_loss_mlp": 1.01688933, "epoch": 0.49263512294835565, "flos": 20850454039680.0, "grad_norm": 2.01781856714299, "language_loss": 0.72104371, "learning_rate": 2.144762025826013e-06, "loss": 0.74313188, "num_input_tokens_seen": 88189925, "step": 4097, "time_per_iteration": 2.6180782318115234 }, { "auxiliary_loss_clip": 0.01167048, "auxiliary_loss_mlp": 0.01023178, "balance_loss_clip": 1.05287254, "balance_loss_mlp": 1.0159961, "epoch": 0.49275536583899476, "flos": 23767477534080.0, "grad_norm": 2.7881672728996088, "language_loss": 0.87045956, "learning_rate": 2.143985086418334e-06, "loss": 0.89236182, "num_input_tokens_seen": 88205105, "step": 4098, "time_per_iteration": 2.6623575687408447 }, { "auxiliary_loss_clip": 0.01149285, "auxiliary_loss_mlp": 0.01025433, "balance_loss_clip": 1.05069566, "balance_loss_mlp": 1.01858163, "epoch": 0.4928756087296339, "flos": 22273522041600.0, "grad_norm": 1.4934179642873238, "language_loss": 0.7693367, "learning_rate": 2.1432081251682324e-06, "loss": 0.79108393, "num_input_tokens_seen": 88225475, "step": 4099, "time_per_iteration": 2.6771628856658936 }, { "auxiliary_loss_clip": 0.01166136, "auxiliary_loss_mlp": 0.01030981, "balance_loss_clip": 1.0579108, "balance_loss_mlp": 1.02336383, "epoch": 0.49299585162027293, "flos": 19645471463040.0, "grad_norm": 1.948474485710973, "language_loss": 0.87351573, "learning_rate": 2.142431142193572e-06, "loss": 0.89548695, "num_input_tokens_seen": 88243255, "step": 4100, "time_per_iteration": 2.6565327644348145 }, { "auxiliary_loss_clip": 0.01177017, "auxiliary_loss_mlp": 0.01022592, "balance_loss_clip": 1.05438364, "balance_loss_mlp": 1.01574922, "epoch": 0.49311609451091204, "flos": 38837138497920.0, "grad_norm": 2.051460069253648, "language_loss": 0.7148968, "learning_rate": 2.1416541376122207e-06, "loss": 0.73689294, "num_input_tokens_seen": 88263435, "step": 4101, "time_per_iteration": 2.742633104324341 }, { "auxiliary_loss_clip": 0.01181069, "auxiliary_loss_mlp": 0.01026364, "balance_loss_clip": 1.05539155, "balance_loss_mlp": 1.0187465, "epoch": 0.49323633740155115, "flos": 28329102161280.0, "grad_norm": 2.1196183060346727, "language_loss": 0.73012227, "learning_rate": 2.1408771115420496e-06, "loss": 0.75219661, "num_input_tokens_seen": 88283295, "step": 4102, "time_per_iteration": 2.6509511470794678 }, { "auxiliary_loss_clip": 0.01124627, "auxiliary_loss_mlp": 0.01024494, "balance_loss_clip": 1.05499482, "balance_loss_mlp": 1.01736832, "epoch": 0.4933565802921902, "flos": 21135584200320.0, "grad_norm": 2.2205469960702753, "language_loss": 0.64639652, "learning_rate": 2.140100064100932e-06, "loss": 0.66788769, "num_input_tokens_seen": 88299270, "step": 4103, "time_per_iteration": 2.713179588317871 }, { "auxiliary_loss_clip": 0.01158137, "auxiliary_loss_mlp": 0.01025045, "balance_loss_clip": 1.05024052, "balance_loss_mlp": 1.01780045, "epoch": 0.4934768231828293, "flos": 18039007595520.0, "grad_norm": 2.008206270949598, "language_loss": 0.76229286, "learning_rate": 2.139322995406746e-06, "loss": 0.78412473, "num_input_tokens_seen": 88316905, "step": 4104, "time_per_iteration": 2.6137349605560303 }, { "auxiliary_loss_clip": 0.01181648, "auxiliary_loss_mlp": 0.0102281, "balance_loss_clip": 1.05632448, "balance_loss_mlp": 1.01579714, "epoch": 0.4935970660734684, "flos": 23469957181440.0, "grad_norm": 2.7673549344238952, "language_loss": 0.79711795, "learning_rate": 2.1385459055773727e-06, "loss": 0.81916255, "num_input_tokens_seen": 88335095, "step": 4105, "time_per_iteration": 2.635847330093384 }, { "auxiliary_loss_clip": 0.01102394, "auxiliary_loss_mlp": 0.00710339, "balance_loss_clip": 1.0461483, "balance_loss_mlp": 1.00050843, "epoch": 0.4937173089641075, "flos": 64479258840960.0, "grad_norm": 1.8958061353597937, "language_loss": 0.73932827, "learning_rate": 2.137768794730696e-06, "loss": 0.75745559, "num_input_tokens_seen": 88358545, "step": 4106, "time_per_iteration": 3.0949127674102783 }, { "auxiliary_loss_clip": 0.01152768, "auxiliary_loss_mlp": 0.0103017, "balance_loss_clip": 1.05462837, "balance_loss_mlp": 1.02236819, "epoch": 0.4938375518547466, "flos": 22346025644160.0, "grad_norm": 1.9843945164944345, "language_loss": 0.80189466, "learning_rate": 2.1369916629846026e-06, "loss": 0.82372403, "num_input_tokens_seen": 88378295, "step": 4107, "time_per_iteration": 2.6768596172332764 }, { "auxiliary_loss_clip": 0.01149796, "auxiliary_loss_mlp": 0.0102714, "balance_loss_clip": 1.0520829, "balance_loss_mlp": 1.01975513, "epoch": 0.4939577947453857, "flos": 17858700299520.0, "grad_norm": 2.8798726351364023, "language_loss": 0.74698186, "learning_rate": 2.136214510456983e-06, "loss": 0.76875126, "num_input_tokens_seen": 88396750, "step": 4108, "time_per_iteration": 2.717181444168091 }, { "auxiliary_loss_clip": 0.01059406, "auxiliary_loss_mlp": 0.00701413, "balance_loss_clip": 1.04046583, "balance_loss_mlp": 1.00024283, "epoch": 0.49407803763602476, "flos": 70066746875520.0, "grad_norm": 0.8838642706661625, "language_loss": 0.6315273, "learning_rate": 2.1354373372657296e-06, "loss": 0.64913553, "num_input_tokens_seen": 88455190, "step": 4109, "time_per_iteration": 3.391511917114258 }, { "auxiliary_loss_clip": 0.01179932, "auxiliary_loss_mlp": 0.0102444, "balance_loss_clip": 1.0578351, "balance_loss_mlp": 1.01715624, "epoch": 0.49419828052666387, "flos": 24317485562880.0, "grad_norm": 1.7074821447218929, "language_loss": 0.70810211, "learning_rate": 2.1346601435287404e-06, "loss": 0.73014581, "num_input_tokens_seen": 88477460, "step": 4110, "time_per_iteration": 2.591458559036255 }, { "auxiliary_loss_clip": 0.01147024, "auxiliary_loss_mlp": 0.01025579, "balance_loss_clip": 1.05087137, "balance_loss_mlp": 1.0184983, "epoch": 0.494318523417303, "flos": 29386060790400.0, "grad_norm": 1.970504935003462, "language_loss": 0.80538321, "learning_rate": 2.1338829293639144e-06, "loss": 0.82710922, "num_input_tokens_seen": 88497820, "step": 4111, "time_per_iteration": 2.727670907974243 }, { "auxiliary_loss_clip": 0.01119369, "auxiliary_loss_mlp": 0.01029429, "balance_loss_clip": 1.05238605, "balance_loss_mlp": 1.02166212, "epoch": 0.49443876630794203, "flos": 15268284195840.0, "grad_norm": 2.063311431011542, "language_loss": 0.83095878, "learning_rate": 2.1331056948891547e-06, "loss": 0.85244673, "num_input_tokens_seen": 88514920, "step": 4112, "time_per_iteration": 2.6968531608581543 }, { "auxiliary_loss_clip": 0.01144321, "auxiliary_loss_mlp": 0.01026781, "balance_loss_clip": 1.05200815, "balance_loss_mlp": 1.01904464, "epoch": 0.49455900919858115, "flos": 12347453859840.0, "grad_norm": 2.2464287319345138, "language_loss": 0.76447982, "learning_rate": 2.1323284402223666e-06, "loss": 0.78619087, "num_input_tokens_seen": 88530910, "step": 4113, "time_per_iteration": 2.6609737873077393 }, { "auxiliary_loss_clip": 0.01178059, "auxiliary_loss_mlp": 0.00709892, "balance_loss_clip": 1.05656815, "balance_loss_mlp": 1.00054836, "epoch": 0.4946792520892202, "flos": 22779610715520.0, "grad_norm": 1.868003713897026, "language_loss": 0.88102841, "learning_rate": 2.1315511654814597e-06, "loss": 0.89990795, "num_input_tokens_seen": 88549320, "step": 4114, "time_per_iteration": 2.583930730819702 }, { "auxiliary_loss_clip": 0.01141189, "auxiliary_loss_mlp": 0.01021497, "balance_loss_clip": 1.05129802, "balance_loss_mlp": 1.0144546, "epoch": 0.4947994949798593, "flos": 23148126299520.0, "grad_norm": 2.6399096057610456, "language_loss": 0.78463143, "learning_rate": 2.1307738707843456e-06, "loss": 0.8062582, "num_input_tokens_seen": 88568985, "step": 4115, "time_per_iteration": 2.6845362186431885 }, { "auxiliary_loss_clip": 0.01169322, "auxiliary_loss_mlp": 0.01023117, "balance_loss_clip": 1.05543113, "balance_loss_mlp": 1.01523471, "epoch": 0.4949197378704984, "flos": 23659997063040.0, "grad_norm": 2.3603638324002816, "language_loss": 0.69158757, "learning_rate": 2.1299965562489385e-06, "loss": 0.71351194, "num_input_tokens_seen": 88588790, "step": 4116, "time_per_iteration": 2.6440341472625732 }, { "auxiliary_loss_clip": 0.01160439, "auxiliary_loss_mlp": 0.01026088, "balance_loss_clip": 1.05036414, "balance_loss_mlp": 1.01867032, "epoch": 0.4950399807611375, "flos": 26911493026560.0, "grad_norm": 2.0307253907544403, "language_loss": 0.79101294, "learning_rate": 2.129219221993158e-06, "loss": 0.81287819, "num_input_tokens_seen": 88613575, "step": 4117, "time_per_iteration": 2.724475622177124 }, { "auxiliary_loss_clip": 0.01066243, "auxiliary_loss_mlp": 0.01003151, "balance_loss_clip": 1.04153562, "balance_loss_mlp": 1.0018878, "epoch": 0.4951602236517766, "flos": 67315270187520.0, "grad_norm": 0.8129034374895032, "language_loss": 0.59978354, "learning_rate": 2.128441868134924e-06, "loss": 0.62047756, "num_input_tokens_seen": 88675510, "step": 4118, "time_per_iteration": 3.2949323654174805 }, { "auxiliary_loss_clip": 0.0113406, "auxiliary_loss_mlp": 0.01018208, "balance_loss_clip": 1.04918492, "balance_loss_mlp": 1.01060247, "epoch": 0.4952804665424157, "flos": 19901442758400.0, "grad_norm": 2.429140499350479, "language_loss": 0.82786059, "learning_rate": 2.1276644947921606e-06, "loss": 0.84938329, "num_input_tokens_seen": 88694425, "step": 4119, "time_per_iteration": 3.5869359970092773 }, { "auxiliary_loss_clip": 0.01164324, "auxiliary_loss_mlp": 0.01020861, "balance_loss_clip": 1.05337691, "balance_loss_mlp": 1.01270068, "epoch": 0.49540070943305475, "flos": 18806813740800.0, "grad_norm": 2.919023582942522, "language_loss": 0.82752907, "learning_rate": 2.126887102082795e-06, "loss": 0.84938085, "num_input_tokens_seen": 88714450, "step": 4120, "time_per_iteration": 5.433320045471191 }, { "auxiliary_loss_clip": 0.01127661, "auxiliary_loss_mlp": 0.01021921, "balance_loss_clip": 1.04529667, "balance_loss_mlp": 1.01478028, "epoch": 0.49552095232369386, "flos": 24934179191040.0, "grad_norm": 1.7208957223065047, "language_loss": 0.70124364, "learning_rate": 2.126109690124757e-06, "loss": 0.72273946, "num_input_tokens_seen": 88735265, "step": 4121, "time_per_iteration": 2.792301654815674 }, { "auxiliary_loss_clip": 0.01118073, "auxiliary_loss_mlp": 0.01027098, "balance_loss_clip": 1.04858661, "balance_loss_mlp": 1.01894689, "epoch": 0.495641195214333, "flos": 22857249962880.0, "grad_norm": 2.0445838383377786, "language_loss": 0.70866406, "learning_rate": 2.1253322590359786e-06, "loss": 0.73011577, "num_input_tokens_seen": 88754600, "step": 4122, "time_per_iteration": 2.748800039291382 }, { "auxiliary_loss_clip": 0.01160891, "auxiliary_loss_mlp": 0.01029375, "balance_loss_clip": 1.05128872, "balance_loss_mlp": 1.02111995, "epoch": 0.49576143810497203, "flos": 25769748343680.0, "grad_norm": 2.3925161700425823, "language_loss": 0.74332386, "learning_rate": 2.124554808934397e-06, "loss": 0.76522654, "num_input_tokens_seen": 88775180, "step": 4123, "time_per_iteration": 2.6646323204040527 }, { "auxiliary_loss_clip": 0.01111392, "auxiliary_loss_mlp": 0.01025797, "balance_loss_clip": 1.04605246, "balance_loss_mlp": 1.01894879, "epoch": 0.49588168099561114, "flos": 22128838058880.0, "grad_norm": 1.7729469271711624, "language_loss": 0.73274136, "learning_rate": 2.1237773399379496e-06, "loss": 0.75411332, "num_input_tokens_seen": 88796145, "step": 4124, "time_per_iteration": 2.7526514530181885 }, { "auxiliary_loss_clip": 0.01153792, "auxiliary_loss_mlp": 0.01023947, "balance_loss_clip": 1.04966795, "balance_loss_mlp": 1.01635385, "epoch": 0.49600192388625025, "flos": 24387331559040.0, "grad_norm": 1.8581438786269757, "language_loss": 0.87064564, "learning_rate": 2.122999852164578e-06, "loss": 0.89242303, "num_input_tokens_seen": 88816765, "step": 4125, "time_per_iteration": 2.663377523422241 }, { "auxiliary_loss_clip": 0.01114815, "auxiliary_loss_mlp": 0.0102716, "balance_loss_clip": 1.04859817, "balance_loss_mlp": 1.01933098, "epoch": 0.4961221667768893, "flos": 22857429530880.0, "grad_norm": 6.802172449639333, "language_loss": 0.58331418, "learning_rate": 2.122222345732227e-06, "loss": 0.60473394, "num_input_tokens_seen": 88836680, "step": 4126, "time_per_iteration": 2.715257406234741 }, { "auxiliary_loss_clip": 0.01132957, "auxiliary_loss_mlp": 0.01029254, "balance_loss_clip": 1.04924452, "balance_loss_mlp": 1.02126682, "epoch": 0.4962424096675284, "flos": 17858089768320.0, "grad_norm": 2.295312765254754, "language_loss": 0.82983255, "learning_rate": 2.121444820758843e-06, "loss": 0.85145468, "num_input_tokens_seen": 88855320, "step": 4127, "time_per_iteration": 2.6624560356140137 }, { "auxiliary_loss_clip": 0.01117609, "auxiliary_loss_mlp": 0.01031184, "balance_loss_clip": 1.05024755, "balance_loss_mlp": 1.02290487, "epoch": 0.49636265255816747, "flos": 21793611404160.0, "grad_norm": 2.3287794641500663, "language_loss": 0.78247118, "learning_rate": 2.120667277362376e-06, "loss": 0.80395907, "num_input_tokens_seen": 88874035, "step": 4128, "time_per_iteration": 2.691826105117798 }, { "auxiliary_loss_clip": 0.01184537, "auxiliary_loss_mlp": 0.01030664, "balance_loss_clip": 1.05788589, "balance_loss_mlp": 1.02278149, "epoch": 0.4964828954488066, "flos": 16358603581440.0, "grad_norm": 2.3462019909860805, "language_loss": 0.84903854, "learning_rate": 2.1198897156607796e-06, "loss": 0.87119055, "num_input_tokens_seen": 88891390, "step": 4129, "time_per_iteration": 2.589529275894165 }, { "auxiliary_loss_clip": 0.01168434, "auxiliary_loss_mlp": 0.01022458, "balance_loss_clip": 1.0523783, "balance_loss_mlp": 1.01501393, "epoch": 0.4966031383394457, "flos": 24711101775360.0, "grad_norm": 2.595544179490769, "language_loss": 0.73819113, "learning_rate": 2.1191121357720085e-06, "loss": 0.76010001, "num_input_tokens_seen": 88909450, "step": 4130, "time_per_iteration": 2.667111396789551 }, { "auxiliary_loss_clip": 0.01112815, "auxiliary_loss_mlp": 0.01027468, "balance_loss_clip": 1.04988492, "balance_loss_mlp": 1.01953506, "epoch": 0.49672338123008475, "flos": 22930615491840.0, "grad_norm": 1.8579827369312902, "language_loss": 0.74596906, "learning_rate": 2.1183345378140206e-06, "loss": 0.76737189, "num_input_tokens_seen": 88929195, "step": 4131, "time_per_iteration": 2.690711736679077 }, { "auxiliary_loss_clip": 0.01083033, "auxiliary_loss_mlp": 0.01003613, "balance_loss_clip": 1.03095198, "balance_loss_mlp": 1.00203919, "epoch": 0.49684362412072386, "flos": 65976736844160.0, "grad_norm": 0.8588845923900423, "language_loss": 0.61941159, "learning_rate": 2.1175569219047783e-06, "loss": 0.64027798, "num_input_tokens_seen": 88990635, "step": 4132, "time_per_iteration": 3.3332455158233643 }, { "auxiliary_loss_clip": 0.0118021, "auxiliary_loss_mlp": 0.01031445, "balance_loss_clip": 1.05465746, "balance_loss_mlp": 1.02358294, "epoch": 0.49696386701136297, "flos": 19971288754560.0, "grad_norm": 1.868058044072073, "language_loss": 0.73392522, "learning_rate": 2.1167792881622437e-06, "loss": 0.75604177, "num_input_tokens_seen": 89009655, "step": 4133, "time_per_iteration": 2.5975685119628906 }, { "auxiliary_loss_clip": 0.0114209, "auxiliary_loss_mlp": 0.01023622, "balance_loss_clip": 1.05083156, "balance_loss_mlp": 1.01683855, "epoch": 0.497084109902002, "flos": 24750819239040.0, "grad_norm": 1.6235086694847445, "language_loss": 0.80884033, "learning_rate": 2.116001636704384e-06, "loss": 0.8304975, "num_input_tokens_seen": 89030040, "step": 4134, "time_per_iteration": 2.674525260925293 }, { "auxiliary_loss_clip": 0.01123878, "auxiliary_loss_mlp": 0.01026911, "balance_loss_clip": 1.04809237, "balance_loss_mlp": 1.01974905, "epoch": 0.49720435279264114, "flos": 21871825269120.0, "grad_norm": 5.022025483880685, "language_loss": 0.80409735, "learning_rate": 2.1152239676491685e-06, "loss": 0.82560527, "num_input_tokens_seen": 89048145, "step": 4135, "time_per_iteration": 2.7801733016967773 }, { "auxiliary_loss_clip": 0.01150436, "auxiliary_loss_mlp": 0.0102361, "balance_loss_clip": 1.04880333, "balance_loss_mlp": 1.01682425, "epoch": 0.49732459568328025, "flos": 23805794367360.0, "grad_norm": 1.8545598277402717, "language_loss": 0.73362815, "learning_rate": 2.114446281114569e-06, "loss": 0.75536859, "num_input_tokens_seen": 89067165, "step": 4136, "time_per_iteration": 2.647890090942383 }, { "auxiliary_loss_clip": 0.0114118, "auxiliary_loss_mlp": 0.01027811, "balance_loss_clip": 1.05156517, "balance_loss_mlp": 1.02041364, "epoch": 0.4974448385739193, "flos": 20047742853120.0, "grad_norm": 1.8384041136387999, "language_loss": 0.76069343, "learning_rate": 2.1136685772185587e-06, "loss": 0.78238332, "num_input_tokens_seen": 89086190, "step": 4137, "time_per_iteration": 2.6854681968688965 }, { "auxiliary_loss_clip": 0.01147467, "auxiliary_loss_mlp": 0.00711423, "balance_loss_clip": 1.04681981, "balance_loss_mlp": 1.0005101, "epoch": 0.4975650814645584, "flos": 24821347593600.0, "grad_norm": 1.9281430048455832, "language_loss": 0.77667224, "learning_rate": 2.1128908560791163e-06, "loss": 0.79526114, "num_input_tokens_seen": 89106020, "step": 4138, "time_per_iteration": 2.6492807865142822 }, { "auxiliary_loss_clip": 0.0117775, "auxiliary_loss_mlp": 0.01027192, "balance_loss_clip": 1.052876, "balance_loss_mlp": 1.01990199, "epoch": 0.4976853243551975, "flos": 19829477859840.0, "grad_norm": 2.8040324369154406, "language_loss": 0.78186655, "learning_rate": 2.1121131178142203e-06, "loss": 0.80391592, "num_input_tokens_seen": 89125385, "step": 4139, "time_per_iteration": 2.5762314796447754 }, { "auxiliary_loss_clip": 0.01146321, "auxiliary_loss_mlp": 0.01025777, "balance_loss_clip": 1.04874063, "balance_loss_mlp": 1.0187614, "epoch": 0.4978055672458366, "flos": 23142990654720.0, "grad_norm": 1.575087351935936, "language_loss": 0.82436389, "learning_rate": 2.1113353625418544e-06, "loss": 0.84608483, "num_input_tokens_seen": 89143935, "step": 4140, "time_per_iteration": 2.6267659664154053 }, { "auxiliary_loss_clip": 0.01157712, "auxiliary_loss_mlp": 0.01029559, "balance_loss_clip": 1.05412793, "balance_loss_mlp": 1.0223527, "epoch": 0.4979258101364757, "flos": 15559914718080.0, "grad_norm": 1.5640173219415419, "language_loss": 0.79183018, "learning_rate": 2.1105575903800017e-06, "loss": 0.81370282, "num_input_tokens_seen": 89162655, "step": 4141, "time_per_iteration": 2.6276044845581055 }, { "auxiliary_loss_clip": 0.0116923, "auxiliary_loss_mlp": 0.01026075, "balance_loss_clip": 1.05384922, "balance_loss_mlp": 1.01885653, "epoch": 0.4980460530271148, "flos": 26356169784960.0, "grad_norm": 4.225089538810222, "language_loss": 0.85111606, "learning_rate": 2.1097798014466502e-06, "loss": 0.87306911, "num_input_tokens_seen": 89182255, "step": 4142, "time_per_iteration": 2.6301229000091553 }, { "auxiliary_loss_clip": 0.0116858, "auxiliary_loss_mlp": 0.01024078, "balance_loss_clip": 1.05300105, "balance_loss_mlp": 1.01578069, "epoch": 0.49816629591775385, "flos": 17274541415040.0, "grad_norm": 2.487828437666212, "language_loss": 0.59034979, "learning_rate": 2.109001995859791e-06, "loss": 0.61227632, "num_input_tokens_seen": 89201155, "step": 4143, "time_per_iteration": 2.6158463954925537 }, { "auxiliary_loss_clip": 0.01064042, "auxiliary_loss_mlp": 0.01004576, "balance_loss_clip": 1.02674103, "balance_loss_mlp": 1.00324726, "epoch": 0.49828653880839296, "flos": 64930947344640.0, "grad_norm": 0.8090754808912871, "language_loss": 0.60114729, "learning_rate": 2.108224173737415e-06, "loss": 0.62183356, "num_input_tokens_seen": 89264455, "step": 4144, "time_per_iteration": 3.1986119747161865 }, { "auxiliary_loss_clip": 0.01142935, "auxiliary_loss_mlp": 0.01028058, "balance_loss_clip": 1.04716635, "balance_loss_mlp": 1.02100658, "epoch": 0.498406781699032, "flos": 27484806003840.0, "grad_norm": 1.9533845414582354, "language_loss": 0.76282966, "learning_rate": 2.1074463351975183e-06, "loss": 0.78453958, "num_input_tokens_seen": 89283340, "step": 4145, "time_per_iteration": 3.5954456329345703 }, { "auxiliary_loss_clip": 0.01134252, "auxiliary_loss_mlp": 0.01024552, "balance_loss_clip": 1.04828036, "balance_loss_mlp": 1.01740527, "epoch": 0.49852702458967113, "flos": 31499870307840.0, "grad_norm": 1.9955720417279579, "language_loss": 0.71687984, "learning_rate": 2.106668480358098e-06, "loss": 0.73846793, "num_input_tokens_seen": 89303565, "step": 4146, "time_per_iteration": 4.565168142318726 }, { "auxiliary_loss_clip": 0.01139012, "auxiliary_loss_mlp": 0.01024196, "balance_loss_clip": 1.04721808, "balance_loss_mlp": 1.01594067, "epoch": 0.49864726748031024, "flos": 22852868503680.0, "grad_norm": 1.7712453332457176, "language_loss": 0.71148157, "learning_rate": 2.105890609337154e-06, "loss": 0.73311365, "num_input_tokens_seen": 89322080, "step": 4147, "time_per_iteration": 2.719775676727295 }, { "auxiliary_loss_clip": 0.01092413, "auxiliary_loss_mlp": 0.01000946, "balance_loss_clip": 1.02955425, "balance_loss_mlp": 0.99939603, "epoch": 0.4987675103709493, "flos": 70405708544640.0, "grad_norm": 0.6893484510925478, "language_loss": 0.63755727, "learning_rate": 2.1051127222526883e-06, "loss": 0.65849084, "num_input_tokens_seen": 89394195, "step": 4148, "time_per_iteration": 3.282626152038574 }, { "auxiliary_loss_clip": 0.01163555, "auxiliary_loss_mlp": 0.01028998, "balance_loss_clip": 1.05478287, "balance_loss_mlp": 1.02203035, "epoch": 0.4988877532615884, "flos": 28767571482240.0, "grad_norm": 2.7324538415859547, "language_loss": 0.80615473, "learning_rate": 2.1043348192227067e-06, "loss": 0.8280803, "num_input_tokens_seen": 89414565, "step": 4149, "time_per_iteration": 2.6934075355529785 }, { "auxiliary_loss_clip": 0.01123188, "auxiliary_loss_mlp": 0.01029118, "balance_loss_clip": 1.04867649, "balance_loss_mlp": 1.02155113, "epoch": 0.4990079961522275, "flos": 16872700988160.0, "grad_norm": 1.839262834562242, "language_loss": 0.618981, "learning_rate": 2.1035569003652156e-06, "loss": 0.64050406, "num_input_tokens_seen": 89433195, "step": 4150, "time_per_iteration": 2.713923692703247 }, { "auxiliary_loss_clip": 0.01115072, "auxiliary_loss_mlp": 0.0103073, "balance_loss_clip": 1.04763067, "balance_loss_mlp": 1.02203369, "epoch": 0.4991282390428666, "flos": 13291042187520.0, "grad_norm": 1.9420313555757753, "language_loss": 0.82003957, "learning_rate": 2.1027789657982255e-06, "loss": 0.8414976, "num_input_tokens_seen": 89447410, "step": 4151, "time_per_iteration": 2.7077577114105225 }, { "auxiliary_loss_clip": 0.01115249, "auxiliary_loss_mlp": 0.01021893, "balance_loss_clip": 1.04720998, "balance_loss_mlp": 1.01451993, "epoch": 0.4992484819335057, "flos": 21537496454400.0, "grad_norm": 2.0188843354305837, "language_loss": 0.77634466, "learning_rate": 2.1020010156397482e-06, "loss": 0.79771608, "num_input_tokens_seen": 89464630, "step": 4152, "time_per_iteration": 2.6942617893218994 }, { "auxiliary_loss_clip": 0.01162325, "auxiliary_loss_mlp": 0.01025761, "balance_loss_clip": 1.05103636, "balance_loss_mlp": 1.01883531, "epoch": 0.4993687248241448, "flos": 24860095390080.0, "grad_norm": 2.944577163960029, "language_loss": 0.77383244, "learning_rate": 2.101223050007797e-06, "loss": 0.79571331, "num_input_tokens_seen": 89483180, "step": 4153, "time_per_iteration": 2.6971962451934814 }, { "auxiliary_loss_clip": 0.01090968, "auxiliary_loss_mlp": 0.01002284, "balance_loss_clip": 1.02810383, "balance_loss_mlp": 1.00093699, "epoch": 0.49948896771478385, "flos": 62941602453120.0, "grad_norm": 0.8246511693800237, "language_loss": 0.53777373, "learning_rate": 2.1004450690203904e-06, "loss": 0.55870628, "num_input_tokens_seen": 89539260, "step": 4154, "time_per_iteration": 3.2099225521087646 }, { "auxiliary_loss_clip": 0.01091172, "auxiliary_loss_mlp": 0.01001456, "balance_loss_clip": 1.0283972, "balance_loss_mlp": 1.00000191, "epoch": 0.49960921060542296, "flos": 68284213516800.0, "grad_norm": 0.9458993144628424, "language_loss": 0.63292933, "learning_rate": 2.099667072795546e-06, "loss": 0.65385556, "num_input_tokens_seen": 89601380, "step": 4155, "time_per_iteration": 3.2339022159576416 }, { "auxiliary_loss_clip": 0.01160181, "auxiliary_loss_mlp": 0.01025627, "balance_loss_clip": 1.05052185, "balance_loss_mlp": 1.01860857, "epoch": 0.49972945349606207, "flos": 23659350618240.0, "grad_norm": 2.247604588975221, "language_loss": 0.79758441, "learning_rate": 2.0988890614512864e-06, "loss": 0.81944239, "num_input_tokens_seen": 89621270, "step": 4156, "time_per_iteration": 2.682318925857544 }, { "auxiliary_loss_clip": 0.01149336, "auxiliary_loss_mlp": 0.01027292, "balance_loss_clip": 1.05223656, "balance_loss_mlp": 1.02006817, "epoch": 0.4998496963867011, "flos": 19755825022080.0, "grad_norm": 2.675236103518238, "language_loss": 0.84141141, "learning_rate": 2.098111035105635e-06, "loss": 0.86317778, "num_input_tokens_seen": 89639695, "step": 4157, "time_per_iteration": 2.640942335128784 }, { "auxiliary_loss_clip": 0.01111218, "auxiliary_loss_mlp": 0.01026852, "balance_loss_clip": 1.04662609, "balance_loss_mlp": 1.01922309, "epoch": 0.49996993927734024, "flos": 22265728790400.0, "grad_norm": 1.7933515188468654, "language_loss": 0.73357058, "learning_rate": 2.0973329938766176e-06, "loss": 0.75495124, "num_input_tokens_seen": 89657125, "step": 4158, "time_per_iteration": 2.705878257751465 }, { "auxiliary_loss_clip": 0.01166364, "auxiliary_loss_mlp": 0.01027535, "balance_loss_clip": 1.05165005, "balance_loss_mlp": 1.01959598, "epoch": 0.5000901821679793, "flos": 23327212533120.0, "grad_norm": 5.087767691861429, "language_loss": 0.79018748, "learning_rate": 2.0965549378822618e-06, "loss": 0.81212646, "num_input_tokens_seen": 89678415, "step": 4159, "time_per_iteration": 2.6467814445495605 }, { "auxiliary_loss_clip": 0.01065328, "auxiliary_loss_mlp": 0.01031566, "balance_loss_clip": 1.04298162, "balance_loss_mlp": 1.0242641, "epoch": 0.5002104250586185, "flos": 20339014239360.0, "grad_norm": 3.5070045382345034, "language_loss": 0.84445024, "learning_rate": 2.095776867240599e-06, "loss": 0.86541927, "num_input_tokens_seen": 89695405, "step": 4160, "time_per_iteration": 2.9116694927215576 }, { "auxiliary_loss_clip": 0.0112436, "auxiliary_loss_mlp": 0.0102644, "balance_loss_clip": 1.04677618, "balance_loss_mlp": 1.01913857, "epoch": 0.5003306679492575, "flos": 13991372634240.0, "grad_norm": 2.3495963402623947, "language_loss": 0.82804918, "learning_rate": 2.094998782069661e-06, "loss": 0.8495571, "num_input_tokens_seen": 89713110, "step": 4161, "time_per_iteration": 2.850006341934204 }, { "auxiliary_loss_clip": 0.01179107, "auxiliary_loss_mlp": 0.01026521, "balance_loss_clip": 1.05467439, "balance_loss_mlp": 1.01909971, "epoch": 0.5004509108398966, "flos": 27672762896640.0, "grad_norm": 1.7046263367631413, "language_loss": 0.75944054, "learning_rate": 2.0942206824874845e-06, "loss": 0.78149676, "num_input_tokens_seen": 89735885, "step": 4162, "time_per_iteration": 2.645673990249634 }, { "auxiliary_loss_clip": 0.01165468, "auxiliary_loss_mlp": 0.01026226, "balance_loss_clip": 1.05575347, "balance_loss_mlp": 1.01813745, "epoch": 0.5005711537305357, "flos": 14976186796800.0, "grad_norm": 5.368479136684376, "language_loss": 0.79387033, "learning_rate": 2.093442568612105e-06, "loss": 0.81578726, "num_input_tokens_seen": 89753690, "step": 4163, "time_per_iteration": 2.6192827224731445 }, { "auxiliary_loss_clip": 0.01177876, "auxiliary_loss_mlp": 0.01024727, "balance_loss_clip": 1.0515157, "balance_loss_mlp": 1.0167098, "epoch": 0.5006913966211748, "flos": 26503259978880.0, "grad_norm": 2.2103725278140374, "language_loss": 0.85153174, "learning_rate": 2.0926644405615613e-06, "loss": 0.87355781, "num_input_tokens_seen": 89774590, "step": 4164, "time_per_iteration": 2.6113486289978027 }, { "auxiliary_loss_clip": 0.01128414, "auxiliary_loss_mlp": 0.0102445, "balance_loss_clip": 1.04922104, "balance_loss_mlp": 1.01705885, "epoch": 0.5008116395118138, "flos": 20449295971200.0, "grad_norm": 2.168074135689336, "language_loss": 0.81227529, "learning_rate": 2.091886298453897e-06, "loss": 0.83380401, "num_input_tokens_seen": 89792775, "step": 4165, "time_per_iteration": 2.7009778022766113 }, { "auxiliary_loss_clip": 0.01160016, "auxiliary_loss_mlp": 0.01025228, "balance_loss_clip": 1.05034363, "balance_loss_mlp": 1.01773274, "epoch": 0.500931882402453, "flos": 21579871524480.0, "grad_norm": 1.783031805254793, "language_loss": 0.73134708, "learning_rate": 2.091108142407153e-06, "loss": 0.75319946, "num_input_tokens_seen": 89811515, "step": 4166, "time_per_iteration": 2.600776433944702 }, { "auxiliary_loss_clip": 0.01082962, "auxiliary_loss_mlp": 0.01002172, "balance_loss_clip": 1.052549, "balance_loss_mlp": 1.00080669, "epoch": 0.5010521252930921, "flos": 57785011925760.0, "grad_norm": 0.8370651128209949, "language_loss": 0.62372082, "learning_rate": 2.090329972539377e-06, "loss": 0.64457214, "num_input_tokens_seen": 89870080, "step": 4167, "time_per_iteration": 3.325382947921753 }, { "auxiliary_loss_clip": 0.01056133, "auxiliary_loss_mlp": 0.01026987, "balance_loss_clip": 1.04064417, "balance_loss_mlp": 1.01983714, "epoch": 0.5011723681837311, "flos": 18625500864000.0, "grad_norm": 2.207711910425203, "language_loss": 0.68643582, "learning_rate": 2.089551788968616e-06, "loss": 0.70726705, "num_input_tokens_seen": 89888045, "step": 4168, "time_per_iteration": 2.863743543624878 }, { "auxiliary_loss_clip": 0.01087227, "auxiliary_loss_mlp": 0.01004172, "balance_loss_clip": 1.02438211, "balance_loss_mlp": 1.00289631, "epoch": 0.5012926110743702, "flos": 55883146608000.0, "grad_norm": 0.829021318626037, "language_loss": 0.60777032, "learning_rate": 2.08877359181292e-06, "loss": 0.62868428, "num_input_tokens_seen": 89944610, "step": 4169, "time_per_iteration": 3.381927967071533 }, { "auxiliary_loss_clip": 0.01133632, "auxiliary_loss_mlp": 0.01019603, "balance_loss_clip": 1.0450809, "balance_loss_mlp": 1.01243842, "epoch": 0.5014128539650093, "flos": 24238266117120.0, "grad_norm": 3.6975653888171776, "language_loss": 0.85565233, "learning_rate": 2.0879953811903396e-06, "loss": 0.87718475, "num_input_tokens_seen": 89959495, "step": 4170, "time_per_iteration": 2.718900203704834 }, { "auxiliary_loss_clip": 0.01160532, "auxiliary_loss_mlp": 0.01028339, "balance_loss_clip": 1.05177093, "balance_loss_mlp": 1.02035153, "epoch": 0.5015330968556484, "flos": 27527468382720.0, "grad_norm": 1.8283431350380726, "language_loss": 0.787027, "learning_rate": 2.08721715721893e-06, "loss": 0.80891573, "num_input_tokens_seen": 89978820, "step": 4171, "time_per_iteration": 3.7976834774017334 }, { "auxiliary_loss_clip": 0.01161862, "auxiliary_loss_mlp": 0.01025011, "balance_loss_clip": 1.05240524, "balance_loss_mlp": 1.01729822, "epoch": 0.5016533397462875, "flos": 23800802376960.0, "grad_norm": 1.8525716928720857, "language_loss": 0.77362168, "learning_rate": 2.0864389200167477e-06, "loss": 0.79549038, "num_input_tokens_seen": 89997075, "step": 4172, "time_per_iteration": 4.505256414413452 }, { "auxiliary_loss_clip": 0.01164922, "auxiliary_loss_mlp": 0.00711486, "balance_loss_clip": 1.05226851, "balance_loss_mlp": 1.00051522, "epoch": 0.5017735826369266, "flos": 25295009264640.0, "grad_norm": 2.8599160536415447, "language_loss": 0.78993213, "learning_rate": 2.0856606697018504e-06, "loss": 0.80869627, "num_input_tokens_seen": 90015085, "step": 4173, "time_per_iteration": 2.741307497024536 }, { "auxiliary_loss_clip": 0.01142791, "auxiliary_loss_mlp": 0.01025663, "balance_loss_clip": 1.04847789, "balance_loss_mlp": 1.01786685, "epoch": 0.5018938255275657, "flos": 16873203778560.0, "grad_norm": 2.95715265054907, "language_loss": 0.73606265, "learning_rate": 2.084882406392297e-06, "loss": 0.75774723, "num_input_tokens_seen": 90033045, "step": 4174, "time_per_iteration": 2.645782232284546 }, { "auxiliary_loss_clip": 0.01164973, "auxiliary_loss_mlp": 0.01023785, "balance_loss_clip": 1.0534935, "balance_loss_mlp": 1.0168705, "epoch": 0.5020140684182047, "flos": 25515429073920.0, "grad_norm": 2.2956715687901625, "language_loss": 0.70921564, "learning_rate": 2.0841041302061496e-06, "loss": 0.73110324, "num_input_tokens_seen": 90052505, "step": 4175, "time_per_iteration": 2.650205135345459 }, { "auxiliary_loss_clip": 0.01136258, "auxiliary_loss_mlp": 0.01022066, "balance_loss_clip": 1.04623532, "balance_loss_mlp": 1.01477611, "epoch": 0.5021343113088439, "flos": 23659278791040.0, "grad_norm": 1.8582507468376264, "language_loss": 0.75704682, "learning_rate": 2.083325841261473e-06, "loss": 0.77863002, "num_input_tokens_seen": 90071565, "step": 4176, "time_per_iteration": 2.6608426570892334 }, { "auxiliary_loss_clip": 0.01140424, "auxiliary_loss_mlp": 0.01021097, "balance_loss_clip": 1.04714227, "balance_loss_mlp": 1.01329446, "epoch": 0.502254554199483, "flos": 24534673148160.0, "grad_norm": 1.9753930850255064, "language_loss": 0.66105235, "learning_rate": 2.0825475396763322e-06, "loss": 0.68266761, "num_input_tokens_seen": 90092215, "step": 4177, "time_per_iteration": 2.7188074588775635 }, { "auxiliary_loss_clip": 0.01060664, "auxiliary_loss_mlp": 0.01026004, "balance_loss_clip": 1.04153371, "balance_loss_mlp": 1.0188632, "epoch": 0.502374797090122, "flos": 34240285607040.0, "grad_norm": 1.6447622222895508, "language_loss": 0.65869117, "learning_rate": 2.081769225568796e-06, "loss": 0.6795578, "num_input_tokens_seen": 90114665, "step": 4178, "time_per_iteration": 3.0468356609344482 }, { "auxiliary_loss_clip": 0.01164029, "auxiliary_loss_mlp": 0.01023687, "balance_loss_clip": 1.05004942, "balance_loss_mlp": 1.015885, "epoch": 0.5024950399807612, "flos": 26031106679040.0, "grad_norm": 1.648804092779685, "language_loss": 0.76357698, "learning_rate": 2.0809908990569327e-06, "loss": 0.78545415, "num_input_tokens_seen": 90136445, "step": 4179, "time_per_iteration": 2.832188367843628 }, { "auxiliary_loss_clip": 0.01142368, "auxiliary_loss_mlp": 0.01026676, "balance_loss_clip": 1.04811895, "balance_loss_mlp": 1.01947021, "epoch": 0.5026152828714002, "flos": 21252438120960.0, "grad_norm": 1.8348985986431603, "language_loss": 0.79002619, "learning_rate": 2.0802125602588146e-06, "loss": 0.81171668, "num_input_tokens_seen": 90155710, "step": 4180, "time_per_iteration": 2.6951589584350586 }, { "auxiliary_loss_clip": 0.01180703, "auxiliary_loss_mlp": 0.01027078, "balance_loss_clip": 1.05468798, "balance_loss_mlp": 1.01954353, "epoch": 0.5027355257620393, "flos": 30956111245440.0, "grad_norm": 1.9489561177687669, "language_loss": 0.66307139, "learning_rate": 2.0794342092925146e-06, "loss": 0.68514919, "num_input_tokens_seen": 90176845, "step": 4181, "time_per_iteration": 2.653430461883545 }, { "auxiliary_loss_clip": 0.01167612, "auxiliary_loss_mlp": 0.01028318, "balance_loss_clip": 1.05530977, "balance_loss_mlp": 1.02129984, "epoch": 0.5028557686526784, "flos": 24791147233920.0, "grad_norm": 2.514700380975888, "language_loss": 0.67295837, "learning_rate": 2.078655846276108e-06, "loss": 0.69491768, "num_input_tokens_seen": 90197175, "step": 4182, "time_per_iteration": 2.7117998600006104 }, { "auxiliary_loss_clip": 0.01142253, "auxiliary_loss_mlp": 0.01025948, "balance_loss_clip": 1.04972291, "balance_loss_mlp": 1.01822376, "epoch": 0.5029760115433175, "flos": 22966992990720.0, "grad_norm": 2.3332899274543624, "language_loss": 0.69143057, "learning_rate": 2.0778774713276727e-06, "loss": 0.71311253, "num_input_tokens_seen": 90216650, "step": 4183, "time_per_iteration": 2.6749472618103027 }, { "auxiliary_loss_clip": 0.01157972, "auxiliary_loss_mlp": 0.01029636, "balance_loss_clip": 1.04832506, "balance_loss_mlp": 1.02176857, "epoch": 0.5030962544339566, "flos": 15305164485120.0, "grad_norm": 2.0984658076932856, "language_loss": 0.67748171, "learning_rate": 2.077099084565287e-06, "loss": 0.69935787, "num_input_tokens_seen": 90234055, "step": 4184, "time_per_iteration": 2.6420061588287354 }, { "auxiliary_loss_clip": 0.01142637, "auxiliary_loss_mlp": 0.01023123, "balance_loss_clip": 1.04740572, "balance_loss_mlp": 1.01578021, "epoch": 0.5032164973245957, "flos": 24494847943680.0, "grad_norm": 3.374205683938438, "language_loss": 0.6523084, "learning_rate": 2.0763206861070313e-06, "loss": 0.67396599, "num_input_tokens_seen": 90253115, "step": 4185, "time_per_iteration": 2.7196528911590576 }, { "auxiliary_loss_clip": 0.01177839, "auxiliary_loss_mlp": 0.01026417, "balance_loss_clip": 1.0524478, "balance_loss_mlp": 1.01874042, "epoch": 0.5033367402152348, "flos": 16213452721920.0, "grad_norm": 2.158094239234472, "language_loss": 0.75287437, "learning_rate": 2.0755422760709876e-06, "loss": 0.77491689, "num_input_tokens_seen": 90270515, "step": 4186, "time_per_iteration": 2.588416814804077 }, { "auxiliary_loss_clip": 0.01107407, "auxiliary_loss_mlp": 0.01019555, "balance_loss_clip": 1.04441714, "balance_loss_mlp": 1.01228905, "epoch": 0.5034569831058738, "flos": 21391375927680.0, "grad_norm": 2.769135980046194, "language_loss": 0.76804817, "learning_rate": 2.0747638545752417e-06, "loss": 0.78931785, "num_input_tokens_seen": 90289075, "step": 4187, "time_per_iteration": 2.699026107788086 }, { "auxiliary_loss_clip": 0.01144486, "auxiliary_loss_mlp": 0.01025533, "balance_loss_clip": 1.05169713, "balance_loss_mlp": 1.01804042, "epoch": 0.503577225996513, "flos": 20558751690240.0, "grad_norm": 2.1366583723575854, "language_loss": 0.83742422, "learning_rate": 2.073985421737878e-06, "loss": 0.85912442, "num_input_tokens_seen": 90306385, "step": 4188, "time_per_iteration": 2.6964609622955322 }, { "auxiliary_loss_clip": 0.01165446, "auxiliary_loss_mlp": 0.01024769, "balance_loss_clip": 1.05115724, "balance_loss_mlp": 1.017169, "epoch": 0.5036974688871521, "flos": 27229157930880.0, "grad_norm": 3.3446942611226276, "language_loss": 0.74484146, "learning_rate": 2.0732069776769844e-06, "loss": 0.7667436, "num_input_tokens_seen": 90323795, "step": 4189, "time_per_iteration": 2.700911283493042 }, { "auxiliary_loss_clip": 0.01178139, "auxiliary_loss_mlp": 0.01025181, "balance_loss_clip": 1.0530957, "balance_loss_mlp": 1.01739645, "epoch": 0.5038177117777911, "flos": 20412164286720.0, "grad_norm": 1.9691480485124497, "language_loss": 0.73110771, "learning_rate": 2.072428522510651e-06, "loss": 0.75314093, "num_input_tokens_seen": 90340360, "step": 4190, "time_per_iteration": 2.592221260070801 }, { "auxiliary_loss_clip": 0.0112749, "auxiliary_loss_mlp": 0.01025579, "balance_loss_clip": 1.04978585, "balance_loss_mlp": 1.0178901, "epoch": 0.5039379546684303, "flos": 21907987286400.0, "grad_norm": 2.513889291681094, "language_loss": 0.76263267, "learning_rate": 2.071650056356968e-06, "loss": 0.78416336, "num_input_tokens_seen": 90357900, "step": 4191, "time_per_iteration": 2.7301831245422363 }, { "auxiliary_loss_clip": 0.01177955, "auxiliary_loss_mlp": 0.0102527, "balance_loss_clip": 1.05232394, "balance_loss_mlp": 1.01771855, "epoch": 0.5040581975590693, "flos": 20010718909440.0, "grad_norm": 2.227075275802646, "language_loss": 0.79975551, "learning_rate": 2.070871579334028e-06, "loss": 0.82178783, "num_input_tokens_seen": 90377010, "step": 4192, "time_per_iteration": 2.5490810871124268 }, { "auxiliary_loss_clip": 0.0117499, "auxiliary_loss_mlp": 0.01024305, "balance_loss_clip": 1.05051422, "balance_loss_mlp": 1.01645207, "epoch": 0.5041784404497084, "flos": 20959837931520.0, "grad_norm": 1.6452677029110239, "language_loss": 0.72092855, "learning_rate": 2.0700930915599264e-06, "loss": 0.74292147, "num_input_tokens_seen": 90396740, "step": 4193, "time_per_iteration": 2.5831120014190674 }, { "auxiliary_loss_clip": 0.01176549, "auxiliary_loss_mlp": 0.01027019, "balance_loss_clip": 1.0524646, "balance_loss_mlp": 1.01988983, "epoch": 0.5042986833403476, "flos": 12495082757760.0, "grad_norm": 1.9676836763138084, "language_loss": 0.78767931, "learning_rate": 2.0693145931527583e-06, "loss": 0.80971503, "num_input_tokens_seen": 90413220, "step": 4194, "time_per_iteration": 2.5334296226501465 }, { "auxiliary_loss_clip": 0.01142848, "auxiliary_loss_mlp": 0.0102213, "balance_loss_clip": 1.04955745, "balance_loss_mlp": 1.01488829, "epoch": 0.5044189262309866, "flos": 29202305788800.0, "grad_norm": 1.5219387421304988, "language_loss": 0.77887082, "learning_rate": 2.068536084230622e-06, "loss": 0.80052066, "num_input_tokens_seen": 90435085, "step": 4195, "time_per_iteration": 2.7206287384033203 }, { "auxiliary_loss_clip": 0.01162607, "auxiliary_loss_mlp": 0.0102945, "balance_loss_clip": 1.05132926, "balance_loss_mlp": 1.02148724, "epoch": 0.5045391691216257, "flos": 23873198238720.0, "grad_norm": 10.245919522250299, "language_loss": 0.88800627, "learning_rate": 2.067757564911616e-06, "loss": 0.90992689, "num_input_tokens_seen": 90453660, "step": 4196, "time_per_iteration": 2.6065993309020996 }, { "auxiliary_loss_clip": 0.01153477, "auxiliary_loss_mlp": 0.00711509, "balance_loss_clip": 1.05022359, "balance_loss_mlp": 1.00053525, "epoch": 0.5046594120122648, "flos": 24644990793600.0, "grad_norm": 1.9502759716316653, "language_loss": 0.92465037, "learning_rate": 2.0669790353138407e-06, "loss": 0.94330025, "num_input_tokens_seen": 90472625, "step": 4197, "time_per_iteration": 3.6093945503234863 }, { "auxiliary_loss_clip": 0.01127702, "auxiliary_loss_mlp": 0.00711523, "balance_loss_clip": 1.04968739, "balance_loss_mlp": 1.00052714, "epoch": 0.5047796549029039, "flos": 23362835846400.0, "grad_norm": 3.9771415102149437, "language_loss": 0.73343229, "learning_rate": 2.0662004955553995e-06, "loss": 0.75182462, "num_input_tokens_seen": 90492325, "step": 4198, "time_per_iteration": 3.637650489807129 }, { "auxiliary_loss_clip": 0.01140601, "auxiliary_loss_mlp": 0.01027595, "balance_loss_clip": 1.04801857, "balance_loss_mlp": 1.02080297, "epoch": 0.5048998977935429, "flos": 17304095329920.0, "grad_norm": 1.9666544595358353, "language_loss": 0.76931125, "learning_rate": 2.065421945754395e-06, "loss": 0.79099321, "num_input_tokens_seen": 90510055, "step": 4199, "time_per_iteration": 2.624542713165283 }, { "auxiliary_loss_clip": 0.01120402, "auxiliary_loss_mlp": 0.01030532, "balance_loss_clip": 1.05186772, "balance_loss_mlp": 1.02321529, "epoch": 0.505020140684182, "flos": 34856979235200.0, "grad_norm": 1.6153737411335736, "language_loss": 0.78239924, "learning_rate": 2.0646433860289344e-06, "loss": 0.80390853, "num_input_tokens_seen": 90528980, "step": 4200, "time_per_iteration": 2.848111391067505 }, { "auxiliary_loss_clip": 0.01166879, "auxiliary_loss_mlp": 0.00712115, "balance_loss_clip": 1.05086517, "balance_loss_mlp": 1.00058222, "epoch": 0.5051403835748212, "flos": 24863974058880.0, "grad_norm": 2.0502905065392643, "language_loss": 0.82879961, "learning_rate": 2.0638648164971233e-06, "loss": 0.84758949, "num_input_tokens_seen": 90547445, "step": 4201, "time_per_iteration": 2.720815420150757 }, { "auxiliary_loss_clip": 0.01144994, "auxiliary_loss_mlp": 0.01023976, "balance_loss_clip": 1.0498364, "balance_loss_mlp": 1.0171032, "epoch": 0.5052606264654602, "flos": 20959694277120.0, "grad_norm": 2.639469730081329, "language_loss": 0.8847605, "learning_rate": 2.06308623727707e-06, "loss": 0.90645009, "num_input_tokens_seen": 90567545, "step": 4202, "time_per_iteration": 2.657353639602661 }, { "auxiliary_loss_clip": 0.01159129, "auxiliary_loss_mlp": 0.01026288, "balance_loss_clip": 1.04893076, "balance_loss_mlp": 1.01846194, "epoch": 0.5053808693560993, "flos": 19642382893440.0, "grad_norm": 2.6209724430082004, "language_loss": 0.76564533, "learning_rate": 2.0623076484868846e-06, "loss": 0.78749955, "num_input_tokens_seen": 90585000, "step": 4203, "time_per_iteration": 2.5985498428344727 }, { "auxiliary_loss_clip": 0.01064698, "auxiliary_loss_mlp": 0.01011619, "balance_loss_clip": 1.02859449, "balance_loss_mlp": 1.01015842, "epoch": 0.5055011122467384, "flos": 67504915019520.0, "grad_norm": 0.8319004562368421, "language_loss": 0.60648549, "learning_rate": 2.061529050244679e-06, "loss": 0.62724864, "num_input_tokens_seen": 90644745, "step": 4204, "time_per_iteration": 3.1892945766448975 }, { "auxiliary_loss_clip": 0.01137644, "auxiliary_loss_mlp": 0.01021026, "balance_loss_clip": 1.0492413, "balance_loss_mlp": 1.01354861, "epoch": 0.5056213551373775, "flos": 16872952383360.0, "grad_norm": 2.0677335142759294, "language_loss": 0.74125296, "learning_rate": 2.060750442668565e-06, "loss": 0.76283967, "num_input_tokens_seen": 90662500, "step": 4205, "time_per_iteration": 2.653834581375122 }, { "auxiliary_loss_clip": 0.01164194, "auxiliary_loss_mlp": 0.01031835, "balance_loss_clip": 1.05296671, "balance_loss_mlp": 1.02409542, "epoch": 0.5057415980280165, "flos": 15334179696000.0, "grad_norm": 2.328282923928377, "language_loss": 0.64314753, "learning_rate": 2.059971825876657e-06, "loss": 0.66510785, "num_input_tokens_seen": 90677010, "step": 4206, "time_per_iteration": 2.5574498176574707 }, { "auxiliary_loss_clip": 0.01165837, "auxiliary_loss_mlp": 0.01025872, "balance_loss_clip": 1.0534085, "balance_loss_mlp": 1.01836455, "epoch": 0.5058618409186557, "flos": 19025976574080.0, "grad_norm": 1.8069458297692984, "language_loss": 0.76647651, "learning_rate": 2.0591931999870713e-06, "loss": 0.78839362, "num_input_tokens_seen": 90695935, "step": 4207, "time_per_iteration": 2.645958185195923 }, { "auxiliary_loss_clip": 0.01073476, "auxiliary_loss_mlp": 0.01002922, "balance_loss_clip": 1.02665448, "balance_loss_mlp": 1.00153279, "epoch": 0.5059820838092948, "flos": 63453114080640.0, "grad_norm": 0.8296225734945248, "language_loss": 0.57566607, "learning_rate": 2.0584145651179234e-06, "loss": 0.59642994, "num_input_tokens_seen": 90751645, "step": 4208, "time_per_iteration": 3.235271453857422 }, { "auxiliary_loss_clip": 0.01146857, "auxiliary_loss_mlp": 0.00711597, "balance_loss_clip": 1.05235207, "balance_loss_mlp": 1.00057185, "epoch": 0.5061023266999338, "flos": 15441803821440.0, "grad_norm": 2.8515474116722572, "language_loss": 0.80116004, "learning_rate": 2.0576359213873327e-06, "loss": 0.81974459, "num_input_tokens_seen": 90766795, "step": 4209, "time_per_iteration": 2.6526267528533936 }, { "auxiliary_loss_clip": 0.0115269, "auxiliary_loss_mlp": 0.01027016, "balance_loss_clip": 1.04831851, "balance_loss_mlp": 1.01914775, "epoch": 0.506222569590573, "flos": 22451063990400.0, "grad_norm": 4.127718472882934, "language_loss": 0.7043587, "learning_rate": 2.056857268913419e-06, "loss": 0.72615576, "num_input_tokens_seen": 90786845, "step": 4210, "time_per_iteration": 2.639875888824463 }, { "auxiliary_loss_clip": 0.01163937, "auxiliary_loss_mlp": 0.01031419, "balance_loss_clip": 1.05353928, "balance_loss_mlp": 1.02405787, "epoch": 0.506342812481212, "flos": 17558665994880.0, "grad_norm": 2.4610969769081743, "language_loss": 0.84967208, "learning_rate": 2.056078607814303e-06, "loss": 0.8716256, "num_input_tokens_seen": 90802630, "step": 4211, "time_per_iteration": 2.617549419403076 }, { "auxiliary_loss_clip": 0.0115938, "auxiliary_loss_mlp": 0.01027799, "balance_loss_clip": 1.05061924, "balance_loss_mlp": 1.01978779, "epoch": 0.5064630553718511, "flos": 23402050519680.0, "grad_norm": 1.755294589713092, "language_loss": 0.78211123, "learning_rate": 2.055299938208106e-06, "loss": 0.80398297, "num_input_tokens_seen": 90823620, "step": 4212, "time_per_iteration": 2.597308397293091 }, { "auxiliary_loss_clip": 0.01170221, "auxiliary_loss_mlp": 0.01032817, "balance_loss_clip": 1.05538177, "balance_loss_mlp": 1.02509487, "epoch": 0.5065832982624903, "flos": 23987035416960.0, "grad_norm": 1.7569247837620214, "language_loss": 0.85928208, "learning_rate": 2.0545212602129526e-06, "loss": 0.88131249, "num_input_tokens_seen": 90843475, "step": 4213, "time_per_iteration": 2.6666171550750732 }, { "auxiliary_loss_clip": 0.01143111, "auxiliary_loss_mlp": 0.01029705, "balance_loss_clip": 1.04927111, "balance_loss_mlp": 1.02171803, "epoch": 0.5067035411531293, "flos": 21503058289920.0, "grad_norm": 2.31350110989719, "language_loss": 0.66309345, "learning_rate": 2.0537425739469673e-06, "loss": 0.68482161, "num_input_tokens_seen": 90862410, "step": 4214, "time_per_iteration": 2.6190342903137207 }, { "auxiliary_loss_clip": 0.0107798, "auxiliary_loss_mlp": 0.01005636, "balance_loss_clip": 1.02590871, "balance_loss_mlp": 1.00432456, "epoch": 0.5068237840437684, "flos": 65934397687680.0, "grad_norm": 0.8698490340194842, "language_loss": 0.59470594, "learning_rate": 2.052963879528276e-06, "loss": 0.61554217, "num_input_tokens_seen": 90922280, "step": 4215, "time_per_iteration": 3.142688035964966 }, { "auxiliary_loss_clip": 0.01163841, "auxiliary_loss_mlp": 0.01028609, "balance_loss_clip": 1.05323052, "balance_loss_mlp": 1.0214653, "epoch": 0.5069440269344075, "flos": 27264206626560.0, "grad_norm": 2.7384382047502585, "language_loss": 0.76866794, "learning_rate": 2.052185177075007e-06, "loss": 0.79059243, "num_input_tokens_seen": 90941850, "step": 4216, "time_per_iteration": 2.6762049198150635 }, { "auxiliary_loss_clip": 0.011639, "auxiliary_loss_mlp": 0.01028961, "balance_loss_clip": 1.0511266, "balance_loss_mlp": 1.02101564, "epoch": 0.5070642698250466, "flos": 23366319465600.0, "grad_norm": 1.9496073962537193, "language_loss": 0.82924622, "learning_rate": 2.051406466705288e-06, "loss": 0.85117483, "num_input_tokens_seen": 90961390, "step": 4217, "time_per_iteration": 2.666579246520996 }, { "auxiliary_loss_clip": 0.01178829, "auxiliary_loss_mlp": 0.01021548, "balance_loss_clip": 1.05257213, "balance_loss_mlp": 1.01436257, "epoch": 0.5071845127156857, "flos": 20340127560960.0, "grad_norm": 2.0037445556954654, "language_loss": 0.81113148, "learning_rate": 2.0506277485372486e-06, "loss": 0.83313525, "num_input_tokens_seen": 90980215, "step": 4218, "time_per_iteration": 2.607717514038086 }, { "auxiliary_loss_clip": 0.01159215, "auxiliary_loss_mlp": 0.01026866, "balance_loss_clip": 1.05182314, "balance_loss_mlp": 1.0189091, "epoch": 0.5073047556063248, "flos": 12092955022080.0, "grad_norm": 1.8891366784233412, "language_loss": 0.6677736, "learning_rate": 2.04984902268902e-06, "loss": 0.68963444, "num_input_tokens_seen": 90997415, "step": 4219, "time_per_iteration": 2.675595760345459 }, { "auxiliary_loss_clip": 0.01169493, "auxiliary_loss_mlp": 0.01027856, "balance_loss_clip": 1.05154252, "balance_loss_mlp": 1.01971412, "epoch": 0.5074249984969639, "flos": 19682854542720.0, "grad_norm": 2.3221072818475488, "language_loss": 0.74992925, "learning_rate": 2.0490702892787345e-06, "loss": 0.77190268, "num_input_tokens_seen": 91016475, "step": 4220, "time_per_iteration": 2.7373037338256836 }, { "auxiliary_loss_clip": 0.01155112, "auxiliary_loss_mlp": 0.01024573, "balance_loss_clip": 1.049716, "balance_loss_mlp": 1.017349, "epoch": 0.5075452413876029, "flos": 28765703975040.0, "grad_norm": 1.833299183464537, "language_loss": 0.62355614, "learning_rate": 2.0482915484245246e-06, "loss": 0.64535302, "num_input_tokens_seen": 91038095, "step": 4221, "time_per_iteration": 2.680591583251953 }, { "auxiliary_loss_clip": 0.01110289, "auxiliary_loss_mlp": 0.01024224, "balance_loss_clip": 1.04630184, "balance_loss_mlp": 1.01632977, "epoch": 0.5076654842782421, "flos": 20339445202560.0, "grad_norm": 2.3227318013783704, "language_loss": 0.84298325, "learning_rate": 2.047512800244526e-06, "loss": 0.86432844, "num_input_tokens_seen": 91053360, "step": 4222, "time_per_iteration": 2.7338664531707764 }, { "auxiliary_loss_clip": 0.01165512, "auxiliary_loss_mlp": 0.01032272, "balance_loss_clip": 1.05420804, "balance_loss_mlp": 1.02502656, "epoch": 0.5077857271688812, "flos": 26359653404160.0, "grad_norm": 11.178354312341446, "language_loss": 0.78746176, "learning_rate": 2.046734044856873e-06, "loss": 0.8094396, "num_input_tokens_seen": 91072770, "step": 4223, "time_per_iteration": 4.510321617126465 }, { "auxiliary_loss_clip": 0.01163079, "auxiliary_loss_mlp": 0.01025514, "balance_loss_clip": 1.05397177, "balance_loss_mlp": 1.01877546, "epoch": 0.5079059700595202, "flos": 21798962530560.0, "grad_norm": 2.226223541720415, "language_loss": 0.81257808, "learning_rate": 2.045955282379702e-06, "loss": 0.83446407, "num_input_tokens_seen": 91091430, "step": 4224, "time_per_iteration": 3.551835536956787 }, { "auxiliary_loss_clip": 0.01159085, "auxiliary_loss_mlp": 0.01022531, "balance_loss_clip": 1.04842937, "balance_loss_mlp": 1.01484489, "epoch": 0.5080262129501594, "flos": 13187943175680.0, "grad_norm": 3.265073444348304, "language_loss": 0.75633609, "learning_rate": 2.045176512931152e-06, "loss": 0.77815223, "num_input_tokens_seen": 91106060, "step": 4225, "time_per_iteration": 2.6197545528411865 }, { "auxiliary_loss_clip": 0.01133694, "auxiliary_loss_mlp": 0.01022816, "balance_loss_clip": 1.04740047, "balance_loss_mlp": 1.01618147, "epoch": 0.5081464558407984, "flos": 25301473712640.0, "grad_norm": 1.9124167269484658, "language_loss": 0.76085281, "learning_rate": 2.0443977366293604e-06, "loss": 0.78241789, "num_input_tokens_seen": 91124100, "step": 4226, "time_per_iteration": 2.704155206680298 }, { "auxiliary_loss_clip": 0.01101607, "auxiliary_loss_mlp": 0.01026666, "balance_loss_clip": 1.04594374, "balance_loss_mlp": 1.01848793, "epoch": 0.5082666987314375, "flos": 30951226995840.0, "grad_norm": 1.6624803092383609, "language_loss": 0.77146769, "learning_rate": 2.043618953592468e-06, "loss": 0.79275048, "num_input_tokens_seen": 91146555, "step": 4227, "time_per_iteration": 2.850154161453247 }, { "auxiliary_loss_clip": 0.01149441, "auxiliary_loss_mlp": 0.01033708, "balance_loss_clip": 1.05260706, "balance_loss_mlp": 1.02541733, "epoch": 0.5083869416220766, "flos": 19682495406720.0, "grad_norm": 1.742720460374714, "language_loss": 0.81028545, "learning_rate": 2.0428401639386144e-06, "loss": 0.83211702, "num_input_tokens_seen": 91167120, "step": 4228, "time_per_iteration": 2.688978433609009 }, { "auxiliary_loss_clip": 0.01057838, "auxiliary_loss_mlp": 0.01002543, "balance_loss_clip": 1.02483416, "balance_loss_mlp": 1.00138116, "epoch": 0.5085071845127157, "flos": 71817535589760.0, "grad_norm": 0.8190966702473712, "language_loss": 0.58113778, "learning_rate": 2.042061367785943e-06, "loss": 0.60174167, "num_input_tokens_seen": 91220260, "step": 4229, "time_per_iteration": 3.1997649669647217 }, { "auxiliary_loss_clip": 0.01131231, "auxiliary_loss_mlp": 0.01021448, "balance_loss_clip": 1.04583788, "balance_loss_mlp": 1.014009, "epoch": 0.5086274274033548, "flos": 35951608252800.0, "grad_norm": 2.089613987554582, "language_loss": 0.75147462, "learning_rate": 2.041282565252594e-06, "loss": 0.77300143, "num_input_tokens_seen": 91240425, "step": 4230, "time_per_iteration": 2.8582987785339355 }, { "auxiliary_loss_clip": 0.01130682, "auxiliary_loss_mlp": 0.01024119, "balance_loss_clip": 1.0484271, "balance_loss_mlp": 1.01696014, "epoch": 0.5087476702939938, "flos": 23513732881920.0, "grad_norm": 1.82579414494249, "language_loss": 0.77057171, "learning_rate": 2.040503756456714e-06, "loss": 0.79211974, "num_input_tokens_seen": 91259635, "step": 4231, "time_per_iteration": 2.743659019470215 }, { "auxiliary_loss_clip": 0.01157345, "auxiliary_loss_mlp": 0.01027879, "balance_loss_clip": 1.0494411, "balance_loss_mlp": 1.02023745, "epoch": 0.508867913184633, "flos": 15122091841920.0, "grad_norm": 1.9824819841623387, "language_loss": 0.78883064, "learning_rate": 2.0397249415164456e-06, "loss": 0.81068289, "num_input_tokens_seen": 91276990, "step": 4232, "time_per_iteration": 2.6506457328796387 }, { "auxiliary_loss_clip": 0.01138586, "auxiliary_loss_mlp": 0.01028083, "balance_loss_clip": 1.04635084, "balance_loss_mlp": 1.02053094, "epoch": 0.508988156075272, "flos": 25885309374720.0, "grad_norm": 3.725659963997205, "language_loss": 0.80301607, "learning_rate": 2.0389461205499354e-06, "loss": 0.82468277, "num_input_tokens_seen": 91296125, "step": 4233, "time_per_iteration": 2.7206268310546875 }, { "auxiliary_loss_clip": 0.01132885, "auxiliary_loss_mlp": 0.01024503, "balance_loss_clip": 1.04775023, "balance_loss_mlp": 1.01739216, "epoch": 0.5091083989659111, "flos": 13844857057920.0, "grad_norm": 1.9049593223502812, "language_loss": 0.73593223, "learning_rate": 2.03816729367533e-06, "loss": 0.75750607, "num_input_tokens_seen": 91314280, "step": 4234, "time_per_iteration": 2.7122552394866943 }, { "auxiliary_loss_clip": 0.01153974, "auxiliary_loss_mlp": 0.01025603, "balance_loss_clip": 1.05421925, "balance_loss_mlp": 1.01758575, "epoch": 0.5092286418565503, "flos": 21104881050240.0, "grad_norm": 2.4518886798382273, "language_loss": 0.71696305, "learning_rate": 2.0373884610107765e-06, "loss": 0.7387588, "num_input_tokens_seen": 91334595, "step": 4235, "time_per_iteration": 2.683997631072998 }, { "auxiliary_loss_clip": 0.01165441, "auxiliary_loss_mlp": 0.01025674, "balance_loss_clip": 1.05000591, "balance_loss_mlp": 1.01806903, "epoch": 0.5093488847471893, "flos": 18621298972800.0, "grad_norm": 2.937540155807179, "language_loss": 0.70354778, "learning_rate": 2.0366096226744225e-06, "loss": 0.72545892, "num_input_tokens_seen": 91349790, "step": 4236, "time_per_iteration": 2.6212809085845947 }, { "auxiliary_loss_clip": 0.01154622, "auxiliary_loss_mlp": 0.01034067, "balance_loss_clip": 1.04970789, "balance_loss_mlp": 1.02662873, "epoch": 0.5094691276378284, "flos": 23803783205760.0, "grad_norm": 2.0966653477251045, "language_loss": 0.77061141, "learning_rate": 2.035830778784418e-06, "loss": 0.79249835, "num_input_tokens_seen": 91370465, "step": 4237, "time_per_iteration": 2.662522792816162 }, { "auxiliary_loss_clip": 0.01151352, "auxiliary_loss_mlp": 0.01025037, "balance_loss_clip": 1.05407345, "balance_loss_mlp": 1.01731825, "epoch": 0.5095893705284675, "flos": 17420410546560.0, "grad_norm": 3.6080686051733912, "language_loss": 0.80064338, "learning_rate": 2.0350519294589134e-06, "loss": 0.82240731, "num_input_tokens_seen": 91388505, "step": 4238, "time_per_iteration": 2.6776342391967773 }, { "auxiliary_loss_clip": 0.01114558, "auxiliary_loss_mlp": 0.01025054, "balance_loss_clip": 1.04491043, "balance_loss_mlp": 1.01693559, "epoch": 0.5097096134191066, "flos": 25849362839040.0, "grad_norm": 1.645149476867178, "language_loss": 0.83038247, "learning_rate": 2.0342730748160588e-06, "loss": 0.85177863, "num_input_tokens_seen": 91408970, "step": 4239, "time_per_iteration": 2.77668833732605 }, { "auxiliary_loss_clip": 0.01144883, "auxiliary_loss_mlp": 0.01026563, "balance_loss_clip": 1.04797113, "balance_loss_mlp": 1.01907659, "epoch": 0.5098298563097456, "flos": 27745122844800.0, "grad_norm": 2.88800354600041, "language_loss": 0.70714021, "learning_rate": 2.033494214974006e-06, "loss": 0.72885466, "num_input_tokens_seen": 91430115, "step": 4240, "time_per_iteration": 2.6967577934265137 }, { "auxiliary_loss_clip": 0.01138958, "auxiliary_loss_mlp": 0.01028172, "balance_loss_clip": 1.05043805, "balance_loss_mlp": 1.02079248, "epoch": 0.5099500992003848, "flos": 21358913011200.0, "grad_norm": 1.7220807756900192, "language_loss": 0.83919358, "learning_rate": 2.0327153500509067e-06, "loss": 0.86086488, "num_input_tokens_seen": 91449140, "step": 4241, "time_per_iteration": 2.6761245727539062 }, { "auxiliary_loss_clip": 0.01149621, "auxiliary_loss_mlp": 0.0102654, "balance_loss_clip": 1.05029535, "balance_loss_mlp": 1.01961052, "epoch": 0.5100703420910239, "flos": 19865999013120.0, "grad_norm": 2.3054876293697464, "language_loss": 0.84697783, "learning_rate": 2.031936480164916e-06, "loss": 0.86873949, "num_input_tokens_seen": 91466880, "step": 4242, "time_per_iteration": 2.65218186378479 }, { "auxiliary_loss_clip": 0.01144432, "auxiliary_loss_mlp": 0.01027297, "balance_loss_clip": 1.05156803, "balance_loss_mlp": 1.01951241, "epoch": 0.5101905849816629, "flos": 24648797635200.0, "grad_norm": 3.913322899860832, "language_loss": 0.79871279, "learning_rate": 2.0311576054341857e-06, "loss": 0.82043004, "num_input_tokens_seen": 91487495, "step": 4243, "time_per_iteration": 2.7874755859375 }, { "auxiliary_loss_clip": 0.01180596, "auxiliary_loss_mlp": 0.01024128, "balance_loss_clip": 1.05496716, "balance_loss_mlp": 1.01669574, "epoch": 0.5103108278723021, "flos": 22930076787840.0, "grad_norm": 1.8330768264061428, "language_loss": 0.62417346, "learning_rate": 2.0303787259768715e-06, "loss": 0.64622068, "num_input_tokens_seen": 91508395, "step": 4244, "time_per_iteration": 2.587538480758667 }, { "auxiliary_loss_clip": 0.01148576, "auxiliary_loss_mlp": 0.01026171, "balance_loss_clip": 1.05166197, "balance_loss_mlp": 1.01839232, "epoch": 0.5104310707629411, "flos": 21506613736320.0, "grad_norm": 2.480672321099237, "language_loss": 0.68460184, "learning_rate": 2.0295998419111294e-06, "loss": 0.70634931, "num_input_tokens_seen": 91525685, "step": 4245, "time_per_iteration": 2.6512746810913086 }, { "auxiliary_loss_clip": 0.01099896, "auxiliary_loss_mlp": 0.01024889, "balance_loss_clip": 1.04270935, "balance_loss_mlp": 1.01780176, "epoch": 0.5105513136535802, "flos": 14903180403840.0, "grad_norm": 9.011226501971553, "language_loss": 0.73426789, "learning_rate": 2.028820953355115e-06, "loss": 0.75551569, "num_input_tokens_seen": 91543785, "step": 4246, "time_per_iteration": 2.760998487472534 }, { "auxiliary_loss_clip": 0.01151099, "auxiliary_loss_mlp": 0.01026735, "balance_loss_clip": 1.0486455, "balance_loss_mlp": 1.01871765, "epoch": 0.5106715565442194, "flos": 22602212421120.0, "grad_norm": 2.665411460121389, "language_loss": 0.78518319, "learning_rate": 2.0280420604269834e-06, "loss": 0.80696154, "num_input_tokens_seen": 91563325, "step": 4247, "time_per_iteration": 2.7167375087738037 }, { "auxiliary_loss_clip": 0.01072128, "auxiliary_loss_mlp": 0.01003013, "balance_loss_clip": 1.02350152, "balance_loss_mlp": 1.00185037, "epoch": 0.5107917994348584, "flos": 71027645558400.0, "grad_norm": 0.7074366969233944, "language_loss": 0.58975589, "learning_rate": 2.027263163244895e-06, "loss": 0.61050737, "num_input_tokens_seen": 91632450, "step": 4248, "time_per_iteration": 3.4022879600524902 }, { "auxiliary_loss_clip": 0.0116035, "auxiliary_loss_mlp": 0.01025689, "balance_loss_clip": 1.05246258, "balance_loss_mlp": 1.01817298, "epoch": 0.5109120423254975, "flos": 24827416992000.0, "grad_norm": 1.564232740661229, "language_loss": 0.74750119, "learning_rate": 2.026484261927005e-06, "loss": 0.76936162, "num_input_tokens_seen": 91651945, "step": 4249, "time_per_iteration": 4.508905410766602 }, { "auxiliary_loss_clip": 0.01173392, "auxiliary_loss_mlp": 0.01032068, "balance_loss_clip": 1.05628502, "balance_loss_mlp": 1.02451646, "epoch": 0.5110322852161366, "flos": 21247661612160.0, "grad_norm": 2.42037563210008, "language_loss": 0.74327278, "learning_rate": 2.025705356591475e-06, "loss": 0.76532745, "num_input_tokens_seen": 91669635, "step": 4250, "time_per_iteration": 3.533637762069702 }, { "auxiliary_loss_clip": 0.01044528, "auxiliary_loss_mlp": 0.00702442, "balance_loss_clip": 1.02225184, "balance_loss_mlp": 1.00003147, "epoch": 0.5111525281067757, "flos": 66457114358400.0, "grad_norm": 0.7590544041373498, "language_loss": 0.57923865, "learning_rate": 2.024926447356462e-06, "loss": 0.5967083, "num_input_tokens_seen": 91731920, "step": 4251, "time_per_iteration": 3.185732841491699 }, { "auxiliary_loss_clip": 0.01163514, "auxiliary_loss_mlp": 0.01027148, "balance_loss_clip": 1.05113554, "balance_loss_mlp": 1.01922357, "epoch": 0.5112727709974147, "flos": 14866731077760.0, "grad_norm": 2.5726927923370946, "language_loss": 0.78854185, "learning_rate": 2.024147534340127e-06, "loss": 0.81044841, "num_input_tokens_seen": 91749780, "step": 4252, "time_per_iteration": 2.6472065448760986 }, { "auxiliary_loss_clip": 0.01144689, "auxiliary_loss_mlp": 0.01023421, "balance_loss_clip": 1.04816651, "balance_loss_mlp": 1.01618481, "epoch": 0.5113930138880539, "flos": 21177600134400.0, "grad_norm": 2.410170887524192, "language_loss": 0.79842412, "learning_rate": 2.02336861766063e-06, "loss": 0.82010531, "num_input_tokens_seen": 91768840, "step": 4253, "time_per_iteration": 2.6705358028411865 }, { "auxiliary_loss_clip": 0.01169659, "auxiliary_loss_mlp": 0.01035902, "balance_loss_clip": 1.05270684, "balance_loss_mlp": 1.02824831, "epoch": 0.511513256778693, "flos": 20409111630720.0, "grad_norm": 2.197931463909007, "language_loss": 0.78732687, "learning_rate": 2.0225896974361327e-06, "loss": 0.8093825, "num_input_tokens_seen": 91788945, "step": 4254, "time_per_iteration": 2.6345362663269043 }, { "auxiliary_loss_clip": 0.01046358, "auxiliary_loss_mlp": 0.01003375, "balance_loss_clip": 1.0238049, "balance_loss_mlp": 1.00223625, "epoch": 0.511633499669332, "flos": 69879975131520.0, "grad_norm": 0.8505261656482196, "language_loss": 0.59925491, "learning_rate": 2.0218107737847962e-06, "loss": 0.61975223, "num_input_tokens_seen": 91850990, "step": 4255, "time_per_iteration": 3.283324718475342 }, { "auxiliary_loss_clip": 0.01179272, "auxiliary_loss_mlp": 0.01024674, "balance_loss_clip": 1.05427825, "balance_loss_mlp": 1.01708031, "epoch": 0.5117537425599712, "flos": 24097855852800.0, "grad_norm": 2.342611650608901, "language_loss": 0.74618608, "learning_rate": 2.0210318468247826e-06, "loss": 0.76822555, "num_input_tokens_seen": 91869960, "step": 4256, "time_per_iteration": 2.6242177486419678 }, { "auxiliary_loss_clip": 0.01142839, "auxiliary_loss_mlp": 0.01024207, "balance_loss_clip": 1.04638422, "balance_loss_mlp": 1.01707864, "epoch": 0.5118739854506102, "flos": 20959550622720.0, "grad_norm": 1.9201924430990962, "language_loss": 0.81916976, "learning_rate": 2.020252916674255e-06, "loss": 0.84084022, "num_input_tokens_seen": 91889075, "step": 4257, "time_per_iteration": 2.657491683959961 }, { "auxiliary_loss_clip": 0.01164965, "auxiliary_loss_mlp": 0.01022687, "balance_loss_clip": 1.0515461, "balance_loss_mlp": 1.01545143, "epoch": 0.5119942283412493, "flos": 17457326749440.0, "grad_norm": 1.9492347794137102, "language_loss": 0.81097412, "learning_rate": 2.019473983451375e-06, "loss": 0.83285069, "num_input_tokens_seen": 91907495, "step": 4258, "time_per_iteration": 2.6556472778320312 }, { "auxiliary_loss_clip": 0.01138105, "auxiliary_loss_mlp": 0.01029885, "balance_loss_clip": 1.04909098, "balance_loss_mlp": 1.02254748, "epoch": 0.5121144712318885, "flos": 21066743784960.0, "grad_norm": 1.8629933415058983, "language_loss": 0.7171213, "learning_rate": 2.0186950472743076e-06, "loss": 0.73880118, "num_input_tokens_seen": 91927400, "step": 4259, "time_per_iteration": 2.674694538116455 }, { "auxiliary_loss_clip": 0.01179315, "auxiliary_loss_mlp": 0.01022439, "balance_loss_clip": 1.05290198, "balance_loss_mlp": 1.01507211, "epoch": 0.5122347141225275, "flos": 19860791541120.0, "grad_norm": 2.3346208814496965, "language_loss": 0.74426639, "learning_rate": 2.0179161082612162e-06, "loss": 0.76628387, "num_input_tokens_seen": 91946790, "step": 4260, "time_per_iteration": 2.6598284244537354 }, { "auxiliary_loss_clip": 0.01140675, "auxiliary_loss_mlp": 0.01024928, "balance_loss_clip": 1.04629326, "balance_loss_mlp": 1.01685727, "epoch": 0.5123549570131666, "flos": 22528487756160.0, "grad_norm": 2.8243409068893204, "language_loss": 0.72734755, "learning_rate": 2.017137166530266e-06, "loss": 0.74900353, "num_input_tokens_seen": 91966325, "step": 4261, "time_per_iteration": 2.669586181640625 }, { "auxiliary_loss_clip": 0.01151579, "auxiliary_loss_mlp": 0.01022731, "balance_loss_clip": 1.0505594, "balance_loss_mlp": 1.01573324, "epoch": 0.5124751999038056, "flos": 20333375804160.0, "grad_norm": 2.626092997550807, "language_loss": 0.79889035, "learning_rate": 2.0163582221996213e-06, "loss": 0.82063347, "num_input_tokens_seen": 91984700, "step": 4262, "time_per_iteration": 2.70578670501709 }, { "auxiliary_loss_clip": 0.01150845, "auxiliary_loss_mlp": 0.01028115, "balance_loss_clip": 1.05109322, "balance_loss_mlp": 1.01962733, "epoch": 0.5125954427944448, "flos": 39785970211200.0, "grad_norm": 2.1729052645208413, "language_loss": 0.6804474, "learning_rate": 2.015579275387446e-06, "loss": 0.70223701, "num_input_tokens_seen": 92010020, "step": 4263, "time_per_iteration": 2.8337783813476562 }, { "auxiliary_loss_clip": 0.01139542, "auxiliary_loss_mlp": 0.01028981, "balance_loss_clip": 1.0502615, "balance_loss_mlp": 1.02185273, "epoch": 0.5127156856850839, "flos": 29205394358400.0, "grad_norm": 2.7216403469614825, "language_loss": 0.68281853, "learning_rate": 2.0148003262119085e-06, "loss": 0.70450372, "num_input_tokens_seen": 92030990, "step": 4264, "time_per_iteration": 2.7193644046783447 }, { "auxiliary_loss_clip": 0.01131221, "auxiliary_loss_mlp": 0.01025855, "balance_loss_clip": 1.04791927, "balance_loss_mlp": 1.01760614, "epoch": 0.5128359285757229, "flos": 13553693412480.0, "grad_norm": 1.8983403578661509, "language_loss": 0.76943523, "learning_rate": 2.0140213747911728e-06, "loss": 0.79100609, "num_input_tokens_seen": 92049525, "step": 4265, "time_per_iteration": 2.6740591526031494 }, { "auxiliary_loss_clip": 0.01132981, "auxiliary_loss_mlp": 0.01031172, "balance_loss_clip": 1.05114508, "balance_loss_mlp": 1.02317333, "epoch": 0.5129561714663621, "flos": 25192089820800.0, "grad_norm": 2.294259050708184, "language_loss": 0.80434036, "learning_rate": 2.013242421243406e-06, "loss": 0.82598186, "num_input_tokens_seen": 92068430, "step": 4266, "time_per_iteration": 2.7428996562957764 }, { "auxiliary_loss_clip": 0.01116432, "auxiliary_loss_mlp": 0.0102575, "balance_loss_clip": 1.04954147, "balance_loss_mlp": 1.01866567, "epoch": 0.5130764143570011, "flos": 18150223080960.0, "grad_norm": 1.693421051168938, "language_loss": 0.79064059, "learning_rate": 2.012463465686774e-06, "loss": 0.81206238, "num_input_tokens_seen": 92088180, "step": 4267, "time_per_iteration": 2.719803810119629 }, { "auxiliary_loss_clip": 0.01054245, "auxiliary_loss_mlp": 0.01007242, "balance_loss_clip": 1.04578948, "balance_loss_mlp": 1.0059365, "epoch": 0.5131966572476402, "flos": 59794896418560.0, "grad_norm": 0.7676978667177158, "language_loss": 0.54702747, "learning_rate": 2.0116845082394446e-06, "loss": 0.56764233, "num_input_tokens_seen": 92153015, "step": 4268, "time_per_iteration": 3.3690874576568604 }, { "auxiliary_loss_clip": 0.01167523, "auxiliary_loss_mlp": 0.01026343, "balance_loss_clip": 1.05152822, "balance_loss_mlp": 1.01855254, "epoch": 0.5133169001382794, "flos": 18515219132160.0, "grad_norm": 1.9085813408414753, "language_loss": 0.78979135, "learning_rate": 2.0109055490195836e-06, "loss": 0.81173003, "num_input_tokens_seen": 92171470, "step": 4269, "time_per_iteration": 2.7650930881500244 }, { "auxiliary_loss_clip": 0.01102876, "auxiliary_loss_mlp": 0.01024433, "balance_loss_clip": 1.04155993, "balance_loss_mlp": 1.01708984, "epoch": 0.5134371430289184, "flos": 15523537219200.0, "grad_norm": 6.29555735416085, "language_loss": 0.64188355, "learning_rate": 2.0101265881453605e-06, "loss": 0.66315663, "num_input_tokens_seen": 92189945, "step": 4270, "time_per_iteration": 2.70282244682312 }, { "auxiliary_loss_clip": 0.0114417, "auxiliary_loss_mlp": 0.01032044, "balance_loss_clip": 1.05234194, "balance_loss_mlp": 1.02510059, "epoch": 0.5135573859195575, "flos": 21433786911360.0, "grad_norm": 2.174125381619533, "language_loss": 0.78211266, "learning_rate": 2.009347625734941e-06, "loss": 0.80387479, "num_input_tokens_seen": 92209855, "step": 4271, "time_per_iteration": 2.6790263652801514 }, { "auxiliary_loss_clip": 0.01182878, "auxiliary_loss_mlp": 0.01028179, "balance_loss_clip": 1.05576229, "balance_loss_mlp": 1.02041876, "epoch": 0.5136776288101966, "flos": 17712651600000.0, "grad_norm": 3.5502383911334765, "language_loss": 0.75297397, "learning_rate": 2.0085686619064954e-06, "loss": 0.7750845, "num_input_tokens_seen": 92226295, "step": 4272, "time_per_iteration": 2.565962076187134 }, { "auxiliary_loss_clip": 0.01167535, "auxiliary_loss_mlp": 0.01023499, "balance_loss_clip": 1.05315733, "balance_loss_mlp": 1.01640058, "epoch": 0.5137978717008357, "flos": 16581680997120.0, "grad_norm": 2.2129415065399427, "language_loss": 0.82964247, "learning_rate": 2.00778969677819e-06, "loss": 0.85155284, "num_input_tokens_seen": 92243330, "step": 4273, "time_per_iteration": 2.6204099655151367 }, { "auxiliary_loss_clip": 0.01148174, "auxiliary_loss_mlp": 0.01024665, "balance_loss_clip": 1.05092752, "balance_loss_mlp": 1.01745248, "epoch": 0.5139181145914747, "flos": 20668243322880.0, "grad_norm": 2.4288135078884268, "language_loss": 0.64095956, "learning_rate": 2.0070107304681934e-06, "loss": 0.66268802, "num_input_tokens_seen": 92262285, "step": 4274, "time_per_iteration": 2.717730760574341 }, { "auxiliary_loss_clip": 0.01130881, "auxiliary_loss_mlp": 0.01022412, "balance_loss_clip": 1.05099607, "balance_loss_mlp": 1.0153873, "epoch": 0.5140383574821139, "flos": 32926996546560.0, "grad_norm": 5.848434098856962, "language_loss": 0.7839613, "learning_rate": 2.006231763094675e-06, "loss": 0.80549419, "num_input_tokens_seen": 92283305, "step": 4275, "time_per_iteration": 3.7376554012298584 }, { "auxiliary_loss_clip": 0.01143084, "auxiliary_loss_mlp": 0.01025011, "balance_loss_clip": 1.05294812, "balance_loss_mlp": 1.01793027, "epoch": 0.514158600372753, "flos": 19537093152000.0, "grad_norm": 1.856490662024069, "language_loss": 0.87416917, "learning_rate": 2.0054527947758027e-06, "loss": 0.89585012, "num_input_tokens_seen": 92302105, "step": 4276, "time_per_iteration": 3.6000447273254395 }, { "auxiliary_loss_clip": 0.01072866, "auxiliary_loss_mlp": 0.01000941, "balance_loss_clip": 1.02314055, "balance_loss_mlp": 0.99955225, "epoch": 0.514278843263392, "flos": 62523855279360.0, "grad_norm": 0.7253093884607137, "language_loss": 0.55830789, "learning_rate": 2.004673825629746e-06, "loss": 0.57904589, "num_input_tokens_seen": 92362885, "step": 4277, "time_per_iteration": 3.1764233112335205 }, { "auxiliary_loss_clip": 0.01142652, "auxiliary_loss_mlp": 0.01027736, "balance_loss_clip": 1.04882836, "balance_loss_mlp": 1.02011824, "epoch": 0.5143990861540312, "flos": 25882328545920.0, "grad_norm": 1.5142984073672956, "language_loss": 0.72352421, "learning_rate": 2.0038948557746744e-06, "loss": 0.74522811, "num_input_tokens_seen": 92384740, "step": 4278, "time_per_iteration": 2.739112615585327 }, { "auxiliary_loss_clip": 0.01161823, "auxiliary_loss_mlp": 0.01024019, "balance_loss_clip": 1.05332518, "balance_loss_mlp": 1.01665807, "epoch": 0.5145193290446702, "flos": 23330660238720.0, "grad_norm": 3.334909413282669, "language_loss": 0.75403643, "learning_rate": 2.0031158853287558e-06, "loss": 0.77589488, "num_input_tokens_seen": 92405175, "step": 4279, "time_per_iteration": 2.677525758743286 }, { "auxiliary_loss_clip": 0.01149256, "auxiliary_loss_mlp": 0.01024747, "balance_loss_clip": 1.05355597, "balance_loss_mlp": 1.01752841, "epoch": 0.5146395719353093, "flos": 22856603518080.0, "grad_norm": 3.0024873655822693, "language_loss": 0.70395905, "learning_rate": 2.0023369144101593e-06, "loss": 0.72569907, "num_input_tokens_seen": 92423345, "step": 4280, "time_per_iteration": 2.615157127380371 }, { "auxiliary_loss_clip": 0.01138525, "auxiliary_loss_mlp": 0.01020871, "balance_loss_clip": 1.04796433, "balance_loss_mlp": 1.01365328, "epoch": 0.5147598148259485, "flos": 26391577616640.0, "grad_norm": 2.461338097162191, "language_loss": 0.76623094, "learning_rate": 2.0015579431370555e-06, "loss": 0.78782493, "num_input_tokens_seen": 92445025, "step": 4281, "time_per_iteration": 2.7134053707122803 }, { "auxiliary_loss_clip": 0.01160634, "auxiliary_loss_mlp": 0.01032216, "balance_loss_clip": 1.052876, "balance_loss_mlp": 1.02505171, "epoch": 0.5148800577165875, "flos": 29965694561280.0, "grad_norm": 2.168092841923491, "language_loss": 0.70396835, "learning_rate": 2.000778971627612e-06, "loss": 0.72589684, "num_input_tokens_seen": 92464490, "step": 4282, "time_per_iteration": 2.6476693153381348 }, { "auxiliary_loss_clip": 0.01139843, "auxiliary_loss_mlp": 0.01027992, "balance_loss_clip": 1.04779387, "balance_loss_mlp": 1.02043402, "epoch": 0.5150003006072266, "flos": 17931383470080.0, "grad_norm": 2.01708402775531, "language_loss": 0.90333074, "learning_rate": 2e-06, "loss": 0.92500907, "num_input_tokens_seen": 92482085, "step": 4283, "time_per_iteration": 2.6814844608306885 }, { "auxiliary_loss_clip": 0.01176351, "auxiliary_loss_mlp": 0.01022413, "balance_loss_clip": 1.05271387, "balance_loss_mlp": 1.01537371, "epoch": 0.5151205434978657, "flos": 18478733892480.0, "grad_norm": 22.788766320309758, "language_loss": 0.85733676, "learning_rate": 1.9992210283723878e-06, "loss": 0.87932444, "num_input_tokens_seen": 92499325, "step": 4284, "time_per_iteration": 2.5244860649108887 }, { "auxiliary_loss_clip": 0.0117482, "auxiliary_loss_mlp": 0.01023932, "balance_loss_clip": 1.05279577, "balance_loss_mlp": 1.01685393, "epoch": 0.5152407863885048, "flos": 25341263003520.0, "grad_norm": 1.5936427937274307, "language_loss": 0.79206961, "learning_rate": 1.9984420568629448e-06, "loss": 0.81405711, "num_input_tokens_seen": 92522090, "step": 4285, "time_per_iteration": 2.670145034790039 }, { "auxiliary_loss_clip": 0.01167591, "auxiliary_loss_mlp": 0.01028776, "balance_loss_clip": 1.05368829, "balance_loss_mlp": 1.0217185, "epoch": 0.5153610292791438, "flos": 18329740277760.0, "grad_norm": 2.0142964372665517, "language_loss": 0.78991032, "learning_rate": 1.9976630855898405e-06, "loss": 0.81187403, "num_input_tokens_seen": 92539845, "step": 4286, "time_per_iteration": 2.6224284172058105 }, { "auxiliary_loss_clip": 0.01140947, "auxiliary_loss_mlp": 0.01020894, "balance_loss_clip": 1.04547322, "balance_loss_mlp": 1.01417089, "epoch": 0.515481272169783, "flos": 30409945971840.0, "grad_norm": 2.2399570595462697, "language_loss": 0.74939322, "learning_rate": 1.9968841146712445e-06, "loss": 0.77101171, "num_input_tokens_seen": 92559460, "step": 4287, "time_per_iteration": 2.7943649291992188 }, { "auxiliary_loss_clip": 0.01098105, "auxiliary_loss_mlp": 0.00711139, "balance_loss_clip": 1.0459888, "balance_loss_mlp": 1.00078082, "epoch": 0.5156015150604221, "flos": 23037305863680.0, "grad_norm": 1.9021478474222122, "language_loss": 0.71173513, "learning_rate": 1.996105144225326e-06, "loss": 0.72982758, "num_input_tokens_seen": 92579695, "step": 4288, "time_per_iteration": 2.7774345874786377 }, { "auxiliary_loss_clip": 0.01163811, "auxiliary_loss_mlp": 0.01024585, "balance_loss_clip": 1.05330265, "balance_loss_mlp": 1.01726556, "epoch": 0.5157217579510611, "flos": 17858556645120.0, "grad_norm": 2.407417648221821, "language_loss": 0.7813772, "learning_rate": 1.995326174370254e-06, "loss": 0.80326116, "num_input_tokens_seen": 92598795, "step": 4289, "time_per_iteration": 2.686309337615967 }, { "auxiliary_loss_clip": 0.01159883, "auxiliary_loss_mlp": 0.00710736, "balance_loss_clip": 1.04921901, "balance_loss_mlp": 1.00053799, "epoch": 0.5158420008417003, "flos": 19171486569600.0, "grad_norm": 3.1832043992444703, "language_loss": 0.72812343, "learning_rate": 1.994547205224197e-06, "loss": 0.74682957, "num_input_tokens_seen": 92617700, "step": 4290, "time_per_iteration": 2.6773996353149414 }, { "auxiliary_loss_clip": 0.01144637, "auxiliary_loss_mlp": 0.01020564, "balance_loss_clip": 1.0512414, "balance_loss_mlp": 1.01327395, "epoch": 0.5159622437323393, "flos": 22419534827520.0, "grad_norm": 1.9691431850391596, "language_loss": 0.67526674, "learning_rate": 1.993768236905325e-06, "loss": 0.69691879, "num_input_tokens_seen": 92638370, "step": 4291, "time_per_iteration": 2.76088547706604 }, { "auxiliary_loss_clip": 0.01143067, "auxiliary_loss_mlp": 0.01023984, "balance_loss_clip": 1.04832554, "balance_loss_mlp": 1.01665235, "epoch": 0.5160824866229784, "flos": 24603010773120.0, "grad_norm": 3.3231137553580474, "language_loss": 0.65849054, "learning_rate": 1.992989269531807e-06, "loss": 0.68016106, "num_input_tokens_seen": 92657180, "step": 4292, "time_per_iteration": 2.6974644660949707 }, { "auxiliary_loss_clip": 0.0114454, "auxiliary_loss_mlp": 0.0102603, "balance_loss_clip": 1.04779232, "balance_loss_mlp": 1.01867795, "epoch": 0.5162027295136175, "flos": 18002737837440.0, "grad_norm": 2.6918606730801096, "language_loss": 0.68366462, "learning_rate": 1.99221030322181e-06, "loss": 0.70537031, "num_input_tokens_seen": 92673985, "step": 4293, "time_per_iteration": 2.6134979724884033 }, { "auxiliary_loss_clip": 0.01149635, "auxiliary_loss_mlp": 0.01025177, "balance_loss_clip": 1.04947138, "balance_loss_mlp": 1.01760769, "epoch": 0.5163229724042566, "flos": 27344611221120.0, "grad_norm": 1.8538601063237132, "language_loss": 0.8071326, "learning_rate": 1.991431338093505e-06, "loss": 0.82888073, "num_input_tokens_seen": 92696340, "step": 4294, "time_per_iteration": 2.7932186126708984 }, { "auxiliary_loss_clip": 0.01148438, "auxiliary_loss_mlp": 0.01023438, "balance_loss_clip": 1.0530653, "balance_loss_mlp": 1.01642299, "epoch": 0.5164432152948957, "flos": 21762764599680.0, "grad_norm": 1.8041660005920563, "language_loss": 0.79461706, "learning_rate": 1.9906523742650587e-06, "loss": 0.81633586, "num_input_tokens_seen": 92715200, "step": 4295, "time_per_iteration": 2.6697888374328613 }, { "auxiliary_loss_clip": 0.01178478, "auxiliary_loss_mlp": 0.01023793, "balance_loss_clip": 1.05048728, "balance_loss_mlp": 1.0163368, "epoch": 0.5165634581855347, "flos": 25550334115200.0, "grad_norm": 2.24066521277971, "language_loss": 0.77323335, "learning_rate": 1.9898734118546397e-06, "loss": 0.79525602, "num_input_tokens_seen": 92735150, "step": 4296, "time_per_iteration": 2.650196075439453 }, { "auxiliary_loss_clip": 0.01087211, "auxiliary_loss_mlp": 0.01023944, "balance_loss_clip": 1.04469681, "balance_loss_mlp": 1.01639771, "epoch": 0.5166837010761739, "flos": 19901191363200.0, "grad_norm": 1.6792449216620406, "language_loss": 0.80499506, "learning_rate": 1.989094450980416e-06, "loss": 0.82610655, "num_input_tokens_seen": 92755250, "step": 4297, "time_per_iteration": 2.7948052883148193 }, { "auxiliary_loss_clip": 0.01159958, "auxiliary_loss_mlp": 0.01023258, "balance_loss_clip": 1.0515852, "balance_loss_mlp": 1.01644492, "epoch": 0.516803943966813, "flos": 26646076454400.0, "grad_norm": 2.9089073743375944, "language_loss": 0.7674588, "learning_rate": 1.9883154917605556e-06, "loss": 0.78929096, "num_input_tokens_seen": 92774460, "step": 4298, "time_per_iteration": 3.229276418685913 }, { "auxiliary_loss_clip": 0.01175225, "auxiliary_loss_mlp": 0.01028147, "balance_loss_clip": 1.05062771, "balance_loss_mlp": 1.02084792, "epoch": 0.516924186857452, "flos": 19682854542720.0, "grad_norm": 1.923542771120969, "language_loss": 0.83400536, "learning_rate": 1.9875365343132262e-06, "loss": 0.85603911, "num_input_tokens_seen": 92791580, "step": 4299, "time_per_iteration": 2.56648325920105 }, { "auxiliary_loss_clip": 0.01165024, "auxiliary_loss_mlp": 0.00711058, "balance_loss_clip": 1.05450368, "balance_loss_mlp": 1.00057566, "epoch": 0.5170444297480912, "flos": 15956583586560.0, "grad_norm": 3.090495258368083, "language_loss": 0.84705579, "learning_rate": 1.9867575787565946e-06, "loss": 0.86581659, "num_input_tokens_seen": 92806240, "step": 4300, "time_per_iteration": 2.5974040031433105 }, { "auxiliary_loss_clip": 0.0116328, "auxiliary_loss_mlp": 0.01027941, "balance_loss_clip": 1.05287194, "balance_loss_mlp": 1.02011466, "epoch": 0.5171646726387302, "flos": 14174157968640.0, "grad_norm": 2.004253190942682, "language_loss": 0.86116844, "learning_rate": 1.9859786252088275e-06, "loss": 0.8830806, "num_input_tokens_seen": 92823420, "step": 4301, "time_per_iteration": 3.739797592163086 }, { "auxiliary_loss_clip": 0.01133176, "auxiliary_loss_mlp": 0.01026463, "balance_loss_clip": 1.04874563, "balance_loss_mlp": 1.01838636, "epoch": 0.5172849155293693, "flos": 23578550974080.0, "grad_norm": 21.30197172973917, "language_loss": 0.66981882, "learning_rate": 1.9851996737880914e-06, "loss": 0.69141519, "num_input_tokens_seen": 92838605, "step": 4302, "time_per_iteration": 3.792930841445923 }, { "auxiliary_loss_clip": 0.01167208, "auxiliary_loss_mlp": 0.01028055, "balance_loss_clip": 1.05260015, "balance_loss_mlp": 1.01995468, "epoch": 0.5174051584200084, "flos": 14283541860480.0, "grad_norm": 4.128143928265227, "language_loss": 0.74270666, "learning_rate": 1.9844207246125537e-06, "loss": 0.76465929, "num_input_tokens_seen": 92855185, "step": 4303, "time_per_iteration": 2.6263182163238525 }, { "auxiliary_loss_clip": 0.01144168, "auxiliary_loss_mlp": 0.01025288, "balance_loss_clip": 1.04981995, "balance_loss_mlp": 1.01804602, "epoch": 0.5175254013106475, "flos": 37889384192640.0, "grad_norm": 2.5102298296643126, "language_loss": 0.68520224, "learning_rate": 1.983641777800379e-06, "loss": 0.70689678, "num_input_tokens_seen": 92877830, "step": 4304, "time_per_iteration": 2.8410611152648926 }, { "auxiliary_loss_clip": 0.01064888, "auxiliary_loss_mlp": 0.01003683, "balance_loss_clip": 1.02357256, "balance_loss_mlp": 1.00200176, "epoch": 0.5176456442012866, "flos": 68549737829760.0, "grad_norm": 0.750074913518163, "language_loss": 0.58849937, "learning_rate": 1.9828628334697343e-06, "loss": 0.6091851, "num_input_tokens_seen": 92945040, "step": 4305, "time_per_iteration": 3.411745548248291 }, { "auxiliary_loss_clip": 0.01068106, "auxiliary_loss_mlp": 0.0100429, "balance_loss_clip": 1.02617931, "balance_loss_mlp": 1.0026207, "epoch": 0.5177658870919257, "flos": 64084137235200.0, "grad_norm": 0.7688790916286268, "language_loss": 0.54641533, "learning_rate": 1.982083891738784e-06, "loss": 0.56713927, "num_input_tokens_seen": 93005910, "step": 4306, "time_per_iteration": 3.289933919906616 }, { "auxiliary_loss_clip": 0.0114025, "auxiliary_loss_mlp": 0.01024297, "balance_loss_clip": 1.05220819, "balance_loss_mlp": 1.01716876, "epoch": 0.5178861299825648, "flos": 26651248012800.0, "grad_norm": 1.5578788563477113, "language_loss": 0.83019924, "learning_rate": 1.9813049527256923e-06, "loss": 0.85184473, "num_input_tokens_seen": 93026305, "step": 4307, "time_per_iteration": 2.662565231323242 }, { "auxiliary_loss_clip": 0.01128059, "auxiliary_loss_mlp": 0.0102156, "balance_loss_clip": 1.04615116, "balance_loss_mlp": 1.01441395, "epoch": 0.5180063728732038, "flos": 17931886260480.0, "grad_norm": 1.9710387516894516, "language_loss": 0.82313991, "learning_rate": 1.9805260165486252e-06, "loss": 0.84463608, "num_input_tokens_seen": 93045675, "step": 4308, "time_per_iteration": 2.7636170387268066 }, { "auxiliary_loss_clip": 0.01165216, "auxiliary_loss_mlp": 0.01024269, "balance_loss_clip": 1.05370259, "balance_loss_mlp": 1.01719069, "epoch": 0.518126615763843, "flos": 19500895221120.0, "grad_norm": 1.8529350848163437, "language_loss": 0.86489904, "learning_rate": 1.9797470833257457e-06, "loss": 0.88679385, "num_input_tokens_seen": 93065375, "step": 4309, "time_per_iteration": 2.6189608573913574 }, { "auxiliary_loss_clip": 0.01165918, "auxiliary_loss_mlp": 0.01030112, "balance_loss_clip": 1.05680203, "balance_loss_mlp": 1.02282214, "epoch": 0.5182468586544821, "flos": 20704082117760.0, "grad_norm": 3.1223845970678044, "language_loss": 0.77428925, "learning_rate": 1.9789681531752177e-06, "loss": 0.79624957, "num_input_tokens_seen": 93085595, "step": 4310, "time_per_iteration": 2.650998592376709 }, { "auxiliary_loss_clip": 0.01109835, "auxiliary_loss_mlp": 0.01028674, "balance_loss_clip": 1.0456059, "balance_loss_mlp": 1.02180767, "epoch": 0.5183671015451211, "flos": 23112107936640.0, "grad_norm": 1.5337812605068548, "language_loss": 0.7270391, "learning_rate": 1.978189226215204e-06, "loss": 0.74842417, "num_input_tokens_seen": 93106140, "step": 4311, "time_per_iteration": 2.742910385131836 }, { "auxiliary_loss_clip": 0.01178215, "auxiliary_loss_mlp": 0.01027438, "balance_loss_clip": 1.05263758, "balance_loss_mlp": 1.01985669, "epoch": 0.5184873444357603, "flos": 17597090568960.0, "grad_norm": 1.9675044335281746, "language_loss": 0.76989073, "learning_rate": 1.9774103025638675e-06, "loss": 0.79194725, "num_input_tokens_seen": 93124265, "step": 4312, "time_per_iteration": 2.588409900665283 }, { "auxiliary_loss_clip": 0.0111684, "auxiliary_loss_mlp": 0.01025442, "balance_loss_clip": 1.0501287, "balance_loss_mlp": 1.01797915, "epoch": 0.5186075873263993, "flos": 24936800883840.0, "grad_norm": 1.8427448442413779, "language_loss": 0.76378286, "learning_rate": 1.9766313823393696e-06, "loss": 0.78520572, "num_input_tokens_seen": 93145130, "step": 4313, "time_per_iteration": 2.777946710586548 }, { "auxiliary_loss_clip": 0.01105931, "auxiliary_loss_mlp": 0.01027222, "balance_loss_clip": 1.04195714, "balance_loss_mlp": 1.0196166, "epoch": 0.5187278302170384, "flos": 15190106244480.0, "grad_norm": 3.5419784999411874, "language_loss": 0.69425189, "learning_rate": 1.975852465659873e-06, "loss": 0.71558338, "num_input_tokens_seen": 93161110, "step": 4314, "time_per_iteration": 2.6824352741241455 }, { "auxiliary_loss_clip": 0.01164281, "auxiliary_loss_mlp": 0.01024881, "balance_loss_clip": 1.05288446, "balance_loss_mlp": 1.01704955, "epoch": 0.5188480731076776, "flos": 25009412227200.0, "grad_norm": 3.1122114952883493, "language_loss": 0.70500875, "learning_rate": 1.9750735526435377e-06, "loss": 0.72690034, "num_input_tokens_seen": 93178055, "step": 4315, "time_per_iteration": 2.6782500743865967 }, { "auxiliary_loss_clip": 0.01147665, "auxiliary_loss_mlp": 0.01024967, "balance_loss_clip": 1.05142212, "balance_loss_mlp": 1.01716447, "epoch": 0.5189683159983166, "flos": 24790141653120.0, "grad_norm": 2.37736964720317, "language_loss": 0.79144561, "learning_rate": 1.974294643408525e-06, "loss": 0.81317186, "num_input_tokens_seen": 93195850, "step": 4316, "time_per_iteration": 2.66965651512146 }, { "auxiliary_loss_clip": 0.01164497, "auxiliary_loss_mlp": 0.01027869, "balance_loss_clip": 1.04996586, "balance_loss_mlp": 1.0205915, "epoch": 0.5190885588889557, "flos": 24754266944640.0, "grad_norm": 2.573331017859397, "language_loss": 0.67447078, "learning_rate": 1.9735157380729947e-06, "loss": 0.69639444, "num_input_tokens_seen": 93216260, "step": 4317, "time_per_iteration": 2.6751773357391357 }, { "auxiliary_loss_clip": 0.01146439, "auxiliary_loss_mlp": 0.01030511, "balance_loss_clip": 1.04873562, "balance_loss_mlp": 1.0228219, "epoch": 0.5192088017795948, "flos": 24712646060160.0, "grad_norm": 1.8431879130601903, "language_loss": 0.84163833, "learning_rate": 1.9727368367551053e-06, "loss": 0.86340785, "num_input_tokens_seen": 93234810, "step": 4318, "time_per_iteration": 2.676508903503418 }, { "auxiliary_loss_clip": 0.01136306, "auxiliary_loss_mlp": 0.01022972, "balance_loss_clip": 1.04787314, "balance_loss_mlp": 1.01548553, "epoch": 0.5193290446702339, "flos": 27229588894080.0, "grad_norm": 1.8884078990981679, "language_loss": 0.6824156, "learning_rate": 1.9719579395730164e-06, "loss": 0.70400834, "num_input_tokens_seen": 93254185, "step": 4319, "time_per_iteration": 2.6949877738952637 }, { "auxiliary_loss_clip": 0.0118125, "auxiliary_loss_mlp": 0.01028972, "balance_loss_clip": 1.05646467, "balance_loss_mlp": 1.02161121, "epoch": 0.5194492875608729, "flos": 11473352392320.0, "grad_norm": 2.2088170253518995, "language_loss": 0.93453348, "learning_rate": 1.9711790466448854e-06, "loss": 0.95663565, "num_input_tokens_seen": 93268205, "step": 4320, "time_per_iteration": 2.59029483795166 }, { "auxiliary_loss_clip": 0.01118429, "auxiliary_loss_mlp": 0.01030249, "balance_loss_clip": 1.04853642, "balance_loss_mlp": 1.02337062, "epoch": 0.5195695304515121, "flos": 20338906498560.0, "grad_norm": 2.4071369169211896, "language_loss": 0.71661615, "learning_rate": 1.9704001580888704e-06, "loss": 0.73810303, "num_input_tokens_seen": 93286945, "step": 4321, "time_per_iteration": 2.7819743156433105 }, { "auxiliary_loss_clip": 0.01142245, "auxiliary_loss_mlp": 0.00711236, "balance_loss_clip": 1.04876804, "balance_loss_mlp": 1.00056744, "epoch": 0.5196897733421512, "flos": 20048317470720.0, "grad_norm": 1.9282964847596582, "language_loss": 0.86423284, "learning_rate": 1.9696212740231283e-06, "loss": 0.88276768, "num_input_tokens_seen": 93305595, "step": 4322, "time_per_iteration": 2.6565630435943604 }, { "auxiliary_loss_clip": 0.01169596, "auxiliary_loss_mlp": 0.01024368, "balance_loss_clip": 1.05294561, "balance_loss_mlp": 1.01672053, "epoch": 0.5198100162327902, "flos": 23805507058560.0, "grad_norm": 2.485592830994901, "language_loss": 0.82085472, "learning_rate": 1.9688423945658146e-06, "loss": 0.8427943, "num_input_tokens_seen": 93326460, "step": 4323, "time_per_iteration": 2.714139461517334 }, { "auxiliary_loss_clip": 0.01104522, "auxiliary_loss_mlp": 0.01023569, "balance_loss_clip": 1.0417881, "balance_loss_mlp": 1.015499, "epoch": 0.5199302591234293, "flos": 24023951619840.0, "grad_norm": 2.4368028315941705, "language_loss": 0.72062123, "learning_rate": 1.9680635198350845e-06, "loss": 0.74190211, "num_input_tokens_seen": 93346170, "step": 4324, "time_per_iteration": 2.817974805831909 }, { "auxiliary_loss_clip": 0.01163133, "auxiliary_loss_mlp": 0.01025999, "balance_loss_clip": 1.05114961, "balance_loss_mlp": 1.01870346, "epoch": 0.5200505020140684, "flos": 26359366095360.0, "grad_norm": 2.5617169165985985, "language_loss": 0.72587764, "learning_rate": 1.967284649949093e-06, "loss": 0.747769, "num_input_tokens_seen": 93365380, "step": 4325, "time_per_iteration": 2.6761064529418945 }, { "auxiliary_loss_clip": 0.01128899, "auxiliary_loss_mlp": 0.01023788, "balance_loss_clip": 1.0468086, "balance_loss_mlp": 1.01651049, "epoch": 0.5201707449047075, "flos": 39604262284800.0, "grad_norm": 2.2631123548362386, "language_loss": 0.72192204, "learning_rate": 1.966505785025994e-06, "loss": 0.74344897, "num_input_tokens_seen": 93387285, "step": 4326, "time_per_iteration": 2.8797073364257812 }, { "auxiliary_loss_clip": 0.01133287, "auxiliary_loss_mlp": 0.01024371, "balance_loss_clip": 1.0517391, "balance_loss_mlp": 1.01664674, "epoch": 0.5202909877953465, "flos": 53682788292480.0, "grad_norm": 1.727670246848122, "language_loss": 0.76369226, "learning_rate": 1.965726925183941e-06, "loss": 0.78526884, "num_input_tokens_seen": 93410390, "step": 4327, "time_per_iteration": 4.727513551712036 }, { "auxiliary_loss_clip": 0.01177608, "auxiliary_loss_mlp": 0.01022343, "balance_loss_clip": 1.05269372, "balance_loss_mlp": 1.01462436, "epoch": 0.5204112306859857, "flos": 19537021324800.0, "grad_norm": 1.8682754513619806, "language_loss": 0.84644127, "learning_rate": 1.964948070541087e-06, "loss": 0.86844081, "num_input_tokens_seen": 93429050, "step": 4328, "time_per_iteration": 3.545700788497925 }, { "auxiliary_loss_clip": 0.01151778, "auxiliary_loss_mlp": 0.01025565, "balance_loss_clip": 1.04743004, "balance_loss_mlp": 1.01840067, "epoch": 0.5205314735766248, "flos": 15304697608320.0, "grad_norm": 2.3769796897319186, "language_loss": 0.6988076, "learning_rate": 1.9641692212155816e-06, "loss": 0.720581, "num_input_tokens_seen": 93446815, "step": 4329, "time_per_iteration": 2.6589083671569824 }, { "auxiliary_loss_clip": 0.01113949, "auxiliary_loss_mlp": 0.01025113, "balance_loss_clip": 1.05125523, "balance_loss_mlp": 1.017802, "epoch": 0.5206517164672638, "flos": 59263701160320.0, "grad_norm": 1.970430265604217, "language_loss": 0.72698796, "learning_rate": 1.9633903773255777e-06, "loss": 0.74837852, "num_input_tokens_seen": 93469130, "step": 4330, "time_per_iteration": 3.1066927909851074 }, { "auxiliary_loss_clip": 0.01177067, "auxiliary_loss_mlp": 0.01028392, "balance_loss_clip": 1.05111396, "balance_loss_mlp": 1.02126384, "epoch": 0.520771959357903, "flos": 26871129118080.0, "grad_norm": 2.0685284609050356, "language_loss": 0.75154257, "learning_rate": 1.9626115389892237e-06, "loss": 0.77359712, "num_input_tokens_seen": 93489920, "step": 4331, "time_per_iteration": 2.73506236076355 }, { "auxiliary_loss_clip": 0.01137361, "auxiliary_loss_mlp": 0.01025396, "balance_loss_clip": 1.04966664, "balance_loss_mlp": 1.01830339, "epoch": 0.520892202248542, "flos": 26907075653760.0, "grad_norm": 7.410763113172215, "language_loss": 0.85704052, "learning_rate": 1.96183270632467e-06, "loss": 0.87866813, "num_input_tokens_seen": 93509770, "step": 4332, "time_per_iteration": 2.7546606063842773 }, { "auxiliary_loss_clip": 0.01124972, "auxiliary_loss_mlp": 0.00711618, "balance_loss_clip": 1.04675579, "balance_loss_mlp": 1.00040925, "epoch": 0.5210124451391811, "flos": 25849434666240.0, "grad_norm": 2.10825573076747, "language_loss": 0.79164028, "learning_rate": 1.9610538794500644e-06, "loss": 0.81000614, "num_input_tokens_seen": 93529320, "step": 4333, "time_per_iteration": 2.647575616836548 }, { "auxiliary_loss_clip": 0.01051031, "auxiliary_loss_mlp": 0.01001808, "balance_loss_clip": 1.02367187, "balance_loss_mlp": 1.00031745, "epoch": 0.5211326880298203, "flos": 70553804319360.0, "grad_norm": 0.7796681837545347, "language_loss": 0.59436005, "learning_rate": 1.9602750584835542e-06, "loss": 0.61488843, "num_input_tokens_seen": 93595255, "step": 4334, "time_per_iteration": 3.413478374481201 }, { "auxiliary_loss_clip": 0.01141863, "auxiliary_loss_mlp": 0.01024648, "balance_loss_clip": 1.04709578, "balance_loss_mlp": 1.01747799, "epoch": 0.5212529309204593, "flos": 15628898787840.0, "grad_norm": 2.382682340429211, "language_loss": 0.82690239, "learning_rate": 1.959496243543286e-06, "loss": 0.84856749, "num_input_tokens_seen": 93613135, "step": 4335, "time_per_iteration": 2.6467723846435547 }, { "auxiliary_loss_clip": 0.01167304, "auxiliary_loss_mlp": 0.0103121, "balance_loss_clip": 1.05652928, "balance_loss_mlp": 1.02316988, "epoch": 0.5213731738110984, "flos": 26242655829120.0, "grad_norm": 2.062132720364949, "language_loss": 0.79419136, "learning_rate": 1.9587174347474057e-06, "loss": 0.81617653, "num_input_tokens_seen": 93629645, "step": 4336, "time_per_iteration": 2.66863751411438 }, { "auxiliary_loss_clip": 0.01101257, "auxiliary_loss_mlp": 0.01026582, "balance_loss_clip": 1.04271376, "balance_loss_mlp": 1.01883316, "epoch": 0.5214934167017375, "flos": 19418407637760.0, "grad_norm": 5.312128433684954, "language_loss": 0.82420522, "learning_rate": 1.9579386322140574e-06, "loss": 0.84548366, "num_input_tokens_seen": 93645325, "step": 4337, "time_per_iteration": 2.673830270767212 }, { "auxiliary_loss_clip": 0.01181653, "auxiliary_loss_mlp": 0.00711496, "balance_loss_clip": 1.05404449, "balance_loss_mlp": 1.00053203, "epoch": 0.5216136595923766, "flos": 30955788023040.0, "grad_norm": 2.245763745738672, "language_loss": 0.81022835, "learning_rate": 1.9571598360613854e-06, "loss": 0.82915986, "num_input_tokens_seen": 93668200, "step": 4338, "time_per_iteration": 2.674776792526245 }, { "auxiliary_loss_clip": 0.01132704, "auxiliary_loss_mlp": 0.01022665, "balance_loss_clip": 1.04606557, "balance_loss_mlp": 1.01518416, "epoch": 0.5217339024830157, "flos": 21945047143680.0, "grad_norm": 2.670414085414979, "language_loss": 0.69721913, "learning_rate": 1.956381046407532e-06, "loss": 0.71877289, "num_input_tokens_seen": 93688495, "step": 4339, "time_per_iteration": 2.637963056564331 }, { "auxiliary_loss_clip": 0.0113017, "auxiliary_loss_mlp": 0.01029217, "balance_loss_clip": 1.04998887, "balance_loss_mlp": 1.02177858, "epoch": 0.5218541453736548, "flos": 20923209037440.0, "grad_norm": 2.335107551941037, "language_loss": 0.85827184, "learning_rate": 1.9556022633706394e-06, "loss": 0.87986577, "num_input_tokens_seen": 93707285, "step": 4340, "time_per_iteration": 2.803645133972168 }, { "auxiliary_loss_clip": 0.01138937, "auxiliary_loss_mlp": 0.01033472, "balance_loss_clip": 1.049402, "balance_loss_mlp": 1.02607524, "epoch": 0.5219743882642939, "flos": 23951663498880.0, "grad_norm": 1.8337889874219235, "language_loss": 0.80025148, "learning_rate": 1.954823487068848e-06, "loss": 0.82197559, "num_input_tokens_seen": 93727495, "step": 4341, "time_per_iteration": 2.695096492767334 }, { "auxiliary_loss_clip": 0.01164436, "auxiliary_loss_mlp": 0.01025436, "balance_loss_clip": 1.056005, "balance_loss_mlp": 1.01826215, "epoch": 0.5220946311549329, "flos": 28799280213120.0, "grad_norm": 2.1362243608042966, "language_loss": 0.8112995, "learning_rate": 1.9540447176202976e-06, "loss": 0.83319819, "num_input_tokens_seen": 93748740, "step": 4342, "time_per_iteration": 2.6696865558624268 }, { "auxiliary_loss_clip": 0.01072749, "auxiliary_loss_mlp": 0.01002209, "balance_loss_clip": 1.02308059, "balance_loss_mlp": 1.00080192, "epoch": 0.5222148740455721, "flos": 67189369017600.0, "grad_norm": 0.9691120035251525, "language_loss": 0.60684758, "learning_rate": 1.9532659551431272e-06, "loss": 0.62759715, "num_input_tokens_seen": 93815770, "step": 4343, "time_per_iteration": 3.418217897415161 }, { "auxiliary_loss_clip": 0.01162749, "auxiliary_loss_mlp": 0.01029533, "balance_loss_clip": 1.05150461, "balance_loss_mlp": 1.02225518, "epoch": 0.5223351169362112, "flos": 61856164339200.0, "grad_norm": 2.259712525857363, "language_loss": 0.67421192, "learning_rate": 1.9524871997554744e-06, "loss": 0.69613469, "num_input_tokens_seen": 93843530, "step": 4344, "time_per_iteration": 2.9866340160369873 }, { "auxiliary_loss_clip": 0.0116423, "auxiliary_loss_mlp": 0.01022871, "balance_loss_clip": 1.05283689, "balance_loss_mlp": 1.01570082, "epoch": 0.5224553598268502, "flos": 14647388676480.0, "grad_norm": 2.3831578257920274, "language_loss": 0.80508554, "learning_rate": 1.951708451575475e-06, "loss": 0.82695657, "num_input_tokens_seen": 93860595, "step": 4345, "time_per_iteration": 2.6059412956237793 }, { "auxiliary_loss_clip": 0.0113704, "auxiliary_loss_mlp": 0.01023244, "balance_loss_clip": 1.04618645, "balance_loss_mlp": 1.01567996, "epoch": 0.5225756027174894, "flos": 14826043946880.0, "grad_norm": 2.608231717679323, "language_loss": 0.82702476, "learning_rate": 1.9509297107212657e-06, "loss": 0.84862757, "num_input_tokens_seen": 93877365, "step": 4346, "time_per_iteration": 2.6821064949035645 }, { "auxiliary_loss_clip": 0.01176123, "auxiliary_loss_mlp": 0.01026123, "balance_loss_clip": 1.05281329, "balance_loss_mlp": 1.01895821, "epoch": 0.5226958456081284, "flos": 23512009029120.0, "grad_norm": 1.8569572739709412, "language_loss": 0.78808558, "learning_rate": 1.95015097731098e-06, "loss": 0.81010807, "num_input_tokens_seen": 93896855, "step": 4347, "time_per_iteration": 2.6024599075317383 }, { "auxiliary_loss_clip": 0.01175912, "auxiliary_loss_mlp": 0.01024949, "balance_loss_clip": 1.0521214, "balance_loss_mlp": 1.01742947, "epoch": 0.5228160884987675, "flos": 19062928690560.0, "grad_norm": 2.905861144312996, "language_loss": 0.82404119, "learning_rate": 1.949372251462751e-06, "loss": 0.84604979, "num_input_tokens_seen": 93914270, "step": 4348, "time_per_iteration": 2.5578694343566895 }, { "auxiliary_loss_clip": 0.01132227, "auxiliary_loss_mlp": 0.00711511, "balance_loss_clip": 1.0500493, "balance_loss_mlp": 1.00057769, "epoch": 0.5229363313894067, "flos": 21063224252160.0, "grad_norm": 4.089549807861628, "language_loss": 0.82774675, "learning_rate": 1.9485935332947124e-06, "loss": 0.84618413, "num_input_tokens_seen": 93932180, "step": 4349, "time_per_iteration": 2.7298340797424316 }, { "auxiliary_loss_clip": 0.01143489, "auxiliary_loss_mlp": 0.01018718, "balance_loss_clip": 1.0502882, "balance_loss_mlp": 1.01204276, "epoch": 0.5230565742800457, "flos": 14830389492480.0, "grad_norm": 2.590543764500278, "language_loss": 0.83257186, "learning_rate": 1.947814822924993e-06, "loss": 0.85419393, "num_input_tokens_seen": 93949690, "step": 4350, "time_per_iteration": 2.6258647441864014 }, { "auxiliary_loss_clip": 0.01177531, "auxiliary_loss_mlp": 0.01027617, "balance_loss_clip": 1.05481982, "balance_loss_mlp": 1.0204165, "epoch": 0.5231768171706848, "flos": 25813021253760.0, "grad_norm": 1.939884220913978, "language_loss": 0.82901794, "learning_rate": 1.9470361204717236e-06, "loss": 0.85106945, "num_input_tokens_seen": 93968830, "step": 4351, "time_per_iteration": 2.6184380054473877 }, { "auxiliary_loss_clip": 0.01134441, "auxiliary_loss_mlp": 0.0071142, "balance_loss_clip": 1.0497551, "balance_loss_mlp": 1.00041986, "epoch": 0.5232970600613239, "flos": 22743807834240.0, "grad_norm": 2.001495603161807, "language_loss": 0.80721402, "learning_rate": 1.9462574260530326e-06, "loss": 0.82567269, "num_input_tokens_seen": 93989110, "step": 4352, "time_per_iteration": 3.6632936000823975 }, { "auxiliary_loss_clip": 0.01151697, "auxiliary_loss_mlp": 0.0102839, "balance_loss_clip": 1.0484544, "balance_loss_mlp": 1.0206356, "epoch": 0.523417302951963, "flos": 17310703432320.0, "grad_norm": 1.920187911420332, "language_loss": 0.80980581, "learning_rate": 1.9454787397870472e-06, "loss": 0.83160675, "num_input_tokens_seen": 94006430, "step": 4353, "time_per_iteration": 4.433694362640381 }, { "auxiliary_loss_clip": 0.01091005, "auxiliary_loss_mlp": 0.01030386, "balance_loss_clip": 1.04686284, "balance_loss_mlp": 1.02335835, "epoch": 0.523537545842602, "flos": 18551740285440.0, "grad_norm": 1.8764948206309775, "language_loss": 0.71681786, "learning_rate": 1.944700061791894e-06, "loss": 0.7380318, "num_input_tokens_seen": 94024825, "step": 4354, "time_per_iteration": 2.7439563274383545 }, { "auxiliary_loss_clip": 0.01162714, "auxiliary_loss_mlp": 0.01024278, "balance_loss_clip": 1.05412292, "balance_loss_mlp": 1.01750708, "epoch": 0.5236577887332411, "flos": 19719267955200.0, "grad_norm": 2.2381693650635195, "language_loss": 0.65385044, "learning_rate": 1.943921392185698e-06, "loss": 0.67572039, "num_input_tokens_seen": 94043450, "step": 4355, "time_per_iteration": 2.624872922897339 }, { "auxiliary_loss_clip": 0.01151454, "auxiliary_loss_mlp": 0.01033431, "balance_loss_clip": 1.05040526, "balance_loss_mlp": 1.02593899, "epoch": 0.5237780316238803, "flos": 23550218121600.0, "grad_norm": 2.073496899046935, "language_loss": 0.7699827, "learning_rate": 1.9431427310865814e-06, "loss": 0.79183155, "num_input_tokens_seen": 94063055, "step": 4356, "time_per_iteration": 2.6702969074249268 }, { "auxiliary_loss_clip": 0.0111411, "auxiliary_loss_mlp": 0.01021064, "balance_loss_clip": 1.04666424, "balance_loss_mlp": 1.01410866, "epoch": 0.5238982745145193, "flos": 22491894775680.0, "grad_norm": 2.3802548001868247, "language_loss": 0.78543913, "learning_rate": 1.942364078612667e-06, "loss": 0.80679083, "num_input_tokens_seen": 94081785, "step": 4357, "time_per_iteration": 2.7105374336242676 }, { "auxiliary_loss_clip": 0.01136816, "auxiliary_loss_mlp": 0.01026972, "balance_loss_clip": 1.04889131, "balance_loss_mlp": 1.01956892, "epoch": 0.5240185174051584, "flos": 27088927234560.0, "grad_norm": 2.028324408624535, "language_loss": 0.75237668, "learning_rate": 1.9415854348820765e-06, "loss": 0.77401447, "num_input_tokens_seen": 94101635, "step": 4358, "time_per_iteration": 2.7087481021881104 }, { "auxiliary_loss_clip": 0.01168804, "auxiliary_loss_mlp": 0.01026831, "balance_loss_clip": 1.05331421, "balance_loss_mlp": 1.01880205, "epoch": 0.5241387602957975, "flos": 22674680110080.0, "grad_norm": 2.1819592210049423, "language_loss": 0.67990649, "learning_rate": 1.940806800012929e-06, "loss": 0.70186281, "num_input_tokens_seen": 94121705, "step": 4359, "time_per_iteration": 2.655510663986206 }, { "auxiliary_loss_clip": 0.01109077, "auxiliary_loss_mlp": 0.00712001, "balance_loss_clip": 1.047225, "balance_loss_mlp": 1.00051498, "epoch": 0.5242590031864366, "flos": 40553453134080.0, "grad_norm": 2.2364393021540847, "language_loss": 0.63684267, "learning_rate": 1.9400281741233432e-06, "loss": 0.6550535, "num_input_tokens_seen": 94146595, "step": 4360, "time_per_iteration": 2.8750078678131104 }, { "auxiliary_loss_clip": 0.01045871, "auxiliary_loss_mlp": 0.01000931, "balance_loss_clip": 1.02451634, "balance_loss_mlp": 0.9996317, "epoch": 0.5243792460770756, "flos": 66676313105280.0, "grad_norm": 0.6607785877376174, "language_loss": 0.52576494, "learning_rate": 1.939249557331435e-06, "loss": 0.546233, "num_input_tokens_seen": 94212410, "step": 4361, "time_per_iteration": 3.320950746536255 }, { "auxiliary_loss_clip": 0.01140005, "auxiliary_loss_mlp": 0.01021974, "balance_loss_clip": 1.0499537, "balance_loss_mlp": 1.01479125, "epoch": 0.5244994889677148, "flos": 28183663992960.0, "grad_norm": 2.019322115440257, "language_loss": 0.73411357, "learning_rate": 1.938470949755321e-06, "loss": 0.75573337, "num_input_tokens_seen": 94232290, "step": 4362, "time_per_iteration": 2.7799007892608643 }, { "auxiliary_loss_clip": 0.01052058, "auxiliary_loss_mlp": 0.01004627, "balance_loss_clip": 1.02397227, "balance_loss_mlp": 1.00307775, "epoch": 0.5246197318583539, "flos": 65950379239680.0, "grad_norm": 0.8164665386350675, "language_loss": 0.55649483, "learning_rate": 1.937692351513115e-06, "loss": 0.57706165, "num_input_tokens_seen": 94291285, "step": 4363, "time_per_iteration": 3.1865429878234863 }, { "auxiliary_loss_clip": 0.01166448, "auxiliary_loss_mlp": 0.01026675, "balance_loss_clip": 1.0508697, "balance_loss_mlp": 1.01934338, "epoch": 0.5247399747489929, "flos": 21033490769280.0, "grad_norm": 1.6385228179917943, "language_loss": 0.81022197, "learning_rate": 1.9369137627229297e-06, "loss": 0.8321532, "num_input_tokens_seen": 94309685, "step": 4364, "time_per_iteration": 2.652090311050415 }, { "auxiliary_loss_clip": 0.01158684, "auxiliary_loss_mlp": 0.01024498, "balance_loss_clip": 1.05064273, "balance_loss_mlp": 1.01692808, "epoch": 0.5248602176396321, "flos": 19025940660480.0, "grad_norm": 2.75460083494114, "language_loss": 0.88374245, "learning_rate": 1.936135183502877e-06, "loss": 0.90557432, "num_input_tokens_seen": 94326985, "step": 4365, "time_per_iteration": 2.5907821655273438 }, { "auxiliary_loss_clip": 0.01134397, "auxiliary_loss_mlp": 0.01026452, "balance_loss_clip": 1.04804945, "balance_loss_mlp": 1.01919794, "epoch": 0.5249804605302711, "flos": 22200084685440.0, "grad_norm": 2.961847900591094, "language_loss": 0.80868936, "learning_rate": 1.935356613971066e-06, "loss": 0.83029789, "num_input_tokens_seen": 94347645, "step": 4366, "time_per_iteration": 2.6968958377838135 }, { "auxiliary_loss_clip": 0.01147191, "auxiliary_loss_mlp": 0.00711511, "balance_loss_clip": 1.05154347, "balance_loss_mlp": 1.00052881, "epoch": 0.5251007034209102, "flos": 23805686626560.0, "grad_norm": 2.1196471259566514, "language_loss": 0.76931518, "learning_rate": 1.9345780542456047e-06, "loss": 0.78790218, "num_input_tokens_seen": 94367020, "step": 4367, "time_per_iteration": 2.6756749153137207 }, { "auxiliary_loss_clip": 0.01152772, "auxiliary_loss_mlp": 0.01027062, "balance_loss_clip": 1.05001318, "balance_loss_mlp": 1.0194205, "epoch": 0.5252209463115494, "flos": 23294605962240.0, "grad_norm": 1.9934403582370612, "language_loss": 0.72029322, "learning_rate": 1.9337995044446007e-06, "loss": 0.74209154, "num_input_tokens_seen": 94385860, "step": 4368, "time_per_iteration": 2.6146559715270996 }, { "auxiliary_loss_clip": 0.01168045, "auxiliary_loss_mlp": 0.01027148, "balance_loss_clip": 1.05296004, "balance_loss_mlp": 1.02000117, "epoch": 0.5253411892021884, "flos": 19828687760640.0, "grad_norm": 2.0747604900695498, "language_loss": 0.8006438, "learning_rate": 1.9330209646861596e-06, "loss": 0.82259566, "num_input_tokens_seen": 94405010, "step": 4369, "time_per_iteration": 2.616542339324951 }, { "auxiliary_loss_clip": 0.01143955, "auxiliary_loss_mlp": 0.0102663, "balance_loss_clip": 1.04898596, "balance_loss_mlp": 1.01901293, "epoch": 0.5254614320928275, "flos": 24133730561280.0, "grad_norm": 1.6461016484506998, "language_loss": 0.77955037, "learning_rate": 1.9322424350883843e-06, "loss": 0.80125624, "num_input_tokens_seen": 94426845, "step": 4370, "time_per_iteration": 2.7591381072998047 }, { "auxiliary_loss_clip": 0.01149766, "auxiliary_loss_mlp": 0.01020318, "balance_loss_clip": 1.05029845, "balance_loss_mlp": 1.01301599, "epoch": 0.5255816749834666, "flos": 24644954880000.0, "grad_norm": 2.1880129774523787, "language_loss": 0.78891784, "learning_rate": 1.931463915769379e-06, "loss": 0.81061864, "num_input_tokens_seen": 94446960, "step": 4371, "time_per_iteration": 2.7170164585113525 }, { "auxiliary_loss_clip": 0.01114558, "auxiliary_loss_mlp": 0.01030029, "balance_loss_clip": 1.04701459, "balance_loss_mlp": 1.02209568, "epoch": 0.5257019178741057, "flos": 14136595320960.0, "grad_norm": 2.460369732884159, "language_loss": 0.7406401, "learning_rate": 1.930685406847242e-06, "loss": 0.76208597, "num_input_tokens_seen": 94461535, "step": 4372, "time_per_iteration": 2.7477738857269287 }, { "auxiliary_loss_clip": 0.01143806, "auxiliary_loss_mlp": 0.01026225, "balance_loss_clip": 1.0504272, "balance_loss_mlp": 1.01858377, "epoch": 0.5258221607647448, "flos": 23548961145600.0, "grad_norm": 2.0572160375130033, "language_loss": 0.81577539, "learning_rate": 1.9299069084400734e-06, "loss": 0.83747566, "num_input_tokens_seen": 94482395, "step": 4373, "time_per_iteration": 2.715287923812866 }, { "auxiliary_loss_clip": 0.01130426, "auxiliary_loss_mlp": 0.01025592, "balance_loss_clip": 1.05205035, "balance_loss_mlp": 1.0177722, "epoch": 0.5259424036553839, "flos": 24966103403520.0, "grad_norm": 2.8000644214222743, "language_loss": 0.6995309, "learning_rate": 1.9291284206659717e-06, "loss": 0.72109109, "num_input_tokens_seen": 94500580, "step": 4374, "time_per_iteration": 2.7016239166259766 }, { "auxiliary_loss_clip": 0.01183964, "auxiliary_loss_mlp": 0.01028866, "balance_loss_clip": 1.05737007, "balance_loss_mlp": 1.02117157, "epoch": 0.526062646546023, "flos": 28763908295040.0, "grad_norm": 5.042236266940378, "language_loss": 0.71565259, "learning_rate": 1.928349943643032e-06, "loss": 0.73778087, "num_input_tokens_seen": 94519680, "step": 4375, "time_per_iteration": 2.7099649906158447 }, { "auxiliary_loss_clip": 0.01163481, "auxiliary_loss_mlp": 0.01024473, "balance_loss_clip": 1.05501461, "balance_loss_mlp": 1.01706386, "epoch": 0.526182889436662, "flos": 22821375254400.0, "grad_norm": 2.0621062133052668, "language_loss": 0.81662345, "learning_rate": 1.9275714774893493e-06, "loss": 0.83850294, "num_input_tokens_seen": 94539135, "step": 4376, "time_per_iteration": 2.6926701068878174 }, { "auxiliary_loss_clip": 0.0111984, "auxiliary_loss_mlp": 0.01024771, "balance_loss_clip": 1.04482472, "balance_loss_mlp": 1.01661134, "epoch": 0.5263031323273012, "flos": 22929466256640.0, "grad_norm": 3.1449568813576856, "language_loss": 0.7285859, "learning_rate": 1.9267930223230154e-06, "loss": 0.75003195, "num_input_tokens_seen": 94557610, "step": 4377, "time_per_iteration": 2.7248547077178955 }, { "auxiliary_loss_clip": 0.01149048, "auxiliary_loss_mlp": 0.01022352, "balance_loss_clip": 1.05158627, "balance_loss_mlp": 1.01492202, "epoch": 0.5264233752179402, "flos": 17748634049280.0, "grad_norm": 6.687454975252914, "language_loss": 0.78327703, "learning_rate": 1.9260145782621224e-06, "loss": 0.80499101, "num_input_tokens_seen": 94575390, "step": 4378, "time_per_iteration": 3.582188844680786 }, { "auxiliary_loss_clip": 0.01146505, "auxiliary_loss_mlp": 0.01027823, "balance_loss_clip": 1.05222678, "balance_loss_mlp": 1.02021778, "epoch": 0.5265436181085793, "flos": 24421626069120.0, "grad_norm": 2.7058591509560843, "language_loss": 0.88226569, "learning_rate": 1.925236145424758e-06, "loss": 0.90400898, "num_input_tokens_seen": 94594210, "step": 4379, "time_per_iteration": 4.4812703132629395 }, { "auxiliary_loss_clip": 0.01078056, "auxiliary_loss_mlp": 0.01002112, "balance_loss_clip": 1.02607369, "balance_loss_mlp": 1.00078857, "epoch": 0.5266638609992185, "flos": 69207298156800.0, "grad_norm": 0.6943097519076623, "language_loss": 0.57597351, "learning_rate": 1.924457723929012e-06, "loss": 0.59677517, "num_input_tokens_seen": 94665020, "step": 4380, "time_per_iteration": 3.3579680919647217 }, { "auxiliary_loss_clip": 0.01165539, "auxiliary_loss_mlp": 0.0102454, "balance_loss_clip": 1.05343306, "balance_loss_mlp": 1.01713097, "epoch": 0.5267841038898575, "flos": 20738699850240.0, "grad_norm": 2.0816738090118663, "language_loss": 0.82932973, "learning_rate": 1.9236793138929685e-06, "loss": 0.85123056, "num_input_tokens_seen": 94684290, "step": 4381, "time_per_iteration": 2.723301887512207 }, { "auxiliary_loss_clip": 0.01167734, "auxiliary_loss_mlp": 0.01024735, "balance_loss_clip": 1.05292583, "balance_loss_mlp": 1.01736164, "epoch": 0.5269043467804966, "flos": 17234392988160.0, "grad_norm": 2.024391930333561, "language_loss": 0.81077051, "learning_rate": 1.9229009154347133e-06, "loss": 0.83269525, "num_input_tokens_seen": 94701880, "step": 4382, "time_per_iteration": 2.62507700920105 }, { "auxiliary_loss_clip": 0.01102386, "auxiliary_loss_mlp": 0.00711341, "balance_loss_clip": 1.04513502, "balance_loss_mlp": 1.00045729, "epoch": 0.5270245896711357, "flos": 18223157646720.0, "grad_norm": 2.244526842616088, "language_loss": 0.80908924, "learning_rate": 1.922122528672327e-06, "loss": 0.82722652, "num_input_tokens_seen": 94720545, "step": 4383, "time_per_iteration": 2.710999011993408 }, { "auxiliary_loss_clip": 0.01176315, "auxiliary_loss_mlp": 0.01023869, "balance_loss_clip": 1.0532372, "balance_loss_mlp": 1.01702011, "epoch": 0.5271448325617748, "flos": 21287558643840.0, "grad_norm": 2.4781271264908904, "language_loss": 0.78859627, "learning_rate": 1.9213441537238914e-06, "loss": 0.81059813, "num_input_tokens_seen": 94737420, "step": 4384, "time_per_iteration": 2.567722797393799 }, { "auxiliary_loss_clip": 0.01033987, "auxiliary_loss_mlp": 0.01001595, "balance_loss_clip": 1.02660489, "balance_loss_mlp": 1.00018847, "epoch": 0.5272650754524139, "flos": 65495497403520.0, "grad_norm": 0.8401905558804844, "language_loss": 0.57348597, "learning_rate": 1.920565790707485e-06, "loss": 0.59384179, "num_input_tokens_seen": 94802810, "step": 4385, "time_per_iteration": 3.523861885070801 }, { "auxiliary_loss_clip": 0.01124579, "auxiliary_loss_mlp": 0.0102652, "balance_loss_clip": 1.04881382, "balance_loss_mlp": 1.01841378, "epoch": 0.527385318343053, "flos": 19676426008320.0, "grad_norm": 2.3248183564335756, "language_loss": 0.66171217, "learning_rate": 1.9197874397411853e-06, "loss": 0.68322319, "num_input_tokens_seen": 94819440, "step": 4386, "time_per_iteration": 2.973810911178589 }, { "auxiliary_loss_clip": 0.01129008, "auxiliary_loss_mlp": 0.01027212, "balance_loss_clip": 1.04319024, "balance_loss_mlp": 1.01952338, "epoch": 0.5275055612336921, "flos": 12712018947840.0, "grad_norm": 3.2049820427789717, "language_loss": 0.67366838, "learning_rate": 1.919009100943067e-06, "loss": 0.6952306, "num_input_tokens_seen": 94835130, "step": 4387, "time_per_iteration": 2.6902451515197754 }, { "auxiliary_loss_clip": 0.01125496, "auxiliary_loss_mlp": 0.01030656, "balance_loss_clip": 1.04853284, "balance_loss_mlp": 1.0226419, "epoch": 0.5276258041243311, "flos": 17749029098880.0, "grad_norm": 2.43972109381129, "language_loss": 0.65678895, "learning_rate": 1.9182307744312043e-06, "loss": 0.67835045, "num_input_tokens_seen": 94852235, "step": 4388, "time_per_iteration": 2.710672378540039 }, { "auxiliary_loss_clip": 0.01149352, "auxiliary_loss_mlp": 0.0102634, "balance_loss_clip": 1.04936004, "balance_loss_mlp": 1.0188477, "epoch": 0.5277460470149702, "flos": 22710447077760.0, "grad_norm": 1.9607000515015458, "language_loss": 0.76986063, "learning_rate": 1.9174524603236676e-06, "loss": 0.79161751, "num_input_tokens_seen": 94871185, "step": 4389, "time_per_iteration": 2.6204895973205566 }, { "auxiliary_loss_clip": 0.01145475, "auxiliary_loss_mlp": 0.01028575, "balance_loss_clip": 1.04830205, "balance_loss_mlp": 1.02082908, "epoch": 0.5278662899056094, "flos": 19902699734400.0, "grad_norm": 3.102655939138444, "language_loss": 0.76182866, "learning_rate": 1.916674158738527e-06, "loss": 0.78356916, "num_input_tokens_seen": 94890090, "step": 4390, "time_per_iteration": 2.7009963989257812 }, { "auxiliary_loss_clip": 0.01124032, "auxiliary_loss_mlp": 0.00712285, "balance_loss_clip": 1.04877567, "balance_loss_mlp": 1.00054383, "epoch": 0.5279865327962484, "flos": 18005215875840.0, "grad_norm": 4.12111570191233, "language_loss": 0.60304177, "learning_rate": 1.9158958697938506e-06, "loss": 0.62140495, "num_input_tokens_seen": 94908470, "step": 4391, "time_per_iteration": 2.6469712257385254 }, { "auxiliary_loss_clip": 0.01141408, "auxiliary_loss_mlp": 0.01028015, "balance_loss_clip": 1.04941082, "balance_loss_mlp": 1.0201174, "epoch": 0.5281067756868875, "flos": 15924443892480.0, "grad_norm": 3.771222573254477, "language_loss": 0.85745925, "learning_rate": 1.9151175936077032e-06, "loss": 0.87915349, "num_input_tokens_seen": 94923440, "step": 4392, "time_per_iteration": 2.6942601203918457 }, { "auxiliary_loss_clip": 0.01158953, "auxiliary_loss_mlp": 0.01024383, "balance_loss_clip": 1.05089283, "balance_loss_mlp": 1.01728415, "epoch": 0.5282270185775266, "flos": 19426488197760.0, "grad_norm": 1.8180169488731102, "language_loss": 0.79525405, "learning_rate": 1.9143393302981507e-06, "loss": 0.81708741, "num_input_tokens_seen": 94941125, "step": 4393, "time_per_iteration": 2.6106536388397217 }, { "auxiliary_loss_clip": 0.01149258, "auxiliary_loss_mlp": 0.01025363, "balance_loss_clip": 1.05027962, "balance_loss_mlp": 1.01801395, "epoch": 0.5283472614681657, "flos": 16399613934720.0, "grad_norm": 1.9425687680423354, "language_loss": 0.83632201, "learning_rate": 1.913561079983252e-06, "loss": 0.85806823, "num_input_tokens_seen": 94959950, "step": 4394, "time_per_iteration": 2.6653385162353516 }, { "auxiliary_loss_clip": 0.01152486, "auxiliary_loss_mlp": 0.01027803, "balance_loss_clip": 1.05001688, "balance_loss_mlp": 1.01910639, "epoch": 0.5284675043588047, "flos": 26760524163840.0, "grad_norm": 2.3234401872703745, "language_loss": 0.75598609, "learning_rate": 1.9127828427810693e-06, "loss": 0.77778894, "num_input_tokens_seen": 94980515, "step": 4395, "time_per_iteration": 2.6797120571136475 }, { "auxiliary_loss_clip": 0.01136891, "auxiliary_loss_mlp": 0.01027707, "balance_loss_clip": 1.04792738, "balance_loss_mlp": 1.02025628, "epoch": 0.5285877472494439, "flos": 19899898473600.0, "grad_norm": 2.438343504918216, "language_loss": 0.80851972, "learning_rate": 1.9120046188096607e-06, "loss": 0.83016568, "num_input_tokens_seen": 94998560, "step": 4396, "time_per_iteration": 2.735171318054199 }, { "auxiliary_loss_clip": 0.01149155, "auxiliary_loss_mlp": 0.01029721, "balance_loss_clip": 1.05445397, "balance_loss_mlp": 1.02241349, "epoch": 0.528707990140083, "flos": 20011257613440.0, "grad_norm": 2.5045041496544536, "language_loss": 0.74341744, "learning_rate": 1.9112264081870804e-06, "loss": 0.76520622, "num_input_tokens_seen": 95016950, "step": 4397, "time_per_iteration": 2.6302554607391357 }, { "auxiliary_loss_clip": 0.01129561, "auxiliary_loss_mlp": 0.01026151, "balance_loss_clip": 1.05072355, "balance_loss_mlp": 1.0184679, "epoch": 0.528828233030722, "flos": 20667956014080.0, "grad_norm": 2.701085300434633, "language_loss": 0.76142442, "learning_rate": 1.9104482110313843e-06, "loss": 0.78298151, "num_input_tokens_seen": 95036540, "step": 4398, "time_per_iteration": 2.7297682762145996 }, { "auxiliary_loss_clip": 0.01163194, "auxiliary_loss_mlp": 0.01022591, "balance_loss_clip": 1.05298829, "balance_loss_mlp": 1.01482713, "epoch": 0.5289484759213612, "flos": 25192448956800.0, "grad_norm": 4.82030031640174, "language_loss": 0.74291921, "learning_rate": 1.909670027460623e-06, "loss": 0.764777, "num_input_tokens_seen": 95053840, "step": 4399, "time_per_iteration": 2.758535861968994 }, { "auxiliary_loss_clip": 0.01161424, "auxiliary_loss_mlp": 0.01029739, "balance_loss_clip": 1.05201936, "balance_loss_mlp": 1.02191925, "epoch": 0.5290687188120002, "flos": 31139255715840.0, "grad_norm": 1.8734185154012741, "language_loss": 0.71630847, "learning_rate": 1.908891857592847e-06, "loss": 0.7382201, "num_input_tokens_seen": 95074910, "step": 4400, "time_per_iteration": 2.713901996612549 }, { "auxiliary_loss_clip": 0.0112796, "auxiliary_loss_mlp": 0.01029075, "balance_loss_clip": 1.05192041, "balance_loss_mlp": 1.02163959, "epoch": 0.5291889617026393, "flos": 20119851406080.0, "grad_norm": 2.2859297846789945, "language_loss": 0.90026784, "learning_rate": 1.9081137015461034e-06, "loss": 0.92183816, "num_input_tokens_seen": 95090985, "step": 4401, "time_per_iteration": 2.7313172817230225 }, { "auxiliary_loss_clip": 0.01107689, "auxiliary_loss_mlp": 0.01029445, "balance_loss_clip": 1.04899812, "balance_loss_mlp": 1.02203286, "epoch": 0.5293092045932785, "flos": 19643747610240.0, "grad_norm": 2.058935139561928, "language_loss": 0.90713966, "learning_rate": 1.9073355594384383e-06, "loss": 0.92851102, "num_input_tokens_seen": 95109225, "step": 4402, "time_per_iteration": 2.7045323848724365 }, { "auxiliary_loss_clip": 0.01124557, "auxiliary_loss_mlp": 0.01027719, "balance_loss_clip": 1.04785752, "balance_loss_mlp": 1.01986265, "epoch": 0.5294294474839175, "flos": 24317736958080.0, "grad_norm": 2.0677807310893703, "language_loss": 0.80860507, "learning_rate": 1.906557431387895e-06, "loss": 0.83012784, "num_input_tokens_seen": 95128215, "step": 4403, "time_per_iteration": 2.747162103652954 }, { "auxiliary_loss_clip": 0.01129007, "auxiliary_loss_mlp": 0.01021838, "balance_loss_clip": 1.05355501, "balance_loss_mlp": 1.01467025, "epoch": 0.5295496903745566, "flos": 18875941464960.0, "grad_norm": 48.51839881715293, "language_loss": 0.78742695, "learning_rate": 1.905779317512516e-06, "loss": 0.8089354, "num_input_tokens_seen": 95145760, "step": 4404, "time_per_iteration": 3.5982372760772705 }, { "auxiliary_loss_clip": 0.01161445, "auxiliary_loss_mlp": 0.01021571, "balance_loss_clip": 1.0519017, "balance_loss_mlp": 1.01414418, "epoch": 0.5296699332651957, "flos": 20923101296640.0, "grad_norm": 2.1593225989659888, "language_loss": 0.80776393, "learning_rate": 1.9050012179303385e-06, "loss": 0.82959408, "num_input_tokens_seen": 95164270, "step": 4405, "time_per_iteration": 4.380547761917114 }, { "auxiliary_loss_clip": 0.01163046, "auxiliary_loss_mlp": 0.01028563, "balance_loss_clip": 1.04978657, "balance_loss_mlp": 1.02070665, "epoch": 0.5297901761558348, "flos": 22046745525120.0, "grad_norm": 2.75199518851395, "language_loss": 0.68696892, "learning_rate": 1.904223132759401e-06, "loss": 0.70888501, "num_input_tokens_seen": 95182870, "step": 4406, "time_per_iteration": 2.6533639430999756 }, { "auxiliary_loss_clip": 0.0116362, "auxiliary_loss_mlp": 0.01027005, "balance_loss_clip": 1.05221772, "balance_loss_mlp": 1.01951599, "epoch": 0.5299104190464738, "flos": 21798495653760.0, "grad_norm": 2.840472431306078, "language_loss": 0.68641526, "learning_rate": 1.9034450621177383e-06, "loss": 0.70832151, "num_input_tokens_seen": 95201190, "step": 4407, "time_per_iteration": 2.6512134075164795 }, { "auxiliary_loss_clip": 0.01163677, "auxiliary_loss_mlp": 0.01030595, "balance_loss_clip": 1.05405617, "balance_loss_mlp": 1.02276301, "epoch": 0.530030661937113, "flos": 14720790119040.0, "grad_norm": 1.902795799822152, "language_loss": 0.70402217, "learning_rate": 1.9026670061233824e-06, "loss": 0.7259649, "num_input_tokens_seen": 95218625, "step": 4408, "time_per_iteration": 2.699430465698242 }, { "auxiliary_loss_clip": 0.01141775, "auxiliary_loss_mlp": 0.01025609, "balance_loss_clip": 1.05049777, "balance_loss_mlp": 1.01845622, "epoch": 0.5301509048277521, "flos": 21251504367360.0, "grad_norm": 1.9186229739387501, "language_loss": 0.80457407, "learning_rate": 1.901888964894365e-06, "loss": 0.82624793, "num_input_tokens_seen": 95237665, "step": 4409, "time_per_iteration": 2.6643524169921875 }, { "auxiliary_loss_clip": 0.01179388, "auxiliary_loss_mlp": 0.01024221, "balance_loss_clip": 1.0524447, "balance_loss_mlp": 1.01639521, "epoch": 0.5302711477183911, "flos": 25957058791680.0, "grad_norm": 1.8552622490620423, "language_loss": 0.67408603, "learning_rate": 1.9011109385487134e-06, "loss": 0.69612217, "num_input_tokens_seen": 95258915, "step": 4410, "time_per_iteration": 2.6430118083953857 }, { "auxiliary_loss_clip": 0.01179762, "auxiliary_loss_mlp": 0.01030316, "balance_loss_clip": 1.05358648, "balance_loss_mlp": 1.0224427, "epoch": 0.5303913906090303, "flos": 22273126992000.0, "grad_norm": 2.5540665983466266, "language_loss": 0.66559851, "learning_rate": 1.900332927204454e-06, "loss": 0.68769932, "num_input_tokens_seen": 95277365, "step": 4411, "time_per_iteration": 2.604147434234619 }, { "auxiliary_loss_clip": 0.01154773, "auxiliary_loss_mlp": 0.01027914, "balance_loss_clip": 1.05209386, "balance_loss_mlp": 1.02023673, "epoch": 0.5305116334996693, "flos": 24936010784640.0, "grad_norm": 4.44314040024799, "language_loss": 0.76695359, "learning_rate": 1.8995549309796097e-06, "loss": 0.78878051, "num_input_tokens_seen": 95296670, "step": 4412, "time_per_iteration": 2.712934732437134 }, { "auxiliary_loss_clip": 0.011724, "auxiliary_loss_mlp": 0.01031184, "balance_loss_clip": 1.05587077, "balance_loss_mlp": 1.02346516, "epoch": 0.5306318763903084, "flos": 20189338266240.0, "grad_norm": 2.2910020135386957, "language_loss": 0.76277757, "learning_rate": 1.8987769499922028e-06, "loss": 0.7848134, "num_input_tokens_seen": 95315640, "step": 4413, "time_per_iteration": 2.670029878616333 }, { "auxiliary_loss_clip": 0.01163206, "auxiliary_loss_mlp": 0.00711591, "balance_loss_clip": 1.05417514, "balance_loss_mlp": 1.00052691, "epoch": 0.5307521192809476, "flos": 20266366982400.0, "grad_norm": 3.1685400874730485, "language_loss": 0.71580535, "learning_rate": 1.897998984360252e-06, "loss": 0.73455334, "num_input_tokens_seen": 95334610, "step": 4414, "time_per_iteration": 2.6164162158966064 }, { "auxiliary_loss_clip": 0.01143399, "auxiliary_loss_mlp": 0.01023855, "balance_loss_clip": 1.05117416, "balance_loss_mlp": 1.01675344, "epoch": 0.5308723621715866, "flos": 28844276976000.0, "grad_norm": 2.962962354289213, "language_loss": 0.78674352, "learning_rate": 1.897221034201775e-06, "loss": 0.80841607, "num_input_tokens_seen": 95358350, "step": 4415, "time_per_iteration": 2.7946276664733887 }, { "auxiliary_loss_clip": 0.01133926, "auxiliary_loss_mlp": 0.01023169, "balance_loss_clip": 1.04943585, "balance_loss_mlp": 1.01656175, "epoch": 0.5309926050622257, "flos": 27457766040960.0, "grad_norm": 1.9996319147541828, "language_loss": 0.66809535, "learning_rate": 1.8964430996347842e-06, "loss": 0.68966639, "num_input_tokens_seen": 95379900, "step": 4416, "time_per_iteration": 2.730039119720459 }, { "auxiliary_loss_clip": 0.01148523, "auxiliary_loss_mlp": 0.01024481, "balance_loss_clip": 1.05313694, "balance_loss_mlp": 1.01694942, "epoch": 0.5311128479528648, "flos": 20514545026560.0, "grad_norm": 1.6329888588766013, "language_loss": 0.82432181, "learning_rate": 1.8956651807772931e-06, "loss": 0.84605181, "num_input_tokens_seen": 95397935, "step": 4417, "time_per_iteration": 2.6201019287109375 }, { "auxiliary_loss_clip": 0.01162685, "auxiliary_loss_mlp": 0.01024975, "balance_loss_clip": 1.05514622, "balance_loss_mlp": 1.0180068, "epoch": 0.5312330908435039, "flos": 21397660807680.0, "grad_norm": 1.715074100579352, "language_loss": 0.83742821, "learning_rate": 1.8948872777473115e-06, "loss": 0.85930485, "num_input_tokens_seen": 95415890, "step": 4418, "time_per_iteration": 2.5886390209198 }, { "auxiliary_loss_clip": 0.01149303, "auxiliary_loss_mlp": 0.01027934, "balance_loss_clip": 1.05255604, "balance_loss_mlp": 1.02020383, "epoch": 0.531353333734143, "flos": 24717350741760.0, "grad_norm": 2.1845932355855524, "language_loss": 0.63700616, "learning_rate": 1.8941093906628458e-06, "loss": 0.65877849, "num_input_tokens_seen": 95433675, "step": 4419, "time_per_iteration": 2.6157562732696533 }, { "auxiliary_loss_clip": 0.01142514, "auxiliary_loss_mlp": 0.01023371, "balance_loss_clip": 1.05012107, "balance_loss_mlp": 1.0161413, "epoch": 0.531473576624782, "flos": 30480689808000.0, "grad_norm": 3.0143728767556865, "language_loss": 0.70906752, "learning_rate": 1.893331519641902e-06, "loss": 0.73072636, "num_input_tokens_seen": 95455820, "step": 4420, "time_per_iteration": 2.6390278339385986 }, { "auxiliary_loss_clip": 0.01121178, "auxiliary_loss_mlp": 0.01025414, "balance_loss_clip": 1.04642272, "balance_loss_mlp": 1.0180912, "epoch": 0.5315938195154212, "flos": 23002975440000.0, "grad_norm": 2.418069793340431, "language_loss": 0.73568189, "learning_rate": 1.8925536648024815e-06, "loss": 0.75714785, "num_input_tokens_seen": 95473240, "step": 4421, "time_per_iteration": 2.5918517112731934 }, { "auxiliary_loss_clip": 0.01181963, "auxiliary_loss_mlp": 0.01026557, "balance_loss_clip": 1.05538726, "balance_loss_mlp": 1.01882625, "epoch": 0.5317140624060602, "flos": 22748584343040.0, "grad_norm": 2.5065523994905226, "language_loss": 0.76182795, "learning_rate": 1.8917758262625849e-06, "loss": 0.78391308, "num_input_tokens_seen": 95493480, "step": 4422, "time_per_iteration": 2.570587635040283 }, { "auxiliary_loss_clip": 0.0114361, "auxiliary_loss_mlp": 0.0102059, "balance_loss_clip": 1.0533154, "balance_loss_mlp": 1.01397991, "epoch": 0.5318343052966993, "flos": 22821087945600.0, "grad_norm": 1.6545114593189327, "language_loss": 0.81105733, "learning_rate": 1.8909980041402089e-06, "loss": 0.8326993, "num_input_tokens_seen": 95512075, "step": 4423, "time_per_iteration": 2.659315824508667 }, { "auxiliary_loss_clip": 0.01158928, "auxiliary_loss_mlp": 0.01027495, "balance_loss_clip": 1.05299807, "balance_loss_mlp": 1.01974082, "epoch": 0.5319545481873384, "flos": 13626089274240.0, "grad_norm": 3.9655414570322263, "language_loss": 0.65522659, "learning_rate": 1.8902201985533494e-06, "loss": 0.67709088, "num_input_tokens_seen": 95529340, "step": 4424, "time_per_iteration": 2.570166826248169 }, { "auxiliary_loss_clip": 0.01147896, "auxiliary_loss_mlp": 0.01026268, "balance_loss_clip": 1.0517087, "balance_loss_mlp": 1.01862347, "epoch": 0.5320747910779775, "flos": 22162522037760.0, "grad_norm": 2.2499680813436616, "language_loss": 0.75063658, "learning_rate": 1.8894424096199983e-06, "loss": 0.77237821, "num_input_tokens_seen": 95548545, "step": 4425, "time_per_iteration": 2.63330078125 }, { "auxiliary_loss_clip": 0.01169007, "auxiliary_loss_mlp": 0.01028767, "balance_loss_clip": 1.05786932, "balance_loss_mlp": 1.02115226, "epoch": 0.5321950339686166, "flos": 18588081870720.0, "grad_norm": 2.0632760092516134, "language_loss": 0.85682893, "learning_rate": 1.8886646374581463e-06, "loss": 0.87880665, "num_input_tokens_seen": 95567770, "step": 4426, "time_per_iteration": 2.6091465950012207 }, { "auxiliary_loss_clip": 0.01163532, "auxiliary_loss_mlp": 0.01029277, "balance_loss_clip": 1.05375671, "balance_loss_mlp": 1.02203512, "epoch": 0.5323152768592557, "flos": 22856818999680.0, "grad_norm": 2.027748841757342, "language_loss": 0.71497875, "learning_rate": 1.8878868821857795e-06, "loss": 0.73690689, "num_input_tokens_seen": 95587420, "step": 4427, "time_per_iteration": 2.611496925354004 }, { "auxiliary_loss_clip": 0.01113654, "auxiliary_loss_mlp": 0.01030733, "balance_loss_clip": 1.04751205, "balance_loss_mlp": 1.02280521, "epoch": 0.5324355197498948, "flos": 33948690998400.0, "grad_norm": 2.5707548227219883, "language_loss": 0.75250489, "learning_rate": 1.8871091439208838e-06, "loss": 0.77394873, "num_input_tokens_seen": 95609030, "step": 4428, "time_per_iteration": 2.864690065383911 }, { "auxiliary_loss_clip": 0.01112666, "auxiliary_loss_mlp": 0.01026065, "balance_loss_clip": 1.04913104, "balance_loss_mlp": 1.01870406, "epoch": 0.5325557626405338, "flos": 23256720092160.0, "grad_norm": 2.5355354535060215, "language_loss": 0.77508998, "learning_rate": 1.8863314227814414e-06, "loss": 0.79647732, "num_input_tokens_seen": 95627340, "step": 4429, "time_per_iteration": 2.7948203086853027 }, { "auxiliary_loss_clip": 0.01170458, "auxiliary_loss_mlp": 0.01028439, "balance_loss_clip": 1.05572307, "balance_loss_mlp": 1.02104783, "epoch": 0.532676005531173, "flos": 26718687797760.0, "grad_norm": 2.401168160727962, "language_loss": 0.4916954, "learning_rate": 1.8855537188854313e-06, "loss": 0.51368439, "num_input_tokens_seen": 95646315, "step": 4430, "time_per_iteration": 4.557165861129761 }, { "auxiliary_loss_clip": 0.01166634, "auxiliary_loss_mlp": 0.01025137, "balance_loss_clip": 1.05207407, "balance_loss_mlp": 1.01800275, "epoch": 0.5327962484218121, "flos": 17894610921600.0, "grad_norm": 2.056255922395873, "language_loss": 0.78237778, "learning_rate": 1.8847760323508315e-06, "loss": 0.80429548, "num_input_tokens_seen": 95665220, "step": 4431, "time_per_iteration": 4.416316986083984 }, { "auxiliary_loss_clip": 0.01144313, "auxiliary_loss_mlp": 0.01021854, "balance_loss_clip": 1.05339766, "balance_loss_mlp": 1.01451683, "epoch": 0.5329164913124511, "flos": 17925385898880.0, "grad_norm": 2.7469499837814206, "language_loss": 0.7538622, "learning_rate": 1.883998363295616e-06, "loss": 0.7755239, "num_input_tokens_seen": 95682700, "step": 4432, "time_per_iteration": 2.6691644191741943 }, { "auxiliary_loss_clip": 0.01070689, "auxiliary_loss_mlp": 0.01008427, "balance_loss_clip": 1.03267527, "balance_loss_mlp": 1.00705004, "epoch": 0.5330367342030903, "flos": 57254178781440.0, "grad_norm": 0.8775805520861739, "language_loss": 0.62629026, "learning_rate": 1.8832207118377565e-06, "loss": 0.64708138, "num_input_tokens_seen": 95738070, "step": 4433, "time_per_iteration": 3.1540234088897705 }, { "auxiliary_loss_clip": 0.01178669, "auxiliary_loss_mlp": 0.01024933, "balance_loss_clip": 1.05451155, "balance_loss_mlp": 1.01752448, "epoch": 0.5331569770937293, "flos": 17420518287360.0, "grad_norm": 2.9455402835349203, "language_loss": 0.69512957, "learning_rate": 1.882443078095222e-06, "loss": 0.71716559, "num_input_tokens_seen": 95756950, "step": 4434, "time_per_iteration": 2.6582956314086914 }, { "auxiliary_loss_clip": 0.01048213, "auxiliary_loss_mlp": 0.01002844, "balance_loss_clip": 1.03451896, "balance_loss_mlp": 1.00160432, "epoch": 0.5332772199843684, "flos": 56750783627520.0, "grad_norm": 0.8648157324679862, "language_loss": 0.66776508, "learning_rate": 1.8816654621859794e-06, "loss": 0.68827564, "num_input_tokens_seen": 95816615, "step": 4435, "time_per_iteration": 3.1531763076782227 }, { "auxiliary_loss_clip": 0.01179411, "auxiliary_loss_mlp": 0.01022141, "balance_loss_clip": 1.05614519, "balance_loss_mlp": 1.01488161, "epoch": 0.5333974628750076, "flos": 18697753071360.0, "grad_norm": 2.205647407532652, "language_loss": 0.72467566, "learning_rate": 1.8808878642279915e-06, "loss": 0.74669117, "num_input_tokens_seen": 95832020, "step": 4436, "time_per_iteration": 2.5987348556518555 }, { "auxiliary_loss_clip": 0.01134374, "auxiliary_loss_mlp": 0.01026501, "balance_loss_clip": 1.04801261, "balance_loss_mlp": 1.01874042, "epoch": 0.5335177057656466, "flos": 23805507058560.0, "grad_norm": 3.3325845349180554, "language_loss": 0.65043551, "learning_rate": 1.8801102843392209e-06, "loss": 0.67204422, "num_input_tokens_seen": 95851425, "step": 4437, "time_per_iteration": 2.6808865070343018 }, { "auxiliary_loss_clip": 0.01129078, "auxiliary_loss_mlp": 0.01021194, "balance_loss_clip": 1.04753995, "balance_loss_mlp": 1.01364517, "epoch": 0.5336379486562857, "flos": 25078683605760.0, "grad_norm": 1.6370201761840821, "language_loss": 0.8540141, "learning_rate": 1.8793327226376238e-06, "loss": 0.87551689, "num_input_tokens_seen": 95870745, "step": 4438, "time_per_iteration": 2.721646547317505 }, { "auxiliary_loss_clip": 0.0115612, "auxiliary_loss_mlp": 0.01029094, "balance_loss_clip": 1.05263937, "balance_loss_mlp": 1.02155435, "epoch": 0.5337581915469248, "flos": 21396691140480.0, "grad_norm": 3.918122977468123, "language_loss": 0.80100214, "learning_rate": 1.8785551792411569e-06, "loss": 0.82285428, "num_input_tokens_seen": 95889755, "step": 4439, "time_per_iteration": 2.6602208614349365 }, { "auxiliary_loss_clip": 0.01148313, "auxiliary_loss_mlp": 0.01019622, "balance_loss_clip": 1.05165553, "balance_loss_mlp": 1.01276493, "epoch": 0.5338784344375639, "flos": 14865905064960.0, "grad_norm": 2.2335919573119445, "language_loss": 0.82563102, "learning_rate": 1.8777776542677733e-06, "loss": 0.84731036, "num_input_tokens_seen": 95907805, "step": 4440, "time_per_iteration": 2.6292967796325684 }, { "auxiliary_loss_clip": 0.01128484, "auxiliary_loss_mlp": 0.01023465, "balance_loss_clip": 1.0467031, "balance_loss_mlp": 1.01586568, "epoch": 0.5339986773282029, "flos": 20813501923200.0, "grad_norm": 2.0962130980147196, "language_loss": 0.73217022, "learning_rate": 1.8770001478354216e-06, "loss": 0.75368971, "num_input_tokens_seen": 95927480, "step": 4441, "time_per_iteration": 2.7061781883239746 }, { "auxiliary_loss_clip": 0.01162758, "auxiliary_loss_mlp": 0.01031012, "balance_loss_clip": 1.05404091, "balance_loss_mlp": 1.02361548, "epoch": 0.5341189202188421, "flos": 17969089772160.0, "grad_norm": 2.071247303689649, "language_loss": 0.84171188, "learning_rate": 1.8762226600620504e-06, "loss": 0.86364961, "num_input_tokens_seen": 95946095, "step": 4442, "time_per_iteration": 2.724454879760742 }, { "auxiliary_loss_clip": 0.01152459, "auxiliary_loss_mlp": 0.01027612, "balance_loss_clip": 1.04890025, "balance_loss_mlp": 1.01963747, "epoch": 0.5342391631094812, "flos": 11031866328960.0, "grad_norm": 2.924616662594737, "language_loss": 0.59240746, "learning_rate": 1.8754451910656031e-06, "loss": 0.6142081, "num_input_tokens_seen": 95959995, "step": 4443, "time_per_iteration": 2.6105620861053467 }, { "auxiliary_loss_clip": 0.01122502, "auxiliary_loss_mlp": 0.01029894, "balance_loss_clip": 1.04942369, "balance_loss_mlp": 1.02246392, "epoch": 0.5343594060001202, "flos": 15339135772800.0, "grad_norm": 2.1342722560648903, "language_loss": 0.83185607, "learning_rate": 1.8746677409640212e-06, "loss": 0.85338002, "num_input_tokens_seen": 95977095, "step": 4444, "time_per_iteration": 2.7372138500213623 }, { "auxiliary_loss_clip": 0.01169366, "auxiliary_loss_mlp": 0.01032075, "balance_loss_clip": 1.05610371, "balance_loss_mlp": 1.02441573, "epoch": 0.5344796488907594, "flos": 26900898514560.0, "grad_norm": 1.72253235413019, "language_loss": 0.8462615, "learning_rate": 1.8738903098752432e-06, "loss": 0.86827588, "num_input_tokens_seen": 95996225, "step": 4445, "time_per_iteration": 2.7583093643188477 }, { "auxiliary_loss_clip": 0.01151892, "auxiliary_loss_mlp": 0.01026417, "balance_loss_clip": 1.05390894, "balance_loss_mlp": 1.01925862, "epoch": 0.5345998917813984, "flos": 25411216740480.0, "grad_norm": 2.2633770110060394, "language_loss": 0.73300922, "learning_rate": 1.8731128979172052e-06, "loss": 0.75479233, "num_input_tokens_seen": 96015425, "step": 4446, "time_per_iteration": 2.660508155822754 }, { "auxiliary_loss_clip": 0.0114516, "auxiliary_loss_mlp": 0.01025685, "balance_loss_clip": 1.05320716, "balance_loss_mlp": 1.01883042, "epoch": 0.5347201346720375, "flos": 32853379622400.0, "grad_norm": 2.321997919206127, "language_loss": 0.67713702, "learning_rate": 1.8723355052078394e-06, "loss": 0.69884551, "num_input_tokens_seen": 96035460, "step": 4447, "time_per_iteration": 2.780163526535034 }, { "auxiliary_loss_clip": 0.01160134, "auxiliary_loss_mlp": 0.01030921, "balance_loss_clip": 1.05124307, "balance_loss_mlp": 1.02246368, "epoch": 0.5348403775626767, "flos": 17967940536960.0, "grad_norm": 2.7740167292436064, "language_loss": 0.77479756, "learning_rate": 1.8715581318650765e-06, "loss": 0.79670811, "num_input_tokens_seen": 96054515, "step": 4448, "time_per_iteration": 2.575352668762207 }, { "auxiliary_loss_clip": 0.01142786, "auxiliary_loss_mlp": 0.01027222, "balance_loss_clip": 1.0530057, "balance_loss_mlp": 1.01969051, "epoch": 0.5349606204533157, "flos": 17603339535360.0, "grad_norm": 2.2207450961746726, "language_loss": 0.81622118, "learning_rate": 1.8707807780068422e-06, "loss": 0.8379212, "num_input_tokens_seen": 96072330, "step": 4449, "time_per_iteration": 2.6866376399993896 }, { "auxiliary_loss_clip": 0.01147231, "auxiliary_loss_mlp": 0.01025397, "balance_loss_clip": 1.04981947, "balance_loss_mlp": 1.01826215, "epoch": 0.5350808633439548, "flos": 29167831710720.0, "grad_norm": 2.1347605093957203, "language_loss": 0.66157591, "learning_rate": 1.8700034437510611e-06, "loss": 0.68330216, "num_input_tokens_seen": 96092425, "step": 4450, "time_per_iteration": 2.6652379035949707 }, { "auxiliary_loss_clip": 0.01123613, "auxiliary_loss_mlp": 0.01028089, "balance_loss_clip": 1.0491991, "balance_loss_mlp": 1.02013755, "epoch": 0.5352011062345938, "flos": 19499997381120.0, "grad_norm": 2.6979566903489176, "language_loss": 0.81055033, "learning_rate": 1.8692261292156549e-06, "loss": 0.83206737, "num_input_tokens_seen": 96111660, "step": 4451, "time_per_iteration": 2.682997465133667 }, { "auxiliary_loss_clip": 0.01180677, "auxiliary_loss_mlp": 0.01028951, "balance_loss_clip": 1.05765247, "balance_loss_mlp": 1.02099669, "epoch": 0.535321349125233, "flos": 23477642691840.0, "grad_norm": 2.0251580635578628, "language_loss": 0.8099789, "learning_rate": 1.8684488345185401e-06, "loss": 0.83207512, "num_input_tokens_seen": 96131835, "step": 4452, "time_per_iteration": 2.588561773300171 }, { "auxiliary_loss_clip": 0.01182452, "auxiliary_loss_mlp": 0.0103515, "balance_loss_clip": 1.05617344, "balance_loss_mlp": 1.02759242, "epoch": 0.535441592015872, "flos": 20478059786880.0, "grad_norm": 2.4768855470713014, "language_loss": 0.78413588, "learning_rate": 1.8676715597776332e-06, "loss": 0.80631191, "num_input_tokens_seen": 96150180, "step": 4453, "time_per_iteration": 2.5837180614471436 }, { "auxiliary_loss_clip": 0.01106748, "auxiliary_loss_mlp": 0.01023384, "balance_loss_clip": 1.04752827, "balance_loss_mlp": 1.0162971, "epoch": 0.5355618349065111, "flos": 19573147428480.0, "grad_norm": 2.0095062462131486, "language_loss": 0.76095593, "learning_rate": 1.8668943051108455e-06, "loss": 0.7822572, "num_input_tokens_seen": 96167485, "step": 4454, "time_per_iteration": 2.704738140106201 }, { "auxiliary_loss_clip": 0.01151507, "auxiliary_loss_mlp": 0.01032443, "balance_loss_clip": 1.05426943, "balance_loss_mlp": 1.0246768, "epoch": 0.5356820777971503, "flos": 24024633978240.0, "grad_norm": 1.9802331131114472, "language_loss": 0.76650167, "learning_rate": 1.8661170706360856e-06, "loss": 0.78834116, "num_input_tokens_seen": 96186650, "step": 4455, "time_per_iteration": 2.6645848751068115 }, { "auxiliary_loss_clip": 0.01160789, "auxiliary_loss_mlp": 0.01026358, "balance_loss_clip": 1.05369759, "balance_loss_mlp": 1.01908922, "epoch": 0.5358023206877893, "flos": 20884676722560.0, "grad_norm": 1.8143818534522795, "language_loss": 0.81421745, "learning_rate": 1.8653398564712594e-06, "loss": 0.83608902, "num_input_tokens_seen": 96205595, "step": 4456, "time_per_iteration": 4.477340936660767 }, { "auxiliary_loss_clip": 0.01164877, "auxiliary_loss_mlp": 0.01023925, "balance_loss_clip": 1.05665195, "balance_loss_mlp": 1.0161463, "epoch": 0.5359225635784284, "flos": 22418996123520.0, "grad_norm": 3.1420073669293185, "language_loss": 0.82139778, "learning_rate": 1.8645626627342704e-06, "loss": 0.8432858, "num_input_tokens_seen": 96226360, "step": 4457, "time_per_iteration": 4.462398290634155 }, { "auxiliary_loss_clip": 0.01166616, "auxiliary_loss_mlp": 0.01024744, "balance_loss_clip": 1.05314386, "balance_loss_mlp": 1.01762152, "epoch": 0.5360428064690675, "flos": 24097784025600.0, "grad_norm": 2.080525405571681, "language_loss": 0.8116442, "learning_rate": 1.8637854895430172e-06, "loss": 0.83355784, "num_input_tokens_seen": 96245625, "step": 4458, "time_per_iteration": 2.638993501663208 }, { "auxiliary_loss_clip": 0.01123459, "auxiliary_loss_mlp": 0.01022943, "balance_loss_clip": 1.0470767, "balance_loss_mlp": 1.01513481, "epoch": 0.5361630493597066, "flos": 21434505183360.0, "grad_norm": 2.5831967473712756, "language_loss": 0.69618547, "learning_rate": 1.8630083370153978e-06, "loss": 0.71764946, "num_input_tokens_seen": 96265265, "step": 4459, "time_per_iteration": 2.7538344860076904 }, { "auxiliary_loss_clip": 0.01033914, "auxiliary_loss_mlp": 0.01003245, "balance_loss_clip": 1.02850163, "balance_loss_mlp": 1.00193977, "epoch": 0.5362832922503457, "flos": 68888696520960.0, "grad_norm": 0.747547855747305, "language_loss": 0.55382979, "learning_rate": 1.8622312052693041e-06, "loss": 0.57420141, "num_input_tokens_seen": 96326445, "step": 4460, "time_per_iteration": 3.452799081802368 }, { "auxiliary_loss_clip": 0.01153564, "auxiliary_loss_mlp": 0.01024363, "balance_loss_clip": 1.04800236, "balance_loss_mlp": 1.01706123, "epoch": 0.5364035351409848, "flos": 9793702563840.0, "grad_norm": 2.3423681086404557, "language_loss": 0.71996093, "learning_rate": 1.8614540944226267e-06, "loss": 0.74174023, "num_input_tokens_seen": 96343115, "step": 4461, "time_per_iteration": 2.720468521118164 }, { "auxiliary_loss_clip": 0.01145626, "auxiliary_loss_mlp": 0.01025292, "balance_loss_clip": 1.05391419, "balance_loss_mlp": 1.01814556, "epoch": 0.5365237780316239, "flos": 23290080848640.0, "grad_norm": 1.7746058188265759, "language_loss": 0.68170673, "learning_rate": 1.8606770045932537e-06, "loss": 0.70341593, "num_input_tokens_seen": 96362230, "step": 4462, "time_per_iteration": 2.6954848766326904 }, { "auxiliary_loss_clip": 0.01124204, "auxiliary_loss_mlp": 0.01027474, "balance_loss_clip": 1.04544175, "balance_loss_mlp": 1.0196594, "epoch": 0.5366440209222629, "flos": 26578133879040.0, "grad_norm": 1.899814433081797, "language_loss": 0.81996274, "learning_rate": 1.859899935899068e-06, "loss": 0.84147954, "num_input_tokens_seen": 96382085, "step": 4463, "time_per_iteration": 2.7302796840667725 }, { "auxiliary_loss_clip": 0.0114545, "auxiliary_loss_mlp": 0.01025554, "balance_loss_clip": 1.05394161, "balance_loss_mlp": 1.017519, "epoch": 0.5367642638129021, "flos": 19608052469760.0, "grad_norm": 2.6645238324659855, "language_loss": 0.79292524, "learning_rate": 1.8591228884579506e-06, "loss": 0.81463528, "num_input_tokens_seen": 96400580, "step": 4464, "time_per_iteration": 2.748568534851074 }, { "auxiliary_loss_clip": 0.01134081, "auxiliary_loss_mlp": 0.01030013, "balance_loss_clip": 1.04921103, "balance_loss_mlp": 1.02294064, "epoch": 0.5368845067035412, "flos": 23915214172800.0, "grad_norm": 1.9430578466704858, "language_loss": 0.82114011, "learning_rate": 1.8583458623877795e-06, "loss": 0.84278107, "num_input_tokens_seen": 96419680, "step": 4465, "time_per_iteration": 2.743577003479004 }, { "auxiliary_loss_clip": 0.01165782, "auxiliary_loss_mlp": 0.01026697, "balance_loss_clip": 1.05411768, "balance_loss_mlp": 1.01920462, "epoch": 0.5370047495941802, "flos": 16873131951360.0, "grad_norm": 3.4753511388782576, "language_loss": 0.74321437, "learning_rate": 1.8575688578064281e-06, "loss": 0.7651391, "num_input_tokens_seen": 96437805, "step": 4466, "time_per_iteration": 2.7283966541290283 }, { "auxiliary_loss_clip": 0.01166272, "auxiliary_loss_mlp": 0.01030976, "balance_loss_clip": 1.05388176, "balance_loss_mlp": 1.02347755, "epoch": 0.5371249924848194, "flos": 20740926493440.0, "grad_norm": 1.7384916452017765, "language_loss": 0.76286608, "learning_rate": 1.8567918748317674e-06, "loss": 0.78483856, "num_input_tokens_seen": 96457155, "step": 4467, "time_per_iteration": 2.621161460876465 }, { "auxiliary_loss_clip": 0.0113052, "auxiliary_loss_mlp": 0.01025513, "balance_loss_clip": 1.04680419, "balance_loss_mlp": 1.01803911, "epoch": 0.5372452353754584, "flos": 17968120104960.0, "grad_norm": 2.4754936055054872, "language_loss": 0.82779104, "learning_rate": 1.8560149135816659e-06, "loss": 0.84935135, "num_input_tokens_seen": 96473990, "step": 4468, "time_per_iteration": 2.7942514419555664 }, { "auxiliary_loss_clip": 0.01159591, "auxiliary_loss_mlp": 0.01025815, "balance_loss_clip": 1.05122375, "balance_loss_mlp": 1.01845968, "epoch": 0.5373654782660975, "flos": 15377021642880.0, "grad_norm": 2.555846945629058, "language_loss": 0.84322762, "learning_rate": 1.8552379741739873e-06, "loss": 0.86508167, "num_input_tokens_seen": 96491335, "step": 4469, "time_per_iteration": 2.6637558937072754 }, { "auxiliary_loss_clip": 0.01054312, "auxiliary_loss_mlp": 0.0070186, "balance_loss_clip": 1.02859259, "balance_loss_mlp": 0.99976617, "epoch": 0.5374857211567367, "flos": 69000091574400.0, "grad_norm": 0.8778829446886425, "language_loss": 0.5557757, "learning_rate": 1.8544610567265935e-06, "loss": 0.57333744, "num_input_tokens_seen": 96545275, "step": 4470, "time_per_iteration": 3.542208433151245 }, { "auxiliary_loss_clip": 0.01149969, "auxiliary_loss_mlp": 0.00711663, "balance_loss_clip": 1.05518103, "balance_loss_mlp": 1.00049603, "epoch": 0.5376059640473757, "flos": 15085355207040.0, "grad_norm": 2.128838691527581, "language_loss": 0.83091116, "learning_rate": 1.853684161357341e-06, "loss": 0.84952748, "num_input_tokens_seen": 96562935, "step": 4471, "time_per_iteration": 2.7392687797546387 }, { "auxiliary_loss_clip": 0.01160928, "auxiliary_loss_mlp": 0.00711309, "balance_loss_clip": 1.05249023, "balance_loss_mlp": 1.00038815, "epoch": 0.5377262069380148, "flos": 19792597570560.0, "grad_norm": 2.1817160843844117, "language_loss": 0.76657462, "learning_rate": 1.852907288184085e-06, "loss": 0.78529704, "num_input_tokens_seen": 96581820, "step": 4472, "time_per_iteration": 2.6703739166259766 }, { "auxiliary_loss_clip": 0.0112097, "auxiliary_loss_mlp": 0.01025198, "balance_loss_clip": 1.04976666, "balance_loss_mlp": 1.01685369, "epoch": 0.5378464498286539, "flos": 30003077640960.0, "grad_norm": 2.2668781811078538, "language_loss": 0.70318067, "learning_rate": 1.8521304373246762e-06, "loss": 0.7246424, "num_input_tokens_seen": 96602865, "step": 4473, "time_per_iteration": 2.829517364501953 }, { "auxiliary_loss_clip": 0.01167411, "auxiliary_loss_mlp": 0.01033698, "balance_loss_clip": 1.05346298, "balance_loss_mlp": 1.02523994, "epoch": 0.537966692719293, "flos": 21251217058560.0, "grad_norm": 2.5015066858872848, "language_loss": 0.8913005, "learning_rate": 1.8513536088969626e-06, "loss": 0.9133116, "num_input_tokens_seen": 96620530, "step": 4474, "time_per_iteration": 2.6021981239318848 }, { "auxiliary_loss_clip": 0.01164939, "auxiliary_loss_mlp": 0.01024584, "balance_loss_clip": 1.05412817, "balance_loss_mlp": 1.01629877, "epoch": 0.538086935609932, "flos": 21543170803200.0, "grad_norm": 1.6987936830935397, "language_loss": 0.8028636, "learning_rate": 1.8505768030187884e-06, "loss": 0.82475883, "num_input_tokens_seen": 96640660, "step": 4475, "time_per_iteration": 2.6649653911590576 }, { "auxiliary_loss_clip": 0.01143954, "auxiliary_loss_mlp": 0.01025571, "balance_loss_clip": 1.0532999, "balance_loss_mlp": 1.01843643, "epoch": 0.5382071785005712, "flos": 22747219626240.0, "grad_norm": 1.5574780128302173, "language_loss": 0.79818445, "learning_rate": 1.849800019807995e-06, "loss": 0.81987971, "num_input_tokens_seen": 96661885, "step": 4476, "time_per_iteration": 2.6388962268829346 }, { "auxiliary_loss_clip": 0.01130112, "auxiliary_loss_mlp": 0.01025081, "balance_loss_clip": 1.04980516, "balance_loss_mlp": 1.01766658, "epoch": 0.5383274213912103, "flos": 24934574240640.0, "grad_norm": 2.222681543738576, "language_loss": 0.71658468, "learning_rate": 1.8490232593824186e-06, "loss": 0.73813665, "num_input_tokens_seen": 96678340, "step": 4477, "time_per_iteration": 2.7680187225341797 }, { "auxiliary_loss_clip": 0.01143974, "auxiliary_loss_mlp": 0.01022546, "balance_loss_clip": 1.05278051, "balance_loss_mlp": 1.01533079, "epoch": 0.5384476642818493, "flos": 22310186849280.0, "grad_norm": 1.7870269001024481, "language_loss": 0.8476187, "learning_rate": 1.8482465218598935e-06, "loss": 0.86928391, "num_input_tokens_seen": 96698285, "step": 4478, "time_per_iteration": 2.7130658626556396 }, { "auxiliary_loss_clip": 0.01131957, "auxiliary_loss_mlp": 0.01030204, "balance_loss_clip": 1.04983795, "balance_loss_mlp": 1.02258658, "epoch": 0.5385679071724885, "flos": 22711021695360.0, "grad_norm": 1.7435187730879964, "language_loss": 0.83285105, "learning_rate": 1.8474698073582508e-06, "loss": 0.85447264, "num_input_tokens_seen": 96719655, "step": 4479, "time_per_iteration": 2.696714162826538 }, { "auxiliary_loss_clip": 0.01133731, "auxiliary_loss_mlp": 0.01025742, "balance_loss_clip": 1.04933345, "balance_loss_mlp": 1.01845515, "epoch": 0.5386881500631275, "flos": 15953746412160.0, "grad_norm": 2.103455364277948, "language_loss": 0.86979985, "learning_rate": 1.8466931159953166e-06, "loss": 0.89139456, "num_input_tokens_seen": 96736290, "step": 4480, "time_per_iteration": 2.6124725341796875 }, { "auxiliary_loss_clip": 0.01153711, "auxiliary_loss_mlp": 0.01030793, "balance_loss_clip": 1.05596852, "balance_loss_mlp": 1.02307093, "epoch": 0.5388083929537666, "flos": 24060041809920.0, "grad_norm": 1.7975037886522678, "language_loss": 0.84176111, "learning_rate": 1.8459164478889158e-06, "loss": 0.8636061, "num_input_tokens_seen": 96757685, "step": 4481, "time_per_iteration": 3.7622852325439453 }, { "auxiliary_loss_clip": 0.01118818, "auxiliary_loss_mlp": 0.01025096, "balance_loss_clip": 1.0444361, "balance_loss_mlp": 1.01832509, "epoch": 0.5389286358444056, "flos": 22236893147520.0, "grad_norm": 2.2583122122378936, "language_loss": 0.75872111, "learning_rate": 1.8451398031568663e-06, "loss": 0.78016031, "num_input_tokens_seen": 96777310, "step": 4482, "time_per_iteration": 3.6201868057250977 }, { "auxiliary_loss_clip": 0.01128242, "auxiliary_loss_mlp": 0.01023969, "balance_loss_clip": 1.04914713, "balance_loss_mlp": 1.01597595, "epoch": 0.5390488787350448, "flos": 24281718595200.0, "grad_norm": 1.9415597049031796, "language_loss": 0.74670649, "learning_rate": 1.844363181916986e-06, "loss": 0.76822859, "num_input_tokens_seen": 96798035, "step": 4483, "time_per_iteration": 3.6990067958831787 }, { "auxiliary_loss_clip": 0.01160448, "auxiliary_loss_mlp": 0.01024215, "balance_loss_clip": 1.05145431, "balance_loss_mlp": 1.01644278, "epoch": 0.5391691216256839, "flos": 16581393688320.0, "grad_norm": 2.090209897343894, "language_loss": 0.83541954, "learning_rate": 1.8435865842870868e-06, "loss": 0.85726619, "num_input_tokens_seen": 96815975, "step": 4484, "time_per_iteration": 2.6021361351013184 }, { "auxiliary_loss_clip": 0.01137283, "auxiliary_loss_mlp": 0.00711677, "balance_loss_clip": 1.04734576, "balance_loss_mlp": 1.00041914, "epoch": 0.5392893645163229, "flos": 23330049707520.0, "grad_norm": 1.9611889963865123, "language_loss": 0.72307438, "learning_rate": 1.8428100103849787e-06, "loss": 0.74156398, "num_input_tokens_seen": 96835770, "step": 4485, "time_per_iteration": 2.7758595943450928 }, { "auxiliary_loss_clip": 0.01147436, "auxiliary_loss_mlp": 0.01027555, "balance_loss_clip": 1.05586362, "balance_loss_mlp": 1.02001512, "epoch": 0.5394096074069621, "flos": 15669801400320.0, "grad_norm": 2.452885450920617, "language_loss": 0.73637068, "learning_rate": 1.842033460328467e-06, "loss": 0.75812066, "num_input_tokens_seen": 96854490, "step": 4486, "time_per_iteration": 2.6525046825408936 }, { "auxiliary_loss_clip": 0.01150101, "auxiliary_loss_mlp": 0.00710919, "balance_loss_clip": 1.05071211, "balance_loss_mlp": 1.00037682, "epoch": 0.5395298502976011, "flos": 22893447893760.0, "grad_norm": 1.9430472233704923, "language_loss": 0.75076306, "learning_rate": 1.8412569342353541e-06, "loss": 0.76937324, "num_input_tokens_seen": 96874645, "step": 4487, "time_per_iteration": 2.7324376106262207 }, { "auxiliary_loss_clip": 0.0115253, "auxiliary_loss_mlp": 0.01028987, "balance_loss_clip": 1.05259991, "balance_loss_mlp": 1.02088714, "epoch": 0.5396500931882402, "flos": 23842135952640.0, "grad_norm": 2.2039342737860155, "language_loss": 0.84797627, "learning_rate": 1.840480432223438e-06, "loss": 0.86979139, "num_input_tokens_seen": 96893650, "step": 4488, "time_per_iteration": 2.6036601066589355 }, { "auxiliary_loss_clip": 0.01151711, "auxiliary_loss_mlp": 0.01026607, "balance_loss_clip": 1.05253899, "balance_loss_mlp": 1.01886678, "epoch": 0.5397703360788794, "flos": 26322988596480.0, "grad_norm": 5.899875305481013, "language_loss": 0.77849901, "learning_rate": 1.8397039544105131e-06, "loss": 0.80028212, "num_input_tokens_seen": 96912735, "step": 4489, "time_per_iteration": 2.728076696395874 }, { "auxiliary_loss_clip": 0.01139416, "auxiliary_loss_mlp": 0.01025464, "balance_loss_clip": 1.04678631, "balance_loss_mlp": 1.01744115, "epoch": 0.5398905789695184, "flos": 21214588164480.0, "grad_norm": 1.7213567531319538, "language_loss": 0.69896865, "learning_rate": 1.8389275009143711e-06, "loss": 0.72061741, "num_input_tokens_seen": 96932475, "step": 4490, "time_per_iteration": 2.6402668952941895 }, { "auxiliary_loss_clip": 0.01176219, "auxiliary_loss_mlp": 0.01023053, "balance_loss_clip": 1.05359423, "balance_loss_mlp": 1.01548314, "epoch": 0.5400108218601575, "flos": 25080335631360.0, "grad_norm": 2.059155473463139, "language_loss": 0.73129988, "learning_rate": 1.8381510718527988e-06, "loss": 0.75329256, "num_input_tokens_seen": 96952085, "step": 4491, "time_per_iteration": 2.6484310626983643 }, { "auxiliary_loss_clip": 0.01148339, "auxiliary_loss_mlp": 0.01031329, "balance_loss_clip": 1.04907358, "balance_loss_mlp": 1.02329445, "epoch": 0.5401310647507966, "flos": 26357498588160.0, "grad_norm": 2.015643374557779, "language_loss": 0.63413084, "learning_rate": 1.8373746673435812e-06, "loss": 0.65592754, "num_input_tokens_seen": 96973110, "step": 4492, "time_per_iteration": 2.6755638122558594 }, { "auxiliary_loss_clip": 0.01180556, "auxiliary_loss_mlp": 0.01025641, "balance_loss_clip": 1.05652344, "balance_loss_mlp": 1.01808333, "epoch": 0.5402513076414357, "flos": 27855332749440.0, "grad_norm": 2.036054017441942, "language_loss": 0.78896302, "learning_rate": 1.8365982875044964e-06, "loss": 0.81102502, "num_input_tokens_seen": 96993420, "step": 4493, "time_per_iteration": 2.7275428771972656 }, { "auxiliary_loss_clip": 0.01167087, "auxiliary_loss_mlp": 0.00711598, "balance_loss_clip": 1.05274773, "balance_loss_mlp": 1.00039411, "epoch": 0.5403715505320748, "flos": 22893771116160.0, "grad_norm": 2.2292515001838225, "language_loss": 0.75764197, "learning_rate": 1.8358219324533217e-06, "loss": 0.77642876, "num_input_tokens_seen": 97013685, "step": 4494, "time_per_iteration": 2.62471342086792 }, { "auxiliary_loss_clip": 0.01142732, "auxiliary_loss_mlp": 0.01020531, "balance_loss_clip": 1.05084276, "balance_loss_mlp": 1.01408732, "epoch": 0.5404917934227139, "flos": 30224143895040.0, "grad_norm": 19.246819358465096, "language_loss": 0.70399708, "learning_rate": 1.8350456023078292e-06, "loss": 0.72562969, "num_input_tokens_seen": 97036060, "step": 4495, "time_per_iteration": 2.768232583999634 }, { "auxiliary_loss_clip": 0.01181863, "auxiliary_loss_mlp": 0.01026693, "balance_loss_clip": 1.05412388, "balance_loss_mlp": 1.01862824, "epoch": 0.540612036313353, "flos": 19938502615680.0, "grad_norm": 3.330244661434522, "language_loss": 0.78522873, "learning_rate": 1.8342692971857874e-06, "loss": 0.80731428, "num_input_tokens_seen": 97055260, "step": 4496, "time_per_iteration": 2.617462396621704 }, { "auxiliary_loss_clip": 0.01143792, "auxiliary_loss_mlp": 0.01026662, "balance_loss_clip": 1.05167818, "balance_loss_mlp": 1.01939654, "epoch": 0.540732279203992, "flos": 24279599692800.0, "grad_norm": 2.8459681219868083, "language_loss": 0.71522772, "learning_rate": 1.833493017204962e-06, "loss": 0.73693228, "num_input_tokens_seen": 97075365, "step": 4497, "time_per_iteration": 2.7165281772613525 }, { "auxiliary_loss_clip": 0.01177879, "auxiliary_loss_mlp": 0.01024995, "balance_loss_clip": 1.05391026, "balance_loss_mlp": 1.01777112, "epoch": 0.5408525220946312, "flos": 20193216935040.0, "grad_norm": 1.837613256092343, "language_loss": 0.77771854, "learning_rate": 1.8327167624831134e-06, "loss": 0.79974723, "num_input_tokens_seen": 97093095, "step": 4498, "time_per_iteration": 2.629861354827881 }, { "auxiliary_loss_clip": 0.01179433, "auxiliary_loss_mlp": 0.01025991, "balance_loss_clip": 1.05597603, "balance_loss_mlp": 1.0185225, "epoch": 0.5409727649852702, "flos": 24134448833280.0, "grad_norm": 1.5551751321336167, "language_loss": 0.70754313, "learning_rate": 1.831940533137999e-06, "loss": 0.72959733, "num_input_tokens_seen": 97112000, "step": 4499, "time_per_iteration": 2.6142914295196533 }, { "auxiliary_loss_clip": 0.01164693, "auxiliary_loss_mlp": 0.01030024, "balance_loss_clip": 1.05708659, "balance_loss_mlp": 1.02232313, "epoch": 0.5410930078759093, "flos": 23912700220800.0, "grad_norm": 2.231883265993019, "language_loss": 0.7238006, "learning_rate": 1.8311643292873718e-06, "loss": 0.74574775, "num_input_tokens_seen": 97130820, "step": 4500, "time_per_iteration": 2.6712687015533447 }, { "auxiliary_loss_clip": 0.01161468, "auxiliary_loss_mlp": 0.01023933, "balance_loss_clip": 1.05498588, "balance_loss_mlp": 1.01695061, "epoch": 0.5412132507665485, "flos": 21105132445440.0, "grad_norm": 1.9384947203857217, "language_loss": 0.87701553, "learning_rate": 1.8303881510489818e-06, "loss": 0.89886957, "num_input_tokens_seen": 97149210, "step": 4501, "time_per_iteration": 2.610034942626953 }, { "auxiliary_loss_clip": 0.011478, "auxiliary_loss_mlp": 0.01027868, "balance_loss_clip": 1.05260253, "balance_loss_mlp": 1.02044463, "epoch": 0.5413334936571875, "flos": 30227340205440.0, "grad_norm": 2.064649053160172, "language_loss": 0.69367838, "learning_rate": 1.829611998540574e-06, "loss": 0.71543509, "num_input_tokens_seen": 97170415, "step": 4502, "time_per_iteration": 2.6988494396209717 }, { "auxiliary_loss_clip": 0.01160913, "auxiliary_loss_mlp": 0.0071189, "balance_loss_clip": 1.05067348, "balance_loss_mlp": 1.00046325, "epoch": 0.5414537365478266, "flos": 24279635606400.0, "grad_norm": 1.8133228836137454, "language_loss": 0.80007744, "learning_rate": 1.8288358718798914e-06, "loss": 0.81880546, "num_input_tokens_seen": 97189605, "step": 4503, "time_per_iteration": 2.6473257541656494 }, { "auxiliary_loss_clip": 0.01157459, "auxiliary_loss_mlp": 0.00711191, "balance_loss_clip": 1.05239308, "balance_loss_mlp": 1.00035083, "epoch": 0.5415739794384657, "flos": 16654543735680.0, "grad_norm": 2.546277295856959, "language_loss": 0.72537488, "learning_rate": 1.8280597711846703e-06, "loss": 0.74406135, "num_input_tokens_seen": 97207845, "step": 4504, "time_per_iteration": 2.7014150619506836 }, { "auxiliary_loss_clip": 0.01161893, "auxiliary_loss_mlp": 0.01024378, "balance_loss_clip": 1.05470848, "balance_loss_mlp": 1.01687384, "epoch": 0.5416942223291048, "flos": 23185724860800.0, "grad_norm": 1.9573289334713209, "language_loss": 0.83297765, "learning_rate": 1.8272836965726455e-06, "loss": 0.8548404, "num_input_tokens_seen": 97226780, "step": 4505, "time_per_iteration": 2.647353410720825 }, { "auxiliary_loss_clip": 0.01098809, "auxiliary_loss_mlp": 0.01029848, "balance_loss_clip": 1.04616785, "balance_loss_mlp": 1.02252221, "epoch": 0.5418144652197439, "flos": 20303247271680.0, "grad_norm": 2.1491681270969476, "language_loss": 0.7823813, "learning_rate": 1.8265076481615461e-06, "loss": 0.80366784, "num_input_tokens_seen": 97246695, "step": 4506, "time_per_iteration": 2.722804307937622 }, { "auxiliary_loss_clip": 0.01144901, "auxiliary_loss_mlp": 0.01028652, "balance_loss_clip": 1.05190825, "balance_loss_mlp": 1.02014697, "epoch": 0.541934708110383, "flos": 12458633431680.0, "grad_norm": 2.0934258111313504, "language_loss": 0.8732357, "learning_rate": 1.8257316260690987e-06, "loss": 0.89497119, "num_input_tokens_seen": 97264480, "step": 4507, "time_per_iteration": 2.585430383682251 }, { "auxiliary_loss_clip": 0.01166678, "auxiliary_loss_mlp": 0.01022008, "balance_loss_clip": 1.05625772, "balance_loss_mlp": 1.01520729, "epoch": 0.5420549510010221, "flos": 21253802837760.0, "grad_norm": 2.5514092318316837, "language_loss": 0.76057357, "learning_rate": 1.8249556304130254e-06, "loss": 0.78246039, "num_input_tokens_seen": 97285760, "step": 4508, "time_per_iteration": 4.44889497756958 }, { "auxiliary_loss_clip": 0.01139943, "auxiliary_loss_mlp": 0.01021896, "balance_loss_clip": 1.05140746, "balance_loss_mlp": 1.01392341, "epoch": 0.5421751938916611, "flos": 29490524519040.0, "grad_norm": 2.0453482071616684, "language_loss": 0.6878553, "learning_rate": 1.824179661311044e-06, "loss": 0.70947373, "num_input_tokens_seen": 97304510, "step": 4509, "time_per_iteration": 4.730085611343384 }, { "auxiliary_loss_clip": 0.01113151, "auxiliary_loss_mlp": 0.01023545, "balance_loss_clip": 1.04540467, "balance_loss_mlp": 1.01645172, "epoch": 0.5422954367823003, "flos": 18734238311040.0, "grad_norm": 2.970884641818091, "language_loss": 0.79941142, "learning_rate": 1.823403718880868e-06, "loss": 0.82077837, "num_input_tokens_seen": 97323270, "step": 4510, "time_per_iteration": 2.7871150970458984 }, { "auxiliary_loss_clip": 0.01145529, "auxiliary_loss_mlp": 0.01028044, "balance_loss_clip": 1.04804301, "balance_loss_mlp": 1.01996732, "epoch": 0.5424156796729394, "flos": 39969006940800.0, "grad_norm": 1.791969319623339, "language_loss": 0.66275823, "learning_rate": 1.822627803240207e-06, "loss": 0.68449396, "num_input_tokens_seen": 97345600, "step": 4511, "time_per_iteration": 2.868006944656372 }, { "auxiliary_loss_clip": 0.0113649, "auxiliary_loss_mlp": 0.01034075, "balance_loss_clip": 1.052037, "balance_loss_mlp": 1.02624321, "epoch": 0.5425359225635784, "flos": 11546538353280.0, "grad_norm": 6.835253376718948, "language_loss": 0.85476518, "learning_rate": 1.8218519145067675e-06, "loss": 0.8764708, "num_input_tokens_seen": 97361220, "step": 4512, "time_per_iteration": 2.708613872528076 }, { "auxiliary_loss_clip": 0.01126705, "auxiliary_loss_mlp": 0.01027388, "balance_loss_clip": 1.04871821, "balance_loss_mlp": 1.01963389, "epoch": 0.5426561654542175, "flos": 20229702174720.0, "grad_norm": 3.2502540215899187, "language_loss": 0.8928138, "learning_rate": 1.8210760527982508e-06, "loss": 0.9143548, "num_input_tokens_seen": 97381505, "step": 4513, "time_per_iteration": 2.6526260375976562 }, { "auxiliary_loss_clip": 0.01149416, "auxiliary_loss_mlp": 0.00711076, "balance_loss_clip": 1.0526123, "balance_loss_mlp": 1.00046206, "epoch": 0.5427764083448566, "flos": 21871681614720.0, "grad_norm": 1.8739766260576283, "language_loss": 0.75457531, "learning_rate": 1.8203002182323552e-06, "loss": 0.77318025, "num_input_tokens_seen": 97399060, "step": 4514, "time_per_iteration": 2.69185209274292 }, { "auxiliary_loss_clip": 0.01153672, "auxiliary_loss_mlp": 0.01025639, "balance_loss_clip": 1.05622053, "balance_loss_mlp": 1.01815295, "epoch": 0.5428966512354957, "flos": 19640946349440.0, "grad_norm": 2.1085114138388255, "language_loss": 0.75993359, "learning_rate": 1.819524410926773e-06, "loss": 0.78172672, "num_input_tokens_seen": 97416740, "step": 4515, "time_per_iteration": 2.7732152938842773 }, { "auxiliary_loss_clip": 0.01096105, "auxiliary_loss_mlp": 0.01023811, "balance_loss_clip": 1.04670119, "balance_loss_mlp": 1.01648617, "epoch": 0.5430168941261347, "flos": 22382187661440.0, "grad_norm": 2.421240053577206, "language_loss": 0.77135503, "learning_rate": 1.8187486309991944e-06, "loss": 0.79255414, "num_input_tokens_seen": 97437620, "step": 4516, "time_per_iteration": 2.8732919692993164 }, { "auxiliary_loss_clip": 0.0117016, "auxiliary_loss_mlp": 0.01027247, "balance_loss_clip": 1.05610168, "balance_loss_mlp": 1.02005827, "epoch": 0.5431371370167739, "flos": 18764187275520.0, "grad_norm": 2.371520219384109, "language_loss": 0.775509, "learning_rate": 1.817972878567304e-06, "loss": 0.79748309, "num_input_tokens_seen": 97456275, "step": 4517, "time_per_iteration": 2.676172971725464 }, { "auxiliary_loss_clip": 0.01152297, "auxiliary_loss_mlp": 0.01024486, "balance_loss_clip": 1.05173683, "balance_loss_mlp": 1.01772332, "epoch": 0.543257379907413, "flos": 18806023641600.0, "grad_norm": 1.7585601585976842, "language_loss": 0.76841164, "learning_rate": 1.8171971537487834e-06, "loss": 0.79017943, "num_input_tokens_seen": 97474925, "step": 4518, "time_per_iteration": 2.6305553913116455 }, { "auxiliary_loss_clip": 0.0118107, "auxiliary_loss_mlp": 0.01024267, "balance_loss_clip": 1.05508888, "balance_loss_mlp": 1.01664972, "epoch": 0.543377622798052, "flos": 17493381025920.0, "grad_norm": 2.3605598067038027, "language_loss": 0.80774373, "learning_rate": 1.8164214566613093e-06, "loss": 0.82979709, "num_input_tokens_seen": 97493550, "step": 4519, "time_per_iteration": 2.6442859172821045 }, { "auxiliary_loss_clip": 0.01178791, "auxiliary_loss_mlp": 0.01027602, "balance_loss_clip": 1.05528748, "balance_loss_mlp": 1.02031243, "epoch": 0.5434978656886912, "flos": 18989311766400.0, "grad_norm": 4.117667626782818, "language_loss": 0.6512382, "learning_rate": 1.8156457874225547e-06, "loss": 0.67330211, "num_input_tokens_seen": 97512010, "step": 4520, "time_per_iteration": 2.598437547683716 }, { "auxiliary_loss_clip": 0.01141734, "auxiliary_loss_mlp": 0.01023102, "balance_loss_clip": 1.0549655, "balance_loss_mlp": 1.01573145, "epoch": 0.5436181085793302, "flos": 17274936464640.0, "grad_norm": 2.376714359711477, "language_loss": 0.8069644, "learning_rate": 1.814870146150187e-06, "loss": 0.82861274, "num_input_tokens_seen": 97530120, "step": 4521, "time_per_iteration": 2.63960862159729 }, { "auxiliary_loss_clip": 0.01154594, "auxiliary_loss_mlp": 0.01028338, "balance_loss_clip": 1.05232537, "balance_loss_mlp": 1.02089918, "epoch": 0.5437383514699693, "flos": 19098587917440.0, "grad_norm": 2.6958128081723216, "language_loss": 0.7892471, "learning_rate": 1.814094532961871e-06, "loss": 0.8110764, "num_input_tokens_seen": 97548695, "step": 4522, "time_per_iteration": 2.6401381492614746 }, { "auxiliary_loss_clip": 0.01114995, "auxiliary_loss_mlp": 0.01030554, "balance_loss_clip": 1.04529309, "balance_loss_mlp": 1.02353263, "epoch": 0.5438585943606085, "flos": 22602715211520.0, "grad_norm": 2.7918057701286845, "language_loss": 0.83646452, "learning_rate": 1.8133189479752666e-06, "loss": 0.85792005, "num_input_tokens_seen": 97567625, "step": 4523, "time_per_iteration": 2.7612175941467285 }, { "auxiliary_loss_clip": 0.01178469, "auxiliary_loss_mlp": 0.01024644, "balance_loss_clip": 1.05490315, "balance_loss_mlp": 1.01724124, "epoch": 0.5439788372512475, "flos": 21798495653760.0, "grad_norm": 3.8207419888664855, "language_loss": 0.81847012, "learning_rate": 1.8125433913080292e-06, "loss": 0.84050125, "num_input_tokens_seen": 97585325, "step": 4524, "time_per_iteration": 2.5840251445770264 }, { "auxiliary_loss_clip": 0.01049453, "auxiliary_loss_mlp": 0.01024285, "balance_loss_clip": 1.0459404, "balance_loss_mlp": 1.01791024, "epoch": 0.5440990801418866, "flos": 16399362539520.0, "grad_norm": 2.7484728206402753, "language_loss": 0.8279677, "learning_rate": 1.811767863077811e-06, "loss": 0.84870505, "num_input_tokens_seen": 97604275, "step": 4525, "time_per_iteration": 3.026381254196167 }, { "auxiliary_loss_clip": 0.0109013, "auxiliary_loss_mlp": 0.01022671, "balance_loss_clip": 1.04723012, "balance_loss_mlp": 1.0162189, "epoch": 0.5442193230325257, "flos": 21615638492160.0, "grad_norm": 1.5311229798841381, "language_loss": 0.78249633, "learning_rate": 1.8109923634022577e-06, "loss": 0.80362427, "num_input_tokens_seen": 97624300, "step": 4526, "time_per_iteration": 3.067880868911743 }, { "auxiliary_loss_clip": 0.0118028, "auxiliary_loss_mlp": 0.01030397, "balance_loss_clip": 1.05490148, "balance_loss_mlp": 1.02280354, "epoch": 0.5443395659231648, "flos": 15481198062720.0, "grad_norm": 2.1499662517546003, "language_loss": 0.86885923, "learning_rate": 1.8102168923990128e-06, "loss": 0.89096594, "num_input_tokens_seen": 97637845, "step": 4527, "time_per_iteration": 2.576803207397461 }, { "auxiliary_loss_clip": 0.01168367, "auxiliary_loss_mlp": 0.00710867, "balance_loss_clip": 1.05785322, "balance_loss_mlp": 1.0003562, "epoch": 0.5444598088138038, "flos": 18770436241920.0, "grad_norm": 8.628067459594247, "language_loss": 0.80131352, "learning_rate": 1.809441450185714e-06, "loss": 0.82010585, "num_input_tokens_seen": 97656330, "step": 4528, "time_per_iteration": 2.6934261322021484 }, { "auxiliary_loss_clip": 0.01150648, "auxiliary_loss_mlp": 0.01023367, "balance_loss_clip": 1.04972684, "balance_loss_mlp": 1.01647329, "epoch": 0.544580051704443, "flos": 21142335957120.0, "grad_norm": 2.555185344022777, "language_loss": 0.73769009, "learning_rate": 1.8086660368799958e-06, "loss": 0.75943029, "num_input_tokens_seen": 97674380, "step": 4529, "time_per_iteration": 2.6556899547576904 }, { "auxiliary_loss_clip": 0.01149657, "auxiliary_loss_mlp": 0.01027052, "balance_loss_clip": 1.05243659, "balance_loss_mlp": 1.01939344, "epoch": 0.5447002945950821, "flos": 32491508054400.0, "grad_norm": 2.010319781879039, "language_loss": 0.77991027, "learning_rate": 1.807890652599488e-06, "loss": 0.80167735, "num_input_tokens_seen": 97698765, "step": 4530, "time_per_iteration": 2.7568390369415283 }, { "auxiliary_loss_clip": 0.0117686, "auxiliary_loss_mlp": 0.01026422, "balance_loss_clip": 1.05585885, "balance_loss_mlp": 1.01945996, "epoch": 0.5448205374857211, "flos": 11798307757440.0, "grad_norm": 2.29292137000769, "language_loss": 0.82875335, "learning_rate": 1.8071152974618156e-06, "loss": 0.85078615, "num_input_tokens_seen": 97716565, "step": 4531, "time_per_iteration": 2.55574369430542 }, { "auxiliary_loss_clip": 0.01132128, "auxiliary_loss_mlp": 0.00710826, "balance_loss_clip": 1.04857802, "balance_loss_mlp": 1.00028253, "epoch": 0.5449407803763603, "flos": 24133766474880.0, "grad_norm": 2.988311609084309, "language_loss": 0.78840244, "learning_rate": 1.806339971584599e-06, "loss": 0.80683196, "num_input_tokens_seen": 97733225, "step": 4532, "time_per_iteration": 2.7322940826416016 }, { "auxiliary_loss_clip": 0.01178606, "auxiliary_loss_mlp": 0.01025448, "balance_loss_clip": 1.05557787, "balance_loss_mlp": 1.01842105, "epoch": 0.5450610232669993, "flos": 23258551685760.0, "grad_norm": 1.8515600830512657, "language_loss": 0.85363448, "learning_rate": 1.8055646750854546e-06, "loss": 0.87567496, "num_input_tokens_seen": 97752735, "step": 4533, "time_per_iteration": 2.578252077102661 }, { "auxiliary_loss_clip": 0.01153114, "auxiliary_loss_mlp": 0.01025539, "balance_loss_clip": 1.053967, "balance_loss_mlp": 1.01838672, "epoch": 0.5451812661576384, "flos": 17785083375360.0, "grad_norm": 4.766228076415725, "language_loss": 0.82222003, "learning_rate": 1.8047894080819945e-06, "loss": 0.84400654, "num_input_tokens_seen": 97769985, "step": 4534, "time_per_iteration": 4.856091737747192 }, { "auxiliary_loss_clip": 0.0109762, "auxiliary_loss_mlp": 0.01004911, "balance_loss_clip": 1.03628898, "balance_loss_mlp": 1.00370121, "epoch": 0.5453015090482776, "flos": 71062586513280.0, "grad_norm": 0.7425697864541156, "language_loss": 0.63218373, "learning_rate": 1.8040141706918258e-06, "loss": 0.65320903, "num_input_tokens_seen": 97831225, "step": 4535, "time_per_iteration": 5.142175912857056 }, { "auxiliary_loss_clip": 0.01148665, "auxiliary_loss_mlp": 0.01023229, "balance_loss_clip": 1.05423808, "balance_loss_mlp": 1.01607037, "epoch": 0.5454217519389166, "flos": 25552201622400.0, "grad_norm": 2.0451180376328306, "language_loss": 0.77340448, "learning_rate": 1.8032389630325525e-06, "loss": 0.7951234, "num_input_tokens_seen": 97849975, "step": 4536, "time_per_iteration": 2.6911561489105225 }, { "auxiliary_loss_clip": 0.01146192, "auxiliary_loss_mlp": 0.01023383, "balance_loss_clip": 1.04908872, "balance_loss_mlp": 1.01644766, "epoch": 0.5455419948295557, "flos": 23658345037440.0, "grad_norm": 1.6895255149261375, "language_loss": 0.7572968, "learning_rate": 1.8024637852217707e-06, "loss": 0.77899259, "num_input_tokens_seen": 97869700, "step": 4537, "time_per_iteration": 2.626939296722412 }, { "auxiliary_loss_clip": 0.01149279, "auxiliary_loss_mlp": 0.01022397, "balance_loss_clip": 1.05534005, "balance_loss_mlp": 1.01568246, "epoch": 0.5456622377201948, "flos": 23403989854080.0, "grad_norm": 2.763093635287541, "language_loss": 0.85054821, "learning_rate": 1.8016886373770766e-06, "loss": 0.87226498, "num_input_tokens_seen": 97888215, "step": 4538, "time_per_iteration": 2.648566246032715 }, { "auxiliary_loss_clip": 0.01149181, "auxiliary_loss_mlp": 0.01023301, "balance_loss_clip": 1.05409598, "balance_loss_mlp": 1.01624417, "epoch": 0.5457824806108339, "flos": 23988040997760.0, "grad_norm": 1.6801870390424796, "language_loss": 0.7851553, "learning_rate": 1.8009135196160579e-06, "loss": 0.80688012, "num_input_tokens_seen": 97907090, "step": 4539, "time_per_iteration": 2.8033156394958496 }, { "auxiliary_loss_clip": 0.01126729, "auxiliary_loss_mlp": 0.01026151, "balance_loss_clip": 1.04924154, "balance_loss_mlp": 1.01955247, "epoch": 0.545902723501473, "flos": 22565870835840.0, "grad_norm": 1.7307132327940569, "language_loss": 0.84311146, "learning_rate": 1.8001384320563e-06, "loss": 0.86464024, "num_input_tokens_seen": 97927345, "step": 4540, "time_per_iteration": 2.825073480606079 }, { "auxiliary_loss_clip": 0.01097015, "auxiliary_loss_mlp": 0.01004567, "balance_loss_clip": 1.0354197, "balance_loss_mlp": 1.00336313, "epoch": 0.5460229663921121, "flos": 55198399685760.0, "grad_norm": 0.7757124331044996, "language_loss": 0.57719308, "learning_rate": 1.7993633748153833e-06, "loss": 0.5982089, "num_input_tokens_seen": 97981950, "step": 4541, "time_per_iteration": 3.2078428268432617 }, { "auxiliary_loss_clip": 0.01165202, "auxiliary_loss_mlp": 0.01029973, "balance_loss_clip": 1.05271387, "balance_loss_mlp": 1.0231781, "epoch": 0.5461432092827512, "flos": 15413866018560.0, "grad_norm": 2.0510118897280423, "language_loss": 0.72809041, "learning_rate": 1.7985883480108834e-06, "loss": 0.75004208, "num_input_tokens_seen": 97999585, "step": 4542, "time_per_iteration": 2.612499475479126 }, { "auxiliary_loss_clip": 0.01159712, "auxiliary_loss_mlp": 0.01026556, "balance_loss_clip": 1.05182838, "balance_loss_mlp": 1.01946306, "epoch": 0.5462634521733902, "flos": 24024921287040.0, "grad_norm": 2.1481603434873895, "language_loss": 0.7185815, "learning_rate": 1.797813351760371e-06, "loss": 0.74044418, "num_input_tokens_seen": 98021290, "step": 4543, "time_per_iteration": 2.678997039794922 }, { "auxiliary_loss_clip": 0.01180511, "auxiliary_loss_mlp": 0.01031138, "balance_loss_clip": 1.05585551, "balance_loss_mlp": 1.02403915, "epoch": 0.5463836950640293, "flos": 22820944291200.0, "grad_norm": 1.7859835840985043, "language_loss": 0.7793355, "learning_rate": 1.7970383861814116e-06, "loss": 0.80145204, "num_input_tokens_seen": 98041060, "step": 4544, "time_per_iteration": 2.5739340782165527 }, { "auxiliary_loss_clip": 0.01166781, "auxiliary_loss_mlp": 0.01023645, "balance_loss_clip": 1.05668521, "balance_loss_mlp": 1.01649845, "epoch": 0.5465039379546685, "flos": 20448290390400.0, "grad_norm": 5.279101001208614, "language_loss": 0.74277639, "learning_rate": 1.7962634513915684e-06, "loss": 0.76468062, "num_input_tokens_seen": 98058410, "step": 4545, "time_per_iteration": 2.6778783798217773 }, { "auxiliary_loss_clip": 0.01177632, "auxiliary_loss_mlp": 0.01026335, "balance_loss_clip": 1.05438066, "balance_loss_mlp": 1.01899743, "epoch": 0.5466241808453075, "flos": 17343310003200.0, "grad_norm": 1.8535666091552347, "language_loss": 0.7937026, "learning_rate": 1.7954885475083969e-06, "loss": 0.81574225, "num_input_tokens_seen": 98076080, "step": 4546, "time_per_iteration": 2.5246331691741943 }, { "auxiliary_loss_clip": 0.01181945, "auxiliary_loss_mlp": 0.0103046, "balance_loss_clip": 1.05616975, "balance_loss_mlp": 1.02268767, "epoch": 0.5467444237359466, "flos": 21617039122560.0, "grad_norm": 2.19304826918188, "language_loss": 0.72671747, "learning_rate": 1.7947136746494513e-06, "loss": 0.74884152, "num_input_tokens_seen": 98096995, "step": 4547, "time_per_iteration": 2.5866756439208984 }, { "auxiliary_loss_clip": 0.01159574, "auxiliary_loss_mlp": 0.01027284, "balance_loss_clip": 1.05232549, "balance_loss_mlp": 1.02004766, "epoch": 0.5468646666265857, "flos": 24170467196160.0, "grad_norm": 2.0378007348368397, "language_loss": 0.87986493, "learning_rate": 1.793938832932277e-06, "loss": 0.90173352, "num_input_tokens_seen": 98115105, "step": 4548, "time_per_iteration": 2.5846333503723145 }, { "auxiliary_loss_clip": 0.01178648, "auxiliary_loss_mlp": 0.01026351, "balance_loss_clip": 1.0546186, "balance_loss_mlp": 1.01937127, "epoch": 0.5469849095172248, "flos": 27527001505920.0, "grad_norm": 2.5326807093820443, "language_loss": 0.7036165, "learning_rate": 1.7931640224744185e-06, "loss": 0.72566658, "num_input_tokens_seen": 98135655, "step": 4549, "time_per_iteration": 2.6316587924957275 }, { "auxiliary_loss_clip": 0.01120001, "auxiliary_loss_mlp": 0.01024858, "balance_loss_clip": 1.0446074, "balance_loss_mlp": 1.01759195, "epoch": 0.5471051524078638, "flos": 27964680727680.0, "grad_norm": 1.7238136211501587, "language_loss": 0.73851478, "learning_rate": 1.7923892433934127e-06, "loss": 0.75996339, "num_input_tokens_seen": 98156730, "step": 4550, "time_per_iteration": 2.7409567832946777 }, { "auxiliary_loss_clip": 0.01149358, "auxiliary_loss_mlp": 0.00711488, "balance_loss_clip": 1.05293679, "balance_loss_mlp": 1.00051045, "epoch": 0.547225395298503, "flos": 18150510389760.0, "grad_norm": 1.9830392090130922, "language_loss": 0.79371858, "learning_rate": 1.7916144958067939e-06, "loss": 0.81232703, "num_input_tokens_seen": 98174590, "step": 4551, "time_per_iteration": 2.659047842025757 }, { "auxiliary_loss_clip": 0.01164819, "auxiliary_loss_mlp": 0.01027908, "balance_loss_clip": 1.05362797, "balance_loss_mlp": 1.02025449, "epoch": 0.5473456381891421, "flos": 21361498790400.0, "grad_norm": 1.802873821757561, "language_loss": 0.78993332, "learning_rate": 1.7908397798320905e-06, "loss": 0.81186056, "num_input_tokens_seen": 98194325, "step": 4552, "time_per_iteration": 2.6141738891601562 }, { "auxiliary_loss_clip": 0.0116289, "auxiliary_loss_mlp": 0.00711434, "balance_loss_clip": 1.05426133, "balance_loss_mlp": 1.00043941, "epoch": 0.5474658810797811, "flos": 19932145908480.0, "grad_norm": 1.736416062546202, "language_loss": 0.7494241, "learning_rate": 1.7900650955868265e-06, "loss": 0.76816732, "num_input_tokens_seen": 98213970, "step": 4553, "time_per_iteration": 2.7110788822174072 }, { "auxiliary_loss_clip": 0.01163027, "auxiliary_loss_mlp": 0.00710891, "balance_loss_clip": 1.05541539, "balance_loss_mlp": 1.00039434, "epoch": 0.5475861239704203, "flos": 50476217264640.0, "grad_norm": 1.345288073593875, "language_loss": 0.76457399, "learning_rate": 1.7892904431885202e-06, "loss": 0.78331316, "num_input_tokens_seen": 98241145, "step": 4554, "time_per_iteration": 2.958753824234009 }, { "auxiliary_loss_clip": 0.01112609, "auxiliary_loss_mlp": 0.01021867, "balance_loss_clip": 1.04471898, "balance_loss_mlp": 1.01508689, "epoch": 0.5477063668610593, "flos": 20705123612160.0, "grad_norm": 3.2351314315720896, "language_loss": 0.75576425, "learning_rate": 1.788515822754686e-06, "loss": 0.77710903, "num_input_tokens_seen": 98261565, "step": 4555, "time_per_iteration": 2.738098382949829 }, { "auxiliary_loss_clip": 0.01129911, "auxiliary_loss_mlp": 0.01030602, "balance_loss_clip": 1.04699826, "balance_loss_mlp": 1.02242756, "epoch": 0.5478266097516984, "flos": 19609740408960.0, "grad_norm": 2.38237336560709, "language_loss": 0.78026462, "learning_rate": 1.7877412344028335e-06, "loss": 0.80186969, "num_input_tokens_seen": 98281370, "step": 4556, "time_per_iteration": 2.7235474586486816 }, { "auxiliary_loss_clip": 0.01164537, "auxiliary_loss_mlp": 0.0103421, "balance_loss_clip": 1.05313873, "balance_loss_mlp": 1.02728438, "epoch": 0.5479468526423376, "flos": 12896599962240.0, "grad_norm": 2.495036176513095, "language_loss": 0.77614784, "learning_rate": 1.7869666782504668e-06, "loss": 0.79813528, "num_input_tokens_seen": 98297950, "step": 4557, "time_per_iteration": 2.6542258262634277 }, { "auxiliary_loss_clip": 0.01139393, "auxiliary_loss_mlp": 0.01024004, "balance_loss_clip": 1.0508312, "balance_loss_mlp": 1.01698256, "epoch": 0.5480670955329766, "flos": 18588800142720.0, "grad_norm": 2.185628497484378, "language_loss": 0.69061935, "learning_rate": 1.7861921544150867e-06, "loss": 0.71225333, "num_input_tokens_seen": 98316800, "step": 4558, "time_per_iteration": 2.6533451080322266 }, { "auxiliary_loss_clip": 0.01087142, "auxiliary_loss_mlp": 0.00710957, "balance_loss_clip": 1.04551959, "balance_loss_mlp": 1.00029957, "epoch": 0.5481873384236157, "flos": 15954608338560.0, "grad_norm": 2.2830430204044903, "language_loss": 0.76680076, "learning_rate": 1.7854176630141856e-06, "loss": 0.78478169, "num_input_tokens_seen": 98333935, "step": 4559, "time_per_iteration": 3.5961148738861084 }, { "auxiliary_loss_clip": 0.01181189, "auxiliary_loss_mlp": 0.01027846, "balance_loss_clip": 1.05524945, "balance_loss_mlp": 1.02077675, "epoch": 0.5483075813142548, "flos": 22783812606720.0, "grad_norm": 5.29588207936867, "language_loss": 0.84355813, "learning_rate": 1.784643204165255e-06, "loss": 0.86564845, "num_input_tokens_seen": 98353255, "step": 4560, "time_per_iteration": 4.550079107284546 }, { "auxiliary_loss_clip": 0.01160468, "auxiliary_loss_mlp": 0.01020591, "balance_loss_clip": 1.05468762, "balance_loss_mlp": 1.01367402, "epoch": 0.5484278242048939, "flos": 19317212046720.0, "grad_norm": 2.7242156597484146, "language_loss": 0.77686203, "learning_rate": 1.7838687779857783e-06, "loss": 0.79867268, "num_input_tokens_seen": 98371130, "step": 4561, "time_per_iteration": 2.713409423828125 }, { "auxiliary_loss_clip": 0.0114113, "auxiliary_loss_mlp": 0.0102479, "balance_loss_clip": 1.05059838, "balance_loss_mlp": 1.01729441, "epoch": 0.5485480670955329, "flos": 22816024128000.0, "grad_norm": 2.611836685699392, "language_loss": 0.64165467, "learning_rate": 1.7830943845932366e-06, "loss": 0.66331387, "num_input_tokens_seen": 98390455, "step": 4562, "time_per_iteration": 3.625491142272949 }, { "auxiliary_loss_clip": 0.01153084, "auxiliary_loss_mlp": 0.01027108, "balance_loss_clip": 1.05479753, "balance_loss_mlp": 1.01966357, "epoch": 0.5486683099861721, "flos": 22671304231680.0, "grad_norm": 2.0739965201658443, "language_loss": 0.75108564, "learning_rate": 1.7823200241051044e-06, "loss": 0.77288747, "num_input_tokens_seen": 98409370, "step": 4563, "time_per_iteration": 2.6603236198425293 }, { "auxiliary_loss_clip": 0.01179334, "auxiliary_loss_mlp": 0.01017881, "balance_loss_clip": 1.05482292, "balance_loss_mlp": 1.01105642, "epoch": 0.5487885528768112, "flos": 23149383275520.0, "grad_norm": 1.7863013307637436, "language_loss": 0.80677575, "learning_rate": 1.7815456966388513e-06, "loss": 0.82874787, "num_input_tokens_seen": 98428465, "step": 4564, "time_per_iteration": 2.676565408706665 }, { "auxiliary_loss_clip": 0.01132447, "auxiliary_loss_mlp": 0.01024461, "balance_loss_clip": 1.04910088, "balance_loss_mlp": 1.0171802, "epoch": 0.5489087957674502, "flos": 22053928245120.0, "grad_norm": 2.361905149253684, "language_loss": 0.80805087, "learning_rate": 1.780771402311943e-06, "loss": 0.82961988, "num_input_tokens_seen": 98447300, "step": 4565, "time_per_iteration": 2.7480671405792236 }, { "auxiliary_loss_clip": 0.01149988, "auxiliary_loss_mlp": 0.01028423, "balance_loss_clip": 1.05426478, "balance_loss_mlp": 1.02145553, "epoch": 0.5490290386580894, "flos": 24315977191680.0, "grad_norm": 2.1809613559337144, "language_loss": 0.78674579, "learning_rate": 1.7799971412418374e-06, "loss": 0.80852985, "num_input_tokens_seen": 98468695, "step": 4566, "time_per_iteration": 2.8052656650543213 }, { "auxiliary_loss_clip": 0.0113084, "auxiliary_loss_mlp": 0.01030029, "balance_loss_clip": 1.04959404, "balance_loss_mlp": 1.02274561, "epoch": 0.5491492815487284, "flos": 18294942977280.0, "grad_norm": 2.660085072922934, "language_loss": 0.74074769, "learning_rate": 1.7792229135459918e-06, "loss": 0.7623564, "num_input_tokens_seen": 98485345, "step": 4567, "time_per_iteration": 2.6993157863616943 }, { "auxiliary_loss_clip": 0.01058509, "auxiliary_loss_mlp": 0.01007902, "balance_loss_clip": 1.05593133, "balance_loss_mlp": 1.00653744, "epoch": 0.5492695244393675, "flos": 64550257050240.0, "grad_norm": 0.7454026229797336, "language_loss": 0.61546528, "learning_rate": 1.7784487193418538e-06, "loss": 0.63612938, "num_input_tokens_seen": 98543195, "step": 4568, "time_per_iteration": 3.3341524600982666 }, { "auxiliary_loss_clip": 0.01115969, "auxiliary_loss_mlp": 0.01031811, "balance_loss_clip": 1.04459274, "balance_loss_mlp": 1.02376997, "epoch": 0.5493897673300067, "flos": 17379579761280.0, "grad_norm": 2.100566594633539, "language_loss": 0.61005306, "learning_rate": 1.7776745587468698e-06, "loss": 0.63153082, "num_input_tokens_seen": 98560620, "step": 4569, "time_per_iteration": 2.883700370788574 }, { "auxiliary_loss_clip": 0.01177047, "auxiliary_loss_mlp": 0.01026799, "balance_loss_clip": 1.05343664, "balance_loss_mlp": 1.01912808, "epoch": 0.5495100102206457, "flos": 19901765980800.0, "grad_norm": 4.162610818216715, "language_loss": 0.81781834, "learning_rate": 1.7769004318784776e-06, "loss": 0.83985674, "num_input_tokens_seen": 98578265, "step": 4570, "time_per_iteration": 2.649400234222412 }, { "auxiliary_loss_clip": 0.01164653, "auxiliary_loss_mlp": 0.01023907, "balance_loss_clip": 1.0531249, "balance_loss_mlp": 1.01652205, "epoch": 0.5496302531112848, "flos": 16727190992640.0, "grad_norm": 2.0414819638562767, "language_loss": 0.80389619, "learning_rate": 1.776126338854113e-06, "loss": 0.82578182, "num_input_tokens_seen": 98596055, "step": 4571, "time_per_iteration": 2.6161751747131348 }, { "auxiliary_loss_clip": 0.01160975, "auxiliary_loss_mlp": 0.01024533, "balance_loss_clip": 1.05589342, "balance_loss_mlp": 1.01751733, "epoch": 0.5497504960019239, "flos": 24572343536640.0, "grad_norm": 1.7507134459332026, "language_loss": 0.84495372, "learning_rate": 1.7753522797912044e-06, "loss": 0.86680877, "num_input_tokens_seen": 98616140, "step": 4572, "time_per_iteration": 2.7007691860198975 }, { "auxiliary_loss_clip": 0.01151887, "auxiliary_loss_mlp": 0.0102673, "balance_loss_clip": 1.05039656, "balance_loss_mlp": 1.01918983, "epoch": 0.549870738892563, "flos": 15450494912640.0, "grad_norm": 2.5333211478710553, "language_loss": 0.69809306, "learning_rate": 1.7745782548071765e-06, "loss": 0.71987927, "num_input_tokens_seen": 98633035, "step": 4573, "time_per_iteration": 2.6412312984466553 }, { "auxiliary_loss_clip": 0.01129277, "auxiliary_loss_mlp": 0.01023479, "balance_loss_clip": 1.05549955, "balance_loss_mlp": 1.01656473, "epoch": 0.549990981783202, "flos": 21069114082560.0, "grad_norm": 2.0209986022584285, "language_loss": 0.74354494, "learning_rate": 1.7738042640194482e-06, "loss": 0.76507252, "num_input_tokens_seen": 98652700, "step": 4574, "time_per_iteration": 2.6514832973480225 }, { "auxiliary_loss_clip": 0.01175684, "auxiliary_loss_mlp": 0.01021987, "balance_loss_clip": 1.05255747, "balance_loss_mlp": 1.01439953, "epoch": 0.5501112246738411, "flos": 21395901041280.0, "grad_norm": 1.742219139398221, "language_loss": 0.70351362, "learning_rate": 1.7730303075454335e-06, "loss": 0.72549033, "num_input_tokens_seen": 98671590, "step": 4575, "time_per_iteration": 2.6343371868133545 }, { "auxiliary_loss_clip": 0.01136281, "auxiliary_loss_mlp": 0.01025498, "balance_loss_clip": 1.05054736, "balance_loss_mlp": 1.01809525, "epoch": 0.5502314675644803, "flos": 17456931699840.0, "grad_norm": 8.966680411827932, "language_loss": 0.84846967, "learning_rate": 1.7722563855025402e-06, "loss": 0.8700875, "num_input_tokens_seen": 98689620, "step": 4576, "time_per_iteration": 2.6765692234039307 }, { "auxiliary_loss_clip": 0.01144748, "auxiliary_loss_mlp": 0.01024271, "balance_loss_clip": 1.04733849, "balance_loss_mlp": 1.01645422, "epoch": 0.5503517104551193, "flos": 24310410583680.0, "grad_norm": 2.923524921160013, "language_loss": 0.70994216, "learning_rate": 1.7714824980081721e-06, "loss": 0.73163235, "num_input_tokens_seen": 98708915, "step": 4577, "time_per_iteration": 2.7838540077209473 }, { "auxiliary_loss_clip": 0.01157287, "auxiliary_loss_mlp": 0.01027048, "balance_loss_clip": 1.05163944, "balance_loss_mlp": 1.02007127, "epoch": 0.5504719533457584, "flos": 22419427086720.0, "grad_norm": 2.5846234965632684, "language_loss": 0.73881185, "learning_rate": 1.7707086451797276e-06, "loss": 0.76065522, "num_input_tokens_seen": 98729790, "step": 4578, "time_per_iteration": 2.9247403144836426 }, { "auxiliary_loss_clip": 0.0105451, "auxiliary_loss_mlp": 0.01004896, "balance_loss_clip": 1.02871335, "balance_loss_mlp": 1.00368595, "epoch": 0.5505921962363975, "flos": 67294155968640.0, "grad_norm": 0.7014936056041071, "language_loss": 0.52328408, "learning_rate": 1.7699348271345993e-06, "loss": 0.54387808, "num_input_tokens_seen": 98792415, "step": 4579, "time_per_iteration": 3.359018564224243 }, { "auxiliary_loss_clip": 0.01051684, "auxiliary_loss_mlp": 0.01001062, "balance_loss_clip": 1.03502345, "balance_loss_mlp": 0.99984056, "epoch": 0.5507124391270366, "flos": 45685125578880.0, "grad_norm": 0.7093529114594493, "language_loss": 0.54420507, "learning_rate": 1.7691610439901753e-06, "loss": 0.56473249, "num_input_tokens_seen": 98855350, "step": 4580, "time_per_iteration": 3.4503724575042725 }, { "auxiliary_loss_clip": 0.01163078, "auxiliary_loss_mlp": 0.01025029, "balance_loss_clip": 1.05166876, "balance_loss_mlp": 1.01830244, "epoch": 0.5508326820176757, "flos": 22273845264000.0, "grad_norm": 2.8514262322939445, "language_loss": 0.75228554, "learning_rate": 1.7683872958638367e-06, "loss": 0.77416658, "num_input_tokens_seen": 98874230, "step": 4581, "time_per_iteration": 2.6435844898223877 }, { "auxiliary_loss_clip": 0.01143467, "auxiliary_loss_mlp": 0.0102275, "balance_loss_clip": 1.04933953, "balance_loss_mlp": 1.01539445, "epoch": 0.5509529249083148, "flos": 20012442762240.0, "grad_norm": 2.6494120904719263, "language_loss": 0.84999478, "learning_rate": 1.7676135828729614e-06, "loss": 0.87165695, "num_input_tokens_seen": 98893940, "step": 4582, "time_per_iteration": 2.675747871398926 }, { "auxiliary_loss_clip": 0.0116154, "auxiliary_loss_mlp": 0.01025804, "balance_loss_clip": 1.05359411, "balance_loss_mlp": 1.01835012, "epoch": 0.5510731677989539, "flos": 21834801325440.0, "grad_norm": 2.113749429002778, "language_loss": 0.83100694, "learning_rate": 1.7668399051349205e-06, "loss": 0.85288036, "num_input_tokens_seen": 98913620, "step": 4583, "time_per_iteration": 2.614309072494507 }, { "auxiliary_loss_clip": 0.01127899, "auxiliary_loss_mlp": 0.01029027, "balance_loss_clip": 1.04916358, "balance_loss_mlp": 1.02164149, "epoch": 0.5511934106895929, "flos": 21467901853440.0, "grad_norm": 2.408175861392256, "language_loss": 0.8357898, "learning_rate": 1.766066262767081e-06, "loss": 0.85735905, "num_input_tokens_seen": 98931460, "step": 4584, "time_per_iteration": 2.7095375061035156 }, { "auxiliary_loss_clip": 0.01139517, "auxiliary_loss_mlp": 0.01026649, "balance_loss_clip": 1.05069709, "balance_loss_mlp": 1.01972866, "epoch": 0.5513136535802321, "flos": 21068934514560.0, "grad_norm": 2.122427253847229, "language_loss": 0.7753675, "learning_rate": 1.765292655886803e-06, "loss": 0.79702914, "num_input_tokens_seen": 98950105, "step": 4585, "time_per_iteration": 3.784505844116211 }, { "auxiliary_loss_clip": 0.01138847, "auxiliary_loss_mlp": 0.01028991, "balance_loss_clip": 1.05021787, "balance_loss_mlp": 1.02176404, "epoch": 0.5514338964708712, "flos": 27815004754560.0, "grad_norm": 1.774896440102633, "language_loss": 0.70950288, "learning_rate": 1.764519084611443e-06, "loss": 0.73118126, "num_input_tokens_seen": 98970560, "step": 4586, "time_per_iteration": 3.744588613510132 }, { "auxiliary_loss_clip": 0.01145158, "auxiliary_loss_mlp": 0.01023587, "balance_loss_clip": 1.0487082, "balance_loss_mlp": 1.01577854, "epoch": 0.5515541393615102, "flos": 21908525990400.0, "grad_norm": 1.9125256758578308, "language_loss": 0.77597427, "learning_rate": 1.7637455490583505e-06, "loss": 0.79766166, "num_input_tokens_seen": 98989885, "step": 4587, "time_per_iteration": 3.607490301132202 }, { "auxiliary_loss_clip": 0.01160335, "auxiliary_loss_mlp": 0.01030924, "balance_loss_clip": 1.05185747, "balance_loss_mlp": 1.0237385, "epoch": 0.5516743822521494, "flos": 20485422074880.0, "grad_norm": 2.1854407078579916, "language_loss": 0.77518773, "learning_rate": 1.7629720493448701e-06, "loss": 0.79710037, "num_input_tokens_seen": 99007180, "step": 4588, "time_per_iteration": 2.657109022140503 }, { "auxiliary_loss_clip": 0.01152209, "auxiliary_loss_mlp": 0.01025901, "balance_loss_clip": 1.05069065, "balance_loss_mlp": 1.01877546, "epoch": 0.5517946251427884, "flos": 14940383915520.0, "grad_norm": 2.3193726990256587, "language_loss": 0.85434306, "learning_rate": 1.7621985855883418e-06, "loss": 0.87612414, "num_input_tokens_seen": 99023880, "step": 4589, "time_per_iteration": 2.6181352138519287 }, { "auxiliary_loss_clip": 0.01137749, "auxiliary_loss_mlp": 0.01023264, "balance_loss_clip": 1.04832757, "balance_loss_mlp": 1.01628447, "epoch": 0.5519148680334275, "flos": 18404865573120.0, "grad_norm": 1.9236584361716371, "language_loss": 0.72424555, "learning_rate": 1.7614251579060983e-06, "loss": 0.74585569, "num_input_tokens_seen": 99042475, "step": 4590, "time_per_iteration": 2.646143674850464 }, { "auxiliary_loss_clip": 0.01131415, "auxiliary_loss_mlp": 0.01026024, "balance_loss_clip": 1.04942369, "balance_loss_mlp": 1.01881802, "epoch": 0.5520351109240667, "flos": 25113337251840.0, "grad_norm": 1.7249028656042034, "language_loss": 0.84945905, "learning_rate": 1.76065176641547e-06, "loss": 0.87103343, "num_input_tokens_seen": 99065185, "step": 4591, "time_per_iteration": 2.7341551780700684 }, { "auxiliary_loss_clip": 0.01161861, "auxiliary_loss_mlp": 0.01023952, "balance_loss_clip": 1.05059052, "balance_loss_mlp": 1.01673937, "epoch": 0.5521553538147057, "flos": 21069545045760.0, "grad_norm": 2.404661477320244, "language_loss": 0.78005219, "learning_rate": 1.759878411233777e-06, "loss": 0.80191028, "num_input_tokens_seen": 99083645, "step": 4592, "time_per_iteration": 2.700640916824341 }, { "auxiliary_loss_clip": 0.01159989, "auxiliary_loss_mlp": 0.01025574, "balance_loss_clip": 1.05062807, "balance_loss_mlp": 1.01821876, "epoch": 0.5522755967053448, "flos": 18879999701760.0, "grad_norm": 2.656802982657153, "language_loss": 0.7608155, "learning_rate": 1.7591050924783388e-06, "loss": 0.78267109, "num_input_tokens_seen": 99100835, "step": 4593, "time_per_iteration": 2.5644216537475586 }, { "auxiliary_loss_clip": 0.01043273, "auxiliary_loss_mlp": 0.01005873, "balance_loss_clip": 1.03380036, "balance_loss_mlp": 1.00455558, "epoch": 0.5523958395959839, "flos": 64675622494080.0, "grad_norm": 0.8381740146203713, "language_loss": 0.57913315, "learning_rate": 1.7583318102664661e-06, "loss": 0.59962457, "num_input_tokens_seen": 99168400, "step": 4594, "time_per_iteration": 3.2849280834198 }, { "auxiliary_loss_clip": 0.01162526, "auxiliary_loss_mlp": 0.01023719, "balance_loss_clip": 1.04810631, "balance_loss_mlp": 1.01641786, "epoch": 0.552516082486623, "flos": 10889732211840.0, "grad_norm": 2.039177169551136, "language_loss": 0.79012322, "learning_rate": 1.757558564715466e-06, "loss": 0.81198567, "num_input_tokens_seen": 99186475, "step": 4595, "time_per_iteration": 2.638375759124756 }, { "auxiliary_loss_clip": 0.01159923, "auxiliary_loss_mlp": 0.01022444, "balance_loss_clip": 1.04956663, "balance_loss_mlp": 1.01512408, "epoch": 0.552636325377262, "flos": 22199797376640.0, "grad_norm": 2.5866780451568228, "language_loss": 0.74597138, "learning_rate": 1.7567853559426386e-06, "loss": 0.76779509, "num_input_tokens_seen": 99203525, "step": 4596, "time_per_iteration": 2.615405321121216 }, { "auxiliary_loss_clip": 0.01162685, "auxiliary_loss_mlp": 0.01028393, "balance_loss_clip": 1.05204964, "balance_loss_mlp": 1.0208168, "epoch": 0.5527565682679012, "flos": 23988184652160.0, "grad_norm": 2.092824918233577, "language_loss": 0.74989498, "learning_rate": 1.7560121840652797e-06, "loss": 0.77180576, "num_input_tokens_seen": 99222910, "step": 4597, "time_per_iteration": 2.870532989501953 }, { "auxiliary_loss_clip": 0.01121619, "auxiliary_loss_mlp": 0.01028324, "balance_loss_clip": 1.04806077, "balance_loss_mlp": 1.02085567, "epoch": 0.5528768111585403, "flos": 19719267955200.0, "grad_norm": 2.3018098913816245, "language_loss": 0.68762952, "learning_rate": 1.7552390492006782e-06, "loss": 0.70912892, "num_input_tokens_seen": 99241230, "step": 4598, "time_per_iteration": 2.782200813293457 }, { "auxiliary_loss_clip": 0.01115487, "auxiliary_loss_mlp": 0.00711537, "balance_loss_clip": 1.04243481, "balance_loss_mlp": 1.00034237, "epoch": 0.5529970540491793, "flos": 26215975002240.0, "grad_norm": 7.750625741640977, "language_loss": 0.65332901, "learning_rate": 1.7544659514661184e-06, "loss": 0.67159927, "num_input_tokens_seen": 99264320, "step": 4599, "time_per_iteration": 2.8018927574157715 }, { "auxiliary_loss_clip": 0.01138755, "auxiliary_loss_mlp": 0.01023161, "balance_loss_clip": 1.04660594, "balance_loss_mlp": 1.01641655, "epoch": 0.5531172969398185, "flos": 24425971614720.0, "grad_norm": 2.070464563727968, "language_loss": 0.79850638, "learning_rate": 1.7536928909788786e-06, "loss": 0.82012552, "num_input_tokens_seen": 99283625, "step": 4600, "time_per_iteration": 2.698755979537964 }, { "auxiliary_loss_clip": 0.0104794, "auxiliary_loss_mlp": 0.01004441, "balance_loss_clip": 1.03173733, "balance_loss_mlp": 1.00301063, "epoch": 0.5532375398304575, "flos": 64907316195840.0, "grad_norm": 0.8796337226207342, "language_loss": 0.61947662, "learning_rate": 1.752919867856231e-06, "loss": 0.6400004, "num_input_tokens_seen": 99335270, "step": 4601, "time_per_iteration": 3.085258960723877 }, { "auxiliary_loss_clip": 0.01137493, "auxiliary_loss_mlp": 0.01022915, "balance_loss_clip": 1.04795933, "balance_loss_mlp": 1.01595068, "epoch": 0.5533577827210966, "flos": 19683105937920.0, "grad_norm": 2.128663053665916, "language_loss": 0.78814417, "learning_rate": 1.7521468822154436e-06, "loss": 0.80974829, "num_input_tokens_seen": 99354185, "step": 4602, "time_per_iteration": 2.6200737953186035 }, { "auxiliary_loss_clip": 0.01139158, "auxiliary_loss_mlp": 0.01026178, "balance_loss_clip": 1.05088997, "balance_loss_mlp": 1.01916814, "epoch": 0.5534780256117358, "flos": 32306496076800.0, "grad_norm": 4.506460520467966, "language_loss": 0.74976885, "learning_rate": 1.751373934173777e-06, "loss": 0.77142227, "num_input_tokens_seen": 99376930, "step": 4603, "time_per_iteration": 2.738355875015259 }, { "auxiliary_loss_clip": 0.01178456, "auxiliary_loss_mlp": 0.01028126, "balance_loss_clip": 1.05325973, "balance_loss_mlp": 1.02058041, "epoch": 0.5535982685023748, "flos": 23222425582080.0, "grad_norm": 1.870641315623, "language_loss": 0.73520684, "learning_rate": 1.750601023848487e-06, "loss": 0.7572726, "num_input_tokens_seen": 99397655, "step": 4604, "time_per_iteration": 2.6214828491210938 }, { "auxiliary_loss_clip": 0.01176373, "auxiliary_loss_mlp": 0.00710975, "balance_loss_clip": 1.05441916, "balance_loss_mlp": 1.00035429, "epoch": 0.5537185113930139, "flos": 24352534258560.0, "grad_norm": 1.8950229311457607, "language_loss": 0.7392211, "learning_rate": 1.749828151356823e-06, "loss": 0.75809461, "num_input_tokens_seen": 99417850, "step": 4605, "time_per_iteration": 2.5879130363464355 }, { "auxiliary_loss_clip": 0.01146098, "auxiliary_loss_mlp": 0.01024658, "balance_loss_clip": 1.04994559, "balance_loss_mlp": 1.01778007, "epoch": 0.553838754283653, "flos": 23549068886400.0, "grad_norm": 2.106951802239136, "language_loss": 0.75637484, "learning_rate": 1.7490553168160297e-06, "loss": 0.77808243, "num_input_tokens_seen": 99438920, "step": 4606, "time_per_iteration": 2.6998491287231445 }, { "auxiliary_loss_clip": 0.01141796, "auxiliary_loss_mlp": 0.01023955, "balance_loss_clip": 1.04856777, "balance_loss_mlp": 1.01711798, "epoch": 0.5539589971742921, "flos": 17275044205440.0, "grad_norm": 2.581826282868055, "language_loss": 0.76759779, "learning_rate": 1.748282520343345e-06, "loss": 0.78925526, "num_input_tokens_seen": 99457950, "step": 4607, "time_per_iteration": 2.5979371070861816 }, { "auxiliary_loss_clip": 0.01165598, "auxiliary_loss_mlp": 0.01025105, "balance_loss_clip": 1.05144382, "balance_loss_mlp": 1.01758862, "epoch": 0.5540792400649311, "flos": 27564169104000.0, "grad_norm": 2.2749300729532074, "language_loss": 0.78977782, "learning_rate": 1.7475097620560023e-06, "loss": 0.81168485, "num_input_tokens_seen": 99478015, "step": 4608, "time_per_iteration": 2.672621726989746 }, { "auxiliary_loss_clip": 0.01173991, "auxiliary_loss_mlp": 0.01021636, "balance_loss_clip": 1.05162489, "balance_loss_mlp": 1.01484418, "epoch": 0.5541994829555702, "flos": 23878657105920.0, "grad_norm": 2.0875840424524825, "language_loss": 0.71342272, "learning_rate": 1.746737042071228e-06, "loss": 0.73537898, "num_input_tokens_seen": 99496520, "step": 4609, "time_per_iteration": 2.6326475143432617 }, { "auxiliary_loss_clip": 0.01136569, "auxiliary_loss_mlp": 0.01021941, "balance_loss_clip": 1.04806709, "balance_loss_mlp": 1.01461279, "epoch": 0.5543197258462094, "flos": 20115721342080.0, "grad_norm": 2.851862299419618, "language_loss": 0.79151189, "learning_rate": 1.7459643605062424e-06, "loss": 0.813097, "num_input_tokens_seen": 99513780, "step": 4610, "time_per_iteration": 2.64217472076416 }, { "auxiliary_loss_clip": 0.01108261, "auxiliary_loss_mlp": 0.01021291, "balance_loss_clip": 1.04631376, "balance_loss_mlp": 1.01412034, "epoch": 0.5544399687368484, "flos": 20916565021440.0, "grad_norm": 1.9710240963319814, "language_loss": 0.80838263, "learning_rate": 1.745191717478262e-06, "loss": 0.82967818, "num_input_tokens_seen": 99532360, "step": 4611, "time_per_iteration": 3.990579128265381 }, { "auxiliary_loss_clip": 0.01142947, "auxiliary_loss_mlp": 0.01026503, "balance_loss_clip": 1.05090213, "balance_loss_mlp": 1.01976144, "epoch": 0.5545602116274875, "flos": 25518661297920.0, "grad_norm": 1.885962708917919, "language_loss": 0.79569149, "learning_rate": 1.7444191131044948e-06, "loss": 0.81738591, "num_input_tokens_seen": 99552635, "step": 4612, "time_per_iteration": 4.530771255493164 }, { "auxiliary_loss_clip": 0.01144433, "auxiliary_loss_mlp": 0.01023328, "balance_loss_clip": 1.05145645, "balance_loss_mlp": 1.01547778, "epoch": 0.5546804545181266, "flos": 20995568985600.0, "grad_norm": 2.688594183971982, "language_loss": 0.7316587, "learning_rate": 1.7436465475021456e-06, "loss": 0.75333631, "num_input_tokens_seen": 99572685, "step": 4613, "time_per_iteration": 2.595435857772827 }, { "auxiliary_loss_clip": 0.01118262, "auxiliary_loss_mlp": 0.01025029, "balance_loss_clip": 1.04474843, "balance_loss_mlp": 1.01850235, "epoch": 0.5548006974087657, "flos": 26833638297600.0, "grad_norm": 2.108291749903358, "language_loss": 0.7120719, "learning_rate": 1.7428740207884111e-06, "loss": 0.73350477, "num_input_tokens_seen": 99593565, "step": 4614, "time_per_iteration": 2.6758553981781006 }, { "auxiliary_loss_clip": 0.01113135, "auxiliary_loss_mlp": 0.01022471, "balance_loss_clip": 1.04649711, "balance_loss_mlp": 1.01522326, "epoch": 0.5549209402994048, "flos": 33656414031360.0, "grad_norm": 2.810051982274279, "language_loss": 0.61405176, "learning_rate": 1.7421015330804833e-06, "loss": 0.63540781, "num_input_tokens_seen": 99613485, "step": 4615, "time_per_iteration": 2.8289756774902344 }, { "auxiliary_loss_clip": 0.01175143, "auxiliary_loss_mlp": 0.010264, "balance_loss_clip": 1.05156398, "balance_loss_mlp": 1.01948261, "epoch": 0.5550411831900439, "flos": 23769524609280.0, "grad_norm": 2.560502024816696, "language_loss": 0.7239188, "learning_rate": 1.7413290844955475e-06, "loss": 0.74593425, "num_input_tokens_seen": 99633515, "step": 4616, "time_per_iteration": 2.5760483741760254 }, { "auxiliary_loss_clip": 0.01153472, "auxiliary_loss_mlp": 0.01026473, "balance_loss_clip": 1.05129731, "balance_loss_mlp": 1.0192703, "epoch": 0.555161426080683, "flos": 21651189978240.0, "grad_norm": 2.193127685854045, "language_loss": 0.78043205, "learning_rate": 1.7405566751507843e-06, "loss": 0.80223149, "num_input_tokens_seen": 99651560, "step": 4617, "time_per_iteration": 2.587587833404541 }, { "auxiliary_loss_clip": 0.01124837, "auxiliary_loss_mlp": 0.01023115, "balance_loss_clip": 1.04652584, "balance_loss_mlp": 1.01641798, "epoch": 0.555281668971322, "flos": 49563116605440.0, "grad_norm": 1.7627240472169747, "language_loss": 0.67826271, "learning_rate": 1.7397843051633668e-06, "loss": 0.69974226, "num_input_tokens_seen": 99674255, "step": 4618, "time_per_iteration": 2.941660165786743 }, { "auxiliary_loss_clip": 0.01155801, "auxiliary_loss_mlp": 0.01021597, "balance_loss_clip": 1.0494473, "balance_loss_mlp": 1.01445675, "epoch": 0.5554019118619612, "flos": 20741608851840.0, "grad_norm": 1.8700622392805004, "language_loss": 0.71432841, "learning_rate": 1.739011974650464e-06, "loss": 0.73610234, "num_input_tokens_seen": 99693585, "step": 4619, "time_per_iteration": 2.5682575702667236 }, { "auxiliary_loss_clip": 0.01115857, "auxiliary_loss_mlp": 0.01027112, "balance_loss_clip": 1.04693747, "balance_loss_mlp": 1.01919079, "epoch": 0.5555221547526003, "flos": 25483217552640.0, "grad_norm": 2.6677797633118647, "language_loss": 0.769171, "learning_rate": 1.7382396837292365e-06, "loss": 0.79060066, "num_input_tokens_seen": 99714045, "step": 4620, "time_per_iteration": 2.778939962387085 }, { "auxiliary_loss_clip": 0.01177671, "auxiliary_loss_mlp": 0.01029458, "balance_loss_clip": 1.05295086, "balance_loss_mlp": 1.02206397, "epoch": 0.5556423976432393, "flos": 21762513204480.0, "grad_norm": 1.821308040118028, "language_loss": 0.734281, "learning_rate": 1.737467432516841e-06, "loss": 0.75635231, "num_input_tokens_seen": 99734145, "step": 4621, "time_per_iteration": 2.557917833328247 }, { "auxiliary_loss_clip": 0.01142295, "auxiliary_loss_mlp": 0.01027965, "balance_loss_clip": 1.04605269, "balance_loss_mlp": 1.02024579, "epoch": 0.5557626405338785, "flos": 24900171989760.0, "grad_norm": 2.927098670874654, "language_loss": 0.73667908, "learning_rate": 1.7366952211304274e-06, "loss": 0.75838161, "num_input_tokens_seen": 99751990, "step": 4622, "time_per_iteration": 2.6733994483947754 }, { "auxiliary_loss_clip": 0.01133989, "auxiliary_loss_mlp": 0.01024402, "balance_loss_clip": 1.04608059, "balance_loss_mlp": 1.01756573, "epoch": 0.5558828834245175, "flos": 18697501676160.0, "grad_norm": 2.4968237713023917, "language_loss": 0.83689481, "learning_rate": 1.735923049687139e-06, "loss": 0.85847872, "num_input_tokens_seen": 99768565, "step": 4623, "time_per_iteration": 2.6118321418762207 }, { "auxiliary_loss_clip": 0.01138554, "auxiliary_loss_mlp": 0.01026523, "balance_loss_clip": 1.04711926, "balance_loss_mlp": 1.01899469, "epoch": 0.5560031263151566, "flos": 27272179445760.0, "grad_norm": 1.9916082677125848, "language_loss": 0.74013543, "learning_rate": 1.7351509183041144e-06, "loss": 0.76178622, "num_input_tokens_seen": 99788895, "step": 4624, "time_per_iteration": 2.744051456451416 }, { "auxiliary_loss_clip": 0.01177816, "auxiliary_loss_mlp": 0.01026512, "balance_loss_clip": 1.0532248, "balance_loss_mlp": 1.01916826, "epoch": 0.5561233692057957, "flos": 23403738458880.0, "grad_norm": 1.844854294478466, "language_loss": 0.7166028, "learning_rate": 1.7343788270984852e-06, "loss": 0.73864603, "num_input_tokens_seen": 99808035, "step": 4625, "time_per_iteration": 2.613614797592163 }, { "auxiliary_loss_clip": 0.01143791, "auxiliary_loss_mlp": 0.01024629, "balance_loss_clip": 1.05110812, "balance_loss_mlp": 1.01735115, "epoch": 0.5562436120964348, "flos": 37670867804160.0, "grad_norm": 1.955785013783351, "language_loss": 0.74948728, "learning_rate": 1.7336067761873764e-06, "loss": 0.77117151, "num_input_tokens_seen": 99830460, "step": 4626, "time_per_iteration": 2.7500131130218506 }, { "auxiliary_loss_clip": 0.01164729, "auxiliary_loss_mlp": 0.01028876, "balance_loss_clip": 1.05100787, "balance_loss_mlp": 1.02127075, "epoch": 0.5563638549870739, "flos": 25155245445120.0, "grad_norm": 2.673289833421202, "language_loss": 0.76403069, "learning_rate": 1.7328347656879076e-06, "loss": 0.78596675, "num_input_tokens_seen": 99850320, "step": 4627, "time_per_iteration": 2.622579574584961 }, { "auxiliary_loss_clip": 0.01128176, "auxiliary_loss_mlp": 0.01030836, "balance_loss_clip": 1.04630995, "balance_loss_mlp": 1.0233264, "epoch": 0.556484097877713, "flos": 13581810783360.0, "grad_norm": 2.7312658215798766, "language_loss": 0.68644643, "learning_rate": 1.7320627957171927e-06, "loss": 0.70803654, "num_input_tokens_seen": 99864980, "step": 4628, "time_per_iteration": 2.6011133193969727 }, { "auxiliary_loss_clip": 0.011776, "auxiliary_loss_mlp": 0.01030237, "balance_loss_clip": 1.05530334, "balance_loss_mlp": 1.02288198, "epoch": 0.5566043407683521, "flos": 24681368292480.0, "grad_norm": 2.089425642443102, "language_loss": 0.81413257, "learning_rate": 1.7312908663923382e-06, "loss": 0.83621097, "num_input_tokens_seen": 99881155, "step": 4629, "time_per_iteration": 2.5363471508026123 }, { "auxiliary_loss_clip": 0.0115384, "auxiliary_loss_mlp": 0.01022253, "balance_loss_clip": 1.04939473, "balance_loss_mlp": 1.01457012, "epoch": 0.5567245836589911, "flos": 20588161950720.0, "grad_norm": 1.9821293485992488, "language_loss": 0.67693925, "learning_rate": 1.7305189778304463e-06, "loss": 0.69870019, "num_input_tokens_seen": 99899330, "step": 4630, "time_per_iteration": 2.576176881790161 }, { "auxiliary_loss_clip": 0.01147895, "auxiliary_loss_mlp": 0.01020024, "balance_loss_clip": 1.05533779, "balance_loss_mlp": 1.01324081, "epoch": 0.5568448265496303, "flos": 20704189858560.0, "grad_norm": 3.821877215384538, "language_loss": 0.79975742, "learning_rate": 1.729747130148611e-06, "loss": 0.82143658, "num_input_tokens_seen": 99918525, "step": 4631, "time_per_iteration": 2.652078628540039 }, { "auxiliary_loss_clip": 0.01135532, "auxiliary_loss_mlp": 0.01031256, "balance_loss_clip": 1.04970968, "balance_loss_mlp": 1.02348363, "epoch": 0.5569650694402694, "flos": 25302910256640.0, "grad_norm": 2.31281772900147, "language_loss": 0.76921892, "learning_rate": 1.7289753234639208e-06, "loss": 0.79088676, "num_input_tokens_seen": 99937500, "step": 4632, "time_per_iteration": 2.6985292434692383 }, { "auxiliary_loss_clip": 0.01166642, "auxiliary_loss_mlp": 0.01024241, "balance_loss_clip": 1.05313373, "balance_loss_mlp": 1.01647401, "epoch": 0.5570853123309084, "flos": 19712623939200.0, "grad_norm": 1.8866865703196976, "language_loss": 0.76599222, "learning_rate": 1.7282035578934592e-06, "loss": 0.78790104, "num_input_tokens_seen": 99955665, "step": 4633, "time_per_iteration": 2.6092047691345215 }, { "auxiliary_loss_clip": 0.0113753, "auxiliary_loss_mlp": 0.01025338, "balance_loss_clip": 1.05057597, "balance_loss_mlp": 1.01817989, "epoch": 0.5572055552215476, "flos": 16108091153280.0, "grad_norm": 1.74754749870508, "language_loss": 0.78764385, "learning_rate": 1.727431833554301e-06, "loss": 0.80927253, "num_input_tokens_seen": 99974140, "step": 4634, "time_per_iteration": 2.599715232849121 }, { "auxiliary_loss_clip": 0.01103349, "auxiliary_loss_mlp": 0.01025439, "balance_loss_clip": 1.04440618, "balance_loss_mlp": 1.01852477, "epoch": 0.5573257981121866, "flos": 17128815937920.0, "grad_norm": 1.9377647125044077, "language_loss": 0.77208889, "learning_rate": 1.7266601505635175e-06, "loss": 0.7933768, "num_input_tokens_seen": 99991480, "step": 4635, "time_per_iteration": 2.6899123191833496 }, { "auxiliary_loss_clip": 0.01161572, "auxiliary_loss_mlp": 0.01029168, "balance_loss_clip": 1.05280149, "balance_loss_mlp": 1.02163982, "epoch": 0.5574460410028257, "flos": 18807029222400.0, "grad_norm": 3.0216527458576454, "language_loss": 0.75401163, "learning_rate": 1.7258885090381717e-06, "loss": 0.77591902, "num_input_tokens_seen": 100009520, "step": 4636, "time_per_iteration": 2.5738978385925293 }, { "auxiliary_loss_clip": 0.01150447, "auxiliary_loss_mlp": 0.01019632, "balance_loss_clip": 1.05215216, "balance_loss_mlp": 1.01258671, "epoch": 0.5575662838934649, "flos": 29642678530560.0, "grad_norm": 2.0143214722940663, "language_loss": 0.78663075, "learning_rate": 1.7251169090953213e-06, "loss": 0.80833149, "num_input_tokens_seen": 100029995, "step": 4637, "time_per_iteration": 3.5653083324432373 }, { "auxiliary_loss_clip": 0.01159024, "auxiliary_loss_mlp": 0.01023528, "balance_loss_clip": 1.0505476, "balance_loss_mlp": 1.0160296, "epoch": 0.5576865267841039, "flos": 22054466949120.0, "grad_norm": 2.5699587548435403, "language_loss": 0.76803493, "learning_rate": 1.7243453508520168e-06, "loss": 0.78986043, "num_input_tokens_seen": 100046980, "step": 4638, "time_per_iteration": 5.367725133895874 }, { "auxiliary_loss_clip": 0.01143842, "auxiliary_loss_mlp": 0.01031139, "balance_loss_clip": 1.0477953, "balance_loss_mlp": 1.02393329, "epoch": 0.557806769674743, "flos": 17196040241280.0, "grad_norm": 5.578822900499228, "language_loss": 0.84801728, "learning_rate": 1.7235738344253038e-06, "loss": 0.86976707, "num_input_tokens_seen": 100060610, "step": 4639, "time_per_iteration": 2.605581521987915 }, { "auxiliary_loss_clip": 0.01161964, "auxiliary_loss_mlp": 0.01029298, "balance_loss_clip": 1.0547719, "balance_loss_mlp": 1.02188909, "epoch": 0.557927012565382, "flos": 24712717887360.0, "grad_norm": 3.4729644675813867, "language_loss": 0.82724833, "learning_rate": 1.72280235993222e-06, "loss": 0.84916091, "num_input_tokens_seen": 100078915, "step": 4640, "time_per_iteration": 2.6618480682373047 }, { "auxiliary_loss_clip": 0.01155666, "auxiliary_loss_mlp": 0.00712054, "balance_loss_clip": 1.04968989, "balance_loss_mlp": 1.00044751, "epoch": 0.5580472554560212, "flos": 16983090460800.0, "grad_norm": 2.247798147380609, "language_loss": 0.69304234, "learning_rate": 1.722030927489798e-06, "loss": 0.71171957, "num_input_tokens_seen": 100096195, "step": 4641, "time_per_iteration": 2.5405075550079346 }, { "auxiliary_loss_clip": 0.01130459, "auxiliary_loss_mlp": 0.01023769, "balance_loss_clip": 1.05113459, "balance_loss_mlp": 1.01631284, "epoch": 0.5581674983466602, "flos": 23509100027520.0, "grad_norm": 1.9881196900324554, "language_loss": 0.74384594, "learning_rate": 1.7212595372150634e-06, "loss": 0.76538819, "num_input_tokens_seen": 100116175, "step": 4642, "time_per_iteration": 2.70914888381958 }, { "auxiliary_loss_clip": 0.01176916, "auxiliary_loss_mlp": 0.01026132, "balance_loss_clip": 1.05475569, "balance_loss_mlp": 1.01846051, "epoch": 0.5582877412372993, "flos": 13480291969920.0, "grad_norm": 8.946252188842937, "language_loss": 0.72812271, "learning_rate": 1.720488189225035e-06, "loss": 0.75015318, "num_input_tokens_seen": 100133875, "step": 4643, "time_per_iteration": 2.5476396083831787 }, { "auxiliary_loss_clip": 0.0116265, "auxiliary_loss_mlp": 0.01028053, "balance_loss_clip": 1.05118155, "balance_loss_mlp": 1.01996422, "epoch": 0.5584079841279385, "flos": 21903605827200.0, "grad_norm": 2.5050030162604178, "language_loss": 0.79594523, "learning_rate": 1.7197168836367265e-06, "loss": 0.81785226, "num_input_tokens_seen": 100150685, "step": 4644, "time_per_iteration": 2.6340391635894775 }, { "auxiliary_loss_clip": 0.0115787, "auxiliary_loss_mlp": 0.00711517, "balance_loss_clip": 1.04965019, "balance_loss_mlp": 1.00040317, "epoch": 0.5585282270185775, "flos": 18843550375680.0, "grad_norm": 1.9423118761016718, "language_loss": 0.81890893, "learning_rate": 1.7189456205671433e-06, "loss": 0.83760279, "num_input_tokens_seen": 100169530, "step": 4645, "time_per_iteration": 2.619542121887207 }, { "auxiliary_loss_clip": 0.01168278, "auxiliary_loss_mlp": 0.01024791, "balance_loss_clip": 1.05401897, "balance_loss_mlp": 1.01746535, "epoch": 0.5586484699092166, "flos": 21868449390720.0, "grad_norm": 2.3829926745305525, "language_loss": 0.82549983, "learning_rate": 1.7181744001332866e-06, "loss": 0.84743053, "num_input_tokens_seen": 100188140, "step": 4646, "time_per_iteration": 2.5568325519561768 }, { "auxiliary_loss_clip": 0.01175744, "auxiliary_loss_mlp": 0.01027457, "balance_loss_clip": 1.05457544, "balance_loss_mlp": 1.02052546, "epoch": 0.5587687127998557, "flos": 22893232412160.0, "grad_norm": 2.1016653546881465, "language_loss": 0.63675308, "learning_rate": 1.7174032224521493e-06, "loss": 0.65878516, "num_input_tokens_seen": 100206850, "step": 4647, "time_per_iteration": 2.617837905883789 }, { "auxiliary_loss_clip": 0.01161341, "auxiliary_loss_mlp": 0.01024183, "balance_loss_clip": 1.05191123, "balance_loss_mlp": 1.0172807, "epoch": 0.5588889556904948, "flos": 20303067703680.0, "grad_norm": 1.6535158043932896, "language_loss": 0.69763148, "learning_rate": 1.7166320876407184e-06, "loss": 0.71948665, "num_input_tokens_seen": 100226270, "step": 4648, "time_per_iteration": 2.582139730453491 }, { "auxiliary_loss_clip": 0.01177664, "auxiliary_loss_mlp": 0.00711446, "balance_loss_clip": 1.05375624, "balance_loss_mlp": 1.00048304, "epoch": 0.5590091985811338, "flos": 16472153450880.0, "grad_norm": 2.1371307866098364, "language_loss": 0.68106526, "learning_rate": 1.7158609958159742e-06, "loss": 0.69995642, "num_input_tokens_seen": 100243675, "step": 4649, "time_per_iteration": 2.550870895385742 }, { "auxiliary_loss_clip": 0.01109179, "auxiliary_loss_mlp": 0.01028731, "balance_loss_clip": 1.04766726, "balance_loss_mlp": 1.02125919, "epoch": 0.559129441471773, "flos": 14532186781440.0, "grad_norm": 2.1744493122143265, "language_loss": 0.78229141, "learning_rate": 1.7150899470948911e-06, "loss": 0.80367053, "num_input_tokens_seen": 100258940, "step": 4650, "time_per_iteration": 2.642368793487549 }, { "auxiliary_loss_clip": 0.01061906, "auxiliary_loss_mlp": 0.01003306, "balance_loss_clip": 1.02696753, "balance_loss_mlp": 1.00204265, "epoch": 0.5592496843624121, "flos": 60521009852160.0, "grad_norm": 0.815945976217019, "language_loss": 0.56664741, "learning_rate": 1.7143189415944365e-06, "loss": 0.58729959, "num_input_tokens_seen": 100323400, "step": 4651, "time_per_iteration": 3.2707598209381104 }, { "auxiliary_loss_clip": 0.0115728, "auxiliary_loss_mlp": 0.01023729, "balance_loss_clip": 1.05056489, "balance_loss_mlp": 1.01637423, "epoch": 0.5593699272530511, "flos": 20886256920960.0, "grad_norm": 2.4199249551235837, "language_loss": 0.76426309, "learning_rate": 1.7135479794315714e-06, "loss": 0.78607321, "num_input_tokens_seen": 100340355, "step": 4652, "time_per_iteration": 2.6057543754577637 }, { "auxiliary_loss_clip": 0.01128281, "auxiliary_loss_mlp": 0.01020988, "balance_loss_clip": 1.04979002, "balance_loss_mlp": 1.01371336, "epoch": 0.5594901701436903, "flos": 12896743616640.0, "grad_norm": 2.67948530806508, "language_loss": 0.79007316, "learning_rate": 1.7127770607232502e-06, "loss": 0.81156588, "num_input_tokens_seen": 100358900, "step": 4653, "time_per_iteration": 2.6853132247924805 }, { "auxiliary_loss_clip": 0.01136173, "auxiliary_loss_mlp": 0.01023364, "balance_loss_clip": 1.04840064, "balance_loss_mlp": 1.01622319, "epoch": 0.5596104130343293, "flos": 23112107936640.0, "grad_norm": 2.309207613458718, "language_loss": 0.79818118, "learning_rate": 1.7120061855864204e-06, "loss": 0.81977654, "num_input_tokens_seen": 100378910, "step": 4654, "time_per_iteration": 2.7940971851348877 }, { "auxiliary_loss_clip": 0.01161574, "auxiliary_loss_mlp": 0.0103127, "balance_loss_clip": 1.05400252, "balance_loss_mlp": 1.02385473, "epoch": 0.5597306559249684, "flos": 25957812977280.0, "grad_norm": 2.2021934030677097, "language_loss": 0.71535683, "learning_rate": 1.7112353541380233e-06, "loss": 0.73728526, "num_input_tokens_seen": 100398770, "step": 4655, "time_per_iteration": 2.702986478805542 }, { "auxiliary_loss_clip": 0.01144929, "auxiliary_loss_mlp": 0.01029881, "balance_loss_clip": 1.05132651, "balance_loss_mlp": 1.02251124, "epoch": 0.5598508988156076, "flos": 22492289825280.0, "grad_norm": 1.609551706975601, "language_loss": 0.72294641, "learning_rate": 1.7104645664949931e-06, "loss": 0.74469459, "num_input_tokens_seen": 100421240, "step": 4656, "time_per_iteration": 2.703094959259033 }, { "auxiliary_loss_clip": 0.01145805, "auxiliary_loss_mlp": 0.0103144, "balance_loss_clip": 1.04945385, "balance_loss_mlp": 1.02330434, "epoch": 0.5599711417062466, "flos": 23112538899840.0, "grad_norm": 2.135630505432833, "language_loss": 0.71662945, "learning_rate": 1.7096938227742584e-06, "loss": 0.73840189, "num_input_tokens_seen": 100442370, "step": 4657, "time_per_iteration": 2.674302816390991 }, { "auxiliary_loss_clip": 0.01178462, "auxiliary_loss_mlp": 0.01020257, "balance_loss_clip": 1.0551455, "balance_loss_mlp": 1.01298547, "epoch": 0.5600913845968857, "flos": 22339345714560.0, "grad_norm": 2.4267201907598412, "language_loss": 0.84554839, "learning_rate": 1.70892312309274e-06, "loss": 0.86753559, "num_input_tokens_seen": 100460260, "step": 4658, "time_per_iteration": 2.59442400932312 }, { "auxiliary_loss_clip": 0.01143429, "auxiliary_loss_mlp": 0.01027671, "balance_loss_clip": 1.0452944, "balance_loss_mlp": 1.02024102, "epoch": 0.5602116274875248, "flos": 17633791290240.0, "grad_norm": 2.5864323635366744, "language_loss": 0.68273103, "learning_rate": 1.7081524675673523e-06, "loss": 0.70444202, "num_input_tokens_seen": 100475750, "step": 4659, "time_per_iteration": 2.6258127689361572 }, { "auxiliary_loss_clip": 0.01064706, "auxiliary_loss_mlp": 0.0100146, "balance_loss_clip": 1.02925098, "balance_loss_mlp": 1.00019634, "epoch": 0.5603318703781639, "flos": 70115945529600.0, "grad_norm": 0.7770462629042129, "language_loss": 0.59619403, "learning_rate": 1.7073818563150026e-06, "loss": 0.61685574, "num_input_tokens_seen": 100537830, "step": 4660, "time_per_iteration": 3.2849302291870117 }, { "auxiliary_loss_clip": 0.01156297, "auxiliary_loss_mlp": 0.01028209, "balance_loss_clip": 1.04984629, "balance_loss_mlp": 1.02113128, "epoch": 0.560452113268803, "flos": 18545850455040.0, "grad_norm": 2.966558623825563, "language_loss": 0.86694419, "learning_rate": 1.7066112894525935e-06, "loss": 0.88878918, "num_input_tokens_seen": 100555910, "step": 4661, "time_per_iteration": 2.553678035736084 }, { "auxiliary_loss_clip": 0.01138233, "auxiliary_loss_mlp": 0.01026454, "balance_loss_clip": 1.04892719, "balance_loss_mlp": 1.01889062, "epoch": 0.5605723561594421, "flos": 25264665250560.0, "grad_norm": 1.937752943029123, "language_loss": 0.7278685, "learning_rate": 1.7058407670970177e-06, "loss": 0.7495153, "num_input_tokens_seen": 100577385, "step": 4662, "time_per_iteration": 2.6972243785858154 }, { "auxiliary_loss_clip": 0.01166091, "auxiliary_loss_mlp": 0.01027073, "balance_loss_clip": 1.05216861, "balance_loss_mlp": 1.01957798, "epoch": 0.5606925990500812, "flos": 20594949621120.0, "grad_norm": 5.089386047520329, "language_loss": 0.61525804, "learning_rate": 1.7050702893651643e-06, "loss": 0.63718969, "num_input_tokens_seen": 100596965, "step": 4663, "time_per_iteration": 3.524068593978882 }, { "auxiliary_loss_clip": 0.01162609, "auxiliary_loss_mlp": 0.01026096, "balance_loss_clip": 1.05409849, "balance_loss_mlp": 1.01880682, "epoch": 0.5608128419407202, "flos": 35006044677120.0, "grad_norm": 2.8849229343976277, "language_loss": 0.7566402, "learning_rate": 1.7042998563739134e-06, "loss": 0.77852726, "num_input_tokens_seen": 100615315, "step": 4664, "time_per_iteration": 3.6938974857330322 }, { "auxiliary_loss_clip": 0.0115223, "auxiliary_loss_mlp": 0.01026812, "balance_loss_clip": 1.04996145, "balance_loss_mlp": 1.01901579, "epoch": 0.5609330848313594, "flos": 24639819235200.0, "grad_norm": 3.3749413795509438, "language_loss": 0.71941292, "learning_rate": 1.703529468240139e-06, "loss": 0.74120331, "num_input_tokens_seen": 100634185, "step": 4665, "time_per_iteration": 3.9775161743164062 }, { "auxiliary_loss_clip": 0.01139347, "auxiliary_loss_mlp": 0.01024521, "balance_loss_clip": 1.04843211, "balance_loss_mlp": 1.01676369, "epoch": 0.5610533277219985, "flos": 18762894385920.0, "grad_norm": 7.841536897070591, "language_loss": 0.73401082, "learning_rate": 1.7027591250807088e-06, "loss": 0.75564957, "num_input_tokens_seen": 100651360, "step": 4666, "time_per_iteration": 2.6530635356903076 }, { "auxiliary_loss_clip": 0.01178633, "auxiliary_loss_mlp": 0.01027095, "balance_loss_clip": 1.05420804, "balance_loss_mlp": 1.01995432, "epoch": 0.5611735706126375, "flos": 15012384727680.0, "grad_norm": 2.3333569357741895, "language_loss": 0.84365964, "learning_rate": 1.7019888270124825e-06, "loss": 0.86571693, "num_input_tokens_seen": 100668525, "step": 4667, "time_per_iteration": 2.5146877765655518 }, { "auxiliary_loss_clip": 0.01164261, "auxiliary_loss_mlp": 0.0102962, "balance_loss_clip": 1.05355656, "balance_loss_mlp": 1.02233887, "epoch": 0.5612938135032767, "flos": 16468167041280.0, "grad_norm": 3.2252816194204628, "language_loss": 0.82036233, "learning_rate": 1.7012185741523147e-06, "loss": 0.84230113, "num_input_tokens_seen": 100684850, "step": 4668, "time_per_iteration": 2.655428886413574 }, { "auxiliary_loss_clip": 0.01178126, "auxiliary_loss_mlp": 0.01026434, "balance_loss_clip": 1.05426693, "balance_loss_mlp": 1.01921856, "epoch": 0.5614140563939157, "flos": 25666433850240.0, "grad_norm": 2.873498079602762, "language_loss": 0.63073552, "learning_rate": 1.7004483666170514e-06, "loss": 0.65278119, "num_input_tokens_seen": 100705345, "step": 4669, "time_per_iteration": 2.5998589992523193 }, { "auxiliary_loss_clip": 0.01157876, "auxiliary_loss_mlp": 0.01021479, "balance_loss_clip": 1.04938006, "balance_loss_mlp": 1.01449871, "epoch": 0.5615342992845548, "flos": 24717566223360.0, "grad_norm": 2.2665110449725336, "language_loss": 0.80689627, "learning_rate": 1.699678204523533e-06, "loss": 0.82868981, "num_input_tokens_seen": 100725210, "step": 4670, "time_per_iteration": 2.6148130893707275 }, { "auxiliary_loss_clip": 0.01150148, "auxiliary_loss_mlp": 0.0102162, "balance_loss_clip": 1.05416834, "balance_loss_mlp": 1.01405048, "epoch": 0.5616545421751938, "flos": 22015934634240.0, "grad_norm": 12.57120408847727, "language_loss": 0.68673801, "learning_rate": 1.6989080879885918e-06, "loss": 0.70845568, "num_input_tokens_seen": 100743070, "step": 4671, "time_per_iteration": 2.6487650871276855 }, { "auxiliary_loss_clip": 0.01049915, "auxiliary_loss_mlp": 0.01004857, "balance_loss_clip": 1.02509725, "balance_loss_mlp": 1.00359297, "epoch": 0.561774785065833, "flos": 53760358690560.0, "grad_norm": 0.8853214628555268, "language_loss": 0.6099028, "learning_rate": 1.6981380171290544e-06, "loss": 0.63045043, "num_input_tokens_seen": 100804095, "step": 4672, "time_per_iteration": 3.2355539798736572 }, { "auxiliary_loss_clip": 0.01138883, "auxiliary_loss_mlp": 0.01025208, "balance_loss_clip": 1.04583788, "balance_loss_mlp": 1.01807904, "epoch": 0.5618950279564721, "flos": 19750007018880.0, "grad_norm": 2.103920932874562, "language_loss": 0.74549288, "learning_rate": 1.6973679920617396e-06, "loss": 0.76713383, "num_input_tokens_seen": 100821630, "step": 4673, "time_per_iteration": 2.594743013381958 }, { "auxiliary_loss_clip": 0.01144352, "auxiliary_loss_mlp": 0.01027164, "balance_loss_clip": 1.05118048, "balance_loss_mlp": 1.01938593, "epoch": 0.5620152708471111, "flos": 16800592435200.0, "grad_norm": 1.9993975426798682, "language_loss": 0.85417938, "learning_rate": 1.6965980129034603e-06, "loss": 0.87589461, "num_input_tokens_seen": 100839015, "step": 4674, "time_per_iteration": 2.6515684127807617 }, { "auxiliary_loss_clip": 0.01147805, "auxiliary_loss_mlp": 0.01029782, "balance_loss_clip": 1.05340719, "balance_loss_mlp": 1.0225817, "epoch": 0.5621355137377503, "flos": 26797799502720.0, "grad_norm": 1.86802886209192, "language_loss": 0.76528347, "learning_rate": 1.6958280797710209e-06, "loss": 0.78705931, "num_input_tokens_seen": 100860940, "step": 4675, "time_per_iteration": 2.655200242996216 }, { "auxiliary_loss_clip": 0.01063256, "auxiliary_loss_mlp": 0.01003188, "balance_loss_clip": 1.02668262, "balance_loss_mlp": 1.0019778, "epoch": 0.5622557566283893, "flos": 61207046686080.0, "grad_norm": 0.7142943287820275, "language_loss": 0.54775518, "learning_rate": 1.6950581927812198e-06, "loss": 0.56841958, "num_input_tokens_seen": 100920510, "step": 4676, "time_per_iteration": 3.0911924839019775 }, { "auxiliary_loss_clip": 0.01161288, "auxiliary_loss_mlp": 0.01024402, "balance_loss_clip": 1.05031812, "balance_loss_mlp": 1.01671922, "epoch": 0.5623759995190284, "flos": 26468534505600.0, "grad_norm": 2.2085191250295644, "language_loss": 0.79229963, "learning_rate": 1.6942883520508486e-06, "loss": 0.81415653, "num_input_tokens_seen": 100939245, "step": 4677, "time_per_iteration": 2.665050506591797 }, { "auxiliary_loss_clip": 0.01160566, "auxiliary_loss_mlp": 0.01025933, "balance_loss_clip": 1.05046177, "balance_loss_mlp": 1.01828575, "epoch": 0.5624962424096676, "flos": 19390900798080.0, "grad_norm": 3.1442196078197195, "language_loss": 0.77193952, "learning_rate": 1.693518557696691e-06, "loss": 0.79380453, "num_input_tokens_seen": 100958385, "step": 4678, "time_per_iteration": 2.594191312789917 }, { "auxiliary_loss_clip": 0.01158372, "auxiliary_loss_mlp": 0.0102355, "balance_loss_clip": 1.04893017, "balance_loss_mlp": 1.01620626, "epoch": 0.5626164853003066, "flos": 20667345482880.0, "grad_norm": 3.017673576931804, "language_loss": 0.89169282, "learning_rate": 1.6927488098355252e-06, "loss": 0.91351199, "num_input_tokens_seen": 100976015, "step": 4679, "time_per_iteration": 2.6544554233551025 }, { "auxiliary_loss_clip": 0.01044862, "auxiliary_loss_mlp": 0.01002408, "balance_loss_clip": 1.02763295, "balance_loss_mlp": 1.0012219, "epoch": 0.5627367281909457, "flos": 62766071665920.0, "grad_norm": 0.8979484160458037, "language_loss": 0.63177645, "learning_rate": 1.6919791085841201e-06, "loss": 0.65224916, "num_input_tokens_seen": 101033425, "step": 4680, "time_per_iteration": 3.21317720413208 }, { "auxiliary_loss_clip": 0.01153916, "auxiliary_loss_mlp": 0.01026938, "balance_loss_clip": 1.04774737, "balance_loss_mlp": 1.01882565, "epoch": 0.5628569710815848, "flos": 12787144243200.0, "grad_norm": 2.0537856232261293, "language_loss": 0.7859087, "learning_rate": 1.6912094540592396e-06, "loss": 0.80771726, "num_input_tokens_seen": 101048945, "step": 4681, "time_per_iteration": 2.590718984603882 }, { "auxiliary_loss_clip": 0.01158577, "auxiliary_loss_mlp": 0.01023334, "balance_loss_clip": 1.05172932, "balance_loss_mlp": 1.01559162, "epoch": 0.5629772139722239, "flos": 13762082165760.0, "grad_norm": 2.691407096338392, "language_loss": 0.81314993, "learning_rate": 1.6904398463776393e-06, "loss": 0.83496904, "num_input_tokens_seen": 101062745, "step": 4682, "time_per_iteration": 2.5262277126312256 }, { "auxiliary_loss_clip": 0.0116034, "auxiliary_loss_mlp": 0.01024501, "balance_loss_clip": 1.04982591, "balance_loss_mlp": 1.01756024, "epoch": 0.5630974568628629, "flos": 21467830026240.0, "grad_norm": 1.9922173101086298, "language_loss": 0.73038411, "learning_rate": 1.6896702856560683e-06, "loss": 0.75223249, "num_input_tokens_seen": 101081840, "step": 4683, "time_per_iteration": 2.6243715286254883 }, { "auxiliary_loss_clip": 0.01125891, "auxiliary_loss_mlp": 0.01030513, "balance_loss_clip": 1.04487038, "balance_loss_mlp": 1.02354264, "epoch": 0.5632176997535021, "flos": 14245907385600.0, "grad_norm": 15.826319905882805, "language_loss": 0.69845629, "learning_rate": 1.6889007720112677e-06, "loss": 0.72002035, "num_input_tokens_seen": 101099585, "step": 4684, "time_per_iteration": 2.6323723793029785 }, { "auxiliary_loss_clip": 0.01162851, "auxiliary_loss_mlp": 0.0102285, "balance_loss_clip": 1.05192101, "balance_loss_mlp": 1.01609087, "epoch": 0.5633379426441412, "flos": 20812244947200.0, "grad_norm": 2.318148716513872, "language_loss": 0.77323115, "learning_rate": 1.6881313055599734e-06, "loss": 0.79508817, "num_input_tokens_seen": 101119515, "step": 4685, "time_per_iteration": 2.635664701461792 }, { "auxiliary_loss_clip": 0.01132962, "auxiliary_loss_mlp": 0.01028317, "balance_loss_clip": 1.04639125, "balance_loss_mlp": 1.0206337, "epoch": 0.5634581855347802, "flos": 22600883617920.0, "grad_norm": 8.097108206006872, "language_loss": 0.8216396, "learning_rate": 1.6873618864189117e-06, "loss": 0.84325236, "num_input_tokens_seen": 101135285, "step": 4686, "time_per_iteration": 2.6357240676879883 }, { "auxiliary_loss_clip": 0.01158819, "auxiliary_loss_mlp": 0.01024123, "balance_loss_clip": 1.04938471, "balance_loss_mlp": 1.01622546, "epoch": 0.5635784284254194, "flos": 21506972872320.0, "grad_norm": 2.3669393215953924, "language_loss": 0.77977085, "learning_rate": 1.686592514704803e-06, "loss": 0.80160022, "num_input_tokens_seen": 101152680, "step": 4687, "time_per_iteration": 2.611034393310547 }, { "auxiliary_loss_clip": 0.0114579, "auxiliary_loss_mlp": 0.01024852, "balance_loss_clip": 1.05304062, "balance_loss_mlp": 1.01800895, "epoch": 0.5636986713160584, "flos": 19827466698240.0, "grad_norm": 2.3985905464928536, "language_loss": 0.71046281, "learning_rate": 1.685823190534361e-06, "loss": 0.73216921, "num_input_tokens_seen": 101170920, "step": 4688, "time_per_iteration": 2.6828508377075195 }, { "auxiliary_loss_clip": 0.0118138, "auxiliary_loss_mlp": 0.01026022, "balance_loss_clip": 1.05444896, "balance_loss_mlp": 1.01850331, "epoch": 0.5638189142066975, "flos": 19792453916160.0, "grad_norm": 3.101653511111054, "language_loss": 0.84077251, "learning_rate": 1.6850539140242907e-06, "loss": 0.86284649, "num_input_tokens_seen": 101190180, "step": 4689, "time_per_iteration": 3.5475828647613525 }, { "auxiliary_loss_clip": 0.01163597, "auxiliary_loss_mlp": 0.0102775, "balance_loss_clip": 1.0509603, "balance_loss_mlp": 1.02042723, "epoch": 0.5639391570973367, "flos": 22893771116160.0, "grad_norm": 2.4009969800766506, "language_loss": 0.82233119, "learning_rate": 1.684284685291292e-06, "loss": 0.8442446, "num_input_tokens_seen": 101211825, "step": 4690, "time_per_iteration": 4.430718421936035 }, { "auxiliary_loss_clip": 0.01178528, "auxiliary_loss_mlp": 0.01027433, "balance_loss_clip": 1.05409956, "balance_loss_mlp": 1.01938081, "epoch": 0.5640593999879757, "flos": 23727077712000.0, "grad_norm": 1.9861377188881482, "language_loss": 0.81288689, "learning_rate": 1.683515504452055e-06, "loss": 0.83494651, "num_input_tokens_seen": 101229200, "step": 4691, "time_per_iteration": 3.4975368976593018 }, { "auxiliary_loss_clip": 0.01121869, "auxiliary_loss_mlp": 0.01028079, "balance_loss_clip": 1.04669511, "balance_loss_mlp": 1.01994336, "epoch": 0.5641796428786148, "flos": 22710123855360.0, "grad_norm": 1.6155310080337337, "language_loss": 0.66215324, "learning_rate": 1.6827463716232648e-06, "loss": 0.68365276, "num_input_tokens_seen": 101249860, "step": 4692, "time_per_iteration": 2.663404703140259 }, { "auxiliary_loss_clip": 0.01162086, "auxiliary_loss_mlp": 0.00711289, "balance_loss_clip": 1.05322087, "balance_loss_mlp": 1.00042367, "epoch": 0.5642998857692539, "flos": 19791987039360.0, "grad_norm": 1.654340829433674, "language_loss": 0.75830561, "learning_rate": 1.6819772869215972e-06, "loss": 0.77703929, "num_input_tokens_seen": 101268940, "step": 4693, "time_per_iteration": 2.664936065673828 }, { "auxiliary_loss_clip": 0.01148902, "auxiliary_loss_mlp": 0.01019641, "balance_loss_clip": 1.04967153, "balance_loss_mlp": 1.01287913, "epoch": 0.564420128659893, "flos": 23185904428800.0, "grad_norm": 2.654873314489461, "language_loss": 0.82346296, "learning_rate": 1.6812082504637228e-06, "loss": 0.84514844, "num_input_tokens_seen": 101290260, "step": 4694, "time_per_iteration": 2.6460087299346924 }, { "auxiliary_loss_clip": 0.0115893, "auxiliary_loss_mlp": 0.0102462, "balance_loss_clip": 1.05271292, "balance_loss_mlp": 1.01790869, "epoch": 0.564540371550532, "flos": 23258264376960.0, "grad_norm": 2.139340794692042, "language_loss": 0.74307704, "learning_rate": 1.6804392623663025e-06, "loss": 0.76491249, "num_input_tokens_seen": 101311465, "step": 4695, "time_per_iteration": 2.684964895248413 }, { "auxiliary_loss_clip": 0.01156768, "auxiliary_loss_mlp": 0.01023601, "balance_loss_clip": 1.05217743, "balance_loss_mlp": 1.01705003, "epoch": 0.5646606144411712, "flos": 25010058672000.0, "grad_norm": 1.8346377732810866, "language_loss": 0.78258371, "learning_rate": 1.6796703227459935e-06, "loss": 0.80438733, "num_input_tokens_seen": 101329420, "step": 4696, "time_per_iteration": 2.6257622241973877 }, { "auxiliary_loss_clip": 0.01102479, "auxiliary_loss_mlp": 0.01024431, "balance_loss_clip": 1.04226577, "balance_loss_mlp": 1.01694417, "epoch": 0.5647808573318103, "flos": 36539645806080.0, "grad_norm": 2.5296672130205105, "language_loss": 0.76042634, "learning_rate": 1.6789014317194407e-06, "loss": 0.78169549, "num_input_tokens_seen": 101350900, "step": 4697, "time_per_iteration": 2.8082022666931152 }, { "auxiliary_loss_clip": 0.01151091, "auxiliary_loss_mlp": 0.01031328, "balance_loss_clip": 1.05124378, "balance_loss_mlp": 1.0238353, "epoch": 0.5649011002224493, "flos": 22528451842560.0, "grad_norm": 2.5885079271799563, "language_loss": 0.72845924, "learning_rate": 1.6781325894032853e-06, "loss": 0.75028336, "num_input_tokens_seen": 101369860, "step": 4698, "time_per_iteration": 2.644207000732422 }, { "auxiliary_loss_clip": 0.01141168, "auxiliary_loss_mlp": 0.01026789, "balance_loss_clip": 1.05061233, "balance_loss_mlp": 1.01982093, "epoch": 0.5650213431130885, "flos": 18515147304960.0, "grad_norm": 2.1612794000373867, "language_loss": 0.92029202, "learning_rate": 1.6773637959141608e-06, "loss": 0.9419716, "num_input_tokens_seen": 101386835, "step": 4699, "time_per_iteration": 2.806643009185791 }, { "auxiliary_loss_clip": 0.01135748, "auxiliary_loss_mlp": 0.01022122, "balance_loss_clip": 1.04866982, "balance_loss_mlp": 1.01472521, "epoch": 0.5651415860037275, "flos": 17526310819200.0, "grad_norm": 2.344775394826156, "language_loss": 0.66589725, "learning_rate": 1.6765950513686915e-06, "loss": 0.68747592, "num_input_tokens_seen": 101404945, "step": 4700, "time_per_iteration": 2.6418282985687256 }, { "auxiliary_loss_clip": 0.01114153, "auxiliary_loss_mlp": 0.01026731, "balance_loss_clip": 1.04538357, "balance_loss_mlp": 1.01948905, "epoch": 0.5652618288943666, "flos": 25520026014720.0, "grad_norm": 1.6856911970885464, "language_loss": 0.76481557, "learning_rate": 1.675826355883496e-06, "loss": 0.78622437, "num_input_tokens_seen": 101424160, "step": 4701, "time_per_iteration": 2.7275571823120117 }, { "auxiliary_loss_clip": 0.01139441, "auxiliary_loss_mlp": 0.01026072, "balance_loss_clip": 1.05045843, "balance_loss_mlp": 1.01862395, "epoch": 0.5653820717850057, "flos": 19683105937920.0, "grad_norm": 2.2616933942521795, "language_loss": 0.79206437, "learning_rate": 1.6750577095751848e-06, "loss": 0.81371951, "num_input_tokens_seen": 101443270, "step": 4702, "time_per_iteration": 2.6787257194519043 }, { "auxiliary_loss_clip": 0.01175244, "auxiliary_loss_mlp": 0.01019186, "balance_loss_clip": 1.05340505, "balance_loss_mlp": 1.01234031, "epoch": 0.5655023146756448, "flos": 26979722910720.0, "grad_norm": 1.720329044976058, "language_loss": 0.72639245, "learning_rate": 1.6742891125603605e-06, "loss": 0.74833673, "num_input_tokens_seen": 101464175, "step": 4703, "time_per_iteration": 2.590062379837036 }, { "auxiliary_loss_clip": 0.01160649, "auxiliary_loss_mlp": 0.01033035, "balance_loss_clip": 1.05258977, "balance_loss_mlp": 1.0253582, "epoch": 0.5656225575662839, "flos": 27669351104640.0, "grad_norm": 1.9760782199187523, "language_loss": 0.72580516, "learning_rate": 1.6735205649556185e-06, "loss": 0.74774206, "num_input_tokens_seen": 101484045, "step": 4704, "time_per_iteration": 2.6941022872924805 }, { "auxiliary_loss_clip": 0.01130911, "auxiliary_loss_mlp": 0.01025363, "balance_loss_clip": 1.04676652, "balance_loss_mlp": 1.01853251, "epoch": 0.5657428004569229, "flos": 24349732997760.0, "grad_norm": 2.5312300439546243, "language_loss": 0.84807384, "learning_rate": 1.6727520668775476e-06, "loss": 0.8696366, "num_input_tokens_seen": 101504330, "step": 4705, "time_per_iteration": 2.676344871520996 }, { "auxiliary_loss_clip": 0.01177479, "auxiliary_loss_mlp": 0.01025697, "balance_loss_clip": 1.05247009, "balance_loss_mlp": 1.01758218, "epoch": 0.5658630433475621, "flos": 21944041562880.0, "grad_norm": 1.9032734565768514, "language_loss": 0.753901, "learning_rate": 1.6719836184427275e-06, "loss": 0.77593273, "num_input_tokens_seen": 101524635, "step": 4706, "time_per_iteration": 2.583545207977295 }, { "auxiliary_loss_clip": 0.01140635, "auxiliary_loss_mlp": 0.01025872, "balance_loss_clip": 1.04825354, "balance_loss_mlp": 1.01883912, "epoch": 0.5659832862382012, "flos": 30409012218240.0, "grad_norm": 1.8737451800736629, "language_loss": 0.64559805, "learning_rate": 1.671215219767733e-06, "loss": 0.66726315, "num_input_tokens_seen": 101544095, "step": 4707, "time_per_iteration": 2.6692986488342285 }, { "auxiliary_loss_clip": 0.01115275, "auxiliary_loss_mlp": 0.01025926, "balance_loss_clip": 1.04685736, "balance_loss_mlp": 1.01868987, "epoch": 0.5661035291288402, "flos": 13188194570880.0, "grad_norm": 2.2671774783341774, "language_loss": 0.76206136, "learning_rate": 1.670446870969127e-06, "loss": 0.78347331, "num_input_tokens_seen": 101561760, "step": 4708, "time_per_iteration": 2.720320463180542 }, { "auxiliary_loss_clip": 0.01146085, "auxiliary_loss_mlp": 0.01026067, "balance_loss_clip": 1.05044007, "balance_loss_mlp": 1.01844382, "epoch": 0.5662237720194794, "flos": 16143032108160.0, "grad_norm": 2.6985710298264327, "language_loss": 0.80099982, "learning_rate": 1.6696785721634685e-06, "loss": 0.82272136, "num_input_tokens_seen": 101576245, "step": 4709, "time_per_iteration": 2.6126389503479004 }, { "auxiliary_loss_clip": 0.0116234, "auxiliary_loss_mlp": 0.01030497, "balance_loss_clip": 1.05149662, "balance_loss_mlp": 1.02296853, "epoch": 0.5663440149101184, "flos": 17676848718720.0, "grad_norm": 2.5753341414040616, "language_loss": 0.73758847, "learning_rate": 1.6689103234673086e-06, "loss": 0.75951684, "num_input_tokens_seen": 101594565, "step": 4710, "time_per_iteration": 2.7300031185150146 }, { "auxiliary_loss_clip": 0.0114286, "auxiliary_loss_mlp": 0.01025102, "balance_loss_clip": 1.05099726, "balance_loss_mlp": 1.01815248, "epoch": 0.5664642578007575, "flos": 23368330627200.0, "grad_norm": 2.0252261588456126, "language_loss": 0.77069312, "learning_rate": 1.668142124997189e-06, "loss": 0.79237276, "num_input_tokens_seen": 101614225, "step": 4711, "time_per_iteration": 2.6579246520996094 }, { "auxiliary_loss_clip": 0.01054816, "auxiliary_loss_mlp": 0.01004071, "balance_loss_clip": 1.02624464, "balance_loss_mlp": 1.00283754, "epoch": 0.5665845006913967, "flos": 65516470945920.0, "grad_norm": 0.7311958808416332, "language_loss": 0.59813476, "learning_rate": 1.6673739768696453e-06, "loss": 0.61872363, "num_input_tokens_seen": 101680795, "step": 4712, "time_per_iteration": 3.2422828674316406 }, { "auxiliary_loss_clip": 0.01149546, "auxiliary_loss_mlp": 0.01025991, "balance_loss_clip": 1.04936814, "balance_loss_mlp": 1.01887369, "epoch": 0.5667047435820357, "flos": 26140885620480.0, "grad_norm": 1.8947233345717145, "language_loss": 0.77472866, "learning_rate": 1.6666058792012052e-06, "loss": 0.79648399, "num_input_tokens_seen": 101701680, "step": 4713, "time_per_iteration": 2.728994607925415 }, { "auxiliary_loss_clip": 0.0107902, "auxiliary_loss_mlp": 0.01001619, "balance_loss_clip": 1.02850938, "balance_loss_mlp": 1.00045037, "epoch": 0.5668249864726748, "flos": 71866949725440.0, "grad_norm": 0.867993337668176, "language_loss": 0.68711531, "learning_rate": 1.6658378321083878e-06, "loss": 0.70792174, "num_input_tokens_seen": 101766010, "step": 4714, "time_per_iteration": 3.220780372619629 }, { "auxiliary_loss_clip": 0.01101211, "auxiliary_loss_mlp": 0.01024527, "balance_loss_clip": 1.04542375, "balance_loss_mlp": 1.0174098, "epoch": 0.5669452293633139, "flos": 22195667312640.0, "grad_norm": 2.734688172474304, "language_loss": 0.82561707, "learning_rate": 1.6650698357077055e-06, "loss": 0.84687448, "num_input_tokens_seen": 101783055, "step": 4715, "time_per_iteration": 3.607481002807617 }, { "auxiliary_loss_clip": 0.01149055, "auxiliary_loss_mlp": 0.01024184, "balance_loss_clip": 1.04958498, "balance_loss_mlp": 1.01730323, "epoch": 0.567065472253953, "flos": 18223193560320.0, "grad_norm": 2.7222639017663584, "language_loss": 0.81570697, "learning_rate": 1.6643018901156632e-06, "loss": 0.83743936, "num_input_tokens_seen": 101802150, "step": 4716, "time_per_iteration": 4.498487710952759 }, { "auxiliary_loss_clip": 0.01147188, "auxiliary_loss_mlp": 0.01022465, "balance_loss_clip": 1.04816508, "balance_loss_mlp": 1.01543176, "epoch": 0.567185715144592, "flos": 20371548983040.0, "grad_norm": 2.9327248992383947, "language_loss": 0.79497439, "learning_rate": 1.6635339954487566e-06, "loss": 0.81667089, "num_input_tokens_seen": 101818025, "step": 4717, "time_per_iteration": 3.5797274112701416 }, { "auxiliary_loss_clip": 0.01147261, "auxiliary_loss_mlp": 0.01023716, "balance_loss_clip": 1.04928887, "balance_loss_mlp": 1.01656365, "epoch": 0.5673059580352312, "flos": 23221348174080.0, "grad_norm": 2.7979360853407975, "language_loss": 0.82061148, "learning_rate": 1.6627661518234765e-06, "loss": 0.84232128, "num_input_tokens_seen": 101837280, "step": 4718, "time_per_iteration": 2.605804920196533 }, { "auxiliary_loss_clip": 0.01119428, "auxiliary_loss_mlp": 0.0103005, "balance_loss_clip": 1.05085349, "balance_loss_mlp": 1.02238441, "epoch": 0.5674262009258703, "flos": 21719599430400.0, "grad_norm": 2.052142471132413, "language_loss": 0.85480076, "learning_rate": 1.661998359356302e-06, "loss": 0.87629557, "num_input_tokens_seen": 101856310, "step": 4719, "time_per_iteration": 2.7105486392974854 }, { "auxiliary_loss_clip": 0.01087749, "auxiliary_loss_mlp": 0.0100212, "balance_loss_clip": 1.02648246, "balance_loss_mlp": 1.00096989, "epoch": 0.5675464438165093, "flos": 67470369114240.0, "grad_norm": 0.743918707003674, "language_loss": 0.55764329, "learning_rate": 1.6612306181637077e-06, "loss": 0.57854199, "num_input_tokens_seen": 101915635, "step": 4720, "time_per_iteration": 3.1826677322387695 }, { "auxiliary_loss_clip": 0.01129638, "auxiliary_loss_mlp": 0.0102064, "balance_loss_clip": 1.04909539, "balance_loss_mlp": 1.01359439, "epoch": 0.5676666867071485, "flos": 18879173688960.0, "grad_norm": 3.0633598316433193, "language_loss": 0.65579534, "learning_rate": 1.6604629283621598e-06, "loss": 0.67729813, "num_input_tokens_seen": 101933565, "step": 4721, "time_per_iteration": 2.6559412479400635 }, { "auxiliary_loss_clip": 0.01179235, "auxiliary_loss_mlp": 0.01023736, "balance_loss_clip": 1.05302978, "balance_loss_mlp": 1.01583886, "epoch": 0.5677869295977875, "flos": 33546778744320.0, "grad_norm": 1.8960532372612677, "language_loss": 0.74198532, "learning_rate": 1.6596952900681152e-06, "loss": 0.76401502, "num_input_tokens_seen": 101954325, "step": 4722, "time_per_iteration": 2.6763768196105957 }, { "auxiliary_loss_clip": 0.01105163, "auxiliary_loss_mlp": 0.01030296, "balance_loss_clip": 1.04783201, "balance_loss_mlp": 1.02253556, "epoch": 0.5679071724884266, "flos": 28037256157440.0, "grad_norm": 2.6346293683528175, "language_loss": 0.82164156, "learning_rate": 1.658927703398025e-06, "loss": 0.84299612, "num_input_tokens_seen": 101974390, "step": 4723, "time_per_iteration": 2.708535671234131 }, { "auxiliary_loss_clip": 0.01109625, "auxiliary_loss_mlp": 0.01023445, "balance_loss_clip": 1.0414753, "balance_loss_mlp": 1.0165695, "epoch": 0.5680274153790658, "flos": 23550110380800.0, "grad_norm": 2.3123047032093327, "language_loss": 0.78119165, "learning_rate": 1.6581601684683309e-06, "loss": 0.8025223, "num_input_tokens_seen": 101994815, "step": 4724, "time_per_iteration": 2.689164400100708 }, { "auxiliary_loss_clip": 0.01159691, "auxiliary_loss_mlp": 0.01020666, "balance_loss_clip": 1.05131245, "balance_loss_mlp": 1.01273811, "epoch": 0.5681476582697048, "flos": 22455158140800.0, "grad_norm": 6.302871469940039, "language_loss": 0.68824005, "learning_rate": 1.6573926853954674e-06, "loss": 0.71004367, "num_input_tokens_seen": 102012400, "step": 4725, "time_per_iteration": 2.642392158508301 }, { "auxiliary_loss_clip": 0.01138911, "auxiliary_loss_mlp": 0.01024799, "balance_loss_clip": 1.04756236, "balance_loss_mlp": 1.01771164, "epoch": 0.5682679011603439, "flos": 19536913584000.0, "grad_norm": 1.976931829606748, "language_loss": 0.83128119, "learning_rate": 1.6566252542958608e-06, "loss": 0.85291827, "num_input_tokens_seen": 102031900, "step": 4726, "time_per_iteration": 2.629019260406494 }, { "auxiliary_loss_clip": 0.01117726, "auxiliary_loss_mlp": 0.01022053, "balance_loss_clip": 1.04493392, "balance_loss_mlp": 1.01498413, "epoch": 0.568388144050983, "flos": 28765488493440.0, "grad_norm": 1.9822626289202694, "language_loss": 0.78637016, "learning_rate": 1.6558578752859305e-06, "loss": 0.80776793, "num_input_tokens_seen": 102050860, "step": 4727, "time_per_iteration": 2.73751163482666 }, { "auxiliary_loss_clip": 0.01128482, "auxiliary_loss_mlp": 0.01025701, "balance_loss_clip": 1.04755807, "balance_loss_mlp": 1.01802087, "epoch": 0.5685083869416221, "flos": 21209452519680.0, "grad_norm": 1.838297837017485, "language_loss": 0.78536284, "learning_rate": 1.6550905484820865e-06, "loss": 0.80690461, "num_input_tokens_seen": 102069320, "step": 4728, "time_per_iteration": 2.6919867992401123 }, { "auxiliary_loss_clip": 0.01178776, "auxiliary_loss_mlp": 0.01025828, "balance_loss_clip": 1.05271482, "balance_loss_mlp": 1.01878858, "epoch": 0.5686286298322611, "flos": 24827021942400.0, "grad_norm": 2.374479929433054, "language_loss": 0.78597611, "learning_rate": 1.6543232740007328e-06, "loss": 0.80802214, "num_input_tokens_seen": 102086435, "step": 4729, "time_per_iteration": 2.579115867614746 }, { "auxiliary_loss_clip": 0.01163467, "auxiliary_loss_mlp": 0.01026406, "balance_loss_clip": 1.05230951, "balance_loss_mlp": 1.01887238, "epoch": 0.5687488727229003, "flos": 26615121909120.0, "grad_norm": 2.535795450835386, "language_loss": 0.6705296, "learning_rate": 1.653556051958263e-06, "loss": 0.69242835, "num_input_tokens_seen": 102106115, "step": 4730, "time_per_iteration": 2.6694283485412598 }, { "auxiliary_loss_clip": 0.01081465, "auxiliary_loss_mlp": 0.01023076, "balance_loss_clip": 1.0429554, "balance_loss_mlp": 1.01560199, "epoch": 0.5688691156135394, "flos": 20808725414400.0, "grad_norm": 2.2519615601616096, "language_loss": 0.73936951, "learning_rate": 1.6527888824710642e-06, "loss": 0.76041496, "num_input_tokens_seen": 102125715, "step": 4731, "time_per_iteration": 2.714390993118286 }, { "auxiliary_loss_clip": 0.01121373, "auxiliary_loss_mlp": 0.01028909, "balance_loss_clip": 1.04627728, "balance_loss_mlp": 1.021119, "epoch": 0.5689893585041784, "flos": 25880963829120.0, "grad_norm": 2.4351451123851584, "language_loss": 0.76615357, "learning_rate": 1.6520217656555166e-06, "loss": 0.78765643, "num_input_tokens_seen": 102145005, "step": 4732, "time_per_iteration": 2.728431224822998 }, { "auxiliary_loss_clip": 0.01132875, "auxiliary_loss_mlp": 0.01031403, "balance_loss_clip": 1.04732144, "balance_loss_mlp": 1.02424777, "epoch": 0.5691096013948175, "flos": 23477463123840.0, "grad_norm": 1.7362196243076058, "language_loss": 0.7102223, "learning_rate": 1.65125470162799e-06, "loss": 0.73186517, "num_input_tokens_seen": 102165360, "step": 4733, "time_per_iteration": 2.7015514373779297 }, { "auxiliary_loss_clip": 0.01130903, "auxiliary_loss_mlp": 0.01026495, "balance_loss_clip": 1.04755628, "balance_loss_mlp": 1.01899719, "epoch": 0.5692298442854566, "flos": 18075600576000.0, "grad_norm": 2.3152959392275814, "language_loss": 0.70208347, "learning_rate": 1.6504876905048485e-06, "loss": 0.72365743, "num_input_tokens_seen": 102182320, "step": 4734, "time_per_iteration": 2.669572353363037 }, { "auxiliary_loss_clip": 0.01173877, "auxiliary_loss_mlp": 0.01026266, "balance_loss_clip": 1.05240238, "balance_loss_mlp": 1.01944757, "epoch": 0.5693500871760957, "flos": 23039317025280.0, "grad_norm": 2.234282780342416, "language_loss": 0.72237754, "learning_rate": 1.6497207324024464e-06, "loss": 0.74437898, "num_input_tokens_seen": 102201220, "step": 4735, "time_per_iteration": 2.6125335693359375 }, { "auxiliary_loss_clip": 0.01152489, "auxiliary_loss_mlp": 0.01026685, "balance_loss_clip": 1.05022907, "balance_loss_mlp": 1.02015841, "epoch": 0.5694703300667348, "flos": 18989670902400.0, "grad_norm": 2.4139575754510454, "language_loss": 0.82668161, "learning_rate": 1.6489538274371305e-06, "loss": 0.84847331, "num_input_tokens_seen": 102219825, "step": 4736, "time_per_iteration": 2.714949607849121 }, { "auxiliary_loss_clip": 0.01154994, "auxiliary_loss_mlp": 0.01022395, "balance_loss_clip": 1.05243206, "balance_loss_mlp": 1.01520979, "epoch": 0.5695905729573739, "flos": 21908705558400.0, "grad_norm": 2.9903506705212597, "language_loss": 0.82968748, "learning_rate": 1.6481869757252396e-06, "loss": 0.85146141, "num_input_tokens_seen": 102238160, "step": 4737, "time_per_iteration": 2.576169490814209 }, { "auxiliary_loss_clip": 0.01162187, "auxiliary_loss_mlp": 0.01028543, "balance_loss_clip": 1.05280638, "balance_loss_mlp": 1.02173591, "epoch": 0.569710815848013, "flos": 28476659232000.0, "grad_norm": 1.4647374196763459, "language_loss": 0.71977937, "learning_rate": 1.647420177383105e-06, "loss": 0.7416867, "num_input_tokens_seen": 102261030, "step": 4738, "time_per_iteration": 2.646730422973633 }, { "auxiliary_loss_clip": 0.01157014, "auxiliary_loss_mlp": 0.01022836, "balance_loss_clip": 1.05200255, "balance_loss_mlp": 1.01586509, "epoch": 0.569831058738652, "flos": 28366162018560.0, "grad_norm": 1.9250537102789793, "language_loss": 0.72399735, "learning_rate": 1.646653432527049e-06, "loss": 0.74579585, "num_input_tokens_seen": 102281670, "step": 4739, "time_per_iteration": 2.660504102706909 }, { "auxiliary_loss_clip": 0.01128067, "auxiliary_loss_mlp": 0.01023141, "balance_loss_clip": 1.04588699, "balance_loss_mlp": 1.01651335, "epoch": 0.5699513016292912, "flos": 25849973370240.0, "grad_norm": 1.753716584818807, "language_loss": 0.7427749, "learning_rate": 1.645886741273387e-06, "loss": 0.76428694, "num_input_tokens_seen": 102303485, "step": 4740, "time_per_iteration": 2.7000138759613037 }, { "auxiliary_loss_clip": 0.01126368, "auxiliary_loss_mlp": 0.01026664, "balance_loss_clip": 1.05110049, "balance_loss_mlp": 1.01945782, "epoch": 0.5700715445199303, "flos": 18037858360320.0, "grad_norm": 4.192089164318466, "language_loss": 0.73402703, "learning_rate": 1.645120103738424e-06, "loss": 0.7555573, "num_input_tokens_seen": 102320995, "step": 4741, "time_per_iteration": 3.5687875747680664 }, { "auxiliary_loss_clip": 0.0115153, "auxiliary_loss_mlp": 0.00710329, "balance_loss_clip": 1.05143929, "balance_loss_mlp": 1.00033653, "epoch": 0.5701917874105693, "flos": 11473352392320.0, "grad_norm": 2.6733895188419594, "language_loss": 0.84077621, "learning_rate": 1.6443535200384591e-06, "loss": 0.85939485, "num_input_tokens_seen": 102339170, "step": 4742, "time_per_iteration": 4.50114631652832 }, { "auxiliary_loss_clip": 0.01177132, "auxiliary_loss_mlp": 0.01026785, "balance_loss_clip": 1.05435228, "balance_loss_mlp": 1.01946306, "epoch": 0.5703120303012085, "flos": 21761759018880.0, "grad_norm": 5.1720596615459, "language_loss": 0.71007913, "learning_rate": 1.6435869902897827e-06, "loss": 0.73211831, "num_input_tokens_seen": 102357750, "step": 4743, "time_per_iteration": 3.5745983123779297 }, { "auxiliary_loss_clip": 0.01055213, "auxiliary_loss_mlp": 0.01002597, "balance_loss_clip": 1.03029871, "balance_loss_mlp": 1.00146437, "epoch": 0.5704322731918475, "flos": 56746258513920.0, "grad_norm": 0.8164683685338189, "language_loss": 0.62028003, "learning_rate": 1.6428205146086764e-06, "loss": 0.64085811, "num_input_tokens_seen": 102419730, "step": 4744, "time_per_iteration": 3.290743112564087 }, { "auxiliary_loss_clip": 0.01147042, "auxiliary_loss_mlp": 0.01024148, "balance_loss_clip": 1.04709232, "balance_loss_mlp": 1.01668274, "epoch": 0.5705525160824866, "flos": 20741141975040.0, "grad_norm": 2.673438687862195, "language_loss": 0.70950586, "learning_rate": 1.6420540931114142e-06, "loss": 0.73121774, "num_input_tokens_seen": 102440320, "step": 4745, "time_per_iteration": 2.7574520111083984 }, { "auxiliary_loss_clip": 0.01146542, "auxiliary_loss_mlp": 0.01025418, "balance_loss_clip": 1.04989696, "balance_loss_mlp": 1.01787806, "epoch": 0.5706727589731257, "flos": 18771262254720.0, "grad_norm": 1.8254453359499963, "language_loss": 0.79449153, "learning_rate": 1.6412877259142616e-06, "loss": 0.81621122, "num_input_tokens_seen": 102460240, "step": 4746, "time_per_iteration": 2.7044878005981445 }, { "auxiliary_loss_clip": 0.01140963, "auxiliary_loss_mlp": 0.01022936, "balance_loss_clip": 1.04927254, "balance_loss_mlp": 1.01618898, "epoch": 0.5707930018637648, "flos": 27634733372160.0, "grad_norm": 1.9831588971265666, "language_loss": 0.7352041, "learning_rate": 1.6405214131334757e-06, "loss": 0.75684309, "num_input_tokens_seen": 102478765, "step": 4747, "time_per_iteration": 2.734682083129883 }, { "auxiliary_loss_clip": 0.01110316, "auxiliary_loss_mlp": 0.01028086, "balance_loss_clip": 1.04891372, "balance_loss_mlp": 1.02140176, "epoch": 0.5709132447544039, "flos": 27597673514880.0, "grad_norm": 2.0912636288041178, "language_loss": 0.79879659, "learning_rate": 1.6397551548853052e-06, "loss": 0.82018065, "num_input_tokens_seen": 102496930, "step": 4748, "time_per_iteration": 2.7415473461151123 }, { "auxiliary_loss_clip": 0.01142704, "auxiliary_loss_mlp": 0.01025588, "balance_loss_clip": 1.05065894, "balance_loss_mlp": 1.01810741, "epoch": 0.571033487645043, "flos": 21686095019520.0, "grad_norm": 1.9101445926026916, "language_loss": 0.70563519, "learning_rate": 1.6389889512859917e-06, "loss": 0.72731811, "num_input_tokens_seen": 102516590, "step": 4749, "time_per_iteration": 2.761617660522461 }, { "auxiliary_loss_clip": 0.01062466, "auxiliary_loss_mlp": 0.01002467, "balance_loss_clip": 1.02591789, "balance_loss_mlp": 1.00123298, "epoch": 0.5711537305356821, "flos": 70181445980160.0, "grad_norm": 0.8094599859805592, "language_loss": 0.60343039, "learning_rate": 1.638222802451767e-06, "loss": 0.6240797, "num_input_tokens_seen": 102578070, "step": 4750, "time_per_iteration": 3.2800183296203613 }, { "auxiliary_loss_clip": 0.01151621, "auxiliary_loss_mlp": 0.01024073, "balance_loss_clip": 1.05097866, "balance_loss_mlp": 1.01752532, "epoch": 0.5712739734263211, "flos": 24717494396160.0, "grad_norm": 1.7818865861245774, "language_loss": 0.75570631, "learning_rate": 1.6374567084988561e-06, "loss": 0.7774632, "num_input_tokens_seen": 102599255, "step": 4751, "time_per_iteration": 2.6713314056396484 }, { "auxiliary_loss_clip": 0.01148627, "auxiliary_loss_mlp": 0.01029305, "balance_loss_clip": 1.05244434, "balance_loss_mlp": 1.02238226, "epoch": 0.5713942163169603, "flos": 26578169792640.0, "grad_norm": 2.014725934639157, "language_loss": 0.7653985, "learning_rate": 1.6366906695434738e-06, "loss": 0.78717774, "num_input_tokens_seen": 102621775, "step": 4752, "time_per_iteration": 2.656320095062256 }, { "auxiliary_loss_clip": 0.01160879, "auxiliary_loss_mlp": 0.0102729, "balance_loss_clip": 1.05403376, "balance_loss_mlp": 1.02044117, "epoch": 0.5715144592075994, "flos": 21142443697920.0, "grad_norm": 3.4975274159271508, "language_loss": 0.86210281, "learning_rate": 1.6359246857018275e-06, "loss": 0.88398445, "num_input_tokens_seen": 102639305, "step": 4753, "time_per_iteration": 2.5970170497894287 }, { "auxiliary_loss_clip": 0.01111111, "auxiliary_loss_mlp": 0.01022197, "balance_loss_clip": 1.04528058, "balance_loss_mlp": 1.0150032, "epoch": 0.5716347020982384, "flos": 23330265189120.0, "grad_norm": 2.1135810868205764, "language_loss": 0.78580356, "learning_rate": 1.6351587570901178e-06, "loss": 0.8071366, "num_input_tokens_seen": 102659430, "step": 4754, "time_per_iteration": 2.821638345718384 }, { "auxiliary_loss_clip": 0.01129239, "auxiliary_loss_mlp": 0.01026823, "balance_loss_clip": 1.05101871, "balance_loss_mlp": 1.0193783, "epoch": 0.5717549449888776, "flos": 17009555806080.0, "grad_norm": 2.9984682833878384, "language_loss": 0.75896639, "learning_rate": 1.634392883824534e-06, "loss": 0.780527, "num_input_tokens_seen": 102671430, "step": 4755, "time_per_iteration": 2.672234058380127 }, { "auxiliary_loss_clip": 0.01115386, "auxiliary_loss_mlp": 0.01023107, "balance_loss_clip": 1.04619551, "balance_loss_mlp": 1.01625872, "epoch": 0.5718751878795166, "flos": 35518130922240.0, "grad_norm": 2.882095041783, "language_loss": 0.67810553, "learning_rate": 1.6336270660212595e-06, "loss": 0.69949049, "num_input_tokens_seen": 102693025, "step": 4756, "time_per_iteration": 2.819275379180908 }, { "auxiliary_loss_clip": 0.01143752, "auxiliary_loss_mlp": 0.0102079, "balance_loss_clip": 1.05273926, "balance_loss_mlp": 1.01322603, "epoch": 0.5719954307701557, "flos": 38613989255040.0, "grad_norm": 2.8254852388413583, "language_loss": 0.6625607, "learning_rate": 1.6328613037964676e-06, "loss": 0.68420613, "num_input_tokens_seen": 102716090, "step": 4757, "time_per_iteration": 2.7474148273468018 }, { "auxiliary_loss_clip": 0.01158201, "auxiliary_loss_mlp": 0.01016489, "balance_loss_clip": 1.04981923, "balance_loss_mlp": 1.00940776, "epoch": 0.5721156736607949, "flos": 20631111638400.0, "grad_norm": 6.077035710753276, "language_loss": 0.67962724, "learning_rate": 1.6320955972663241e-06, "loss": 0.70137411, "num_input_tokens_seen": 102735685, "step": 4758, "time_per_iteration": 2.6591618061065674 }, { "auxiliary_loss_clip": 0.01158278, "auxiliary_loss_mlp": 0.01023272, "balance_loss_clip": 1.05060983, "balance_loss_mlp": 1.01640844, "epoch": 0.5722359165514339, "flos": 37415076076800.0, "grad_norm": 1.9838800674825117, "language_loss": 0.65438914, "learning_rate": 1.6313299465469857e-06, "loss": 0.67620468, "num_input_tokens_seen": 102758415, "step": 4759, "time_per_iteration": 2.7982983589172363 }, { "auxiliary_loss_clip": 0.01153287, "auxiliary_loss_mlp": 0.01034673, "balance_loss_clip": 1.04841232, "balance_loss_mlp": 1.02669823, "epoch": 0.572356159442073, "flos": 21972877205760.0, "grad_norm": 3.171934694398394, "language_loss": 0.79034853, "learning_rate": 1.6305643517546014e-06, "loss": 0.81222808, "num_input_tokens_seen": 102773795, "step": 4760, "time_per_iteration": 2.635592460632324 }, { "auxiliary_loss_clip": 0.01174898, "auxiliary_loss_mlp": 0.01024855, "balance_loss_clip": 1.05379379, "balance_loss_mlp": 1.01840305, "epoch": 0.5724764023327121, "flos": 19135540033920.0, "grad_norm": 2.0503437391254216, "language_loss": 0.84853959, "learning_rate": 1.629798813005311e-06, "loss": 0.87053716, "num_input_tokens_seen": 102793515, "step": 4761, "time_per_iteration": 2.596515655517578 }, { "auxiliary_loss_clip": 0.01113799, "auxiliary_loss_mlp": 0.01026893, "balance_loss_clip": 1.04959238, "balance_loss_mlp": 1.01962996, "epoch": 0.5725966452233512, "flos": 22819759142400.0, "grad_norm": 2.091965568368697, "language_loss": 0.70920652, "learning_rate": 1.6290333304152473e-06, "loss": 0.73061347, "num_input_tokens_seen": 102813390, "step": 4762, "time_per_iteration": 2.757603168487549 }, { "auxiliary_loss_clip": 0.01145301, "auxiliary_loss_mlp": 0.01028051, "balance_loss_clip": 1.05499935, "balance_loss_mlp": 1.02051139, "epoch": 0.5727168881139902, "flos": 41496610498560.0, "grad_norm": 2.1439518007669767, "language_loss": 0.56710696, "learning_rate": 1.6282679041005314e-06, "loss": 0.58884048, "num_input_tokens_seen": 102838980, "step": 4763, "time_per_iteration": 2.831101179122925 }, { "auxiliary_loss_clip": 0.01133267, "auxiliary_loss_mlp": 0.01026139, "balance_loss_clip": 1.04610264, "balance_loss_mlp": 1.01918912, "epoch": 0.5728371310046293, "flos": 14647675985280.0, "grad_norm": 2.267998436900633, "language_loss": 0.87285787, "learning_rate": 1.6275025341772789e-06, "loss": 0.89445192, "num_input_tokens_seen": 102855285, "step": 4764, "time_per_iteration": 2.6856155395507812 }, { "auxiliary_loss_clip": 0.01146847, "auxiliary_loss_mlp": 0.01025946, "balance_loss_clip": 1.05100036, "balance_loss_mlp": 1.01859367, "epoch": 0.5729573738952685, "flos": 21506613736320.0, "grad_norm": 4.14670265112943, "language_loss": 0.815799, "learning_rate": 1.626737220761596e-06, "loss": 0.83752692, "num_input_tokens_seen": 102872750, "step": 4765, "time_per_iteration": 2.6757850646972656 }, { "auxiliary_loss_clip": 0.01157985, "auxiliary_loss_mlp": 0.01031995, "balance_loss_clip": 1.05252361, "balance_loss_mlp": 1.02443123, "epoch": 0.5730776167859075, "flos": 23621680229760.0, "grad_norm": 2.1747422646226577, "language_loss": 0.78591633, "learning_rate": 1.62597196396958e-06, "loss": 0.80781615, "num_input_tokens_seen": 102890920, "step": 4766, "time_per_iteration": 3.4421112537384033 }, { "auxiliary_loss_clip": 0.01155394, "auxiliary_loss_mlp": 0.01022064, "balance_loss_clip": 1.04971564, "balance_loss_mlp": 1.01465547, "epoch": 0.5731978596765466, "flos": 25739224761600.0, "grad_norm": 2.1223913949697364, "language_loss": 0.85705805, "learning_rate": 1.6252067639173197e-06, "loss": 0.87883264, "num_input_tokens_seen": 102912830, "step": 4767, "time_per_iteration": 2.6768360137939453 }, { "auxiliary_loss_clip": 0.01160863, "auxiliary_loss_mlp": 0.01023394, "balance_loss_clip": 1.0515132, "balance_loss_mlp": 1.01624739, "epoch": 0.5733181025671857, "flos": 26359509749760.0, "grad_norm": 2.09369624188513, "language_loss": 0.69811761, "learning_rate": 1.6244416207208956e-06, "loss": 0.71996021, "num_input_tokens_seen": 102933765, "step": 4768, "time_per_iteration": 3.5227155685424805 }, { "auxiliary_loss_clip": 0.01128312, "auxiliary_loss_mlp": 0.01028623, "balance_loss_clip": 1.04860628, "balance_loss_mlp": 1.02132726, "epoch": 0.5734383454578248, "flos": 29423874833280.0, "grad_norm": 2.3663412088542213, "language_loss": 0.74138618, "learning_rate": 1.6236765344963787e-06, "loss": 0.76295549, "num_input_tokens_seen": 102955025, "step": 4769, "time_per_iteration": 2.747230052947998 }, { "auxiliary_loss_clip": 0.01141298, "auxiliary_loss_mlp": 0.0102249, "balance_loss_clip": 1.05047095, "balance_loss_mlp": 1.01541483, "epoch": 0.5735585883484638, "flos": 34969954487040.0, "grad_norm": 2.233666883082988, "language_loss": 0.69112593, "learning_rate": 1.6229115053598322e-06, "loss": 0.71276379, "num_input_tokens_seen": 102976780, "step": 4770, "time_per_iteration": 2.760465383529663 }, { "auxiliary_loss_clip": 0.01160508, "auxiliary_loss_mlp": 0.01025307, "balance_loss_clip": 1.05315232, "balance_loss_mlp": 1.0178678, "epoch": 0.573678831239103, "flos": 18770759464320.0, "grad_norm": 2.4259417765961833, "language_loss": 0.72175545, "learning_rate": 1.6221465334273108e-06, "loss": 0.7436136, "num_input_tokens_seen": 102995990, "step": 4771, "time_per_iteration": 2.5604560375213623 }, { "auxiliary_loss_clip": 0.01133321, "auxiliary_loss_mlp": 0.01023081, "balance_loss_clip": 1.048823, "balance_loss_mlp": 1.01599741, "epoch": 0.5737990741297421, "flos": 25702883176320.0, "grad_norm": 3.0696200150453103, "language_loss": 0.61352903, "learning_rate": 1.6213816188148593e-06, "loss": 0.63509303, "num_input_tokens_seen": 103014695, "step": 4772, "time_per_iteration": 2.7172436714172363 }, { "auxiliary_loss_clip": 0.01138802, "auxiliary_loss_mlp": 0.01021676, "balance_loss_clip": 1.05334258, "balance_loss_mlp": 1.01506329, "epoch": 0.5739193170203811, "flos": 27269234530560.0, "grad_norm": 1.7040197363377123, "language_loss": 0.77080125, "learning_rate": 1.6206167616385162e-06, "loss": 0.79240608, "num_input_tokens_seen": 103035760, "step": 4773, "time_per_iteration": 2.6695616245269775 }, { "auxiliary_loss_clip": 0.01147943, "auxiliary_loss_mlp": 0.01023943, "balance_loss_clip": 1.05068672, "balance_loss_mlp": 1.01628661, "epoch": 0.5740395599110203, "flos": 12239721993600.0, "grad_norm": 2.0840312797009966, "language_loss": 0.73785186, "learning_rate": 1.6198519620143078e-06, "loss": 0.75957072, "num_input_tokens_seen": 103052915, "step": 4774, "time_per_iteration": 2.639085531234741 }, { "auxiliary_loss_clip": 0.01133637, "auxiliary_loss_mlp": 0.01027811, "balance_loss_clip": 1.0509342, "balance_loss_mlp": 1.02097178, "epoch": 0.5741598028016593, "flos": 25921399564800.0, "grad_norm": 1.9864175239792032, "language_loss": 0.78098339, "learning_rate": 1.6190872200582546e-06, "loss": 0.80259788, "num_input_tokens_seen": 103074655, "step": 4775, "time_per_iteration": 2.711482286453247 }, { "auxiliary_loss_clip": 0.01139555, "auxiliary_loss_mlp": 0.00710424, "balance_loss_clip": 1.04940152, "balance_loss_mlp": 1.00039434, "epoch": 0.5742800456922984, "flos": 19244133826560.0, "grad_norm": 3.2953886772598575, "language_loss": 0.77979541, "learning_rate": 1.6183225358863676e-06, "loss": 0.79829526, "num_input_tokens_seen": 103091550, "step": 4776, "time_per_iteration": 2.6726436614990234 }, { "auxiliary_loss_clip": 0.01134299, "auxiliary_loss_mlp": 0.01024368, "balance_loss_clip": 1.04602814, "balance_loss_mlp": 1.0168879, "epoch": 0.5744002885829376, "flos": 30920487932160.0, "grad_norm": 3.1932433152061046, "language_loss": 0.71642768, "learning_rate": 1.617557909614648e-06, "loss": 0.73801434, "num_input_tokens_seen": 103110985, "step": 4777, "time_per_iteration": 2.811244249343872 }, { "auxiliary_loss_clip": 0.01124455, "auxiliary_loss_mlp": 0.01021496, "balance_loss_clip": 1.0480206, "balance_loss_mlp": 1.01442122, "epoch": 0.5745205314735766, "flos": 23840017050240.0, "grad_norm": 2.0455538626731835, "language_loss": 0.85766739, "learning_rate": 1.6167933413590899e-06, "loss": 0.87912691, "num_input_tokens_seen": 103129890, "step": 4778, "time_per_iteration": 2.915128469467163 }, { "auxiliary_loss_clip": 0.01158663, "auxiliary_loss_mlp": 0.01027659, "balance_loss_clip": 1.05091059, "balance_loss_mlp": 1.01972592, "epoch": 0.5746407743642157, "flos": 12311902373760.0, "grad_norm": 3.1012612060568916, "language_loss": 0.90553647, "learning_rate": 1.6160288312356773e-06, "loss": 0.92739969, "num_input_tokens_seen": 103147020, "step": 4779, "time_per_iteration": 2.57985258102417 }, { "auxiliary_loss_clip": 0.01161169, "auxiliary_loss_mlp": 0.0102846, "balance_loss_clip": 1.0502106, "balance_loss_mlp": 1.02126563, "epoch": 0.5747610172548548, "flos": 24133658734080.0, "grad_norm": 2.029165359282016, "language_loss": 0.81469762, "learning_rate": 1.6152643793603857e-06, "loss": 0.83659387, "num_input_tokens_seen": 103167370, "step": 4780, "time_per_iteration": 2.661060094833374 }, { "auxiliary_loss_clip": 0.01176295, "auxiliary_loss_mlp": 0.01018782, "balance_loss_clip": 1.05427718, "balance_loss_mlp": 1.01167095, "epoch": 0.5748812601454939, "flos": 25408451393280.0, "grad_norm": 1.7240816481412249, "language_loss": 0.87549305, "learning_rate": 1.6144999858491815e-06, "loss": 0.89744377, "num_input_tokens_seen": 103186000, "step": 4781, "time_per_iteration": 2.5959832668304443 }, { "auxiliary_loss_clip": 0.01147132, "auxiliary_loss_mlp": 0.01023578, "balance_loss_clip": 1.04877651, "balance_loss_mlp": 1.01597786, "epoch": 0.575001503036133, "flos": 30624942827520.0, "grad_norm": 3.908656599641173, "language_loss": 0.85884786, "learning_rate": 1.6137356508180232e-06, "loss": 0.88055491, "num_input_tokens_seen": 103207710, "step": 4782, "time_per_iteration": 2.6801364421844482 }, { "auxiliary_loss_clip": 0.01176084, "auxiliary_loss_mlp": 0.0071089, "balance_loss_clip": 1.05308199, "balance_loss_mlp": 1.00036836, "epoch": 0.5751217459267721, "flos": 21726566668800.0, "grad_norm": 2.34630163661134, "language_loss": 0.81673896, "learning_rate": 1.6129713743828593e-06, "loss": 0.83560872, "num_input_tokens_seen": 103226720, "step": 4783, "time_per_iteration": 2.62133526802063 }, { "auxiliary_loss_clip": 0.01141757, "auxiliary_loss_mlp": 0.0102364, "balance_loss_clip": 1.0467205, "balance_loss_mlp": 1.01680946, "epoch": 0.5752419888174112, "flos": 21651620941440.0, "grad_norm": 1.4161058935673179, "language_loss": 0.75629359, "learning_rate": 1.6122071566596306e-06, "loss": 0.77794755, "num_input_tokens_seen": 103246995, "step": 4784, "time_per_iteration": 2.7296974658966064 }, { "auxiliary_loss_clip": 0.0116325, "auxiliary_loss_mlp": 0.01030024, "balance_loss_clip": 1.05208039, "balance_loss_mlp": 1.02257311, "epoch": 0.5753622317080502, "flos": 17775997234560.0, "grad_norm": 2.319495427042532, "language_loss": 0.83528054, "learning_rate": 1.6114429977642674e-06, "loss": 0.85721326, "num_input_tokens_seen": 103261500, "step": 4785, "time_per_iteration": 2.669734239578247 }, { "auxiliary_loss_clip": 0.01158786, "auxiliary_loss_mlp": 0.01023448, "balance_loss_clip": 1.05265701, "balance_loss_mlp": 1.01632762, "epoch": 0.5754824745986894, "flos": 19789616741760.0, "grad_norm": 2.037697792553956, "language_loss": 0.73873091, "learning_rate": 1.6106788978126926e-06, "loss": 0.76055324, "num_input_tokens_seen": 103280475, "step": 4786, "time_per_iteration": 2.6801986694335938 }, { "auxiliary_loss_clip": 0.01107499, "auxiliary_loss_mlp": 0.01025375, "balance_loss_clip": 1.0438838, "balance_loss_mlp": 1.01848483, "epoch": 0.5756027174893285, "flos": 30985665160320.0, "grad_norm": 5.75917827115334, "language_loss": 0.79536557, "learning_rate": 1.6099148569208196e-06, "loss": 0.81669438, "num_input_tokens_seen": 103297695, "step": 4787, "time_per_iteration": 2.7795259952545166 }, { "auxiliary_loss_clip": 0.01146343, "auxiliary_loss_mlp": 0.01029996, "balance_loss_clip": 1.05199456, "balance_loss_mlp": 1.02227759, "epoch": 0.5757229603799675, "flos": 28546864364160.0, "grad_norm": 1.9649131860161098, "language_loss": 0.63322246, "learning_rate": 1.6091508752045523e-06, "loss": 0.65498579, "num_input_tokens_seen": 103318575, "step": 4788, "time_per_iteration": 2.6780924797058105 }, { "auxiliary_loss_clip": 0.01117873, "auxiliary_loss_mlp": 0.01023467, "balance_loss_clip": 1.04316401, "balance_loss_mlp": 1.01662171, "epoch": 0.5758432032706067, "flos": 22999024944000.0, "grad_norm": 1.6675463685935012, "language_loss": 0.86459333, "learning_rate": 1.608386952779787e-06, "loss": 0.88600677, "num_input_tokens_seen": 103337945, "step": 4789, "time_per_iteration": 2.8073890209198 }, { "auxiliary_loss_clip": 0.01148301, "auxiliary_loss_mlp": 0.01020688, "balance_loss_clip": 1.05032289, "balance_loss_mlp": 1.01390827, "epoch": 0.5759634461612457, "flos": 25739727552000.0, "grad_norm": 3.0577681929335943, "language_loss": 0.7473954, "learning_rate": 1.6076230897624098e-06, "loss": 0.76908529, "num_input_tokens_seen": 103360150, "step": 4790, "time_per_iteration": 2.6770687103271484 }, { "auxiliary_loss_clip": 0.0115794, "auxiliary_loss_mlp": 0.01021346, "balance_loss_clip": 1.04697883, "balance_loss_mlp": 1.01387715, "epoch": 0.5760836890518848, "flos": 30591761639040.0, "grad_norm": 4.437121300997163, "language_loss": 0.7758525, "learning_rate": 1.6068592862682974e-06, "loss": 0.79764533, "num_input_tokens_seen": 103378305, "step": 4791, "time_per_iteration": 2.688455104827881 }, { "auxiliary_loss_clip": 0.01149226, "auxiliary_loss_mlp": 0.01026505, "balance_loss_clip": 1.05214143, "balance_loss_mlp": 1.0192337, "epoch": 0.576203931942524, "flos": 36538963447680.0, "grad_norm": 1.9746408074268416, "language_loss": 0.73212439, "learning_rate": 1.6060955424133187e-06, "loss": 0.75388169, "num_input_tokens_seen": 103399230, "step": 4792, "time_per_iteration": 3.68881893157959 }, { "auxiliary_loss_clip": 0.01157896, "auxiliary_loss_mlp": 0.0102549, "balance_loss_clip": 1.04980195, "balance_loss_mlp": 1.01798594, "epoch": 0.576324174833163, "flos": 25516937445120.0, "grad_norm": 1.8725989406912844, "language_loss": 0.89821231, "learning_rate": 1.6053318583133332e-06, "loss": 0.92004621, "num_input_tokens_seen": 103420100, "step": 4793, "time_per_iteration": 3.5327839851379395 }, { "auxiliary_loss_clip": 0.01158362, "auxiliary_loss_mlp": 0.01024765, "balance_loss_clip": 1.05044389, "balance_loss_mlp": 1.01710606, "epoch": 0.5764444177238021, "flos": 25119262995840.0, "grad_norm": 2.3596214847369703, "language_loss": 0.75454903, "learning_rate": 1.6045682340841907e-06, "loss": 0.7763803, "num_input_tokens_seen": 103439025, "step": 4794, "time_per_iteration": 3.732661485671997 }, { "auxiliary_loss_clip": 0.01053829, "auxiliary_loss_mlp": 0.00701625, "balance_loss_clip": 1.02958858, "balance_loss_mlp": 0.99990577, "epoch": 0.5765646606144411, "flos": 62212687758720.0, "grad_norm": 0.775413263040341, "language_loss": 0.57980365, "learning_rate": 1.6038046698417336e-06, "loss": 0.59735817, "num_input_tokens_seen": 103499920, "step": 4795, "time_per_iteration": 4.203447580337524 }, { "auxiliary_loss_clip": 0.01159053, "auxiliary_loss_mlp": 0.01024691, "balance_loss_clip": 1.04992437, "balance_loss_mlp": 1.01763368, "epoch": 0.5766849035050803, "flos": 25118760205440.0, "grad_norm": 2.23168559923307, "language_loss": 0.68637985, "learning_rate": 1.6030411657017919e-06, "loss": 0.70821732, "num_input_tokens_seen": 103519575, "step": 4796, "time_per_iteration": 2.6697285175323486 }, { "auxiliary_loss_clip": 0.01150642, "auxiliary_loss_mlp": 0.01024991, "balance_loss_clip": 1.04910243, "balance_loss_mlp": 1.01777911, "epoch": 0.5768051463957193, "flos": 15991093578240.0, "grad_norm": 1.854160843685963, "language_loss": 0.84227687, "learning_rate": 1.6022777217801903e-06, "loss": 0.86403322, "num_input_tokens_seen": 103536530, "step": 4797, "time_per_iteration": 2.642352342605591 }, { "auxiliary_loss_clip": 0.01128378, "auxiliary_loss_mlp": 0.01025403, "balance_loss_clip": 1.05056477, "balance_loss_mlp": 1.01821792, "epoch": 0.5769253892863584, "flos": 22163635359360.0, "grad_norm": 2.1655595225168667, "language_loss": 0.73617619, "learning_rate": 1.601514338192742e-06, "loss": 0.75771403, "num_input_tokens_seen": 103556460, "step": 4798, "time_per_iteration": 2.681234121322632 }, { "auxiliary_loss_clip": 0.01170416, "auxiliary_loss_mlp": 0.01024767, "balance_loss_clip": 1.05048895, "balance_loss_mlp": 1.01777494, "epoch": 0.5770456321769976, "flos": 22856388036480.0, "grad_norm": 2.6171081471153665, "language_loss": 0.71290922, "learning_rate": 1.6007510150552514e-06, "loss": 0.73486102, "num_input_tokens_seen": 103574520, "step": 4799, "time_per_iteration": 2.654491901397705 }, { "auxiliary_loss_clip": 0.01162285, "auxiliary_loss_mlp": 0.01025605, "balance_loss_clip": 1.04988885, "balance_loss_mlp": 1.01836002, "epoch": 0.5771658750676366, "flos": 46353672489600.0, "grad_norm": 1.5479280090911725, "language_loss": 0.62253737, "learning_rate": 1.599987752483515e-06, "loss": 0.64441621, "num_input_tokens_seen": 103598965, "step": 4800, "time_per_iteration": 2.8600614070892334 }, { "auxiliary_loss_clip": 0.01121746, "auxiliary_loss_mlp": 0.0102323, "balance_loss_clip": 1.04563403, "balance_loss_mlp": 1.01595855, "epoch": 0.5772861179582757, "flos": 22159972172160.0, "grad_norm": 3.641444603296897, "language_loss": 0.68112606, "learning_rate": 1.5992245505933184e-06, "loss": 0.7025758, "num_input_tokens_seen": 103618665, "step": 4801, "time_per_iteration": 2.71391224861145 }, { "auxiliary_loss_clip": 0.0117744, "auxiliary_loss_mlp": 0.01029478, "balance_loss_clip": 1.05389857, "balance_loss_mlp": 1.02206337, "epoch": 0.5774063608489148, "flos": 31248926916480.0, "grad_norm": 2.2936424789052303, "language_loss": 0.71585989, "learning_rate": 1.5984614095004388e-06, "loss": 0.73792899, "num_input_tokens_seen": 103639800, "step": 4802, "time_per_iteration": 2.6647956371307373 }, { "auxiliary_loss_clip": 0.01156561, "auxiliary_loss_mlp": 0.01029521, "balance_loss_clip": 1.05120969, "balance_loss_mlp": 1.0222131, "epoch": 0.5775266037395539, "flos": 22527123039360.0, "grad_norm": 2.6981173310390782, "language_loss": 0.81159747, "learning_rate": 1.5976983293206438e-06, "loss": 0.8334583, "num_input_tokens_seen": 103655605, "step": 4803, "time_per_iteration": 2.5905275344848633 }, { "auxiliary_loss_clip": 0.01139823, "auxiliary_loss_mlp": 0.01026947, "balance_loss_clip": 1.04724443, "balance_loss_mlp": 1.02010143, "epoch": 0.577646846630193, "flos": 21068790860160.0, "grad_norm": 2.5848788575723476, "language_loss": 0.71574849, "learning_rate": 1.5969353101696928e-06, "loss": 0.73741615, "num_input_tokens_seen": 103674045, "step": 4804, "time_per_iteration": 2.6668202877044678 }, { "auxiliary_loss_clip": 0.01159028, "auxiliary_loss_mlp": 0.01021849, "balance_loss_clip": 1.0507791, "balance_loss_mlp": 1.01529264, "epoch": 0.5777670895208321, "flos": 29714284293120.0, "grad_norm": 2.0062596410161393, "language_loss": 0.79790956, "learning_rate": 1.5961723521633341e-06, "loss": 0.81971836, "num_input_tokens_seen": 103695285, "step": 4805, "time_per_iteration": 2.670816421508789 }, { "auxiliary_loss_clip": 0.0114064, "auxiliary_loss_mlp": 0.01022767, "balance_loss_clip": 1.04796267, "balance_loss_mlp": 1.01594853, "epoch": 0.5778873324114712, "flos": 19500428344320.0, "grad_norm": 2.3743925567220248, "language_loss": 0.91357946, "learning_rate": 1.5954094554173097e-06, "loss": 0.93521345, "num_input_tokens_seen": 103713275, "step": 4806, "time_per_iteration": 2.649451732635498 }, { "auxiliary_loss_clip": 0.01149867, "auxiliary_loss_mlp": 0.01026384, "balance_loss_clip": 1.05209446, "balance_loss_mlp": 1.01955938, "epoch": 0.5780075753021102, "flos": 14136846716160.0, "grad_norm": 2.6265275032845996, "language_loss": 0.78974134, "learning_rate": 1.5946466200473482e-06, "loss": 0.81150389, "num_input_tokens_seen": 103731185, "step": 4807, "time_per_iteration": 2.730513095855713 }, { "auxiliary_loss_clip": 0.01148202, "auxiliary_loss_mlp": 0.01021197, "balance_loss_clip": 1.04933345, "balance_loss_mlp": 1.01412785, "epoch": 0.5781278181927494, "flos": 15262178883840.0, "grad_norm": 2.1681944711218675, "language_loss": 0.83808887, "learning_rate": 1.5938838461691723e-06, "loss": 0.85978281, "num_input_tokens_seen": 103748095, "step": 4808, "time_per_iteration": 2.679011106491089 }, { "auxiliary_loss_clip": 0.01178704, "auxiliary_loss_mlp": 0.01027347, "balance_loss_clip": 1.05533385, "balance_loss_mlp": 1.01997709, "epoch": 0.5782480610833884, "flos": 16726831856640.0, "grad_norm": 2.9239472621919895, "language_loss": 0.82542086, "learning_rate": 1.593121133898494e-06, "loss": 0.84748143, "num_input_tokens_seen": 103765300, "step": 4809, "time_per_iteration": 2.5448617935180664 }, { "auxiliary_loss_clip": 0.01166036, "auxiliary_loss_mlp": 0.0103142, "balance_loss_clip": 1.05278945, "balance_loss_mlp": 1.02461374, "epoch": 0.5783683039740275, "flos": 25482140144640.0, "grad_norm": 3.8194818935887436, "language_loss": 0.79147929, "learning_rate": 1.592358483351016e-06, "loss": 0.81345385, "num_input_tokens_seen": 103785475, "step": 4810, "time_per_iteration": 2.7029340267181396 }, { "auxiliary_loss_clip": 0.01156109, "auxiliary_loss_mlp": 0.01023912, "balance_loss_clip": 1.05013883, "balance_loss_mlp": 1.01702738, "epoch": 0.5784885468646667, "flos": 18405835240320.0, "grad_norm": 2.0227553644967933, "language_loss": 0.72484887, "learning_rate": 1.5915958946424326e-06, "loss": 0.74664909, "num_input_tokens_seen": 103804160, "step": 4811, "time_per_iteration": 2.5721476078033447 }, { "auxiliary_loss_clip": 0.01133639, "auxiliary_loss_mlp": 0.00711507, "balance_loss_clip": 1.05102932, "balance_loss_mlp": 1.00049627, "epoch": 0.5786087897553057, "flos": 46100717936640.0, "grad_norm": 1.9472906159139334, "language_loss": 0.7489205, "learning_rate": 1.5908333678884271e-06, "loss": 0.76737195, "num_input_tokens_seen": 103830580, "step": 4812, "time_per_iteration": 2.9044249057769775 }, { "auxiliary_loss_clip": 0.01160087, "auxiliary_loss_mlp": 0.01027942, "balance_loss_clip": 1.05262852, "balance_loss_mlp": 1.02058125, "epoch": 0.5787290326459448, "flos": 12385950261120.0, "grad_norm": 2.203299559017449, "language_loss": 0.73595083, "learning_rate": 1.5900709032046743e-06, "loss": 0.7578311, "num_input_tokens_seen": 103848655, "step": 4813, "time_per_iteration": 2.5662643909454346 }, { "auxiliary_loss_clip": 0.01140114, "auxiliary_loss_mlp": 0.01028866, "balance_loss_clip": 1.0514642, "balance_loss_mlp": 1.02186537, "epoch": 0.5788492755365839, "flos": 23290332243840.0, "grad_norm": 2.581193511935362, "language_loss": 0.78191525, "learning_rate": 1.5893085007068391e-06, "loss": 0.80360508, "num_input_tokens_seen": 103866215, "step": 4814, "time_per_iteration": 2.6197125911712646 }, { "auxiliary_loss_clip": 0.01132958, "auxiliary_loss_mlp": 0.01029958, "balance_loss_clip": 1.04703891, "balance_loss_mlp": 1.02238238, "epoch": 0.578969518427223, "flos": 24061047390720.0, "grad_norm": 2.1027147433207034, "language_loss": 0.70918953, "learning_rate": 1.5885461605105786e-06, "loss": 0.73081869, "num_input_tokens_seen": 103887815, "step": 4815, "time_per_iteration": 2.67724609375 }, { "auxiliary_loss_clip": 0.01149037, "auxiliary_loss_mlp": 0.01031162, "balance_loss_clip": 1.05218697, "balance_loss_mlp": 1.02392316, "epoch": 0.579089761317862, "flos": 21871825269120.0, "grad_norm": 1.990840540794728, "language_loss": 0.77114922, "learning_rate": 1.5877838827315375e-06, "loss": 0.79295123, "num_input_tokens_seen": 103906360, "step": 4816, "time_per_iteration": 2.622642755508423 }, { "auxiliary_loss_clip": 0.0117531, "auxiliary_loss_mlp": 0.01029182, "balance_loss_clip": 1.05307031, "balance_loss_mlp": 1.02198732, "epoch": 0.5792100042085012, "flos": 22929681738240.0, "grad_norm": 2.1271861607827436, "language_loss": 0.70609689, "learning_rate": 1.587021667485355e-06, "loss": 0.72814178, "num_input_tokens_seen": 103925730, "step": 4817, "time_per_iteration": 2.7212679386138916 }, { "auxiliary_loss_clip": 0.01145552, "auxiliary_loss_mlp": 0.01024761, "balance_loss_clip": 1.04733038, "balance_loss_mlp": 1.0178082, "epoch": 0.5793302470991403, "flos": 21470056669440.0, "grad_norm": 1.930437838232614, "language_loss": 0.78605068, "learning_rate": 1.5862595148876559e-06, "loss": 0.8077538, "num_input_tokens_seen": 103945835, "step": 4818, "time_per_iteration": 3.619429111480713 }, { "auxiliary_loss_clip": 0.01117065, "auxiliary_loss_mlp": 0.01028177, "balance_loss_clip": 1.0496608, "balance_loss_mlp": 1.02089345, "epoch": 0.5794504899897793, "flos": 12711013367040.0, "grad_norm": 2.5505242537168584, "language_loss": 0.76598215, "learning_rate": 1.58549742505406e-06, "loss": 0.78743452, "num_input_tokens_seen": 103960580, "step": 4819, "time_per_iteration": 3.6252601146698 }, { "auxiliary_loss_clip": 0.01175932, "auxiliary_loss_mlp": 0.01021626, "balance_loss_clip": 1.05148911, "balance_loss_mlp": 1.0145303, "epoch": 0.5795707328804185, "flos": 14867054300160.0, "grad_norm": 2.116458949817925, "language_loss": 0.75660992, "learning_rate": 1.5847353981001747e-06, "loss": 0.77858555, "num_input_tokens_seen": 103977760, "step": 4820, "time_per_iteration": 2.5950467586517334 }, { "auxiliary_loss_clip": 0.01138241, "auxiliary_loss_mlp": 0.01032025, "balance_loss_clip": 1.04743278, "balance_loss_mlp": 1.02421963, "epoch": 0.5796909757710575, "flos": 36430046432640.0, "grad_norm": 2.5256589975258614, "language_loss": 0.69891357, "learning_rate": 1.5839734341415993e-06, "loss": 0.72061628, "num_input_tokens_seen": 103999960, "step": 4821, "time_per_iteration": 4.60970139503479 }, { "auxiliary_loss_clip": 0.01154877, "auxiliary_loss_mlp": 0.01029405, "balance_loss_clip": 1.05189562, "balance_loss_mlp": 1.02285433, "epoch": 0.5798112186616966, "flos": 23039891642880.0, "grad_norm": 1.7210826021623584, "language_loss": 0.76486206, "learning_rate": 1.5832115332939238e-06, "loss": 0.7867049, "num_input_tokens_seen": 104018400, "step": 4822, "time_per_iteration": 2.667759895324707 }, { "auxiliary_loss_clip": 0.01163251, "auxiliary_loss_mlp": 0.01022006, "balance_loss_clip": 1.05209565, "balance_loss_mlp": 1.01499665, "epoch": 0.5799314615523358, "flos": 16652604401280.0, "grad_norm": 1.8875430425415334, "language_loss": 0.74559474, "learning_rate": 1.5824496956727272e-06, "loss": 0.76744729, "num_input_tokens_seen": 104035605, "step": 4823, "time_per_iteration": 2.6152212619781494 }, { "auxiliary_loss_clip": 0.01145642, "auxiliary_loss_mlp": 0.0102755, "balance_loss_clip": 1.04987419, "balance_loss_mlp": 1.02062678, "epoch": 0.5800517044429748, "flos": 20485673470080.0, "grad_norm": 2.064373402753124, "language_loss": 0.73144829, "learning_rate": 1.5816879213935797e-06, "loss": 0.75318021, "num_input_tokens_seen": 104054415, "step": 4824, "time_per_iteration": 2.631504774093628 }, { "auxiliary_loss_clip": 0.01157649, "auxiliary_loss_mlp": 0.01029211, "balance_loss_clip": 1.05284405, "balance_loss_mlp": 1.02243972, "epoch": 0.5801719473336139, "flos": 31538258968320.0, "grad_norm": 1.8003950861643216, "language_loss": 0.79682678, "learning_rate": 1.5809262105720416e-06, "loss": 0.81869543, "num_input_tokens_seen": 104075455, "step": 4825, "time_per_iteration": 2.7004010677337646 }, { "auxiliary_loss_clip": 0.01174252, "auxiliary_loss_mlp": 0.01027531, "balance_loss_clip": 1.05302227, "balance_loss_mlp": 1.02045584, "epoch": 0.580292190224253, "flos": 20375966355840.0, "grad_norm": 1.8590244542269574, "language_loss": 0.79587448, "learning_rate": 1.5801645633236644e-06, "loss": 0.81789231, "num_input_tokens_seen": 104096440, "step": 4826, "time_per_iteration": 2.577164888381958 }, { "auxiliary_loss_clip": 0.01138604, "auxiliary_loss_mlp": 0.01028335, "balance_loss_clip": 1.04842806, "balance_loss_mlp": 1.02149248, "epoch": 0.5804124331148921, "flos": 26615373304320.0, "grad_norm": 1.9319225436561283, "language_loss": 0.77076387, "learning_rate": 1.579402979763989e-06, "loss": 0.7924332, "num_input_tokens_seen": 104116775, "step": 4827, "time_per_iteration": 2.732105255126953 }, { "auxiliary_loss_clip": 0.01105966, "auxiliary_loss_mlp": 0.01021285, "balance_loss_clip": 1.04438472, "balance_loss_mlp": 1.01451373, "epoch": 0.5805326760055312, "flos": 13478496289920.0, "grad_norm": 3.6856101942356054, "language_loss": 0.8141135, "learning_rate": 1.578641460008548e-06, "loss": 0.83538604, "num_input_tokens_seen": 104134510, "step": 4828, "time_per_iteration": 2.6919867992401123 }, { "auxiliary_loss_clip": 0.01158999, "auxiliary_loss_mlp": 0.01025143, "balance_loss_clip": 1.05257273, "balance_loss_mlp": 1.0181334, "epoch": 0.5806529188961702, "flos": 12091374823680.0, "grad_norm": 3.3284343896601425, "language_loss": 0.67968315, "learning_rate": 1.5778800041728613e-06, "loss": 0.70152462, "num_input_tokens_seen": 104150800, "step": 4829, "time_per_iteration": 2.6081628799438477 }, { "auxiliary_loss_clip": 0.01154226, "auxiliary_loss_mlp": 0.01024981, "balance_loss_clip": 1.04980457, "balance_loss_mlp": 1.01814413, "epoch": 0.5807731617868094, "flos": 26214107495040.0, "grad_norm": 1.6498204930481362, "language_loss": 0.66100425, "learning_rate": 1.577118612372443e-06, "loss": 0.68279624, "num_input_tokens_seen": 104172640, "step": 4830, "time_per_iteration": 2.7758333683013916 }, { "auxiliary_loss_clip": 0.01142735, "auxiliary_loss_mlp": 0.00711814, "balance_loss_clip": 1.05015719, "balance_loss_mlp": 1.00046706, "epoch": 0.5808934046774484, "flos": 37962139190400.0, "grad_norm": 1.6579539637633913, "language_loss": 0.7042613, "learning_rate": 1.5763572847227943e-06, "loss": 0.72280675, "num_input_tokens_seen": 104193525, "step": 4831, "time_per_iteration": 2.8937361240386963 }, { "auxiliary_loss_clip": 0.01159565, "auxiliary_loss_mlp": 0.01025696, "balance_loss_clip": 1.0506202, "balance_loss_mlp": 1.01865053, "epoch": 0.5810136475680875, "flos": 20485853038080.0, "grad_norm": 2.4820711364017547, "language_loss": 0.8117103, "learning_rate": 1.5755960213394091e-06, "loss": 0.83356291, "num_input_tokens_seen": 104210625, "step": 4832, "time_per_iteration": 2.7368521690368652 }, { "auxiliary_loss_clip": 0.01131988, "auxiliary_loss_mlp": 0.01023888, "balance_loss_clip": 1.04769409, "balance_loss_mlp": 1.01678646, "epoch": 0.5811338904587267, "flos": 17530153574400.0, "grad_norm": 3.0892723397570445, "language_loss": 0.7871787, "learning_rate": 1.5748348223377703e-06, "loss": 0.80873746, "num_input_tokens_seen": 104228180, "step": 4833, "time_per_iteration": 2.756239891052246 }, { "auxiliary_loss_clip": 0.01142833, "auxiliary_loss_mlp": 0.01024246, "balance_loss_clip": 1.05193079, "balance_loss_mlp": 1.01779366, "epoch": 0.5812541333493657, "flos": 19458017360640.0, "grad_norm": 6.640511107358076, "language_loss": 0.77720428, "learning_rate": 1.5740736878333507e-06, "loss": 0.79887509, "num_input_tokens_seen": 104246020, "step": 4834, "time_per_iteration": 2.696430206298828 }, { "auxiliary_loss_clip": 0.01147515, "auxiliary_loss_mlp": 0.01027182, "balance_loss_clip": 1.04940248, "balance_loss_mlp": 1.01962423, "epoch": 0.5813743762400048, "flos": 20594949621120.0, "grad_norm": 4.0094726270509495, "language_loss": 0.78365719, "learning_rate": 1.5733126179416143e-06, "loss": 0.80540419, "num_input_tokens_seen": 104260505, "step": 4835, "time_per_iteration": 2.6909003257751465 }, { "auxiliary_loss_clip": 0.01161711, "auxiliary_loss_mlp": 0.01025173, "balance_loss_clip": 1.05306816, "balance_loss_mlp": 1.01771665, "epoch": 0.5814946191306439, "flos": 33178227246720.0, "grad_norm": 2.1874166921132425, "language_loss": 0.72386545, "learning_rate": 1.5725516127780137e-06, "loss": 0.74573427, "num_input_tokens_seen": 104282640, "step": 4836, "time_per_iteration": 2.7486236095428467 }, { "auxiliary_loss_clip": 0.01166488, "auxiliary_loss_mlp": 0.01028733, "balance_loss_clip": 1.05112863, "balance_loss_mlp": 1.02087152, "epoch": 0.581614862021283, "flos": 16143283503360.0, "grad_norm": 2.2520107122571473, "language_loss": 0.88862598, "learning_rate": 1.5717906724579943e-06, "loss": 0.91057819, "num_input_tokens_seen": 104299700, "step": 4837, "time_per_iteration": 2.5619089603424072 }, { "auxiliary_loss_clip": 0.01134265, "auxiliary_loss_mlp": 0.0102907, "balance_loss_clip": 1.04724085, "balance_loss_mlp": 1.02219117, "epoch": 0.581735104911922, "flos": 33802642298880.0, "grad_norm": 3.2064655212572553, "language_loss": 0.68437785, "learning_rate": 1.571029797096989e-06, "loss": 0.70601118, "num_input_tokens_seen": 104320805, "step": 4838, "time_per_iteration": 2.8186869621276855 }, { "auxiliary_loss_clip": 0.01174145, "auxiliary_loss_mlp": 0.01030221, "balance_loss_clip": 1.05216956, "balance_loss_mlp": 1.02339029, "epoch": 0.5818553478025612, "flos": 23331163029120.0, "grad_norm": 1.8457628003441515, "language_loss": 0.78953838, "learning_rate": 1.570268986810423e-06, "loss": 0.81158209, "num_input_tokens_seen": 104340700, "step": 4839, "time_per_iteration": 2.6088497638702393 }, { "auxiliary_loss_clip": 0.01141239, "auxiliary_loss_mlp": 0.01023905, "balance_loss_clip": 1.05003846, "balance_loss_mlp": 1.0169934, "epoch": 0.5819755906932003, "flos": 20996143603200.0, "grad_norm": 2.0210657302660033, "language_loss": 0.74444139, "learning_rate": 1.5695082417137096e-06, "loss": 0.76609278, "num_input_tokens_seen": 104358575, "step": 4840, "time_per_iteration": 2.672471523284912 }, { "auxiliary_loss_clip": 0.01141079, "auxiliary_loss_mlp": 0.0102145, "balance_loss_clip": 1.0464319, "balance_loss_mlp": 1.01505184, "epoch": 0.5820958335838393, "flos": 21431668008960.0, "grad_norm": 2.0106801034664303, "language_loss": 0.75441986, "learning_rate": 1.5687475619222539e-06, "loss": 0.77604508, "num_input_tokens_seen": 104378530, "step": 4841, "time_per_iteration": 2.6730704307556152 }, { "auxiliary_loss_clip": 0.01139442, "auxiliary_loss_mlp": 0.01030837, "balance_loss_clip": 1.04767108, "balance_loss_mlp": 1.02325499, "epoch": 0.5822160764744785, "flos": 17967473660160.0, "grad_norm": 2.2109316567983894, "language_loss": 0.73471683, "learning_rate": 1.5679869475514496e-06, "loss": 0.75641966, "num_input_tokens_seen": 104395465, "step": 4842, "time_per_iteration": 2.6847808361053467 }, { "auxiliary_loss_clip": 0.01161417, "auxiliary_loss_mlp": 0.01029809, "balance_loss_clip": 1.05229759, "balance_loss_mlp": 1.02172637, "epoch": 0.5823363193651175, "flos": 23033858158080.0, "grad_norm": 2.246595988084626, "language_loss": 0.8122623, "learning_rate": 1.567226398716682e-06, "loss": 0.83417463, "num_input_tokens_seen": 104415380, "step": 4843, "time_per_iteration": 2.6356375217437744 }, { "auxiliary_loss_clip": 0.01151658, "auxiliary_loss_mlp": 0.01020111, "balance_loss_clip": 1.05026138, "balance_loss_mlp": 1.01254117, "epoch": 0.5824565622557566, "flos": 32891840110080.0, "grad_norm": 1.8914226037798927, "language_loss": 0.61946666, "learning_rate": 1.566465915533326e-06, "loss": 0.64118433, "num_input_tokens_seen": 104437410, "step": 4844, "time_per_iteration": 3.657127857208252 }, { "auxiliary_loss_clip": 0.01158677, "auxiliary_loss_mlp": 0.01024427, "balance_loss_clip": 1.05143321, "balance_loss_mlp": 1.01729178, "epoch": 0.5825768051463958, "flos": 22229674513920.0, "grad_norm": 1.9079635762547762, "language_loss": 0.87742317, "learning_rate": 1.5657054981167458e-06, "loss": 0.8992542, "num_input_tokens_seen": 104456305, "step": 4845, "time_per_iteration": 2.684739112854004 }, { "auxiliary_loss_clip": 0.01156409, "auxiliary_loss_mlp": 0.01022673, "balance_loss_clip": 1.05061984, "balance_loss_mlp": 1.01628637, "epoch": 0.5826970480370348, "flos": 28001561016960.0, "grad_norm": 1.7454728271916453, "language_loss": 0.68316257, "learning_rate": 1.5649451465822965e-06, "loss": 0.70495343, "num_input_tokens_seen": 104477695, "step": 4846, "time_per_iteration": 4.425864219665527 }, { "auxiliary_loss_clip": 0.01113183, "auxiliary_loss_mlp": 0.01027264, "balance_loss_clip": 1.0496453, "balance_loss_mlp": 1.02013814, "epoch": 0.5828172909276739, "flos": 17858053854720.0, "grad_norm": 1.91340158271455, "language_loss": 0.83657354, "learning_rate": 1.5641848610453218e-06, "loss": 0.85797799, "num_input_tokens_seen": 104496355, "step": 4847, "time_per_iteration": 3.7416341304779053 }, { "auxiliary_loss_clip": 0.01154563, "auxiliary_loss_mlp": 0.01022234, "balance_loss_clip": 1.04930937, "balance_loss_mlp": 1.01477134, "epoch": 0.582937533818313, "flos": 19865244827520.0, "grad_norm": 5.22841351693418, "language_loss": 0.85938966, "learning_rate": 1.563424641621158e-06, "loss": 0.88115758, "num_input_tokens_seen": 104515535, "step": 4848, "time_per_iteration": 2.636188507080078 }, { "auxiliary_loss_clip": 0.01150816, "auxiliary_loss_mlp": 0.01026934, "balance_loss_clip": 1.05300343, "balance_loss_mlp": 1.01973045, "epoch": 0.5830577767089521, "flos": 26870734068480.0, "grad_norm": 2.0013951442007003, "language_loss": 0.69994622, "learning_rate": 1.5626644884251282e-06, "loss": 0.72172368, "num_input_tokens_seen": 104535055, "step": 4849, "time_per_iteration": 2.614429473876953 }, { "auxiliary_loss_clip": 0.0117665, "auxiliary_loss_mlp": 0.0102385, "balance_loss_clip": 1.05329156, "balance_loss_mlp": 1.01667118, "epoch": 0.5831780195995911, "flos": 25298205575040.0, "grad_norm": 4.636866130387408, "language_loss": 0.88236344, "learning_rate": 1.5619044015725488e-06, "loss": 0.90436852, "num_input_tokens_seen": 104554745, "step": 4850, "time_per_iteration": 2.606151819229126 }, { "auxiliary_loss_clip": 0.01184497, "auxiliary_loss_mlp": 0.01038226, "balance_loss_clip": 1.05790448, "balance_loss_mlp": 1.03020287, "epoch": 0.5832982624902303, "flos": 14756988049920.0, "grad_norm": 2.22097738193724, "language_loss": 0.86675626, "learning_rate": 1.5611443811787224e-06, "loss": 0.88898349, "num_input_tokens_seen": 104568870, "step": 4851, "time_per_iteration": 2.5466597080230713 }, { "auxiliary_loss_clip": 0.0115733, "auxiliary_loss_mlp": 0.0102481, "balance_loss_clip": 1.05155385, "balance_loss_mlp": 1.01757085, "epoch": 0.5834185053808694, "flos": 20444555376000.0, "grad_norm": 3.8458313535371977, "language_loss": 0.69155306, "learning_rate": 1.560384427358945e-06, "loss": 0.7133745, "num_input_tokens_seen": 104588415, "step": 4852, "time_per_iteration": 2.583078384399414 }, { "auxiliary_loss_clip": 0.01135251, "auxiliary_loss_mlp": 0.01022762, "balance_loss_clip": 1.04439807, "balance_loss_mlp": 1.0158236, "epoch": 0.5835387482715084, "flos": 27200394115200.0, "grad_norm": 1.6571417979528449, "language_loss": 0.73082054, "learning_rate": 1.5596245402284998e-06, "loss": 0.7524007, "num_input_tokens_seen": 104611940, "step": 4853, "time_per_iteration": 2.7184269428253174 }, { "auxiliary_loss_clip": 0.01162387, "auxiliary_loss_mlp": 0.01024151, "balance_loss_clip": 1.05397618, "balance_loss_mlp": 1.01701045, "epoch": 0.5836589911621476, "flos": 16654615562880.0, "grad_norm": 2.17457756777104, "language_loss": 0.81583881, "learning_rate": 1.5588647199026619e-06, "loss": 0.83770418, "num_input_tokens_seen": 104629675, "step": 4854, "time_per_iteration": 2.630803108215332 }, { "auxiliary_loss_clip": 0.01180879, "auxiliary_loss_mlp": 0.01033623, "balance_loss_clip": 1.05593228, "balance_loss_mlp": 1.02608895, "epoch": 0.5837792340527866, "flos": 20446817932800.0, "grad_norm": 2.3975802610072554, "language_loss": 0.87615275, "learning_rate": 1.5581049664966956e-06, "loss": 0.89829779, "num_input_tokens_seen": 104647435, "step": 4855, "time_per_iteration": 2.529834508895874 }, { "auxiliary_loss_clip": 0.01033834, "auxiliary_loss_mlp": 0.01001166, "balance_loss_clip": 1.03298521, "balance_loss_mlp": 0.99989665, "epoch": 0.5838994769434257, "flos": 65995480765440.0, "grad_norm": 0.9951351756454547, "language_loss": 0.65148401, "learning_rate": 1.5573452801258545e-06, "loss": 0.67183405, "num_input_tokens_seen": 104694605, "step": 4856, "time_per_iteration": 3.0720629692077637 }, { "auxiliary_loss_clip": 0.01165514, "auxiliary_loss_mlp": 0.01024638, "balance_loss_clip": 1.05233908, "balance_loss_mlp": 1.01758659, "epoch": 0.5840197198340649, "flos": 21470523546240.0, "grad_norm": 3.5565900468832954, "language_loss": 0.63792157, "learning_rate": 1.5565856609053824e-06, "loss": 0.65982306, "num_input_tokens_seen": 104713400, "step": 4857, "time_per_iteration": 2.6425485610961914 }, { "auxiliary_loss_clip": 0.01178639, "auxiliary_loss_mlp": 0.010215, "balance_loss_clip": 1.05411243, "balance_loss_mlp": 1.01426971, "epoch": 0.5841399627247039, "flos": 19135144984320.0, "grad_norm": 2.2173775611566033, "language_loss": 0.80245876, "learning_rate": 1.5558261089505127e-06, "loss": 0.82446015, "num_input_tokens_seen": 104732130, "step": 4858, "time_per_iteration": 2.5299506187438965 }, { "auxiliary_loss_clip": 0.01159806, "auxiliary_loss_mlp": 0.01030911, "balance_loss_clip": 1.05142069, "balance_loss_mlp": 1.02385402, "epoch": 0.584260205615343, "flos": 26425692558720.0, "grad_norm": 1.911776874175247, "language_loss": 0.80111206, "learning_rate": 1.5550666243764697e-06, "loss": 0.82301915, "num_input_tokens_seen": 104750290, "step": 4859, "time_per_iteration": 2.6814239025115967 }, { "auxiliary_loss_clip": 0.01160046, "auxiliary_loss_mlp": 0.01027948, "balance_loss_clip": 1.05308533, "balance_loss_mlp": 1.02094126, "epoch": 0.584380448505982, "flos": 13881809174400.0, "grad_norm": 2.4555312215710634, "language_loss": 0.77492768, "learning_rate": 1.554307207298465e-06, "loss": 0.79680765, "num_input_tokens_seen": 104768550, "step": 4860, "time_per_iteration": 2.5820045471191406 }, { "auxiliary_loss_clip": 0.01179551, "auxiliary_loss_mlp": 0.01030634, "balance_loss_clip": 1.05415118, "balance_loss_mlp": 1.02338636, "epoch": 0.5845006913966212, "flos": 21543709507200.0, "grad_norm": 1.9047883252369648, "language_loss": 0.78424615, "learning_rate": 1.553547857831704e-06, "loss": 0.80634797, "num_input_tokens_seen": 104785060, "step": 4861, "time_per_iteration": 2.6335389614105225 }, { "auxiliary_loss_clip": 0.01095291, "auxiliary_loss_mlp": 0.01002588, "balance_loss_clip": 1.03514481, "balance_loss_mlp": 1.00120544, "epoch": 0.5846209342872603, "flos": 58375452712320.0, "grad_norm": 0.9031912646091538, "language_loss": 0.64166713, "learning_rate": 1.5527885760913771e-06, "loss": 0.66264582, "num_input_tokens_seen": 104834950, "step": 4862, "time_per_iteration": 2.9804656505584717 }, { "auxiliary_loss_clip": 0.0114257, "auxiliary_loss_mlp": 0.01025674, "balance_loss_clip": 1.04954934, "balance_loss_mlp": 1.01919782, "epoch": 0.5847411771778993, "flos": 18588045957120.0, "grad_norm": 2.2532036824440587, "language_loss": 0.76303744, "learning_rate": 1.552029362192668e-06, "loss": 0.78471988, "num_input_tokens_seen": 104854210, "step": 4863, "time_per_iteration": 2.721027374267578 }, { "auxiliary_loss_clip": 0.0112511, "auxiliary_loss_mlp": 0.0102699, "balance_loss_clip": 1.04806626, "balance_loss_mlp": 1.01932478, "epoch": 0.5848614200685385, "flos": 24240780069120.0, "grad_norm": 2.7101434877218313, "language_loss": 0.725425, "learning_rate": 1.5512702162507478e-06, "loss": 0.74694598, "num_input_tokens_seen": 104874525, "step": 4864, "time_per_iteration": 2.7205123901367188 }, { "auxiliary_loss_clip": 0.0106867, "auxiliary_loss_mlp": 0.01001167, "balance_loss_clip": 1.03404951, "balance_loss_mlp": 0.99983186, "epoch": 0.5849816629591775, "flos": 71660245933440.0, "grad_norm": 1.1476479600165226, "language_loss": 0.5583086, "learning_rate": 1.5505111383807792e-06, "loss": 0.57900703, "num_input_tokens_seen": 104937195, "step": 4865, "time_per_iteration": 3.263188362121582 }, { "auxiliary_loss_clip": 0.01115817, "auxiliary_loss_mlp": 0.01030074, "balance_loss_clip": 1.04583943, "balance_loss_mlp": 1.02246833, "epoch": 0.5851019058498166, "flos": 23802095266560.0, "grad_norm": 1.9431771096603043, "language_loss": 0.80495006, "learning_rate": 1.5497521286979138e-06, "loss": 0.82640898, "num_input_tokens_seen": 104957435, "step": 4866, "time_per_iteration": 2.7689871788024902 }, { "auxiliary_loss_clip": 0.01133951, "auxiliary_loss_mlp": 0.01029971, "balance_loss_clip": 1.04817486, "balance_loss_mlp": 1.02190614, "epoch": 0.5852221487404557, "flos": 24388516707840.0, "grad_norm": 2.903268544346625, "language_loss": 0.74686003, "learning_rate": 1.5489931873172927e-06, "loss": 0.7684992, "num_input_tokens_seen": 104978755, "step": 4867, "time_per_iteration": 2.6913259029388428 }, { "auxiliary_loss_clip": 0.01082276, "auxiliary_loss_mlp": 0.01025426, "balance_loss_clip": 1.04190564, "balance_loss_mlp": 1.01802325, "epoch": 0.5853423916310948, "flos": 27271425260160.0, "grad_norm": 2.2335302432174626, "language_loss": 0.79064155, "learning_rate": 1.5482343143540467e-06, "loss": 0.81171852, "num_input_tokens_seen": 105000020, "step": 4868, "time_per_iteration": 2.837665319442749 }, { "auxiliary_loss_clip": 0.01130756, "auxiliary_loss_mlp": 0.00710658, "balance_loss_clip": 1.04825878, "balance_loss_mlp": 1.00047839, "epoch": 0.5854626345217339, "flos": 11983786611840.0, "grad_norm": 5.9027749064204285, "language_loss": 0.83202076, "learning_rate": 1.547475509923295e-06, "loss": 0.85043484, "num_input_tokens_seen": 105017060, "step": 4869, "time_per_iteration": 2.6225500106811523 }, { "auxiliary_loss_clip": 0.01044434, "auxiliary_loss_mlp": 0.01002187, "balance_loss_clip": 1.03097284, "balance_loss_mlp": 1.00082207, "epoch": 0.585582877412373, "flos": 64342335173760.0, "grad_norm": 0.7267320834853621, "language_loss": 0.56078178, "learning_rate": 1.5467167741401495e-06, "loss": 0.58124799, "num_input_tokens_seen": 105078540, "step": 4870, "time_per_iteration": 4.181311368942261 }, { "auxiliary_loss_clip": 0.01144306, "auxiliary_loss_mlp": 0.01026117, "balance_loss_clip": 1.04980695, "balance_loss_mlp": 1.01886892, "epoch": 0.5857031203030121, "flos": 17011926103680.0, "grad_norm": 2.42781107453324, "language_loss": 0.71694541, "learning_rate": 1.5459581071197083e-06, "loss": 0.73864961, "num_input_tokens_seen": 105094200, "step": 4871, "time_per_iteration": 2.624821186065674 }, { "auxiliary_loss_clip": 0.01165234, "auxiliary_loss_mlp": 0.01028532, "balance_loss_clip": 1.05495524, "balance_loss_mlp": 1.02168036, "epoch": 0.5858233631936511, "flos": 20885682303360.0, "grad_norm": 2.4107313159551236, "language_loss": 0.83221877, "learning_rate": 1.5451995089770624e-06, "loss": 0.85415649, "num_input_tokens_seen": 105113985, "step": 4872, "time_per_iteration": 4.575282335281372 }, { "auxiliary_loss_clip": 0.01175378, "auxiliary_loss_mlp": 0.01026227, "balance_loss_clip": 1.05373096, "balance_loss_mlp": 1.02003407, "epoch": 0.5859436060842903, "flos": 23191902000000.0, "grad_norm": 1.5613616596315454, "language_loss": 0.71989155, "learning_rate": 1.5444409798272885e-06, "loss": 0.7419076, "num_input_tokens_seen": 105138075, "step": 4873, "time_per_iteration": 3.5765058994293213 }, { "auxiliary_loss_clip": 0.01129883, "auxiliary_loss_mlp": 0.01024852, "balance_loss_clip": 1.04798865, "balance_loss_mlp": 1.01665688, "epoch": 0.5860638489749294, "flos": 22492648961280.0, "grad_norm": 1.888537795807569, "language_loss": 0.80795592, "learning_rate": 1.543682519785456e-06, "loss": 0.8295033, "num_input_tokens_seen": 105156555, "step": 4874, "time_per_iteration": 2.7213618755340576 }, { "auxiliary_loss_clip": 0.01144474, "auxiliary_loss_mlp": 0.01026986, "balance_loss_clip": 1.04932094, "balance_loss_mlp": 1.01963067, "epoch": 0.5861840918655684, "flos": 17566243764480.0, "grad_norm": 2.621532491562094, "language_loss": 0.80257368, "learning_rate": 1.5429241289666219e-06, "loss": 0.82428825, "num_input_tokens_seen": 105174055, "step": 4875, "time_per_iteration": 2.6190314292907715 }, { "auxiliary_loss_clip": 0.01137859, "auxiliary_loss_mlp": 0.01029792, "balance_loss_clip": 1.04961872, "balance_loss_mlp": 1.02251434, "epoch": 0.5863043347562076, "flos": 25556152118400.0, "grad_norm": 1.9741810841345069, "language_loss": 0.69906777, "learning_rate": 1.5421658074858342e-06, "loss": 0.72074425, "num_input_tokens_seen": 105192160, "step": 4876, "time_per_iteration": 2.7905428409576416 }, { "auxiliary_loss_clip": 0.01139863, "auxiliary_loss_mlp": 0.01026069, "balance_loss_clip": 1.049034, "balance_loss_mlp": 1.01818931, "epoch": 0.5864245776468466, "flos": 20667525050880.0, "grad_norm": 2.2551255238337253, "language_loss": 0.66269356, "learning_rate": 1.5414075554581298e-06, "loss": 0.68435287, "num_input_tokens_seen": 105210205, "step": 4877, "time_per_iteration": 2.8057749271392822 }, { "auxiliary_loss_clip": 0.01181431, "auxiliary_loss_mlp": 0.01027168, "balance_loss_clip": 1.05517554, "balance_loss_mlp": 1.02009273, "epoch": 0.5865448205374857, "flos": 28913907490560.0, "grad_norm": 2.5495434745617955, "language_loss": 0.78917336, "learning_rate": 1.5406493729985348e-06, "loss": 0.81125939, "num_input_tokens_seen": 105229400, "step": 4878, "time_per_iteration": 2.613060712814331 }, { "auxiliary_loss_clip": 0.01120454, "auxiliary_loss_mlp": 0.00711702, "balance_loss_clip": 1.05006588, "balance_loss_mlp": 1.00043416, "epoch": 0.5866650634281249, "flos": 25842575168640.0, "grad_norm": 4.252785049083066, "language_loss": 0.71924675, "learning_rate": 1.5398912602220644e-06, "loss": 0.73756832, "num_input_tokens_seen": 105248675, "step": 4879, "time_per_iteration": 2.7190651893615723 }, { "auxiliary_loss_clip": 0.01126043, "auxiliary_loss_mlp": 0.01027523, "balance_loss_clip": 1.04890728, "balance_loss_mlp": 1.02043855, "epoch": 0.5867853063187639, "flos": 17052325925760.0, "grad_norm": 2.7953438625908062, "language_loss": 0.7899918, "learning_rate": 1.539133217243724e-06, "loss": 0.81152749, "num_input_tokens_seen": 105265695, "step": 4880, "time_per_iteration": 2.705998182296753 }, { "auxiliary_loss_clip": 0.01137853, "auxiliary_loss_mlp": 0.01026789, "balance_loss_clip": 1.05043554, "balance_loss_mlp": 1.01899827, "epoch": 0.586905549209403, "flos": 24645026707200.0, "grad_norm": 2.405632215837226, "language_loss": 0.76204783, "learning_rate": 1.5383752441785081e-06, "loss": 0.78369427, "num_input_tokens_seen": 105284920, "step": 4881, "time_per_iteration": 2.6702375411987305 }, { "auxiliary_loss_clip": 0.01165717, "auxiliary_loss_mlp": 0.01029685, "balance_loss_clip": 1.05310714, "balance_loss_mlp": 1.02258289, "epoch": 0.5870257921000421, "flos": 14720538723840.0, "grad_norm": 2.4391567991371264, "language_loss": 0.86135155, "learning_rate": 1.5376173411414003e-06, "loss": 0.88330555, "num_input_tokens_seen": 105302960, "step": 4882, "time_per_iteration": 2.6594176292419434 }, { "auxiliary_loss_clip": 0.01145928, "auxiliary_loss_mlp": 0.01026686, "balance_loss_clip": 1.04801631, "balance_loss_mlp": 1.01947069, "epoch": 0.5871460349906812, "flos": 23914998691200.0, "grad_norm": 2.5423493612421058, "language_loss": 0.78928739, "learning_rate": 1.5368595082473753e-06, "loss": 0.81101346, "num_input_tokens_seen": 105321260, "step": 4883, "time_per_iteration": 2.731335401535034 }, { "auxiliary_loss_clip": 0.01161677, "auxiliary_loss_mlp": 0.0102472, "balance_loss_clip": 1.05029917, "balance_loss_mlp": 1.0176506, "epoch": 0.5872662778813202, "flos": 22164174063360.0, "grad_norm": 1.8325466386911873, "language_loss": 0.78464043, "learning_rate": 1.5361017456113935e-06, "loss": 0.80650443, "num_input_tokens_seen": 105341610, "step": 4884, "time_per_iteration": 2.6315770149230957 }, { "auxiliary_loss_clip": 0.01161464, "auxiliary_loss_mlp": 0.01029673, "balance_loss_clip": 1.05057335, "balance_loss_mlp": 1.02165055, "epoch": 0.5873865207719594, "flos": 18441925430400.0, "grad_norm": 3.4760686943007904, "language_loss": 0.86276573, "learning_rate": 1.5353440533484085e-06, "loss": 0.88467705, "num_input_tokens_seen": 105360465, "step": 4885, "time_per_iteration": 2.638718366622925 }, { "auxiliary_loss_clip": 0.0114725, "auxiliary_loss_mlp": 0.01028246, "balance_loss_clip": 1.05096543, "balance_loss_mlp": 1.02067065, "epoch": 0.5875067636625985, "flos": 54015321427200.0, "grad_norm": 3.259906564057346, "language_loss": 0.65799916, "learning_rate": 1.534586431573361e-06, "loss": 0.67975414, "num_input_tokens_seen": 105385405, "step": 4886, "time_per_iteration": 2.9021096229553223 }, { "auxiliary_loss_clip": 0.01097729, "auxiliary_loss_mlp": 0.01025457, "balance_loss_clip": 1.04340863, "balance_loss_mlp": 1.01756811, "epoch": 0.5876270065532375, "flos": 27995707100160.0, "grad_norm": 2.3188299064258033, "language_loss": 0.79296541, "learning_rate": 1.5338288804011817e-06, "loss": 0.8141973, "num_input_tokens_seen": 105404905, "step": 4887, "time_per_iteration": 2.791437864303589 }, { "auxiliary_loss_clip": 0.01141396, "auxiliary_loss_mlp": 0.01030574, "balance_loss_clip": 1.04765725, "balance_loss_mlp": 1.0233345, "epoch": 0.5877472494438767, "flos": 21361462876800.0, "grad_norm": 2.3015757528725977, "language_loss": 0.71220255, "learning_rate": 1.533071399946791e-06, "loss": 0.73392218, "num_input_tokens_seen": 105423650, "step": 4888, "time_per_iteration": 2.7080464363098145 }, { "auxiliary_loss_clip": 0.01146602, "auxiliary_loss_mlp": 0.01023578, "balance_loss_clip": 1.04924965, "balance_loss_mlp": 1.01615691, "epoch": 0.5878674923345157, "flos": 22383013674240.0, "grad_norm": 1.9376236641609437, "language_loss": 0.56908429, "learning_rate": 1.5323139903250977e-06, "loss": 0.5907861, "num_input_tokens_seen": 105444255, "step": 4889, "time_per_iteration": 2.745948553085327 }, { "auxiliary_loss_clip": 0.01148409, "auxiliary_loss_mlp": 0.01031024, "balance_loss_clip": 1.05209529, "balance_loss_mlp": 1.02362072, "epoch": 0.5879877352251548, "flos": 21868664872320.0, "grad_norm": 1.716503381427458, "language_loss": 0.77084798, "learning_rate": 1.5315566516510002e-06, "loss": 0.79264235, "num_input_tokens_seen": 105462425, "step": 4890, "time_per_iteration": 2.6604678630828857 }, { "auxiliary_loss_clip": 0.01176172, "auxiliary_loss_mlp": 0.01029847, "balance_loss_clip": 1.05279636, "balance_loss_mlp": 1.02218175, "epoch": 0.5881079781157939, "flos": 17493811989120.0, "grad_norm": 2.0972976889503343, "language_loss": 0.67583531, "learning_rate": 1.5307993840393857e-06, "loss": 0.69789553, "num_input_tokens_seen": 105480505, "step": 4891, "time_per_iteration": 2.6145222187042236 }, { "auxiliary_loss_clip": 0.01176469, "auxiliary_loss_mlp": 0.01026505, "balance_loss_clip": 1.0520817, "balance_loss_mlp": 1.01996076, "epoch": 0.588228221006433, "flos": 22601853285120.0, "grad_norm": 2.0287407811099807, "language_loss": 0.8031863, "learning_rate": 1.530042187605132e-06, "loss": 0.82521605, "num_input_tokens_seen": 105499760, "step": 4892, "time_per_iteration": 2.588059186935425 }, { "auxiliary_loss_clip": 0.01162642, "auxiliary_loss_mlp": 0.00711351, "balance_loss_clip": 1.052284, "balance_loss_mlp": 1.00051856, "epoch": 0.5883484638970721, "flos": 26176939896960.0, "grad_norm": 1.4041731814337655, "language_loss": 0.84211028, "learning_rate": 1.5292850624631044e-06, "loss": 0.86085021, "num_input_tokens_seen": 105521955, "step": 4893, "time_per_iteration": 2.6916754245758057 }, { "auxiliary_loss_clip": 0.01158359, "auxiliary_loss_mlp": 0.01021935, "balance_loss_clip": 1.05219519, "balance_loss_mlp": 1.0144012, "epoch": 0.5884687067877111, "flos": 30443737691520.0, "grad_norm": 1.957821775618845, "language_loss": 0.80454671, "learning_rate": 1.5285280087281593e-06, "loss": 0.82634962, "num_input_tokens_seen": 105542685, "step": 4894, "time_per_iteration": 2.675417423248291 }, { "auxiliary_loss_clip": 0.01069331, "auxiliary_loss_mlp": 0.01002892, "balance_loss_clip": 1.03222942, "balance_loss_mlp": 1.0013541, "epoch": 0.5885889496783503, "flos": 70507550580480.0, "grad_norm": 1.4751015530615021, "language_loss": 0.56570047, "learning_rate": 1.5277710265151398e-06, "loss": 0.58642268, "num_input_tokens_seen": 105612165, "step": 4895, "time_per_iteration": 3.37725830078125 }, { "auxiliary_loss_clip": 0.01159528, "auxiliary_loss_mlp": 0.01027692, "balance_loss_clip": 1.0502429, "balance_loss_mlp": 1.0198127, "epoch": 0.5887091925689893, "flos": 19098767485440.0, "grad_norm": 3.0569814029242486, "language_loss": 0.76425231, "learning_rate": 1.5270141159388803e-06, "loss": 0.78612459, "num_input_tokens_seen": 105629185, "step": 4896, "time_per_iteration": 3.494997978210449 }, { "auxiliary_loss_clip": 0.01176947, "auxiliary_loss_mlp": 0.01027186, "balance_loss_clip": 1.05212593, "balance_loss_mlp": 1.01968729, "epoch": 0.5888294354596284, "flos": 23294282739840.0, "grad_norm": 5.528346806672068, "language_loss": 0.8057341, "learning_rate": 1.526257277114203e-06, "loss": 0.82777542, "num_input_tokens_seen": 105650260, "step": 4897, "time_per_iteration": 2.575868606567383 }, { "auxiliary_loss_clip": 0.01144054, "auxiliary_loss_mlp": 0.01025185, "balance_loss_clip": 1.05186987, "balance_loss_mlp": 1.01787758, "epoch": 0.5889496783502676, "flos": 21981532383360.0, "grad_norm": 2.5170060531433527, "language_loss": 0.79589516, "learning_rate": 1.5255005101559201e-06, "loss": 0.81758755, "num_input_tokens_seen": 105667870, "step": 4898, "time_per_iteration": 4.496236562728882 }, { "auxiliary_loss_clip": 0.01163837, "auxiliary_loss_mlp": 0.01019746, "balance_loss_clip": 1.05097151, "balance_loss_mlp": 1.01282024, "epoch": 0.5890699212409066, "flos": 21685233093120.0, "grad_norm": 2.86459512080434, "language_loss": 0.76691401, "learning_rate": 1.524743815178833e-06, "loss": 0.78874981, "num_input_tokens_seen": 105685830, "step": 4899, "time_per_iteration": 3.5699622631073 }, { "auxiliary_loss_clip": 0.01147944, "auxiliary_loss_mlp": 0.01025386, "balance_loss_clip": 1.0493865, "balance_loss_mlp": 1.01805758, "epoch": 0.5891901641315457, "flos": 19464553635840.0, "grad_norm": 3.7291095429900984, "language_loss": 0.8082478, "learning_rate": 1.5239871922977315e-06, "loss": 0.82998103, "num_input_tokens_seen": 105705745, "step": 4900, "time_per_iteration": 2.6502983570098877 }, { "auxiliary_loss_clip": 0.01143918, "auxiliary_loss_mlp": 0.01024985, "balance_loss_clip": 1.04914892, "balance_loss_mlp": 1.01762938, "epoch": 0.5893104070221848, "flos": 19609884063360.0, "grad_norm": 2.4439078043696347, "language_loss": 0.90362895, "learning_rate": 1.523230641627394e-06, "loss": 0.925318, "num_input_tokens_seen": 105724730, "step": 4901, "time_per_iteration": 2.64180850982666 }, { "auxiliary_loss_clip": 0.01115198, "auxiliary_loss_mlp": 0.01024272, "balance_loss_clip": 1.04329515, "balance_loss_mlp": 1.01633835, "epoch": 0.5894306499128239, "flos": 29060063930880.0, "grad_norm": 3.294903716005547, "language_loss": 0.7248826, "learning_rate": 1.5224741632825888e-06, "loss": 0.74627733, "num_input_tokens_seen": 105744920, "step": 4902, "time_per_iteration": 2.8293685913085938 }, { "auxiliary_loss_clip": 0.01181342, "auxiliary_loss_mlp": 0.01029047, "balance_loss_clip": 1.05594087, "balance_loss_mlp": 1.02098846, "epoch": 0.589550892803463, "flos": 42298890721920.0, "grad_norm": 1.8966596068539536, "language_loss": 0.69204336, "learning_rate": 1.521717757378074e-06, "loss": 0.71414721, "num_input_tokens_seen": 105765465, "step": 4903, "time_per_iteration": 2.783102035522461 }, { "auxiliary_loss_clip": 0.01167425, "auxiliary_loss_mlp": 0.01028139, "balance_loss_clip": 1.05306637, "balance_loss_mlp": 1.02040839, "epoch": 0.5896711356941021, "flos": 14137062197760.0, "grad_norm": 3.1442200822960005, "language_loss": 0.69319737, "learning_rate": 1.5209614240285943e-06, "loss": 0.71515298, "num_input_tokens_seen": 105783120, "step": 4904, "time_per_iteration": 2.6738810539245605 }, { "auxiliary_loss_clip": 0.0117844, "auxiliary_loss_mlp": 0.00711531, "balance_loss_clip": 1.05358839, "balance_loss_mlp": 1.00049996, "epoch": 0.5897913785847412, "flos": 17201355454080.0, "grad_norm": 2.2164525538315814, "language_loss": 0.84706795, "learning_rate": 1.520205163348887e-06, "loss": 0.86596769, "num_input_tokens_seen": 105801055, "step": 4905, "time_per_iteration": 2.5603091716766357 }, { "auxiliary_loss_clip": 0.01060011, "auxiliary_loss_mlp": 0.01002123, "balance_loss_clip": 1.03333187, "balance_loss_mlp": 1.00075197, "epoch": 0.5899116214753802, "flos": 48794164202880.0, "grad_norm": 0.7245365946040307, "language_loss": 0.56925726, "learning_rate": 1.519448975453674e-06, "loss": 0.58987856, "num_input_tokens_seen": 105856155, "step": 4906, "time_per_iteration": 3.140627861022949 }, { "auxiliary_loss_clip": 0.01163795, "auxiliary_loss_mlp": 0.00712205, "balance_loss_clip": 1.05487049, "balance_loss_mlp": 1.00060248, "epoch": 0.5900318643660194, "flos": 21103659987840.0, "grad_norm": 2.1440369946526348, "language_loss": 0.75729954, "learning_rate": 1.5186928604576696e-06, "loss": 0.77605945, "num_input_tokens_seen": 105873350, "step": 4907, "time_per_iteration": 2.6019434928894043 }, { "auxiliary_loss_clip": 0.01147105, "auxiliary_loss_mlp": 0.01024318, "balance_loss_clip": 1.05032229, "balance_loss_mlp": 1.01746106, "epoch": 0.5901521072566585, "flos": 21178390233600.0, "grad_norm": 2.349152345863878, "language_loss": 0.77139211, "learning_rate": 1.5179368184755752e-06, "loss": 0.79310632, "num_input_tokens_seen": 105891435, "step": 4908, "time_per_iteration": 2.634190797805786 }, { "auxiliary_loss_clip": 0.01147539, "auxiliary_loss_mlp": 0.01027662, "balance_loss_clip": 1.0527153, "balance_loss_mlp": 1.02004731, "epoch": 0.5902723501472975, "flos": 20225967160320.0, "grad_norm": 3.599546547806237, "language_loss": 0.82450438, "learning_rate": 1.5171808496220821e-06, "loss": 0.84625638, "num_input_tokens_seen": 105910190, "step": 4909, "time_per_iteration": 2.6961045265197754 }, { "auxiliary_loss_clip": 0.01151274, "auxiliary_loss_mlp": 0.01021216, "balance_loss_clip": 1.05102634, "balance_loss_mlp": 1.01405418, "epoch": 0.5903925930379367, "flos": 22964407211520.0, "grad_norm": 1.972124010918994, "language_loss": 0.8120898, "learning_rate": 1.5164249540118708e-06, "loss": 0.83381468, "num_input_tokens_seen": 105929315, "step": 4910, "time_per_iteration": 2.6935853958129883 }, { "auxiliary_loss_clip": 0.01105239, "auxiliary_loss_mlp": 0.01025861, "balance_loss_clip": 1.04732537, "balance_loss_mlp": 1.01820171, "epoch": 0.5905128359285757, "flos": 23367720096000.0, "grad_norm": 1.744944936324418, "language_loss": 0.83066756, "learning_rate": 1.5156691317596093e-06, "loss": 0.8519786, "num_input_tokens_seen": 105950740, "step": 4911, "time_per_iteration": 2.790357828140259 }, { "auxiliary_loss_clip": 0.01165864, "auxiliary_loss_mlp": 0.00711199, "balance_loss_clip": 1.05292058, "balance_loss_mlp": 1.00049877, "epoch": 0.5906330788192148, "flos": 28032335994240.0, "grad_norm": 2.1253876242813026, "language_loss": 0.66696668, "learning_rate": 1.5149133829799556e-06, "loss": 0.68573737, "num_input_tokens_seen": 105968735, "step": 4912, "time_per_iteration": 2.6626813411712646 }, { "auxiliary_loss_clip": 0.01154824, "auxiliary_loss_mlp": 0.01027567, "balance_loss_clip": 1.05226004, "balance_loss_mlp": 1.02060771, "epoch": 0.590753321709854, "flos": 18477943793280.0, "grad_norm": 2.296310137575368, "language_loss": 0.81305039, "learning_rate": 1.5141577077875556e-06, "loss": 0.83487427, "num_input_tokens_seen": 105986060, "step": 4913, "time_per_iteration": 2.680265426635742 }, { "auxiliary_loss_clip": 0.01165987, "auxiliary_loss_mlp": 0.01024027, "balance_loss_clip": 1.05348051, "balance_loss_mlp": 1.01739264, "epoch": 0.590873564600493, "flos": 16873706568960.0, "grad_norm": 2.4211677129827915, "language_loss": 0.72697413, "learning_rate": 1.5134021062970451e-06, "loss": 0.74887431, "num_input_tokens_seen": 106004440, "step": 4914, "time_per_iteration": 2.599745035171509 }, { "auxiliary_loss_clip": 0.01124554, "auxiliary_loss_mlp": 0.01027289, "balance_loss_clip": 1.05216432, "balance_loss_mlp": 1.02038038, "epoch": 0.5909938074911321, "flos": 13516166678400.0, "grad_norm": 3.341166925152328, "language_loss": 0.81108814, "learning_rate": 1.5126465786230483e-06, "loss": 0.83260655, "num_input_tokens_seen": 106021215, "step": 4915, "time_per_iteration": 2.6922965049743652 }, { "auxiliary_loss_clip": 0.01177023, "auxiliary_loss_mlp": 0.01024667, "balance_loss_clip": 1.05349576, "balance_loss_mlp": 1.0169481, "epoch": 0.5911140503817712, "flos": 26024067613440.0, "grad_norm": 2.9691673376294356, "language_loss": 0.8186419, "learning_rate": 1.5118911248801787e-06, "loss": 0.84065878, "num_input_tokens_seen": 106039225, "step": 4916, "time_per_iteration": 2.59309458732605 }, { "auxiliary_loss_clip": 0.01157784, "auxiliary_loss_mlp": 0.01024166, "balance_loss_clip": 1.05085218, "balance_loss_mlp": 1.01691771, "epoch": 0.5912342932724103, "flos": 23258731253760.0, "grad_norm": 2.329953687491871, "language_loss": 0.79604137, "learning_rate": 1.5111357451830364e-06, "loss": 0.8178609, "num_input_tokens_seen": 106057920, "step": 4917, "time_per_iteration": 2.653085231781006 }, { "auxiliary_loss_clip": 0.0116424, "auxiliary_loss_mlp": 0.01028181, "balance_loss_clip": 1.05286562, "balance_loss_mlp": 1.02107346, "epoch": 0.5913545361630493, "flos": 19573039687680.0, "grad_norm": 1.872924093578093, "language_loss": 0.70895982, "learning_rate": 1.5103804396462131e-06, "loss": 0.73088402, "num_input_tokens_seen": 106077855, "step": 4918, "time_per_iteration": 2.591456890106201 }, { "auxiliary_loss_clip": 0.01167683, "auxiliary_loss_mlp": 0.01026542, "balance_loss_clip": 1.05304599, "balance_loss_mlp": 1.01855254, "epoch": 0.5914747790536885, "flos": 26213532877440.0, "grad_norm": 2.2212587589069663, "language_loss": 0.80255157, "learning_rate": 1.5096252083842877e-06, "loss": 0.82449383, "num_input_tokens_seen": 106097065, "step": 4919, "time_per_iteration": 2.7147579193115234 }, { "auxiliary_loss_clip": 0.0116103, "auxiliary_loss_mlp": 0.01026248, "balance_loss_clip": 1.05105054, "balance_loss_mlp": 1.01866639, "epoch": 0.5915950219443276, "flos": 27417545786880.0, "grad_norm": 2.1910889642114393, "language_loss": 0.85408127, "learning_rate": 1.5088700515118285e-06, "loss": 0.87595403, "num_input_tokens_seen": 106116385, "step": 4920, "time_per_iteration": 2.659400463104248 }, { "auxiliary_loss_clip": 0.01126762, "auxiliary_loss_mlp": 0.0102845, "balance_loss_clip": 1.04914784, "balance_loss_mlp": 1.02097845, "epoch": 0.5917152648349666, "flos": 21907879545600.0, "grad_norm": 3.3624571377998245, "language_loss": 0.6639533, "learning_rate": 1.508114969143392e-06, "loss": 0.68550545, "num_input_tokens_seen": 106136370, "step": 4921, "time_per_iteration": 2.7868707180023193 }, { "auxiliary_loss_clip": 0.011477, "auxiliary_loss_mlp": 0.01027868, "balance_loss_clip": 1.04831314, "balance_loss_mlp": 1.02094185, "epoch": 0.5918355077256057, "flos": 28109185142400.0, "grad_norm": 1.5516576923807626, "language_loss": 0.77619088, "learning_rate": 1.5073599613935238e-06, "loss": 0.79794657, "num_input_tokens_seen": 106158490, "step": 4922, "time_per_iteration": 3.653902769088745 }, { "auxiliary_loss_clip": 0.01146869, "auxiliary_loss_mlp": 0.01025933, "balance_loss_clip": 1.05086637, "balance_loss_mlp": 1.01841402, "epoch": 0.5919557506162448, "flos": 28183807647360.0, "grad_norm": 1.927861482982085, "language_loss": 0.57420349, "learning_rate": 1.5066050283767574e-06, "loss": 0.59593147, "num_input_tokens_seen": 106179170, "step": 4923, "time_per_iteration": 4.565190315246582 }, { "auxiliary_loss_clip": 0.01143453, "auxiliary_loss_mlp": 0.01024737, "balance_loss_clip": 1.0511961, "balance_loss_mlp": 1.01771009, "epoch": 0.5920759935068839, "flos": 12094355652480.0, "grad_norm": 1.9305671242679634, "language_loss": 0.82588899, "learning_rate": 1.505850170207616e-06, "loss": 0.8475709, "num_input_tokens_seen": 106196035, "step": 4924, "time_per_iteration": 3.5103137493133545 }, { "auxiliary_loss_clip": 0.01147165, "auxiliary_loss_mlp": 0.01028271, "balance_loss_clip": 1.05142367, "balance_loss_mlp": 1.0210743, "epoch": 0.592196236397523, "flos": 29424772673280.0, "grad_norm": 2.375657894608736, "language_loss": 0.78314275, "learning_rate": 1.505095387000611e-06, "loss": 0.80489713, "num_input_tokens_seen": 106218335, "step": 4925, "time_per_iteration": 2.7125377655029297 }, { "auxiliary_loss_clip": 0.0114104, "auxiliary_loss_mlp": 0.01029948, "balance_loss_clip": 1.052037, "balance_loss_mlp": 1.02240193, "epoch": 0.5923164792881621, "flos": 24384709866240.0, "grad_norm": 1.9292037790082814, "language_loss": 0.74427944, "learning_rate": 1.504340678870242e-06, "loss": 0.7659893, "num_input_tokens_seen": 106236550, "step": 4926, "time_per_iteration": 2.6419620513916016 }, { "auxiliary_loss_clip": 0.01162222, "auxiliary_loss_mlp": 0.01027638, "balance_loss_clip": 1.05257058, "balance_loss_mlp": 1.01988888, "epoch": 0.5924367221788012, "flos": 24024238928640.0, "grad_norm": 2.025040794719769, "language_loss": 0.90004796, "learning_rate": 1.5035860459309989e-06, "loss": 0.92194659, "num_input_tokens_seen": 106254265, "step": 4927, "time_per_iteration": 2.651287794113159 }, { "auxiliary_loss_clip": 0.01143307, "auxiliary_loss_mlp": 0.01029899, "balance_loss_clip": 1.05064714, "balance_loss_mlp": 1.02256775, "epoch": 0.5925569650694402, "flos": 26870590414080.0, "grad_norm": 5.154690733818783, "language_loss": 0.63711327, "learning_rate": 1.5028314882973568e-06, "loss": 0.65884531, "num_input_tokens_seen": 106274670, "step": 4928, "time_per_iteration": 2.6868457794189453 }, { "auxiliary_loss_clip": 0.01146405, "auxiliary_loss_mlp": 0.01025767, "balance_loss_clip": 1.05102086, "balance_loss_mlp": 1.01825428, "epoch": 0.5926772079600794, "flos": 22302788647680.0, "grad_norm": 2.201248471870075, "language_loss": 0.84645247, "learning_rate": 1.502077006083783e-06, "loss": 0.8681742, "num_input_tokens_seen": 106293330, "step": 4929, "time_per_iteration": 2.6593992710113525 }, { "auxiliary_loss_clip": 0.01167185, "auxiliary_loss_mlp": 0.00711423, "balance_loss_clip": 1.05299449, "balance_loss_mlp": 1.00050974, "epoch": 0.5927974508507184, "flos": 19865244827520.0, "grad_norm": 3.455791958753208, "language_loss": 0.76426756, "learning_rate": 1.5013225994047315e-06, "loss": 0.78305364, "num_input_tokens_seen": 106310960, "step": 4930, "time_per_iteration": 2.58748459815979 }, { "auxiliary_loss_clip": 0.01165425, "auxiliary_loss_mlp": 0.00711059, "balance_loss_clip": 1.05322957, "balance_loss_mlp": 1.00055385, "epoch": 0.5929176937413575, "flos": 15776743167360.0, "grad_norm": 5.063619423109058, "language_loss": 0.80505347, "learning_rate": 1.5005682683746452e-06, "loss": 0.82381833, "num_input_tokens_seen": 106329475, "step": 4931, "time_per_iteration": 2.649188756942749 }, { "auxiliary_loss_clip": 0.01166419, "auxiliary_loss_mlp": 0.01036671, "balance_loss_clip": 1.0571301, "balance_loss_mlp": 1.02892232, "epoch": 0.5930379366319967, "flos": 17601472028160.0, "grad_norm": 2.314256791819297, "language_loss": 0.72914177, "learning_rate": 1.4998140131079553e-06, "loss": 0.75117266, "num_input_tokens_seen": 106345565, "step": 4932, "time_per_iteration": 2.6300086975097656 }, { "auxiliary_loss_clip": 0.01098083, "auxiliary_loss_mlp": 0.00711384, "balance_loss_clip": 1.04670882, "balance_loss_mlp": 1.00041664, "epoch": 0.5931581795226357, "flos": 17704283731200.0, "grad_norm": 1.9200756423606098, "language_loss": 0.73616564, "learning_rate": 1.4990598337190821e-06, "loss": 0.75426042, "num_input_tokens_seen": 106361920, "step": 4933, "time_per_iteration": 2.7410025596618652 }, { "auxiliary_loss_clip": 0.0117862, "auxiliary_loss_mlp": 0.00711388, "balance_loss_clip": 1.0537498, "balance_loss_mlp": 1.000561, "epoch": 0.5932784224132748, "flos": 24280102483200.0, "grad_norm": 1.8682345452703482, "language_loss": 0.68046999, "learning_rate": 1.4983057303224338e-06, "loss": 0.69937015, "num_input_tokens_seen": 106381735, "step": 4934, "time_per_iteration": 2.673139810562134 }, { "auxiliary_loss_clip": 0.01116687, "auxiliary_loss_mlp": 0.01029196, "balance_loss_clip": 1.05049205, "balance_loss_mlp": 1.02144766, "epoch": 0.5933986653039139, "flos": 22926700909440.0, "grad_norm": 6.217997273301975, "language_loss": 0.87617701, "learning_rate": 1.4975517030324072e-06, "loss": 0.89763582, "num_input_tokens_seen": 106399745, "step": 4935, "time_per_iteration": 2.759915590286255 }, { "auxiliary_loss_clip": 0.01091808, "auxiliary_loss_mlp": 0.00701889, "balance_loss_clip": 1.0320034, "balance_loss_mlp": 1.00019157, "epoch": 0.593518908194553, "flos": 71121730256640.0, "grad_norm": 0.7809929765485295, "language_loss": 0.61816096, "learning_rate": 1.4967977519633882e-06, "loss": 0.63609797, "num_input_tokens_seen": 106457205, "step": 4936, "time_per_iteration": 3.2613754272460938 }, { "auxiliary_loss_clip": 0.01128121, "auxiliary_loss_mlp": 0.01029757, "balance_loss_clip": 1.0496254, "balance_loss_mlp": 1.02210999, "epoch": 0.593639151085192, "flos": 20448649526400.0, "grad_norm": 2.2525443814403943, "language_loss": 0.77972656, "learning_rate": 1.4960438772297494e-06, "loss": 0.80130529, "num_input_tokens_seen": 106474250, "step": 4937, "time_per_iteration": 2.6415750980377197 }, { "auxiliary_loss_clip": 0.01151298, "auxiliary_loss_mlp": 0.01023775, "balance_loss_clip": 1.05144882, "balance_loss_mlp": 1.01606226, "epoch": 0.5937593939758312, "flos": 30883428074880.0, "grad_norm": 2.7456494413423482, "language_loss": 0.7329846, "learning_rate": 1.495290078945855e-06, "loss": 0.75473535, "num_input_tokens_seen": 106494015, "step": 4938, "time_per_iteration": 2.7666845321655273 }, { "auxiliary_loss_clip": 0.01179418, "auxiliary_loss_mlp": 0.01031668, "balance_loss_clip": 1.05627823, "balance_loss_mlp": 1.02422905, "epoch": 0.5938796368664703, "flos": 36898069668480.0, "grad_norm": 5.175076540377058, "language_loss": 0.74414122, "learning_rate": 1.4945363572260529e-06, "loss": 0.7662521, "num_input_tokens_seen": 106515010, "step": 4939, "time_per_iteration": 2.696991205215454 }, { "auxiliary_loss_clip": 0.01161381, "auxiliary_loss_mlp": 0.01025726, "balance_loss_clip": 1.05106771, "balance_loss_mlp": 1.01783991, "epoch": 0.5939998797571093, "flos": 23842926051840.0, "grad_norm": 2.2288780017980727, "language_loss": 0.67813146, "learning_rate": 1.4937827121846845e-06, "loss": 0.70000255, "num_input_tokens_seen": 106535265, "step": 4940, "time_per_iteration": 2.7022790908813477 }, { "auxiliary_loss_clip": 0.01128457, "auxiliary_loss_mlp": 0.0102604, "balance_loss_clip": 1.05293357, "balance_loss_mlp": 1.0186727, "epoch": 0.5941201226477485, "flos": 25191407462400.0, "grad_norm": 1.8683441684429445, "language_loss": 0.73665738, "learning_rate": 1.4930291439360755e-06, "loss": 0.75820231, "num_input_tokens_seen": 106557830, "step": 4941, "time_per_iteration": 2.6886532306671143 }, { "auxiliary_loss_clip": 0.01166078, "auxiliary_loss_mlp": 0.0102802, "balance_loss_clip": 1.05444741, "balance_loss_mlp": 1.02028954, "epoch": 0.5942403655383875, "flos": 22418996123520.0, "grad_norm": 2.5392865681836603, "language_loss": 0.79336309, "learning_rate": 1.4922756525945427e-06, "loss": 0.8153041, "num_input_tokens_seen": 106577140, "step": 4942, "time_per_iteration": 2.584723949432373 }, { "auxiliary_loss_clip": 0.01078862, "auxiliary_loss_mlp": 0.01004114, "balance_loss_clip": 1.03020215, "balance_loss_mlp": 1.00269508, "epoch": 0.5943606084290266, "flos": 67629310796160.0, "grad_norm": 0.7747460764082992, "language_loss": 0.59563768, "learning_rate": 1.4915222382743894e-06, "loss": 0.61646736, "num_input_tokens_seen": 106635975, "step": 4943, "time_per_iteration": 3.24554443359375 }, { "auxiliary_loss_clip": 0.01165704, "auxiliary_loss_mlp": 0.01028534, "balance_loss_clip": 1.05499983, "balance_loss_mlp": 1.02090788, "epoch": 0.5944808513196658, "flos": 18223157646720.0, "grad_norm": 2.458862788191143, "language_loss": 0.71997905, "learning_rate": 1.4907689010899085e-06, "loss": 0.74192142, "num_input_tokens_seen": 106653555, "step": 4944, "time_per_iteration": 2.5787036418914795 }, { "auxiliary_loss_clip": 0.01145725, "auxiliary_loss_mlp": 0.0102827, "balance_loss_clip": 1.04954743, "balance_loss_mlp": 1.02081919, "epoch": 0.5946010942103048, "flos": 24790824011520.0, "grad_norm": 3.7075642816788084, "language_loss": 0.62449014, "learning_rate": 1.4900156411553804e-06, "loss": 0.6462301, "num_input_tokens_seen": 106673385, "step": 4945, "time_per_iteration": 2.753999710083008 }, { "auxiliary_loss_clip": 0.01148711, "auxiliary_loss_mlp": 0.01028989, "balance_loss_clip": 1.05245686, "balance_loss_mlp": 1.02119839, "epoch": 0.5947213371009439, "flos": 15231619388160.0, "grad_norm": 2.2161044142775523, "language_loss": 0.85511255, "learning_rate": 1.4892624585850739e-06, "loss": 0.87688959, "num_input_tokens_seen": 106691740, "step": 4946, "time_per_iteration": 2.6522703170776367 }, { "auxiliary_loss_clip": 0.01177069, "auxiliary_loss_mlp": 0.01027994, "balance_loss_clip": 1.05307603, "balance_loss_mlp": 1.02045751, "epoch": 0.594841579991583, "flos": 25848069949440.0, "grad_norm": 1.9416084285617392, "language_loss": 0.79882789, "learning_rate": 1.4885093534932465e-06, "loss": 0.82087862, "num_input_tokens_seen": 106709705, "step": 4947, "time_per_iteration": 2.63820219039917 }, { "auxiliary_loss_clip": 0.01144915, "auxiliary_loss_mlp": 0.01029527, "balance_loss_clip": 1.0531764, "balance_loss_mlp": 1.02208269, "epoch": 0.5949618228822221, "flos": 23981109672960.0, "grad_norm": 2.451882519830453, "language_loss": 0.71648192, "learning_rate": 1.4877563259941433e-06, "loss": 0.73822629, "num_input_tokens_seen": 106727560, "step": 4948, "time_per_iteration": 3.560037612915039 }, { "auxiliary_loss_clip": 0.01169858, "auxiliary_loss_mlp": 0.01029259, "balance_loss_clip": 1.05461836, "balance_loss_mlp": 1.02148616, "epoch": 0.5950820657728612, "flos": 40547491476480.0, "grad_norm": 2.053521791404601, "language_loss": 0.67916352, "learning_rate": 1.4870033762019988e-06, "loss": 0.70115471, "num_input_tokens_seen": 106747725, "step": 4949, "time_per_iteration": 3.8466274738311768 }, { "auxiliary_loss_clip": 0.01146432, "auxiliary_loss_mlp": 0.01032721, "balance_loss_clip": 1.05153048, "balance_loss_mlp": 1.02529705, "epoch": 0.5952023086635003, "flos": 23184467884800.0, "grad_norm": 1.7674838309665137, "language_loss": 0.73384333, "learning_rate": 1.4862505042310334e-06, "loss": 0.75563484, "num_input_tokens_seen": 106767010, "step": 4950, "time_per_iteration": 4.495223760604858 }, { "auxiliary_loss_clip": 0.01140555, "auxiliary_loss_mlp": 0.01030018, "balance_loss_clip": 1.05272532, "balance_loss_mlp": 1.02282667, "epoch": 0.5953225515541394, "flos": 33653289548160.0, "grad_norm": 1.6079366641052746, "language_loss": 0.69749671, "learning_rate": 1.4854977101954587e-06, "loss": 0.7192024, "num_input_tokens_seen": 106789230, "step": 4951, "time_per_iteration": 2.7657344341278076 }, { "auxiliary_loss_clip": 0.01160604, "auxiliary_loss_mlp": 0.01027028, "balance_loss_clip": 1.04816341, "balance_loss_mlp": 1.0194937, "epoch": 0.5954427944447784, "flos": 24459619680000.0, "grad_norm": 3.8322330953994124, "language_loss": 0.86672437, "learning_rate": 1.4847449942094716e-06, "loss": 0.88860071, "num_input_tokens_seen": 106808110, "step": 4952, "time_per_iteration": 2.656956434249878 }, { "auxiliary_loss_clip": 0.01142172, "auxiliary_loss_mlp": 0.01023292, "balance_loss_clip": 1.04919481, "balance_loss_mlp": 1.01621139, "epoch": 0.5955630373354175, "flos": 18551848026240.0, "grad_norm": 2.054285132365356, "language_loss": 0.87115008, "learning_rate": 1.4839923563872598e-06, "loss": 0.89280468, "num_input_tokens_seen": 106826650, "step": 4953, "time_per_iteration": 2.6243903636932373 }, { "auxiliary_loss_clip": 0.01131505, "auxiliary_loss_mlp": 0.01027192, "balance_loss_clip": 1.05146432, "balance_loss_mlp": 1.01937211, "epoch": 0.5956832802260567, "flos": 19791699730560.0, "grad_norm": 2.395521036667003, "language_loss": 0.761657, "learning_rate": 1.483239796842997e-06, "loss": 0.78324401, "num_input_tokens_seen": 106844680, "step": 4954, "time_per_iteration": 2.6248745918273926 }, { "auxiliary_loss_clip": 0.0112873, "auxiliary_loss_mlp": 0.01023814, "balance_loss_clip": 1.05002105, "balance_loss_mlp": 1.01666713, "epoch": 0.5958035231166957, "flos": 19750868945280.0, "grad_norm": 2.0550596885009753, "language_loss": 0.83766365, "learning_rate": 1.4824873156908462e-06, "loss": 0.85918903, "num_input_tokens_seen": 106862605, "step": 4955, "time_per_iteration": 2.6859207153320312 }, { "auxiliary_loss_clip": 0.01163854, "auxiliary_loss_mlp": 0.00711839, "balance_loss_clip": 1.05374873, "balance_loss_mlp": 1.00050843, "epoch": 0.5959237660073348, "flos": 21652806090240.0, "grad_norm": 1.6574015636880353, "language_loss": 0.75643134, "learning_rate": 1.4817349130449584e-06, "loss": 0.77518827, "num_input_tokens_seen": 106882325, "step": 4956, "time_per_iteration": 2.6160662174224854 }, { "auxiliary_loss_clip": 0.01160187, "auxiliary_loss_mlp": 0.01025383, "balance_loss_clip": 1.05352473, "balance_loss_mlp": 1.01831985, "epoch": 0.5960440088979739, "flos": 21171207513600.0, "grad_norm": 2.0987308801275026, "language_loss": 0.83385074, "learning_rate": 1.4809825890194717e-06, "loss": 0.85570645, "num_input_tokens_seen": 106900995, "step": 4957, "time_per_iteration": 2.6859309673309326 }, { "auxiliary_loss_clip": 0.01141927, "auxiliary_loss_mlp": 0.01028038, "balance_loss_clip": 1.05011284, "balance_loss_mlp": 1.02120697, "epoch": 0.596164251788613, "flos": 14757526753920.0, "grad_norm": 1.9899999562166473, "language_loss": 0.77021706, "learning_rate": 1.4802303437285139e-06, "loss": 0.79191667, "num_input_tokens_seen": 106918265, "step": 4958, "time_per_iteration": 2.636864423751831 }, { "auxiliary_loss_clip": 0.01142501, "auxiliary_loss_mlp": 0.01026675, "balance_loss_clip": 1.04779863, "balance_loss_mlp": 1.01915336, "epoch": 0.596284494679252, "flos": 20485924865280.0, "grad_norm": 2.433953545674128, "language_loss": 0.80780733, "learning_rate": 1.4794781772861994e-06, "loss": 0.82949907, "num_input_tokens_seen": 106934760, "step": 4959, "time_per_iteration": 2.6300973892211914 }, { "auxiliary_loss_clip": 0.01145364, "auxiliary_loss_mlp": 0.00711416, "balance_loss_clip": 1.05073893, "balance_loss_mlp": 1.00046098, "epoch": 0.5964047375698912, "flos": 31212262108800.0, "grad_norm": 2.4142035342594794, "language_loss": 0.67086244, "learning_rate": 1.4787260898066324e-06, "loss": 0.68943024, "num_input_tokens_seen": 106954760, "step": 4960, "time_per_iteration": 2.7238872051239014 }, { "auxiliary_loss_clip": 0.01174737, "auxiliary_loss_mlp": 0.01022476, "balance_loss_clip": 1.05376816, "balance_loss_mlp": 1.01512074, "epoch": 0.5965249804605303, "flos": 27483620855040.0, "grad_norm": 2.1098750012919085, "language_loss": 0.85859233, "learning_rate": 1.4779740814039023e-06, "loss": 0.88056451, "num_input_tokens_seen": 106974845, "step": 4961, "time_per_iteration": 2.617966413497925 }, { "auxiliary_loss_clip": 0.01177989, "auxiliary_loss_mlp": 0.01026528, "balance_loss_clip": 1.05504179, "balance_loss_mlp": 1.01917839, "epoch": 0.5966452233511693, "flos": 30773936442240.0, "grad_norm": 6.589092352444789, "language_loss": 0.68331051, "learning_rate": 1.4772221521920894e-06, "loss": 0.70535564, "num_input_tokens_seen": 106994870, "step": 4962, "time_per_iteration": 2.6328563690185547 }, { "auxiliary_loss_clip": 0.01145649, "auxiliary_loss_mlp": 0.0102584, "balance_loss_clip": 1.05350876, "balance_loss_mlp": 1.01889062, "epoch": 0.5967654662418085, "flos": 25481170477440.0, "grad_norm": 1.961537880645069, "language_loss": 0.74230003, "learning_rate": 1.4764703022852598e-06, "loss": 0.76401496, "num_input_tokens_seen": 107015390, "step": 4963, "time_per_iteration": 2.7269012928009033 }, { "auxiliary_loss_clip": 0.01082412, "auxiliary_loss_mlp": 0.01027638, "balance_loss_clip": 1.04182792, "balance_loss_mlp": 1.02053261, "epoch": 0.5968857091324475, "flos": 19099126621440.0, "grad_norm": 1.8528038411092083, "language_loss": 0.76958477, "learning_rate": 1.4757185317974696e-06, "loss": 0.7906853, "num_input_tokens_seen": 107033775, "step": 4964, "time_per_iteration": 2.7560946941375732 }, { "auxiliary_loss_clip": 0.01160824, "auxiliary_loss_mlp": 0.01023984, "balance_loss_clip": 1.05106854, "balance_loss_mlp": 1.01602125, "epoch": 0.5970059520230866, "flos": 23692711374720.0, "grad_norm": 2.504379978151729, "language_loss": 0.71631742, "learning_rate": 1.474966840842761e-06, "loss": 0.7381655, "num_input_tokens_seen": 107053355, "step": 4965, "time_per_iteration": 2.616063117980957 }, { "auxiliary_loss_clip": 0.01165505, "auxiliary_loss_mlp": 0.01025046, "balance_loss_clip": 1.05314672, "balance_loss_mlp": 1.01767302, "epoch": 0.5971261949137258, "flos": 23185545292800.0, "grad_norm": 1.8649239613543167, "language_loss": 0.87233198, "learning_rate": 1.4742152295351655e-06, "loss": 0.89423752, "num_input_tokens_seen": 107072510, "step": 4966, "time_per_iteration": 2.647881269454956 }, { "auxiliary_loss_clip": 0.01160823, "auxiliary_loss_mlp": 0.00711732, "balance_loss_clip": 1.05150461, "balance_loss_mlp": 1.00057316, "epoch": 0.5972464378043648, "flos": 20557710195840.0, "grad_norm": 2.8966593650748806, "language_loss": 0.64438182, "learning_rate": 1.4734636979887016e-06, "loss": 0.6631074, "num_input_tokens_seen": 107089970, "step": 4967, "time_per_iteration": 2.585075855255127 }, { "auxiliary_loss_clip": 0.01136393, "auxiliary_loss_mlp": 0.01024971, "balance_loss_clip": 1.05082345, "balance_loss_mlp": 1.01734757, "epoch": 0.5973666806950039, "flos": 29387030457600.0, "grad_norm": 2.195866219725044, "language_loss": 0.90132231, "learning_rate": 1.4727122463173755e-06, "loss": 0.92293596, "num_input_tokens_seen": 107108500, "step": 4968, "time_per_iteration": 2.791807174682617 }, { "auxiliary_loss_clip": 0.01144858, "auxiliary_loss_mlp": 0.01028652, "balance_loss_clip": 1.0506022, "balance_loss_mlp": 1.02127326, "epoch": 0.597486923585643, "flos": 22273522041600.0, "grad_norm": 1.8950593282144383, "language_loss": 0.645859, "learning_rate": 1.471960874635183e-06, "loss": 0.66759408, "num_input_tokens_seen": 107128060, "step": 4969, "time_per_iteration": 2.63516902923584 }, { "auxiliary_loss_clip": 0.0114035, "auxiliary_loss_mlp": 0.01024042, "balance_loss_clip": 1.04890895, "balance_loss_mlp": 1.01610279, "epoch": 0.5976071664762821, "flos": 13772461196160.0, "grad_norm": 8.903721656102343, "language_loss": 0.7114712, "learning_rate": 1.4712095830561055e-06, "loss": 0.73311508, "num_input_tokens_seen": 107146550, "step": 4970, "time_per_iteration": 2.632967472076416 }, { "auxiliary_loss_clip": 0.01143924, "auxiliary_loss_mlp": 0.01025234, "balance_loss_clip": 1.04831648, "balance_loss_mlp": 1.01780105, "epoch": 0.5977274093669211, "flos": 19098623831040.0, "grad_norm": 1.9018535766603457, "language_loss": 0.81159818, "learning_rate": 1.4704583716941147e-06, "loss": 0.83328974, "num_input_tokens_seen": 107165415, "step": 4971, "time_per_iteration": 2.6034069061279297 }, { "auxiliary_loss_clip": 0.01156017, "auxiliary_loss_mlp": 0.01026687, "balance_loss_clip": 1.0530262, "balance_loss_mlp": 1.01923096, "epoch": 0.5978476522575603, "flos": 20376002269440.0, "grad_norm": 1.730880384316333, "language_loss": 0.72547501, "learning_rate": 1.4697072406631672e-06, "loss": 0.74730211, "num_input_tokens_seen": 107185320, "step": 4972, "time_per_iteration": 2.6742396354675293 }, { "auxiliary_loss_clip": 0.01119152, "auxiliary_loss_mlp": 0.01032048, "balance_loss_clip": 1.05108953, "balance_loss_mlp": 1.02423978, "epoch": 0.5979678951481994, "flos": 29023147728000.0, "grad_norm": 1.5860948427872705, "language_loss": 0.7247901, "learning_rate": 1.4689561900772097e-06, "loss": 0.74630213, "num_input_tokens_seen": 107205380, "step": 4973, "time_per_iteration": 2.7925984859466553 }, { "auxiliary_loss_clip": 0.01143212, "auxiliary_loss_mlp": 0.01027816, "balance_loss_clip": 1.04855108, "balance_loss_mlp": 1.02055633, "epoch": 0.5980881380388384, "flos": 17967689141760.0, "grad_norm": 3.3797283191421785, "language_loss": 0.72305667, "learning_rate": 1.4682052200501758e-06, "loss": 0.74476695, "num_input_tokens_seen": 107222585, "step": 4974, "time_per_iteration": 3.5272085666656494 }, { "auxiliary_loss_clip": 0.01176177, "auxiliary_loss_mlp": 0.01024724, "balance_loss_clip": 1.05367124, "balance_loss_mlp": 1.01745176, "epoch": 0.5982083809294776, "flos": 22962827013120.0, "grad_norm": 2.394612380397662, "language_loss": 0.80394375, "learning_rate": 1.4674543306959876e-06, "loss": 0.82595283, "num_input_tokens_seen": 107242055, "step": 4975, "time_per_iteration": 5.477544784545898 }, { "auxiliary_loss_clip": 0.01151608, "auxiliary_loss_mlp": 0.01031015, "balance_loss_clip": 1.0520637, "balance_loss_mlp": 1.02327788, "epoch": 0.5983286238201166, "flos": 20991941712000.0, "grad_norm": 2.7012356250500242, "language_loss": 0.84471393, "learning_rate": 1.4667035221285535e-06, "loss": 0.86654019, "num_input_tokens_seen": 107259695, "step": 4976, "time_per_iteration": 2.7326812744140625 }, { "auxiliary_loss_clip": 0.01157298, "auxiliary_loss_mlp": 0.01028215, "balance_loss_clip": 1.05099702, "balance_loss_mlp": 1.02077579, "epoch": 0.5984488667107557, "flos": 28183448511360.0, "grad_norm": 2.6598259655435945, "language_loss": 0.74255079, "learning_rate": 1.4659527944617715e-06, "loss": 0.76440591, "num_input_tokens_seen": 107279640, "step": 4977, "time_per_iteration": 2.6609153747558594 }, { "auxiliary_loss_clip": 0.01090042, "auxiliary_loss_mlp": 0.010269, "balance_loss_clip": 1.04225159, "balance_loss_mlp": 1.01988769, "epoch": 0.5985691096013949, "flos": 16471794314880.0, "grad_norm": 1.7248638220050196, "language_loss": 0.75928682, "learning_rate": 1.465202147809526e-06, "loss": 0.78045624, "num_input_tokens_seen": 107298135, "step": 4978, "time_per_iteration": 2.7247490882873535 }, { "auxiliary_loss_clip": 0.01178531, "auxiliary_loss_mlp": 0.010263, "balance_loss_clip": 1.05498385, "balance_loss_mlp": 1.01919174, "epoch": 0.5986893524920339, "flos": 26719046933760.0, "grad_norm": 2.0292621565271087, "language_loss": 0.76291239, "learning_rate": 1.4644515822856888e-06, "loss": 0.78496063, "num_input_tokens_seen": 107316570, "step": 4979, "time_per_iteration": 2.5964949131011963 }, { "auxiliary_loss_clip": 0.01050916, "auxiliary_loss_mlp": 0.01002358, "balance_loss_clip": 1.02684879, "balance_loss_mlp": 1.00102258, "epoch": 0.598809595382673, "flos": 61608061100160.0, "grad_norm": 0.7541779884261101, "language_loss": 0.56545514, "learning_rate": 1.4637010980041215e-06, "loss": 0.58598787, "num_input_tokens_seen": 107378680, "step": 4980, "time_per_iteration": 3.2836475372314453 }, { "auxiliary_loss_clip": 0.0117773, "auxiliary_loss_mlp": 0.01030411, "balance_loss_clip": 1.05436945, "balance_loss_mlp": 1.02245402, "epoch": 0.5989298382733121, "flos": 11801719549440.0, "grad_norm": 3.3127719277517915, "language_loss": 0.89654398, "learning_rate": 1.4629506950786707e-06, "loss": 0.91862547, "num_input_tokens_seen": 107394860, "step": 4981, "time_per_iteration": 2.7157235145568848 }, { "auxiliary_loss_clip": 0.01084555, "auxiliary_loss_mlp": 0.01000778, "balance_loss_clip": 1.02628481, "balance_loss_mlp": 0.99931127, "epoch": 0.5990500811639512, "flos": 60025800021120.0, "grad_norm": 0.807273844997165, "language_loss": 0.56108654, "learning_rate": 1.4622003736231733e-06, "loss": 0.58193994, "num_input_tokens_seen": 107453850, "step": 4982, "time_per_iteration": 3.2764198780059814 }, { "auxiliary_loss_clip": 0.01162437, "auxiliary_loss_mlp": 0.01025735, "balance_loss_clip": 1.05476785, "balance_loss_mlp": 1.01857638, "epoch": 0.5991703240545903, "flos": 18222726683520.0, "grad_norm": 2.438607692406423, "language_loss": 0.80582482, "learning_rate": 1.461450133751451e-06, "loss": 0.82770652, "num_input_tokens_seen": 107471920, "step": 4983, "time_per_iteration": 2.613598346710205 }, { "auxiliary_loss_clip": 0.0116404, "auxiliary_loss_mlp": 0.01025448, "balance_loss_clip": 1.05243397, "balance_loss_mlp": 1.01831055, "epoch": 0.5992905669452293, "flos": 27709894581120.0, "grad_norm": 1.925049744384527, "language_loss": 0.75561786, "learning_rate": 1.4606999755773153e-06, "loss": 0.77751279, "num_input_tokens_seen": 107493125, "step": 4984, "time_per_iteration": 2.6328513622283936 }, { "auxiliary_loss_clip": 0.01175183, "auxiliary_loss_mlp": 0.01029105, "balance_loss_clip": 1.05327666, "balance_loss_mlp": 1.02210104, "epoch": 0.5994108098358685, "flos": 20449008662400.0, "grad_norm": 1.8602803266260872, "language_loss": 0.82176256, "learning_rate": 1.4599498992145643e-06, "loss": 0.84380537, "num_input_tokens_seen": 107513150, "step": 4985, "time_per_iteration": 2.6896519660949707 }, { "auxiliary_loss_clip": 0.01152042, "auxiliary_loss_mlp": 0.00711568, "balance_loss_clip": 1.05248141, "balance_loss_mlp": 1.00054562, "epoch": 0.5995310527265075, "flos": 22269966595200.0, "grad_norm": 2.320767836917402, "language_loss": 0.7056756, "learning_rate": 1.4591999047769846e-06, "loss": 0.72431171, "num_input_tokens_seen": 107532005, "step": 4986, "time_per_iteration": 2.6451687812805176 }, { "auxiliary_loss_clip": 0.01091192, "auxiliary_loss_mlp": 0.01025868, "balance_loss_clip": 1.04199374, "balance_loss_mlp": 1.01822376, "epoch": 0.5996512956171466, "flos": 18916951818240.0, "grad_norm": 2.1880417902392875, "language_loss": 0.74990177, "learning_rate": 1.4584499923783486e-06, "loss": 0.77107239, "num_input_tokens_seen": 107550585, "step": 4987, "time_per_iteration": 2.752472162246704 }, { "auxiliary_loss_clip": 0.01144527, "auxiliary_loss_mlp": 0.01027532, "balance_loss_clip": 1.0505724, "balance_loss_mlp": 1.02048409, "epoch": 0.5997715385077858, "flos": 15370916330880.0, "grad_norm": 2.1424350353022916, "language_loss": 0.76030052, "learning_rate": 1.457700162132419e-06, "loss": 0.78202116, "num_input_tokens_seen": 107567575, "step": 4988, "time_per_iteration": 2.6334519386291504 }, { "auxiliary_loss_clip": 0.01106549, "auxiliary_loss_mlp": 0.01031441, "balance_loss_clip": 1.046, "balance_loss_mlp": 1.0243125, "epoch": 0.5998917813984248, "flos": 25264844818560.0, "grad_norm": 2.170270343789455, "language_loss": 0.73077691, "learning_rate": 1.4569504141529433e-06, "loss": 0.75215685, "num_input_tokens_seen": 107585410, "step": 4989, "time_per_iteration": 2.697645902633667 }, { "auxiliary_loss_clip": 0.0116124, "auxiliary_loss_mlp": 0.01027169, "balance_loss_clip": 1.05382037, "balance_loss_mlp": 1.01950395, "epoch": 0.6000120242890639, "flos": 22054502862720.0, "grad_norm": 2.31158900820757, "language_loss": 0.71903324, "learning_rate": 1.456200748553658e-06, "loss": 0.74091733, "num_input_tokens_seen": 107603405, "step": 4990, "time_per_iteration": 2.711230754852295 }, { "auxiliary_loss_clip": 0.01177988, "auxiliary_loss_mlp": 0.01026564, "balance_loss_clip": 1.05426598, "balance_loss_mlp": 1.01934576, "epoch": 0.600132267179703, "flos": 29863421562240.0, "grad_norm": 1.8433758527428932, "language_loss": 0.78982472, "learning_rate": 1.455451165448287e-06, "loss": 0.81187022, "num_input_tokens_seen": 107626060, "step": 4991, "time_per_iteration": 2.647587299346924 }, { "auxiliary_loss_clip": 0.01145061, "auxiliary_loss_mlp": 0.01024184, "balance_loss_clip": 1.05125976, "balance_loss_mlp": 1.01693642, "epoch": 0.6002525100703421, "flos": 25045358762880.0, "grad_norm": 2.4754695891358365, "language_loss": 0.74381697, "learning_rate": 1.4547016649505407e-06, "loss": 0.76550943, "num_input_tokens_seen": 107644070, "step": 4992, "time_per_iteration": 2.6533052921295166 }, { "auxiliary_loss_clip": 0.0112531, "auxiliary_loss_mlp": 0.01028565, "balance_loss_clip": 1.04586875, "balance_loss_mlp": 1.0214659, "epoch": 0.6003727529609811, "flos": 20849592113280.0, "grad_norm": 2.57000890210331, "language_loss": 0.84958673, "learning_rate": 1.4539522471741193e-06, "loss": 0.87112546, "num_input_tokens_seen": 107661495, "step": 4993, "time_per_iteration": 219.11671686172485 }, { "auxiliary_loss_clip": 0.01163779, "auxiliary_loss_mlp": 0.0102799, "balance_loss_clip": 1.05072451, "balance_loss_mlp": 1.02047396, "epoch": 0.6004929958516203, "flos": 15594604277760.0, "grad_norm": 2.1101066985107235, "language_loss": 0.71210229, "learning_rate": 1.4532029122327067e-06, "loss": 0.73401994, "num_input_tokens_seen": 107678280, "step": 4994, "time_per_iteration": 2.736503839492798 }, { "auxiliary_loss_clip": 0.01124431, "auxiliary_loss_mlp": 0.01026768, "balance_loss_clip": 1.05286741, "balance_loss_mlp": 1.01924002, "epoch": 0.6006132387422594, "flos": 21763267390080.0, "grad_norm": 2.025493053493075, "language_loss": 0.75332928, "learning_rate": 1.4524536602399783e-06, "loss": 0.77484119, "num_input_tokens_seen": 107697370, "step": 4995, "time_per_iteration": 2.6620676517486572 }, { "auxiliary_loss_clip": 0.0114598, "auxiliary_loss_mlp": 0.010228, "balance_loss_clip": 1.05558801, "balance_loss_mlp": 1.01564157, "epoch": 0.6007334816328984, "flos": 22858542852480.0, "grad_norm": 1.7187606700245999, "language_loss": 0.77576828, "learning_rate": 1.4517044913095938e-06, "loss": 0.79745603, "num_input_tokens_seen": 107717790, "step": 4996, "time_per_iteration": 2.701449394226074 }, { "auxiliary_loss_clip": 0.01159596, "auxiliary_loss_mlp": 0.01029548, "balance_loss_clip": 1.05110931, "balance_loss_mlp": 1.02221036, "epoch": 0.6008537245235376, "flos": 28324577047680.0, "grad_norm": 1.7752782624465022, "language_loss": 0.81586599, "learning_rate": 1.4509554055552022e-06, "loss": 0.83775747, "num_input_tokens_seen": 107738020, "step": 4997, "time_per_iteration": 2.7696518898010254 }, { "auxiliary_loss_clip": 0.01144416, "auxiliary_loss_mlp": 0.01025722, "balance_loss_clip": 1.0508858, "balance_loss_mlp": 1.01830411, "epoch": 0.6009739674141766, "flos": 20886113266560.0, "grad_norm": 4.408482044455934, "language_loss": 0.83928239, "learning_rate": 1.450206403090439e-06, "loss": 0.86098373, "num_input_tokens_seen": 107756215, "step": 4998, "time_per_iteration": 2.733365774154663 }, { "auxiliary_loss_clip": 0.01153599, "auxiliary_loss_mlp": 0.01024873, "balance_loss_clip": 1.05032969, "balance_loss_mlp": 1.0176425, "epoch": 0.6010942103048157, "flos": 20481004702080.0, "grad_norm": 2.2198091807106874, "language_loss": 0.86581373, "learning_rate": 1.4494574840289274e-06, "loss": 0.88759845, "num_input_tokens_seen": 107773330, "step": 4999, "time_per_iteration": 3.510010004043579 }, { "auxiliary_loss_clip": 0.01163768, "auxiliary_loss_mlp": 0.0102312, "balance_loss_clip": 1.0501976, "balance_loss_mlp": 1.01584852, "epoch": 0.6012144531954549, "flos": 23805973935360.0, "grad_norm": 1.8284350790057458, "language_loss": 0.73851693, "learning_rate": 1.4487086484842782e-06, "loss": 0.76038581, "num_input_tokens_seen": 107791975, "step": 5000, "time_per_iteration": 3.6114261150360107 }, { "auxiliary_loss_clip": 0.01176657, "auxiliary_loss_mlp": 0.01029638, "balance_loss_clip": 1.05502987, "balance_loss_mlp": 1.02233613, "epoch": 0.6013346960860939, "flos": 18988378012800.0, "grad_norm": 2.2827391815732057, "language_loss": 0.6022464, "learning_rate": 1.4479598965700878e-06, "loss": 0.62430936, "num_input_tokens_seen": 107809240, "step": 5001, "time_per_iteration": 3.6412813663482666 }, { "auxiliary_loss_clip": 0.01125698, "auxiliary_loss_mlp": 0.01025655, "balance_loss_clip": 1.0468843, "balance_loss_mlp": 1.01838374, "epoch": 0.601454938976733, "flos": 24025316336640.0, "grad_norm": 2.8180284162146148, "language_loss": 0.68451953, "learning_rate": 1.4472112283999427e-06, "loss": 0.70603311, "num_input_tokens_seen": 107827895, "step": 5002, "time_per_iteration": 2.6851961612701416 }, { "auxiliary_loss_clip": 0.01158972, "auxiliary_loss_mlp": 0.01026085, "balance_loss_clip": 1.05398905, "balance_loss_mlp": 1.01824713, "epoch": 0.6015751818673721, "flos": 26427129102720.0, "grad_norm": 2.142109913438043, "language_loss": 0.69067067, "learning_rate": 1.4464626440874143e-06, "loss": 0.71252126, "num_input_tokens_seen": 107847010, "step": 5003, "time_per_iteration": 2.598480701446533 }, { "auxiliary_loss_clip": 0.01119293, "auxiliary_loss_mlp": 0.01026157, "balance_loss_clip": 1.04469371, "balance_loss_mlp": 1.01876354, "epoch": 0.6016954247580112, "flos": 13115260005120.0, "grad_norm": 2.566127938958959, "language_loss": 0.74475038, "learning_rate": 1.4457141437460636e-06, "loss": 0.76620489, "num_input_tokens_seen": 107864235, "step": 5004, "time_per_iteration": 2.7093422412872314 }, { "auxiliary_loss_clip": 0.01144769, "auxiliary_loss_mlp": 0.0102556, "balance_loss_clip": 1.05039573, "balance_loss_mlp": 1.017555, "epoch": 0.6018156676486502, "flos": 23768447201280.0, "grad_norm": 2.464571284082758, "language_loss": 0.73205209, "learning_rate": 1.444965727489436e-06, "loss": 0.75375539, "num_input_tokens_seen": 107883680, "step": 5005, "time_per_iteration": 2.6362550258636475 }, { "auxiliary_loss_clip": 0.01127451, "auxiliary_loss_mlp": 0.01027689, "balance_loss_clip": 1.04751742, "balance_loss_mlp": 1.02001786, "epoch": 0.6019359105392894, "flos": 26469360518400.0, "grad_norm": 2.3981731582569714, "language_loss": 0.62825286, "learning_rate": 1.444217395431066e-06, "loss": 0.64980423, "num_input_tokens_seen": 107906220, "step": 5006, "time_per_iteration": 2.7736027240753174 }, { "auxiliary_loss_clip": 0.01045824, "auxiliary_loss_mlp": 0.01002387, "balance_loss_clip": 1.02815902, "balance_loss_mlp": 1.00111723, "epoch": 0.6020561534299285, "flos": 69190849728000.0, "grad_norm": 0.7907488903514749, "language_loss": 0.55821931, "learning_rate": 1.4434691476844755e-06, "loss": 0.57870138, "num_input_tokens_seen": 107967195, "step": 5007, "time_per_iteration": 3.216796398162842 }, { "auxiliary_loss_clip": 0.01141451, "auxiliary_loss_mlp": 0.01019671, "balance_loss_clip": 1.05126715, "balance_loss_mlp": 1.01261365, "epoch": 0.6021763963205675, "flos": 21835304115840.0, "grad_norm": 2.360659820799145, "language_loss": 0.66826642, "learning_rate": 1.4427209843631729e-06, "loss": 0.68987763, "num_input_tokens_seen": 107984245, "step": 5008, "time_per_iteration": 2.70661997795105 }, { "auxiliary_loss_clip": 0.01176153, "auxiliary_loss_mlp": 0.00711171, "balance_loss_clip": 1.05517435, "balance_loss_mlp": 1.00060678, "epoch": 0.6022966392112067, "flos": 26578636669440.0, "grad_norm": 1.7842364608209196, "language_loss": 0.81232864, "learning_rate": 1.4419729055806534e-06, "loss": 0.83120191, "num_input_tokens_seen": 108003680, "step": 5009, "time_per_iteration": 2.7070767879486084 }, { "auxiliary_loss_clip": 0.01140296, "auxiliary_loss_mlp": 0.00710549, "balance_loss_clip": 1.05135572, "balance_loss_mlp": 1.00046921, "epoch": 0.6024168821018457, "flos": 20703722981760.0, "grad_norm": 1.892550219941226, "language_loss": 0.8226161, "learning_rate": 1.441224911450401e-06, "loss": 0.84112448, "num_input_tokens_seen": 108019635, "step": 5010, "time_per_iteration": 2.636319875717163 }, { "auxiliary_loss_clip": 0.01165324, "auxiliary_loss_mlp": 0.01028646, "balance_loss_clip": 1.05467272, "balance_loss_mlp": 1.02126956, "epoch": 0.6025371249924848, "flos": 24680973242880.0, "grad_norm": 1.7322110503475474, "language_loss": 0.82387888, "learning_rate": 1.4404770020858851e-06, "loss": 0.84581858, "num_input_tokens_seen": 108039120, "step": 5011, "time_per_iteration": 2.677288055419922 }, { "auxiliary_loss_clip": 0.01152901, "auxiliary_loss_mlp": 0.01020374, "balance_loss_clip": 1.04981351, "balance_loss_mlp": 1.01327229, "epoch": 0.602657367883124, "flos": 25955801815680.0, "grad_norm": 3.9595352975338045, "language_loss": 0.86232096, "learning_rate": 1.439729177600563e-06, "loss": 0.88405371, "num_input_tokens_seen": 108059615, "step": 5012, "time_per_iteration": 2.648264169692993 }, { "auxiliary_loss_clip": 0.0116136, "auxiliary_loss_mlp": 0.01031178, "balance_loss_clip": 1.05382991, "balance_loss_mlp": 1.02421618, "epoch": 0.602777610773763, "flos": 16690633925760.0, "grad_norm": 2.5227236911135593, "language_loss": 0.73617852, "learning_rate": 1.4389814381078793e-06, "loss": 0.75810397, "num_input_tokens_seen": 108078855, "step": 5013, "time_per_iteration": 2.6061747074127197 }, { "auxiliary_loss_clip": 0.01052849, "auxiliary_loss_mlp": 0.01028698, "balance_loss_clip": 1.04228556, "balance_loss_mlp": 1.02152717, "epoch": 0.6028978536644021, "flos": 13334243270400.0, "grad_norm": 27.935306290622595, "language_loss": 0.80414426, "learning_rate": 1.438233783721265e-06, "loss": 0.8249597, "num_input_tokens_seen": 108095020, "step": 5014, "time_per_iteration": 3.017521381378174 }, { "auxiliary_loss_clip": 0.01142268, "auxiliary_loss_mlp": 0.01030948, "balance_loss_clip": 1.05453992, "balance_loss_mlp": 1.02392709, "epoch": 0.6030180965550412, "flos": 19644825018240.0, "grad_norm": 2.3593250373022414, "language_loss": 0.77928126, "learning_rate": 1.43748621455414e-06, "loss": 0.80101347, "num_input_tokens_seen": 108111455, "step": 5015, "time_per_iteration": 3.0834968090057373 }, { "auxiliary_loss_clip": 0.01142109, "auxiliary_loss_mlp": 0.01029102, "balance_loss_clip": 1.05037689, "balance_loss_mlp": 1.0215323, "epoch": 0.6031383394456803, "flos": 14458390289280.0, "grad_norm": 2.81506034542645, "language_loss": 0.80707437, "learning_rate": 1.4367387307199082e-06, "loss": 0.82878643, "num_input_tokens_seen": 108128305, "step": 5016, "time_per_iteration": 2.5666446685791016 }, { "auxiliary_loss_clip": 0.01155998, "auxiliary_loss_mlp": 0.01025859, "balance_loss_clip": 1.05036211, "balance_loss_mlp": 1.01858711, "epoch": 0.6032585823363193, "flos": 13917791623680.0, "grad_norm": 2.2313614721008106, "language_loss": 0.82489264, "learning_rate": 1.4359913323319632e-06, "loss": 0.84671116, "num_input_tokens_seen": 108145475, "step": 5017, "time_per_iteration": 2.639901638031006 }, { "auxiliary_loss_clip": 0.01084625, "auxiliary_loss_mlp": 0.01030425, "balance_loss_clip": 1.03998947, "balance_loss_mlp": 1.02269387, "epoch": 0.6033788252269584, "flos": 24353252530560.0, "grad_norm": 1.9378691460919666, "language_loss": 0.77700603, "learning_rate": 1.4352440195036847e-06, "loss": 0.7981565, "num_input_tokens_seen": 108165650, "step": 5018, "time_per_iteration": 2.7751333713531494 }, { "auxiliary_loss_clip": 0.01082396, "auxiliary_loss_mlp": 0.01023119, "balance_loss_clip": 1.03794241, "balance_loss_mlp": 1.01632392, "epoch": 0.6034990681175976, "flos": 25521247077120.0, "grad_norm": 1.7143935140061854, "language_loss": 0.79967237, "learning_rate": 1.4344967923484395e-06, "loss": 0.82072759, "num_input_tokens_seen": 108187620, "step": 5019, "time_per_iteration": 2.758711814880371 }, { "auxiliary_loss_clip": 0.01158431, "auxiliary_loss_mlp": 0.01026404, "balance_loss_clip": 1.05116594, "balance_loss_mlp": 1.01961517, "epoch": 0.6036193110082366, "flos": 25958387594880.0, "grad_norm": 2.7441948660393956, "language_loss": 0.72355473, "learning_rate": 1.433749650979581e-06, "loss": 0.74540311, "num_input_tokens_seen": 108207605, "step": 5020, "time_per_iteration": 2.7113118171691895 }, { "auxiliary_loss_clip": 0.01132418, "auxiliary_loss_mlp": 0.01024288, "balance_loss_clip": 1.04810524, "balance_loss_mlp": 1.01726937, "epoch": 0.6037395538988757, "flos": 25593427457280.0, "grad_norm": 2.1733826458525485, "language_loss": 0.68207061, "learning_rate": 1.433002595510451e-06, "loss": 0.70363772, "num_input_tokens_seen": 108226385, "step": 5021, "time_per_iteration": 2.6927709579467773 }, { "auxiliary_loss_clip": 0.01139559, "auxiliary_loss_mlp": 0.0071125, "balance_loss_clip": 1.04773068, "balance_loss_mlp": 1.00050902, "epoch": 0.6038597967895148, "flos": 17816253402240.0, "grad_norm": 2.303099001447745, "language_loss": 0.71930969, "learning_rate": 1.4322556260543757e-06, "loss": 0.73781776, "num_input_tokens_seen": 108242960, "step": 5022, "time_per_iteration": 2.6583688259124756 }, { "auxiliary_loss_clip": 0.01050078, "auxiliary_loss_mlp": 0.01005779, "balance_loss_clip": 1.02758169, "balance_loss_mlp": 1.00447416, "epoch": 0.6039800396801539, "flos": 65169213235200.0, "grad_norm": 1.0683874593844152, "language_loss": 0.62702501, "learning_rate": 1.4315087427246703e-06, "loss": 0.6475836, "num_input_tokens_seen": 108296785, "step": 5023, "time_per_iteration": 3.153266668319702 }, { "auxiliary_loss_clip": 0.0108496, "auxiliary_loss_mlp": 0.01002489, "balance_loss_clip": 1.02612376, "balance_loss_mlp": 1.00124919, "epoch": 0.604100282570793, "flos": 67386409073280.0, "grad_norm": 0.8685556879514078, "language_loss": 0.58463275, "learning_rate": 1.4307619456346372e-06, "loss": 0.60550725, "num_input_tokens_seen": 108341090, "step": 5024, "time_per_iteration": 2.901655673980713 }, { "auxiliary_loss_clip": 0.01158161, "auxiliary_loss_mlp": 0.01022869, "balance_loss_clip": 1.04699767, "balance_loss_mlp": 1.01580036, "epoch": 0.6042205254614321, "flos": 35297495631360.0, "grad_norm": 2.1593662603301973, "language_loss": 0.73804384, "learning_rate": 1.430015234897564e-06, "loss": 0.7598542, "num_input_tokens_seen": 108364370, "step": 5025, "time_per_iteration": 3.854935884475708 }, { "auxiliary_loss_clip": 0.01175555, "auxiliary_loss_mlp": 0.00711171, "balance_loss_clip": 1.05300152, "balance_loss_mlp": 1.00060487, "epoch": 0.6043407683520712, "flos": 45658262206080.0, "grad_norm": 2.1914944857647116, "language_loss": 0.66191918, "learning_rate": 1.4292686106267274e-06, "loss": 0.68078643, "num_input_tokens_seen": 108387220, "step": 5026, "time_per_iteration": 3.872077703475952 }, { "auxiliary_loss_clip": 0.01161824, "auxiliary_loss_mlp": 0.0102562, "balance_loss_clip": 1.0497241, "balance_loss_mlp": 1.01815128, "epoch": 0.6044610112427102, "flos": 16180020138240.0, "grad_norm": 1.8043761415675093, "language_loss": 0.77073175, "learning_rate": 1.4285220729353876e-06, "loss": 0.79260617, "num_input_tokens_seen": 108405760, "step": 5027, "time_per_iteration": 3.4532768726348877 }, { "auxiliary_loss_clip": 0.01140912, "auxiliary_loss_mlp": 0.01030541, "balance_loss_clip": 1.04706538, "balance_loss_mlp": 1.02370143, "epoch": 0.6045812541333494, "flos": 13804062186240.0, "grad_norm": 5.8358423899119725, "language_loss": 0.77834684, "learning_rate": 1.4277756219367957e-06, "loss": 0.80006135, "num_input_tokens_seen": 108422785, "step": 5028, "time_per_iteration": 2.58351993560791 }, { "auxiliary_loss_clip": 0.01133499, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.04866147, "balance_loss_mlp": 1.02369976, "epoch": 0.6047014970239885, "flos": 19975059682560.0, "grad_norm": 2.0767064952198755, "language_loss": 0.80029821, "learning_rate": 1.4270292577441864e-06, "loss": 0.82194257, "num_input_tokens_seen": 108442290, "step": 5029, "time_per_iteration": 2.5887396335601807 }, { "auxiliary_loss_clip": 0.01161272, "auxiliary_loss_mlp": 0.01024887, "balance_loss_clip": 1.04958701, "balance_loss_mlp": 1.01747227, "epoch": 0.6048217399146275, "flos": 25337097025920.0, "grad_norm": 1.653449482257427, "language_loss": 0.71348429, "learning_rate": 1.4262829804707836e-06, "loss": 0.73534584, "num_input_tokens_seen": 108464280, "step": 5030, "time_per_iteration": 2.6036996841430664 }, { "auxiliary_loss_clip": 0.01158863, "auxiliary_loss_mlp": 0.01030231, "balance_loss_clip": 1.04837656, "balance_loss_mlp": 1.02302468, "epoch": 0.6049419828052667, "flos": 26030819370240.0, "grad_norm": 2.26425625402684, "language_loss": 0.70018899, "learning_rate": 1.4255367902297958e-06, "loss": 0.72207987, "num_input_tokens_seen": 108485610, "step": 5031, "time_per_iteration": 2.6566975116729736 }, { "auxiliary_loss_clip": 0.01172742, "auxiliary_loss_mlp": 0.01024349, "balance_loss_clip": 1.05219305, "balance_loss_mlp": 1.01755428, "epoch": 0.6050622256959057, "flos": 14648106948480.0, "grad_norm": 2.2677087783561065, "language_loss": 0.78881919, "learning_rate": 1.4247906871344215e-06, "loss": 0.81079018, "num_input_tokens_seen": 108501005, "step": 5032, "time_per_iteration": 2.56443452835083 }, { "auxiliary_loss_clip": 0.01137413, "auxiliary_loss_mlp": 0.01020938, "balance_loss_clip": 1.04707003, "balance_loss_mlp": 1.01408887, "epoch": 0.6051824685865448, "flos": 23331450337920.0, "grad_norm": 2.8890439435297335, "language_loss": 0.75395596, "learning_rate": 1.4240446712978415e-06, "loss": 0.77553946, "num_input_tokens_seen": 108519990, "step": 5033, "time_per_iteration": 2.8014659881591797 }, { "auxiliary_loss_clip": 0.01163993, "auxiliary_loss_mlp": 0.01025751, "balance_loss_clip": 1.05228114, "balance_loss_mlp": 1.01848555, "epoch": 0.605302711477184, "flos": 27563307177600.0, "grad_norm": 2.1796366445856705, "language_loss": 0.74260747, "learning_rate": 1.423298742833227e-06, "loss": 0.76450491, "num_input_tokens_seen": 108538650, "step": 5034, "time_per_iteration": 2.6853206157684326 }, { "auxiliary_loss_clip": 0.01131569, "auxiliary_loss_mlp": 0.01027244, "balance_loss_clip": 1.04526865, "balance_loss_mlp": 1.02001405, "epoch": 0.605422954367823, "flos": 15154698412800.0, "grad_norm": 2.083742882636117, "language_loss": 0.7184732, "learning_rate": 1.4225529018537352e-06, "loss": 0.74006128, "num_input_tokens_seen": 108554155, "step": 5035, "time_per_iteration": 2.6725547313690186 }, { "auxiliary_loss_clip": 0.01174359, "auxiliary_loss_mlp": 0.01029702, "balance_loss_clip": 1.05160582, "balance_loss_mlp": 1.02287734, "epoch": 0.6055431972584621, "flos": 27673912131840.0, "grad_norm": 7.141726143181831, "language_loss": 0.7834903, "learning_rate": 1.4218071484725082e-06, "loss": 0.80553091, "num_input_tokens_seen": 108576275, "step": 5036, "time_per_iteration": 2.654296875 }, { "auxiliary_loss_clip": 0.01143101, "auxiliary_loss_mlp": 0.01027166, "balance_loss_clip": 1.05190933, "balance_loss_mlp": 1.01988578, "epoch": 0.6056634401491012, "flos": 19387489006080.0, "grad_norm": 2.0741640899289826, "language_loss": 0.76082426, "learning_rate": 1.4210614828026786e-06, "loss": 0.78252697, "num_input_tokens_seen": 108594125, "step": 5037, "time_per_iteration": 2.609847068786621 }, { "auxiliary_loss_clip": 0.01172162, "auxiliary_loss_mlp": 0.01024797, "balance_loss_clip": 1.04934931, "balance_loss_mlp": 1.01772773, "epoch": 0.6057836830397403, "flos": 24789459294720.0, "grad_norm": 2.0212056360720028, "language_loss": 0.74475437, "learning_rate": 1.4203159049573605e-06, "loss": 0.76672393, "num_input_tokens_seen": 108615360, "step": 5038, "time_per_iteration": 2.6437454223632812 }, { "auxiliary_loss_clip": 0.0114859, "auxiliary_loss_mlp": 0.01026211, "balance_loss_clip": 1.04823267, "balance_loss_mlp": 1.01891506, "epoch": 0.6059039259303793, "flos": 20558248899840.0, "grad_norm": 2.5869982943685645, "language_loss": 0.87069613, "learning_rate": 1.4195704150496593e-06, "loss": 0.89244413, "num_input_tokens_seen": 108633075, "step": 5039, "time_per_iteration": 2.623953104019165 }, { "auxiliary_loss_clip": 0.0114245, "auxiliary_loss_mlp": 0.01027344, "balance_loss_clip": 1.04958141, "balance_loss_mlp": 1.02048898, "epoch": 0.6060241688210185, "flos": 21069724613760.0, "grad_norm": 1.8285953669843111, "language_loss": 0.7381289, "learning_rate": 1.4188250131926639e-06, "loss": 0.7598269, "num_input_tokens_seen": 108651875, "step": 5040, "time_per_iteration": 2.6643874645233154 }, { "auxiliary_loss_clip": 0.01142441, "auxiliary_loss_mlp": 0.01027427, "balance_loss_clip": 1.04795909, "balance_loss_mlp": 1.01924968, "epoch": 0.6061444117116576, "flos": 16361081619840.0, "grad_norm": 1.947990157293428, "language_loss": 0.80430865, "learning_rate": 1.4180796994994525e-06, "loss": 0.82600725, "num_input_tokens_seen": 108669290, "step": 5041, "time_per_iteration": 2.610283136367798 }, { "auxiliary_loss_clip": 0.01141186, "auxiliary_loss_mlp": 0.01021811, "balance_loss_clip": 1.0482595, "balance_loss_mlp": 1.014781, "epoch": 0.6062646546022966, "flos": 21507296094720.0, "grad_norm": 1.9902613936558058, "language_loss": 0.72400212, "learning_rate": 1.4173344740830877e-06, "loss": 0.74563211, "num_input_tokens_seen": 108688420, "step": 5042, "time_per_iteration": 2.6510794162750244 }, { "auxiliary_loss_clip": 0.01138534, "auxiliary_loss_mlp": 0.01029651, "balance_loss_clip": 1.05197191, "balance_loss_mlp": 1.02211165, "epoch": 0.6063848974929358, "flos": 38983151283840.0, "grad_norm": 2.8471470931628238, "language_loss": 0.70704651, "learning_rate": 1.4165893370566206e-06, "loss": 0.72872841, "num_input_tokens_seen": 108712175, "step": 5043, "time_per_iteration": 2.823525905609131 }, { "auxiliary_loss_clip": 0.01154932, "auxiliary_loss_mlp": 0.01026154, "balance_loss_clip": 1.04930902, "balance_loss_mlp": 1.01817322, "epoch": 0.6065051403835748, "flos": 19646584784640.0, "grad_norm": 2.181709809295318, "language_loss": 0.77476323, "learning_rate": 1.4158442885330865e-06, "loss": 0.79657412, "num_input_tokens_seen": 108730745, "step": 5044, "time_per_iteration": 2.6995701789855957 }, { "auxiliary_loss_clip": 0.01151261, "auxiliary_loss_mlp": 0.01024854, "balance_loss_clip": 1.04786682, "balance_loss_mlp": 1.01766586, "epoch": 0.6066253832742139, "flos": 23513086437120.0, "grad_norm": 2.172604400074371, "language_loss": 0.78808427, "learning_rate": 1.4150993286255094e-06, "loss": 0.80984545, "num_input_tokens_seen": 108749995, "step": 5045, "time_per_iteration": 2.647531032562256 }, { "auxiliary_loss_clip": 0.0117318, "auxiliary_loss_mlp": 0.01022299, "balance_loss_clip": 1.05160975, "balance_loss_mlp": 1.01548648, "epoch": 0.6067456261648531, "flos": 19133708440320.0, "grad_norm": 2.222547534879372, "language_loss": 0.79969096, "learning_rate": 1.4143544574468993e-06, "loss": 0.8216458, "num_input_tokens_seen": 108768355, "step": 5046, "time_per_iteration": 2.5464694499969482 }, { "auxiliary_loss_clip": 0.01154594, "auxiliary_loss_mlp": 0.01020353, "balance_loss_clip": 1.05005682, "balance_loss_mlp": 1.01318884, "epoch": 0.6068658690554921, "flos": 20520614424960.0, "grad_norm": 2.203892922574325, "language_loss": 0.8259064, "learning_rate": 1.4136096751102523e-06, "loss": 0.84765589, "num_input_tokens_seen": 108786685, "step": 5047, "time_per_iteration": 2.6039340496063232 }, { "auxiliary_loss_clip": 0.01146459, "auxiliary_loss_mlp": 0.01023502, "balance_loss_clip": 1.05061817, "balance_loss_mlp": 1.01634097, "epoch": 0.6069861119461312, "flos": 27374560185600.0, "grad_norm": 2.2780070023288737, "language_loss": 0.83274746, "learning_rate": 1.4128649817285516e-06, "loss": 0.85444707, "num_input_tokens_seen": 108804820, "step": 5048, "time_per_iteration": 2.772183418273926 }, { "auxiliary_loss_clip": 0.01144877, "auxiliary_loss_mlp": 0.01027923, "balance_loss_clip": 1.04810727, "balance_loss_mlp": 1.02085066, "epoch": 0.6071063548367702, "flos": 25626500904960.0, "grad_norm": 2.1218751617518974, "language_loss": 0.63655233, "learning_rate": 1.412120377414766e-06, "loss": 0.65828037, "num_input_tokens_seen": 108825010, "step": 5049, "time_per_iteration": 2.724893808364868 }, { "auxiliary_loss_clip": 0.01174211, "auxiliary_loss_mlp": 0.01023982, "balance_loss_clip": 1.05410588, "balance_loss_mlp": 1.01706767, "epoch": 0.6072265977274094, "flos": 24460517520000.0, "grad_norm": 1.692968229572737, "language_loss": 0.71462107, "learning_rate": 1.4113758622818522e-06, "loss": 0.73660302, "num_input_tokens_seen": 108845075, "step": 5050, "time_per_iteration": 2.654768705368042 }, { "auxiliary_loss_clip": 0.01147705, "auxiliary_loss_mlp": 0.00710095, "balance_loss_clip": 1.05089509, "balance_loss_mlp": 1.00046372, "epoch": 0.6073468406180484, "flos": 18149253413760.0, "grad_norm": 1.9427948335675689, "language_loss": 0.82938659, "learning_rate": 1.410631436442751e-06, "loss": 0.84796453, "num_input_tokens_seen": 108863870, "step": 5051, "time_per_iteration": 3.5798892974853516 }, { "auxiliary_loss_clip": 0.01164446, "auxiliary_loss_mlp": 0.0102484, "balance_loss_clip": 1.05159712, "balance_loss_mlp": 1.01764584, "epoch": 0.6074670835086875, "flos": 20697617669760.0, "grad_norm": 2.0551098180605365, "language_loss": 0.86942554, "learning_rate": 1.4098871000103936e-06, "loss": 0.89131832, "num_input_tokens_seen": 108882470, "step": 5052, "time_per_iteration": 4.471165180206299 }, { "auxiliary_loss_clip": 0.01142313, "auxiliary_loss_mlp": 0.01033586, "balance_loss_clip": 1.04878271, "balance_loss_mlp": 1.02620125, "epoch": 0.6075873263993267, "flos": 23769955572480.0, "grad_norm": 1.8274346438245583, "language_loss": 0.82489461, "learning_rate": 1.409142853097693e-06, "loss": 0.84665358, "num_input_tokens_seen": 108902710, "step": 5053, "time_per_iteration": 2.720479726791382 }, { "auxiliary_loss_clip": 0.0114544, "auxiliary_loss_mlp": 0.01022976, "balance_loss_clip": 1.05043733, "balance_loss_mlp": 1.01502454, "epoch": 0.6077075692899657, "flos": 24454484035200.0, "grad_norm": 2.018125394986531, "language_loss": 0.7913841, "learning_rate": 1.408398695817553e-06, "loss": 0.81306827, "num_input_tokens_seen": 108919935, "step": 5054, "time_per_iteration": 2.641209363937378 }, { "auxiliary_loss_clip": 0.01142281, "auxiliary_loss_mlp": 0.0103038, "balance_loss_clip": 1.04794097, "balance_loss_mlp": 1.02310872, "epoch": 0.6078278121806048, "flos": 27382102041600.0, "grad_norm": 16.066337799295994, "language_loss": 0.70246083, "learning_rate": 1.4076546282828593e-06, "loss": 0.72418743, "num_input_tokens_seen": 108942790, "step": 5055, "time_per_iteration": 2.7077279090881348 }, { "auxiliary_loss_clip": 0.01144163, "auxiliary_loss_mlp": 0.01028266, "balance_loss_clip": 1.04582143, "balance_loss_mlp": 1.02155423, "epoch": 0.6079480550712439, "flos": 38436447306240.0, "grad_norm": 2.5866211209759262, "language_loss": 0.66394007, "learning_rate": 1.4069106506064874e-06, "loss": 0.68566442, "num_input_tokens_seen": 108964215, "step": 5056, "time_per_iteration": 2.7434945106506348 }, { "auxiliary_loss_clip": 0.01137953, "auxiliary_loss_mlp": 0.01021464, "balance_loss_clip": 1.05128431, "balance_loss_mlp": 1.01502395, "epoch": 0.608068297961883, "flos": 25336271013120.0, "grad_norm": 2.0875054835372424, "language_loss": 0.78517139, "learning_rate": 1.4061667629012989e-06, "loss": 0.80676556, "num_input_tokens_seen": 108984885, "step": 5057, "time_per_iteration": 2.711047410964966 }, { "auxiliary_loss_clip": 0.0113243, "auxiliary_loss_mlp": 0.01023153, "balance_loss_clip": 1.04791236, "balance_loss_mlp": 1.01656973, "epoch": 0.608188540852522, "flos": 24202463235840.0, "grad_norm": 2.533672304304302, "language_loss": 0.83311439, "learning_rate": 1.40542296528014e-06, "loss": 0.85467017, "num_input_tokens_seen": 109004545, "step": 5058, "time_per_iteration": 2.68938946723938 }, { "auxiliary_loss_clip": 0.01155, "auxiliary_loss_mlp": 0.01029018, "balance_loss_clip": 1.04799008, "balance_loss_mlp": 1.02144516, "epoch": 0.6083087837431612, "flos": 21284146851840.0, "grad_norm": 2.659980254737074, "language_loss": 0.75879419, "learning_rate": 1.4046792578558452e-06, "loss": 0.7806344, "num_input_tokens_seen": 109022440, "step": 5059, "time_per_iteration": 2.6096572875976562 }, { "auxiliary_loss_clip": 0.01139497, "auxiliary_loss_mlp": 0.01024481, "balance_loss_clip": 1.04859972, "balance_loss_mlp": 1.01768327, "epoch": 0.6084290266338003, "flos": 16471435178880.0, "grad_norm": 2.862445763611154, "language_loss": 0.76285356, "learning_rate": 1.4039356407412325e-06, "loss": 0.78449345, "num_input_tokens_seen": 109035680, "step": 5060, "time_per_iteration": 2.612347364425659 }, { "auxiliary_loss_clip": 0.01075166, "auxiliary_loss_mlp": 0.01002937, "balance_loss_clip": 1.0275054, "balance_loss_mlp": 1.00157762, "epoch": 0.6085492695244393, "flos": 66443574931200.0, "grad_norm": 0.7798031942896321, "language_loss": 0.57123768, "learning_rate": 1.40319211404911e-06, "loss": 0.59201872, "num_input_tokens_seen": 109090680, "step": 5061, "time_per_iteration": 3.163020372390747 }, { "auxiliary_loss_clip": 0.01174531, "auxiliary_loss_mlp": 0.01022859, "balance_loss_clip": 1.05274606, "balance_loss_mlp": 1.0163976, "epoch": 0.6086695124150785, "flos": 23618986709760.0, "grad_norm": 2.0628137822321255, "language_loss": 0.90791976, "learning_rate": 1.4024486778922691e-06, "loss": 0.92989361, "num_input_tokens_seen": 109108995, "step": 5062, "time_per_iteration": 2.5647518634796143 }, { "auxiliary_loss_clip": 0.01146313, "auxiliary_loss_mlp": 0.01024486, "balance_loss_clip": 1.04732156, "balance_loss_mlp": 1.0176878, "epoch": 0.6087897553057176, "flos": 20157054917760.0, "grad_norm": 2.624029728483392, "language_loss": 0.77444834, "learning_rate": 1.4017053323834884e-06, "loss": 0.79615629, "num_input_tokens_seen": 109128825, "step": 5063, "time_per_iteration": 2.659489393234253 }, { "auxiliary_loss_clip": 0.01144688, "auxiliary_loss_mlp": 0.01025265, "balance_loss_clip": 1.04828835, "balance_loss_mlp": 1.01840794, "epoch": 0.6089099981963566, "flos": 25482535194240.0, "grad_norm": 1.8903597952590805, "language_loss": 0.75936484, "learning_rate": 1.4009620776355333e-06, "loss": 0.78106439, "num_input_tokens_seen": 109150425, "step": 5064, "time_per_iteration": 2.6417601108551025 }, { "auxiliary_loss_clip": 0.01156749, "auxiliary_loss_mlp": 0.01026219, "balance_loss_clip": 1.05085731, "balance_loss_mlp": 1.01900959, "epoch": 0.6090302410869958, "flos": 25332895134720.0, "grad_norm": 1.8313783878458427, "language_loss": 0.79330504, "learning_rate": 1.4002189137611553e-06, "loss": 0.81513476, "num_input_tokens_seen": 109169765, "step": 5065, "time_per_iteration": 2.686495780944824 }, { "auxiliary_loss_clip": 0.0115702, "auxiliary_loss_mlp": 0.01028237, "balance_loss_clip": 1.05073428, "balance_loss_mlp": 1.02123976, "epoch": 0.6091504839776348, "flos": 23987358639360.0, "grad_norm": 2.1588336101393395, "language_loss": 0.69825244, "learning_rate": 1.3994758408730901e-06, "loss": 0.72010505, "num_input_tokens_seen": 109188950, "step": 5066, "time_per_iteration": 2.6474759578704834 }, { "auxiliary_loss_clip": 0.01143712, "auxiliary_loss_mlp": 0.01026347, "balance_loss_clip": 1.05101669, "balance_loss_mlp": 1.01923954, "epoch": 0.6092707268682739, "flos": 29643037666560.0, "grad_norm": 2.5485334414532654, "language_loss": 0.76323968, "learning_rate": 1.3987328590840629e-06, "loss": 0.78494024, "num_input_tokens_seen": 109209895, "step": 5067, "time_per_iteration": 2.6700775623321533 }, { "auxiliary_loss_clip": 0.01154683, "auxiliary_loss_mlp": 0.01024773, "balance_loss_clip": 1.04886019, "balance_loss_mlp": 1.01818371, "epoch": 0.609390969758913, "flos": 24024957200640.0, "grad_norm": 1.93186038273054, "language_loss": 0.86469042, "learning_rate": 1.397989968506783e-06, "loss": 0.88648492, "num_input_tokens_seen": 109228905, "step": 5068, "time_per_iteration": 2.6509571075439453 }, { "auxiliary_loss_clip": 0.01178314, "auxiliary_loss_mlp": 0.01026262, "balance_loss_clip": 1.05509973, "balance_loss_mlp": 1.01907933, "epoch": 0.6095112126495521, "flos": 11102143288320.0, "grad_norm": 3.5539046021724046, "language_loss": 0.72857326, "learning_rate": 1.3972471692539458e-06, "loss": 0.75061899, "num_input_tokens_seen": 109243620, "step": 5069, "time_per_iteration": 2.5074892044067383 }, { "auxiliary_loss_clip": 0.01139122, "auxiliary_loss_mlp": 0.01023818, "balance_loss_clip": 1.04996657, "balance_loss_mlp": 1.01647496, "epoch": 0.6096314555401912, "flos": 17265491187840.0, "grad_norm": 2.8453724101730185, "language_loss": 0.75110692, "learning_rate": 1.3965044614382348e-06, "loss": 0.77273631, "num_input_tokens_seen": 109259070, "step": 5070, "time_per_iteration": 2.629824161529541 }, { "auxiliary_loss_clip": 0.01177021, "auxiliary_loss_mlp": 0.01023064, "balance_loss_clip": 1.05340278, "balance_loss_mlp": 1.01592934, "epoch": 0.6097516984308303, "flos": 21645910679040.0, "grad_norm": 6.547257460340346, "language_loss": 0.75476152, "learning_rate": 1.3957618451723162e-06, "loss": 0.77676237, "num_input_tokens_seen": 109275100, "step": 5071, "time_per_iteration": 2.5686521530151367 }, { "auxiliary_loss_clip": 0.01145135, "auxiliary_loss_mlp": 0.01020993, "balance_loss_clip": 1.04945445, "balance_loss_mlp": 1.01374507, "epoch": 0.6098719413214694, "flos": 27199208966400.0, "grad_norm": 2.0446722883427517, "language_loss": 0.71903312, "learning_rate": 1.3950193205688457e-06, "loss": 0.7406944, "num_input_tokens_seen": 109294825, "step": 5072, "time_per_iteration": 2.6875052452087402 }, { "auxiliary_loss_clip": 0.01141211, "auxiliary_loss_mlp": 0.01023602, "balance_loss_clip": 1.05048203, "balance_loss_mlp": 1.01703358, "epoch": 0.6099921842121084, "flos": 20412954385920.0, "grad_norm": 2.6649144659831236, "language_loss": 0.8388136, "learning_rate": 1.3942768877404627e-06, "loss": 0.86046171, "num_input_tokens_seen": 109313790, "step": 5073, "time_per_iteration": 2.612581253051758 }, { "auxiliary_loss_clip": 0.01173631, "auxiliary_loss_mlp": 0.0102556, "balance_loss_clip": 1.05170178, "balance_loss_mlp": 1.01907849, "epoch": 0.6101124271027476, "flos": 23366139897600.0, "grad_norm": 1.740743481033528, "language_loss": 0.73807746, "learning_rate": 1.393534546799795e-06, "loss": 0.76006937, "num_input_tokens_seen": 109333490, "step": 5074, "time_per_iteration": 2.5957891941070557 }, { "auxiliary_loss_clip": 0.01133038, "auxiliary_loss_mlp": 0.01026974, "balance_loss_clip": 1.04813552, "balance_loss_mlp": 1.01962471, "epoch": 0.6102326699933867, "flos": 26687840993280.0, "grad_norm": 1.9292933497681348, "language_loss": 0.68126279, "learning_rate": 1.3927922978594536e-06, "loss": 0.70286286, "num_input_tokens_seen": 109354575, "step": 5075, "time_per_iteration": 2.6678173542022705 }, { "auxiliary_loss_clip": 0.01071462, "auxiliary_loss_mlp": 0.01007527, "balance_loss_clip": 1.02664936, "balance_loss_mlp": 1.00609672, "epoch": 0.6103529128840257, "flos": 60644612551680.0, "grad_norm": 0.7757760887011103, "language_loss": 0.57399249, "learning_rate": 1.3920501410320387e-06, "loss": 0.59478235, "num_input_tokens_seen": 109410690, "step": 5076, "time_per_iteration": 3.141036033630371 }, { "auxiliary_loss_clip": 0.01141953, "auxiliary_loss_mlp": 0.01022793, "balance_loss_clip": 1.04793108, "balance_loss_mlp": 1.01561689, "epoch": 0.6104731557746649, "flos": 19021307806080.0, "grad_norm": 3.2543893689867, "language_loss": 0.7645843, "learning_rate": 1.3913080764301333e-06, "loss": 0.78623182, "num_input_tokens_seen": 109427650, "step": 5077, "time_per_iteration": 3.558955192565918 }, { "auxiliary_loss_clip": 0.01119647, "auxiliary_loss_mlp": 0.01023872, "balance_loss_clip": 1.04569077, "balance_loss_mlp": 1.017277, "epoch": 0.6105933986653039, "flos": 23366894083200.0, "grad_norm": 3.639860471356355, "language_loss": 0.71251559, "learning_rate": 1.3905661041663085e-06, "loss": 0.73395073, "num_input_tokens_seen": 109448835, "step": 5078, "time_per_iteration": 4.596017837524414 }, { "auxiliary_loss_clip": 0.01158643, "auxiliary_loss_mlp": 0.01026793, "balance_loss_clip": 1.05153298, "balance_loss_mlp": 1.01880002, "epoch": 0.610713641555943, "flos": 34637565006720.0, "grad_norm": 2.1009647953182387, "language_loss": 0.65511793, "learning_rate": 1.389824224353122e-06, "loss": 0.67697227, "num_input_tokens_seen": 109470425, "step": 5079, "time_per_iteration": 3.667264223098755 }, { "auxiliary_loss_clip": 0.01159362, "auxiliary_loss_mlp": 0.01022347, "balance_loss_clip": 1.05395436, "balance_loss_mlp": 1.01582575, "epoch": 0.610833884446582, "flos": 26646471504000.0, "grad_norm": 3.024642758927516, "language_loss": 0.76928818, "learning_rate": 1.389082437103115e-06, "loss": 0.79110527, "num_input_tokens_seen": 109489695, "step": 5080, "time_per_iteration": 2.697657585144043 }, { "auxiliary_loss_clip": 0.01125018, "auxiliary_loss_mlp": 0.01028547, "balance_loss_clip": 1.04537213, "balance_loss_mlp": 1.02113211, "epoch": 0.6109541273372212, "flos": 21215126868480.0, "grad_norm": 4.918945259823067, "language_loss": 0.78328359, "learning_rate": 1.3883407425288172e-06, "loss": 0.80481935, "num_input_tokens_seen": 109510030, "step": 5081, "time_per_iteration": 2.646787405014038 }, { "auxiliary_loss_clip": 0.0113966, "auxiliary_loss_mlp": 0.01024453, "balance_loss_clip": 1.04755592, "balance_loss_mlp": 1.01749969, "epoch": 0.6110743702278603, "flos": 20084084438400.0, "grad_norm": 2.2839601603762083, "language_loss": 0.79791725, "learning_rate": 1.3875991407427417e-06, "loss": 0.81955838, "num_input_tokens_seen": 109528255, "step": 5082, "time_per_iteration": 2.6543564796447754 }, { "auxiliary_loss_clip": 0.01052273, "auxiliary_loss_mlp": 0.01002589, "balance_loss_clip": 1.02647591, "balance_loss_mlp": 1.0011586, "epoch": 0.6111946131184993, "flos": 68302957438080.0, "grad_norm": 0.7676658668055486, "language_loss": 0.58154428, "learning_rate": 1.38685763185739e-06, "loss": 0.60209292, "num_input_tokens_seen": 109581915, "step": 5083, "time_per_iteration": 3.252838134765625 }, { "auxiliary_loss_clip": 0.0117323, "auxiliary_loss_mlp": 0.01024585, "balance_loss_clip": 1.051759, "balance_loss_mlp": 1.01705062, "epoch": 0.6113148560091385, "flos": 19937676602880.0, "grad_norm": 3.338289275429684, "language_loss": 0.67998922, "learning_rate": 1.3861162159852476e-06, "loss": 0.70196736, "num_input_tokens_seen": 109600050, "step": 5084, "time_per_iteration": 2.5808398723602295 }, { "auxiliary_loss_clip": 0.01146987, "auxiliary_loss_mlp": 0.01026862, "balance_loss_clip": 1.04966235, "balance_loss_mlp": 1.01964378, "epoch": 0.6114350988997775, "flos": 23731854220800.0, "grad_norm": 1.7746827153548976, "language_loss": 0.80049628, "learning_rate": 1.3853748932387875e-06, "loss": 0.82223475, "num_input_tokens_seen": 109620690, "step": 5085, "time_per_iteration": 2.6825594902038574 }, { "auxiliary_loss_clip": 0.01130775, "auxiliary_loss_mlp": 0.01028598, "balance_loss_clip": 1.04521835, "balance_loss_mlp": 1.0212127, "epoch": 0.6115553417904166, "flos": 24023700224640.0, "grad_norm": 6.5060842665765275, "language_loss": 0.74662167, "learning_rate": 1.3846336637304671e-06, "loss": 0.76821542, "num_input_tokens_seen": 109638960, "step": 5086, "time_per_iteration": 2.664593458175659 }, { "auxiliary_loss_clip": 0.01138381, "auxiliary_loss_mlp": 0.01026338, "balance_loss_clip": 1.05098248, "balance_loss_mlp": 1.01919699, "epoch": 0.6116755846810558, "flos": 23733542160000.0, "grad_norm": 2.199011175919107, "language_loss": 0.83277851, "learning_rate": 1.3838925275727316e-06, "loss": 0.85442567, "num_input_tokens_seen": 109659700, "step": 5087, "time_per_iteration": 2.710524320602417 }, { "auxiliary_loss_clip": 0.01174449, "auxiliary_loss_mlp": 0.01020773, "balance_loss_clip": 1.05272365, "balance_loss_mlp": 1.01392436, "epoch": 0.6117958275716948, "flos": 18661626967680.0, "grad_norm": 2.088833978009668, "language_loss": 0.78939605, "learning_rate": 1.3831514848780089e-06, "loss": 0.8113482, "num_input_tokens_seen": 109679275, "step": 5088, "time_per_iteration": 2.531046152114868 }, { "auxiliary_loss_clip": 0.01149232, "auxiliary_loss_mlp": 0.01029732, "balance_loss_clip": 1.04777575, "balance_loss_mlp": 1.02285409, "epoch": 0.6119160704623339, "flos": 16471183783680.0, "grad_norm": 3.094042511997595, "language_loss": 0.91963238, "learning_rate": 1.3824105357587152e-06, "loss": 0.94142199, "num_input_tokens_seen": 109696380, "step": 5089, "time_per_iteration": 2.6436660289764404 }, { "auxiliary_loss_clip": 0.0113562, "auxiliary_loss_mlp": 0.01027718, "balance_loss_clip": 1.04501402, "balance_loss_mlp": 1.02053618, "epoch": 0.612036313352973, "flos": 23915465568000.0, "grad_norm": 1.6653980978890073, "language_loss": 0.82535386, "learning_rate": 1.381669680327253e-06, "loss": 0.84698725, "num_input_tokens_seen": 109718060, "step": 5090, "time_per_iteration": 2.723212480545044 }, { "auxiliary_loss_clip": 0.01135671, "auxiliary_loss_mlp": 0.01024924, "balance_loss_clip": 1.04987752, "balance_loss_mlp": 1.01770854, "epoch": 0.6121565562436121, "flos": 26974766833920.0, "grad_norm": 2.0428588609823697, "language_loss": 0.71059155, "learning_rate": 1.380928918696008e-06, "loss": 0.7321974, "num_input_tokens_seen": 109736830, "step": 5091, "time_per_iteration": 2.7015573978424072 }, { "auxiliary_loss_clip": 0.01155765, "auxiliary_loss_mlp": 0.01023524, "balance_loss_clip": 1.04892123, "balance_loss_mlp": 1.01622224, "epoch": 0.6122767991342511, "flos": 15668867646720.0, "grad_norm": 2.9792998532257076, "language_loss": 0.71498847, "learning_rate": 1.3801882509773548e-06, "loss": 0.73678136, "num_input_tokens_seen": 109754690, "step": 5092, "time_per_iteration": 2.551835060119629 }, { "auxiliary_loss_clip": 0.0115221, "auxiliary_loss_mlp": 0.01022846, "balance_loss_clip": 1.04732454, "balance_loss_mlp": 1.01549041, "epoch": 0.6123970420248903, "flos": 27964321591680.0, "grad_norm": 1.739537143459195, "language_loss": 0.81716436, "learning_rate": 1.3794476772836503e-06, "loss": 0.83891493, "num_input_tokens_seen": 109775790, "step": 5093, "time_per_iteration": 2.6635799407958984 }, { "auxiliary_loss_clip": 0.01117979, "auxiliary_loss_mlp": 0.01025994, "balance_loss_clip": 1.0453856, "balance_loss_mlp": 1.01858473, "epoch": 0.6125172849155294, "flos": 21468727866240.0, "grad_norm": 1.901948314279558, "language_loss": 0.84567553, "learning_rate": 1.3787071977272402e-06, "loss": 0.86711526, "num_input_tokens_seen": 109795050, "step": 5094, "time_per_iteration": 2.6788952350616455 }, { "auxiliary_loss_clip": 0.01108162, "auxiliary_loss_mlp": 0.01027505, "balance_loss_clip": 1.04858828, "balance_loss_mlp": 1.01971221, "epoch": 0.6126375278061684, "flos": 16248321849600.0, "grad_norm": 2.7959821354842496, "language_loss": 0.71804732, "learning_rate": 1.3779668124204535e-06, "loss": 0.73940396, "num_input_tokens_seen": 109811465, "step": 5095, "time_per_iteration": 2.717298746109009 }, { "auxiliary_loss_clip": 0.01136051, "auxiliary_loss_mlp": 0.0102683, "balance_loss_clip": 1.05019474, "balance_loss_mlp": 1.01925397, "epoch": 0.6127577706968076, "flos": 20448865008000.0, "grad_norm": 1.8908739816681, "language_loss": 0.81182241, "learning_rate": 1.3772265214756074e-06, "loss": 0.83345115, "num_input_tokens_seen": 109831225, "step": 5096, "time_per_iteration": 2.6540133953094482 }, { "auxiliary_loss_clip": 0.01159491, "auxiliary_loss_mlp": 0.01024857, "balance_loss_clip": 1.04941928, "balance_loss_mlp": 1.01787376, "epoch": 0.6128780135874466, "flos": 18260397072000.0, "grad_norm": 1.8199696107553014, "language_loss": 0.75177336, "learning_rate": 1.3764863250050025e-06, "loss": 0.77361685, "num_input_tokens_seen": 109849465, "step": 5097, "time_per_iteration": 2.6185286045074463 }, { "auxiliary_loss_clip": 0.01127687, "auxiliary_loss_mlp": 0.0102452, "balance_loss_clip": 1.04808438, "balance_loss_mlp": 1.01754582, "epoch": 0.6129982564780857, "flos": 24937088192640.0, "grad_norm": 2.409057806493531, "language_loss": 0.80712038, "learning_rate": 1.3757462231209272e-06, "loss": 0.82864249, "num_input_tokens_seen": 109869770, "step": 5098, "time_per_iteration": 2.730602979660034 }, { "auxiliary_loss_clip": 0.0113443, "auxiliary_loss_mlp": 0.01020888, "balance_loss_clip": 1.04663002, "balance_loss_mlp": 1.01450741, "epoch": 0.6131184993687249, "flos": 22492038430080.0, "grad_norm": 4.454136838299852, "language_loss": 0.88456422, "learning_rate": 1.3750062159356525e-06, "loss": 0.90611744, "num_input_tokens_seen": 109889120, "step": 5099, "time_per_iteration": 2.7232513427734375 }, { "auxiliary_loss_clip": 0.01116655, "auxiliary_loss_mlp": 0.01022664, "balance_loss_clip": 1.04617, "balance_loss_mlp": 1.01482654, "epoch": 0.6132387422593639, "flos": 15885839750400.0, "grad_norm": 2.121669404132912, "language_loss": 0.83407009, "learning_rate": 1.3742663035614382e-06, "loss": 0.85546327, "num_input_tokens_seen": 109906490, "step": 5100, "time_per_iteration": 2.649061441421509 }, { "auxiliary_loss_clip": 0.01176224, "auxiliary_loss_mlp": 0.01026919, "balance_loss_clip": 1.05358171, "balance_loss_mlp": 1.01951647, "epoch": 0.613358985150003, "flos": 25411539962880.0, "grad_norm": 2.380675163742297, "language_loss": 0.80243886, "learning_rate": 1.3735264861105283e-06, "loss": 0.82447034, "num_input_tokens_seen": 109927130, "step": 5101, "time_per_iteration": 2.627307891845703 }, { "auxiliary_loss_clip": 0.01126604, "auxiliary_loss_mlp": 0.01019085, "balance_loss_clip": 1.04776978, "balance_loss_mlp": 1.01271033, "epoch": 0.6134792280406421, "flos": 21361283308800.0, "grad_norm": 2.3687667916393913, "language_loss": 0.78585804, "learning_rate": 1.372786763695152e-06, "loss": 0.80731487, "num_input_tokens_seen": 109945890, "step": 5102, "time_per_iteration": 2.699744701385498 }, { "auxiliary_loss_clip": 0.01161345, "auxiliary_loss_mlp": 0.01023602, "balance_loss_clip": 1.05059564, "balance_loss_mlp": 1.01618087, "epoch": 0.6135994709312812, "flos": 21211248199680.0, "grad_norm": 1.9521164471679018, "language_loss": 0.77586961, "learning_rate": 1.3720471364275257e-06, "loss": 0.797719, "num_input_tokens_seen": 109965535, "step": 5103, "time_per_iteration": 3.519164800643921 }, { "auxiliary_loss_clip": 0.01122395, "auxiliary_loss_mlp": 0.00711354, "balance_loss_clip": 1.04674125, "balance_loss_mlp": 1.00041854, "epoch": 0.6137197138219203, "flos": 14794047907200.0, "grad_norm": 1.9793482494011845, "language_loss": 0.78540462, "learning_rate": 1.3713076044198486e-06, "loss": 0.80374205, "num_input_tokens_seen": 109982345, "step": 5104, "time_per_iteration": 3.6146962642669678 }, { "auxiliary_loss_clip": 0.01136027, "auxiliary_loss_mlp": 0.01029623, "balance_loss_clip": 1.04865551, "balance_loss_mlp": 1.02274466, "epoch": 0.6138399567125594, "flos": 20084515401600.0, "grad_norm": 2.9406130473326204, "language_loss": 0.81268573, "learning_rate": 1.3705681677843086e-06, "loss": 0.83434224, "num_input_tokens_seen": 110000940, "step": 5105, "time_per_iteration": 2.6628410816192627 }, { "auxiliary_loss_clip": 0.0108529, "auxiliary_loss_mlp": 0.01001628, "balance_loss_clip": 1.0265342, "balance_loss_mlp": 1.00006628, "epoch": 0.6139601996031985, "flos": 60123838193280.0, "grad_norm": 0.7684215910232894, "language_loss": 0.60579014, "learning_rate": 1.3698288266330768e-06, "loss": 0.62665933, "num_input_tokens_seen": 110061565, "step": 5106, "time_per_iteration": 3.2526841163635254 }, { "auxiliary_loss_clip": 0.0113851, "auxiliary_loss_mlp": 0.01026939, "balance_loss_clip": 1.04996932, "balance_loss_mlp": 1.01968479, "epoch": 0.6140804424938375, "flos": 23586703361280.0, "grad_norm": 2.2125144156174557, "language_loss": 0.73221147, "learning_rate": 1.3690895810783113e-06, "loss": 0.75386596, "num_input_tokens_seen": 110080360, "step": 5107, "time_per_iteration": 2.6806750297546387 }, { "auxiliary_loss_clip": 0.01098303, "auxiliary_loss_mlp": 0.00711537, "balance_loss_clip": 1.04113555, "balance_loss_mlp": 1.00039124, "epoch": 0.6142006853844767, "flos": 21398199511680.0, "grad_norm": 3.161903964091717, "language_loss": 0.71801233, "learning_rate": 1.3683504312321543e-06, "loss": 0.73611081, "num_input_tokens_seen": 110100695, "step": 5108, "time_per_iteration": 2.87681245803833 }, { "auxiliary_loss_clip": 0.01161205, "auxiliary_loss_mlp": 0.01022261, "balance_loss_clip": 1.05030644, "balance_loss_mlp": 1.01501572, "epoch": 0.6143209282751158, "flos": 12057367622400.0, "grad_norm": 6.487311297425412, "language_loss": 0.80245978, "learning_rate": 1.3676113772067355e-06, "loss": 0.82429445, "num_input_tokens_seen": 110117750, "step": 5109, "time_per_iteration": 2.9869964122772217 }, { "auxiliary_loss_clip": 0.01115833, "auxiliary_loss_mlp": 0.01025127, "balance_loss_clip": 1.04630435, "balance_loss_mlp": 1.01800466, "epoch": 0.6144411711657548, "flos": 25082274965760.0, "grad_norm": 1.9194598566849632, "language_loss": 0.73077065, "learning_rate": 1.3668724191141671e-06, "loss": 0.75218028, "num_input_tokens_seen": 110137020, "step": 5110, "time_per_iteration": 2.76846981048584 }, { "auxiliary_loss_clip": 0.01126107, "auxiliary_loss_mlp": 0.01026184, "balance_loss_clip": 1.05476809, "balance_loss_mlp": 1.01903164, "epoch": 0.6145614140563939, "flos": 20114069316480.0, "grad_norm": 4.285318585966979, "language_loss": 0.66641867, "learning_rate": 1.3661335570665493e-06, "loss": 0.68794155, "num_input_tokens_seen": 110154930, "step": 5111, "time_per_iteration": 2.6695775985717773 }, { "auxiliary_loss_clip": 0.01146409, "auxiliary_loss_mlp": 0.01020981, "balance_loss_clip": 1.05173111, "balance_loss_mlp": 1.01407599, "epoch": 0.614681656947033, "flos": 16800376953600.0, "grad_norm": 4.724974049588409, "language_loss": 0.70311701, "learning_rate": 1.3653947911759676e-06, "loss": 0.72479093, "num_input_tokens_seen": 110172480, "step": 5112, "time_per_iteration": 2.654106378555298 }, { "auxiliary_loss_clip": 0.01104437, "auxiliary_loss_mlp": 0.0103697, "balance_loss_clip": 1.0447216, "balance_loss_mlp": 1.02868509, "epoch": 0.6148018998376721, "flos": 38801587011840.0, "grad_norm": 1.7582924254359835, "language_loss": 0.74722439, "learning_rate": 1.3646561215544904e-06, "loss": 0.76863843, "num_input_tokens_seen": 110197120, "step": 5113, "time_per_iteration": 2.8450920581817627 }, { "auxiliary_loss_clip": 0.01157779, "auxiliary_loss_mlp": 0.0102488, "balance_loss_clip": 1.05155444, "balance_loss_mlp": 1.01789773, "epoch": 0.6149221427283111, "flos": 23327032965120.0, "grad_norm": 2.293936563084688, "language_loss": 0.80004936, "learning_rate": 1.363917548314176e-06, "loss": 0.82187599, "num_input_tokens_seen": 110216385, "step": 5114, "time_per_iteration": 2.6705121994018555 }, { "auxiliary_loss_clip": 0.01164058, "auxiliary_loss_mlp": 0.01022108, "balance_loss_clip": 1.05086493, "balance_loss_mlp": 1.01448178, "epoch": 0.6150423856189503, "flos": 22379494141440.0, "grad_norm": 2.1531524039260193, "language_loss": 0.73243046, "learning_rate": 1.3631790715670626e-06, "loss": 0.75429213, "num_input_tokens_seen": 110234790, "step": 5115, "time_per_iteration": 2.6166977882385254 }, { "auxiliary_loss_clip": 0.01066717, "auxiliary_loss_mlp": 0.01023175, "balance_loss_clip": 1.04092669, "balance_loss_mlp": 1.01680326, "epoch": 0.6151626285095894, "flos": 18692078722560.0, "grad_norm": 2.242686778078653, "language_loss": 0.85623789, "learning_rate": 1.3624406914251783e-06, "loss": 0.87713677, "num_input_tokens_seen": 110251910, "step": 5116, "time_per_iteration": 2.842116117477417 }, { "auxiliary_loss_clip": 0.01159265, "auxiliary_loss_mlp": 0.01028291, "balance_loss_clip": 1.04983985, "balance_loss_mlp": 1.02122808, "epoch": 0.6152828714002284, "flos": 15851688894720.0, "grad_norm": 2.2474166801644566, "language_loss": 0.88361174, "learning_rate": 1.3617024080005335e-06, "loss": 0.9054873, "num_input_tokens_seen": 110268810, "step": 5117, "time_per_iteration": 2.589120388031006 }, { "auxiliary_loss_clip": 0.01145365, "auxiliary_loss_mlp": 0.00710982, "balance_loss_clip": 1.04795945, "balance_loss_mlp": 1.00041592, "epoch": 0.6154031142908676, "flos": 24869792062080.0, "grad_norm": 1.6913294291687067, "language_loss": 0.74037111, "learning_rate": 1.3609642214051266e-06, "loss": 0.75893456, "num_input_tokens_seen": 110293035, "step": 5118, "time_per_iteration": 2.7507424354553223 }, { "auxiliary_loss_clip": 0.01138281, "auxiliary_loss_mlp": 0.01026188, "balance_loss_clip": 1.05133319, "balance_loss_mlp": 1.01916993, "epoch": 0.6155233571815066, "flos": 19244744357760.0, "grad_norm": 2.844754618647892, "language_loss": 0.66077363, "learning_rate": 1.3602261317509385e-06, "loss": 0.68241835, "num_input_tokens_seen": 110309695, "step": 5119, "time_per_iteration": 2.6269609928131104 }, { "auxiliary_loss_clip": 0.01159667, "auxiliary_loss_mlp": 0.01023605, "balance_loss_clip": 1.05025423, "balance_loss_mlp": 1.01584446, "epoch": 0.6156436000721457, "flos": 18770077105920.0, "grad_norm": 2.835543710327759, "language_loss": 0.82310015, "learning_rate": 1.3594881391499387e-06, "loss": 0.84493279, "num_input_tokens_seen": 110328610, "step": 5120, "time_per_iteration": 2.644827365875244 }, { "auxiliary_loss_clip": 0.01145168, "auxiliary_loss_mlp": 0.01025962, "balance_loss_clip": 1.05054736, "balance_loss_mlp": 1.01877689, "epoch": 0.6157638429627849, "flos": 18041198325120.0, "grad_norm": 2.239580575213605, "language_loss": 0.79436868, "learning_rate": 1.3587502437140778e-06, "loss": 0.81607997, "num_input_tokens_seen": 110346775, "step": 5121, "time_per_iteration": 2.67988920211792 }, { "auxiliary_loss_clip": 0.01143811, "auxiliary_loss_mlp": 0.01026839, "balance_loss_clip": 1.04748881, "balance_loss_mlp": 1.01935256, "epoch": 0.6158840858534239, "flos": 25556726736000.0, "grad_norm": 2.2588744716584803, "language_loss": 0.85300541, "learning_rate": 1.3580124455552952e-06, "loss": 0.87471193, "num_input_tokens_seen": 110366140, "step": 5122, "time_per_iteration": 2.667691469192505 }, { "auxiliary_loss_clip": 0.01161273, "auxiliary_loss_mlp": 0.00711177, "balance_loss_clip": 1.05372787, "balance_loss_mlp": 1.00048733, "epoch": 0.616004328744063, "flos": 24640788902400.0, "grad_norm": 5.623878928166256, "language_loss": 0.87433887, "learning_rate": 1.3572747447855148e-06, "loss": 0.89306331, "num_input_tokens_seen": 110386550, "step": 5123, "time_per_iteration": 2.5900707244873047 }, { "auxiliary_loss_clip": 0.01176192, "auxiliary_loss_mlp": 0.01030456, "balance_loss_clip": 1.05331373, "balance_loss_mlp": 1.02306235, "epoch": 0.6161245716347021, "flos": 21689686379520.0, "grad_norm": 1.8827986991486663, "language_loss": 0.69310445, "learning_rate": 1.356537141516644e-06, "loss": 0.71517092, "num_input_tokens_seen": 110403970, "step": 5124, "time_per_iteration": 2.5683481693267822 }, { "auxiliary_loss_clip": 0.01158857, "auxiliary_loss_mlp": 0.01024022, "balance_loss_clip": 1.05230594, "balance_loss_mlp": 1.01742029, "epoch": 0.6162448145253412, "flos": 35189225061120.0, "grad_norm": 2.03211513836304, "language_loss": 0.61627555, "learning_rate": 1.3557996358605775e-06, "loss": 0.63810432, "num_input_tokens_seen": 110423890, "step": 5125, "time_per_iteration": 2.7132720947265625 }, { "auxiliary_loss_clip": 0.01157475, "auxiliary_loss_mlp": 0.0101865, "balance_loss_clip": 1.04928803, "balance_loss_mlp": 1.01196861, "epoch": 0.6163650574159802, "flos": 21615279356160.0, "grad_norm": 2.6406525929507283, "language_loss": 0.70668662, "learning_rate": 1.3550622279291941e-06, "loss": 0.72844779, "num_input_tokens_seen": 110442035, "step": 5126, "time_per_iteration": 2.62485408782959 }, { "auxiliary_loss_clip": 0.0110111, "auxiliary_loss_mlp": 0.01025108, "balance_loss_clip": 1.04188728, "balance_loss_mlp": 1.01759458, "epoch": 0.6164853003066194, "flos": 24572163968640.0, "grad_norm": 7.799362909132844, "language_loss": 0.83235729, "learning_rate": 1.354324917834358e-06, "loss": 0.85361946, "num_input_tokens_seen": 110463280, "step": 5127, "time_per_iteration": 2.7428348064422607 }, { "auxiliary_loss_clip": 0.01094092, "auxiliary_loss_mlp": 0.00710848, "balance_loss_clip": 1.04690528, "balance_loss_mlp": 1.00039744, "epoch": 0.6166055431972585, "flos": 21835986474240.0, "grad_norm": 3.0410266946041755, "language_loss": 0.76964027, "learning_rate": 1.353587705687918e-06, "loss": 0.78768969, "num_input_tokens_seen": 110481455, "step": 5128, "time_per_iteration": 2.69024920463562 }, { "auxiliary_loss_clip": 0.01148204, "auxiliary_loss_mlp": 0.01027305, "balance_loss_clip": 1.05116725, "balance_loss_mlp": 1.01958632, "epoch": 0.6167257860878975, "flos": 17785262943360.0, "grad_norm": 2.617774379130163, "language_loss": 0.71681994, "learning_rate": 1.3528505916017096e-06, "loss": 0.73857498, "num_input_tokens_seen": 110499155, "step": 5129, "time_per_iteration": 3.602738857269287 }, { "auxiliary_loss_clip": 0.0115673, "auxiliary_loss_mlp": 0.01024061, "balance_loss_clip": 1.04784894, "balance_loss_mlp": 1.0165683, "epoch": 0.6168460289785367, "flos": 23214811898880.0, "grad_norm": 2.5595587318868955, "language_loss": 0.88572824, "learning_rate": 1.3521135756875514e-06, "loss": 0.90753615, "num_input_tokens_seen": 110515470, "step": 5130, "time_per_iteration": 5.3895862102508545 }, { "auxiliary_loss_clip": 0.01085783, "auxiliary_loss_mlp": 0.0102666, "balance_loss_clip": 1.04317975, "balance_loss_mlp": 1.01996684, "epoch": 0.6169662718691757, "flos": 26213281482240.0, "grad_norm": 1.4624942583936065, "language_loss": 0.86062205, "learning_rate": 1.3513766580572496e-06, "loss": 0.88174653, "num_input_tokens_seen": 110538290, "step": 5131, "time_per_iteration": 2.755250930786133 }, { "auxiliary_loss_clip": 0.01158469, "auxiliary_loss_mlp": 0.01026927, "balance_loss_clip": 1.05078197, "balance_loss_mlp": 1.02014089, "epoch": 0.6170865147598148, "flos": 19026120228480.0, "grad_norm": 2.169947415825179, "language_loss": 0.76935565, "learning_rate": 1.3506398388225924e-06, "loss": 0.79120964, "num_input_tokens_seen": 110555610, "step": 5132, "time_per_iteration": 2.6165387630462646 }, { "auxiliary_loss_clip": 0.01174958, "auxiliary_loss_mlp": 0.01025359, "balance_loss_clip": 1.05409336, "balance_loss_mlp": 1.01870394, "epoch": 0.617206757650454, "flos": 18260361158400.0, "grad_norm": 2.197010707442274, "language_loss": 0.71712804, "learning_rate": 1.349903118095355e-06, "loss": 0.73913127, "num_input_tokens_seen": 110574745, "step": 5133, "time_per_iteration": 2.577148675918579 }, { "auxiliary_loss_clip": 0.01163589, "auxiliary_loss_mlp": 0.01026561, "balance_loss_clip": 1.05162883, "balance_loss_mlp": 1.01950932, "epoch": 0.617327000541093, "flos": 18186959715840.0, "grad_norm": 2.3523396069841795, "language_loss": 0.73722178, "learning_rate": 1.349166495987298e-06, "loss": 0.75912333, "num_input_tokens_seen": 110593310, "step": 5134, "time_per_iteration": 2.7520368099212646 }, { "auxiliary_loss_clip": 0.01076044, "auxiliary_loss_mlp": 0.01004021, "balance_loss_clip": 1.0481503, "balance_loss_mlp": 1.0028286, "epoch": 0.6174472434317321, "flos": 61833796122240.0, "grad_norm": 0.8163213140453556, "language_loss": 0.60838521, "learning_rate": 1.348429972610166e-06, "loss": 0.6291858, "num_input_tokens_seen": 110657615, "step": 5135, "time_per_iteration": 3.295480489730835 }, { "auxiliary_loss_clip": 0.01047319, "auxiliary_loss_mlp": 0.01007499, "balance_loss_clip": 1.04598451, "balance_loss_mlp": 1.00627136, "epoch": 0.6175674863223712, "flos": 71230970494080.0, "grad_norm": 0.8459691469960645, "language_loss": 0.57788342, "learning_rate": 1.3476935480756897e-06, "loss": 0.59843165, "num_input_tokens_seen": 110714365, "step": 5136, "time_per_iteration": 3.16292142868042 }, { "auxiliary_loss_clip": 0.0111626, "auxiliary_loss_mlp": 0.01025362, "balance_loss_clip": 1.04341841, "balance_loss_mlp": 1.01835501, "epoch": 0.6176877292130103, "flos": 21835447770240.0, "grad_norm": 3.2311655090422327, "language_loss": 0.75972998, "learning_rate": 1.346957222495583e-06, "loss": 0.78114617, "num_input_tokens_seen": 110732160, "step": 5137, "time_per_iteration": 2.689892292022705 }, { "auxiliary_loss_clip": 0.01145366, "auxiliary_loss_mlp": 0.00711038, "balance_loss_clip": 1.04927468, "balance_loss_mlp": 1.00045693, "epoch": 0.6178079721036493, "flos": 17741738638080.0, "grad_norm": 2.931058261280552, "language_loss": 0.70242065, "learning_rate": 1.3462209959815466e-06, "loss": 0.7209847, "num_input_tokens_seen": 110746900, "step": 5138, "time_per_iteration": 2.567564010620117 }, { "auxiliary_loss_clip": 0.01146133, "auxiliary_loss_mlp": 0.01029167, "balance_loss_clip": 1.05076706, "balance_loss_mlp": 1.02214241, "epoch": 0.6179282149942885, "flos": 22633131052800.0, "grad_norm": 2.2815585575384834, "language_loss": 0.74383259, "learning_rate": 1.345484868645265e-06, "loss": 0.76558554, "num_input_tokens_seen": 110765710, "step": 5139, "time_per_iteration": 2.687608242034912 }, { "auxiliary_loss_clip": 0.01137422, "auxiliary_loss_mlp": 0.01024364, "balance_loss_clip": 1.04938984, "balance_loss_mlp": 1.0168246, "epoch": 0.6180484578849276, "flos": 22310330503680.0, "grad_norm": 13.149544427223917, "language_loss": 0.78367209, "learning_rate": 1.3447488405984088e-06, "loss": 0.80528998, "num_input_tokens_seen": 110783970, "step": 5140, "time_per_iteration": 2.748331308364868 }, { "auxiliary_loss_clip": 0.01140559, "auxiliary_loss_mlp": 0.01024768, "balance_loss_clip": 1.04938364, "balance_loss_mlp": 1.01742172, "epoch": 0.6181687007755666, "flos": 35225458905600.0, "grad_norm": 3.3896686569768106, "language_loss": 0.69783938, "learning_rate": 1.3440129119526322e-06, "loss": 0.71949267, "num_input_tokens_seen": 110806395, "step": 5141, "time_per_iteration": 2.7618601322174072 }, { "auxiliary_loss_clip": 0.01088123, "auxiliary_loss_mlp": 0.01005847, "balance_loss_clip": 1.02910268, "balance_loss_mlp": 1.00460768, "epoch": 0.6182889436662057, "flos": 61547370094080.0, "grad_norm": 0.82823368414101, "language_loss": 0.51197612, "learning_rate": 1.3432770828195762e-06, "loss": 0.53291583, "num_input_tokens_seen": 110867380, "step": 5142, "time_per_iteration": 3.3123109340667725 }, { "auxiliary_loss_clip": 0.01115379, "auxiliary_loss_mlp": 0.01024519, "balance_loss_clip": 1.04288733, "balance_loss_mlp": 1.01716959, "epoch": 0.6184091865568448, "flos": 19609991804160.0, "grad_norm": 2.7072642499078325, "language_loss": 0.70223588, "learning_rate": 1.3425413533108635e-06, "loss": 0.72363484, "num_input_tokens_seen": 110885980, "step": 5143, "time_per_iteration": 2.699814558029175 }, { "auxiliary_loss_clip": 0.01113778, "auxiliary_loss_mlp": 0.01025672, "balance_loss_clip": 1.04970312, "balance_loss_mlp": 1.01800144, "epoch": 0.6185294294474839, "flos": 23586882929280.0, "grad_norm": 2.8657624721852355, "language_loss": 0.7062276, "learning_rate": 1.341805723538105e-06, "loss": 0.72762215, "num_input_tokens_seen": 110906085, "step": 5144, "time_per_iteration": 2.763601541519165 }, { "auxiliary_loss_clip": 0.01146204, "auxiliary_loss_mlp": 0.01025221, "balance_loss_clip": 1.04785311, "balance_loss_mlp": 1.01793098, "epoch": 0.618649672338123, "flos": 26762032535040.0, "grad_norm": 1.7399787282066044, "language_loss": 0.77645743, "learning_rate": 1.3410701936128948e-06, "loss": 0.79817176, "num_input_tokens_seen": 110928865, "step": 5145, "time_per_iteration": 2.69421648979187 }, { "auxiliary_loss_clip": 0.01158532, "auxiliary_loss_mlp": 0.01029393, "balance_loss_clip": 1.05224633, "balance_loss_mlp": 1.02231526, "epoch": 0.6187699152287621, "flos": 14456630522880.0, "grad_norm": 2.5743190760820287, "language_loss": 0.85169053, "learning_rate": 1.340334763646812e-06, "loss": 0.87356985, "num_input_tokens_seen": 110943000, "step": 5146, "time_per_iteration": 2.576808452606201 }, { "auxiliary_loss_clip": 0.01175118, "auxiliary_loss_mlp": 0.01024121, "balance_loss_clip": 1.05145812, "balance_loss_mlp": 1.01715052, "epoch": 0.6188901581194012, "flos": 20084766796800.0, "grad_norm": 2.5437953198588166, "language_loss": 0.74156952, "learning_rate": 1.3395994337514218e-06, "loss": 0.76356184, "num_input_tokens_seen": 110963170, "step": 5147, "time_per_iteration": 2.609933853149414 }, { "auxiliary_loss_clip": 0.0115192, "auxiliary_loss_mlp": 0.01021452, "balance_loss_clip": 1.04921389, "balance_loss_mlp": 1.01417136, "epoch": 0.6190104010100402, "flos": 25700728360320.0, "grad_norm": 1.6999867536546887, "language_loss": 0.78520405, "learning_rate": 1.3388642040382725e-06, "loss": 0.80693769, "num_input_tokens_seen": 110983595, "step": 5148, "time_per_iteration": 2.693486452102661 }, { "auxiliary_loss_clip": 0.01129617, "auxiliary_loss_mlp": 0.01031127, "balance_loss_clip": 1.04509628, "balance_loss_mlp": 1.02392125, "epoch": 0.6191306439006794, "flos": 30442372974720.0, "grad_norm": 1.8508758493257162, "language_loss": 0.83820641, "learning_rate": 1.3381290746188975e-06, "loss": 0.85981393, "num_input_tokens_seen": 111002965, "step": 5149, "time_per_iteration": 2.755582094192505 }, { "auxiliary_loss_clip": 0.01159495, "auxiliary_loss_mlp": 0.01028105, "balance_loss_clip": 1.05300784, "balance_loss_mlp": 1.02129841, "epoch": 0.6192508867913185, "flos": 26685793918080.0, "grad_norm": 1.9586596928380051, "language_loss": 0.6724478, "learning_rate": 1.3373940456048152e-06, "loss": 0.69432384, "num_input_tokens_seen": 111022990, "step": 5150, "time_per_iteration": 2.6482126712799072 }, { "auxiliary_loss_clip": 0.0117225, "auxiliary_loss_mlp": 0.01027918, "balance_loss_clip": 1.05195045, "balance_loss_mlp": 1.0209744, "epoch": 0.6193711296819575, "flos": 36722036090880.0, "grad_norm": 1.8628393791727391, "language_loss": 0.59010297, "learning_rate": 1.3366591171075299e-06, "loss": 0.61210465, "num_input_tokens_seen": 111046495, "step": 5151, "time_per_iteration": 2.7281110286712646 }, { "auxiliary_loss_clip": 0.01142022, "auxiliary_loss_mlp": 0.01025438, "balance_loss_clip": 1.04963553, "balance_loss_mlp": 1.01822615, "epoch": 0.6194913725725967, "flos": 25192556697600.0, "grad_norm": 2.3450770691202583, "language_loss": 0.90759116, "learning_rate": 1.335924289238529e-06, "loss": 0.9292658, "num_input_tokens_seen": 111065705, "step": 5152, "time_per_iteration": 2.673495054244995 }, { "auxiliary_loss_clip": 0.01160217, "auxiliary_loss_mlp": 0.00711177, "balance_loss_clip": 1.05386281, "balance_loss_mlp": 1.00055277, "epoch": 0.6196116154632357, "flos": 21178821196800.0, "grad_norm": 2.2239196060389994, "language_loss": 0.76910186, "learning_rate": 1.3351895621092859e-06, "loss": 0.78781581, "num_input_tokens_seen": 111086050, "step": 5153, "time_per_iteration": 2.68630051612854 }, { "auxiliary_loss_clip": 0.01051299, "auxiliary_loss_mlp": 0.01026019, "balance_loss_clip": 1.03572261, "balance_loss_mlp": 1.01932514, "epoch": 0.6197318583538748, "flos": 16253744803200.0, "grad_norm": 2.0457865450577315, "language_loss": 0.76514554, "learning_rate": 1.3344549358312567e-06, "loss": 0.78591877, "num_input_tokens_seen": 111104450, "step": 5154, "time_per_iteration": 2.8472299575805664 }, { "auxiliary_loss_clip": 0.01159303, "auxiliary_loss_mlp": 0.0102126, "balance_loss_clip": 1.04965973, "balance_loss_mlp": 1.01391697, "epoch": 0.619852101244514, "flos": 24425612478720.0, "grad_norm": 1.9945660487093166, "language_loss": 0.78404641, "learning_rate": 1.3337204105158852e-06, "loss": 0.80585206, "num_input_tokens_seen": 111123320, "step": 5155, "time_per_iteration": 4.5756611824035645 }, { "auxiliary_loss_clip": 0.0111128, "auxiliary_loss_mlp": 0.01023662, "balance_loss_clip": 1.03928947, "balance_loss_mlp": 1.01616347, "epoch": 0.619972344135153, "flos": 16727298733440.0, "grad_norm": 2.0133728069296892, "language_loss": 0.7271384, "learning_rate": 1.332985986274597e-06, "loss": 0.74848777, "num_input_tokens_seen": 111140950, "step": 5156, "time_per_iteration": 3.508118152618408 }, { "auxiliary_loss_clip": 0.01089304, "auxiliary_loss_mlp": 0.00710363, "balance_loss_clip": 1.04437935, "balance_loss_mlp": 1.00057828, "epoch": 0.6200925870257921, "flos": 12495190498560.0, "grad_norm": 6.459736344436486, "language_loss": 0.75408483, "learning_rate": 1.3322516632188047e-06, "loss": 0.77208149, "num_input_tokens_seen": 111157845, "step": 5157, "time_per_iteration": 3.848287582397461 }, { "auxiliary_loss_clip": 0.01125165, "auxiliary_loss_mlp": 0.01023201, "balance_loss_clip": 1.04767954, "balance_loss_mlp": 1.01575041, "epoch": 0.6202128299164312, "flos": 26539350168960.0, "grad_norm": 1.8177456335694409, "language_loss": 0.6680612, "learning_rate": 1.3315174414599045e-06, "loss": 0.68954492, "num_input_tokens_seen": 111179165, "step": 5158, "time_per_iteration": 2.702697277069092 }, { "auxiliary_loss_clip": 0.01151715, "auxiliary_loss_mlp": 0.01024908, "balance_loss_clip": 1.0479399, "balance_loss_mlp": 1.01761794, "epoch": 0.6203330728070703, "flos": 18770508069120.0, "grad_norm": 2.1336667572092822, "language_loss": 0.75860399, "learning_rate": 1.3307833211092768e-06, "loss": 0.78037024, "num_input_tokens_seen": 111197830, "step": 5159, "time_per_iteration": 2.6413750648498535 }, { "auxiliary_loss_clip": 0.01175746, "auxiliary_loss_mlp": 0.01025489, "balance_loss_clip": 1.0537703, "balance_loss_mlp": 1.01791, "epoch": 0.6204533156977093, "flos": 20629782835200.0, "grad_norm": 2.4411289549412967, "language_loss": 0.7500366, "learning_rate": 1.3300493022782873e-06, "loss": 0.77204895, "num_input_tokens_seen": 111218400, "step": 5160, "time_per_iteration": 2.602605104446411 }, { "auxiliary_loss_clip": 0.01099273, "auxiliary_loss_mlp": 0.00711592, "balance_loss_clip": 1.04452109, "balance_loss_mlp": 1.00050271, "epoch": 0.6205735585883485, "flos": 17348050598400.0, "grad_norm": 2.351519307255323, "language_loss": 0.7260325, "learning_rate": 1.3293153850782855e-06, "loss": 0.7441411, "num_input_tokens_seen": 111236720, "step": 5161, "time_per_iteration": 2.7470273971557617 }, { "auxiliary_loss_clip": 0.01115367, "auxiliary_loss_mlp": 0.01024807, "balance_loss_clip": 1.04457498, "balance_loss_mlp": 1.01723146, "epoch": 0.6206938014789876, "flos": 22965017742720.0, "grad_norm": 2.5155551399474407, "language_loss": 0.70843399, "learning_rate": 1.3285815696206069e-06, "loss": 0.72983575, "num_input_tokens_seen": 111258265, "step": 5162, "time_per_iteration": 2.8546230792999268 }, { "auxiliary_loss_clip": 0.01126727, "auxiliary_loss_mlp": 0.01026363, "balance_loss_clip": 1.04500186, "balance_loss_mlp": 1.01829851, "epoch": 0.6208140443696266, "flos": 23983192661760.0, "grad_norm": 2.050398908476408, "language_loss": 0.76930833, "learning_rate": 1.32784785601657e-06, "loss": 0.79083925, "num_input_tokens_seen": 111277675, "step": 5163, "time_per_iteration": 2.7276816368103027 }, { "auxiliary_loss_clip": 0.01143005, "auxiliary_loss_mlp": 0.01022713, "balance_loss_clip": 1.04522598, "balance_loss_mlp": 1.01582897, "epoch": 0.6209342872602658, "flos": 35077291303680.0, "grad_norm": 1.9157039620502068, "language_loss": 0.73670113, "learning_rate": 1.3271142443774798e-06, "loss": 0.75835824, "num_input_tokens_seen": 111299910, "step": 5164, "time_per_iteration": 2.749382734298706 }, { "auxiliary_loss_clip": 0.01138712, "auxiliary_loss_mlp": 0.01027466, "balance_loss_clip": 1.04823256, "balance_loss_mlp": 1.0200516, "epoch": 0.6210545301509048, "flos": 26979327861120.0, "grad_norm": 2.0329878836731363, "language_loss": 0.81877887, "learning_rate": 1.3263807348146228e-06, "loss": 0.84044063, "num_input_tokens_seen": 111319765, "step": 5165, "time_per_iteration": 2.7064766883850098 }, { "auxiliary_loss_clip": 0.01139306, "auxiliary_loss_mlp": 0.01032287, "balance_loss_clip": 1.04594469, "balance_loss_mlp": 1.02445543, "epoch": 0.6211747730415439, "flos": 33618240852480.0, "grad_norm": 2.882546296326234, "language_loss": 0.73548877, "learning_rate": 1.3256473274392733e-06, "loss": 0.75720477, "num_input_tokens_seen": 111341110, "step": 5166, "time_per_iteration": 2.6941282749176025 }, { "auxiliary_loss_clip": 0.01172879, "auxiliary_loss_mlp": 0.01029781, "balance_loss_clip": 1.05058837, "balance_loss_mlp": 1.02194858, "epoch": 0.6212950159321831, "flos": 34167099646080.0, "grad_norm": 1.9141019724987998, "language_loss": 0.70144224, "learning_rate": 1.3249140223626873e-06, "loss": 0.7234689, "num_input_tokens_seen": 111362730, "step": 5167, "time_per_iteration": 2.6905670166015625 }, { "auxiliary_loss_clip": 0.01154476, "auxiliary_loss_mlp": 0.01022441, "balance_loss_clip": 1.05051184, "balance_loss_mlp": 1.01506519, "epoch": 0.6214152588228221, "flos": 27965758135680.0, "grad_norm": 2.291492066145326, "language_loss": 0.75540018, "learning_rate": 1.3241808196961077e-06, "loss": 0.77716935, "num_input_tokens_seen": 111383855, "step": 5168, "time_per_iteration": 2.6508185863494873 }, { "auxiliary_loss_clip": 0.01131475, "auxiliary_loss_mlp": 0.01024948, "balance_loss_clip": 1.04697108, "balance_loss_mlp": 1.01827812, "epoch": 0.6215355017134612, "flos": 20230204965120.0, "grad_norm": 1.736277466778733, "language_loss": 0.7072922, "learning_rate": 1.3234477195507608e-06, "loss": 0.72885644, "num_input_tokens_seen": 111402685, "step": 5169, "time_per_iteration": 2.65541672706604 }, { "auxiliary_loss_clip": 0.01126144, "auxiliary_loss_mlp": 0.01022393, "balance_loss_clip": 1.04793811, "balance_loss_mlp": 1.01556504, "epoch": 0.6216557446041003, "flos": 41428129219200.0, "grad_norm": 2.654186797827593, "language_loss": 0.62679505, "learning_rate": 1.322714722037857e-06, "loss": 0.64828044, "num_input_tokens_seen": 111424130, "step": 5170, "time_per_iteration": 2.8358349800109863 }, { "auxiliary_loss_clip": 0.01134456, "auxiliary_loss_mlp": 0.0102903, "balance_loss_clip": 1.04749584, "balance_loss_mlp": 1.02164829, "epoch": 0.6217759874947394, "flos": 27928770105600.0, "grad_norm": 2.724091443112711, "language_loss": 0.7748189, "learning_rate": 1.321981827268591e-06, "loss": 0.79645371, "num_input_tokens_seen": 111444785, "step": 5171, "time_per_iteration": 2.755101203918457 }, { "auxiliary_loss_clip": 0.01143198, "auxiliary_loss_mlp": 0.01022235, "balance_loss_clip": 1.04732585, "balance_loss_mlp": 1.0151422, "epoch": 0.6218962303853784, "flos": 21765673601280.0, "grad_norm": 2.1148332314598295, "language_loss": 0.81368476, "learning_rate": 1.3212490353541426e-06, "loss": 0.83533913, "num_input_tokens_seen": 111467045, "step": 5172, "time_per_iteration": 2.658050775527954 }, { "auxiliary_loss_clip": 0.01173819, "auxiliary_loss_mlp": 0.01025904, "balance_loss_clip": 1.05089009, "balance_loss_mlp": 1.01854873, "epoch": 0.6220164732760175, "flos": 21246260981760.0, "grad_norm": 9.844480514645703, "language_loss": 0.80457711, "learning_rate": 1.3205163464056762e-06, "loss": 0.82657433, "num_input_tokens_seen": 111483650, "step": 5173, "time_per_iteration": 2.5510776042938232 }, { "auxiliary_loss_clip": 0.01155998, "auxiliary_loss_mlp": 0.0102576, "balance_loss_clip": 1.04936218, "balance_loss_mlp": 1.01863384, "epoch": 0.6221367161666567, "flos": 26136360506880.0, "grad_norm": 2.9462698596237584, "language_loss": 0.73018682, "learning_rate": 1.319783760534339e-06, "loss": 0.75200438, "num_input_tokens_seen": 111502895, "step": 5174, "time_per_iteration": 2.665482521057129 }, { "auxiliary_loss_clip": 0.01157106, "auxiliary_loss_mlp": 0.01031208, "balance_loss_clip": 1.05015707, "balance_loss_mlp": 1.02402604, "epoch": 0.6222569590572957, "flos": 16284196558080.0, "grad_norm": 2.226526679155337, "language_loss": 0.75337881, "learning_rate": 1.319051277851266e-06, "loss": 0.77526194, "num_input_tokens_seen": 111519180, "step": 5175, "time_per_iteration": 2.5868401527404785 }, { "auxiliary_loss_clip": 0.01159394, "auxiliary_loss_mlp": 0.01026413, "balance_loss_clip": 1.05032337, "balance_loss_mlp": 1.01926303, "epoch": 0.6223772019479348, "flos": 18223840005120.0, "grad_norm": 2.132289958741956, "language_loss": 0.84235895, "learning_rate": 1.3183188984675716e-06, "loss": 0.86421704, "num_input_tokens_seen": 111537545, "step": 5176, "time_per_iteration": 2.6494991779327393 }, { "auxiliary_loss_clip": 0.01143302, "auxiliary_loss_mlp": 0.01021595, "balance_loss_clip": 1.05163825, "balance_loss_mlp": 1.01436162, "epoch": 0.6224974448385739, "flos": 27489797994240.0, "grad_norm": 6.574467313823439, "language_loss": 0.71491963, "learning_rate": 1.3175866224943586e-06, "loss": 0.73656857, "num_input_tokens_seen": 111556265, "step": 5177, "time_per_iteration": 2.694368839263916 }, { "auxiliary_loss_clip": 0.01147751, "auxiliary_loss_mlp": 0.01023903, "balance_loss_clip": 1.05007148, "balance_loss_mlp": 1.01635718, "epoch": 0.622617687729213, "flos": 19791951125760.0, "grad_norm": 7.412538145279424, "language_loss": 0.73979276, "learning_rate": 1.316854450042712e-06, "loss": 0.7615093, "num_input_tokens_seen": 111574205, "step": 5178, "time_per_iteration": 2.6215755939483643 }, { "auxiliary_loss_clip": 0.0116363, "auxiliary_loss_mlp": 0.01023603, "balance_loss_clip": 1.05203271, "balance_loss_mlp": 1.01671839, "epoch": 0.622737930619852, "flos": 23038886062080.0, "grad_norm": 1.9991528135344403, "language_loss": 0.74225634, "learning_rate": 1.3161223812237024e-06, "loss": 0.76412868, "num_input_tokens_seen": 111593560, "step": 5179, "time_per_iteration": 2.561782121658325 }, { "auxiliary_loss_clip": 0.01172551, "auxiliary_loss_mlp": 0.0102432, "balance_loss_clip": 1.04932976, "balance_loss_mlp": 1.01703024, "epoch": 0.6228581735104912, "flos": 12634271959680.0, "grad_norm": 5.522643399806377, "language_loss": 0.85793406, "learning_rate": 1.3153904161483842e-06, "loss": 0.87990272, "num_input_tokens_seen": 111608860, "step": 5180, "time_per_iteration": 2.5673630237579346 }, { "auxiliary_loss_clip": 0.01121425, "auxiliary_loss_mlp": 0.01025846, "balance_loss_clip": 1.04496694, "balance_loss_mlp": 1.0181694, "epoch": 0.6229784164011303, "flos": 23802813538560.0, "grad_norm": 2.1363714005075267, "language_loss": 0.8594566, "learning_rate": 1.3146585549277953e-06, "loss": 0.88092935, "num_input_tokens_seen": 111627500, "step": 5181, "time_per_iteration": 4.647363662719727 }, { "auxiliary_loss_clip": 0.01151495, "auxiliary_loss_mlp": 0.01022247, "balance_loss_clip": 1.05120182, "balance_loss_mlp": 1.01516867, "epoch": 0.6230986592917693, "flos": 22414219614720.0, "grad_norm": 3.0102950945304117, "language_loss": 0.78748643, "learning_rate": 1.3139267976729591e-06, "loss": 0.80922389, "num_input_tokens_seen": 111647690, "step": 5182, "time_per_iteration": 4.523502826690674 }, { "auxiliary_loss_clip": 0.01161695, "auxiliary_loss_mlp": 0.01025389, "balance_loss_clip": 1.05148864, "balance_loss_mlp": 1.01810253, "epoch": 0.6232189021824085, "flos": 34528217028480.0, "grad_norm": 1.978182432214806, "language_loss": 0.71926188, "learning_rate": 1.3131951444948815e-06, "loss": 0.74113262, "num_input_tokens_seen": 111667090, "step": 5183, "time_per_iteration": 2.666102409362793 }, { "auxiliary_loss_clip": 0.01146683, "auxiliary_loss_mlp": 0.01028444, "balance_loss_clip": 1.05105889, "balance_loss_mlp": 1.02058768, "epoch": 0.6233391450730476, "flos": 22237000888320.0, "grad_norm": 2.002456287020206, "language_loss": 0.76200861, "learning_rate": 1.3124635955045546e-06, "loss": 0.78375983, "num_input_tokens_seen": 111686905, "step": 5184, "time_per_iteration": 2.6422765254974365 }, { "auxiliary_loss_clip": 0.01098565, "auxiliary_loss_mlp": 0.00711284, "balance_loss_clip": 1.04061699, "balance_loss_mlp": 1.00035322, "epoch": 0.6234593879636866, "flos": 20332693445760.0, "grad_norm": 2.0131015853016914, "language_loss": 0.84599888, "learning_rate": 1.3117321508129537e-06, "loss": 0.86409736, "num_input_tokens_seen": 111704985, "step": 5185, "time_per_iteration": 2.6903810501098633 }, { "auxiliary_loss_clip": 0.01147333, "auxiliary_loss_mlp": 0.01023164, "balance_loss_clip": 1.05173254, "balance_loss_mlp": 1.0163449, "epoch": 0.6235796308543258, "flos": 20664903358080.0, "grad_norm": 1.8499755531203188, "language_loss": 0.76240087, "learning_rate": 1.3110008105310388e-06, "loss": 0.78410578, "num_input_tokens_seen": 111724805, "step": 5186, "time_per_iteration": 2.6442627906799316 }, { "auxiliary_loss_clip": 0.01174397, "auxiliary_loss_mlp": 0.01029052, "balance_loss_clip": 1.05038917, "balance_loss_mlp": 1.02135432, "epoch": 0.6236998737449648, "flos": 26618641441920.0, "grad_norm": 1.9749164466945734, "language_loss": 0.7853865, "learning_rate": 1.3102695747697526e-06, "loss": 0.80742097, "num_input_tokens_seen": 111747675, "step": 5187, "time_per_iteration": 2.640143394470215 }, { "auxiliary_loss_clip": 0.01097945, "auxiliary_loss_mlp": 0.01034711, "balance_loss_clip": 1.04734528, "balance_loss_mlp": 1.02718878, "epoch": 0.6238201166356039, "flos": 12674599954560.0, "grad_norm": 6.3330988591422495, "language_loss": 0.90663016, "learning_rate": 1.3095384436400237e-06, "loss": 0.9279567, "num_input_tokens_seen": 111759205, "step": 5188, "time_per_iteration": 2.7026255130767822 }, { "auxiliary_loss_clip": 0.01150173, "auxiliary_loss_mlp": 0.01027602, "balance_loss_clip": 1.04938316, "balance_loss_mlp": 1.0207479, "epoch": 0.623940359526243, "flos": 10452160730880.0, "grad_norm": 1.9331266881712212, "language_loss": 0.82165116, "learning_rate": 1.3088074172527633e-06, "loss": 0.84342891, "num_input_tokens_seen": 111776335, "step": 5189, "time_per_iteration": 2.606764078140259 }, { "auxiliary_loss_clip": 0.0114317, "auxiliary_loss_mlp": 0.01023486, "balance_loss_clip": 1.04545212, "balance_loss_mlp": 1.01620793, "epoch": 0.6240606024168821, "flos": 29059525226880.0, "grad_norm": 1.9712782857100997, "language_loss": 0.71301895, "learning_rate": 1.3080764957188684e-06, "loss": 0.73468554, "num_input_tokens_seen": 111796580, "step": 5190, "time_per_iteration": 2.7128477096557617 }, { "auxiliary_loss_clip": 0.01113367, "auxiliary_loss_mlp": 0.01025965, "balance_loss_clip": 1.04281688, "balance_loss_mlp": 1.018502, "epoch": 0.6241808453075212, "flos": 22018089450240.0, "grad_norm": 2.430679772938301, "language_loss": 0.70821905, "learning_rate": 1.3073456791492192e-06, "loss": 0.72961235, "num_input_tokens_seen": 111816290, "step": 5191, "time_per_iteration": 2.6973462104797363 }, { "auxiliary_loss_clip": 0.01143882, "auxiliary_loss_mlp": 0.01024219, "balance_loss_clip": 1.04695928, "balance_loss_mlp": 1.01702476, "epoch": 0.6243010881981603, "flos": 21138708683520.0, "grad_norm": 2.6523701813390237, "language_loss": 0.78453195, "learning_rate": 1.3066149676546801e-06, "loss": 0.80621302, "num_input_tokens_seen": 111834470, "step": 5192, "time_per_iteration": 2.6596975326538086 }, { "auxiliary_loss_clip": 0.01141691, "auxiliary_loss_mlp": 0.01034186, "balance_loss_clip": 1.05123341, "balance_loss_mlp": 1.02736998, "epoch": 0.6244213310887994, "flos": 22344948236160.0, "grad_norm": 2.079166172428281, "language_loss": 0.65996116, "learning_rate": 1.3058843613460985e-06, "loss": 0.6817199, "num_input_tokens_seen": 111852410, "step": 5193, "time_per_iteration": 2.6510324478149414 }, { "auxiliary_loss_clip": 0.0113461, "auxiliary_loss_mlp": 0.01024463, "balance_loss_clip": 1.04591179, "balance_loss_mlp": 1.01715565, "epoch": 0.6245415739794384, "flos": 15231978524160.0, "grad_norm": 2.1337105476239975, "language_loss": 0.74500406, "learning_rate": 1.3051538603343075e-06, "loss": 0.76659477, "num_input_tokens_seen": 111870340, "step": 5194, "time_per_iteration": 2.6491336822509766 }, { "auxiliary_loss_clip": 0.01157356, "auxiliary_loss_mlp": 0.01026415, "balance_loss_clip": 1.05149436, "balance_loss_mlp": 1.0193224, "epoch": 0.6246618168700776, "flos": 18879891960960.0, "grad_norm": 3.0785432355642866, "language_loss": 0.6791119, "learning_rate": 1.3044234647301235e-06, "loss": 0.70094961, "num_input_tokens_seen": 111888365, "step": 5195, "time_per_iteration": 2.6517531871795654 }, { "auxiliary_loss_clip": 0.01151962, "auxiliary_loss_mlp": 0.01022725, "balance_loss_clip": 1.04750752, "balance_loss_mlp": 1.01596558, "epoch": 0.6247820597607167, "flos": 14319201087360.0, "grad_norm": 1.9626101761953996, "language_loss": 0.72761011, "learning_rate": 1.303693174644347e-06, "loss": 0.74935699, "num_input_tokens_seen": 111905840, "step": 5196, "time_per_iteration": 2.5738348960876465 }, { "auxiliary_loss_clip": 0.01135794, "auxiliary_loss_mlp": 0.01029751, "balance_loss_clip": 1.04647362, "balance_loss_mlp": 1.02257502, "epoch": 0.6249023026513557, "flos": 22637979388800.0, "grad_norm": 2.7887951885856785, "language_loss": 0.80534357, "learning_rate": 1.3029629901877625e-06, "loss": 0.82699901, "num_input_tokens_seen": 111925215, "step": 5197, "time_per_iteration": 2.689042091369629 }, { "auxiliary_loss_clip": 0.01164471, "auxiliary_loss_mlp": 0.01028347, "balance_loss_clip": 1.0516746, "balance_loss_mlp": 1.02087271, "epoch": 0.6250225455419949, "flos": 20266690204800.0, "grad_norm": 6.438274155878535, "language_loss": 0.77732331, "learning_rate": 1.3022329114711376e-06, "loss": 0.79925156, "num_input_tokens_seen": 111943925, "step": 5198, "time_per_iteration": 2.545452833175659 }, { "auxiliary_loss_clip": 0.01137051, "auxiliary_loss_mlp": 0.01023075, "balance_loss_clip": 1.04852474, "balance_loss_mlp": 1.01567173, "epoch": 0.6251427884326339, "flos": 23437853400960.0, "grad_norm": 2.432897500280153, "language_loss": 0.697083, "learning_rate": 1.3015029386052256e-06, "loss": 0.71868426, "num_input_tokens_seen": 111964095, "step": 5199, "time_per_iteration": 2.7651755809783936 }, { "auxiliary_loss_clip": 0.01135139, "auxiliary_loss_mlp": 0.01028254, "balance_loss_clip": 1.04689312, "balance_loss_mlp": 1.0209074, "epoch": 0.625263031323273, "flos": 31723055464320.0, "grad_norm": 2.084089614639474, "language_loss": 0.73108947, "learning_rate": 1.3007730717007622e-06, "loss": 0.75272334, "num_input_tokens_seen": 111984910, "step": 5200, "time_per_iteration": 2.755507707595825 }, { "auxiliary_loss_clip": 0.01175534, "auxiliary_loss_mlp": 0.01032255, "balance_loss_clip": 1.05110347, "balance_loss_mlp": 1.02537346, "epoch": 0.6253832742139122, "flos": 24134341092480.0, "grad_norm": 1.8420533731384854, "language_loss": 0.75559568, "learning_rate": 1.3000433108684676e-06, "loss": 0.77767354, "num_input_tokens_seen": 112005410, "step": 5201, "time_per_iteration": 2.5995945930480957 }, { "auxiliary_loss_clip": 0.01153488, "auxiliary_loss_mlp": 0.01020809, "balance_loss_clip": 1.05007505, "balance_loss_mlp": 1.01408005, "epoch": 0.6255035171045512, "flos": 27668812400640.0, "grad_norm": 3.0882918701080233, "language_loss": 0.81163019, "learning_rate": 1.2993136562190467e-06, "loss": 0.83337319, "num_input_tokens_seen": 112024530, "step": 5202, "time_per_iteration": 2.6415491104125977 }, { "auxiliary_loss_clip": 0.01143871, "auxiliary_loss_mlp": 0.01022084, "balance_loss_clip": 1.04737139, "balance_loss_mlp": 1.01537216, "epoch": 0.6256237599951903, "flos": 20227798753920.0, "grad_norm": 2.1274613062115897, "language_loss": 0.70352876, "learning_rate": 1.2985841078631871e-06, "loss": 0.72518831, "num_input_tokens_seen": 112043850, "step": 5203, "time_per_iteration": 2.6439716815948486 }, { "auxiliary_loss_clip": 0.0109334, "auxiliary_loss_mlp": 0.0103008, "balance_loss_clip": 1.04116929, "balance_loss_mlp": 1.02249241, "epoch": 0.6257440028858293, "flos": 24170574936960.0, "grad_norm": 2.001652433361633, "language_loss": 0.78194845, "learning_rate": 1.2978546659115608e-06, "loss": 0.80318266, "num_input_tokens_seen": 112061930, "step": 5204, "time_per_iteration": 2.8947393894195557 }, { "auxiliary_loss_clip": 0.01147198, "auxiliary_loss_mlp": 0.01027053, "balance_loss_clip": 1.04951382, "balance_loss_mlp": 1.02019882, "epoch": 0.6258642457764685, "flos": 15851940289920.0, "grad_norm": 2.356913229190759, "language_loss": 0.85789198, "learning_rate": 1.2971253304748228e-06, "loss": 0.8796345, "num_input_tokens_seen": 112079645, "step": 5205, "time_per_iteration": 2.640139579772949 }, { "auxiliary_loss_clip": 0.01161804, "auxiliary_loss_mlp": 0.01028435, "balance_loss_clip": 1.05305791, "balance_loss_mlp": 1.02069199, "epoch": 0.6259844886671075, "flos": 11911354836480.0, "grad_norm": 2.4518293954341694, "language_loss": 0.74559546, "learning_rate": 1.296396101663614e-06, "loss": 0.7674979, "num_input_tokens_seen": 112096205, "step": 5206, "time_per_iteration": 2.5908472537994385 }, { "auxiliary_loss_clip": 0.0116132, "auxiliary_loss_mlp": 0.01022548, "balance_loss_clip": 1.0516777, "balance_loss_mlp": 1.01564002, "epoch": 0.6261047315577466, "flos": 15887958652800.0, "grad_norm": 2.2408367921044707, "language_loss": 0.84550405, "learning_rate": 1.2956669795885565e-06, "loss": 0.86734271, "num_input_tokens_seen": 112112835, "step": 5207, "time_per_iteration": 4.504064083099365 }, { "auxiliary_loss_clip": 0.01120538, "auxiliary_loss_mlp": 0.01033129, "balance_loss_clip": 1.04759419, "balance_loss_mlp": 1.02537417, "epoch": 0.6262249744483858, "flos": 31248926916480.0, "grad_norm": 6.046215104718949, "language_loss": 0.68103725, "learning_rate": 1.294937964360259e-06, "loss": 0.70257396, "num_input_tokens_seen": 112133105, "step": 5208, "time_per_iteration": 3.680474281311035 }, { "auxiliary_loss_clip": 0.01148777, "auxiliary_loss_mlp": 0.01026343, "balance_loss_clip": 1.04885197, "balance_loss_mlp": 1.01837373, "epoch": 0.6263452173390248, "flos": 27198598435200.0, "grad_norm": 2.666709748139202, "language_loss": 0.70994425, "learning_rate": 1.2942090560893108e-06, "loss": 0.73169547, "num_input_tokens_seen": 112152510, "step": 5209, "time_per_iteration": 2.753117322921753 }, { "auxiliary_loss_clip": 0.01172848, "auxiliary_loss_mlp": 0.0102412, "balance_loss_clip": 1.05151653, "balance_loss_mlp": 1.01800752, "epoch": 0.6264654602296639, "flos": 37342069683840.0, "grad_norm": 2.056957525921943, "language_loss": 0.60532838, "learning_rate": 1.2934802548862882e-06, "loss": 0.62729806, "num_input_tokens_seen": 112175295, "step": 5210, "time_per_iteration": 2.7256083488464355 }, { "auxiliary_loss_clip": 0.01138558, "auxiliary_loss_mlp": 0.01023359, "balance_loss_clip": 1.0464375, "balance_loss_mlp": 1.01586127, "epoch": 0.626585703120303, "flos": 14756952136320.0, "grad_norm": 1.935381664461386, "language_loss": 0.82456982, "learning_rate": 1.292751560861749e-06, "loss": 0.84618896, "num_input_tokens_seen": 112190200, "step": 5211, "time_per_iteration": 2.593309164047241 }, { "auxiliary_loss_clip": 0.01176436, "auxiliary_loss_mlp": 0.01025897, "balance_loss_clip": 1.05127192, "balance_loss_mlp": 1.01885808, "epoch": 0.6267059460109421, "flos": 22347318533760.0, "grad_norm": 1.8852657234601373, "language_loss": 0.79843783, "learning_rate": 1.2920229741262354e-06, "loss": 0.82046115, "num_input_tokens_seen": 112208205, "step": 5212, "time_per_iteration": 2.543123722076416 }, { "auxiliary_loss_clip": 0.01141036, "auxiliary_loss_mlp": 0.01026049, "balance_loss_clip": 1.0474925, "balance_loss_mlp": 1.01917052, "epoch": 0.6268261889015811, "flos": 17748813617280.0, "grad_norm": 2.395633888510934, "language_loss": 0.75969088, "learning_rate": 1.2912944947902739e-06, "loss": 0.7813617, "num_input_tokens_seen": 112224690, "step": 5213, "time_per_iteration": 2.654778242111206 }, { "auxiliary_loss_clip": 0.01148891, "auxiliary_loss_mlp": 0.0102623, "balance_loss_clip": 1.0495975, "balance_loss_mlp": 1.01856494, "epoch": 0.6269464317922203, "flos": 32846484211200.0, "grad_norm": 7.626898260309377, "language_loss": 0.71500409, "learning_rate": 1.2905661229643742e-06, "loss": 0.73675525, "num_input_tokens_seen": 112244450, "step": 5214, "time_per_iteration": 2.6988961696624756 }, { "auxiliary_loss_clip": 0.01173033, "auxiliary_loss_mlp": 0.01026151, "balance_loss_clip": 1.04946744, "balance_loss_mlp": 1.01865268, "epoch": 0.6270666746828594, "flos": 17929192740480.0, "grad_norm": 2.454782181850528, "language_loss": 0.84243804, "learning_rate": 1.2898378587590299e-06, "loss": 0.86442983, "num_input_tokens_seen": 112261050, "step": 5215, "time_per_iteration": 2.5511200428009033 }, { "auxiliary_loss_clip": 0.01152934, "auxiliary_loss_mlp": 0.01021642, "balance_loss_clip": 1.04889953, "balance_loss_mlp": 1.01443887, "epoch": 0.6271869175734984, "flos": 17457326749440.0, "grad_norm": 2.268468311093536, "language_loss": 0.87690717, "learning_rate": 1.2891097022847173e-06, "loss": 0.89865291, "num_input_tokens_seen": 112278395, "step": 5216, "time_per_iteration": 2.570838689804077 }, { "auxiliary_loss_clip": 0.01142402, "auxiliary_loss_mlp": 0.01026824, "balance_loss_clip": 1.04892349, "balance_loss_mlp": 1.01922989, "epoch": 0.6273071604641376, "flos": 26868615166080.0, "grad_norm": 3.779603737815156, "language_loss": 0.66981602, "learning_rate": 1.2883816536518978e-06, "loss": 0.69150829, "num_input_tokens_seen": 112299535, "step": 5217, "time_per_iteration": 2.6732466220855713 }, { "auxiliary_loss_clip": 0.01154497, "auxiliary_loss_mlp": 0.01024897, "balance_loss_clip": 1.049196, "balance_loss_mlp": 1.01782179, "epoch": 0.6274274033547766, "flos": 26062384446720.0, "grad_norm": 2.1718929632625614, "language_loss": 0.82016313, "learning_rate": 1.2876537129710155e-06, "loss": 0.84195709, "num_input_tokens_seen": 112317265, "step": 5218, "time_per_iteration": 2.657374620437622 }, { "auxiliary_loss_clip": 0.01141517, "auxiliary_loss_mlp": 0.01031375, "balance_loss_clip": 1.05226088, "balance_loss_mlp": 1.02360868, "epoch": 0.6275476462454157, "flos": 20266259241600.0, "grad_norm": 2.457436137944877, "language_loss": 0.7544089, "learning_rate": 1.286925880352499e-06, "loss": 0.77613783, "num_input_tokens_seen": 112336125, "step": 5219, "time_per_iteration": 2.65523099899292 }, { "auxiliary_loss_clip": 0.01140105, "auxiliary_loss_mlp": 0.01021293, "balance_loss_clip": 1.04760551, "balance_loss_mlp": 1.01372325, "epoch": 0.6276678891360549, "flos": 26320402817280.0, "grad_norm": 2.5548318447033753, "language_loss": 0.71252048, "learning_rate": 1.2861981559067592e-06, "loss": 0.73413444, "num_input_tokens_seen": 112356730, "step": 5220, "time_per_iteration": 2.660433769226074 }, { "auxiliary_loss_clip": 0.01103253, "auxiliary_loss_mlp": 0.0102209, "balance_loss_clip": 1.04357886, "balance_loss_mlp": 1.01526499, "epoch": 0.6277881320266939, "flos": 13912512324480.0, "grad_norm": 2.1313982700855836, "language_loss": 0.80183554, "learning_rate": 1.2854705397441917e-06, "loss": 0.82308888, "num_input_tokens_seen": 112372270, "step": 5221, "time_per_iteration": 2.690581798553467 }, { "auxiliary_loss_clip": 0.01120467, "auxiliary_loss_mlp": 0.01022553, "balance_loss_clip": 1.04417694, "balance_loss_mlp": 1.01542377, "epoch": 0.627908374917333, "flos": 27048922462080.0, "grad_norm": 3.460427876047926, "language_loss": 0.77595377, "learning_rate": 1.2847430319751747e-06, "loss": 0.7973839, "num_input_tokens_seen": 112390365, "step": 5222, "time_per_iteration": 2.730854034423828 }, { "auxiliary_loss_clip": 0.01152564, "auxiliary_loss_mlp": 0.01025318, "balance_loss_clip": 1.04999924, "balance_loss_mlp": 1.01852322, "epoch": 0.6280286178079721, "flos": 23769201386880.0, "grad_norm": 7.9366230296969, "language_loss": 0.67199922, "learning_rate": 1.2840156327100712e-06, "loss": 0.69377804, "num_input_tokens_seen": 112407490, "step": 5223, "time_per_iteration": 2.6147854328155518 }, { "auxiliary_loss_clip": 0.01172647, "auxiliary_loss_mlp": 0.01024989, "balance_loss_clip": 1.05152535, "balance_loss_mlp": 1.01804507, "epoch": 0.6281488606986112, "flos": 26359150613760.0, "grad_norm": 6.2496095810691585, "language_loss": 0.72674859, "learning_rate": 1.2832883420592272e-06, "loss": 0.748725, "num_input_tokens_seen": 112426385, "step": 5224, "time_per_iteration": 2.651970148086548 }, { "auxiliary_loss_clip": 0.01138186, "auxiliary_loss_mlp": 0.01025372, "balance_loss_clip": 1.04866385, "balance_loss_mlp": 1.01795125, "epoch": 0.6282691035892503, "flos": 36137194848000.0, "grad_norm": 2.219651619651248, "language_loss": 0.64690787, "learning_rate": 1.282561160132972e-06, "loss": 0.66854346, "num_input_tokens_seen": 112446905, "step": 5225, "time_per_iteration": 2.762549877166748 }, { "auxiliary_loss_clip": 0.0114775, "auxiliary_loss_mlp": 0.01028901, "balance_loss_clip": 1.04688978, "balance_loss_mlp": 1.02143288, "epoch": 0.6283893464798894, "flos": 26537231266560.0, "grad_norm": 2.0971928094834684, "language_loss": 0.80864531, "learning_rate": 1.2818340870416186e-06, "loss": 0.83041179, "num_input_tokens_seen": 112468040, "step": 5226, "time_per_iteration": 2.7576141357421875 }, { "auxiliary_loss_clip": 0.01133928, "auxiliary_loss_mlp": 0.01026858, "balance_loss_clip": 1.04634285, "balance_loss_mlp": 1.01948476, "epoch": 0.6285095893705285, "flos": 22237216369920.0, "grad_norm": 2.673120486529736, "language_loss": 0.75976241, "learning_rate": 1.2811071228954626e-06, "loss": 0.78137028, "num_input_tokens_seen": 112486675, "step": 5227, "time_per_iteration": 2.6821205615997314 }, { "auxiliary_loss_clip": 0.01141983, "auxiliary_loss_mlp": 0.01032741, "balance_loss_clip": 1.04948783, "balance_loss_mlp": 1.02549911, "epoch": 0.6286298322611675, "flos": 26542259170560.0, "grad_norm": 2.359818886519253, "language_loss": 0.81164372, "learning_rate": 1.2803802678047846e-06, "loss": 0.83339095, "num_input_tokens_seen": 112506825, "step": 5228, "time_per_iteration": 2.6736841201782227 }, { "auxiliary_loss_clip": 0.01144673, "auxiliary_loss_mlp": 0.01028272, "balance_loss_clip": 1.05002022, "balance_loss_mlp": 1.02078521, "epoch": 0.6287500751518067, "flos": 21795227516160.0, "grad_norm": 1.9848435982046977, "language_loss": 0.74211764, "learning_rate": 1.279653521879848e-06, "loss": 0.76384711, "num_input_tokens_seen": 112526890, "step": 5229, "time_per_iteration": 2.629408836364746 }, { "auxiliary_loss_clip": 0.01069287, "auxiliary_loss_mlp": 0.01025565, "balance_loss_clip": 1.03950441, "balance_loss_mlp": 1.01906824, "epoch": 0.6288703180424458, "flos": 20009605587840.0, "grad_norm": 3.6861970729265487, "language_loss": 0.83653867, "learning_rate": 1.2789268852308997e-06, "loss": 0.85748714, "num_input_tokens_seen": 112542100, "step": 5230, "time_per_iteration": 3.0225236415863037 }, { "auxiliary_loss_clip": 0.01150486, "auxiliary_loss_mlp": 0.01027611, "balance_loss_clip": 1.04889345, "balance_loss_mlp": 1.01980031, "epoch": 0.6289905609330848, "flos": 22124923476480.0, "grad_norm": 2.078408109063746, "language_loss": 0.70977813, "learning_rate": 1.2782003579681688e-06, "loss": 0.7315591, "num_input_tokens_seen": 112561630, "step": 5231, "time_per_iteration": 2.789306640625 }, { "auxiliary_loss_clip": 0.01175262, "auxiliary_loss_mlp": 0.01027669, "balance_loss_clip": 1.05234742, "balance_loss_mlp": 1.02004528, "epoch": 0.629110803823724, "flos": 25518481729920.0, "grad_norm": 2.1754110885889517, "language_loss": 0.74331427, "learning_rate": 1.2774739402018701e-06, "loss": 0.76534355, "num_input_tokens_seen": 112582465, "step": 5232, "time_per_iteration": 2.6182479858398438 }, { "auxiliary_loss_clip": 0.01158355, "auxiliary_loss_mlp": 0.01028634, "balance_loss_clip": 1.05190492, "balance_loss_mlp": 1.02101922, "epoch": 0.629231046714363, "flos": 20886616056960.0, "grad_norm": 2.1895912830570943, "language_loss": 0.7328307, "learning_rate": 1.2767476320422002e-06, "loss": 0.75470054, "num_input_tokens_seen": 112602390, "step": 5233, "time_per_iteration": 4.7417192459106445 }, { "auxiliary_loss_clip": 0.01057086, "auxiliary_loss_mlp": 0.01008774, "balance_loss_clip": 1.03207207, "balance_loss_mlp": 1.00731397, "epoch": 0.6293512896050021, "flos": 65050027908480.0, "grad_norm": 0.6792109927549428, "language_loss": 0.5718438, "learning_rate": 1.2760214335993392e-06, "loss": 0.59250242, "num_input_tokens_seen": 112669035, "step": 5234, "time_per_iteration": 4.245827674865723 }, { "auxiliary_loss_clip": 0.01148147, "auxiliary_loss_mlp": 0.01026736, "balance_loss_clip": 1.04605699, "balance_loss_mlp": 1.02013803, "epoch": 0.6294715324956413, "flos": 34677857088000.0, "grad_norm": 2.2313725008659464, "language_loss": 0.58701944, "learning_rate": 1.2752953449834514e-06, "loss": 0.60876834, "num_input_tokens_seen": 112691485, "step": 5235, "time_per_iteration": 2.691108465194702 }, { "auxiliary_loss_clip": 0.01174024, "auxiliary_loss_mlp": 0.01025201, "balance_loss_clip": 1.05259681, "balance_loss_mlp": 1.01809561, "epoch": 0.6295917753862803, "flos": 22784207656320.0, "grad_norm": 1.689729336406013, "language_loss": 0.80393744, "learning_rate": 1.2745693663046836e-06, "loss": 0.82592964, "num_input_tokens_seen": 112710555, "step": 5236, "time_per_iteration": 2.607604742050171 }, { "auxiliary_loss_clip": 0.01153071, "auxiliary_loss_mlp": 0.01022528, "balance_loss_clip": 1.04995084, "balance_loss_mlp": 1.01602554, "epoch": 0.6297120182769194, "flos": 20850454039680.0, "grad_norm": 3.5750024812825734, "language_loss": 0.81160927, "learning_rate": 1.2738434976731662e-06, "loss": 0.83336526, "num_input_tokens_seen": 112728740, "step": 5237, "time_per_iteration": 2.5715765953063965 }, { "auxiliary_loss_clip": 0.01141716, "auxiliary_loss_mlp": 0.0103108, "balance_loss_clip": 1.04919887, "balance_loss_mlp": 1.02383161, "epoch": 0.6298322611675584, "flos": 19497662997120.0, "grad_norm": 2.641253593376257, "language_loss": 0.75205088, "learning_rate": 1.2731177391990125e-06, "loss": 0.77377886, "num_input_tokens_seen": 112748665, "step": 5238, "time_per_iteration": 2.6502158641815186 }, { "auxiliary_loss_clip": 0.01141231, "auxiliary_loss_mlp": 0.01027914, "balance_loss_clip": 1.04716396, "balance_loss_mlp": 1.02085066, "epoch": 0.6299525040581976, "flos": 12604466649600.0, "grad_norm": 2.339497351473081, "language_loss": 0.82215381, "learning_rate": 1.2723920909923203e-06, "loss": 0.84384525, "num_input_tokens_seen": 112764410, "step": 5239, "time_per_iteration": 2.590118646621704 }, { "auxiliary_loss_clip": 0.01090121, "auxiliary_loss_mlp": 0.01000494, "balance_loss_clip": 1.03209209, "balance_loss_mlp": 0.99935514, "epoch": 0.6300727469488366, "flos": 57725685636480.0, "grad_norm": 0.8446532782337528, "language_loss": 0.60425103, "learning_rate": 1.2716665531631688e-06, "loss": 0.62515724, "num_input_tokens_seen": 112818695, "step": 5240, "time_per_iteration": 3.104182243347168 }, { "auxiliary_loss_clip": 0.0116185, "auxiliary_loss_mlp": 0.01023661, "balance_loss_clip": 1.04838204, "balance_loss_mlp": 1.0162667, "epoch": 0.6301929898394757, "flos": 22527302607360.0, "grad_norm": 2.0554858561311393, "language_loss": 0.77379453, "learning_rate": 1.270941125821623e-06, "loss": 0.79564959, "num_input_tokens_seen": 112839120, "step": 5241, "time_per_iteration": 2.637927770614624 }, { "auxiliary_loss_clip": 0.01152067, "auxiliary_loss_mlp": 0.01024582, "balance_loss_clip": 1.04698265, "balance_loss_mlp": 1.01710165, "epoch": 0.6303132327301149, "flos": 28293550675200.0, "grad_norm": 1.985818665681911, "language_loss": 0.75358194, "learning_rate": 1.2702158090777278e-06, "loss": 0.77534848, "num_input_tokens_seen": 112860210, "step": 5242, "time_per_iteration": 2.6235971450805664 }, { "auxiliary_loss_clip": 0.01122374, "auxiliary_loss_mlp": 0.01034378, "balance_loss_clip": 1.04653263, "balance_loss_mlp": 1.02746701, "epoch": 0.6304334756207539, "flos": 25264521596160.0, "grad_norm": 2.00232916762426, "language_loss": 0.7476474, "learning_rate": 1.2694906030415148e-06, "loss": 0.76921493, "num_input_tokens_seen": 112877955, "step": 5243, "time_per_iteration": 2.7498202323913574 }, { "auxiliary_loss_clip": 0.01149666, "auxiliary_loss_mlp": 0.01025553, "balance_loss_clip": 1.04829574, "balance_loss_mlp": 1.01792383, "epoch": 0.630553718511393, "flos": 18033548728320.0, "grad_norm": 2.8574677067551892, "language_loss": 0.82091427, "learning_rate": 1.2687655078229958e-06, "loss": 0.84266651, "num_input_tokens_seen": 112892285, "step": 5244, "time_per_iteration": 2.601461410522461 }, { "auxiliary_loss_clip": 0.01140841, "auxiliary_loss_mlp": 0.01027991, "balance_loss_clip": 1.04967058, "balance_loss_mlp": 1.0211904, "epoch": 0.6306739614020321, "flos": 27304103658240.0, "grad_norm": 2.90938194418958, "language_loss": 0.69402689, "learning_rate": 1.2680405235321678e-06, "loss": 0.71571523, "num_input_tokens_seen": 112913620, "step": 5245, "time_per_iteration": 2.688762664794922 }, { "auxiliary_loss_clip": 0.01145266, "auxiliary_loss_mlp": 0.00710941, "balance_loss_clip": 1.05167603, "balance_loss_mlp": 1.00044036, "epoch": 0.6307942042926712, "flos": 15341434243200.0, "grad_norm": 2.186619565418378, "language_loss": 0.78740239, "learning_rate": 1.267315650279011e-06, "loss": 0.80596447, "num_input_tokens_seen": 112932090, "step": 5246, "time_per_iteration": 2.614671230316162 }, { "auxiliary_loss_clip": 0.01120414, "auxiliary_loss_mlp": 0.01025511, "balance_loss_clip": 1.047212, "balance_loss_mlp": 1.0184803, "epoch": 0.6309144471833102, "flos": 19606400444160.0, "grad_norm": 1.7970646265858288, "language_loss": 0.73976731, "learning_rate": 1.2665908881734874e-06, "loss": 0.76122653, "num_input_tokens_seen": 112950925, "step": 5247, "time_per_iteration": 2.6591567993164062 }, { "auxiliary_loss_clip": 0.01158914, "auxiliary_loss_mlp": 0.01025703, "balance_loss_clip": 1.05106592, "balance_loss_mlp": 1.01856565, "epoch": 0.6310346900739494, "flos": 17493345112320.0, "grad_norm": 2.3440148813274675, "language_loss": 0.85188824, "learning_rate": 1.2658662373255432e-06, "loss": 0.87373441, "num_input_tokens_seen": 112969315, "step": 5248, "time_per_iteration": 2.5938761234283447 }, { "auxiliary_loss_clip": 0.0106204, "auxiliary_loss_mlp": 0.01002678, "balance_loss_clip": 1.03027558, "balance_loss_mlp": 1.00140798, "epoch": 0.6311549329645885, "flos": 55070164131840.0, "grad_norm": 0.7088105004463293, "language_loss": 0.52255726, "learning_rate": 1.2651416978451063e-06, "loss": 0.54320449, "num_input_tokens_seen": 113034700, "step": 5249, "time_per_iteration": 3.319967746734619 }, { "auxiliary_loss_clip": 0.01178154, "auxiliary_loss_mlp": 0.01029729, "balance_loss_clip": 1.05367827, "balance_loss_mlp": 1.021927, "epoch": 0.6312751758552275, "flos": 41902545075840.0, "grad_norm": 1.9039353943877186, "language_loss": 0.65354186, "learning_rate": 1.2644172698420903e-06, "loss": 0.67562068, "num_input_tokens_seen": 113056805, "step": 5250, "time_per_iteration": 2.7544147968292236 }, { "auxiliary_loss_clip": 0.0112875, "auxiliary_loss_mlp": 0.0102949, "balance_loss_clip": 1.04948342, "balance_loss_mlp": 1.02258229, "epoch": 0.6313954187458667, "flos": 19646800266240.0, "grad_norm": 1.7901256533191394, "language_loss": 0.84876794, "learning_rate": 1.2636929534263892e-06, "loss": 0.87035036, "num_input_tokens_seen": 113075790, "step": 5251, "time_per_iteration": 2.665411949157715 }, { "auxiliary_loss_clip": 0.01125107, "auxiliary_loss_mlp": 0.01019072, "balance_loss_clip": 1.04240048, "balance_loss_mlp": 1.01230717, "epoch": 0.6315156616365057, "flos": 22894273906560.0, "grad_norm": 1.8328976685455383, "language_loss": 0.77804917, "learning_rate": 1.2629687487078821e-06, "loss": 0.79949093, "num_input_tokens_seen": 113094600, "step": 5252, "time_per_iteration": 2.681159496307373 }, { "auxiliary_loss_clip": 0.01162728, "auxiliary_loss_mlp": 0.01027966, "balance_loss_clip": 1.05138636, "balance_loss_mlp": 1.02052176, "epoch": 0.6316359045271448, "flos": 23726251699200.0, "grad_norm": 5.271153964910319, "language_loss": 0.76614058, "learning_rate": 1.2622446557964293e-06, "loss": 0.78804755, "num_input_tokens_seen": 113112605, "step": 5253, "time_per_iteration": 2.608781576156616 }, { "auxiliary_loss_clip": 0.01139496, "auxiliary_loss_mlp": 0.01028463, "balance_loss_clip": 1.04417133, "balance_loss_mlp": 1.02127445, "epoch": 0.631756147417784, "flos": 33108417164160.0, "grad_norm": 1.7118116798993437, "language_loss": 0.71344936, "learning_rate": 1.261520674801876e-06, "loss": 0.735129, "num_input_tokens_seen": 113133200, "step": 5254, "time_per_iteration": 2.763051748275757 }, { "auxiliary_loss_clip": 0.011427, "auxiliary_loss_mlp": 0.01020851, "balance_loss_clip": 1.05154169, "balance_loss_mlp": 1.01371074, "epoch": 0.631876390308423, "flos": 31248424126080.0, "grad_norm": 2.259202118539418, "language_loss": 0.72543722, "learning_rate": 1.2607968058340488e-06, "loss": 0.7470727, "num_input_tokens_seen": 113152895, "step": 5255, "time_per_iteration": 2.7495439052581787 }, { "auxiliary_loss_clip": 0.01138966, "auxiliary_loss_mlp": 0.01026764, "balance_loss_clip": 1.04771948, "balance_loss_mlp": 1.01996338, "epoch": 0.6319966331990621, "flos": 24681152810880.0, "grad_norm": 1.7465311145487925, "language_loss": 0.73206228, "learning_rate": 1.2600730490027583e-06, "loss": 0.75371957, "num_input_tokens_seen": 113173135, "step": 5256, "time_per_iteration": 2.7516725063323975 }, { "auxiliary_loss_clip": 0.0112338, "auxiliary_loss_mlp": 0.0102356, "balance_loss_clip": 1.04597342, "balance_loss_mlp": 1.01674163, "epoch": 0.6321168760897012, "flos": 17491764913920.0, "grad_norm": 1.822639454589975, "language_loss": 0.80630684, "learning_rate": 1.2593494044177984e-06, "loss": 0.82777619, "num_input_tokens_seen": 113191440, "step": 5257, "time_per_iteration": 2.6186540126800537 }, { "auxiliary_loss_clip": 0.01174201, "auxiliary_loss_mlp": 0.0102518, "balance_loss_clip": 1.04901516, "balance_loss_mlp": 1.01690662, "epoch": 0.6322371189803403, "flos": 18295373940480.0, "grad_norm": 2.3301271183249854, "language_loss": 0.80317342, "learning_rate": 1.2586258721889448e-06, "loss": 0.82516724, "num_input_tokens_seen": 113208790, "step": 5258, "time_per_iteration": 2.579472780227661 }, { "auxiliary_loss_clip": 0.01101217, "auxiliary_loss_mlp": 0.01024658, "balance_loss_clip": 1.04467762, "balance_loss_mlp": 1.01768756, "epoch": 0.6323573618709794, "flos": 20157270399360.0, "grad_norm": 2.4784497098916036, "language_loss": 0.81621534, "learning_rate": 1.2579024524259573e-06, "loss": 0.83747411, "num_input_tokens_seen": 113225050, "step": 5259, "time_per_iteration": 4.4931018352508545 }, { "auxiliary_loss_clip": 0.01137284, "auxiliary_loss_mlp": 0.01028166, "balance_loss_clip": 1.0448457, "balance_loss_mlp": 1.02092385, "epoch": 0.6324776047616185, "flos": 20042391726720.0, "grad_norm": 2.7795379969791076, "language_loss": 0.91197538, "learning_rate": 1.2571791452385768e-06, "loss": 0.93362987, "num_input_tokens_seen": 113242315, "step": 5260, "time_per_iteration": 3.4448013305664062 }, { "auxiliary_loss_clip": 0.01145345, "auxiliary_loss_mlp": 0.01025492, "balance_loss_clip": 1.05180144, "balance_loss_mlp": 1.01820803, "epoch": 0.6325978476522576, "flos": 30848235724800.0, "grad_norm": 2.1604573418825996, "language_loss": 0.77213776, "learning_rate": 1.2564559507365301e-06, "loss": 0.79384613, "num_input_tokens_seen": 113264720, "step": 5261, "time_per_iteration": 3.6220247745513916 }, { "auxiliary_loss_clip": 0.01144006, "auxiliary_loss_mlp": 0.01027586, "balance_loss_clip": 1.04836547, "balance_loss_mlp": 1.02009058, "epoch": 0.6327180905428966, "flos": 24535104111360.0, "grad_norm": 2.0350454664318556, "language_loss": 0.79096884, "learning_rate": 1.2557328690295244e-06, "loss": 0.81268477, "num_input_tokens_seen": 113282910, "step": 5262, "time_per_iteration": 2.6370038986206055 }, { "auxiliary_loss_clip": 0.01132682, "auxiliary_loss_mlp": 0.01024294, "balance_loss_clip": 1.05072927, "balance_loss_mlp": 1.0173856, "epoch": 0.6328383334335358, "flos": 21575274583680.0, "grad_norm": 1.76604188089606, "language_loss": 0.7594586, "learning_rate": 1.255009900227251e-06, "loss": 0.78102839, "num_input_tokens_seen": 113301935, "step": 5263, "time_per_iteration": 2.629192590713501 }, { "auxiliary_loss_clip": 0.01171778, "auxiliary_loss_mlp": 0.01023646, "balance_loss_clip": 1.05249584, "balance_loss_mlp": 1.01683354, "epoch": 0.6329585763241748, "flos": 22929861306240.0, "grad_norm": 1.9820989287962691, "language_loss": 0.79599607, "learning_rate": 1.254287044439383e-06, "loss": 0.81795031, "num_input_tokens_seen": 113321540, "step": 5264, "time_per_iteration": 2.6083147525787354 }, { "auxiliary_loss_clip": 0.01087847, "auxiliary_loss_mlp": 0.01003123, "balance_loss_clip": 1.02971005, "balance_loss_mlp": 1.00193715, "epoch": 0.6330788192148139, "flos": 70936897847040.0, "grad_norm": 0.7767615398412074, "language_loss": 0.54460275, "learning_rate": 1.2535643017755776e-06, "loss": 0.56551242, "num_input_tokens_seen": 113383730, "step": 5265, "time_per_iteration": 3.2234387397766113 }, { "auxiliary_loss_clip": 0.01126343, "auxiliary_loss_mlp": 0.01025058, "balance_loss_clip": 1.04598665, "balance_loss_mlp": 1.01796496, "epoch": 0.6331990621054531, "flos": 21244501215360.0, "grad_norm": 2.6002545151828715, "language_loss": 0.71606815, "learning_rate": 1.2528416723454737e-06, "loss": 0.73758221, "num_input_tokens_seen": 113400400, "step": 5266, "time_per_iteration": 2.6670026779174805 }, { "auxiliary_loss_clip": 0.01175017, "auxiliary_loss_mlp": 0.01022156, "balance_loss_clip": 1.05428338, "balance_loss_mlp": 1.01571846, "epoch": 0.6333193049960921, "flos": 34459412526720.0, "grad_norm": 1.48996162204872, "language_loss": 0.71040106, "learning_rate": 1.2521191562586945e-06, "loss": 0.73237282, "num_input_tokens_seen": 113424050, "step": 5267, "time_per_iteration": 2.639477252960205 }, { "auxiliary_loss_clip": 0.01173312, "auxiliary_loss_mlp": 0.00711094, "balance_loss_clip": 1.05202079, "balance_loss_mlp": 1.00040007, "epoch": 0.6334395478867312, "flos": 18329883932160.0, "grad_norm": 2.631539036761198, "language_loss": 0.76952803, "learning_rate": 1.2513967536248445e-06, "loss": 0.7883721, "num_input_tokens_seen": 113440370, "step": 5268, "time_per_iteration": 2.591986656188965 }, { "auxiliary_loss_clip": 0.01155768, "auxiliary_loss_mlp": 0.01021919, "balance_loss_clip": 1.05114603, "balance_loss_mlp": 1.01417041, "epoch": 0.6335597907773702, "flos": 23623152687360.0, "grad_norm": 1.7056459947897284, "language_loss": 0.81271666, "learning_rate": 1.2506744645535117e-06, "loss": 0.83449352, "num_input_tokens_seen": 113460800, "step": 5269, "time_per_iteration": 2.5844523906707764 }, { "auxiliary_loss_clip": 0.01135406, "auxiliary_loss_mlp": 0.01026268, "balance_loss_clip": 1.04409218, "balance_loss_mlp": 1.01891553, "epoch": 0.6336800336680094, "flos": 22710913954560.0, "grad_norm": 2.1439452055441315, "language_loss": 0.6023916, "learning_rate": 1.249952289154267e-06, "loss": 0.62400836, "num_input_tokens_seen": 113480840, "step": 5270, "time_per_iteration": 2.632147789001465 }, { "auxiliary_loss_clip": 0.01081768, "auxiliary_loss_mlp": 0.0102494, "balance_loss_clip": 1.04172456, "balance_loss_mlp": 1.01828492, "epoch": 0.6338002765586485, "flos": 23622757637760.0, "grad_norm": 41.28250529549206, "language_loss": 0.76355904, "learning_rate": 1.2492302275366635e-06, "loss": 0.78462613, "num_input_tokens_seen": 113500515, "step": 5271, "time_per_iteration": 2.799652099609375 }, { "auxiliary_loss_clip": 0.01154483, "auxiliary_loss_mlp": 0.01025224, "balance_loss_clip": 1.04949784, "balance_loss_mlp": 1.01760662, "epoch": 0.6339205194492875, "flos": 26505450708480.0, "grad_norm": 2.362837898725829, "language_loss": 0.65039855, "learning_rate": 1.2485082798102377e-06, "loss": 0.67219567, "num_input_tokens_seen": 113520930, "step": 5272, "time_per_iteration": 2.836966037750244 }, { "auxiliary_loss_clip": 0.01132845, "auxiliary_loss_mlp": 0.01028785, "balance_loss_clip": 1.04731607, "balance_loss_mlp": 1.02074444, "epoch": 0.6340407623399267, "flos": 18544306170240.0, "grad_norm": 2.286042070992751, "language_loss": 0.68668056, "learning_rate": 1.2477864460845084e-06, "loss": 0.70829678, "num_input_tokens_seen": 113537330, "step": 5273, "time_per_iteration": 2.6776821613311768 }, { "auxiliary_loss_clip": 0.01143057, "auxiliary_loss_mlp": 0.01027035, "balance_loss_clip": 1.04812121, "balance_loss_mlp": 1.01852906, "epoch": 0.6341610052305657, "flos": 17712579772800.0, "grad_norm": 2.8357534879362727, "language_loss": 0.73366308, "learning_rate": 1.2470647264689776e-06, "loss": 0.75536394, "num_input_tokens_seen": 113555810, "step": 5274, "time_per_iteration": 2.612672805786133 }, { "auxiliary_loss_clip": 0.01099429, "auxiliary_loss_mlp": 0.0102362, "balance_loss_clip": 1.04255939, "balance_loss_mlp": 1.01623225, "epoch": 0.6342812481212048, "flos": 23587026583680.0, "grad_norm": 2.877355596611623, "language_loss": 0.71410578, "learning_rate": 1.2463431210731282e-06, "loss": 0.7353363, "num_input_tokens_seen": 113575395, "step": 5275, "time_per_iteration": 2.8046982288360596 }, { "auxiliary_loss_clip": 0.01118439, "auxiliary_loss_mlp": 0.01027819, "balance_loss_clip": 1.04361677, "balance_loss_mlp": 1.02008843, "epoch": 0.634401491011844, "flos": 17821927751040.0, "grad_norm": 12.674334377110199, "language_loss": 0.76674384, "learning_rate": 1.2456216300064289e-06, "loss": 0.78820646, "num_input_tokens_seen": 113592945, "step": 5276, "time_per_iteration": 2.718714714050293 }, { "auxiliary_loss_clip": 0.01137021, "auxiliary_loss_mlp": 0.01030597, "balance_loss_clip": 1.04686654, "balance_loss_mlp": 1.02318847, "epoch": 0.634521733902483, "flos": 21358158825600.0, "grad_norm": 1.71056046648017, "language_loss": 0.78208709, "learning_rate": 1.244900253378328e-06, "loss": 0.80376327, "num_input_tokens_seen": 113613000, "step": 5277, "time_per_iteration": 2.7301809787750244 }, { "auxiliary_loss_clip": 0.01065028, "auxiliary_loss_mlp": 0.01024096, "balance_loss_clip": 1.04210567, "balance_loss_mlp": 1.01689577, "epoch": 0.6346419767931221, "flos": 16545052103040.0, "grad_norm": 5.375560078000362, "language_loss": 0.69260108, "learning_rate": 1.2441789912982583e-06, "loss": 0.71349227, "num_input_tokens_seen": 113630085, "step": 5278, "time_per_iteration": 2.8373312950134277 }, { "auxiliary_loss_clip": 0.01161568, "auxiliary_loss_mlp": 0.0102824, "balance_loss_clip": 1.05135238, "balance_loss_mlp": 1.01989555, "epoch": 0.6347622196837612, "flos": 24350989973760.0, "grad_norm": 2.985860787050975, "language_loss": 0.64848661, "learning_rate": 1.2434578438756346e-06, "loss": 0.67038471, "num_input_tokens_seen": 113650515, "step": 5279, "time_per_iteration": 2.9638075828552246 }, { "auxiliary_loss_clip": 0.01159466, "auxiliary_loss_mlp": 0.01024764, "balance_loss_clip": 1.04846859, "balance_loss_mlp": 1.01758754, "epoch": 0.6348824625744003, "flos": 64523178195840.0, "grad_norm": 2.502134310398146, "language_loss": 0.78169942, "learning_rate": 1.242736811219855e-06, "loss": 0.80354172, "num_input_tokens_seen": 113676475, "step": 5280, "time_per_iteration": 3.031282663345337 }, { "auxiliary_loss_clip": 0.01153271, "auxiliary_loss_mlp": 0.01022176, "balance_loss_clip": 1.04967129, "balance_loss_mlp": 1.01558065, "epoch": 0.6350027054650393, "flos": 28622133313920.0, "grad_norm": 2.134491441409308, "language_loss": 0.8230685, "learning_rate": 1.2420158934402988e-06, "loss": 0.844823, "num_input_tokens_seen": 113697090, "step": 5281, "time_per_iteration": 2.6403863430023193 }, { "auxiliary_loss_clip": 0.01110151, "auxiliary_loss_mlp": 0.0102998, "balance_loss_clip": 1.04205942, "balance_loss_mlp": 1.02243137, "epoch": 0.6351229483556785, "flos": 23002544476800.0, "grad_norm": 2.3686385736585445, "language_loss": 0.8483668, "learning_rate": 1.2412950906463286e-06, "loss": 0.86976802, "num_input_tokens_seen": 113714395, "step": 5282, "time_per_iteration": 2.7092723846435547 }, { "auxiliary_loss_clip": 0.01109744, "auxiliary_loss_mlp": 0.01027547, "balance_loss_clip": 1.04587936, "balance_loss_mlp": 1.02068019, "epoch": 0.6352431912463176, "flos": 21939300967680.0, "grad_norm": 1.921873584168721, "language_loss": 0.90412819, "learning_rate": 1.2405744029472902e-06, "loss": 0.92550105, "num_input_tokens_seen": 113733880, "step": 5283, "time_per_iteration": 2.7163705825805664 }, { "auxiliary_loss_clip": 0.01140551, "auxiliary_loss_mlp": 0.0102575, "balance_loss_clip": 1.04858637, "balance_loss_mlp": 1.01820409, "epoch": 0.6353634341369566, "flos": 13735257684480.0, "grad_norm": 2.071916874403113, "language_loss": 0.76585102, "learning_rate": 1.2398538304525108e-06, "loss": 0.78751403, "num_input_tokens_seen": 113752505, "step": 5284, "time_per_iteration": 2.643495798110962 }, { "auxiliary_loss_clip": 0.01125387, "auxiliary_loss_mlp": 0.01028907, "balance_loss_clip": 1.04833615, "balance_loss_mlp": 1.02096164, "epoch": 0.6354836770275958, "flos": 19316170552320.0, "grad_norm": 3.4508119590147395, "language_loss": 0.75740176, "learning_rate": 1.2391333732713016e-06, "loss": 0.77894473, "num_input_tokens_seen": 113770310, "step": 5285, "time_per_iteration": 4.850160837173462 }, { "auxiliary_loss_clip": 0.0112698, "auxiliary_loss_mlp": 0.01023459, "balance_loss_clip": 1.04612887, "balance_loss_mlp": 1.01572871, "epoch": 0.6356039199182348, "flos": 21613375935360.0, "grad_norm": 2.6913978145603856, "language_loss": 0.78415406, "learning_rate": 1.2384130315129543e-06, "loss": 0.80565846, "num_input_tokens_seen": 113788635, "step": 5286, "time_per_iteration": 3.6034717559814453 }, { "auxiliary_loss_clip": 0.01054159, "auxiliary_loss_mlp": 0.01027771, "balance_loss_clip": 1.03953099, "balance_loss_mlp": 1.0201422, "epoch": 0.6357241628088739, "flos": 18111978074880.0, "grad_norm": 3.3783417460482066, "language_loss": 0.73249161, "learning_rate": 1.2376928052867447e-06, "loss": 0.75331092, "num_input_tokens_seen": 113807755, "step": 5287, "time_per_iteration": 3.0896542072296143 }, { "auxiliary_loss_clip": 0.01144365, "auxiliary_loss_mlp": 0.01028966, "balance_loss_clip": 1.04995608, "balance_loss_mlp": 1.0214293, "epoch": 0.6358444056995131, "flos": 24935256599040.0, "grad_norm": 2.545412517154734, "language_loss": 0.77457786, "learning_rate": 1.2369726947019299e-06, "loss": 0.79631114, "num_input_tokens_seen": 113828230, "step": 5288, "time_per_iteration": 2.984164237976074 }, { "auxiliary_loss_clip": 0.01156025, "auxiliary_loss_mlp": 0.01027037, "balance_loss_clip": 1.04889631, "balance_loss_mlp": 1.01994443, "epoch": 0.6359646485901521, "flos": 23293348986240.0, "grad_norm": 2.8646244886894845, "language_loss": 0.6684376, "learning_rate": 1.2362526998677511e-06, "loss": 0.69026822, "num_input_tokens_seen": 113844595, "step": 5289, "time_per_iteration": 2.626620054244995 }, { "auxiliary_loss_clip": 0.0114637, "auxiliary_loss_mlp": 0.01024055, "balance_loss_clip": 1.04924893, "balance_loss_mlp": 1.01753747, "epoch": 0.6360848914807912, "flos": 20887442069760.0, "grad_norm": 4.600356468159292, "language_loss": 0.84640169, "learning_rate": 1.2355328208934301e-06, "loss": 0.86810589, "num_input_tokens_seen": 113863470, "step": 5290, "time_per_iteration": 2.679089307785034 }, { "auxiliary_loss_clip": 0.0115885, "auxiliary_loss_mlp": 0.00711354, "balance_loss_clip": 1.04791462, "balance_loss_mlp": 1.00037146, "epoch": 0.6362051343714303, "flos": 18479775386880.0, "grad_norm": 1.7253639465932917, "language_loss": 0.72405267, "learning_rate": 1.2348130578881728e-06, "loss": 0.7427547, "num_input_tokens_seen": 113881690, "step": 5291, "time_per_iteration": 2.594414234161377 }, { "auxiliary_loss_clip": 0.01179585, "auxiliary_loss_mlp": 0.0102721, "balance_loss_clip": 1.05403674, "balance_loss_mlp": 1.0193181, "epoch": 0.6363253772620694, "flos": 24389594115840.0, "grad_norm": 2.6994547189183042, "language_loss": 0.76225936, "learning_rate": 1.2340934109611664e-06, "loss": 0.78432727, "num_input_tokens_seen": 113902450, "step": 5292, "time_per_iteration": 2.618298292160034 }, { "auxiliary_loss_clip": 0.01150664, "auxiliary_loss_mlp": 0.01029091, "balance_loss_clip": 1.05147481, "balance_loss_mlp": 1.02128279, "epoch": 0.6364456201527084, "flos": 25958243940480.0, "grad_norm": 3.316352909593844, "language_loss": 0.69376075, "learning_rate": 1.2333738802215798e-06, "loss": 0.71555829, "num_input_tokens_seen": 113922670, "step": 5293, "time_per_iteration": 2.6305394172668457 }, { "auxiliary_loss_clip": 0.01105515, "auxiliary_loss_mlp": 0.01026636, "balance_loss_clip": 1.04298341, "balance_loss_mlp": 1.01914382, "epoch": 0.6365658630433476, "flos": 20740711011840.0, "grad_norm": 4.399907551297928, "language_loss": 0.8171792, "learning_rate": 1.2326544657785668e-06, "loss": 0.83850074, "num_input_tokens_seen": 113942360, "step": 5294, "time_per_iteration": 2.7354378700256348 }, { "auxiliary_loss_clip": 0.0111787, "auxiliary_loss_mlp": 0.01027098, "balance_loss_clip": 1.04600823, "balance_loss_mlp": 1.01932847, "epoch": 0.6366861059339867, "flos": 21434146047360.0, "grad_norm": 2.7538742704469716, "language_loss": 0.74712312, "learning_rate": 1.2319351677412608e-06, "loss": 0.76857275, "num_input_tokens_seen": 113959405, "step": 5295, "time_per_iteration": 2.652263879776001 }, { "auxiliary_loss_clip": 0.01136655, "auxiliary_loss_mlp": 0.01027444, "balance_loss_clip": 1.04923296, "balance_loss_mlp": 1.01947474, "epoch": 0.6368063488246257, "flos": 22267093507200.0, "grad_norm": 2.5421849061730595, "language_loss": 0.74502814, "learning_rate": 1.2312159862187796e-06, "loss": 0.76666909, "num_input_tokens_seen": 113977815, "step": 5296, "time_per_iteration": 2.6449756622314453 }, { "auxiliary_loss_clip": 0.01178378, "auxiliary_loss_mlp": 0.01039425, "balance_loss_clip": 1.05375171, "balance_loss_mlp": 1.03164625, "epoch": 0.6369265917152649, "flos": 22420719976320.0, "grad_norm": 1.5687206845659747, "language_loss": 0.76092303, "learning_rate": 1.2304969213202217e-06, "loss": 0.78310102, "num_input_tokens_seen": 113999075, "step": 5297, "time_per_iteration": 2.5397253036499023 }, { "auxiliary_loss_clip": 0.01138342, "auxiliary_loss_mlp": 0.01032758, "balance_loss_clip": 1.04708004, "balance_loss_mlp": 1.02585006, "epoch": 0.6370468346059039, "flos": 24718176754560.0, "grad_norm": 2.5266527023823557, "language_loss": 0.79487574, "learning_rate": 1.2297779731546692e-06, "loss": 0.81658673, "num_input_tokens_seen": 114018170, "step": 5298, "time_per_iteration": 2.67793869972229 }, { "auxiliary_loss_clip": 0.0114062, "auxiliary_loss_mlp": 0.01028639, "balance_loss_clip": 1.05020332, "balance_loss_mlp": 1.02139735, "epoch": 0.637167077496543, "flos": 25296589463040.0, "grad_norm": 2.515232260163527, "language_loss": 0.77949739, "learning_rate": 1.2290591418311853e-06, "loss": 0.80118996, "num_input_tokens_seen": 114035565, "step": 5299, "time_per_iteration": 2.698821544647217 }, { "auxiliary_loss_clip": 0.01161928, "auxiliary_loss_mlp": 0.01028607, "balance_loss_clip": 1.0539124, "balance_loss_mlp": 1.02094817, "epoch": 0.637287320387182, "flos": 27671110871040.0, "grad_norm": 10.282102871957848, "language_loss": 0.72285813, "learning_rate": 1.2283404274588172e-06, "loss": 0.74476349, "num_input_tokens_seen": 114054510, "step": 5300, "time_per_iteration": 2.6844100952148438 }, { "auxiliary_loss_clip": 0.00995009, "auxiliary_loss_mlp": 0.01009199, "balance_loss_clip": 1.01578879, "balance_loss_mlp": 1.00762558, "epoch": 0.6374075632778212, "flos": 63173406873600.0, "grad_norm": 0.7401275350448291, "language_loss": 0.52701592, "learning_rate": 1.227621830146592e-06, "loss": 0.54705805, "num_input_tokens_seen": 114109875, "step": 5301, "time_per_iteration": 3.2352588176727295 }, { "auxiliary_loss_clip": 0.0113368, "auxiliary_loss_mlp": 0.0102646, "balance_loss_clip": 1.05088377, "balance_loss_mlp": 1.01950729, "epoch": 0.6375278061684603, "flos": 25558127366400.0, "grad_norm": 2.114132924943391, "language_loss": 0.79185867, "learning_rate": 1.2269033500035217e-06, "loss": 0.81346011, "num_input_tokens_seen": 114130010, "step": 5302, "time_per_iteration": 2.8245506286621094 }, { "auxiliary_loss_clip": 0.01131042, "auxiliary_loss_mlp": 0.0103253, "balance_loss_clip": 1.0500319, "balance_loss_mlp": 1.02535927, "epoch": 0.6376480490590993, "flos": 25666362023040.0, "grad_norm": 1.8617636604674266, "language_loss": 0.73776114, "learning_rate": 1.2261849871385988e-06, "loss": 0.75939691, "num_input_tokens_seen": 114151115, "step": 5303, "time_per_iteration": 2.781132698059082 }, { "auxiliary_loss_clip": 0.01176961, "auxiliary_loss_mlp": 0.01033094, "balance_loss_clip": 1.05172241, "balance_loss_mlp": 1.02499938, "epoch": 0.6377682919497385, "flos": 31537684350720.0, "grad_norm": 2.7174136226713284, "language_loss": 0.6261211, "learning_rate": 1.2254667416607972e-06, "loss": 0.64822161, "num_input_tokens_seen": 114172715, "step": 5304, "time_per_iteration": 2.658806324005127 }, { "auxiliary_loss_clip": 0.01160625, "auxiliary_loss_mlp": 0.01029552, "balance_loss_clip": 1.05188608, "balance_loss_mlp": 1.0222621, "epoch": 0.6378885348403776, "flos": 23039209284480.0, "grad_norm": 2.253420509992604, "language_loss": 0.83437312, "learning_rate": 1.2247486136790756e-06, "loss": 0.85627484, "num_input_tokens_seen": 114192195, "step": 5305, "time_per_iteration": 2.6747660636901855 }, { "auxiliary_loss_clip": 0.01161712, "auxiliary_loss_mlp": 0.01026148, "balance_loss_clip": 1.05105209, "balance_loss_mlp": 1.01952612, "epoch": 0.6380087777310166, "flos": 18697070712960.0, "grad_norm": 2.995086819055731, "language_loss": 0.80579066, "learning_rate": 1.2240306033023726e-06, "loss": 0.82766926, "num_input_tokens_seen": 114210020, "step": 5306, "time_per_iteration": 2.62040376663208 }, { "auxiliary_loss_clip": 0.011287, "auxiliary_loss_mlp": 0.01031634, "balance_loss_clip": 1.0447222, "balance_loss_mlp": 1.02468109, "epoch": 0.6381290206216558, "flos": 23331558078720.0, "grad_norm": 1.856511174299, "language_loss": 0.72119862, "learning_rate": 1.223312710639611e-06, "loss": 0.74280196, "num_input_tokens_seen": 114228740, "step": 5307, "time_per_iteration": 2.710224151611328 }, { "auxiliary_loss_clip": 0.01140461, "auxiliary_loss_mlp": 0.01028826, "balance_loss_clip": 1.04850256, "balance_loss_mlp": 1.02120578, "epoch": 0.6382492635122948, "flos": 18880466578560.0, "grad_norm": 2.1031097742374794, "language_loss": 0.87034094, "learning_rate": 1.2225949357996928e-06, "loss": 0.89203382, "num_input_tokens_seen": 114246865, "step": 5308, "time_per_iteration": 2.5883941650390625 }, { "auxiliary_loss_clip": 0.011545, "auxiliary_loss_mlp": 0.01023712, "balance_loss_clip": 1.05065751, "balance_loss_mlp": 1.01647925, "epoch": 0.6383695064029339, "flos": 27819134818560.0, "grad_norm": 1.6458976536429575, "language_loss": 0.80347538, "learning_rate": 1.221877278891505e-06, "loss": 0.82525754, "num_input_tokens_seen": 114266120, "step": 5309, "time_per_iteration": 2.6316425800323486 }, { "auxiliary_loss_clip": 0.01165514, "auxiliary_loss_mlp": 0.01029572, "balance_loss_clip": 1.05257607, "balance_loss_mlp": 1.02175164, "epoch": 0.638489749293573, "flos": 26395635853440.0, "grad_norm": 7.391819042673351, "language_loss": 0.7153424, "learning_rate": 1.221159740023915e-06, "loss": 0.73729336, "num_input_tokens_seen": 114285950, "step": 5310, "time_per_iteration": 2.6090548038482666 }, { "auxiliary_loss_clip": 0.01136185, "auxiliary_loss_mlp": 0.00711851, "balance_loss_clip": 1.05010056, "balance_loss_mlp": 1.00044537, "epoch": 0.6386099921842121, "flos": 23988328306560.0, "grad_norm": 2.295313670505795, "language_loss": 0.72957069, "learning_rate": 1.2204423193057735e-06, "loss": 0.74805105, "num_input_tokens_seen": 114304780, "step": 5311, "time_per_iteration": 4.546837329864502 }, { "auxiliary_loss_clip": 0.01066738, "auxiliary_loss_mlp": 0.01002257, "balance_loss_clip": 1.03329861, "balance_loss_mlp": 1.00110674, "epoch": 0.6387302350748512, "flos": 71731169337600.0, "grad_norm": 0.8480608958510634, "language_loss": 0.63353276, "learning_rate": 1.2197250168459122e-06, "loss": 0.65422273, "num_input_tokens_seen": 114361180, "step": 5312, "time_per_iteration": 5.554267883300781 }, { "auxiliary_loss_clip": 0.01163098, "auxiliary_loss_mlp": 0.01029615, "balance_loss_clip": 1.05312443, "balance_loss_mlp": 1.02279615, "epoch": 0.6388504779654903, "flos": 14535778141440.0, "grad_norm": 2.2677845201983997, "language_loss": 0.744717, "learning_rate": 1.2190078327531454e-06, "loss": 0.76664412, "num_input_tokens_seen": 114377425, "step": 5313, "time_per_iteration": 2.5858705043792725 }, { "auxiliary_loss_clip": 0.01160936, "auxiliary_loss_mlp": 0.010274, "balance_loss_clip": 1.05028129, "balance_loss_mlp": 1.01997364, "epoch": 0.6389707208561294, "flos": 22346133384960.0, "grad_norm": 1.7810421879078868, "language_loss": 0.72698069, "learning_rate": 1.2182907671362697e-06, "loss": 0.74886405, "num_input_tokens_seen": 114398120, "step": 5314, "time_per_iteration": 2.5796196460723877 }, { "auxiliary_loss_clip": 0.0115784, "auxiliary_loss_mlp": 0.01023085, "balance_loss_clip": 1.05137789, "balance_loss_mlp": 1.01573884, "epoch": 0.6390909637467684, "flos": 19426883247360.0, "grad_norm": 2.2134075575426477, "language_loss": 0.78785902, "learning_rate": 1.2175738201040626e-06, "loss": 0.8096683, "num_input_tokens_seen": 114415160, "step": 5315, "time_per_iteration": 2.606895685195923 }, { "auxiliary_loss_clip": 0.01158251, "auxiliary_loss_mlp": 0.01032257, "balance_loss_clip": 1.04998362, "balance_loss_mlp": 1.02490211, "epoch": 0.6392112066374076, "flos": 24090852700800.0, "grad_norm": 2.651561519214967, "language_loss": 0.78706181, "learning_rate": 1.2168569917652855e-06, "loss": 0.80896688, "num_input_tokens_seen": 114435015, "step": 5316, "time_per_iteration": 2.6396071910858154 }, { "auxiliary_loss_clip": 0.0115921, "auxiliary_loss_mlp": 0.01029648, "balance_loss_clip": 1.05099583, "balance_loss_mlp": 1.02219689, "epoch": 0.6393314495280467, "flos": 26795141896320.0, "grad_norm": 1.6605949531085862, "language_loss": 0.64262128, "learning_rate": 1.2161402822286797e-06, "loss": 0.66450989, "num_input_tokens_seen": 114455700, "step": 5317, "time_per_iteration": 2.6077029705047607 }, { "auxiliary_loss_clip": 0.01124866, "auxiliary_loss_mlp": 0.01024845, "balance_loss_clip": 1.04679465, "balance_loss_mlp": 1.01744533, "epoch": 0.6394516924186857, "flos": 20260692633600.0, "grad_norm": 2.021368647354359, "language_loss": 0.79357374, "learning_rate": 1.2154236916029703e-06, "loss": 0.81507087, "num_input_tokens_seen": 114473675, "step": 5318, "time_per_iteration": 2.665450096130371 }, { "auxiliary_loss_clip": 0.01110679, "auxiliary_loss_mlp": 0.01027463, "balance_loss_clip": 1.04261792, "balance_loss_mlp": 1.02002442, "epoch": 0.6395719353093249, "flos": 18368847210240.0, "grad_norm": 3.675561077340039, "language_loss": 0.73660457, "learning_rate": 1.2147072199968627e-06, "loss": 0.75798595, "num_input_tokens_seen": 114492310, "step": 5319, "time_per_iteration": 2.6959779262542725 }, { "auxiliary_loss_clip": 0.01157412, "auxiliary_loss_mlp": 0.01024741, "balance_loss_clip": 1.05068207, "balance_loss_mlp": 1.01738572, "epoch": 0.6396921781999639, "flos": 17566315591680.0, "grad_norm": 1.7547048126097506, "language_loss": 0.71639103, "learning_rate": 1.2139908675190454e-06, "loss": 0.73821253, "num_input_tokens_seen": 114511520, "step": 5320, "time_per_iteration": 2.584075689315796 }, { "auxiliary_loss_clip": 0.01086245, "auxiliary_loss_mlp": 0.01027332, "balance_loss_clip": 1.04094887, "balance_loss_mlp": 1.02049541, "epoch": 0.639812421090603, "flos": 21251252972160.0, "grad_norm": 3.0087103015987493, "language_loss": 0.74544501, "learning_rate": 1.2132746342781883e-06, "loss": 0.76658076, "num_input_tokens_seen": 114532680, "step": 5321, "time_per_iteration": 2.800570011138916 }, { "auxiliary_loss_clip": 0.01176037, "auxiliary_loss_mlp": 0.01027515, "balance_loss_clip": 1.05303288, "balance_loss_mlp": 1.02001119, "epoch": 0.6399326639812422, "flos": 11180967684480.0, "grad_norm": 2.9019707593821034, "language_loss": 0.80252767, "learning_rate": 1.2125585203829442e-06, "loss": 0.82456315, "num_input_tokens_seen": 114548320, "step": 5322, "time_per_iteration": 2.579102039337158 }, { "auxiliary_loss_clip": 0.01116065, "auxiliary_loss_mlp": 0.0102738, "balance_loss_clip": 1.04555976, "balance_loss_mlp": 1.02042139, "epoch": 0.6400529068718812, "flos": 23911048195200.0, "grad_norm": 4.393006362207998, "language_loss": 0.74154055, "learning_rate": 1.211842525941946e-06, "loss": 0.7629751, "num_input_tokens_seen": 114568115, "step": 5323, "time_per_iteration": 2.6686513423919678 }, { "auxiliary_loss_clip": 0.01107772, "auxiliary_loss_mlp": 0.01022998, "balance_loss_clip": 1.0465517, "balance_loss_mlp": 1.01603043, "epoch": 0.6401731497625203, "flos": 44018724890880.0, "grad_norm": 2.053696662534811, "language_loss": 0.79067934, "learning_rate": 1.2111266510638105e-06, "loss": 0.81198704, "num_input_tokens_seen": 114591040, "step": 5324, "time_per_iteration": 2.9147846698760986 }, { "auxiliary_loss_clip": 0.01087847, "auxiliary_loss_mlp": 0.01028109, "balance_loss_clip": 1.04401767, "balance_loss_mlp": 1.01999116, "epoch": 0.6402933926531594, "flos": 20662209838080.0, "grad_norm": 1.9130290846764066, "language_loss": 0.80008686, "learning_rate": 1.2104108958571346e-06, "loss": 0.82124639, "num_input_tokens_seen": 114609310, "step": 5325, "time_per_iteration": 2.7334158420562744 }, { "auxiliary_loss_clip": 0.01156378, "auxiliary_loss_mlp": 0.01025031, "balance_loss_clip": 1.05297685, "balance_loss_mlp": 1.01818824, "epoch": 0.6404136355437985, "flos": 24863327614080.0, "grad_norm": 1.4684358929898038, "language_loss": 0.75486803, "learning_rate": 1.2096952604304975e-06, "loss": 0.77668214, "num_input_tokens_seen": 114629740, "step": 5326, "time_per_iteration": 2.659085988998413 }, { "auxiliary_loss_clip": 0.0116128, "auxiliary_loss_mlp": 0.01026058, "balance_loss_clip": 1.05022252, "balance_loss_mlp": 1.01842308, "epoch": 0.6405338784344375, "flos": 40479548901120.0, "grad_norm": 2.6453726373670388, "language_loss": 0.70052326, "learning_rate": 1.2089797448924616e-06, "loss": 0.72239667, "num_input_tokens_seen": 114653615, "step": 5327, "time_per_iteration": 2.7585647106170654 }, { "auxiliary_loss_clip": 0.01117232, "auxiliary_loss_mlp": 0.01023922, "balance_loss_clip": 1.04480398, "balance_loss_mlp": 1.0166142, "epoch": 0.6406541213250767, "flos": 20886041439360.0, "grad_norm": 2.049372764554589, "language_loss": 0.66037381, "learning_rate": 1.2082643493515692e-06, "loss": 0.68178535, "num_input_tokens_seen": 114671935, "step": 5328, "time_per_iteration": 2.6802074909210205 }, { "auxiliary_loss_clip": 0.01160688, "auxiliary_loss_mlp": 0.010246, "balance_loss_clip": 1.05228162, "balance_loss_mlp": 1.01765943, "epoch": 0.6407743642157158, "flos": 23295970679040.0, "grad_norm": 1.9111597157780122, "language_loss": 0.81908071, "learning_rate": 1.207549073916346e-06, "loss": 0.84093356, "num_input_tokens_seen": 114692870, "step": 5329, "time_per_iteration": 2.5958802700042725 }, { "auxiliary_loss_clip": 0.01133804, "auxiliary_loss_mlp": 0.01023669, "balance_loss_clip": 1.0490284, "balance_loss_mlp": 1.01668286, "epoch": 0.6408946071063548, "flos": 15012636122880.0, "grad_norm": 2.702887484632369, "language_loss": 0.77990532, "learning_rate": 1.2068339186952976e-06, "loss": 0.80148005, "num_input_tokens_seen": 114710410, "step": 5330, "time_per_iteration": 2.6454708576202393 }, { "auxiliary_loss_clip": 0.0116186, "auxiliary_loss_mlp": 0.01028138, "balance_loss_clip": 1.05141497, "balance_loss_mlp": 1.02098584, "epoch": 0.6410148499969939, "flos": 22528595496960.0, "grad_norm": 2.1838838009690673, "language_loss": 0.73321193, "learning_rate": 1.2061188837969136e-06, "loss": 0.75511193, "num_input_tokens_seen": 114730020, "step": 5331, "time_per_iteration": 2.6324448585510254 }, { "auxiliary_loss_clip": 0.01120732, "auxiliary_loss_mlp": 0.01027498, "balance_loss_clip": 1.04324615, "balance_loss_mlp": 1.01949358, "epoch": 0.641135092887633, "flos": 12422004537600.0, "grad_norm": 2.4887408534600057, "language_loss": 0.8337943, "learning_rate": 1.2054039693296631e-06, "loss": 0.85527658, "num_input_tokens_seen": 114748015, "step": 5332, "time_per_iteration": 2.678319215774536 }, { "auxiliary_loss_clip": 0.01123866, "auxiliary_loss_mlp": 0.01026553, "balance_loss_clip": 1.04749799, "balance_loss_mlp": 1.01959741, "epoch": 0.6412553357782721, "flos": 22127329687680.0, "grad_norm": 1.728663795612148, "language_loss": 0.81630588, "learning_rate": 1.2046891754019992e-06, "loss": 0.83781004, "num_input_tokens_seen": 114768625, "step": 5333, "time_per_iteration": 2.745028495788574 }, { "auxiliary_loss_clip": 0.01162785, "auxiliary_loss_mlp": 0.0102492, "balance_loss_clip": 1.05333161, "balance_loss_mlp": 1.01772583, "epoch": 0.6413755786689112, "flos": 15888605097600.0, "grad_norm": 2.318240385985112, "language_loss": 0.82685822, "learning_rate": 1.2039745021223548e-06, "loss": 0.84873527, "num_input_tokens_seen": 114786045, "step": 5334, "time_per_iteration": 2.5576798915863037 }, { "auxiliary_loss_clip": 0.0103485, "auxiliary_loss_mlp": 0.01005158, "balance_loss_clip": 1.02986455, "balance_loss_mlp": 1.00377488, "epoch": 0.6414958215595503, "flos": 68039159955840.0, "grad_norm": 0.7993287733496127, "language_loss": 0.57052273, "learning_rate": 1.2032599495991456e-06, "loss": 0.59092277, "num_input_tokens_seen": 114850785, "step": 5335, "time_per_iteration": 3.37154221534729 }, { "auxiliary_loss_clip": 0.01156985, "auxiliary_loss_mlp": 0.01023076, "balance_loss_clip": 1.05027008, "balance_loss_mlp": 1.01577151, "epoch": 0.6416160644501894, "flos": 44091300320640.0, "grad_norm": 2.250646174236253, "language_loss": 0.69890016, "learning_rate": 1.2025455179407685e-06, "loss": 0.72070074, "num_input_tokens_seen": 114871945, "step": 5336, "time_per_iteration": 3.76991605758667 }, { "auxiliary_loss_clip": 0.01154772, "auxiliary_loss_mlp": 0.00710945, "balance_loss_clip": 1.05020821, "balance_loss_mlp": 1.00046229, "epoch": 0.6417363073408284, "flos": 20959837931520.0, "grad_norm": 5.962967350775997, "language_loss": 0.73729306, "learning_rate": 1.2018312072556022e-06, "loss": 0.75595021, "num_input_tokens_seen": 114890445, "step": 5337, "time_per_iteration": 3.5087220668792725 }, { "auxiliary_loss_clip": 0.01172942, "auxiliary_loss_mlp": 0.00711632, "balance_loss_clip": 1.05150783, "balance_loss_mlp": 1.00043738, "epoch": 0.6418565502314676, "flos": 22455122227200.0, "grad_norm": 2.1129743122466684, "language_loss": 0.74503016, "learning_rate": 1.2011170176520077e-06, "loss": 0.7638759, "num_input_tokens_seen": 114911360, "step": 5338, "time_per_iteration": 3.4863357543945312 }, { "auxiliary_loss_clip": 0.0107671, "auxiliary_loss_mlp": 0.01027897, "balance_loss_clip": 1.04003632, "balance_loss_mlp": 1.02085519, "epoch": 0.6419767931221066, "flos": 25045502417280.0, "grad_norm": 1.6826758430089856, "language_loss": 0.81364381, "learning_rate": 1.2004029492383256e-06, "loss": 0.83468992, "num_input_tokens_seen": 114932700, "step": 5339, "time_per_iteration": 2.7665085792541504 }, { "auxiliary_loss_clip": 0.01159807, "auxiliary_loss_mlp": 0.01025729, "balance_loss_clip": 1.05389118, "balance_loss_mlp": 1.01826024, "epoch": 0.6420970360127457, "flos": 19463691709440.0, "grad_norm": 2.1705928824609995, "language_loss": 0.7392866, "learning_rate": 1.1996890021228814e-06, "loss": 0.7611419, "num_input_tokens_seen": 114949475, "step": 5340, "time_per_iteration": 2.58502459526062 }, { "auxiliary_loss_clip": 0.0113702, "auxiliary_loss_mlp": 0.01026716, "balance_loss_clip": 1.0454241, "balance_loss_mlp": 1.01992142, "epoch": 0.6422172789033849, "flos": 40406147458560.0, "grad_norm": 1.7627005423681417, "language_loss": 0.7010628, "learning_rate": 1.1989751764139785e-06, "loss": 0.72270012, "num_input_tokens_seen": 114973125, "step": 5341, "time_per_iteration": 2.8316328525543213 }, { "auxiliary_loss_clip": 0.01105793, "auxiliary_loss_mlp": 0.01025043, "balance_loss_clip": 1.03972149, "balance_loss_mlp": 1.01814044, "epoch": 0.6423375217940239, "flos": 27672870637440.0, "grad_norm": 1.5402423297646293, "language_loss": 0.83082014, "learning_rate": 1.1982614722199044e-06, "loss": 0.85212851, "num_input_tokens_seen": 114994300, "step": 5342, "time_per_iteration": 2.7304441928863525 }, { "auxiliary_loss_clip": 0.01145924, "auxiliary_loss_mlp": 0.01021947, "balance_loss_clip": 1.04624367, "balance_loss_mlp": 1.01472282, "epoch": 0.642457764684663, "flos": 18369242259840.0, "grad_norm": 2.5056507850736733, "language_loss": 0.78108341, "learning_rate": 1.1975478896489276e-06, "loss": 0.80276215, "num_input_tokens_seen": 115012135, "step": 5343, "time_per_iteration": 2.6686205863952637 }, { "auxiliary_loss_clip": 0.01171146, "auxiliary_loss_mlp": 0.01027915, "balance_loss_clip": 1.0509181, "balance_loss_mlp": 1.02100635, "epoch": 0.6425780075753021, "flos": 19750509809280.0, "grad_norm": 2.0227605482061617, "language_loss": 0.76561987, "learning_rate": 1.1968344288092981e-06, "loss": 0.78761053, "num_input_tokens_seen": 115028715, "step": 5344, "time_per_iteration": 2.5453004837036133 }, { "auxiliary_loss_clip": 0.01159258, "auxiliary_loss_mlp": 0.00711216, "balance_loss_clip": 1.05178022, "balance_loss_mlp": 1.00052166, "epoch": 0.6426982504659412, "flos": 20558536208640.0, "grad_norm": 1.8941113656965882, "language_loss": 0.64781749, "learning_rate": 1.1961210898092468e-06, "loss": 0.6665222, "num_input_tokens_seen": 115047665, "step": 5345, "time_per_iteration": 2.6542348861694336 }, { "auxiliary_loss_clip": 0.01149558, "auxiliary_loss_mlp": 0.01027142, "balance_loss_clip": 1.05053067, "balance_loss_mlp": 1.01979578, "epoch": 0.6428184933565803, "flos": 17851984456320.0, "grad_norm": 2.215733059267478, "language_loss": 0.79527795, "learning_rate": 1.1954078727569874e-06, "loss": 0.81704497, "num_input_tokens_seen": 115064965, "step": 5346, "time_per_iteration": 2.611182451248169 }, { "auxiliary_loss_clip": 0.01133781, "auxiliary_loss_mlp": 0.00711164, "balance_loss_clip": 1.04852867, "balance_loss_mlp": 1.00046551, "epoch": 0.6429387362472194, "flos": 22456953820800.0, "grad_norm": 1.7989298011016042, "language_loss": 0.78174996, "learning_rate": 1.1946947777607141e-06, "loss": 0.80019945, "num_input_tokens_seen": 115086100, "step": 5347, "time_per_iteration": 2.7864861488342285 }, { "auxiliary_loss_clip": 0.01101852, "auxiliary_loss_mlp": 0.01029571, "balance_loss_clip": 1.04359376, "balance_loss_mlp": 1.02197731, "epoch": 0.6430589791378585, "flos": 24752579005440.0, "grad_norm": 2.363854840482857, "language_loss": 0.80271494, "learning_rate": 1.1939818049286024e-06, "loss": 0.82402921, "num_input_tokens_seen": 115104260, "step": 5348, "time_per_iteration": 2.7090096473693848 }, { "auxiliary_loss_clip": 0.01082073, "auxiliary_loss_mlp": 0.01027508, "balance_loss_clip": 1.04075003, "balance_loss_mlp": 1.01997375, "epoch": 0.6431792220284975, "flos": 24901249397760.0, "grad_norm": 1.9387843921273156, "language_loss": 0.75972372, "learning_rate": 1.1932689543688101e-06, "loss": 0.78081954, "num_input_tokens_seen": 115125365, "step": 5349, "time_per_iteration": 2.744046926498413 }, { "auxiliary_loss_clip": 0.01144075, "auxiliary_loss_mlp": 0.01029276, "balance_loss_clip": 1.05243731, "balance_loss_mlp": 1.02188814, "epoch": 0.6432994649191367, "flos": 21032305620480.0, "grad_norm": 2.1145370071057443, "language_loss": 0.72749627, "learning_rate": 1.1925562261894756e-06, "loss": 0.74922973, "num_input_tokens_seen": 115144445, "step": 5350, "time_per_iteration": 2.5887985229492188 }, { "auxiliary_loss_clip": 0.0113561, "auxiliary_loss_mlp": 0.01022058, "balance_loss_clip": 1.04669642, "balance_loss_mlp": 1.01482821, "epoch": 0.6434197078097758, "flos": 30884433655680.0, "grad_norm": 1.8967491010958049, "language_loss": 0.77522814, "learning_rate": 1.1918436204987207e-06, "loss": 0.79680479, "num_input_tokens_seen": 115166305, "step": 5351, "time_per_iteration": 2.671968698501587 }, { "auxiliary_loss_clip": 0.01156246, "auxiliary_loss_mlp": 0.01029343, "balance_loss_clip": 1.05188823, "balance_loss_mlp": 1.02222669, "epoch": 0.6435399507004148, "flos": 15012492468480.0, "grad_norm": 2.1502077692130275, "language_loss": 0.8187989, "learning_rate": 1.191131137404645e-06, "loss": 0.84065479, "num_input_tokens_seen": 115183045, "step": 5352, "time_per_iteration": 2.5527641773223877 }, { "auxiliary_loss_clip": 0.01116113, "auxiliary_loss_mlp": 0.01028867, "balance_loss_clip": 1.04649675, "balance_loss_mlp": 1.02197027, "epoch": 0.643660193591054, "flos": 19901981462400.0, "grad_norm": 2.2815914943983833, "language_loss": 0.77158147, "learning_rate": 1.190418777015333e-06, "loss": 0.79303122, "num_input_tokens_seen": 115201955, "step": 5353, "time_per_iteration": 2.6280438899993896 }, { "auxiliary_loss_clip": 0.01140651, "auxiliary_loss_mlp": 0.01023225, "balance_loss_clip": 1.04778183, "balance_loss_mlp": 1.01637661, "epoch": 0.643780436481693, "flos": 24133622820480.0, "grad_norm": 1.4719064360739222, "language_loss": 0.73897743, "learning_rate": 1.1897065394388487e-06, "loss": 0.76061624, "num_input_tokens_seen": 115222395, "step": 5354, "time_per_iteration": 2.66241717338562 }, { "auxiliary_loss_clip": 0.01145388, "auxiliary_loss_mlp": 0.01032856, "balance_loss_clip": 1.05344021, "balance_loss_mlp": 1.02586699, "epoch": 0.6439006793723321, "flos": 23148808657920.0, "grad_norm": 1.7427181703796129, "language_loss": 0.76588225, "learning_rate": 1.1889944247832385e-06, "loss": 0.78766465, "num_input_tokens_seen": 115242635, "step": 5355, "time_per_iteration": 2.6490116119384766 }, { "auxiliary_loss_clip": 0.01159816, "auxiliary_loss_mlp": 0.01026469, "balance_loss_clip": 1.04940152, "balance_loss_mlp": 1.01936436, "epoch": 0.6440209222629713, "flos": 23617909301760.0, "grad_norm": 2.652988241553469, "language_loss": 0.71027911, "learning_rate": 1.1882824331565283e-06, "loss": 0.73214191, "num_input_tokens_seen": 115262095, "step": 5356, "time_per_iteration": 2.6510467529296875 }, { "auxiliary_loss_clip": 0.01121435, "auxiliary_loss_mlp": 0.01024653, "balance_loss_clip": 1.04522932, "balance_loss_mlp": 1.01747084, "epoch": 0.6441411651536103, "flos": 16544872535040.0, "grad_norm": 2.77630656692946, "language_loss": 0.89658332, "learning_rate": 1.1875705646667287e-06, "loss": 0.91804421, "num_input_tokens_seen": 115279985, "step": 5357, "time_per_iteration": 2.6389687061309814 }, { "auxiliary_loss_clip": 0.01155812, "auxiliary_loss_mlp": 0.01025781, "balance_loss_clip": 1.04913855, "balance_loss_mlp": 1.01850617, "epoch": 0.6442614080442494, "flos": 25410965345280.0, "grad_norm": 7.070781247318037, "language_loss": 0.75470841, "learning_rate": 1.1868588194218282e-06, "loss": 0.77652436, "num_input_tokens_seen": 115300365, "step": 5358, "time_per_iteration": 2.7026290893554688 }, { "auxiliary_loss_clip": 0.01147926, "auxiliary_loss_mlp": 0.01024581, "balance_loss_clip": 1.05018961, "balance_loss_mlp": 1.01803327, "epoch": 0.6443816509348885, "flos": 28294017552000.0, "grad_norm": 1.641612761957076, "language_loss": 0.74031681, "learning_rate": 1.1861471975297979e-06, "loss": 0.76204187, "num_input_tokens_seen": 115322060, "step": 5359, "time_per_iteration": 2.720494031906128 }, { "auxiliary_loss_clip": 0.01121354, "auxiliary_loss_mlp": 0.0102814, "balance_loss_clip": 1.04813409, "balance_loss_mlp": 1.01983166, "epoch": 0.6445018938255276, "flos": 36690075964800.0, "grad_norm": 1.6300395125016696, "language_loss": 0.71108037, "learning_rate": 1.185435699098591e-06, "loss": 0.7325753, "num_input_tokens_seen": 115348255, "step": 5360, "time_per_iteration": 2.8132436275482178 }, { "auxiliary_loss_clip": 0.01144626, "auxiliary_loss_mlp": 0.01032085, "balance_loss_clip": 1.04830885, "balance_loss_mlp": 1.02472985, "epoch": 0.6446221367161666, "flos": 14501411804160.0, "grad_norm": 2.2365027189256694, "language_loss": 0.7832697, "learning_rate": 1.1847243242361403e-06, "loss": 0.80503684, "num_input_tokens_seen": 115366845, "step": 5361, "time_per_iteration": 2.5548110008239746 }, { "auxiliary_loss_clip": 0.01146101, "auxiliary_loss_mlp": 0.01022648, "balance_loss_clip": 1.0505296, "balance_loss_mlp": 1.01502419, "epoch": 0.6447423796068057, "flos": 24609367480320.0, "grad_norm": 1.6447701469406861, "language_loss": 0.78149962, "learning_rate": 1.1840130730503624e-06, "loss": 0.80318707, "num_input_tokens_seen": 115388125, "step": 5362, "time_per_iteration": 2.6881279945373535 }, { "auxiliary_loss_clip": 0.01175202, "auxiliary_loss_mlp": 0.0102529, "balance_loss_clip": 1.05237091, "balance_loss_mlp": 1.01770818, "epoch": 0.6448626224974449, "flos": 25047298097280.0, "grad_norm": 2.18691292696936, "language_loss": 0.7474649, "learning_rate": 1.1833019456491518e-06, "loss": 0.7694698, "num_input_tokens_seen": 115409655, "step": 5363, "time_per_iteration": 5.369600534439087 }, { "auxiliary_loss_clip": 0.01161283, "auxiliary_loss_mlp": 0.01026507, "balance_loss_clip": 1.05266929, "balance_loss_mlp": 1.01906252, "epoch": 0.6449828653880839, "flos": 22530355263360.0, "grad_norm": 2.1922171715120795, "language_loss": 0.79465759, "learning_rate": 1.1825909421403871e-06, "loss": 0.81653547, "num_input_tokens_seen": 115428750, "step": 5364, "time_per_iteration": 3.536893129348755 }, { "auxiliary_loss_clip": 0.01160364, "auxiliary_loss_mlp": 0.01027447, "balance_loss_clip": 1.05145693, "balance_loss_mlp": 1.02011001, "epoch": 0.645103108278723, "flos": 25695736369920.0, "grad_norm": 1.8444249206177912, "language_loss": 0.76410091, "learning_rate": 1.181880062631926e-06, "loss": 0.78597903, "num_input_tokens_seen": 115448085, "step": 5365, "time_per_iteration": 2.629539966583252 }, { "auxiliary_loss_clip": 0.01135751, "auxiliary_loss_mlp": 0.01030864, "balance_loss_clip": 1.04831767, "balance_loss_mlp": 1.02321649, "epoch": 0.6452233511693621, "flos": 27450331925760.0, "grad_norm": 2.9218461090090937, "language_loss": 0.84842354, "learning_rate": 1.1811693072316093e-06, "loss": 0.87008965, "num_input_tokens_seen": 115465765, "step": 5366, "time_per_iteration": 2.7228944301605225 }, { "auxiliary_loss_clip": 0.01174208, "auxiliary_loss_mlp": 0.00711451, "balance_loss_clip": 1.05133688, "balance_loss_mlp": 1.00041366, "epoch": 0.6453435940600012, "flos": 19208618254080.0, "grad_norm": 2.711887714347483, "language_loss": 0.83891511, "learning_rate": 1.1804586760472574e-06, "loss": 0.85777169, "num_input_tokens_seen": 115482230, "step": 5367, "time_per_iteration": 2.545997381210327 }, { "auxiliary_loss_clip": 0.01123725, "auxiliary_loss_mlp": 0.0102797, "balance_loss_clip": 1.04545283, "balance_loss_mlp": 1.0206778, "epoch": 0.6454638369506402, "flos": 25737680476800.0, "grad_norm": 2.4147429951041777, "language_loss": 0.8027069, "learning_rate": 1.1797481691866736e-06, "loss": 0.82422388, "num_input_tokens_seen": 115499455, "step": 5368, "time_per_iteration": 2.712470769882202 }, { "auxiliary_loss_clip": 0.01134877, "auxiliary_loss_mlp": 0.01028772, "balance_loss_clip": 1.04980993, "balance_loss_mlp": 1.02163184, "epoch": 0.6455840798412794, "flos": 20989176364800.0, "grad_norm": 1.9399930164423758, "language_loss": 0.83103263, "learning_rate": 1.1790377867576393e-06, "loss": 0.85266912, "num_input_tokens_seen": 115517205, "step": 5369, "time_per_iteration": 2.644988536834717 }, { "auxiliary_loss_clip": 0.01145761, "auxiliary_loss_mlp": 0.01023107, "balance_loss_clip": 1.04870164, "balance_loss_mlp": 1.01621389, "epoch": 0.6457043227319185, "flos": 26067556005120.0, "grad_norm": 2.0752669689337306, "language_loss": 0.7654646, "learning_rate": 1.1783275288679203e-06, "loss": 0.78715324, "num_input_tokens_seen": 115534370, "step": 5370, "time_per_iteration": 2.6978678703308105 }, { "auxiliary_loss_clip": 0.01076645, "auxiliary_loss_mlp": 0.01001005, "balance_loss_clip": 1.03016305, "balance_loss_mlp": 0.99971718, "epoch": 0.6458245656225575, "flos": 60370831088640.0, "grad_norm": 0.8475836303821596, "language_loss": 0.57078797, "learning_rate": 1.177617395625262e-06, "loss": 0.59156448, "num_input_tokens_seen": 115592345, "step": 5371, "time_per_iteration": 3.134643316268921 }, { "auxiliary_loss_clip": 0.01157813, "auxiliary_loss_mlp": 0.010247, "balance_loss_clip": 1.05149627, "balance_loss_mlp": 1.01734495, "epoch": 0.6459448085131967, "flos": 23076771932160.0, "grad_norm": 1.8706036557766006, "language_loss": 0.75575703, "learning_rate": 1.1769073871373908e-06, "loss": 0.77758217, "num_input_tokens_seen": 115612550, "step": 5372, "time_per_iteration": 2.626591444015503 }, { "auxiliary_loss_clip": 0.01120736, "auxiliary_loss_mlp": 0.01028557, "balance_loss_clip": 1.04296541, "balance_loss_mlp": 1.02127385, "epoch": 0.6460650514038357, "flos": 22598190097920.0, "grad_norm": 1.8739355985296358, "language_loss": 0.83822906, "learning_rate": 1.176197503512015e-06, "loss": 0.85972202, "num_input_tokens_seen": 115632265, "step": 5373, "time_per_iteration": 2.646559476852417 }, { "auxiliary_loss_clip": 0.01136811, "auxiliary_loss_mlp": 0.01025481, "balance_loss_clip": 1.04685521, "balance_loss_mlp": 1.01811993, "epoch": 0.6461852942944748, "flos": 20266726118400.0, "grad_norm": 2.490824994830087, "language_loss": 0.82492298, "learning_rate": 1.1754877448568223e-06, "loss": 0.84654593, "num_input_tokens_seen": 115651720, "step": 5374, "time_per_iteration": 2.648470640182495 }, { "auxiliary_loss_clip": 0.01140052, "auxiliary_loss_mlp": 0.0102465, "balance_loss_clip": 1.04748142, "balance_loss_mlp": 1.01795387, "epoch": 0.646305537185114, "flos": 23367109564800.0, "grad_norm": 7.231738207303446, "language_loss": 0.90029907, "learning_rate": 1.1747781112794837e-06, "loss": 0.92194617, "num_input_tokens_seen": 115668215, "step": 5375, "time_per_iteration": 2.6254889965057373 }, { "auxiliary_loss_clip": 0.01126839, "auxiliary_loss_mlp": 0.01023886, "balance_loss_clip": 1.04941607, "balance_loss_mlp": 1.01709962, "epoch": 0.646425780075753, "flos": 24277480790400.0, "grad_norm": 1.7386661091152669, "language_loss": 0.83035803, "learning_rate": 1.1740686028876487e-06, "loss": 0.85186523, "num_input_tokens_seen": 115687080, "step": 5376, "time_per_iteration": 2.7063043117523193 }, { "auxiliary_loss_clip": 0.01155637, "auxiliary_loss_mlp": 0.0102101, "balance_loss_clip": 1.05051935, "balance_loss_mlp": 1.01382744, "epoch": 0.6465460229663921, "flos": 20813968800000.0, "grad_norm": 3.2905360161341695, "language_loss": 0.75080454, "learning_rate": 1.1733592197889507e-06, "loss": 0.77257097, "num_input_tokens_seen": 115703990, "step": 5377, "time_per_iteration": 2.6142354011535645 }, { "auxiliary_loss_clip": 0.0115187, "auxiliary_loss_mlp": 0.01022146, "balance_loss_clip": 1.04996789, "balance_loss_mlp": 1.01563382, "epoch": 0.6466662658570312, "flos": 22853299466880.0, "grad_norm": 2.3377212974280264, "language_loss": 0.72622561, "learning_rate": 1.1726499620910014e-06, "loss": 0.74796569, "num_input_tokens_seen": 115724270, "step": 5378, "time_per_iteration": 2.5944221019744873 }, { "auxiliary_loss_clip": 0.01155242, "auxiliary_loss_mlp": 0.01027422, "balance_loss_clip": 1.04963517, "balance_loss_mlp": 1.01996529, "epoch": 0.6467865087476703, "flos": 15304553953920.0, "grad_norm": 2.1783768394858445, "language_loss": 0.77992374, "learning_rate": 1.1719408299013955e-06, "loss": 0.80175042, "num_input_tokens_seen": 115742995, "step": 5379, "time_per_iteration": 2.61307692527771 }, { "auxiliary_loss_clip": 0.01176615, "auxiliary_loss_mlp": 0.0102661, "balance_loss_clip": 1.0550462, "balance_loss_mlp": 1.01984751, "epoch": 0.6469067516383094, "flos": 19573650218880.0, "grad_norm": 3.540148263899153, "language_loss": 0.76245284, "learning_rate": 1.1712318233277067e-06, "loss": 0.78448504, "num_input_tokens_seen": 115762015, "step": 5380, "time_per_iteration": 2.5630571842193604 }, { "auxiliary_loss_clip": 0.01075194, "auxiliary_loss_mlp": 0.01001804, "balance_loss_clip": 1.03013206, "balance_loss_mlp": 1.00057018, "epoch": 0.6470269945289485, "flos": 65098002522240.0, "grad_norm": 0.7502204075605605, "language_loss": 0.57809544, "learning_rate": 1.1705229424774916e-06, "loss": 0.59886545, "num_input_tokens_seen": 115816285, "step": 5381, "time_per_iteration": 3.0878891944885254 }, { "auxiliary_loss_clip": 0.01135978, "auxiliary_loss_mlp": 0.0102206, "balance_loss_clip": 1.0450871, "balance_loss_mlp": 1.01496708, "epoch": 0.6471472374195876, "flos": 30696943639680.0, "grad_norm": 1.6915920851708284, "language_loss": 0.64192975, "learning_rate": 1.1698141874582867e-06, "loss": 0.66351008, "num_input_tokens_seen": 115837330, "step": 5382, "time_per_iteration": 2.679372787475586 }, { "auxiliary_loss_clip": 0.01171456, "auxiliary_loss_mlp": 0.01021397, "balance_loss_clip": 1.05209756, "balance_loss_mlp": 1.01496589, "epoch": 0.6472674803102266, "flos": 20521835487360.0, "grad_norm": 1.9087710779437768, "language_loss": 0.72194457, "learning_rate": 1.169105558377609e-06, "loss": 0.74387306, "num_input_tokens_seen": 115857420, "step": 5383, "time_per_iteration": 2.584196090698242 }, { "auxiliary_loss_clip": 0.0110934, "auxiliary_loss_mlp": 0.00711211, "balance_loss_clip": 1.04888976, "balance_loss_mlp": 1.00042915, "epoch": 0.6473877232008658, "flos": 24715447320960.0, "grad_norm": 2.734968212361885, "language_loss": 0.79071832, "learning_rate": 1.1683970553429587e-06, "loss": 0.80892384, "num_input_tokens_seen": 115878875, "step": 5384, "time_per_iteration": 2.7394654750823975 }, { "auxiliary_loss_clip": 0.01131268, "auxiliary_loss_mlp": 0.01030201, "balance_loss_clip": 1.0498544, "balance_loss_mlp": 1.02242815, "epoch": 0.6475079660915048, "flos": 15885552441600.0, "grad_norm": 1.991851297624705, "language_loss": 0.82461405, "learning_rate": 1.1676886784618128e-06, "loss": 0.84622872, "num_input_tokens_seen": 115895540, "step": 5385, "time_per_iteration": 2.6827263832092285 }, { "auxiliary_loss_clip": 0.01156159, "auxiliary_loss_mlp": 0.01028788, "balance_loss_clip": 1.0490495, "balance_loss_mlp": 1.02128386, "epoch": 0.6476282089821439, "flos": 17381590922880.0, "grad_norm": 2.2375269419670967, "language_loss": 0.83994341, "learning_rate": 1.1669804278416332e-06, "loss": 0.8617928, "num_input_tokens_seen": 115910265, "step": 5386, "time_per_iteration": 2.5613367557525635 }, { "auxiliary_loss_clip": 0.01148209, "auxiliary_loss_mlp": 0.01027431, "balance_loss_clip": 1.05123496, "balance_loss_mlp": 1.01985502, "epoch": 0.6477484518727831, "flos": 20194078861440.0, "grad_norm": 2.9228426349147973, "language_loss": 0.72022879, "learning_rate": 1.1662723035898602e-06, "loss": 0.7419852, "num_input_tokens_seen": 115930025, "step": 5387, "time_per_iteration": 2.6207358837127686 }, { "auxiliary_loss_clip": 0.01156713, "auxiliary_loss_mlp": 0.01022272, "balance_loss_clip": 1.0519371, "balance_loss_mlp": 1.01446402, "epoch": 0.6478686947634221, "flos": 25410426641280.0, "grad_norm": 1.9928792756133054, "language_loss": 0.81382138, "learning_rate": 1.165564305813915e-06, "loss": 0.83561122, "num_input_tokens_seen": 115949025, "step": 5388, "time_per_iteration": 3.490893602371216 }, { "auxiliary_loss_clip": 0.01157153, "auxiliary_loss_mlp": 0.01024285, "balance_loss_clip": 1.0512706, "balance_loss_mlp": 1.01708198, "epoch": 0.6479889376540612, "flos": 20083581648000.0, "grad_norm": 1.871270769090359, "language_loss": 0.81098598, "learning_rate": 1.1648564346212019e-06, "loss": 0.83280039, "num_input_tokens_seen": 115968145, "step": 5389, "time_per_iteration": 3.4836301803588867 }, { "auxiliary_loss_clip": 0.0115262, "auxiliary_loss_mlp": 0.01026704, "balance_loss_clip": 1.05070436, "balance_loss_mlp": 1.01986742, "epoch": 0.6481091805447003, "flos": 26758082039040.0, "grad_norm": 2.1007250980375325, "language_loss": 0.76571703, "learning_rate": 1.164148690119104e-06, "loss": 0.78751022, "num_input_tokens_seen": 115989425, "step": 5390, "time_per_iteration": 4.407437562942505 }, { "auxiliary_loss_clip": 0.01171684, "auxiliary_loss_mlp": 0.01025174, "balance_loss_clip": 1.05230975, "balance_loss_mlp": 1.01814055, "epoch": 0.6482294234353394, "flos": 23952094462080.0, "grad_norm": 2.132697088883427, "language_loss": 0.74019325, "learning_rate": 1.163441072414985e-06, "loss": 0.76216185, "num_input_tokens_seen": 116009630, "step": 5391, "time_per_iteration": 2.5929667949676514 }, { "auxiliary_loss_clip": 0.01155606, "auxiliary_loss_mlp": 0.01028805, "balance_loss_clip": 1.05054367, "balance_loss_mlp": 1.02158713, "epoch": 0.6483496663259785, "flos": 26209833776640.0, "grad_norm": 2.176009390819796, "language_loss": 0.7035563, "learning_rate": 1.16273358161619e-06, "loss": 0.72540039, "num_input_tokens_seen": 116029965, "step": 5392, "time_per_iteration": 2.6180732250213623 }, { "auxiliary_loss_clip": 0.01150795, "auxiliary_loss_mlp": 0.01025789, "balance_loss_clip": 1.05056357, "balance_loss_mlp": 1.01846337, "epoch": 0.6484699092166175, "flos": 20922239370240.0, "grad_norm": 2.145539537701846, "language_loss": 0.83347452, "learning_rate": 1.1620262178300446e-06, "loss": 0.8552404, "num_input_tokens_seen": 116048580, "step": 5393, "time_per_iteration": 2.6197643280029297 }, { "auxiliary_loss_clip": 0.01123604, "auxiliary_loss_mlp": 0.01022148, "balance_loss_clip": 1.0443269, "balance_loss_mlp": 1.0155561, "epoch": 0.6485901521072567, "flos": 33072865678080.0, "grad_norm": 1.9900103084317058, "language_loss": 0.76115203, "learning_rate": 1.1613189811638563e-06, "loss": 0.78260952, "num_input_tokens_seen": 116070305, "step": 5394, "time_per_iteration": 2.8150458335876465 }, { "auxiliary_loss_clip": 0.01160911, "auxiliary_loss_mlp": 0.01021722, "balance_loss_clip": 1.05321503, "balance_loss_mlp": 1.01489115, "epoch": 0.6487103949978957, "flos": 22274060745600.0, "grad_norm": 1.6317690701861558, "language_loss": 0.78260046, "learning_rate": 1.1606118717249117e-06, "loss": 0.80442679, "num_input_tokens_seen": 116090405, "step": 5395, "time_per_iteration": 2.6364917755126953 }, { "auxiliary_loss_clip": 0.01175121, "auxiliary_loss_mlp": 0.01027179, "balance_loss_clip": 1.0503366, "balance_loss_mlp": 1.01931143, "epoch": 0.6488306378885348, "flos": 22930400010240.0, "grad_norm": 1.773582970422982, "language_loss": 0.6810348, "learning_rate": 1.1599048896204787e-06, "loss": 0.70305783, "num_input_tokens_seen": 116110285, "step": 5396, "time_per_iteration": 2.5435709953308105 }, { "auxiliary_loss_clip": 0.0112995, "auxiliary_loss_mlp": 0.01023778, "balance_loss_clip": 1.04797959, "balance_loss_mlp": 1.01686668, "epoch": 0.648950880779174, "flos": 20376110010240.0, "grad_norm": 1.9762208312712037, "language_loss": 0.80695522, "learning_rate": 1.1591980349578061e-06, "loss": 0.82849252, "num_input_tokens_seen": 116128955, "step": 5397, "time_per_iteration": 2.6862006187438965 }, { "auxiliary_loss_clip": 0.01051595, "auxiliary_loss_mlp": 0.01002992, "balance_loss_clip": 1.0299983, "balance_loss_mlp": 1.00177598, "epoch": 0.649071123669813, "flos": 59930889310080.0, "grad_norm": 0.7319737726657107, "language_loss": 0.54298258, "learning_rate": 1.158491307844123e-06, "loss": 0.56352842, "num_input_tokens_seen": 116188875, "step": 5398, "time_per_iteration": 3.2287437915802 }, { "auxiliary_loss_clip": 0.01141269, "auxiliary_loss_mlp": 0.01024064, "balance_loss_clip": 1.0502528, "balance_loss_mlp": 1.01723945, "epoch": 0.6491913665604521, "flos": 20446566537600.0, "grad_norm": 1.9134119012688793, "language_loss": 0.83982551, "learning_rate": 1.1577847083866387e-06, "loss": 0.86147887, "num_input_tokens_seen": 116207910, "step": 5399, "time_per_iteration": 2.675676107406616 }, { "auxiliary_loss_clip": 0.01129966, "auxiliary_loss_mlp": 0.01021688, "balance_loss_clip": 1.04591346, "balance_loss_mlp": 1.01430321, "epoch": 0.6493116094510912, "flos": 16946820702720.0, "grad_norm": 2.272418108126368, "language_loss": 0.72275186, "learning_rate": 1.1570782366925453e-06, "loss": 0.7442683, "num_input_tokens_seen": 116226425, "step": 5400, "time_per_iteration": 2.615597724914551 }, { "auxiliary_loss_clip": 0.01140796, "auxiliary_loss_mlp": 0.01026538, "balance_loss_clip": 1.04523826, "balance_loss_mlp": 1.01889682, "epoch": 0.6494318523417303, "flos": 18802935072000.0, "grad_norm": 1.9313905066273316, "language_loss": 0.75612712, "learning_rate": 1.1563718928690132e-06, "loss": 0.77780044, "num_input_tokens_seen": 116243860, "step": 5401, "time_per_iteration": 2.604917049407959 }, { "auxiliary_loss_clip": 0.01123755, "auxiliary_loss_mlp": 0.01030227, "balance_loss_clip": 1.04812884, "balance_loss_mlp": 1.02302122, "epoch": 0.6495520952323693, "flos": 18982847318400.0, "grad_norm": 2.322516703015599, "language_loss": 0.71380091, "learning_rate": 1.1556656770231942e-06, "loss": 0.73534071, "num_input_tokens_seen": 116260055, "step": 5402, "time_per_iteration": 2.680527448654175 }, { "auxiliary_loss_clip": 0.01156312, "auxiliary_loss_mlp": 0.01024216, "balance_loss_clip": 1.04864907, "balance_loss_mlp": 1.01790071, "epoch": 0.6496723381230085, "flos": 22745388032640.0, "grad_norm": 1.734165523252056, "language_loss": 0.76638085, "learning_rate": 1.1549595892622207e-06, "loss": 0.78818619, "num_input_tokens_seen": 116278825, "step": 5403, "time_per_iteration": 2.6199705600738525 }, { "auxiliary_loss_clip": 0.01024328, "auxiliary_loss_mlp": 0.01004977, "balance_loss_clip": 1.02489519, "balance_loss_mlp": 1.00399911, "epoch": 0.6497925810136476, "flos": 62145283887360.0, "grad_norm": 0.8370178568846707, "language_loss": 0.58946025, "learning_rate": 1.1542536296932047e-06, "loss": 0.60975337, "num_input_tokens_seen": 116342360, "step": 5404, "time_per_iteration": 3.2555477619171143 }, { "auxiliary_loss_clip": 0.01132886, "auxiliary_loss_mlp": 0.01029617, "balance_loss_clip": 1.04742718, "balance_loss_mlp": 1.02182639, "epoch": 0.6499128239042866, "flos": 20156731695360.0, "grad_norm": 1.8790549061046664, "language_loss": 0.7038244, "learning_rate": 1.1535477984232414e-06, "loss": 0.72544944, "num_input_tokens_seen": 116362235, "step": 5405, "time_per_iteration": 2.661259174346924 }, { "auxiliary_loss_clip": 0.01109178, "auxiliary_loss_mlp": 0.01024619, "balance_loss_clip": 1.040501, "balance_loss_mlp": 1.01747203, "epoch": 0.6500330667949258, "flos": 24462420940800.0, "grad_norm": 1.9579699873604672, "language_loss": 0.76581299, "learning_rate": 1.152842095559404e-06, "loss": 0.78715098, "num_input_tokens_seen": 116382895, "step": 5406, "time_per_iteration": 2.7626760005950928 }, { "auxiliary_loss_clip": 0.01145599, "auxiliary_loss_mlp": 0.0102503, "balance_loss_clip": 1.04734159, "balance_loss_mlp": 1.01829505, "epoch": 0.6501533096855648, "flos": 25477399549440.0, "grad_norm": 1.9274599589610932, "language_loss": 0.76811558, "learning_rate": 1.1521365212087474e-06, "loss": 0.78982186, "num_input_tokens_seen": 116402880, "step": 5407, "time_per_iteration": 2.781454563140869 }, { "auxiliary_loss_clip": 0.01155958, "auxiliary_loss_mlp": 0.01025472, "balance_loss_clip": 1.04862976, "balance_loss_mlp": 1.01800394, "epoch": 0.6502735525762039, "flos": 44819245347840.0, "grad_norm": 1.9742964149028814, "language_loss": 0.70978308, "learning_rate": 1.1514310754783062e-06, "loss": 0.73159742, "num_input_tokens_seen": 116425830, "step": 5408, "time_per_iteration": 2.815622091293335 }, { "auxiliary_loss_clip": 0.01147446, "auxiliary_loss_mlp": 0.0102983, "balance_loss_clip": 1.05166841, "balance_loss_mlp": 1.0227102, "epoch": 0.6503937954668431, "flos": 28658546726400.0, "grad_norm": 2.017493304353741, "language_loss": 0.73471582, "learning_rate": 1.1507257584750964e-06, "loss": 0.75648862, "num_input_tokens_seen": 116446010, "step": 5409, "time_per_iteration": 2.7087059020996094 }, { "auxiliary_loss_clip": 0.01176376, "auxiliary_loss_mlp": 0.01028976, "balance_loss_clip": 1.05475819, "balance_loss_mlp": 1.0214417, "epoch": 0.6505140383574821, "flos": 20922562592640.0, "grad_norm": 1.853662951191284, "language_loss": 0.77551627, "learning_rate": 1.150020570306113e-06, "loss": 0.79756975, "num_input_tokens_seen": 116465150, "step": 5410, "time_per_iteration": 2.54845929145813 }, { "auxiliary_loss_clip": 0.01134191, "auxiliary_loss_mlp": 0.01033941, "balance_loss_clip": 1.04556346, "balance_loss_mlp": 1.02640438, "epoch": 0.6506342812481212, "flos": 20595236929920.0, "grad_norm": 2.131655204623006, "language_loss": 0.75172329, "learning_rate": 1.1493155110783338e-06, "loss": 0.7734046, "num_input_tokens_seen": 116483675, "step": 5411, "time_per_iteration": 2.676119327545166 }, { "auxiliary_loss_clip": 0.0115825, "auxiliary_loss_mlp": 0.01026676, "balance_loss_clip": 1.05284882, "balance_loss_mlp": 1.01934779, "epoch": 0.6507545241387603, "flos": 30226478279040.0, "grad_norm": 2.5805920022711484, "language_loss": 0.70517063, "learning_rate": 1.1486105808987155e-06, "loss": 0.72701991, "num_input_tokens_seen": 116505165, "step": 5412, "time_per_iteration": 2.6593732833862305 }, { "auxiliary_loss_clip": 0.01160822, "auxiliary_loss_mlp": 0.01028918, "balance_loss_clip": 1.05293727, "balance_loss_mlp": 1.02144074, "epoch": 0.6508747670293994, "flos": 17128241320320.0, "grad_norm": 1.9142471046653506, "language_loss": 0.81339133, "learning_rate": 1.1479057798741947e-06, "loss": 0.83528876, "num_input_tokens_seen": 116523220, "step": 5413, "time_per_iteration": 2.634061336517334 }, { "auxiliary_loss_clip": 0.01070402, "auxiliary_loss_mlp": 0.01004232, "balance_loss_clip": 1.04392147, "balance_loss_mlp": 1.00318909, "epoch": 0.6509950099200384, "flos": 68559826573440.0, "grad_norm": 0.787213744282284, "language_loss": 0.53298783, "learning_rate": 1.14720110811169e-06, "loss": 0.55373418, "num_input_tokens_seen": 116580450, "step": 5414, "time_per_iteration": 4.187873840332031 }, { "auxiliary_loss_clip": 0.01162607, "auxiliary_loss_mlp": 0.01027695, "balance_loss_clip": 1.05343366, "balance_loss_mlp": 1.01986885, "epoch": 0.6511152528106776, "flos": 22347462188160.0, "grad_norm": 2.058026423947126, "language_loss": 0.76678765, "learning_rate": 1.146496565718098e-06, "loss": 0.78869063, "num_input_tokens_seen": 116601020, "step": 5415, "time_per_iteration": 3.5284531116485596 }, { "auxiliary_loss_clip": 0.0114224, "auxiliary_loss_mlp": 0.01031801, "balance_loss_clip": 1.04936469, "balance_loss_mlp": 1.0239687, "epoch": 0.6512354957013167, "flos": 20522158709760.0, "grad_norm": 3.106200003876561, "language_loss": 0.75899583, "learning_rate": 1.1457921528002996e-06, "loss": 0.78073621, "num_input_tokens_seen": 116619455, "step": 5416, "time_per_iteration": 3.57894229888916 }, { "auxiliary_loss_clip": 0.01175025, "auxiliary_loss_mlp": 0.00711786, "balance_loss_clip": 1.0523119, "balance_loss_mlp": 1.00044394, "epoch": 0.6513557385919557, "flos": 32337342881280.0, "grad_norm": 2.6925835453863285, "language_loss": 0.72255397, "learning_rate": 1.1450878694651522e-06, "loss": 0.74142206, "num_input_tokens_seen": 116640020, "step": 5417, "time_per_iteration": 2.6312131881713867 }, { "auxiliary_loss_clip": 0.01110027, "auxiliary_loss_mlp": 0.01026405, "balance_loss_clip": 1.04305744, "balance_loss_mlp": 1.01890707, "epoch": 0.6514759814825949, "flos": 12093206417280.0, "grad_norm": 2.5547841848846566, "language_loss": 0.63574511, "learning_rate": 1.1443837158194954e-06, "loss": 0.65710944, "num_input_tokens_seen": 116655165, "step": 5418, "time_per_iteration": 2.7136716842651367 }, { "auxiliary_loss_clip": 0.01128967, "auxiliary_loss_mlp": 0.01029085, "balance_loss_clip": 1.05203819, "balance_loss_mlp": 1.02232552, "epoch": 0.651596224373234, "flos": 22526907557760.0, "grad_norm": 1.733341249904421, "language_loss": 0.7459836, "learning_rate": 1.1436796919701484e-06, "loss": 0.76756406, "num_input_tokens_seen": 116673880, "step": 5419, "time_per_iteration": 2.685354471206665 }, { "auxiliary_loss_clip": 0.01142064, "auxiliary_loss_mlp": 0.01026355, "balance_loss_clip": 1.04997849, "balance_loss_mlp": 1.01847529, "epoch": 0.651716467263873, "flos": 27818955250560.0, "grad_norm": 2.0217206004354527, "language_loss": 0.62123203, "learning_rate": 1.1429757980239115e-06, "loss": 0.64291626, "num_input_tokens_seen": 116694305, "step": 5420, "time_per_iteration": 2.6713814735412598 }, { "auxiliary_loss_clip": 0.0117426, "auxiliary_loss_mlp": 0.01032272, "balance_loss_clip": 1.05056965, "balance_loss_mlp": 1.02445197, "epoch": 0.6518367101545122, "flos": 24316300414080.0, "grad_norm": 3.1447943511844243, "language_loss": 0.81860077, "learning_rate": 1.1422720340875636e-06, "loss": 0.84066606, "num_input_tokens_seen": 116713055, "step": 5421, "time_per_iteration": 2.626704692840576 }, { "auxiliary_loss_clip": 0.01162305, "auxiliary_loss_mlp": 0.01029694, "balance_loss_clip": 1.05039346, "balance_loss_mlp": 1.02270579, "epoch": 0.6519569530451512, "flos": 20011939971840.0, "grad_norm": 2.2907498404331097, "language_loss": 0.79401612, "learning_rate": 1.1415684002678671e-06, "loss": 0.81593615, "num_input_tokens_seen": 116731815, "step": 5422, "time_per_iteration": 2.5865187644958496 }, { "auxiliary_loss_clip": 0.01140164, "auxiliary_loss_mlp": 0.01023663, "balance_loss_clip": 1.04544449, "balance_loss_mlp": 1.01580977, "epoch": 0.6520771959357903, "flos": 21576064682880.0, "grad_norm": 3.2861250879387565, "language_loss": 0.77551317, "learning_rate": 1.1408648966715617e-06, "loss": 0.79715145, "num_input_tokens_seen": 116749335, "step": 5423, "time_per_iteration": 2.656658411026001 }, { "auxiliary_loss_clip": 0.01140444, "auxiliary_loss_mlp": 0.0102465, "balance_loss_clip": 1.04469609, "balance_loss_mlp": 1.01662731, "epoch": 0.6521974388264293, "flos": 22711021695360.0, "grad_norm": 1.9917889756746736, "language_loss": 0.72535574, "learning_rate": 1.1401615234053683e-06, "loss": 0.7470066, "num_input_tokens_seen": 116768155, "step": 5424, "time_per_iteration": 2.614471435546875 }, { "auxiliary_loss_clip": 0.0114334, "auxiliary_loss_mlp": 0.01025214, "balance_loss_clip": 1.04766262, "balance_loss_mlp": 1.01764417, "epoch": 0.6523176817170685, "flos": 23002939526400.0, "grad_norm": 1.892943358599982, "language_loss": 0.76072317, "learning_rate": 1.1394582805759885e-06, "loss": 0.78240871, "num_input_tokens_seen": 116787435, "step": 5425, "time_per_iteration": 2.677377700805664 }, { "auxiliary_loss_clip": 0.01158678, "auxiliary_loss_mlp": 0.01026526, "balance_loss_clip": 1.05213356, "balance_loss_mlp": 1.01931071, "epoch": 0.6524379246077076, "flos": 21688249835520.0, "grad_norm": 1.9639162904338017, "language_loss": 0.75944173, "learning_rate": 1.1387551682901022e-06, "loss": 0.78129375, "num_input_tokens_seen": 116808040, "step": 5426, "time_per_iteration": 2.623952865600586 }, { "auxiliary_loss_clip": 0.01121732, "auxiliary_loss_mlp": 0.01024788, "balance_loss_clip": 1.04493177, "balance_loss_mlp": 1.01740885, "epoch": 0.6525581674983466, "flos": 19390936711680.0, "grad_norm": 2.067002421414778, "language_loss": 0.70848584, "learning_rate": 1.138052186654373e-06, "loss": 0.72995108, "num_input_tokens_seen": 116825510, "step": 5427, "time_per_iteration": 2.6691620349884033 }, { "auxiliary_loss_clip": 0.01143882, "auxiliary_loss_mlp": 0.01029155, "balance_loss_clip": 1.04975283, "balance_loss_mlp": 1.0217495, "epoch": 0.6526784103889858, "flos": 17165444832000.0, "grad_norm": 2.122580616183352, "language_loss": 0.87731534, "learning_rate": 1.1373493357754417e-06, "loss": 0.89904571, "num_input_tokens_seen": 116844415, "step": 5428, "time_per_iteration": 2.574073076248169 }, { "auxiliary_loss_clip": 0.01170525, "auxiliary_loss_mlp": 0.01024148, "balance_loss_clip": 1.04871821, "balance_loss_mlp": 1.01696241, "epoch": 0.6527986532796248, "flos": 18989168112000.0, "grad_norm": 18.105797036543784, "language_loss": 0.77201873, "learning_rate": 1.1366466157599303e-06, "loss": 0.79396546, "num_input_tokens_seen": 116863690, "step": 5429, "time_per_iteration": 2.5939371585845947 }, { "auxiliary_loss_clip": 0.01105897, "auxiliary_loss_mlp": 0.00711435, "balance_loss_clip": 1.04311585, "balance_loss_mlp": 1.00047362, "epoch": 0.6529188961702639, "flos": 14238581011200.0, "grad_norm": 4.15824046717247, "language_loss": 0.76652837, "learning_rate": 1.1359440267144412e-06, "loss": 0.7847017, "num_input_tokens_seen": 116881145, "step": 5430, "time_per_iteration": 2.657346487045288 }, { "auxiliary_loss_clip": 0.01157673, "auxiliary_loss_mlp": 0.01024261, "balance_loss_clip": 1.04907513, "balance_loss_mlp": 1.01722741, "epoch": 0.653039139060903, "flos": 36682929158400.0, "grad_norm": 2.686502950695107, "language_loss": 0.74092269, "learning_rate": 1.1352415687455556e-06, "loss": 0.76274204, "num_input_tokens_seen": 116902405, "step": 5431, "time_per_iteration": 2.748901128768921 }, { "auxiliary_loss_clip": 0.01158179, "auxiliary_loss_mlp": 0.01026861, "balance_loss_clip": 1.05178607, "balance_loss_mlp": 1.01936007, "epoch": 0.6531593819515421, "flos": 25376275785600.0, "grad_norm": 2.3756905448908077, "language_loss": 0.63728333, "learning_rate": 1.1345392419598362e-06, "loss": 0.65913379, "num_input_tokens_seen": 116921285, "step": 5432, "time_per_iteration": 2.6115293502807617 }, { "auxiliary_loss_clip": 0.0115161, "auxiliary_loss_mlp": 0.01027828, "balance_loss_clip": 1.04847884, "balance_loss_mlp": 1.02089643, "epoch": 0.6532796248421812, "flos": 21178533888000.0, "grad_norm": 2.262433283215008, "language_loss": 0.72151339, "learning_rate": 1.1338370464638263e-06, "loss": 0.74330783, "num_input_tokens_seen": 116940685, "step": 5433, "time_per_iteration": 2.676485061645508 }, { "auxiliary_loss_clip": 0.01171383, "auxiliary_loss_mlp": 0.01020936, "balance_loss_clip": 1.04981875, "balance_loss_mlp": 1.01401305, "epoch": 0.6533998677328203, "flos": 17675950878720.0, "grad_norm": 2.494612360894141, "language_loss": 0.64201194, "learning_rate": 1.1331349823640474e-06, "loss": 0.66393512, "num_input_tokens_seen": 116958115, "step": 5434, "time_per_iteration": 2.5316781997680664 }, { "auxiliary_loss_clip": 0.01156235, "auxiliary_loss_mlp": 0.00710612, "balance_loss_clip": 1.04853439, "balance_loss_mlp": 1.00052238, "epoch": 0.6535201106234594, "flos": 28400384701440.0, "grad_norm": 2.833347867477342, "language_loss": 0.77884758, "learning_rate": 1.132433049767003e-06, "loss": 0.79751599, "num_input_tokens_seen": 116976030, "step": 5435, "time_per_iteration": 2.685279607772827 }, { "auxiliary_loss_clip": 0.01140019, "auxiliary_loss_mlp": 0.01023913, "balance_loss_clip": 1.04820466, "balance_loss_mlp": 1.01683211, "epoch": 0.6536403535140984, "flos": 23586667447680.0, "grad_norm": 1.5806257043345309, "language_loss": 0.8115207, "learning_rate": 1.1317312487791748e-06, "loss": 0.83315992, "num_input_tokens_seen": 116997680, "step": 5436, "time_per_iteration": 2.663898468017578 }, { "auxiliary_loss_clip": 0.01150139, "auxiliary_loss_mlp": 0.01026243, "balance_loss_clip": 1.04680061, "balance_loss_mlp": 1.01894116, "epoch": 0.6537605964047376, "flos": 21579476474880.0, "grad_norm": 2.2299953108247537, "language_loss": 0.73430008, "learning_rate": 1.1310295795070253e-06, "loss": 0.75606394, "num_input_tokens_seen": 117017620, "step": 5437, "time_per_iteration": 2.6310880184173584 }, { "auxiliary_loss_clip": 0.01115221, "auxiliary_loss_mlp": 0.01030788, "balance_loss_clip": 1.04574823, "balance_loss_mlp": 1.02362061, "epoch": 0.6538808392953767, "flos": 26833997433600.0, "grad_norm": 1.7734577846020085, "language_loss": 0.81133908, "learning_rate": 1.1303280420569982e-06, "loss": 0.8327992, "num_input_tokens_seen": 117039505, "step": 5438, "time_per_iteration": 2.7424404621124268 }, { "auxiliary_loss_clip": 0.01150384, "auxiliary_loss_mlp": 0.01026786, "balance_loss_clip": 1.04750824, "balance_loss_mlp": 1.01959181, "epoch": 0.6540010821860157, "flos": 30738241301760.0, "grad_norm": 2.528638602907611, "language_loss": 0.77527726, "learning_rate": 1.1296266365355158e-06, "loss": 0.79704899, "num_input_tokens_seen": 117062890, "step": 5439, "time_per_iteration": 2.669041156768799 }, { "auxiliary_loss_clip": 0.01131842, "auxiliary_loss_mlp": 0.01029368, "balance_loss_clip": 1.05005348, "balance_loss_mlp": 1.02118397, "epoch": 0.6541213250766549, "flos": 26907147480960.0, "grad_norm": 1.9718656608453322, "language_loss": 0.73193228, "learning_rate": 1.1289253630489806e-06, "loss": 0.75354439, "num_input_tokens_seen": 117083940, "step": 5440, "time_per_iteration": 2.7361228466033936 }, { "auxiliary_loss_clip": 0.01162636, "auxiliary_loss_mlp": 0.0102556, "balance_loss_clip": 1.04988635, "balance_loss_mlp": 1.01758504, "epoch": 0.6542415679672939, "flos": 19172384409600.0, "grad_norm": 2.3754323777424213, "language_loss": 0.72769952, "learning_rate": 1.1282242217037753e-06, "loss": 0.74958146, "num_input_tokens_seen": 117101440, "step": 5441, "time_per_iteration": 4.495873928070068 }, { "auxiliary_loss_clip": 0.01105251, "auxiliary_loss_mlp": 0.01027999, "balance_loss_clip": 1.04269516, "balance_loss_mlp": 1.02003014, "epoch": 0.654361810857933, "flos": 48173517100800.0, "grad_norm": 2.0938503375331416, "language_loss": 0.61890018, "learning_rate": 1.127523212606262e-06, "loss": 0.64023268, "num_input_tokens_seen": 117124265, "step": 5442, "time_per_iteration": 4.7419843673706055 }, { "auxiliary_loss_clip": 0.01155284, "auxiliary_loss_mlp": 0.01024125, "balance_loss_clip": 1.04856849, "balance_loss_mlp": 1.01724362, "epoch": 0.6544820537485722, "flos": 26943165843840.0, "grad_norm": 6.522382134512687, "language_loss": 0.73263687, "learning_rate": 1.1268223358627835e-06, "loss": 0.75443101, "num_input_tokens_seen": 117146755, "step": 5443, "time_per_iteration": 2.6682395935058594 }, { "auxiliary_loss_clip": 0.01174001, "auxiliary_loss_mlp": 0.01028702, "balance_loss_clip": 1.05070782, "balance_loss_mlp": 1.02114701, "epoch": 0.6546022966392112, "flos": 20886328748160.0, "grad_norm": 1.9466531824882154, "language_loss": 0.72063673, "learning_rate": 1.126121591579663e-06, "loss": 0.74266374, "num_input_tokens_seen": 117165960, "step": 5444, "time_per_iteration": 2.6390275955200195 }, { "auxiliary_loss_clip": 0.0115566, "auxiliary_loss_mlp": 0.01024952, "balance_loss_clip": 1.05198622, "balance_loss_mlp": 1.01741219, "epoch": 0.6547225395298503, "flos": 24936693143040.0, "grad_norm": 1.9749098966902245, "language_loss": 0.68963385, "learning_rate": 1.1254209798632018e-06, "loss": 0.71143997, "num_input_tokens_seen": 117186980, "step": 5445, "time_per_iteration": 2.6147239208221436 }, { "auxiliary_loss_clip": 0.01084197, "auxiliary_loss_mlp": 0.0103031, "balance_loss_clip": 1.04304481, "balance_loss_mlp": 1.02275515, "epoch": 0.6548427824204894, "flos": 22565942663040.0, "grad_norm": 1.9511499070148222, "language_loss": 0.8479383, "learning_rate": 1.124720500819683e-06, "loss": 0.86908346, "num_input_tokens_seen": 117205135, "step": 5446, "time_per_iteration": 2.7346532344818115 }, { "auxiliary_loss_clip": 0.01176905, "auxiliary_loss_mlp": 0.01025317, "balance_loss_clip": 1.05366445, "balance_loss_mlp": 1.01731801, "epoch": 0.6549630253111285, "flos": 18442500048000.0, "grad_norm": 1.9854727185274885, "language_loss": 0.82615286, "learning_rate": 1.1240201545553682e-06, "loss": 0.84817511, "num_input_tokens_seen": 117222935, "step": 5447, "time_per_iteration": 2.561744213104248 }, { "auxiliary_loss_clip": 0.01124412, "auxiliary_loss_mlp": 0.01027909, "balance_loss_clip": 1.04736626, "balance_loss_mlp": 1.02036595, "epoch": 0.6550832682017675, "flos": 25187313312000.0, "grad_norm": 3.061283033236197, "language_loss": 0.73250961, "learning_rate": 1.1233199411764987e-06, "loss": 0.75403285, "num_input_tokens_seen": 117242370, "step": 5448, "time_per_iteration": 2.710514545440674 }, { "auxiliary_loss_clip": 0.01111475, "auxiliary_loss_mlp": 0.01025133, "balance_loss_clip": 1.04200411, "balance_loss_mlp": 1.01850772, "epoch": 0.6552035110924067, "flos": 22748153379840.0, "grad_norm": 1.9215725493907592, "language_loss": 0.6913923, "learning_rate": 1.1226198607892978e-06, "loss": 0.71275842, "num_input_tokens_seen": 117262930, "step": 5449, "time_per_iteration": 2.66815447807312 }, { "auxiliary_loss_clip": 0.01112183, "auxiliary_loss_mlp": 0.0102535, "balance_loss_clip": 1.04748583, "balance_loss_mlp": 1.01820087, "epoch": 0.6553237539830458, "flos": 21799178012160.0, "grad_norm": 1.8889882340046515, "language_loss": 0.79776263, "learning_rate": 1.1219199134999664e-06, "loss": 0.81913793, "num_input_tokens_seen": 117281430, "step": 5450, "time_per_iteration": 2.734046459197998 }, { "auxiliary_loss_clip": 0.01145991, "auxiliary_loss_mlp": 0.01023285, "balance_loss_clip": 1.04960728, "balance_loss_mlp": 1.01534569, "epoch": 0.6554439968736848, "flos": 20887226588160.0, "grad_norm": 2.565109525632711, "language_loss": 0.78940976, "learning_rate": 1.1212200994146863e-06, "loss": 0.81110251, "num_input_tokens_seen": 117299185, "step": 5451, "time_per_iteration": 2.643658399581909 }, { "auxiliary_loss_clip": 0.01122405, "auxiliary_loss_mlp": 0.01023218, "balance_loss_clip": 1.04210973, "balance_loss_mlp": 1.01628304, "epoch": 0.655564239764324, "flos": 16139045698560.0, "grad_norm": 2.4714547293705884, "language_loss": 0.75682622, "learning_rate": 1.120520418639618e-06, "loss": 0.7782824, "num_input_tokens_seen": 117317720, "step": 5452, "time_per_iteration": 2.695732355117798 }, { "auxiliary_loss_clip": 0.01159363, "auxiliary_loss_mlp": 0.01023244, "balance_loss_clip": 1.05252457, "balance_loss_mlp": 1.01637769, "epoch": 0.655684482654963, "flos": 29570354496000.0, "grad_norm": 3.257420164336547, "language_loss": 0.83513236, "learning_rate": 1.119820871280903e-06, "loss": 0.85695839, "num_input_tokens_seen": 117338795, "step": 5453, "time_per_iteration": 2.689340114593506 }, { "auxiliary_loss_clip": 0.01155461, "auxiliary_loss_mlp": 0.0102808, "balance_loss_clip": 1.04913759, "balance_loss_mlp": 1.02076089, "epoch": 0.6558047255456021, "flos": 29789409588480.0, "grad_norm": 2.051497122904288, "language_loss": 0.73543572, "learning_rate": 1.1191214574446614e-06, "loss": 0.75727117, "num_input_tokens_seen": 117359040, "step": 5454, "time_per_iteration": 2.6605467796325684 }, { "auxiliary_loss_clip": 0.01134394, "auxiliary_loss_mlp": 0.01023542, "balance_loss_clip": 1.04555607, "balance_loss_mlp": 1.01640785, "epoch": 0.6559249684362413, "flos": 29059166090880.0, "grad_norm": 1.9395518142322241, "language_loss": 0.80151683, "learning_rate": 1.118422177236995e-06, "loss": 0.82309622, "num_input_tokens_seen": 117380865, "step": 5455, "time_per_iteration": 2.6882753372192383 }, { "auxiliary_loss_clip": 0.01144095, "auxiliary_loss_mlp": 0.0102759, "balance_loss_clip": 1.0490315, "balance_loss_mlp": 1.01986551, "epoch": 0.6560452113268803, "flos": 20225464369920.0, "grad_norm": 5.574606955234599, "language_loss": 0.85782349, "learning_rate": 1.1177230307639835e-06, "loss": 0.87954032, "num_input_tokens_seen": 117398405, "step": 5456, "time_per_iteration": 2.658731698989868 }, { "auxiliary_loss_clip": 0.01121253, "auxiliary_loss_mlp": 0.01026829, "balance_loss_clip": 1.04690051, "balance_loss_mlp": 1.01906812, "epoch": 0.6561654542175194, "flos": 25045538330880.0, "grad_norm": 1.7441013611306397, "language_loss": 0.78662455, "learning_rate": 1.1170240181316865e-06, "loss": 0.80810535, "num_input_tokens_seen": 117419850, "step": 5457, "time_per_iteration": 2.741670608520508 }, { "auxiliary_loss_clip": 0.01121137, "auxiliary_loss_mlp": 0.01023583, "balance_loss_clip": 1.04427946, "balance_loss_mlp": 1.01590633, "epoch": 0.6562856971081584, "flos": 22856711258880.0, "grad_norm": 2.491938300189942, "language_loss": 0.7967968, "learning_rate": 1.1163251394461442e-06, "loss": 0.81824398, "num_input_tokens_seen": 117438330, "step": 5458, "time_per_iteration": 2.6570191383361816 }, { "auxiliary_loss_clip": 0.01154584, "auxiliary_loss_mlp": 0.01031585, "balance_loss_clip": 1.04966259, "balance_loss_mlp": 1.02372956, "epoch": 0.6564059399987976, "flos": 18872565586560.0, "grad_norm": 1.9702655784247187, "language_loss": 0.82561326, "learning_rate": 1.1156263948133746e-06, "loss": 0.84747493, "num_input_tokens_seen": 117454985, "step": 5459, "time_per_iteration": 2.598419189453125 }, { "auxiliary_loss_clip": 0.01104185, "auxiliary_loss_mlp": 0.00711899, "balance_loss_clip": 1.04660952, "balance_loss_mlp": 1.00054002, "epoch": 0.6565261828894366, "flos": 25484187219840.0, "grad_norm": 2.4049917531998655, "language_loss": 0.77814138, "learning_rate": 1.1149277843393787e-06, "loss": 0.79630214, "num_input_tokens_seen": 117476145, "step": 5460, "time_per_iteration": 2.716931104660034 }, { "auxiliary_loss_clip": 0.01091659, "auxiliary_loss_mlp": 0.00711233, "balance_loss_clip": 1.04117131, "balance_loss_mlp": 1.00052798, "epoch": 0.6566464257800757, "flos": 19683500987520.0, "grad_norm": 2.0455193401538705, "language_loss": 0.63401145, "learning_rate": 1.1142293081301342e-06, "loss": 0.65204036, "num_input_tokens_seen": 117494025, "step": 5461, "time_per_iteration": 2.7976205348968506 }, { "auxiliary_loss_clip": 0.01133577, "auxiliary_loss_mlp": 0.01025023, "balance_loss_clip": 1.04451478, "balance_loss_mlp": 1.01792073, "epoch": 0.6567666686707149, "flos": 23514127931520.0, "grad_norm": 1.6982110192465847, "language_loss": 0.67798436, "learning_rate": 1.1135309662915995e-06, "loss": 0.69957036, "num_input_tokens_seen": 117514190, "step": 5462, "time_per_iteration": 2.696777820587158 }, { "auxiliary_loss_clip": 0.01114623, "auxiliary_loss_mlp": 0.0102586, "balance_loss_clip": 1.04367495, "balance_loss_mlp": 1.01836467, "epoch": 0.6568869115613539, "flos": 32781342896640.0, "grad_norm": 2.624263701024384, "language_loss": 0.60798091, "learning_rate": 1.112832758929712e-06, "loss": 0.62938577, "num_input_tokens_seen": 117536800, "step": 5463, "time_per_iteration": 2.7948813438415527 }, { "auxiliary_loss_clip": 0.01157135, "auxiliary_loss_mlp": 0.01023342, "balance_loss_clip": 1.05123019, "balance_loss_mlp": 1.0156827, "epoch": 0.657007154451993, "flos": 18442428220800.0, "grad_norm": 2.6133851178231517, "language_loss": 0.75136226, "learning_rate": 1.11213468615039e-06, "loss": 0.77316701, "num_input_tokens_seen": 117556230, "step": 5464, "time_per_iteration": 2.563847541809082 }, { "auxiliary_loss_clip": 0.01092796, "auxiliary_loss_mlp": 0.01025311, "balance_loss_clip": 1.04321206, "balance_loss_mlp": 1.01779461, "epoch": 0.6571273973426321, "flos": 25156717902720.0, "grad_norm": 1.618489819557046, "language_loss": 0.75305194, "learning_rate": 1.1114367480595292e-06, "loss": 0.77423298, "num_input_tokens_seen": 117577310, "step": 5465, "time_per_iteration": 2.7899818420410156 }, { "auxiliary_loss_clip": 0.01097373, "auxiliary_loss_mlp": 0.01030242, "balance_loss_clip": 1.04700148, "balance_loss_mlp": 1.02224851, "epoch": 0.6572476402332712, "flos": 17529830352000.0, "grad_norm": 1.9624340956992172, "language_loss": 0.81482708, "learning_rate": 1.1107389447630086e-06, "loss": 0.8361032, "num_input_tokens_seen": 117596010, "step": 5466, "time_per_iteration": 3.627434730529785 }, { "auxiliary_loss_clip": 0.0113628, "auxiliary_loss_mlp": 0.00711304, "balance_loss_clip": 1.04635596, "balance_loss_mlp": 1.00054872, "epoch": 0.6573678831239103, "flos": 17014260487680.0, "grad_norm": 2.1605269515839005, "language_loss": 0.78509969, "learning_rate": 1.1100412763666818e-06, "loss": 0.80357552, "num_input_tokens_seen": 117611270, "step": 5467, "time_per_iteration": 3.6511964797973633 }, { "auxiliary_loss_clip": 0.01143213, "auxiliary_loss_mlp": 0.01027317, "balance_loss_clip": 1.04786944, "balance_loss_mlp": 1.01955616, "epoch": 0.6574881260145494, "flos": 23910078528000.0, "grad_norm": 1.8512153446487558, "language_loss": 0.80081308, "learning_rate": 1.1093437429763865e-06, "loss": 0.82251835, "num_input_tokens_seen": 117631535, "step": 5468, "time_per_iteration": 3.505897045135498 }, { "auxiliary_loss_clip": 0.01159868, "auxiliary_loss_mlp": 0.01025574, "balance_loss_clip": 1.05186343, "balance_loss_mlp": 1.01829004, "epoch": 0.6576083689051885, "flos": 11218458504960.0, "grad_norm": 1.8833594703458385, "language_loss": 0.73426753, "learning_rate": 1.1086463446979361e-06, "loss": 0.75612199, "num_input_tokens_seen": 117649885, "step": 5469, "time_per_iteration": 3.587033271789551 }, { "auxiliary_loss_clip": 0.01162121, "auxiliary_loss_mlp": 0.01028617, "balance_loss_clip": 1.05384922, "balance_loss_mlp": 1.02074373, "epoch": 0.6577286117958275, "flos": 22455553190400.0, "grad_norm": 2.0100664600706644, "language_loss": 0.77369273, "learning_rate": 1.1079490816371277e-06, "loss": 0.79560006, "num_input_tokens_seen": 117669650, "step": 5470, "time_per_iteration": 2.6055209636688232 }, { "auxiliary_loss_clip": 0.01157928, "auxiliary_loss_mlp": 0.00712283, "balance_loss_clip": 1.04893124, "balance_loss_mlp": 1.00056005, "epoch": 0.6578488546864667, "flos": 21872184405120.0, "grad_norm": 2.909494587677778, "language_loss": 0.74723804, "learning_rate": 1.1072519538997352e-06, "loss": 0.76594019, "num_input_tokens_seen": 117688790, "step": 5471, "time_per_iteration": 2.683122158050537 }, { "auxiliary_loss_clip": 0.01142516, "auxiliary_loss_mlp": 0.01025186, "balance_loss_clip": 1.04535198, "balance_loss_mlp": 1.01716352, "epoch": 0.6579690975771058, "flos": 23543753673600.0, "grad_norm": 1.8711627229945598, "language_loss": 0.82357967, "learning_rate": 1.1065549615915095e-06, "loss": 0.84525669, "num_input_tokens_seen": 117708620, "step": 5472, "time_per_iteration": 2.633639097213745 }, { "auxiliary_loss_clip": 0.01155724, "auxiliary_loss_mlp": 0.01034438, "balance_loss_clip": 1.05104828, "balance_loss_mlp": 1.02660608, "epoch": 0.6580893404677448, "flos": 32743995730560.0, "grad_norm": 4.108493283167885, "language_loss": 0.78791273, "learning_rate": 1.105858104818187e-06, "loss": 0.80981445, "num_input_tokens_seen": 117729775, "step": 5473, "time_per_iteration": 2.6977319717407227 }, { "auxiliary_loss_clip": 0.01162875, "auxiliary_loss_mlp": 0.01026855, "balance_loss_clip": 1.05044675, "balance_loss_mlp": 1.01939237, "epoch": 0.658209583358384, "flos": 15888138220800.0, "grad_norm": 20.36286533751052, "language_loss": 0.75063658, "learning_rate": 1.105161383685478e-06, "loss": 0.77253389, "num_input_tokens_seen": 117746160, "step": 5474, "time_per_iteration": 2.597233772277832 }, { "auxiliary_loss_clip": 0.01047435, "auxiliary_loss_mlp": 0.01002949, "balance_loss_clip": 1.03248382, "balance_loss_mlp": 1.00179887, "epoch": 0.658329826249023, "flos": 62695902447360.0, "grad_norm": 0.7272875914792383, "language_loss": 0.56260395, "learning_rate": 1.1044647982990771e-06, "loss": 0.58310777, "num_input_tokens_seen": 117808045, "step": 5475, "time_per_iteration": 3.172694206237793 }, { "auxiliary_loss_clip": 0.01142761, "auxiliary_loss_mlp": 0.01027324, "balance_loss_clip": 1.0496912, "balance_loss_mlp": 1.01936126, "epoch": 0.6584500691396621, "flos": 31722624501120.0, "grad_norm": 4.810270636670405, "language_loss": 0.64726245, "learning_rate": 1.1037683487646536e-06, "loss": 0.66896331, "num_input_tokens_seen": 117828330, "step": 5476, "time_per_iteration": 2.7097008228302 }, { "auxiliary_loss_clip": 0.01139574, "auxiliary_loss_mlp": 0.00710594, "balance_loss_clip": 1.05099225, "balance_loss_mlp": 1.00050092, "epoch": 0.6585703120303013, "flos": 18406086635520.0, "grad_norm": 2.187650470206469, "language_loss": 0.77513981, "learning_rate": 1.1030720351878583e-06, "loss": 0.79364151, "num_input_tokens_seen": 117846450, "step": 5477, "time_per_iteration": 2.605396270751953 }, { "auxiliary_loss_clip": 0.0106583, "auxiliary_loss_mlp": 0.01003636, "balance_loss_clip": 1.03264761, "balance_loss_mlp": 1.00246227, "epoch": 0.6586905549209403, "flos": 58309880434560.0, "grad_norm": 0.80839584435476, "language_loss": 0.57625711, "learning_rate": 1.102375857674323e-06, "loss": 0.59695172, "num_input_tokens_seen": 117908365, "step": 5478, "time_per_iteration": 3.194634199142456 }, { "auxiliary_loss_clip": 0.01140891, "auxiliary_loss_mlp": 0.01024567, "balance_loss_clip": 1.04711652, "balance_loss_mlp": 1.01751912, "epoch": 0.6588107978115794, "flos": 22782627457920.0, "grad_norm": 1.8039261121335404, "language_loss": 0.90373302, "learning_rate": 1.1016798163296561e-06, "loss": 0.92538762, "num_input_tokens_seen": 117927565, "step": 5479, "time_per_iteration": 2.6558403968811035 }, { "auxiliary_loss_clip": 0.0115978, "auxiliary_loss_mlp": 0.01026586, "balance_loss_clip": 1.05092096, "balance_loss_mlp": 1.01939821, "epoch": 0.6589310407022185, "flos": 20667525050880.0, "grad_norm": 2.0677184706489147, "language_loss": 0.66083777, "learning_rate": 1.1009839112594471e-06, "loss": 0.68270141, "num_input_tokens_seen": 117945590, "step": 5480, "time_per_iteration": 2.661853551864624 }, { "auxiliary_loss_clip": 0.0116138, "auxiliary_loss_mlp": 0.0102619, "balance_loss_clip": 1.05173957, "balance_loss_mlp": 1.01864457, "epoch": 0.6590512835928576, "flos": 25630595055360.0, "grad_norm": 2.4588471799716554, "language_loss": 0.72130555, "learning_rate": 1.1002881425692638e-06, "loss": 0.74318129, "num_input_tokens_seen": 117966020, "step": 5481, "time_per_iteration": 2.6573376655578613 }, { "auxiliary_loss_clip": 0.011501, "auxiliary_loss_mlp": 0.0102381, "balance_loss_clip": 1.04665112, "balance_loss_mlp": 1.01629961, "epoch": 0.6591715264834966, "flos": 23726108044800.0, "grad_norm": 1.7123700345980235, "language_loss": 0.7562052, "learning_rate": 1.0995925103646532e-06, "loss": 0.77794433, "num_input_tokens_seen": 117984620, "step": 5482, "time_per_iteration": 2.6546308994293213 }, { "auxiliary_loss_clip": 0.01121335, "auxiliary_loss_mlp": 0.01026293, "balance_loss_clip": 1.04764843, "balance_loss_mlp": 1.01906872, "epoch": 0.6592917693741358, "flos": 35773850822400.0, "grad_norm": 1.6373461943656675, "language_loss": 0.67051792, "learning_rate": 1.0988970147511437e-06, "loss": 0.69199425, "num_input_tokens_seen": 118006500, "step": 5483, "time_per_iteration": 2.7756500244140625 }, { "auxiliary_loss_clip": 0.01144076, "auxiliary_loss_mlp": 0.01025761, "balance_loss_clip": 1.05319953, "balance_loss_mlp": 1.01823616, "epoch": 0.6594120122647749, "flos": 21396834794880.0, "grad_norm": 2.2403524273847615, "language_loss": 0.80999976, "learning_rate": 1.0982016558342405e-06, "loss": 0.83169818, "num_input_tokens_seen": 118025470, "step": 5484, "time_per_iteration": 2.6416170597076416 }, { "auxiliary_loss_clip": 0.01175044, "auxiliary_loss_mlp": 0.0102846, "balance_loss_clip": 1.05183756, "balance_loss_mlp": 1.02118516, "epoch": 0.6595322551554139, "flos": 19351829779200.0, "grad_norm": 2.138587718054408, "language_loss": 0.71270001, "learning_rate": 1.0975064337194291e-06, "loss": 0.73473513, "num_input_tokens_seen": 118043515, "step": 5485, "time_per_iteration": 2.515501022338867 }, { "auxiliary_loss_clip": 0.01120102, "auxiliary_loss_mlp": 0.01031182, "balance_loss_clip": 1.04608536, "balance_loss_mlp": 1.02411282, "epoch": 0.6596524980460531, "flos": 16837113588480.0, "grad_norm": 1.8943157753147113, "language_loss": 0.70642877, "learning_rate": 1.0968113485121743e-06, "loss": 0.72794163, "num_input_tokens_seen": 118063105, "step": 5486, "time_per_iteration": 2.710491418838501 }, { "auxiliary_loss_clip": 0.01156118, "auxiliary_loss_mlp": 0.00711333, "balance_loss_clip": 1.04762447, "balance_loss_mlp": 1.00046444, "epoch": 0.6597727409366921, "flos": 21798567480960.0, "grad_norm": 1.9673535558244069, "language_loss": 0.80107731, "learning_rate": 1.0961164003179185e-06, "loss": 0.8197518, "num_input_tokens_seen": 118081615, "step": 5487, "time_per_iteration": 2.6798245906829834 }, { "auxiliary_loss_clip": 0.01125003, "auxiliary_loss_mlp": 0.01027445, "balance_loss_clip": 1.04756951, "balance_loss_mlp": 1.01988435, "epoch": 0.6598929838273312, "flos": 23730704985600.0, "grad_norm": 2.2245569606531577, "language_loss": 0.84472078, "learning_rate": 1.0954215892420884e-06, "loss": 0.86624527, "num_input_tokens_seen": 118102315, "step": 5488, "time_per_iteration": 2.7089133262634277 }, { "auxiliary_loss_clip": 0.01131025, "auxiliary_loss_mlp": 0.01027501, "balance_loss_clip": 1.04878902, "balance_loss_mlp": 1.02011311, "epoch": 0.6600132267179702, "flos": 19974520978560.0, "grad_norm": 1.7419226561716068, "language_loss": 0.70599627, "learning_rate": 1.094726915390082e-06, "loss": 0.7275815, "num_input_tokens_seen": 118120650, "step": 5489, "time_per_iteration": 2.7081987857818604 }, { "auxiliary_loss_clip": 0.01158065, "auxiliary_loss_mlp": 0.01029144, "balance_loss_clip": 1.05163002, "balance_loss_mlp": 1.02162218, "epoch": 0.6601334696086094, "flos": 22342649765760.0, "grad_norm": 2.520132205453627, "language_loss": 0.69929826, "learning_rate": 1.0940323788672836e-06, "loss": 0.72117031, "num_input_tokens_seen": 118139825, "step": 5490, "time_per_iteration": 2.5880229473114014 }, { "auxiliary_loss_clip": 0.01155124, "auxiliary_loss_mlp": 0.01019538, "balance_loss_clip": 1.05182004, "balance_loss_mlp": 1.01261771, "epoch": 0.6602537124992485, "flos": 25703098657920.0, "grad_norm": 4.717866933864309, "language_loss": 0.73924118, "learning_rate": 1.0933379797790522e-06, "loss": 0.76098776, "num_input_tokens_seen": 118159240, "step": 5491, "time_per_iteration": 2.6754865646362305 }, { "auxiliary_loss_clip": 0.01173982, "auxiliary_loss_mlp": 0.01027329, "balance_loss_clip": 1.05318475, "balance_loss_mlp": 1.02011096, "epoch": 0.6603739553898875, "flos": 25848572739840.0, "grad_norm": 3.484364153828101, "language_loss": 0.71557099, "learning_rate": 1.0926437182307293e-06, "loss": 0.73758411, "num_input_tokens_seen": 118178050, "step": 5492, "time_per_iteration": 3.524864673614502 }, { "auxiliary_loss_clip": 0.01146177, "auxiliary_loss_mlp": 0.0101891, "balance_loss_clip": 1.048365, "balance_loss_mlp": 1.01171255, "epoch": 0.6604941982805267, "flos": 24570296461440.0, "grad_norm": 2.262463940030778, "language_loss": 0.78117168, "learning_rate": 1.0919495943276338e-06, "loss": 0.80282253, "num_input_tokens_seen": 118199070, "step": 5493, "time_per_iteration": 3.5504634380340576 }, { "auxiliary_loss_clip": 0.01125595, "auxiliary_loss_mlp": 0.0102765, "balance_loss_clip": 1.04290676, "balance_loss_mlp": 1.02002931, "epoch": 0.6606144411711657, "flos": 13261775581440.0, "grad_norm": 2.617169483473301, "language_loss": 0.76711297, "learning_rate": 1.0912556081750611e-06, "loss": 0.78864539, "num_input_tokens_seen": 118217000, "step": 5494, "time_per_iteration": 3.5914089679718018 }, { "auxiliary_loss_clip": 0.01138951, "auxiliary_loss_mlp": 0.01023579, "balance_loss_clip": 1.04810905, "balance_loss_mlp": 1.01569963, "epoch": 0.6607346840618048, "flos": 25155281358720.0, "grad_norm": 2.0335357748974454, "language_loss": 0.76833665, "learning_rate": 1.0905617598782909e-06, "loss": 0.78996193, "num_input_tokens_seen": 118237205, "step": 5495, "time_per_iteration": 3.620790481567383 }, { "auxiliary_loss_clip": 0.01106109, "auxiliary_loss_mlp": 0.01026589, "balance_loss_clip": 1.04594922, "balance_loss_mlp": 1.01948464, "epoch": 0.660854926952444, "flos": 17638029095040.0, "grad_norm": 2.0650369229499446, "language_loss": 0.8179875, "learning_rate": 1.0898680495425775e-06, "loss": 0.83931446, "num_input_tokens_seen": 118255495, "step": 5496, "time_per_iteration": 2.68619966506958 }, { "auxiliary_loss_clip": 0.0114591, "auxiliary_loss_mlp": 0.01030347, "balance_loss_clip": 1.04968262, "balance_loss_mlp": 1.02277124, "epoch": 0.660975169843083, "flos": 16836000266880.0, "grad_norm": 1.8872631217005729, "language_loss": 0.8030591, "learning_rate": 1.0891744772731594e-06, "loss": 0.82482165, "num_input_tokens_seen": 118273310, "step": 5497, "time_per_iteration": 2.665492296218872 }, { "auxiliary_loss_clip": 0.01159049, "auxiliary_loss_mlp": 0.01026263, "balance_loss_clip": 1.0492487, "balance_loss_mlp": 1.01971292, "epoch": 0.6610954127337221, "flos": 26870410846080.0, "grad_norm": 2.0177945264368438, "language_loss": 0.65907216, "learning_rate": 1.088481043175248e-06, "loss": 0.68092525, "num_input_tokens_seen": 118293880, "step": 5498, "time_per_iteration": 2.6845486164093018 }, { "auxiliary_loss_clip": 0.01132044, "auxiliary_loss_mlp": 0.01030149, "balance_loss_clip": 1.04572785, "balance_loss_mlp": 1.02294612, "epoch": 0.6612156556243612, "flos": 26465697331200.0, "grad_norm": 1.7443354817150651, "language_loss": 0.75908554, "learning_rate": 1.0877877473540368e-06, "loss": 0.78070748, "num_input_tokens_seen": 118314465, "step": 5499, "time_per_iteration": 2.7247445583343506 }, { "auxiliary_loss_clip": 0.01175555, "auxiliary_loss_mlp": 0.01020344, "balance_loss_clip": 1.05332041, "balance_loss_mlp": 1.01362944, "epoch": 0.6613358985150003, "flos": 19791915212160.0, "grad_norm": 1.833518616379994, "language_loss": 0.72490585, "learning_rate": 1.0870945899147002e-06, "loss": 0.7468648, "num_input_tokens_seen": 118331110, "step": 5500, "time_per_iteration": 2.621727466583252 }, { "auxiliary_loss_clip": 0.01157445, "auxiliary_loss_mlp": 0.01028552, "balance_loss_clip": 1.05252194, "balance_loss_mlp": 1.02126813, "epoch": 0.6614561414056394, "flos": 26831627136000.0, "grad_norm": 1.8989255065226003, "language_loss": 0.76169819, "learning_rate": 1.0864015709623879e-06, "loss": 0.78355813, "num_input_tokens_seen": 118351980, "step": 5501, "time_per_iteration": 2.6858818531036377 }, { "auxiliary_loss_clip": 0.01163176, "auxiliary_loss_mlp": 0.01022872, "balance_loss_clip": 1.05159664, "balance_loss_mlp": 1.01603246, "epoch": 0.6615763842962785, "flos": 22894597128960.0, "grad_norm": 2.417248522462399, "language_loss": 0.80369896, "learning_rate": 1.0857086906022313e-06, "loss": 0.82555938, "num_input_tokens_seen": 118370315, "step": 5502, "time_per_iteration": 2.5968515872955322 }, { "auxiliary_loss_clip": 0.01084738, "auxiliary_loss_mlp": 0.0102908, "balance_loss_clip": 1.0451858, "balance_loss_mlp": 1.02180243, "epoch": 0.6616966271869176, "flos": 24790321221120.0, "grad_norm": 2.405520136363566, "language_loss": 0.73090714, "learning_rate": 1.0850159489393388e-06, "loss": 0.75204533, "num_input_tokens_seen": 118389575, "step": 5503, "time_per_iteration": 2.829775810241699 }, { "auxiliary_loss_clip": 0.01119269, "auxiliary_loss_mlp": 0.01022045, "balance_loss_clip": 1.04286397, "balance_loss_mlp": 1.0143981, "epoch": 0.6618168700775566, "flos": 17202109639680.0, "grad_norm": 2.0820055831522755, "language_loss": 0.82163715, "learning_rate": 1.0843233460787992e-06, "loss": 0.8430503, "num_input_tokens_seen": 118406790, "step": 5504, "time_per_iteration": 2.661663770675659 }, { "auxiliary_loss_clip": 0.01115014, "auxiliary_loss_mlp": 0.01023892, "balance_loss_clip": 1.04640269, "balance_loss_mlp": 1.01698971, "epoch": 0.6619371129681958, "flos": 25447091448960.0, "grad_norm": 1.8628992780875517, "language_loss": 0.78220439, "learning_rate": 1.0836308821256805e-06, "loss": 0.8035934, "num_input_tokens_seen": 118427590, "step": 5505, "time_per_iteration": 2.7664568424224854 }, { "auxiliary_loss_clip": 0.01156268, "auxiliary_loss_mlp": 0.01021449, "balance_loss_clip": 1.05101907, "balance_loss_mlp": 1.01424325, "epoch": 0.6620573558588349, "flos": 18040444139520.0, "grad_norm": 2.1787653238379057, "language_loss": 0.78319609, "learning_rate": 1.0829385571850282e-06, "loss": 0.8049733, "num_input_tokens_seen": 118444570, "step": 5506, "time_per_iteration": 2.543987989425659 }, { "auxiliary_loss_clip": 0.01179428, "auxiliary_loss_mlp": 0.01027954, "balance_loss_clip": 1.05278993, "balance_loss_mlp": 1.02019346, "epoch": 0.6621775987494739, "flos": 17785586165760.0, "grad_norm": 2.9572802818731563, "language_loss": 0.8396132, "learning_rate": 1.0822463713618679e-06, "loss": 0.86168706, "num_input_tokens_seen": 118461425, "step": 5507, "time_per_iteration": 2.575855255126953 }, { "auxiliary_loss_clip": 0.01130646, "auxiliary_loss_mlp": 0.01026768, "balance_loss_clip": 1.04981184, "balance_loss_mlp": 1.01982975, "epoch": 0.6622978416401131, "flos": 17492590926720.0, "grad_norm": 2.0256986423603247, "language_loss": 0.85018611, "learning_rate": 1.0815543247612034e-06, "loss": 0.87176025, "num_input_tokens_seen": 118478495, "step": 5508, "time_per_iteration": 2.6658935546875 }, { "auxiliary_loss_clip": 0.01141502, "auxiliary_loss_mlp": 0.01021185, "balance_loss_clip": 1.0461576, "balance_loss_mlp": 1.01417243, "epoch": 0.6624180845307521, "flos": 21648352803840.0, "grad_norm": 1.8008011377524054, "language_loss": 0.83053702, "learning_rate": 1.0808624174880168e-06, "loss": 0.85216391, "num_input_tokens_seen": 118499145, "step": 5509, "time_per_iteration": 2.6497409343719482 }, { "auxiliary_loss_clip": 0.01172148, "auxiliary_loss_mlp": 0.01027816, "balance_loss_clip": 1.05346715, "balance_loss_mlp": 1.02058589, "epoch": 0.6625383274213912, "flos": 23805902108160.0, "grad_norm": 1.920266278378613, "language_loss": 0.80091751, "learning_rate": 1.080170649647272e-06, "loss": 0.8229171, "num_input_tokens_seen": 118518950, "step": 5510, "time_per_iteration": 2.571399211883545 }, { "auxiliary_loss_clip": 0.0117305, "auxiliary_loss_mlp": 0.01029035, "balance_loss_clip": 1.05185449, "balance_loss_mlp": 1.02162027, "epoch": 0.6626585703120303, "flos": 33262941473280.0, "grad_norm": 2.1015477530927464, "language_loss": 0.67288101, "learning_rate": 1.0794790213439068e-06, "loss": 0.69490182, "num_input_tokens_seen": 118545850, "step": 5511, "time_per_iteration": 2.7296509742736816 }, { "auxiliary_loss_clip": 0.01112005, "auxiliary_loss_mlp": 0.01027091, "balance_loss_clip": 1.0454818, "balance_loss_mlp": 1.01905608, "epoch": 0.6627788132026694, "flos": 22085780630400.0, "grad_norm": 6.120216311260431, "language_loss": 0.78676683, "learning_rate": 1.078787532682843e-06, "loss": 0.80815786, "num_input_tokens_seen": 118563325, "step": 5512, "time_per_iteration": 2.7754948139190674 }, { "auxiliary_loss_clip": 0.01157045, "auxiliary_loss_mlp": 0.0102721, "balance_loss_clip": 1.05092442, "balance_loss_mlp": 1.02008438, "epoch": 0.6628990560933085, "flos": 36173608260480.0, "grad_norm": 2.7372494187090717, "language_loss": 0.75803483, "learning_rate": 1.0780961837689773e-06, "loss": 0.77987742, "num_input_tokens_seen": 118582835, "step": 5513, "time_per_iteration": 2.7150416374206543 }, { "auxiliary_loss_clip": 0.01134153, "auxiliary_loss_mlp": 0.01027036, "balance_loss_clip": 1.04801166, "balance_loss_mlp": 1.02026176, "epoch": 0.6630192989839476, "flos": 18513567106560.0, "grad_norm": 1.760202157757008, "language_loss": 0.69960618, "learning_rate": 1.0774049747071883e-06, "loss": 0.72121805, "num_input_tokens_seen": 118600715, "step": 5514, "time_per_iteration": 2.6154558658599854 }, { "auxiliary_loss_clip": 0.01110984, "auxiliary_loss_mlp": 0.0102652, "balance_loss_clip": 1.04931903, "balance_loss_mlp": 1.0191257, "epoch": 0.6631395418745867, "flos": 35809510049280.0, "grad_norm": 1.8080284485482614, "language_loss": 0.68470615, "learning_rate": 1.076713905602332e-06, "loss": 0.70608115, "num_input_tokens_seen": 118621290, "step": 5515, "time_per_iteration": 2.8042705059051514 }, { "auxiliary_loss_clip": 0.01165514, "auxiliary_loss_mlp": 0.01022282, "balance_loss_clip": 1.05553997, "balance_loss_mlp": 1.01558876, "epoch": 0.6632597847652257, "flos": 20047742853120.0, "grad_norm": 2.0928013319479217, "language_loss": 0.81099355, "learning_rate": 1.07602297655924e-06, "loss": 0.83287156, "num_input_tokens_seen": 118639610, "step": 5516, "time_per_iteration": 2.645576238632202 }, { "auxiliary_loss_clip": 0.01174319, "auxiliary_loss_mlp": 0.01026918, "balance_loss_clip": 1.05371308, "balance_loss_mlp": 1.01928306, "epoch": 0.6633800276558649, "flos": 21214480423680.0, "grad_norm": 1.9184141808241644, "language_loss": 0.81098342, "learning_rate": 1.0753321876827292e-06, "loss": 0.83299583, "num_input_tokens_seen": 118658895, "step": 5517, "time_per_iteration": 2.5681135654449463 }, { "auxiliary_loss_clip": 0.01172094, "auxiliary_loss_mlp": 0.01028463, "balance_loss_clip": 1.05068851, "balance_loss_mlp": 1.02064919, "epoch": 0.663500270546504, "flos": 23987753688960.0, "grad_norm": 2.29349803817808, "language_loss": 0.74351829, "learning_rate": 1.0746415390775893e-06, "loss": 0.76552385, "num_input_tokens_seen": 118677025, "step": 5518, "time_per_iteration": 3.5357818603515625 }, { "auxiliary_loss_clip": 0.01174443, "auxiliary_loss_mlp": 0.01026778, "balance_loss_clip": 1.05368996, "balance_loss_mlp": 1.01912212, "epoch": 0.663620513437143, "flos": 17932389050880.0, "grad_norm": 2.487925542923052, "language_loss": 0.767887, "learning_rate": 1.0739510308485939e-06, "loss": 0.78989923, "num_input_tokens_seen": 118694240, "step": 5519, "time_per_iteration": 3.456862688064575 }, { "auxiliary_loss_clip": 0.01054969, "auxiliary_loss_mlp": 0.01007577, "balance_loss_clip": 1.03212094, "balance_loss_mlp": 1.00640857, "epoch": 0.6637407563277821, "flos": 57840241086720.0, "grad_norm": 0.8684165240409529, "language_loss": 0.62504768, "learning_rate": 1.07326066310049e-06, "loss": 0.64567304, "num_input_tokens_seen": 118758365, "step": 5520, "time_per_iteration": 5.047543287277222 }, { "auxiliary_loss_clip": 0.01121265, "auxiliary_loss_mlp": 0.01028876, "balance_loss_clip": 1.04515874, "balance_loss_mlp": 1.02107406, "epoch": 0.6638609992184212, "flos": 27306007079040.0, "grad_norm": 3.9418344296739916, "language_loss": 0.79321069, "learning_rate": 1.0725704359380059e-06, "loss": 0.81471205, "num_input_tokens_seen": 118778220, "step": 5521, "time_per_iteration": 2.766235589981079 }, { "auxiliary_loss_clip": 0.01172574, "auxiliary_loss_mlp": 0.0102699, "balance_loss_clip": 1.05096316, "balance_loss_mlp": 1.02024305, "epoch": 0.6639812421090603, "flos": 18624854419200.0, "grad_norm": 2.360011039462556, "language_loss": 0.7219032, "learning_rate": 1.0718803494658497e-06, "loss": 0.74389887, "num_input_tokens_seen": 118797110, "step": 5522, "time_per_iteration": 2.7366671562194824 }, { "auxiliary_loss_clip": 0.01054626, "auxiliary_loss_mlp": 0.01025254, "balance_loss_clip": 1.03879189, "balance_loss_mlp": 1.01805997, "epoch": 0.6641014849996993, "flos": 15924479806080.0, "grad_norm": 2.3409504421908514, "language_loss": 0.84219891, "learning_rate": 1.071190403788707e-06, "loss": 0.86299771, "num_input_tokens_seen": 118812415, "step": 5523, "time_per_iteration": 3.0008273124694824 }, { "auxiliary_loss_clip": 0.01134168, "auxiliary_loss_mlp": 0.01024886, "balance_loss_clip": 1.04995513, "balance_loss_mlp": 1.01730478, "epoch": 0.6642217278903385, "flos": 26505486622080.0, "grad_norm": 1.9674836249935088, "language_loss": 0.75398672, "learning_rate": 1.0705005990112415e-06, "loss": 0.77557719, "num_input_tokens_seen": 118832195, "step": 5524, "time_per_iteration": 2.917851448059082 }, { "auxiliary_loss_clip": 0.01098061, "auxiliary_loss_mlp": 0.01024784, "balance_loss_clip": 1.04622412, "balance_loss_mlp": 1.01770592, "epoch": 0.6643419707809776, "flos": 15377308951680.0, "grad_norm": 5.625738924945476, "language_loss": 0.74730754, "learning_rate": 1.0698109352380957e-06, "loss": 0.76853597, "num_input_tokens_seen": 118849795, "step": 5525, "time_per_iteration": 2.6527020931243896 }, { "auxiliary_loss_clip": 0.01174048, "auxiliary_loss_mlp": 0.01026303, "balance_loss_clip": 1.05234408, "balance_loss_mlp": 1.01951146, "epoch": 0.6644622136716166, "flos": 25117610970240.0, "grad_norm": 2.1434196044831055, "language_loss": 0.77780956, "learning_rate": 1.0691214125738909e-06, "loss": 0.79981309, "num_input_tokens_seen": 118870000, "step": 5526, "time_per_iteration": 2.6621458530426025 }, { "auxiliary_loss_clip": 0.01086644, "auxiliary_loss_mlp": 0.01002199, "balance_loss_clip": 1.03155339, "balance_loss_mlp": 1.00100744, "epoch": 0.6645824565622558, "flos": 66201717680640.0, "grad_norm": 0.7906936825639193, "language_loss": 0.57478374, "learning_rate": 1.0684320311232287e-06, "loss": 0.59567219, "num_input_tokens_seen": 118932905, "step": 5527, "time_per_iteration": 3.1990132331848145 }, { "auxiliary_loss_clip": 0.01136594, "auxiliary_loss_mlp": 0.0102752, "balance_loss_clip": 1.04689646, "balance_loss_mlp": 1.01956832, "epoch": 0.6647026994528948, "flos": 25082131311360.0, "grad_norm": 2.6346957355907303, "language_loss": 0.81350136, "learning_rate": 1.0677427909906865e-06, "loss": 0.83514249, "num_input_tokens_seen": 118953355, "step": 5528, "time_per_iteration": 2.6489596366882324 }, { "auxiliary_loss_clip": 0.01179276, "auxiliary_loss_mlp": 0.01030827, "balance_loss_clip": 1.05465949, "balance_loss_mlp": 1.02343941, "epoch": 0.6648229423435339, "flos": 18222187979520.0, "grad_norm": 2.4360555939430877, "language_loss": 0.72286081, "learning_rate": 1.0670536922808216e-06, "loss": 0.74496186, "num_input_tokens_seen": 118973480, "step": 5529, "time_per_iteration": 2.619805335998535 }, { "auxiliary_loss_clip": 0.01142841, "auxiliary_loss_mlp": 0.0102742, "balance_loss_clip": 1.04927874, "balance_loss_mlp": 1.02021348, "epoch": 0.6649431852341731, "flos": 18296882311680.0, "grad_norm": 2.119735101883299, "language_loss": 0.72066563, "learning_rate": 1.06636473509817e-06, "loss": 0.74236822, "num_input_tokens_seen": 118989860, "step": 5530, "time_per_iteration": 2.585888147354126 }, { "auxiliary_loss_clip": 0.0114096, "auxiliary_loss_mlp": 0.00711369, "balance_loss_clip": 1.04970181, "balance_loss_mlp": 1.00057936, "epoch": 0.6650634281248121, "flos": 17019575700480.0, "grad_norm": 2.1307919295244795, "language_loss": 0.81045938, "learning_rate": 1.0656759195472447e-06, "loss": 0.82898271, "num_input_tokens_seen": 119007150, "step": 5531, "time_per_iteration": 2.6649551391601562 }, { "auxiliary_loss_clip": 0.0106445, "auxiliary_loss_mlp": 0.01002705, "balance_loss_clip": 1.03346896, "balance_loss_mlp": 1.00150108, "epoch": 0.6651836710154512, "flos": 69294810666240.0, "grad_norm": 0.7752457182962335, "language_loss": 0.59759068, "learning_rate": 1.0649872457325414e-06, "loss": 0.61826229, "num_input_tokens_seen": 119068435, "step": 5532, "time_per_iteration": 3.1914737224578857 }, { "auxiliary_loss_clip": 0.01076413, "auxiliary_loss_mlp": 0.01001217, "balance_loss_clip": 1.03188658, "balance_loss_mlp": 1.00016201, "epoch": 0.6653039139060903, "flos": 66883444882560.0, "grad_norm": 0.8613404910149212, "language_loss": 0.5509218, "learning_rate": 1.0642987137585278e-06, "loss": 0.57169807, "num_input_tokens_seen": 119127960, "step": 5533, "time_per_iteration": 3.2064473628997803 }, { "auxiliary_loss_clip": 0.01140915, "auxiliary_loss_mlp": 0.01028119, "balance_loss_clip": 1.05001628, "balance_loss_mlp": 1.02111554, "epoch": 0.6654241567967294, "flos": 21470056669440.0, "grad_norm": 1.729032961894284, "language_loss": 0.82630211, "learning_rate": 1.0636103237296561e-06, "loss": 0.84799248, "num_input_tokens_seen": 119146885, "step": 5534, "time_per_iteration": 2.6714422702789307 }, { "auxiliary_loss_clip": 0.0115635, "auxiliary_loss_mlp": 0.0102152, "balance_loss_clip": 1.05312574, "balance_loss_mlp": 1.01462388, "epoch": 0.6655443996873684, "flos": 25119514391040.0, "grad_norm": 2.1451783755761658, "language_loss": 0.84441805, "learning_rate": 1.062922075750353e-06, "loss": 0.86619681, "num_input_tokens_seen": 119166900, "step": 5535, "time_per_iteration": 2.6268913745880127 }, { "auxiliary_loss_clip": 0.0113085, "auxiliary_loss_mlp": 0.01023119, "balance_loss_clip": 1.0489974, "balance_loss_mlp": 1.01600814, "epoch": 0.6656646425780076, "flos": 17457326749440.0, "grad_norm": 2.3520743079637714, "language_loss": 0.72249877, "learning_rate": 1.0622339699250267e-06, "loss": 0.74403846, "num_input_tokens_seen": 119184820, "step": 5536, "time_per_iteration": 2.7049567699432373 }, { "auxiliary_loss_clip": 0.01126398, "auxiliary_loss_mlp": 0.01025016, "balance_loss_clip": 1.04628372, "balance_loss_mlp": 1.0181793, "epoch": 0.6657848854686467, "flos": 23434190213760.0, "grad_norm": 1.8157145234975807, "language_loss": 0.79569227, "learning_rate": 1.0615460063580624e-06, "loss": 0.81720644, "num_input_tokens_seen": 119203295, "step": 5537, "time_per_iteration": 2.694331407546997 }, { "auxiliary_loss_clip": 0.01145414, "auxiliary_loss_mlp": 0.010276, "balance_loss_clip": 1.04952455, "balance_loss_mlp": 1.02074301, "epoch": 0.6659051283592857, "flos": 11509909459200.0, "grad_norm": 2.252091247294166, "language_loss": 0.73212767, "learning_rate": 1.060858185153821e-06, "loss": 0.75385785, "num_input_tokens_seen": 119221395, "step": 5538, "time_per_iteration": 2.6921164989471436 }, { "auxiliary_loss_clip": 0.01149357, "auxiliary_loss_mlp": 0.01023347, "balance_loss_clip": 1.05086708, "balance_loss_mlp": 1.01607513, "epoch": 0.6660253712499249, "flos": 20594554571520.0, "grad_norm": 2.1343109708503114, "language_loss": 0.7602821, "learning_rate": 1.0601705064166474e-06, "loss": 0.78200912, "num_input_tokens_seen": 119239790, "step": 5539, "time_per_iteration": 2.6677193641662598 }, { "auxiliary_loss_clip": 0.0113616, "auxiliary_loss_mlp": 0.01024901, "balance_loss_clip": 1.04802632, "balance_loss_mlp": 1.0178442, "epoch": 0.666145614140564, "flos": 21251504367360.0, "grad_norm": 2.9907070240865092, "language_loss": 0.73563838, "learning_rate": 1.0594829702508596e-06, "loss": 0.757249, "num_input_tokens_seen": 119257505, "step": 5540, "time_per_iteration": 2.675443410873413 }, { "auxiliary_loss_clip": 0.01128408, "auxiliary_loss_mlp": 0.01028819, "balance_loss_clip": 1.04836035, "balance_loss_mlp": 1.02167845, "epoch": 0.666265857031203, "flos": 33726188200320.0, "grad_norm": 2.0180693096415627, "language_loss": 0.54844749, "learning_rate": 1.0587955767607592e-06, "loss": 0.57001978, "num_input_tokens_seen": 119279365, "step": 5541, "time_per_iteration": 2.781597852706909 }, { "auxiliary_loss_clip": 0.01176336, "auxiliary_loss_mlp": 0.01033595, "balance_loss_clip": 1.0530355, "balance_loss_mlp": 1.02566767, "epoch": 0.6663860999218422, "flos": 17456644391040.0, "grad_norm": 2.7569684451138876, "language_loss": 0.77168727, "learning_rate": 1.0581083260506206e-06, "loss": 0.79378659, "num_input_tokens_seen": 119296150, "step": 5542, "time_per_iteration": 2.568281412124634 }, { "auxiliary_loss_clip": 0.01139493, "auxiliary_loss_mlp": 0.01024675, "balance_loss_clip": 1.04591727, "balance_loss_mlp": 1.01760626, "epoch": 0.6665063428124812, "flos": 17676740977920.0, "grad_norm": 2.3337613414047254, "language_loss": 0.76690626, "learning_rate": 1.0574212182246993e-06, "loss": 0.78854799, "num_input_tokens_seen": 119314845, "step": 5543, "time_per_iteration": 2.633362054824829 }, { "auxiliary_loss_clip": 0.01147687, "auxiliary_loss_mlp": 0.01027735, "balance_loss_clip": 1.04972053, "balance_loss_mlp": 1.01959944, "epoch": 0.6666265857031203, "flos": 27673265687040.0, "grad_norm": 2.796734570394699, "language_loss": 0.75587857, "learning_rate": 1.0567342533872303e-06, "loss": 0.77763283, "num_input_tokens_seen": 119334875, "step": 5544, "time_per_iteration": 3.777373790740967 }, { "auxiliary_loss_clip": 0.01147284, "auxiliary_loss_mlp": 0.01029748, "balance_loss_clip": 1.05100656, "balance_loss_mlp": 1.02222562, "epoch": 0.6667468285937594, "flos": 25046831220480.0, "grad_norm": 2.0010209867048, "language_loss": 0.80935079, "learning_rate": 1.0560474316424255e-06, "loss": 0.83112115, "num_input_tokens_seen": 119354635, "step": 5545, "time_per_iteration": 3.896676540374756 }, { "auxiliary_loss_clip": 0.0114347, "auxiliary_loss_mlp": 0.01028432, "balance_loss_clip": 1.04852414, "balance_loss_mlp": 1.02021289, "epoch": 0.6668670714843985, "flos": 22780472641920.0, "grad_norm": 3.3665209153927087, "language_loss": 0.74061167, "learning_rate": 1.0553607530944746e-06, "loss": 0.76233071, "num_input_tokens_seen": 119372690, "step": 5546, "time_per_iteration": 4.512425661087036 }, { "auxiliary_loss_clip": 0.01128044, "auxiliary_loss_mlp": 0.01019354, "balance_loss_clip": 1.04685056, "balance_loss_mlp": 1.01176679, "epoch": 0.6669873143750376, "flos": 22163886754560.0, "grad_norm": 2.48810241691693, "language_loss": 0.89914, "learning_rate": 1.0546742178475463e-06, "loss": 0.920614, "num_input_tokens_seen": 119391685, "step": 5547, "time_per_iteration": 2.642869472503662 }, { "auxiliary_loss_clip": 0.01115912, "auxiliary_loss_mlp": 0.01025634, "balance_loss_clip": 1.0471741, "balance_loss_mlp": 1.01873207, "epoch": 0.6671075572656767, "flos": 20514832335360.0, "grad_norm": 2.0755955579865137, "language_loss": 0.8648243, "learning_rate": 1.0539878260057868e-06, "loss": 0.88623977, "num_input_tokens_seen": 119410725, "step": 5548, "time_per_iteration": 2.7969970703125 }, { "auxiliary_loss_clip": 0.01161089, "auxiliary_loss_mlp": 0.01031935, "balance_loss_clip": 1.05244851, "balance_loss_mlp": 1.02445471, "epoch": 0.6672278001563158, "flos": 17931203902080.0, "grad_norm": 4.0415573888417615, "language_loss": 0.68712544, "learning_rate": 1.0533015776733226e-06, "loss": 0.70905566, "num_input_tokens_seen": 119426875, "step": 5549, "time_per_iteration": 2.5701897144317627 }, { "auxiliary_loss_clip": 0.01140298, "auxiliary_loss_mlp": 0.01028443, "balance_loss_clip": 1.05016959, "balance_loss_mlp": 1.02082515, "epoch": 0.6673480430469548, "flos": 22342146975360.0, "grad_norm": 2.194550233354131, "language_loss": 0.78681612, "learning_rate": 1.0526154729542566e-06, "loss": 0.80850351, "num_input_tokens_seen": 119446935, "step": 5550, "time_per_iteration": 2.7155275344848633 }, { "auxiliary_loss_clip": 0.01129808, "auxiliary_loss_mlp": 0.01027664, "balance_loss_clip": 1.0505451, "balance_loss_mlp": 1.02021921, "epoch": 0.6674682859375939, "flos": 20703830722560.0, "grad_norm": 3.868678622377649, "language_loss": 0.79663968, "learning_rate": 1.0519295119526699e-06, "loss": 0.81821442, "num_input_tokens_seen": 119463240, "step": 5551, "time_per_iteration": 2.6939780712127686 }, { "auxiliary_loss_clip": 0.01146077, "auxiliary_loss_mlp": 0.01026645, "balance_loss_clip": 1.04904377, "balance_loss_mlp": 1.01943588, "epoch": 0.667588528828233, "flos": 26206673379840.0, "grad_norm": 1.8833985689627404, "language_loss": 0.8320809, "learning_rate": 1.0512436947726227e-06, "loss": 0.85380816, "num_input_tokens_seen": 119484655, "step": 5552, "time_per_iteration": 2.6934707164764404 }, { "auxiliary_loss_clip": 0.01128039, "auxiliary_loss_mlp": 0.01034735, "balance_loss_clip": 1.04611623, "balance_loss_mlp": 1.02674842, "epoch": 0.6677087717188721, "flos": 23071025756160.0, "grad_norm": 2.4122563782445803, "language_loss": 0.65272951, "learning_rate": 1.0505580215181517e-06, "loss": 0.6743573, "num_input_tokens_seen": 119502895, "step": 5553, "time_per_iteration": 2.865776300430298 }, { "auxiliary_loss_clip": 0.0103382, "auxiliary_loss_mlp": 0.01003649, "balance_loss_clip": 1.02560353, "balance_loss_mlp": 1.00260019, "epoch": 0.6678290146095112, "flos": 70941315219840.0, "grad_norm": 0.7947654062006966, "language_loss": 0.56626397, "learning_rate": 1.0498724922932753e-06, "loss": 0.58663863, "num_input_tokens_seen": 119561010, "step": 5554, "time_per_iteration": 3.20279860496521 }, { "auxiliary_loss_clip": 0.01181411, "auxiliary_loss_mlp": 0.01033007, "balance_loss_clip": 1.05548811, "balance_loss_mlp": 1.02528191, "epoch": 0.6679492575001503, "flos": 18661088263680.0, "grad_norm": 2.431769503843201, "language_loss": 0.87030733, "learning_rate": 1.0491871072019851e-06, "loss": 0.89245152, "num_input_tokens_seen": 119578900, "step": 5555, "time_per_iteration": 2.626361608505249 }, { "auxiliary_loss_clip": 0.01131093, "auxiliary_loss_mlp": 0.01020117, "balance_loss_clip": 1.04580808, "balance_loss_mlp": 1.0131166, "epoch": 0.6680695003907894, "flos": 29711985822720.0, "grad_norm": 1.905906819993487, "language_loss": 0.64036751, "learning_rate": 1.0485018663482555e-06, "loss": 0.6618796, "num_input_tokens_seen": 119598920, "step": 5556, "time_per_iteration": 2.7265989780426025 }, { "auxiliary_loss_clip": 0.01154532, "auxiliary_loss_mlp": 0.01028179, "balance_loss_clip": 1.05049968, "balance_loss_mlp": 1.02057934, "epoch": 0.6681897432814284, "flos": 28218964083840.0, "grad_norm": 2.8417615088273966, "language_loss": 0.70797962, "learning_rate": 1.0478167698360354e-06, "loss": 0.72980672, "num_input_tokens_seen": 119618220, "step": 5557, "time_per_iteration": 2.6975576877593994 }, { "auxiliary_loss_clip": 0.01152091, "auxiliary_loss_mlp": 0.0102641, "balance_loss_clip": 1.0493052, "balance_loss_mlp": 1.01917434, "epoch": 0.6683099861720676, "flos": 25046543911680.0, "grad_norm": 2.1573395651015663, "language_loss": 0.70256221, "learning_rate": 1.0471318177692556e-06, "loss": 0.72434723, "num_input_tokens_seen": 119638520, "step": 5558, "time_per_iteration": 2.617433786392212 }, { "auxiliary_loss_clip": 0.01111916, "auxiliary_loss_mlp": 0.01024765, "balance_loss_clip": 1.04539037, "balance_loss_mlp": 1.01779115, "epoch": 0.6684302290627067, "flos": 22996977868800.0, "grad_norm": 3.091845274030746, "language_loss": 0.75682026, "learning_rate": 1.046447010251821e-06, "loss": 0.7781871, "num_input_tokens_seen": 119655850, "step": 5559, "time_per_iteration": 2.723128318786621 }, { "auxiliary_loss_clip": 0.01144968, "auxiliary_loss_mlp": 0.01031481, "balance_loss_clip": 1.05308986, "balance_loss_mlp": 1.0244329, "epoch": 0.6685504719533457, "flos": 26573824247040.0, "grad_norm": 1.7382178006987539, "language_loss": 0.75714946, "learning_rate": 1.0457623473876157e-06, "loss": 0.77891392, "num_input_tokens_seen": 119675355, "step": 5560, "time_per_iteration": 2.695730447769165 }, { "auxiliary_loss_clip": 0.01173447, "auxiliary_loss_mlp": 0.01026985, "balance_loss_clip": 1.0525918, "balance_loss_mlp": 1.01952863, "epoch": 0.6686707148439849, "flos": 28986087870720.0, "grad_norm": 1.8718106463767832, "language_loss": 0.71106595, "learning_rate": 1.0450778292805046e-06, "loss": 0.73307025, "num_input_tokens_seen": 119695340, "step": 5561, "time_per_iteration": 2.652923345565796 }, { "auxiliary_loss_clip": 0.01163753, "auxiliary_loss_mlp": 0.01024188, "balance_loss_clip": 1.05173111, "balance_loss_mlp": 1.01660049, "epoch": 0.6687909577346239, "flos": 23623152687360.0, "grad_norm": 1.6161896764259673, "language_loss": 0.78503996, "learning_rate": 1.0443934560343267e-06, "loss": 0.80691934, "num_input_tokens_seen": 119716750, "step": 5562, "time_per_iteration": 2.6982061862945557 }, { "auxiliary_loss_clip": 0.01115875, "auxiliary_loss_mlp": 0.01026551, "balance_loss_clip": 1.04645061, "balance_loss_mlp": 1.01862919, "epoch": 0.668911200625263, "flos": 23148593176320.0, "grad_norm": 2.4354179203251296, "language_loss": 0.78016311, "learning_rate": 1.0437092277529034e-06, "loss": 0.80158734, "num_input_tokens_seen": 119736005, "step": 5563, "time_per_iteration": 2.6570911407470703 }, { "auxiliary_loss_clip": 0.01140604, "auxiliary_loss_mlp": 0.01028427, "balance_loss_clip": 1.04988241, "balance_loss_mlp": 1.02106547, "epoch": 0.6690314435159022, "flos": 18551919853440.0, "grad_norm": 2.320928748379339, "language_loss": 0.73282284, "learning_rate": 1.0430251445400292e-06, "loss": 0.75451314, "num_input_tokens_seen": 119754050, "step": 5564, "time_per_iteration": 2.6468918323516846 }, { "auxiliary_loss_clip": 0.01062664, "auxiliary_loss_mlp": 0.01031292, "balance_loss_clip": 1.04145288, "balance_loss_mlp": 1.02401185, "epoch": 0.6691516864065412, "flos": 31759540704000.0, "grad_norm": 3.5575222410070504, "language_loss": 0.62431902, "learning_rate": 1.0423412064994787e-06, "loss": 0.64525855, "num_input_tokens_seen": 119774820, "step": 5565, "time_per_iteration": 2.987905502319336 }, { "auxiliary_loss_clip": 0.01126281, "auxiliary_loss_mlp": 0.01025893, "balance_loss_clip": 1.04590988, "balance_loss_mlp": 1.01931858, "epoch": 0.6692719292971803, "flos": 34933864296960.0, "grad_norm": 2.0861440125998283, "language_loss": 0.73600399, "learning_rate": 1.0416574137350064e-06, "loss": 0.75752574, "num_input_tokens_seen": 119795525, "step": 5566, "time_per_iteration": 2.9791576862335205 }, { "auxiliary_loss_clip": 0.01151986, "auxiliary_loss_mlp": 0.01025392, "balance_loss_clip": 1.04937148, "balance_loss_mlp": 1.01814103, "epoch": 0.6693921721878194, "flos": 20449188230400.0, "grad_norm": 2.4286641809966953, "language_loss": 0.80832845, "learning_rate": 1.0409737663503428e-06, "loss": 0.83010226, "num_input_tokens_seen": 119813905, "step": 5567, "time_per_iteration": 2.668374538421631 }, { "auxiliary_loss_clip": 0.01155592, "auxiliary_loss_mlp": 0.01024691, "balance_loss_clip": 1.04764676, "balance_loss_mlp": 1.01699305, "epoch": 0.6695124150784585, "flos": 16614538963200.0, "grad_norm": 8.145462417362372, "language_loss": 0.82806993, "learning_rate": 1.040290264449196e-06, "loss": 0.84987277, "num_input_tokens_seen": 119832010, "step": 5568, "time_per_iteration": 2.6272079944610596 }, { "auxiliary_loss_clip": 0.01155162, "auxiliary_loss_mlp": 0.01022404, "balance_loss_clip": 1.05190074, "balance_loss_mlp": 1.01509929, "epoch": 0.6696326579690975, "flos": 26652145852800.0, "grad_norm": 2.9072461281252595, "language_loss": 0.6367681, "learning_rate": 1.0396069081352532e-06, "loss": 0.65854383, "num_input_tokens_seen": 119851165, "step": 5569, "time_per_iteration": 2.605198383331299 }, { "auxiliary_loss_clip": 0.01085505, "auxiliary_loss_mlp": 0.0100311, "balance_loss_clip": 1.03014481, "balance_loss_mlp": 1.00198317, "epoch": 0.6697529008597367, "flos": 66964603662720.0, "grad_norm": 0.786285232866036, "language_loss": 0.56006014, "learning_rate": 1.0389236975121782e-06, "loss": 0.58094633, "num_input_tokens_seen": 119906015, "step": 5570, "time_per_iteration": 4.214010953903198 }, { "auxiliary_loss_clip": 0.01176326, "auxiliary_loss_mlp": 0.01030931, "balance_loss_clip": 1.0526042, "balance_loss_mlp": 1.02293181, "epoch": 0.6698731437503758, "flos": 20886939279360.0, "grad_norm": 2.069718722345505, "language_loss": 0.71260965, "learning_rate": 1.0382406326836147e-06, "loss": 0.7346822, "num_input_tokens_seen": 119925160, "step": 5571, "time_per_iteration": 3.557300090789795 }, { "auxiliary_loss_clip": 0.01163481, "auxiliary_loss_mlp": 0.01023051, "balance_loss_clip": 1.05156898, "balance_loss_mlp": 1.01533806, "epoch": 0.6699933866410148, "flos": 20409470766720.0, "grad_norm": 2.1332775177542227, "language_loss": 0.76561153, "learning_rate": 1.0375577137531828e-06, "loss": 0.7874769, "num_input_tokens_seen": 119943720, "step": 5572, "time_per_iteration": 3.572758436203003 }, { "auxiliary_loss_clip": 0.01144732, "auxiliary_loss_mlp": 0.01028191, "balance_loss_clip": 1.04824293, "balance_loss_mlp": 1.02117574, "epoch": 0.670113629531654, "flos": 29023075900800.0, "grad_norm": 1.6580644713770125, "language_loss": 0.71945792, "learning_rate": 1.0368749408244802e-06, "loss": 0.74118716, "num_input_tokens_seen": 119966640, "step": 5573, "time_per_iteration": 3.6462888717651367 }, { "auxiliary_loss_clip": 0.01152574, "auxiliary_loss_mlp": 0.01026327, "balance_loss_clip": 1.05087352, "balance_loss_mlp": 1.0194037, "epoch": 0.670233872422293, "flos": 19791699730560.0, "grad_norm": 2.4853190182381795, "language_loss": 0.79148495, "learning_rate": 1.0361923140010836e-06, "loss": 0.81327391, "num_input_tokens_seen": 119985125, "step": 5574, "time_per_iteration": 2.608527183532715 }, { "auxiliary_loss_clip": 0.01163017, "auxiliary_loss_mlp": 0.01029126, "balance_loss_clip": 1.05078793, "balance_loss_mlp": 1.02119553, "epoch": 0.6703541153129321, "flos": 24243689070720.0, "grad_norm": 2.1838121633210443, "language_loss": 0.63344687, "learning_rate": 1.0355098333865455e-06, "loss": 0.65536833, "num_input_tokens_seen": 120004355, "step": 5575, "time_per_iteration": 2.5871829986572266 }, { "auxiliary_loss_clip": 0.01155391, "auxiliary_loss_mlp": 0.01029622, "balance_loss_clip": 1.05307376, "balance_loss_mlp": 1.02257991, "epoch": 0.6704743582035713, "flos": 26688523351680.0, "grad_norm": 2.470104823818211, "language_loss": 0.69526124, "learning_rate": 1.0348274990844006e-06, "loss": 0.71711141, "num_input_tokens_seen": 120027115, "step": 5576, "time_per_iteration": 2.721935510635376 }, { "auxiliary_loss_clip": 0.01157339, "auxiliary_loss_mlp": 0.01025789, "balance_loss_clip": 1.04974449, "balance_loss_mlp": 1.01871991, "epoch": 0.6705946010942103, "flos": 23514379326720.0, "grad_norm": 2.236351966163519, "language_loss": 0.72475839, "learning_rate": 1.034145311198155e-06, "loss": 0.74658966, "num_input_tokens_seen": 120047130, "step": 5577, "time_per_iteration": 2.599351167678833 }, { "auxiliary_loss_clip": 0.01173041, "auxiliary_loss_mlp": 0.01022935, "balance_loss_clip": 1.05241752, "balance_loss_mlp": 1.01631021, "epoch": 0.6707148439848494, "flos": 24061011477120.0, "grad_norm": 1.9157637032406571, "language_loss": 0.64271337, "learning_rate": 1.0334632698312989e-06, "loss": 0.66467309, "num_input_tokens_seen": 120067925, "step": 5578, "time_per_iteration": 2.696781873703003 }, { "auxiliary_loss_clip": 0.0113497, "auxiliary_loss_mlp": 0.01026183, "balance_loss_clip": 1.04753995, "balance_loss_mlp": 1.0186733, "epoch": 0.6708350868754885, "flos": 22528667324160.0, "grad_norm": 2.8642928336243543, "language_loss": 0.75537193, "learning_rate": 1.032781375087295e-06, "loss": 0.7769835, "num_input_tokens_seen": 120087825, "step": 5579, "time_per_iteration": 2.675222635269165 }, { "auxiliary_loss_clip": 0.0114682, "auxiliary_loss_mlp": 0.0102005, "balance_loss_clip": 1.05232096, "balance_loss_mlp": 1.01281404, "epoch": 0.6709553297661276, "flos": 25227749047680.0, "grad_norm": 1.6037104985206554, "language_loss": 0.67540121, "learning_rate": 1.0320996270695891e-06, "loss": 0.69706994, "num_input_tokens_seen": 120108895, "step": 5580, "time_per_iteration": 2.650660276412964 }, { "auxiliary_loss_clip": 0.01125221, "auxiliary_loss_mlp": 0.01024867, "balance_loss_clip": 1.04521048, "balance_loss_mlp": 1.0174222, "epoch": 0.6710755726567667, "flos": 20448757267200.0, "grad_norm": 1.992290599341529, "language_loss": 0.73302937, "learning_rate": 1.0314180258815998e-06, "loss": 0.75453025, "num_input_tokens_seen": 120127535, "step": 5581, "time_per_iteration": 2.6943604946136475 }, { "auxiliary_loss_clip": 0.01118638, "auxiliary_loss_mlp": 0.01023743, "balance_loss_clip": 1.04513788, "balance_loss_mlp": 1.01659656, "epoch": 0.6711958155474057, "flos": 25995411538560.0, "grad_norm": 2.3730673806541436, "language_loss": 0.74455422, "learning_rate": 1.0307365716267247e-06, "loss": 0.7659781, "num_input_tokens_seen": 120147980, "step": 5582, "time_per_iteration": 2.6869261264801025 }, { "auxiliary_loss_clip": 0.01157731, "auxiliary_loss_mlp": 0.01029441, "balance_loss_clip": 1.05104136, "balance_loss_mlp": 1.02191341, "epoch": 0.6713160584380449, "flos": 19937712516480.0, "grad_norm": 2.61591370573048, "language_loss": 0.7786476, "learning_rate": 1.0300552644083423e-06, "loss": 0.80051935, "num_input_tokens_seen": 120166905, "step": 5583, "time_per_iteration": 2.6061689853668213 }, { "auxiliary_loss_clip": 0.0113298, "auxiliary_loss_mlp": 0.01025185, "balance_loss_clip": 1.04970825, "balance_loss_mlp": 1.01758504, "epoch": 0.6714363013286839, "flos": 18223373128320.0, "grad_norm": 7.788677517487601, "language_loss": 0.72471017, "learning_rate": 1.0293741043298036e-06, "loss": 0.74629176, "num_input_tokens_seen": 120185255, "step": 5584, "time_per_iteration": 2.632523775100708 }, { "auxiliary_loss_clip": 0.01130364, "auxiliary_loss_mlp": 0.01028058, "balance_loss_clip": 1.05071449, "balance_loss_mlp": 1.02065551, "epoch": 0.671556544219323, "flos": 25812374808960.0, "grad_norm": 2.8582230855620483, "language_loss": 0.7161516, "learning_rate": 1.0286930914944436e-06, "loss": 0.73773581, "num_input_tokens_seen": 120205070, "step": 5585, "time_per_iteration": 2.724353075027466 }, { "auxiliary_loss_clip": 0.01172419, "auxiliary_loss_mlp": 0.01020475, "balance_loss_clip": 1.04872537, "balance_loss_mlp": 1.01329815, "epoch": 0.6716767871099621, "flos": 15850431918720.0, "grad_norm": 3.0326469993315723, "language_loss": 0.7744143, "learning_rate": 1.0280122260055684e-06, "loss": 0.79634321, "num_input_tokens_seen": 120220780, "step": 5586, "time_per_iteration": 2.5523478984832764 }, { "auxiliary_loss_clip": 0.01177732, "auxiliary_loss_mlp": 0.01028001, "balance_loss_clip": 1.05391312, "balance_loss_mlp": 1.02094734, "epoch": 0.6717970300006012, "flos": 19756112330880.0, "grad_norm": 3.956827377382085, "language_loss": 0.82023579, "learning_rate": 1.0273315079664652e-06, "loss": 0.84229314, "num_input_tokens_seen": 120238735, "step": 5587, "time_per_iteration": 2.5775203704833984 }, { "auxiliary_loss_clip": 0.01158099, "auxiliary_loss_mlp": 0.01024707, "balance_loss_clip": 1.0488162, "balance_loss_mlp": 1.01771283, "epoch": 0.6719172728912403, "flos": 25485049146240.0, "grad_norm": 2.0935738521435363, "language_loss": 0.74380422, "learning_rate": 1.0266509374803992e-06, "loss": 0.76563227, "num_input_tokens_seen": 120259895, "step": 5588, "time_per_iteration": 2.593308925628662 }, { "auxiliary_loss_clip": 0.01175506, "auxiliary_loss_mlp": 0.00711924, "balance_loss_clip": 1.05216825, "balance_loss_mlp": 1.00055599, "epoch": 0.6720375157818794, "flos": 15880344969600.0, "grad_norm": 3.2717099534028367, "language_loss": 0.84345734, "learning_rate": 1.0259705146506123e-06, "loss": 0.86233163, "num_input_tokens_seen": 120274790, "step": 5589, "time_per_iteration": 2.6126439571380615 }, { "auxiliary_loss_clip": 0.01161397, "auxiliary_loss_mlp": 0.01028167, "balance_loss_clip": 1.05125737, "balance_loss_mlp": 1.02088928, "epoch": 0.6721577586725185, "flos": 32010843231360.0, "grad_norm": 2.5400281434814884, "language_loss": 0.7766608, "learning_rate": 1.025290239580324e-06, "loss": 0.79855645, "num_input_tokens_seen": 120295460, "step": 5590, "time_per_iteration": 2.675673007965088 }, { "auxiliary_loss_clip": 0.01109376, "auxiliary_loss_mlp": 0.01027056, "balance_loss_clip": 1.04313672, "balance_loss_mlp": 1.0199039, "epoch": 0.6722780015631575, "flos": 20737873837440.0, "grad_norm": 3.9293961740028323, "language_loss": 0.75624323, "learning_rate": 1.0246101123727313e-06, "loss": 0.77760756, "num_input_tokens_seen": 120314440, "step": 5591, "time_per_iteration": 2.685418128967285 }, { "auxiliary_loss_clip": 0.01159669, "auxiliary_loss_mlp": 0.01025738, "balance_loss_clip": 1.0512116, "balance_loss_mlp": 1.01891959, "epoch": 0.6723982444537967, "flos": 16909617191040.0, "grad_norm": 4.965480489745478, "language_loss": 0.78637677, "learning_rate": 1.0239301331310085e-06, "loss": 0.80823088, "num_input_tokens_seen": 120332060, "step": 5592, "time_per_iteration": 2.5389316082000732 }, { "auxiliary_loss_clip": 0.01158263, "auxiliary_loss_mlp": 0.01027187, "balance_loss_clip": 1.05100727, "balance_loss_mlp": 1.01968539, "epoch": 0.6725184873444358, "flos": 20667812359680.0, "grad_norm": 1.7021612764483365, "language_loss": 0.88506794, "learning_rate": 1.0232503019583088e-06, "loss": 0.9069224, "num_input_tokens_seen": 120351670, "step": 5593, "time_per_iteration": 2.6390292644500732 }, { "auxiliary_loss_clip": 0.01154868, "auxiliary_loss_mlp": 0.01027677, "balance_loss_clip": 1.0504806, "balance_loss_mlp": 1.02017856, "epoch": 0.6726387302350748, "flos": 23727616416000.0, "grad_norm": 2.9169439563532773, "language_loss": 0.69808894, "learning_rate": 1.0225706189577619e-06, "loss": 0.71991438, "num_input_tokens_seen": 120370195, "step": 5594, "time_per_iteration": 2.5918877124786377 }, { "auxiliary_loss_clip": 0.01160951, "auxiliary_loss_mlp": 0.01026938, "balance_loss_clip": 1.05188107, "balance_loss_mlp": 1.01957142, "epoch": 0.672758973125714, "flos": 15188274650880.0, "grad_norm": 5.7443344831355745, "language_loss": 0.74731827, "learning_rate": 1.021891084232475e-06, "loss": 0.76919711, "num_input_tokens_seen": 120388130, "step": 5595, "time_per_iteration": 2.566408157348633 }, { "auxiliary_loss_clip": 0.01157791, "auxiliary_loss_mlp": 0.01023127, "balance_loss_clip": 1.04906869, "balance_loss_mlp": 1.01534581, "epoch": 0.672879216016353, "flos": 18077252601600.0, "grad_norm": 2.5866438270576464, "language_loss": 0.79883873, "learning_rate": 1.0212116978855325e-06, "loss": 0.8206479, "num_input_tokens_seen": 120406145, "step": 5596, "time_per_iteration": 3.4823594093322754 }, { "auxiliary_loss_clip": 0.01121377, "auxiliary_loss_mlp": 0.01021578, "balance_loss_clip": 1.04633331, "balance_loss_mlp": 1.01438046, "epoch": 0.6729994589069921, "flos": 23476349802240.0, "grad_norm": 2.1064019855362477, "language_loss": 0.78954959, "learning_rate": 1.020532460019997e-06, "loss": 0.81097913, "num_input_tokens_seen": 120425395, "step": 5597, "time_per_iteration": 3.5530271530151367 }, { "auxiliary_loss_clip": 0.01082184, "auxiliary_loss_mlp": 0.0103334, "balance_loss_clip": 1.04321206, "balance_loss_mlp": 1.02605009, "epoch": 0.6731197017976313, "flos": 26322018929280.0, "grad_norm": 2.0502725662257797, "language_loss": 0.70853353, "learning_rate": 1.0198533707389096e-06, "loss": 0.72968876, "num_input_tokens_seen": 120446270, "step": 5598, "time_per_iteration": 3.799898147583008 }, { "auxiliary_loss_clip": 0.01155211, "auxiliary_loss_mlp": 0.00711381, "balance_loss_clip": 1.05045295, "balance_loss_mlp": 1.00055897, "epoch": 0.6732399446882703, "flos": 21616428591360.0, "grad_norm": 2.602487068598723, "language_loss": 0.73317319, "learning_rate": 1.0191744301452853e-06, "loss": 0.75183916, "num_input_tokens_seen": 120465570, "step": 5599, "time_per_iteration": 3.680581569671631 }, { "auxiliary_loss_clip": 0.01171632, "auxiliary_loss_mlp": 0.01025762, "balance_loss_clip": 1.04998016, "balance_loss_mlp": 1.018803, "epoch": 0.6733601875789094, "flos": 25880173729920.0, "grad_norm": 1.8214104696741045, "language_loss": 0.70364249, "learning_rate": 1.0184956383421208e-06, "loss": 0.7256164, "num_input_tokens_seen": 120484220, "step": 5600, "time_per_iteration": 2.593773603439331 }, { "auxiliary_loss_clip": 0.01160967, "auxiliary_loss_mlp": 0.01028598, "balance_loss_clip": 1.05120015, "balance_loss_mlp": 1.02148974, "epoch": 0.6734804304695485, "flos": 22929573997440.0, "grad_norm": 2.6608248880438987, "language_loss": 0.65990925, "learning_rate": 1.017816995432387e-06, "loss": 0.6818049, "num_input_tokens_seen": 120503320, "step": 5601, "time_per_iteration": 2.6291520595550537 }, { "auxiliary_loss_clip": 0.01139805, "auxiliary_loss_mlp": 0.01029186, "balance_loss_clip": 1.04702961, "balance_loss_mlp": 1.02143431, "epoch": 0.6736006733601876, "flos": 18697968552960.0, "grad_norm": 2.2077782946563342, "language_loss": 0.74566984, "learning_rate": 1.0171385015190353e-06, "loss": 0.76735973, "num_input_tokens_seen": 120523180, "step": 5602, "time_per_iteration": 2.595607280731201 }, { "auxiliary_loss_clip": 0.01135499, "auxiliary_loss_mlp": 0.00711108, "balance_loss_clip": 1.04826951, "balance_loss_mlp": 1.00049162, "epoch": 0.6737209162508266, "flos": 19427745173760.0, "grad_norm": 1.9419033981880742, "language_loss": 0.73357666, "learning_rate": 1.0164601567049908e-06, "loss": 0.75204277, "num_input_tokens_seen": 120541710, "step": 5603, "time_per_iteration": 2.63287091255188 }, { "auxiliary_loss_clip": 0.01141693, "auxiliary_loss_mlp": 0.01028181, "balance_loss_clip": 1.04982567, "balance_loss_mlp": 1.02093339, "epoch": 0.6738411591414658, "flos": 20158060498560.0, "grad_norm": 2.3616143934997713, "language_loss": 0.80413663, "learning_rate": 1.015781961093158e-06, "loss": 0.82583535, "num_input_tokens_seen": 120561030, "step": 5604, "time_per_iteration": 2.6152796745300293 }, { "auxiliary_loss_clip": 0.01145733, "auxiliary_loss_mlp": 0.01026603, "balance_loss_clip": 1.04678094, "balance_loss_mlp": 1.0194447, "epoch": 0.6739614020321049, "flos": 21653847584640.0, "grad_norm": 1.6741227541468287, "language_loss": 0.77074236, "learning_rate": 1.0151039147864197e-06, "loss": 0.79246569, "num_input_tokens_seen": 120581005, "step": 5605, "time_per_iteration": 2.6582255363464355 }, { "auxiliary_loss_clip": 0.01076836, "auxiliary_loss_mlp": 0.01030711, "balance_loss_clip": 1.04794359, "balance_loss_mlp": 1.02345419, "epoch": 0.6740816449227439, "flos": 19171702051200.0, "grad_norm": 2.112273190944843, "language_loss": 0.66106308, "learning_rate": 1.0144260178876336e-06, "loss": 0.68213856, "num_input_tokens_seen": 120600350, "step": 5606, "time_per_iteration": 2.7831943035125732 }, { "auxiliary_loss_clip": 0.01149471, "auxiliary_loss_mlp": 0.01033522, "balance_loss_clip": 1.04925752, "balance_loss_mlp": 1.02640796, "epoch": 0.6742018878133831, "flos": 21097015971840.0, "grad_norm": 2.3710324203570585, "language_loss": 0.67946333, "learning_rate": 1.0137482704996388e-06, "loss": 0.70129323, "num_input_tokens_seen": 120614700, "step": 5607, "time_per_iteration": 2.6301827430725098 }, { "auxiliary_loss_clip": 0.01131431, "auxiliary_loss_mlp": 0.01025987, "balance_loss_clip": 1.04888654, "balance_loss_mlp": 1.01849496, "epoch": 0.6743221307040221, "flos": 23549966726400.0, "grad_norm": 2.2263879842906658, "language_loss": 0.79164827, "learning_rate": 1.0130706727252461e-06, "loss": 0.81322253, "num_input_tokens_seen": 120631755, "step": 5608, "time_per_iteration": 2.715529203414917 }, { "auxiliary_loss_clip": 0.01131495, "auxiliary_loss_mlp": 0.01026141, "balance_loss_clip": 1.04791367, "balance_loss_mlp": 1.0188334, "epoch": 0.6744423735946612, "flos": 16249542912000.0, "grad_norm": 2.448920977649567, "language_loss": 0.68311709, "learning_rate": 1.0123932246672468e-06, "loss": 0.70469344, "num_input_tokens_seen": 120645900, "step": 5609, "time_per_iteration": 2.6128838062286377 }, { "auxiliary_loss_clip": 0.01034423, "auxiliary_loss_mlp": 0.00701769, "balance_loss_clip": 1.02730083, "balance_loss_mlp": 1.00008821, "epoch": 0.6745626164853004, "flos": 57843257829120.0, "grad_norm": 0.7501131608996664, "language_loss": 0.55812061, "learning_rate": 1.0117159264284114e-06, "loss": 0.57548255, "num_input_tokens_seen": 120709070, "step": 5610, "time_per_iteration": 3.2262797355651855 }, { "auxiliary_loss_clip": 0.0114186, "auxiliary_loss_mlp": 0.01027651, "balance_loss_clip": 1.04810643, "balance_loss_mlp": 1.02024245, "epoch": 0.6746828593759394, "flos": 20485027025280.0, "grad_norm": 1.8411850197596127, "language_loss": 0.77125907, "learning_rate": 1.0110387781114837e-06, "loss": 0.79295409, "num_input_tokens_seen": 120727685, "step": 5611, "time_per_iteration": 2.699007511138916 }, { "auxiliary_loss_clip": 0.01173827, "auxiliary_loss_mlp": 0.01027334, "balance_loss_clip": 1.05223632, "balance_loss_mlp": 1.01938844, "epoch": 0.6748031022665785, "flos": 19208223204480.0, "grad_norm": 2.7431939831734296, "language_loss": 0.77294779, "learning_rate": 1.0103617798191872e-06, "loss": 0.79495943, "num_input_tokens_seen": 120747160, "step": 5612, "time_per_iteration": 2.581284999847412 }, { "auxiliary_loss_clip": 0.01138976, "auxiliary_loss_mlp": 0.01027644, "balance_loss_clip": 1.04966807, "balance_loss_mlp": 1.02029479, "epoch": 0.6749233451572175, "flos": 15195026407680.0, "grad_norm": 2.3376674215130895, "language_loss": 0.8279258, "learning_rate": 1.0096849316542217e-06, "loss": 0.84959203, "num_input_tokens_seen": 120763710, "step": 5613, "time_per_iteration": 2.607295513153076 }, { "auxiliary_loss_clip": 0.01064378, "auxiliary_loss_mlp": 0.0102442, "balance_loss_clip": 1.03946042, "balance_loss_mlp": 1.01708817, "epoch": 0.6750435880478567, "flos": 26499489050880.0, "grad_norm": 2.066680845834836, "language_loss": 0.7495364, "learning_rate": 1.0090082337192643e-06, "loss": 0.77042437, "num_input_tokens_seen": 120783355, "step": 5614, "time_per_iteration": 2.766697406768799 }, { "auxiliary_loss_clip": 0.01088237, "auxiliary_loss_mlp": 0.0102701, "balance_loss_clip": 1.04106998, "balance_loss_mlp": 1.01967907, "epoch": 0.6751638309384957, "flos": 23404313076480.0, "grad_norm": 2.4939285748286095, "language_loss": 0.78543413, "learning_rate": 1.0083316861169705e-06, "loss": 0.80658662, "num_input_tokens_seen": 120802090, "step": 5615, "time_per_iteration": 2.730135679244995 }, { "auxiliary_loss_clip": 0.01132748, "auxiliary_loss_mlp": 0.01028857, "balance_loss_clip": 1.04598463, "balance_loss_mlp": 1.02112007, "epoch": 0.6752840738291348, "flos": 23441408847360.0, "grad_norm": 3.1028930229179164, "language_loss": 0.72002041, "learning_rate": 1.0076552889499713e-06, "loss": 0.74163651, "num_input_tokens_seen": 120822855, "step": 5616, "time_per_iteration": 2.727837562561035 }, { "auxiliary_loss_clip": 0.01159316, "auxiliary_loss_mlp": 0.01025099, "balance_loss_clip": 1.05295277, "balance_loss_mlp": 1.01836646, "epoch": 0.675404316719774, "flos": 30335826257280.0, "grad_norm": 2.0040633317365333, "language_loss": 0.73712784, "learning_rate": 1.006979042320876e-06, "loss": 0.75897199, "num_input_tokens_seen": 120843070, "step": 5617, "time_per_iteration": 2.6924233436584473 }, { "auxiliary_loss_clip": 0.01137574, "auxiliary_loss_mlp": 0.01025966, "balance_loss_clip": 1.04599166, "balance_loss_mlp": 1.0186646, "epoch": 0.675524559610413, "flos": 23622613983360.0, "grad_norm": 3.834410162154862, "language_loss": 0.6322093, "learning_rate": 1.0063029463322702e-06, "loss": 0.65384465, "num_input_tokens_seen": 120863345, "step": 5618, "time_per_iteration": 2.660435438156128 }, { "auxiliary_loss_clip": 0.01100917, "auxiliary_loss_mlp": 0.00711205, "balance_loss_clip": 1.04027486, "balance_loss_mlp": 1.00047517, "epoch": 0.6756448025010521, "flos": 21248631279360.0, "grad_norm": 2.6664740427440066, "language_loss": 0.75280368, "learning_rate": 1.0056270010867164e-06, "loss": 0.77092493, "num_input_tokens_seen": 120880915, "step": 5619, "time_per_iteration": 2.6900269985198975 }, { "auxiliary_loss_clip": 0.0114257, "auxiliary_loss_mlp": 0.01024248, "balance_loss_clip": 1.0450474, "balance_loss_mlp": 1.01701546, "epoch": 0.6757650453916912, "flos": 21646521210240.0, "grad_norm": 2.590467425521181, "language_loss": 0.78502917, "learning_rate": 1.004951206686758e-06, "loss": 0.80669737, "num_input_tokens_seen": 120899190, "step": 5620, "time_per_iteration": 2.6034886837005615 }, { "auxiliary_loss_clip": 0.01154262, "auxiliary_loss_mlp": 0.01025502, "balance_loss_clip": 1.05025375, "balance_loss_mlp": 1.01795578, "epoch": 0.6758852882823303, "flos": 21795658479360.0, "grad_norm": 3.1858689063408243, "language_loss": 0.71799481, "learning_rate": 1.0042755632349087e-06, "loss": 0.73979247, "num_input_tokens_seen": 120916080, "step": 5621, "time_per_iteration": 2.61868953704834 }, { "auxiliary_loss_clip": 0.01127372, "auxiliary_loss_mlp": 0.01029092, "balance_loss_clip": 1.0481894, "balance_loss_mlp": 1.02159703, "epoch": 0.6760055311729694, "flos": 27088783580160.0, "grad_norm": 3.2688702932075, "language_loss": 0.6269052, "learning_rate": 1.0036000708336653e-06, "loss": 0.64846981, "num_input_tokens_seen": 120935210, "step": 5622, "time_per_iteration": 4.6009345054626465 }, { "auxiliary_loss_clip": 0.01145834, "auxiliary_loss_mlp": 0.01027163, "balance_loss_clip": 1.05121267, "balance_loss_mlp": 1.01938462, "epoch": 0.6761257740636085, "flos": 17999792922240.0, "grad_norm": 2.3803490813170365, "language_loss": 0.79717934, "learning_rate": 1.0029247295854984e-06, "loss": 0.81890935, "num_input_tokens_seen": 120951830, "step": 5623, "time_per_iteration": 3.5281002521514893 }, { "auxiliary_loss_clip": 0.01134156, "auxiliary_loss_mlp": 0.01030046, "balance_loss_clip": 1.05072784, "balance_loss_mlp": 1.02348924, "epoch": 0.6762460169542476, "flos": 15121912273920.0, "grad_norm": 2.086694989466932, "language_loss": 0.71807551, "learning_rate": 1.0022495395928588e-06, "loss": 0.7397176, "num_input_tokens_seen": 120970310, "step": 5624, "time_per_iteration": 2.660083770751953 }, { "auxiliary_loss_clip": 0.01083073, "auxiliary_loss_mlp": 0.01003254, "balance_loss_clip": 1.02768421, "balance_loss_mlp": 1.00209188, "epoch": 0.6763662598448866, "flos": 67886970030720.0, "grad_norm": 0.7902356931242901, "language_loss": 0.62369502, "learning_rate": 1.0015745009581697e-06, "loss": 0.64455825, "num_input_tokens_seen": 121031915, "step": 5625, "time_per_iteration": 4.207628965377808 }, { "auxiliary_loss_clip": 0.01156303, "auxiliary_loss_mlp": 0.01024227, "balance_loss_clip": 1.05099213, "balance_loss_mlp": 1.01697314, "epoch": 0.6764865027355258, "flos": 20631829910400.0, "grad_norm": 1.949137327421627, "language_loss": 0.67416555, "learning_rate": 1.0008996137838343e-06, "loss": 0.69597083, "num_input_tokens_seen": 121050890, "step": 5626, "time_per_iteration": 2.602876901626587 }, { "auxiliary_loss_clip": 0.01178611, "auxiliary_loss_mlp": 0.0102604, "balance_loss_clip": 1.05197513, "balance_loss_mlp": 1.0181067, "epoch": 0.6766067456261649, "flos": 21215809226880.0, "grad_norm": 2.5270152092036713, "language_loss": 0.80012739, "learning_rate": 1.000224878172234e-06, "loss": 0.82217395, "num_input_tokens_seen": 121070015, "step": 5627, "time_per_iteration": 2.5854592323303223 }, { "auxiliary_loss_clip": 0.01160795, "auxiliary_loss_mlp": 0.01025687, "balance_loss_clip": 1.05085647, "balance_loss_mlp": 1.01870179, "epoch": 0.6767269885168039, "flos": 19938251220480.0, "grad_norm": 2.4604613561181843, "language_loss": 0.73082733, "learning_rate": 9.99550294225724e-07, "loss": 0.75269216, "num_input_tokens_seen": 121089170, "step": 5628, "time_per_iteration": 2.5739262104034424 }, { "auxiliary_loss_clip": 0.01110392, "auxiliary_loss_mlp": 0.01024653, "balance_loss_clip": 1.04149866, "balance_loss_mlp": 1.01708341, "epoch": 0.6768472314074431, "flos": 20814076540800.0, "grad_norm": 2.0846082838232465, "language_loss": 0.7260685, "learning_rate": 9.988758620466402e-07, "loss": 0.74741888, "num_input_tokens_seen": 121108040, "step": 5629, "time_per_iteration": 2.7086782455444336 }, { "auxiliary_loss_clip": 0.0110033, "auxiliary_loss_mlp": 0.01022797, "balance_loss_clip": 1.04566622, "balance_loss_mlp": 1.01632094, "epoch": 0.6769674742980821, "flos": 23186012169600.0, "grad_norm": 1.8637502494188918, "language_loss": 0.76018572, "learning_rate": 9.982015817372917e-07, "loss": 0.78141701, "num_input_tokens_seen": 121128480, "step": 5630, "time_per_iteration": 2.794610023498535 }, { "auxiliary_loss_clip": 0.01109162, "auxiliary_loss_mlp": 0.01031093, "balance_loss_clip": 1.0442574, "balance_loss_mlp": 1.02358329, "epoch": 0.6770877171887212, "flos": 24242934885120.0, "grad_norm": 2.0135769501734435, "language_loss": 0.81765974, "learning_rate": 9.975274533999657e-07, "loss": 0.83906227, "num_input_tokens_seen": 121148010, "step": 5631, "time_per_iteration": 2.7324252128601074 }, { "auxiliary_loss_clip": 0.01174642, "auxiliary_loss_mlp": 0.01028141, "balance_loss_clip": 1.05138707, "balance_loss_mlp": 1.0202198, "epoch": 0.6772079600793603, "flos": 18141567903360.0, "grad_norm": 4.5193307016677515, "language_loss": 0.84373772, "learning_rate": 9.96853477136929e-07, "loss": 0.86576557, "num_input_tokens_seen": 121162755, "step": 5632, "time_per_iteration": 2.5611846446990967 }, { "auxiliary_loss_clip": 0.01117188, "auxiliary_loss_mlp": 0.01027261, "balance_loss_clip": 1.04314828, "balance_loss_mlp": 1.01960135, "epoch": 0.6773282029699994, "flos": 22452069571200.0, "grad_norm": 3.0955191086003766, "language_loss": 0.75156951, "learning_rate": 9.96179653050422e-07, "loss": 0.77301395, "num_input_tokens_seen": 121182915, "step": 5633, "time_per_iteration": 2.6996006965637207 }, { "auxiliary_loss_clip": 0.01116662, "auxiliary_loss_mlp": 0.01026961, "balance_loss_clip": 1.0446043, "balance_loss_mlp": 1.01984763, "epoch": 0.6774484458606385, "flos": 18693730748160.0, "grad_norm": 2.2079822564316443, "language_loss": 0.74119478, "learning_rate": 9.955059812426635e-07, "loss": 0.76263106, "num_input_tokens_seen": 121200445, "step": 5634, "time_per_iteration": 2.65122127532959 }, { "auxiliary_loss_clip": 0.01176374, "auxiliary_loss_mlp": 0.01024257, "balance_loss_clip": 1.05430841, "balance_loss_mlp": 1.01681817, "epoch": 0.6775686887512776, "flos": 25994046821760.0, "grad_norm": 3.2499408208267337, "language_loss": 0.83264959, "learning_rate": 9.948324618158493e-07, "loss": 0.85465598, "num_input_tokens_seen": 121220785, "step": 5635, "time_per_iteration": 2.624699592590332 }, { "auxiliary_loss_clip": 0.0115805, "auxiliary_loss_mlp": 0.01025261, "balance_loss_clip": 1.04810739, "balance_loss_mlp": 1.01774454, "epoch": 0.6776889316419167, "flos": 13587987922560.0, "grad_norm": 2.6388151161615747, "language_loss": 0.77644718, "learning_rate": 9.941590948721502e-07, "loss": 0.7982803, "num_input_tokens_seen": 121237985, "step": 5636, "time_per_iteration": 2.5511560440063477 }, { "auxiliary_loss_clip": 0.01135995, "auxiliary_loss_mlp": 0.01024468, "balance_loss_clip": 1.04858804, "balance_loss_mlp": 1.017658, "epoch": 0.6778091745325557, "flos": 27601121220480.0, "grad_norm": 2.123228964025193, "language_loss": 0.76385951, "learning_rate": 9.934858805137188e-07, "loss": 0.78546405, "num_input_tokens_seen": 121258635, "step": 5637, "time_per_iteration": 2.6741726398468018 }, { "auxiliary_loss_clip": 0.01154022, "auxiliary_loss_mlp": 0.01024773, "balance_loss_clip": 1.05009294, "balance_loss_mlp": 1.01782632, "epoch": 0.6779294174231949, "flos": 18734058743040.0, "grad_norm": 1.686947636231347, "language_loss": 0.80955362, "learning_rate": 9.92812818842677e-07, "loss": 0.83134151, "num_input_tokens_seen": 121277810, "step": 5638, "time_per_iteration": 2.6229302883148193 }, { "auxiliary_loss_clip": 0.0115442, "auxiliary_loss_mlp": 0.01024954, "balance_loss_clip": 1.04869556, "balance_loss_mlp": 1.01770294, "epoch": 0.678049660313834, "flos": 45873797765760.0, "grad_norm": 2.351270112240873, "language_loss": 0.63572836, "learning_rate": 9.921399099611306e-07, "loss": 0.65752208, "num_input_tokens_seen": 121298975, "step": 5639, "time_per_iteration": 2.819148302078247 }, { "auxiliary_loss_clip": 0.01142559, "auxiliary_loss_mlp": 0.01024833, "balance_loss_clip": 1.04798329, "balance_loss_mlp": 1.01766825, "epoch": 0.678169903204473, "flos": 19974556892160.0, "grad_norm": 1.85820095284178, "language_loss": 0.69130707, "learning_rate": 9.914671539711588e-07, "loss": 0.71298099, "num_input_tokens_seen": 121318495, "step": 5640, "time_per_iteration": 2.644986391067505 }, { "auxiliary_loss_clip": 0.0106737, "auxiliary_loss_mlp": 0.00712424, "balance_loss_clip": 1.04353249, "balance_loss_mlp": 1.00053155, "epoch": 0.6782901460951122, "flos": 21395613732480.0, "grad_norm": 2.535816692069073, "language_loss": 0.78456521, "learning_rate": 9.90794550974817e-07, "loss": 0.80236316, "num_input_tokens_seen": 121338890, "step": 5641, "time_per_iteration": 2.995584726333618 }, { "auxiliary_loss_clip": 0.01121146, "auxiliary_loss_mlp": 0.0102614, "balance_loss_clip": 1.04644537, "balance_loss_mlp": 1.01873112, "epoch": 0.6784103889857512, "flos": 21434002392960.0, "grad_norm": 2.6280980432466587, "language_loss": 0.81355602, "learning_rate": 9.901221010741407e-07, "loss": 0.83502889, "num_input_tokens_seen": 121358210, "step": 5642, "time_per_iteration": 2.83758807182312 }, { "auxiliary_loss_clip": 0.01161265, "auxiliary_loss_mlp": 0.0102794, "balance_loss_clip": 1.05101395, "balance_loss_mlp": 1.02013183, "epoch": 0.6785306318763903, "flos": 32671923091200.0, "grad_norm": 3.0723249857699133, "language_loss": 0.74657369, "learning_rate": 9.894498043711375e-07, "loss": 0.7684657, "num_input_tokens_seen": 121379955, "step": 5643, "time_per_iteration": 2.696161985397339 }, { "auxiliary_loss_clip": 0.01137673, "auxiliary_loss_mlp": 0.01023115, "balance_loss_clip": 1.04543245, "balance_loss_mlp": 1.01604867, "epoch": 0.6786508747670293, "flos": 25632139340160.0, "grad_norm": 2.433367322317129, "language_loss": 0.69579244, "learning_rate": 9.887776609677962e-07, "loss": 0.71740037, "num_input_tokens_seen": 121401325, "step": 5644, "time_per_iteration": 2.6581225395202637 }, { "auxiliary_loss_clip": 0.01116855, "auxiliary_loss_mlp": 0.01025925, "balance_loss_clip": 1.04430568, "balance_loss_mlp": 1.01891565, "epoch": 0.6787711176576685, "flos": 19171881619200.0, "grad_norm": 1.8134322621323258, "language_loss": 0.72526455, "learning_rate": 9.88105670966079e-07, "loss": 0.7466923, "num_input_tokens_seen": 121419785, "step": 5645, "time_per_iteration": 2.6167526245117188 }, { "auxiliary_loss_clip": 0.01096445, "auxiliary_loss_mlp": 0.01026704, "balance_loss_clip": 1.04409552, "balance_loss_mlp": 1.01930726, "epoch": 0.6788913605483076, "flos": 13985159581440.0, "grad_norm": 1.8434936044155878, "language_loss": 0.78990227, "learning_rate": 9.874338344679283e-07, "loss": 0.81113374, "num_input_tokens_seen": 121435630, "step": 5646, "time_per_iteration": 2.7363855838775635 }, { "auxiliary_loss_clip": 0.01172584, "auxiliary_loss_mlp": 0.010231, "balance_loss_clip": 1.05265713, "balance_loss_mlp": 1.01667154, "epoch": 0.6790116034389466, "flos": 22017586659840.0, "grad_norm": 2.0375021610234834, "language_loss": 0.74086338, "learning_rate": 9.86762151575259e-07, "loss": 0.76282024, "num_input_tokens_seen": 121455625, "step": 5647, "time_per_iteration": 2.54451584815979 }, { "auxiliary_loss_clip": 0.01115491, "auxiliary_loss_mlp": 0.00710615, "balance_loss_clip": 1.04898858, "balance_loss_mlp": 1.00050378, "epoch": 0.6791318463295858, "flos": 20922454851840.0, "grad_norm": 1.557283197907904, "language_loss": 0.80238503, "learning_rate": 9.860906223899651e-07, "loss": 0.82064617, "num_input_tokens_seen": 121475020, "step": 5648, "time_per_iteration": 3.9969019889831543 }, { "auxiliary_loss_clip": 0.01147458, "auxiliary_loss_mlp": 0.01029273, "balance_loss_clip": 1.04887319, "balance_loss_mlp": 1.02204871, "epoch": 0.6792520892202248, "flos": 28512749422080.0, "grad_norm": 1.6691079074599908, "language_loss": 0.75359988, "learning_rate": 9.854192470139184e-07, "loss": 0.77536714, "num_input_tokens_seen": 121496500, "step": 5649, "time_per_iteration": 4.528884172439575 }, { "auxiliary_loss_clip": 0.01145084, "auxiliary_loss_mlp": 0.01028045, "balance_loss_clip": 1.05259001, "balance_loss_mlp": 1.02063048, "epoch": 0.6793723321108639, "flos": 20011904058240.0, "grad_norm": 2.5903064749247577, "language_loss": 0.71869457, "learning_rate": 9.847480255489645e-07, "loss": 0.74042583, "num_input_tokens_seen": 121515525, "step": 5650, "time_per_iteration": 2.6776976585388184 }, { "auxiliary_loss_clip": 0.01145939, "auxiliary_loss_mlp": 0.0102676, "balance_loss_clip": 1.0480988, "balance_loss_mlp": 1.01970315, "epoch": 0.6794925750015031, "flos": 26649488246400.0, "grad_norm": 1.9485159862222619, "language_loss": 0.69433469, "learning_rate": 9.840769580969295e-07, "loss": 0.71606171, "num_input_tokens_seen": 121535965, "step": 5651, "time_per_iteration": 2.6657888889312744 }, { "auxiliary_loss_clip": 0.01152505, "auxiliary_loss_mlp": 0.0101972, "balance_loss_clip": 1.04998207, "balance_loss_mlp": 1.01221633, "epoch": 0.6796128178921421, "flos": 21580374314880.0, "grad_norm": 2.3507602686514204, "language_loss": 0.80474687, "learning_rate": 9.834060447596114e-07, "loss": 0.82646906, "num_input_tokens_seen": 121555235, "step": 5652, "time_per_iteration": 3.531498908996582 }, { "auxiliary_loss_clip": 0.01160599, "auxiliary_loss_mlp": 0.01026236, "balance_loss_clip": 1.05086815, "balance_loss_mlp": 1.01884508, "epoch": 0.6797330607827812, "flos": 22492002516480.0, "grad_norm": 1.8252056288725262, "language_loss": 0.78219593, "learning_rate": 9.827352856387868e-07, "loss": 0.80406427, "num_input_tokens_seen": 121574945, "step": 5653, "time_per_iteration": 2.5662636756896973 }, { "auxiliary_loss_clip": 0.01030365, "auxiliary_loss_mlp": 0.01008805, "balance_loss_clip": 1.02548909, "balance_loss_mlp": 1.00759506, "epoch": 0.6798533036734203, "flos": 66306648286080.0, "grad_norm": 0.7781468350734754, "language_loss": 0.64211285, "learning_rate": 9.820646808362118e-07, "loss": 0.66250455, "num_input_tokens_seen": 121641200, "step": 5654, "time_per_iteration": 3.4368398189544678 }, { "auxiliary_loss_clip": 0.01140609, "auxiliary_loss_mlp": 0.01034791, "balance_loss_clip": 1.05182195, "balance_loss_mlp": 1.0276618, "epoch": 0.6799735465640594, "flos": 16180163792640.0, "grad_norm": 2.6970982551018223, "language_loss": 0.72826362, "learning_rate": 9.813942304536154e-07, "loss": 0.75001758, "num_input_tokens_seen": 121659170, "step": 5655, "time_per_iteration": 2.749642848968506 }, { "auxiliary_loss_clip": 0.01144933, "auxiliary_loss_mlp": 0.01024044, "balance_loss_clip": 1.05064416, "balance_loss_mlp": 1.01665938, "epoch": 0.6800937894546984, "flos": 22125749489280.0, "grad_norm": 4.105589397996422, "language_loss": 0.63856542, "learning_rate": 9.807239345927043e-07, "loss": 0.66025519, "num_input_tokens_seen": 121679180, "step": 5656, "time_per_iteration": 2.6073222160339355 }, { "auxiliary_loss_clip": 0.01142717, "auxiliary_loss_mlp": 0.0102312, "balance_loss_clip": 1.04550767, "balance_loss_mlp": 1.01588678, "epoch": 0.6802140323453376, "flos": 31612953300480.0, "grad_norm": 2.309801443674913, "language_loss": 0.72069496, "learning_rate": 9.80053793355162e-07, "loss": 0.74235332, "num_input_tokens_seen": 121697875, "step": 5657, "time_per_iteration": 2.7225329875946045 }, { "auxiliary_loss_clip": 0.01107243, "auxiliary_loss_mlp": 0.01025847, "balance_loss_clip": 1.04506469, "balance_loss_mlp": 1.01852143, "epoch": 0.6803342752359767, "flos": 17712938908800.0, "grad_norm": 2.228067042474085, "language_loss": 0.74992752, "learning_rate": 9.793838068426472e-07, "loss": 0.77125835, "num_input_tokens_seen": 121715570, "step": 5658, "time_per_iteration": 2.64521861076355 }, { "auxiliary_loss_clip": 0.01174132, "auxiliary_loss_mlp": 0.01025667, "balance_loss_clip": 1.05273473, "balance_loss_mlp": 1.01841283, "epoch": 0.6804545181266157, "flos": 11326800902400.0, "grad_norm": 3.895544685515171, "language_loss": 0.61103165, "learning_rate": 9.78713975156799e-07, "loss": 0.63302964, "num_input_tokens_seen": 121731435, "step": 5659, "time_per_iteration": 2.568998098373413 }, { "auxiliary_loss_clip": 0.0112836, "auxiliary_loss_mlp": 0.01028982, "balance_loss_clip": 1.05027807, "balance_loss_mlp": 1.02135551, "epoch": 0.6805747610172549, "flos": 29350976181120.0, "grad_norm": 1.8151279301292937, "language_loss": 0.71683919, "learning_rate": 9.780442983992273e-07, "loss": 0.73841262, "num_input_tokens_seen": 121749950, "step": 5660, "time_per_iteration": 2.7067763805389404 }, { "auxiliary_loss_clip": 0.01139199, "auxiliary_loss_mlp": 0.01024169, "balance_loss_clip": 1.05095541, "balance_loss_mlp": 1.01664686, "epoch": 0.680695003907894, "flos": 37631868612480.0, "grad_norm": 2.1924223026728953, "language_loss": 0.71698457, "learning_rate": 9.773747766715238e-07, "loss": 0.73861825, "num_input_tokens_seen": 121770770, "step": 5661, "time_per_iteration": 2.7535762786865234 }, { "auxiliary_loss_clip": 0.01144416, "auxiliary_loss_mlp": 0.01028171, "balance_loss_clip": 1.0483458, "balance_loss_mlp": 1.02064335, "epoch": 0.680815246798533, "flos": 22127365601280.0, "grad_norm": 1.8612199599323909, "language_loss": 0.80425811, "learning_rate": 9.767054100752536e-07, "loss": 0.82598394, "num_input_tokens_seen": 121790720, "step": 5662, "time_per_iteration": 2.639341115951538 }, { "auxiliary_loss_clip": 0.01129683, "auxiliary_loss_mlp": 0.01023851, "balance_loss_clip": 1.0480473, "balance_loss_mlp": 1.01630473, "epoch": 0.6809354896891722, "flos": 17201822330880.0, "grad_norm": 2.100812914723218, "language_loss": 0.82052457, "learning_rate": 9.760361987119584e-07, "loss": 0.84205991, "num_input_tokens_seen": 121808455, "step": 5663, "time_per_iteration": 2.6494269371032715 }, { "auxiliary_loss_clip": 0.01141401, "auxiliary_loss_mlp": 0.01029836, "balance_loss_clip": 1.04930198, "balance_loss_mlp": 1.02238798, "epoch": 0.6810557325798112, "flos": 12458166554880.0, "grad_norm": 2.457105237439769, "language_loss": 0.67971057, "learning_rate": 9.753671426831592e-07, "loss": 0.70142293, "num_input_tokens_seen": 121824470, "step": 5664, "time_per_iteration": 2.5730082988739014 }, { "auxiliary_loss_clip": 0.01151607, "auxiliary_loss_mlp": 0.01024661, "balance_loss_clip": 1.04753923, "balance_loss_mlp": 1.01712704, "epoch": 0.6811759754704503, "flos": 22156165330560.0, "grad_norm": 1.8601737356802914, "language_loss": 0.7962153, "learning_rate": 9.746982420903483e-07, "loss": 0.81797796, "num_input_tokens_seen": 121842665, "step": 5665, "time_per_iteration": 2.668301820755005 }, { "auxiliary_loss_clip": 0.01153641, "auxiliary_loss_mlp": 0.01027606, "balance_loss_clip": 1.0495981, "balance_loss_mlp": 1.02116907, "epoch": 0.6812962183610894, "flos": 17525377065600.0, "grad_norm": 3.1551637873365586, "language_loss": 0.74958527, "learning_rate": 9.740294970349993e-07, "loss": 0.77139771, "num_input_tokens_seen": 121859080, "step": 5666, "time_per_iteration": 2.624943971633911 }, { "auxiliary_loss_clip": 0.01060478, "auxiliary_loss_mlp": 0.01001994, "balance_loss_clip": 1.02910471, "balance_loss_mlp": 1.00078964, "epoch": 0.6814164612517285, "flos": 60274480855680.0, "grad_norm": 0.8785644020126522, "language_loss": 0.60920334, "learning_rate": 9.733609076185594e-07, "loss": 0.6298281, "num_input_tokens_seen": 121915485, "step": 5667, "time_per_iteration": 3.125898599624634 }, { "auxiliary_loss_clip": 0.01160358, "auxiliary_loss_mlp": 0.0102969, "balance_loss_clip": 1.05235314, "balance_loss_mlp": 1.0218277, "epoch": 0.6815367041423676, "flos": 19317750750720.0, "grad_norm": 1.996998127231471, "language_loss": 0.83954644, "learning_rate": 9.72692473942455e-07, "loss": 0.86144692, "num_input_tokens_seen": 121932710, "step": 5668, "time_per_iteration": 2.5813419818878174 }, { "auxiliary_loss_clip": 0.01115801, "auxiliary_loss_mlp": 0.01031772, "balance_loss_clip": 1.0484612, "balance_loss_mlp": 1.02386296, "epoch": 0.6816569470330067, "flos": 22161696024960.0, "grad_norm": 1.7750045458901718, "language_loss": 0.7761243, "learning_rate": 9.720241961080849e-07, "loss": 0.79760003, "num_input_tokens_seen": 121952025, "step": 5669, "time_per_iteration": 2.6810996532440186 }, { "auxiliary_loss_clip": 0.01173692, "auxiliary_loss_mlp": 0.01024226, "balance_loss_clip": 1.05132151, "balance_loss_mlp": 1.01673365, "epoch": 0.6817771899236458, "flos": 41463501137280.0, "grad_norm": 2.5104675786411774, "language_loss": 0.73346478, "learning_rate": 9.713560742168259e-07, "loss": 0.75544393, "num_input_tokens_seen": 121974650, "step": 5670, "time_per_iteration": 2.7303924560546875 }, { "auxiliary_loss_clip": 0.01124737, "auxiliary_loss_mlp": 0.01025797, "balance_loss_clip": 1.04753053, "balance_loss_mlp": 1.01876926, "epoch": 0.6818974328142848, "flos": 21106138026240.0, "grad_norm": 2.619495941914291, "language_loss": 0.71727324, "learning_rate": 9.706881083700333e-07, "loss": 0.73877853, "num_input_tokens_seen": 121994335, "step": 5671, "time_per_iteration": 2.673774003982544 }, { "auxiliary_loss_clip": 0.010941, "auxiliary_loss_mlp": 0.01029031, "balance_loss_clip": 1.04794085, "balance_loss_mlp": 1.02115774, "epoch": 0.682017675704924, "flos": 20441897769600.0, "grad_norm": 2.3186712638723797, "language_loss": 0.82367474, "learning_rate": 9.700202986690357e-07, "loss": 0.84490609, "num_input_tokens_seen": 122012635, "step": 5672, "time_per_iteration": 2.80145263671875 }, { "auxiliary_loss_clip": 0.011556, "auxiliary_loss_mlp": 0.00711869, "balance_loss_clip": 1.04937232, "balance_loss_mlp": 1.00062466, "epoch": 0.682137918595563, "flos": 20044438801920.0, "grad_norm": 2.2342645528851452, "language_loss": 0.66885686, "learning_rate": 9.693526452151413e-07, "loss": 0.68753153, "num_input_tokens_seen": 122031685, "step": 5673, "time_per_iteration": 2.5790297985076904 }, { "auxiliary_loss_clip": 0.01131064, "auxiliary_loss_mlp": 0.01027887, "balance_loss_clip": 1.04691434, "balance_loss_mlp": 1.02016854, "epoch": 0.6822581614862021, "flos": 31684559063040.0, "grad_norm": 1.5376052838352536, "language_loss": 0.75226128, "learning_rate": 9.686851481096305e-07, "loss": 0.7738508, "num_input_tokens_seen": 122052995, "step": 5674, "time_per_iteration": 3.640028238296509 }, { "auxiliary_loss_clip": 0.01094166, "auxiliary_loss_mlp": 0.01024074, "balance_loss_clip": 1.04628992, "balance_loss_mlp": 1.01662087, "epoch": 0.6823784043768413, "flos": 23477570864640.0, "grad_norm": 2.0514557735006074, "language_loss": 0.72490191, "learning_rate": 9.68017807453762e-07, "loss": 0.74608433, "num_input_tokens_seen": 122071740, "step": 5675, "time_per_iteration": 3.710721969604492 }, { "auxiliary_loss_clip": 0.01145405, "auxiliary_loss_mlp": 0.0071128, "balance_loss_clip": 1.05144477, "balance_loss_mlp": 1.00061965, "epoch": 0.6824986472674803, "flos": 14137134024960.0, "grad_norm": 2.1182446672758246, "language_loss": 0.73610747, "learning_rate": 9.673506233487721e-07, "loss": 0.75467426, "num_input_tokens_seen": 122089705, "step": 5676, "time_per_iteration": 2.6819469928741455 }, { "auxiliary_loss_clip": 0.01143731, "auxiliary_loss_mlp": 0.00710804, "balance_loss_clip": 1.04932165, "balance_loss_mlp": 1.00048685, "epoch": 0.6826188901581194, "flos": 21504997624320.0, "grad_norm": 1.840324527446825, "language_loss": 0.8630923, "learning_rate": 9.666835958958717e-07, "loss": 0.88163769, "num_input_tokens_seen": 122109025, "step": 5677, "time_per_iteration": 2.6469311714172363 }, { "auxiliary_loss_clip": 0.01174201, "auxiliary_loss_mlp": 0.0102713, "balance_loss_clip": 1.05269027, "balance_loss_mlp": 1.02014422, "epoch": 0.6827391330487584, "flos": 20810126044800.0, "grad_norm": 2.2755950727220453, "language_loss": 0.80499214, "learning_rate": 9.660167251962484e-07, "loss": 0.82700551, "num_input_tokens_seen": 122127385, "step": 5678, "time_per_iteration": 3.5055110454559326 }, { "auxiliary_loss_clip": 0.01129303, "auxiliary_loss_mlp": 0.01025346, "balance_loss_clip": 1.04805589, "balance_loss_mlp": 1.01857448, "epoch": 0.6828593759393976, "flos": 21688788539520.0, "grad_norm": 2.44922042312642, "language_loss": 0.77922904, "learning_rate": 9.653500113510654e-07, "loss": 0.80077553, "num_input_tokens_seen": 122146500, "step": 5679, "time_per_iteration": 2.6801583766937256 }, { "auxiliary_loss_clip": 0.01138208, "auxiliary_loss_mlp": 0.01029648, "balance_loss_clip": 1.04749727, "balance_loss_mlp": 1.02197647, "epoch": 0.6829796188300367, "flos": 25337707557120.0, "grad_norm": 2.6023909806540715, "language_loss": 0.6762501, "learning_rate": 9.646834544614627e-07, "loss": 0.69792867, "num_input_tokens_seen": 122167000, "step": 5680, "time_per_iteration": 2.691377639770508 }, { "auxiliary_loss_clip": 0.01133288, "auxiliary_loss_mlp": 0.01026291, "balance_loss_clip": 1.04795647, "balance_loss_mlp": 1.01929307, "epoch": 0.6830998617206757, "flos": 20704800389760.0, "grad_norm": 2.361559413777868, "language_loss": 0.76411629, "learning_rate": 9.64017054628558e-07, "loss": 0.785712, "num_input_tokens_seen": 122185825, "step": 5681, "time_per_iteration": 2.838965654373169 }, { "auxiliary_loss_clip": 0.01112834, "auxiliary_loss_mlp": 0.01024476, "balance_loss_clip": 1.04535866, "balance_loss_mlp": 1.01691866, "epoch": 0.6832201046113149, "flos": 21726638496000.0, "grad_norm": 2.3119113092569648, "language_loss": 0.78968441, "learning_rate": 9.63350811953441e-07, "loss": 0.81105757, "num_input_tokens_seen": 122206200, "step": 5682, "time_per_iteration": 2.744474172592163 }, { "auxiliary_loss_clip": 0.01127321, "auxiliary_loss_mlp": 0.01027575, "balance_loss_clip": 1.04824066, "balance_loss_mlp": 1.02029097, "epoch": 0.6833403475019539, "flos": 19536554448000.0, "grad_norm": 3.2773151850345776, "language_loss": 0.70433927, "learning_rate": 9.626847265371826e-07, "loss": 0.72588819, "num_input_tokens_seen": 122225520, "step": 5683, "time_per_iteration": 2.645725727081299 }, { "auxiliary_loss_clip": 0.0113583, "auxiliary_loss_mlp": 0.01026553, "balance_loss_clip": 1.04756474, "balance_loss_mlp": 1.01954913, "epoch": 0.683460590392593, "flos": 19352153001600.0, "grad_norm": 2.169640659089284, "language_loss": 0.7880671, "learning_rate": 9.620187984808262e-07, "loss": 0.80969095, "num_input_tokens_seen": 122244320, "step": 5684, "time_per_iteration": 2.684417247772217 }, { "auxiliary_loss_clip": 0.01142543, "auxiliary_loss_mlp": 0.00710704, "balance_loss_clip": 1.05107188, "balance_loss_mlp": 1.00049269, "epoch": 0.6835808332832322, "flos": 23288500650240.0, "grad_norm": 2.0287863319267645, "language_loss": 0.86248636, "learning_rate": 9.613530278853919e-07, "loss": 0.88101882, "num_input_tokens_seen": 122264295, "step": 5685, "time_per_iteration": 2.639397144317627 }, { "auxiliary_loss_clip": 0.01156635, "auxiliary_loss_mlp": 0.01029823, "balance_loss_clip": 1.05001497, "balance_loss_mlp": 1.02234268, "epoch": 0.6837010761738712, "flos": 21653416621440.0, "grad_norm": 1.8663767728844747, "language_loss": 0.73595166, "learning_rate": 9.60687414851879e-07, "loss": 0.75781626, "num_input_tokens_seen": 122285300, "step": 5686, "time_per_iteration": 2.657850980758667 }, { "auxiliary_loss_clip": 0.01145979, "auxiliary_loss_mlp": 0.01026273, "balance_loss_clip": 1.0512172, "balance_loss_mlp": 1.01927876, "epoch": 0.6838213190645103, "flos": 17566387418880.0, "grad_norm": 2.2225129598001256, "language_loss": 0.77405429, "learning_rate": 9.600219594812575e-07, "loss": 0.79577684, "num_input_tokens_seen": 122303240, "step": 5687, "time_per_iteration": 2.5949387550354004 }, { "auxiliary_loss_clip": 0.01173386, "auxiliary_loss_mlp": 0.01027043, "balance_loss_clip": 1.05264509, "balance_loss_mlp": 1.02008414, "epoch": 0.6839415619551494, "flos": 23112538899840.0, "grad_norm": 1.6532161632933071, "language_loss": 0.72628224, "learning_rate": 9.593566618744786e-07, "loss": 0.74828649, "num_input_tokens_seen": 122323390, "step": 5688, "time_per_iteration": 2.630298376083374 }, { "auxiliary_loss_clip": 0.01171688, "auxiliary_loss_mlp": 0.01024968, "balance_loss_clip": 1.05057502, "balance_loss_mlp": 1.01814365, "epoch": 0.6840618048457885, "flos": 22127868391680.0, "grad_norm": 1.8897605373492787, "language_loss": 0.73933303, "learning_rate": 9.58691522132466e-07, "loss": 0.76129967, "num_input_tokens_seen": 122342200, "step": 5689, "time_per_iteration": 2.579930305480957 }, { "auxiliary_loss_clip": 0.01147227, "auxiliary_loss_mlp": 0.01027308, "balance_loss_clip": 1.0516665, "balance_loss_mlp": 1.0197978, "epoch": 0.6841820477364275, "flos": 22015898720640.0, "grad_norm": 3.3388931153432475, "language_loss": 0.84730709, "learning_rate": 9.58026540356123e-07, "loss": 0.86905241, "num_input_tokens_seen": 122360465, "step": 5690, "time_per_iteration": 2.6755661964416504 }, { "auxiliary_loss_clip": 0.01160964, "auxiliary_loss_mlp": 0.01024445, "balance_loss_clip": 1.05061054, "balance_loss_mlp": 1.01661849, "epoch": 0.6843022906270667, "flos": 24900531125760.0, "grad_norm": 1.8280142960361188, "language_loss": 0.86679041, "learning_rate": 9.573617166463246e-07, "loss": 0.88864446, "num_input_tokens_seen": 122381680, "step": 5691, "time_per_iteration": 2.633861541748047 }, { "auxiliary_loss_clip": 0.01143672, "auxiliary_loss_mlp": 0.01029516, "balance_loss_clip": 1.04841018, "balance_loss_mlp": 1.02213717, "epoch": 0.6844225335177058, "flos": 19969924037760.0, "grad_norm": 2.0029889990910745, "language_loss": 0.5993188, "learning_rate": 9.56697051103924e-07, "loss": 0.62105072, "num_input_tokens_seen": 122399120, "step": 5692, "time_per_iteration": 2.6646502017974854 }, { "auxiliary_loss_clip": 0.01138111, "auxiliary_loss_mlp": 0.01022143, "balance_loss_clip": 1.04659128, "balance_loss_mlp": 1.01524711, "epoch": 0.6845427764083448, "flos": 25883334126720.0, "grad_norm": 2.2434285989413745, "language_loss": 0.8073138, "learning_rate": 9.560325438297522e-07, "loss": 0.82891631, "num_input_tokens_seen": 122417430, "step": 5693, "time_per_iteration": 2.714459180831909 }, { "auxiliary_loss_clip": 0.01144622, "auxiliary_loss_mlp": 0.01026494, "balance_loss_clip": 1.05267692, "balance_loss_mlp": 1.01966643, "epoch": 0.684663019298984, "flos": 18880143356160.0, "grad_norm": 2.399984406991829, "language_loss": 0.86906701, "learning_rate": 9.553681949246127e-07, "loss": 0.89077818, "num_input_tokens_seen": 122435055, "step": 5694, "time_per_iteration": 2.6099538803100586 }, { "auxiliary_loss_clip": 0.01133356, "auxiliary_loss_mlp": 0.01031439, "balance_loss_clip": 1.04980767, "balance_loss_mlp": 1.0231843, "epoch": 0.684783262189623, "flos": 54193725302400.0, "grad_norm": 2.291693103723008, "language_loss": 0.75360441, "learning_rate": 9.547040044892886e-07, "loss": 0.77525234, "num_input_tokens_seen": 122462570, "step": 5695, "time_per_iteration": 2.941685676574707 }, { "auxiliary_loss_clip": 0.01077494, "auxiliary_loss_mlp": 0.01002293, "balance_loss_clip": 1.03216243, "balance_loss_mlp": 1.00114274, "epoch": 0.6849035050802621, "flos": 63970264143360.0, "grad_norm": 0.8598971476993212, "language_loss": 0.60157108, "learning_rate": 9.540399726245354e-07, "loss": 0.62236893, "num_input_tokens_seen": 122519275, "step": 5696, "time_per_iteration": 3.0086071491241455 }, { "auxiliary_loss_clip": 0.01139337, "auxiliary_loss_mlp": 0.01023104, "balance_loss_clip": 1.04855728, "balance_loss_mlp": 1.01541531, "epoch": 0.6850237479709013, "flos": 25224121774080.0, "grad_norm": 4.4392842997221855, "language_loss": 0.68893981, "learning_rate": 9.533760994310859e-07, "loss": 0.7105642, "num_input_tokens_seen": 122539675, "step": 5697, "time_per_iteration": 2.590073347091675 }, { "auxiliary_loss_clip": 0.01174244, "auxiliary_loss_mlp": 0.01030419, "balance_loss_clip": 1.05204344, "balance_loss_mlp": 1.02285504, "epoch": 0.6851439908615403, "flos": 19354128249600.0, "grad_norm": 2.2166350921319715, "language_loss": 0.74942267, "learning_rate": 9.527123850096508e-07, "loss": 0.7714693, "num_input_tokens_seen": 122558035, "step": 5698, "time_per_iteration": 2.4429218769073486 }, { "auxiliary_loss_clip": 0.01163615, "auxiliary_loss_mlp": 0.01024227, "balance_loss_clip": 1.05254221, "balance_loss_mlp": 1.01657367, "epoch": 0.6852642337521794, "flos": 23182133500800.0, "grad_norm": 3.244551597835367, "language_loss": 0.71812463, "learning_rate": 9.520488294609142e-07, "loss": 0.74000299, "num_input_tokens_seen": 122576815, "step": 5699, "time_per_iteration": 2.5234909057617188 }, { "auxiliary_loss_clip": 0.01040313, "auxiliary_loss_mlp": 0.01003127, "balance_loss_clip": 1.03357124, "balance_loss_mlp": 1.0018996, "epoch": 0.6853844766428185, "flos": 62647206583680.0, "grad_norm": 0.7383279553535225, "language_loss": 0.53781056, "learning_rate": 9.513854328855368e-07, "loss": 0.55824494, "num_input_tokens_seen": 122634690, "step": 5700, "time_per_iteration": 4.131189346313477 }, { "auxiliary_loss_clip": 0.01172992, "auxiliary_loss_mlp": 0.01019956, "balance_loss_clip": 1.05130637, "balance_loss_mlp": 1.01300645, "epoch": 0.6855047195334576, "flos": 23437242869760.0, "grad_norm": 2.937829879909089, "language_loss": 0.81292611, "learning_rate": 9.507221953841558e-07, "loss": 0.83485562, "num_input_tokens_seen": 122652320, "step": 5701, "time_per_iteration": 4.279926300048828 }, { "auxiliary_loss_clip": 0.01161999, "auxiliary_loss_mlp": 0.01027239, "balance_loss_clip": 1.0538497, "balance_loss_mlp": 1.01978803, "epoch": 0.6856249624240967, "flos": 20664831530880.0, "grad_norm": 1.8934843095408938, "language_loss": 0.77873743, "learning_rate": 9.500591170573824e-07, "loss": 0.80062979, "num_input_tokens_seen": 122672340, "step": 5702, "time_per_iteration": 2.6033036708831787 }, { "auxiliary_loss_clip": 0.01107519, "auxiliary_loss_mlp": 0.01024402, "balance_loss_clip": 1.04519081, "balance_loss_mlp": 1.01640964, "epoch": 0.6857452053147358, "flos": 17087302794240.0, "grad_norm": 2.5776242407053345, "language_loss": 0.74407279, "learning_rate": 9.493961980058078e-07, "loss": 0.76539195, "num_input_tokens_seen": 122689935, "step": 5703, "time_per_iteration": 2.659811496734619 }, { "auxiliary_loss_clip": 0.01080799, "auxiliary_loss_mlp": 0.01024093, "balance_loss_clip": 1.04338217, "balance_loss_mlp": 1.01739407, "epoch": 0.6858654482053749, "flos": 30847266057600.0, "grad_norm": 3.679135679552407, "language_loss": 0.67709786, "learning_rate": 9.48733438329993e-07, "loss": 0.69814682, "num_input_tokens_seen": 122710200, "step": 5704, "time_per_iteration": 3.6999738216400146 }, { "auxiliary_loss_clip": 0.01171997, "auxiliary_loss_mlp": 0.0071099, "balance_loss_clip": 1.05203533, "balance_loss_mlp": 1.00058746, "epoch": 0.6859856910960139, "flos": 28877314510080.0, "grad_norm": 1.8603630164034555, "language_loss": 0.74678755, "learning_rate": 9.480708381304807e-07, "loss": 0.76561749, "num_input_tokens_seen": 122731495, "step": 5705, "time_per_iteration": 2.672832489013672 }, { "auxiliary_loss_clip": 0.01108898, "auxiliary_loss_mlp": 0.0102562, "balance_loss_clip": 1.04829681, "balance_loss_mlp": 1.01824141, "epoch": 0.6861059339866531, "flos": 19354523299200.0, "grad_norm": 2.361952120226, "language_loss": 0.83552271, "learning_rate": 9.474083975077858e-07, "loss": 0.85686791, "num_input_tokens_seen": 122748620, "step": 5706, "time_per_iteration": 2.6592798233032227 }, { "auxiliary_loss_clip": 0.01151222, "auxiliary_loss_mlp": 0.01026164, "balance_loss_clip": 1.04868734, "balance_loss_mlp": 1.0192318, "epoch": 0.6862261768772921, "flos": 22199976944640.0, "grad_norm": 3.176191383213326, "language_loss": 0.80341816, "learning_rate": 9.467461165623994e-07, "loss": 0.82519203, "num_input_tokens_seen": 122767670, "step": 5707, "time_per_iteration": 2.7728965282440186 }, { "auxiliary_loss_clip": 0.01160673, "auxiliary_loss_mlp": 0.01023521, "balance_loss_clip": 1.04962158, "balance_loss_mlp": 1.01644862, "epoch": 0.6863464197679312, "flos": 26285677344000.0, "grad_norm": 2.0586369919531355, "language_loss": 0.80041319, "learning_rate": 9.46083995394791e-07, "loss": 0.82225513, "num_input_tokens_seen": 122785480, "step": 5708, "time_per_iteration": 2.6239559650421143 }, { "auxiliary_loss_clip": 0.01160227, "auxiliary_loss_mlp": 0.00710832, "balance_loss_clip": 1.05187702, "balance_loss_mlp": 1.00062203, "epoch": 0.6864666626585703, "flos": 37815228564480.0, "grad_norm": 3.339644280882412, "language_loss": 0.63525248, "learning_rate": 9.454220341054012e-07, "loss": 0.65396309, "num_input_tokens_seen": 122810265, "step": 5709, "time_per_iteration": 2.7669830322265625 }, { "auxiliary_loss_clip": 0.01125491, "auxiliary_loss_mlp": 0.01024373, "balance_loss_clip": 1.047791, "balance_loss_mlp": 1.01677358, "epoch": 0.6865869055492094, "flos": 19391152193280.0, "grad_norm": 2.182225120558684, "language_loss": 0.80826437, "learning_rate": 9.447602327946512e-07, "loss": 0.82976305, "num_input_tokens_seen": 122828905, "step": 5710, "time_per_iteration": 2.7369444370269775 }, { "auxiliary_loss_clip": 0.01139339, "auxiliary_loss_mlp": 0.01027203, "balance_loss_clip": 1.04684293, "balance_loss_mlp": 1.01943684, "epoch": 0.6867071484398485, "flos": 20375966355840.0, "grad_norm": 2.1179925843887, "language_loss": 0.76662421, "learning_rate": 9.440985915629338e-07, "loss": 0.78828967, "num_input_tokens_seen": 122846235, "step": 5711, "time_per_iteration": 2.615445375442505 }, { "auxiliary_loss_clip": 0.01176738, "auxiliary_loss_mlp": 0.01027025, "balance_loss_clip": 1.05602431, "balance_loss_mlp": 1.01992023, "epoch": 0.6868273913304875, "flos": 15889143801600.0, "grad_norm": 3.765834036828379, "language_loss": 0.73219132, "learning_rate": 9.434371105106223e-07, "loss": 0.75422895, "num_input_tokens_seen": 122863835, "step": 5712, "time_per_iteration": 2.5487396717071533 }, { "auxiliary_loss_clip": 0.01124956, "auxiliary_loss_mlp": 0.01028155, "balance_loss_clip": 1.04741764, "balance_loss_mlp": 1.0203464, "epoch": 0.6869476342211267, "flos": 24462492768000.0, "grad_norm": 2.4857546610526153, "language_loss": 0.70710737, "learning_rate": 9.427757897380602e-07, "loss": 0.72863847, "num_input_tokens_seen": 122883235, "step": 5713, "time_per_iteration": 2.669158458709717 }, { "auxiliary_loss_clip": 0.01122288, "auxiliary_loss_mlp": 0.01026358, "balance_loss_clip": 1.0482173, "balance_loss_mlp": 1.01916432, "epoch": 0.6870678771117658, "flos": 18442571875200.0, "grad_norm": 14.65100732352208, "language_loss": 0.85079223, "learning_rate": 9.421146293455695e-07, "loss": 0.87227869, "num_input_tokens_seen": 122898975, "step": 5714, "time_per_iteration": 2.6862218379974365 }, { "auxiliary_loss_clip": 0.01140298, "auxiliary_loss_mlp": 0.01027776, "balance_loss_clip": 1.04904318, "balance_loss_mlp": 1.02043319, "epoch": 0.6871881200024048, "flos": 22200371994240.0, "grad_norm": 2.0867503524297866, "language_loss": 0.68434244, "learning_rate": 9.414536294334489e-07, "loss": 0.70602322, "num_input_tokens_seen": 122918995, "step": 5715, "time_per_iteration": 2.7708845138549805 }, { "auxiliary_loss_clip": 0.01142324, "auxiliary_loss_mlp": 0.01024983, "balance_loss_clip": 1.04642773, "balance_loss_mlp": 1.01767588, "epoch": 0.687308362893044, "flos": 22127724737280.0, "grad_norm": 1.9336184881431142, "language_loss": 0.69966698, "learning_rate": 9.407927901019708e-07, "loss": 0.72134, "num_input_tokens_seen": 122938125, "step": 5716, "time_per_iteration": 2.7645697593688965 }, { "auxiliary_loss_clip": 0.0115969, "auxiliary_loss_mlp": 0.01022729, "balance_loss_clip": 1.05111969, "balance_loss_mlp": 1.01610374, "epoch": 0.687428605783683, "flos": 25040546340480.0, "grad_norm": 2.0520543103426383, "language_loss": 0.76725417, "learning_rate": 9.401321114513854e-07, "loss": 0.78907835, "num_input_tokens_seen": 122957020, "step": 5717, "time_per_iteration": 2.73408579826355 }, { "auxiliary_loss_clip": 0.01176648, "auxiliary_loss_mlp": 0.01026881, "balance_loss_clip": 1.054322, "balance_loss_mlp": 1.01918006, "epoch": 0.6875488486743221, "flos": 23770063313280.0, "grad_norm": 2.1434438226135124, "language_loss": 0.75468439, "learning_rate": 9.394715935819155e-07, "loss": 0.77671969, "num_input_tokens_seen": 122977410, "step": 5718, "time_per_iteration": 2.750576972961426 }, { "auxiliary_loss_clip": 0.01162016, "auxiliary_loss_mlp": 0.01025244, "balance_loss_clip": 1.05058253, "balance_loss_mlp": 1.01789451, "epoch": 0.6876690915649613, "flos": 25516937445120.0, "grad_norm": 4.178427364665322, "language_loss": 0.6277051, "learning_rate": 9.388112365937608e-07, "loss": 0.64957762, "num_input_tokens_seen": 122996875, "step": 5719, "time_per_iteration": 2.651135206222534 }, { "auxiliary_loss_clip": 0.01127725, "auxiliary_loss_mlp": 0.01025638, "balance_loss_clip": 1.04907846, "balance_loss_mlp": 1.01798511, "epoch": 0.6877893344556003, "flos": 19427996568960.0, "grad_norm": 2.9277980881991246, "language_loss": 0.82704443, "learning_rate": 9.381510405870985e-07, "loss": 0.8485781, "num_input_tokens_seen": 123015890, "step": 5720, "time_per_iteration": 2.743074893951416 }, { "auxiliary_loss_clip": 0.0115458, "auxiliary_loss_mlp": 0.01028812, "balance_loss_clip": 1.04883885, "balance_loss_mlp": 1.02133226, "epoch": 0.6879095773462394, "flos": 18661303745280.0, "grad_norm": 2.178062490461682, "language_loss": 0.77458632, "learning_rate": 9.374910056620791e-07, "loss": 0.79642022, "num_input_tokens_seen": 123034955, "step": 5721, "time_per_iteration": 2.691119432449341 }, { "auxiliary_loss_clip": 0.01159965, "auxiliary_loss_mlp": 0.01030815, "balance_loss_clip": 1.05334044, "balance_loss_mlp": 1.02303052, "epoch": 0.6880298202368785, "flos": 20883132437760.0, "grad_norm": 1.638629280730866, "language_loss": 0.810839, "learning_rate": 9.368311319188293e-07, "loss": 0.83274686, "num_input_tokens_seen": 123052770, "step": 5722, "time_per_iteration": 2.6575069427490234 }, { "auxiliary_loss_clip": 0.01126002, "auxiliary_loss_mlp": 0.01028318, "balance_loss_clip": 1.04755437, "balance_loss_mlp": 1.02049792, "epoch": 0.6881500631275176, "flos": 30153292318080.0, "grad_norm": 1.8287766079718297, "language_loss": 0.79572177, "learning_rate": 9.361714194574515e-07, "loss": 0.81726491, "num_input_tokens_seen": 123075105, "step": 5723, "time_per_iteration": 2.7431654930114746 }, { "auxiliary_loss_clip": 0.01088311, "auxiliary_loss_mlp": 0.01001558, "balance_loss_clip": 1.03274679, "balance_loss_mlp": 1.00046766, "epoch": 0.6882703060181566, "flos": 66181537215360.0, "grad_norm": 0.7361705956700398, "language_loss": 0.5826149, "learning_rate": 9.355118683780228e-07, "loss": 0.6035136, "num_input_tokens_seen": 123145175, "step": 5724, "time_per_iteration": 3.2726285457611084 }, { "auxiliary_loss_clip": 0.01172839, "auxiliary_loss_mlp": 0.0102319, "balance_loss_clip": 1.05093336, "balance_loss_mlp": 1.01586771, "epoch": 0.6883905489087958, "flos": 18214646123520.0, "grad_norm": 2.401358968838573, "language_loss": 0.79599953, "learning_rate": 9.348524787805987e-07, "loss": 0.81795979, "num_input_tokens_seen": 123160365, "step": 5725, "time_per_iteration": 2.5651097297668457 }, { "auxiliary_loss_clip": 0.01126858, "auxiliary_loss_mlp": 0.0102412, "balance_loss_clip": 1.04477108, "balance_loss_mlp": 1.01693118, "epoch": 0.6885107917994349, "flos": 14056262553600.0, "grad_norm": 2.944732330991927, "language_loss": 0.85554242, "learning_rate": 9.341932507652053e-07, "loss": 0.87705225, "num_input_tokens_seen": 123174855, "step": 5726, "time_per_iteration": 4.495525121688843 }, { "auxiliary_loss_clip": 0.01139393, "auxiliary_loss_mlp": 0.01026947, "balance_loss_clip": 1.04518569, "balance_loss_mlp": 1.0191865, "epoch": 0.6886310346900739, "flos": 28690722334080.0, "grad_norm": 1.962578981675814, "language_loss": 0.78805089, "learning_rate": 9.335341844318489e-07, "loss": 0.80971432, "num_input_tokens_seen": 123194995, "step": 5727, "time_per_iteration": 3.5723297595977783 }, { "auxiliary_loss_clip": 0.01142885, "auxiliary_loss_mlp": 0.01025349, "balance_loss_clip": 1.0505724, "balance_loss_mlp": 1.01751113, "epoch": 0.6887512775807131, "flos": 24535319592960.0, "grad_norm": 1.901474043933206, "language_loss": 0.73400635, "learning_rate": 9.328752798805091e-07, "loss": 0.75568873, "num_input_tokens_seen": 123213465, "step": 5728, "time_per_iteration": 2.6525728702545166 }, { "auxiliary_loss_clip": 0.0116184, "auxiliary_loss_mlp": 0.01031575, "balance_loss_clip": 1.05517125, "balance_loss_mlp": 1.02408862, "epoch": 0.6888715204713521, "flos": 22414363269120.0, "grad_norm": 2.8041806755329417, "language_loss": 0.76183385, "learning_rate": 9.322165372111399e-07, "loss": 0.78376794, "num_input_tokens_seen": 123231610, "step": 5729, "time_per_iteration": 2.5968081951141357 }, { "auxiliary_loss_clip": 0.01124248, "auxiliary_loss_mlp": 0.01028533, "balance_loss_clip": 1.04909229, "balance_loss_mlp": 1.02096939, "epoch": 0.6889917633619912, "flos": 22054323294720.0, "grad_norm": 2.986129844510775, "language_loss": 0.76137006, "learning_rate": 9.315579565236747e-07, "loss": 0.78289783, "num_input_tokens_seen": 123250715, "step": 5730, "time_per_iteration": 3.6205437183380127 }, { "auxiliary_loss_clip": 0.01139448, "auxiliary_loss_mlp": 0.01027398, "balance_loss_clip": 1.05069244, "balance_loss_mlp": 1.02025485, "epoch": 0.6891120062526304, "flos": 23949724164480.0, "grad_norm": 1.9622294264583255, "language_loss": 0.74284291, "learning_rate": 9.308995379180162e-07, "loss": 0.76451135, "num_input_tokens_seen": 123270270, "step": 5731, "time_per_iteration": 2.6641955375671387 }, { "auxiliary_loss_clip": 0.01078297, "auxiliary_loss_mlp": 0.01002425, "balance_loss_clip": 1.03370571, "balance_loss_mlp": 1.00143003, "epoch": 0.6892322491432694, "flos": 64117354337280.0, "grad_norm": 0.7381369098245268, "language_loss": 0.59497356, "learning_rate": 9.302412814940488e-07, "loss": 0.61578083, "num_input_tokens_seen": 123333045, "step": 5732, "time_per_iteration": 3.248504161834717 }, { "auxiliary_loss_clip": 0.01139331, "auxiliary_loss_mlp": 0.01023473, "balance_loss_clip": 1.04658997, "balance_loss_mlp": 1.01546192, "epoch": 0.6893524920339085, "flos": 23002436736000.0, "grad_norm": 2.8625508609815036, "language_loss": 0.7137872, "learning_rate": 9.295831873516276e-07, "loss": 0.73541528, "num_input_tokens_seen": 123352320, "step": 5733, "time_per_iteration": 2.696650505065918 }, { "auxiliary_loss_clip": 0.01175512, "auxiliary_loss_mlp": 0.01024772, "balance_loss_clip": 1.05448139, "balance_loss_mlp": 1.01786983, "epoch": 0.6894727349245476, "flos": 21396260177280.0, "grad_norm": 3.538995462529229, "language_loss": 0.76185119, "learning_rate": 9.289252555905873e-07, "loss": 0.78385401, "num_input_tokens_seen": 123372400, "step": 5734, "time_per_iteration": 2.578876256942749 }, { "auxiliary_loss_clip": 0.01160576, "auxiliary_loss_mlp": 0.0102644, "balance_loss_clip": 1.05445623, "balance_loss_mlp": 1.01946366, "epoch": 0.6895929778151867, "flos": 19865316654720.0, "grad_norm": 3.1631143621365054, "language_loss": 0.75978637, "learning_rate": 9.282674863107334e-07, "loss": 0.7816565, "num_input_tokens_seen": 123390215, "step": 5735, "time_per_iteration": 2.6107616424560547 }, { "auxiliary_loss_clip": 0.01156429, "auxiliary_loss_mlp": 0.01028227, "balance_loss_clip": 1.05195284, "balance_loss_mlp": 1.02113402, "epoch": 0.6897132207058257, "flos": 18179166464640.0, "grad_norm": 2.5622038898129937, "language_loss": 0.75759578, "learning_rate": 9.276098796118488e-07, "loss": 0.77944231, "num_input_tokens_seen": 123406700, "step": 5736, "time_per_iteration": 2.5595054626464844 }, { "auxiliary_loss_clip": 0.01141444, "auxiliary_loss_mlp": 0.01029109, "balance_loss_clip": 1.05032992, "balance_loss_mlp": 1.02197099, "epoch": 0.6898334635964649, "flos": 32561641359360.0, "grad_norm": 1.9605990605930557, "language_loss": 0.66189581, "learning_rate": 9.269524355936938e-07, "loss": 0.68360138, "num_input_tokens_seen": 123429880, "step": 5737, "time_per_iteration": 2.764648675918579 }, { "auxiliary_loss_clip": 0.01137829, "auxiliary_loss_mlp": 0.01024037, "balance_loss_clip": 1.0486877, "balance_loss_mlp": 1.01695573, "epoch": 0.689953706487104, "flos": 22819004956800.0, "grad_norm": 1.768230209569965, "language_loss": 0.84875059, "learning_rate": 9.262951543560002e-07, "loss": 0.87036926, "num_input_tokens_seen": 123449105, "step": 5738, "time_per_iteration": 2.6811318397521973 }, { "auxiliary_loss_clip": 0.01141856, "auxiliary_loss_mlp": 0.01031519, "balance_loss_clip": 1.04995799, "balance_loss_mlp": 1.02341902, "epoch": 0.690073949377743, "flos": 18515362786560.0, "grad_norm": 2.382944766344743, "language_loss": 0.86132002, "learning_rate": 9.256380359984795e-07, "loss": 0.88305384, "num_input_tokens_seen": 123466215, "step": 5739, "time_per_iteration": 2.6046106815338135 }, { "auxiliary_loss_clip": 0.01116532, "auxiliary_loss_mlp": 0.01021895, "balance_loss_clip": 1.04319382, "balance_loss_mlp": 1.01436079, "epoch": 0.6901941922683821, "flos": 34857194716800.0, "grad_norm": 1.8797335820798466, "language_loss": 0.74813569, "learning_rate": 9.249810806208139e-07, "loss": 0.76951998, "num_input_tokens_seen": 123485480, "step": 5740, "time_per_iteration": 2.8247032165527344 }, { "auxiliary_loss_clip": 0.01105935, "auxiliary_loss_mlp": 0.00711248, "balance_loss_clip": 1.04210567, "balance_loss_mlp": 1.00061131, "epoch": 0.6903144351590212, "flos": 16253672976000.0, "grad_norm": 3.3926792164126462, "language_loss": 0.80808288, "learning_rate": 9.243242883226627e-07, "loss": 0.82625473, "num_input_tokens_seen": 123504575, "step": 5741, "time_per_iteration": 2.657499074935913 }, { "auxiliary_loss_clip": 0.01159118, "auxiliary_loss_mlp": 0.01025058, "balance_loss_clip": 1.04737711, "balance_loss_mlp": 1.01756573, "epoch": 0.6904346780496603, "flos": 28035137255040.0, "grad_norm": 1.7846247870362133, "language_loss": 0.69476354, "learning_rate": 9.236676592036628e-07, "loss": 0.71660531, "num_input_tokens_seen": 123524250, "step": 5742, "time_per_iteration": 2.6619739532470703 }, { "auxiliary_loss_clip": 0.01139177, "auxiliary_loss_mlp": 0.01026138, "balance_loss_clip": 1.05185294, "balance_loss_mlp": 1.01921797, "epoch": 0.6905549209402994, "flos": 23624266008960.0, "grad_norm": 1.9996678624988289, "language_loss": 0.73485756, "learning_rate": 9.230111933634228e-07, "loss": 0.75651073, "num_input_tokens_seen": 123545845, "step": 5743, "time_per_iteration": 2.6933867931365967 }, { "auxiliary_loss_clip": 0.01164217, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 1.05453086, "balance_loss_mlp": 1.02345109, "epoch": 0.6906751638309385, "flos": 23114945111040.0, "grad_norm": 1.5509094168368676, "language_loss": 0.80956507, "learning_rate": 9.223548909015288e-07, "loss": 0.8315134, "num_input_tokens_seen": 123567535, "step": 5744, "time_per_iteration": 2.646538496017456 }, { "auxiliary_loss_clip": 0.0110353, "auxiliary_loss_mlp": 0.01022701, "balance_loss_clip": 1.04571009, "balance_loss_mlp": 1.01603746, "epoch": 0.6907954067215776, "flos": 27305468375040.0, "grad_norm": 2.0123197755787157, "language_loss": 0.72228706, "learning_rate": 9.216987519175407e-07, "loss": 0.74354935, "num_input_tokens_seen": 123587710, "step": 5745, "time_per_iteration": 2.7580296993255615 }, { "auxiliary_loss_clip": 0.01155185, "auxiliary_loss_mlp": 0.01026678, "balance_loss_clip": 1.0520668, "balance_loss_mlp": 1.01987088, "epoch": 0.6909156496122166, "flos": 21689399070720.0, "grad_norm": 2.8065761620538927, "language_loss": 0.68787098, "learning_rate": 9.210427765109942e-07, "loss": 0.70968968, "num_input_tokens_seen": 123607385, "step": 5746, "time_per_iteration": 2.641158103942871 }, { "auxiliary_loss_clip": 0.01145201, "auxiliary_loss_mlp": 0.01033213, "balance_loss_clip": 1.04886007, "balance_loss_mlp": 1.02555418, "epoch": 0.6910358925028558, "flos": 22561453463040.0, "grad_norm": 3.1515593322110464, "language_loss": 0.81213832, "learning_rate": 9.20386964781402e-07, "loss": 0.83392245, "num_input_tokens_seen": 123625405, "step": 5747, "time_per_iteration": 2.6398770809173584 }, { "auxiliary_loss_clip": 0.01139547, "auxiliary_loss_mlp": 0.01026708, "balance_loss_clip": 1.04827428, "balance_loss_mlp": 1.01976466, "epoch": 0.6911561353934949, "flos": 22054107813120.0, "grad_norm": 2.007336184526138, "language_loss": 0.84367388, "learning_rate": 9.197313168282472e-07, "loss": 0.86533648, "num_input_tokens_seen": 123642850, "step": 5748, "time_per_iteration": 2.7124011516571045 }, { "auxiliary_loss_clip": 0.01151061, "auxiliary_loss_mlp": 0.01031662, "balance_loss_clip": 1.04696858, "balance_loss_mlp": 1.02382112, "epoch": 0.6912763782841339, "flos": 24206557386240.0, "grad_norm": 2.379589020262004, "language_loss": 0.72335124, "learning_rate": 9.190758327509935e-07, "loss": 0.74517852, "num_input_tokens_seen": 123661595, "step": 5749, "time_per_iteration": 2.662742853164673 }, { "auxiliary_loss_clip": 0.01040084, "auxiliary_loss_mlp": 0.00701902, "balance_loss_clip": 1.02967072, "balance_loss_mlp": 1.00012743, "epoch": 0.6913966211747731, "flos": 52329641091840.0, "grad_norm": 1.4082267904630579, "language_loss": 0.64449203, "learning_rate": 9.184205126490767e-07, "loss": 0.66191185, "num_input_tokens_seen": 123710490, "step": 5750, "time_per_iteration": 3.1288833618164062 }, { "auxiliary_loss_clip": 0.01054064, "auxiliary_loss_mlp": 0.00701732, "balance_loss_clip": 1.03368652, "balance_loss_mlp": 1.0001719, "epoch": 0.6915168640654121, "flos": 66741274851840.0, "grad_norm": 1.132840287539613, "language_loss": 0.59643197, "learning_rate": 9.177653566219075e-07, "loss": 0.61398995, "num_input_tokens_seen": 123765215, "step": 5751, "time_per_iteration": 4.078438758850098 }, { "auxiliary_loss_clip": 0.01130497, "auxiliary_loss_mlp": 0.01028486, "balance_loss_clip": 1.04729378, "balance_loss_mlp": 1.02166748, "epoch": 0.6916371069560512, "flos": 18296523175680.0, "grad_norm": 3.1192820034423736, "language_loss": 0.76498258, "learning_rate": 9.171103647688744e-07, "loss": 0.7865724, "num_input_tokens_seen": 123783955, "step": 5752, "time_per_iteration": 2.63106632232666 }, { "auxiliary_loss_clip": 0.01073217, "auxiliary_loss_mlp": 0.01028729, "balance_loss_clip": 1.0445056, "balance_loss_mlp": 1.02181447, "epoch": 0.6917573498466904, "flos": 19645794685440.0, "grad_norm": 2.026863994624511, "language_loss": 0.68959022, "learning_rate": 9.164555371893367e-07, "loss": 0.71060967, "num_input_tokens_seen": 123803885, "step": 5753, "time_per_iteration": 4.58482551574707 }, { "auxiliary_loss_clip": 0.01159195, "auxiliary_loss_mlp": 0.0071165, "balance_loss_clip": 1.05215538, "balance_loss_mlp": 1.0006597, "epoch": 0.6918775927373294, "flos": 14210319985920.0, "grad_norm": 3.3349672096338385, "language_loss": 0.75658989, "learning_rate": 9.158008739826333e-07, "loss": 0.77529836, "num_input_tokens_seen": 123821485, "step": 5754, "time_per_iteration": 2.6697778701782227 }, { "auxiliary_loss_clip": 0.01137607, "auxiliary_loss_mlp": 0.01027671, "balance_loss_clip": 1.04745662, "balance_loss_mlp": 1.02013671, "epoch": 0.6919978356279685, "flos": 23985455218560.0, "grad_norm": 27.81574264537178, "language_loss": 0.86432141, "learning_rate": 9.151463752480744e-07, "loss": 0.88597417, "num_input_tokens_seen": 123840215, "step": 5755, "time_per_iteration": 3.5503716468811035 }, { "auxiliary_loss_clip": 0.01117986, "auxiliary_loss_mlp": 0.0102501, "balance_loss_clip": 1.04683065, "balance_loss_mlp": 1.01791692, "epoch": 0.6921180785186076, "flos": 23622937205760.0, "grad_norm": 1.5089822552768433, "language_loss": 0.80146909, "learning_rate": 9.144920410849493e-07, "loss": 0.8228991, "num_input_tokens_seen": 123861450, "step": 5756, "time_per_iteration": 2.705604076385498 }, { "auxiliary_loss_clip": 0.01146355, "auxiliary_loss_mlp": 0.01022977, "balance_loss_clip": 1.04983616, "balance_loss_mlp": 1.01596749, "epoch": 0.6922383214092467, "flos": 21142623265920.0, "grad_norm": 2.4939348471180955, "language_loss": 0.80688316, "learning_rate": 9.138378715925176e-07, "loss": 0.82857645, "num_input_tokens_seen": 123880545, "step": 5757, "time_per_iteration": 2.6671500205993652 }, { "auxiliary_loss_clip": 0.01134433, "auxiliary_loss_mlp": 0.0102985, "balance_loss_clip": 1.04835224, "balance_loss_mlp": 1.02231562, "epoch": 0.6923585642998857, "flos": 21470667200640.0, "grad_norm": 6.026135754370563, "language_loss": 0.81299472, "learning_rate": 9.131838668700167e-07, "loss": 0.83463752, "num_input_tokens_seen": 123900615, "step": 5758, "time_per_iteration": 2.6297109127044678 }, { "auxiliary_loss_clip": 0.01126003, "auxiliary_loss_mlp": 0.01025257, "balance_loss_clip": 1.04684329, "balance_loss_mlp": 1.01785374, "epoch": 0.6924788071905249, "flos": 21105204272640.0, "grad_norm": 1.985985137689765, "language_loss": 0.86542201, "learning_rate": 9.125300270166598e-07, "loss": 0.88693464, "num_input_tokens_seen": 123921220, "step": 5759, "time_per_iteration": 2.7455313205718994 }, { "auxiliary_loss_clip": 0.01132523, "auxiliary_loss_mlp": 0.01022114, "balance_loss_clip": 1.04580283, "balance_loss_mlp": 1.01511621, "epoch": 0.692599050081164, "flos": 26250018117120.0, "grad_norm": 1.9928798249883206, "language_loss": 0.85680866, "learning_rate": 9.118763521316324e-07, "loss": 0.87835503, "num_input_tokens_seen": 123941795, "step": 5760, "time_per_iteration": 2.7102179527282715 }, { "auxiliary_loss_clip": 0.01173566, "auxiliary_loss_mlp": 0.00711765, "balance_loss_clip": 1.051633, "balance_loss_mlp": 1.00062633, "epoch": 0.692719292971803, "flos": 20885215426560.0, "grad_norm": 1.9796063066152796, "language_loss": 0.76162094, "learning_rate": 9.112228423140987e-07, "loss": 0.78047419, "num_input_tokens_seen": 123960715, "step": 5761, "time_per_iteration": 2.6454248428344727 }, { "auxiliary_loss_clip": 0.01144999, "auxiliary_loss_mlp": 0.0103393, "balance_loss_clip": 1.04959476, "balance_loss_mlp": 1.02608657, "epoch": 0.6928395358624422, "flos": 25921938268800.0, "grad_norm": 2.685900321452954, "language_loss": 0.86815548, "learning_rate": 9.105694976631932e-07, "loss": 0.88994473, "num_input_tokens_seen": 123978625, "step": 5762, "time_per_iteration": 2.7046329975128174 }, { "auxiliary_loss_clip": 0.01160171, "auxiliary_loss_mlp": 0.01028487, "balance_loss_clip": 1.0548234, "balance_loss_mlp": 1.02164793, "epoch": 0.6929597787530812, "flos": 23586559706880.0, "grad_norm": 2.2852255758251365, "language_loss": 0.72604007, "learning_rate": 9.099163182780283e-07, "loss": 0.74792659, "num_input_tokens_seen": 123996780, "step": 5763, "time_per_iteration": 2.763115882873535 }, { "auxiliary_loss_clip": 0.01140161, "auxiliary_loss_mlp": 0.01028202, "balance_loss_clip": 1.05110717, "balance_loss_mlp": 1.02072704, "epoch": 0.6930800216437203, "flos": 18255656476800.0, "grad_norm": 2.852618893618745, "language_loss": 0.49788362, "learning_rate": 9.092633042576916e-07, "loss": 0.51956725, "num_input_tokens_seen": 124014045, "step": 5764, "time_per_iteration": 2.6331896781921387 }, { "auxiliary_loss_clip": 0.01139261, "auxiliary_loss_mlp": 0.01023316, "balance_loss_clip": 1.05075812, "balance_loss_mlp": 1.01636589, "epoch": 0.6932002645343595, "flos": 29168621809920.0, "grad_norm": 2.0701943709906763, "language_loss": 0.56389868, "learning_rate": 9.086104557012446e-07, "loss": 0.58552444, "num_input_tokens_seen": 124034615, "step": 5765, "time_per_iteration": 2.708383798599243 }, { "auxiliary_loss_clip": 0.01149042, "auxiliary_loss_mlp": 0.01026222, "balance_loss_clip": 1.04931879, "balance_loss_mlp": 1.01941514, "epoch": 0.6933205074249985, "flos": 23842746483840.0, "grad_norm": 2.258344319067112, "language_loss": 0.65851635, "learning_rate": 9.079577727077239e-07, "loss": 0.680269, "num_input_tokens_seen": 124053445, "step": 5766, "time_per_iteration": 2.6173043251037598 }, { "auxiliary_loss_clip": 0.01161258, "auxiliary_loss_mlp": 0.01027162, "balance_loss_clip": 1.05317354, "balance_loss_mlp": 1.01957393, "epoch": 0.6934407503156376, "flos": 24166696268160.0, "grad_norm": 2.746785400005797, "language_loss": 0.71771568, "learning_rate": 9.073052553761404e-07, "loss": 0.73959994, "num_input_tokens_seen": 124072810, "step": 5767, "time_per_iteration": 2.615138292312622 }, { "auxiliary_loss_clip": 0.01111058, "auxiliary_loss_mlp": 0.01028672, "balance_loss_clip": 1.04678023, "balance_loss_mlp": 1.02063107, "epoch": 0.6935609932062767, "flos": 20631327120000.0, "grad_norm": 1.8883851442317028, "language_loss": 0.78165442, "learning_rate": 9.066529038054805e-07, "loss": 0.80305171, "num_input_tokens_seen": 124092875, "step": 5768, "time_per_iteration": 2.687406539916992 }, { "auxiliary_loss_clip": 0.01141108, "auxiliary_loss_mlp": 0.01028352, "balance_loss_clip": 1.04935861, "balance_loss_mlp": 1.02101481, "epoch": 0.6936812360969158, "flos": 18254184019200.0, "grad_norm": 1.7303754052358045, "language_loss": 0.74132663, "learning_rate": 9.060007180947071e-07, "loss": 0.76302123, "num_input_tokens_seen": 124110930, "step": 5769, "time_per_iteration": 2.634010076522827 }, { "auxiliary_loss_clip": 0.01110128, "auxiliary_loss_mlp": 0.01028822, "balance_loss_clip": 1.04429603, "balance_loss_mlp": 1.02160382, "epoch": 0.6938014789875548, "flos": 31317336368640.0, "grad_norm": 2.469539871008601, "language_loss": 0.73412132, "learning_rate": 9.053486983427534e-07, "loss": 0.75551081, "num_input_tokens_seen": 124132180, "step": 5770, "time_per_iteration": 2.809152603149414 }, { "auxiliary_loss_clip": 0.01145951, "auxiliary_loss_mlp": 0.01024327, "balance_loss_clip": 1.04876232, "balance_loss_mlp": 1.01662898, "epoch": 0.6939217218781939, "flos": 17528429721600.0, "grad_norm": 1.9624092917962068, "language_loss": 0.70177281, "learning_rate": 9.046968446485326e-07, "loss": 0.72347558, "num_input_tokens_seen": 124150585, "step": 5771, "time_per_iteration": 2.640016794204712 }, { "auxiliary_loss_clip": 0.01162274, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 1.05279207, "balance_loss_mlp": 1.01823449, "epoch": 0.6940419647688331, "flos": 18551776199040.0, "grad_norm": 2.378612000578876, "language_loss": 0.7057988, "learning_rate": 9.040451571109295e-07, "loss": 0.72767949, "num_input_tokens_seen": 124166205, "step": 5772, "time_per_iteration": 2.6267385482788086 }, { "auxiliary_loss_clip": 0.01057744, "auxiliary_loss_mlp": 0.01002712, "balance_loss_clip": 1.04332447, "balance_loss_mlp": 1.0017761, "epoch": 0.6941622076594721, "flos": 66926286829440.0, "grad_norm": 0.8373806255828555, "language_loss": 0.60430849, "learning_rate": 9.033936358288042e-07, "loss": 0.62491298, "num_input_tokens_seen": 124219940, "step": 5773, "time_per_iteration": 3.1407148838043213 }, { "auxiliary_loss_clip": 0.01176395, "auxiliary_loss_mlp": 0.01026835, "balance_loss_clip": 1.05379808, "balance_loss_mlp": 1.01987648, "epoch": 0.6942824505501112, "flos": 26578062051840.0, "grad_norm": 2.3808266658401105, "language_loss": 0.82214451, "learning_rate": 9.027422809009937e-07, "loss": 0.84417683, "num_input_tokens_seen": 124239885, "step": 5774, "time_per_iteration": 2.6013054847717285 }, { "auxiliary_loss_clip": 0.01158057, "auxiliary_loss_mlp": 0.01024226, "balance_loss_clip": 1.04903746, "balance_loss_mlp": 1.01697218, "epoch": 0.6944026934407503, "flos": 21248308056960.0, "grad_norm": 1.8001771549415584, "language_loss": 0.83149642, "learning_rate": 9.020910924263054e-07, "loss": 0.85331929, "num_input_tokens_seen": 124258410, "step": 5775, "time_per_iteration": 2.6019129753112793 }, { "auxiliary_loss_clip": 0.01054354, "auxiliary_loss_mlp": 0.01003277, "balance_loss_clip": 1.04166842, "balance_loss_mlp": 1.00226402, "epoch": 0.6945229363313894, "flos": 70677191537280.0, "grad_norm": 0.8110775120480173, "language_loss": 0.58124048, "learning_rate": 9.014400705035261e-07, "loss": 0.60181677, "num_input_tokens_seen": 124315315, "step": 5776, "time_per_iteration": 3.300025463104248 }, { "auxiliary_loss_clip": 0.01177303, "auxiliary_loss_mlp": 0.01029432, "balance_loss_clip": 1.05756831, "balance_loss_mlp": 1.02259195, "epoch": 0.6946431792220285, "flos": 18952934267520.0, "grad_norm": 1.8859489083871346, "language_loss": 0.76902229, "learning_rate": 9.00789215231414e-07, "loss": 0.79108965, "num_input_tokens_seen": 124333710, "step": 5777, "time_per_iteration": 3.554755210876465 }, { "auxiliary_loss_clip": 0.01124435, "auxiliary_loss_mlp": 0.00711751, "balance_loss_clip": 1.04502022, "balance_loss_mlp": 1.00068736, "epoch": 0.6947634221126676, "flos": 20338834671360.0, "grad_norm": 1.9168533314035208, "language_loss": 0.82058907, "learning_rate": 9.001385267087056e-07, "loss": 0.83895093, "num_input_tokens_seen": 124352855, "step": 5778, "time_per_iteration": 3.5800912380218506 }, { "auxiliary_loss_clip": 0.01161774, "auxiliary_loss_mlp": 0.01026768, "balance_loss_clip": 1.05323672, "balance_loss_mlp": 1.01962185, "epoch": 0.6948836650033067, "flos": 21833723917440.0, "grad_norm": 1.8880702975945203, "language_loss": 0.70594192, "learning_rate": 8.994880050341072e-07, "loss": 0.72782725, "num_input_tokens_seen": 124372960, "step": 5779, "time_per_iteration": 3.5564725399017334 }, { "auxiliary_loss_clip": 0.01137772, "auxiliary_loss_mlp": 0.01032006, "balance_loss_clip": 1.0503099, "balance_loss_mlp": 1.02488899, "epoch": 0.6950039078939457, "flos": 23657519024640.0, "grad_norm": 2.1338056566555093, "language_loss": 0.7801621, "learning_rate": 8.988376503063026e-07, "loss": 0.80185986, "num_input_tokens_seen": 124394220, "step": 5780, "time_per_iteration": 2.733999013900757 }, { "auxiliary_loss_clip": 0.01118876, "auxiliary_loss_mlp": 0.01022468, "balance_loss_clip": 1.04863501, "balance_loss_mlp": 1.01537538, "epoch": 0.6951241507845849, "flos": 21792462168960.0, "grad_norm": 2.9928647861428073, "language_loss": 0.81428605, "learning_rate": 8.981874626239521e-07, "loss": 0.8356995, "num_input_tokens_seen": 124412795, "step": 5781, "time_per_iteration": 3.636852979660034 }, { "auxiliary_loss_clip": 0.01161677, "auxiliary_loss_mlp": 0.01027531, "balance_loss_clip": 1.05453515, "balance_loss_mlp": 1.01960897, "epoch": 0.695244393675224, "flos": 14647568244480.0, "grad_norm": 2.0577401648504887, "language_loss": 0.88362551, "learning_rate": 8.975374420856872e-07, "loss": 0.90551758, "num_input_tokens_seen": 124429690, "step": 5782, "time_per_iteration": 2.5800909996032715 }, { "auxiliary_loss_clip": 0.01117146, "auxiliary_loss_mlp": 0.01025998, "balance_loss_clip": 1.04617643, "balance_loss_mlp": 1.01881838, "epoch": 0.695364636565863, "flos": 16873203778560.0, "grad_norm": 2.1115370359109686, "language_loss": 0.72663438, "learning_rate": 8.968875887901157e-07, "loss": 0.74806583, "num_input_tokens_seen": 124447070, "step": 5783, "time_per_iteration": 2.688610792160034 }, { "auxiliary_loss_clip": 0.01141724, "auxiliary_loss_mlp": 0.01022863, "balance_loss_clip": 1.04823029, "balance_loss_mlp": 1.01570415, "epoch": 0.6954848794565022, "flos": 19354523299200.0, "grad_norm": 2.188604550827255, "language_loss": 0.62774557, "learning_rate": 8.9623790283582e-07, "loss": 0.64939141, "num_input_tokens_seen": 124464950, "step": 5784, "time_per_iteration": 2.6507062911987305 }, { "auxiliary_loss_clip": 0.01128413, "auxiliary_loss_mlp": 0.01029597, "balance_loss_clip": 1.05040407, "balance_loss_mlp": 1.02196717, "epoch": 0.6956051223471412, "flos": 18990209606400.0, "grad_norm": 2.1301512602693884, "language_loss": 0.76557243, "learning_rate": 8.955883843213561e-07, "loss": 0.78715253, "num_input_tokens_seen": 124483965, "step": 5785, "time_per_iteration": 2.6512858867645264 }, { "auxiliary_loss_clip": 0.01166998, "auxiliary_loss_mlp": 0.01030487, "balance_loss_clip": 1.05347705, "balance_loss_mlp": 1.02266097, "epoch": 0.6957253652377803, "flos": 16107229226880.0, "grad_norm": 2.357724833096942, "language_loss": 0.87190723, "learning_rate": 8.949390333452569e-07, "loss": 0.89388216, "num_input_tokens_seen": 124501910, "step": 5786, "time_per_iteration": 2.603489637374878 }, { "auxiliary_loss_clip": 0.0117435, "auxiliary_loss_mlp": 0.01025461, "balance_loss_clip": 1.054425, "balance_loss_mlp": 1.0182904, "epoch": 0.6958456081284194, "flos": 29388646569600.0, "grad_norm": 1.8899705993513103, "language_loss": 0.68067503, "learning_rate": 8.942898500060279e-07, "loss": 0.70267308, "num_input_tokens_seen": 124521625, "step": 5787, "time_per_iteration": 2.6693239212036133 }, { "auxiliary_loss_clip": 0.01119782, "auxiliary_loss_mlp": 0.01031308, "balance_loss_clip": 1.04840517, "balance_loss_mlp": 1.02326727, "epoch": 0.6959658510190585, "flos": 25154850395520.0, "grad_norm": 2.967763094853393, "language_loss": 0.71706355, "learning_rate": 8.936408344021493e-07, "loss": 0.73857439, "num_input_tokens_seen": 124538540, "step": 5788, "time_per_iteration": 2.716249465942383 }, { "auxiliary_loss_clip": 0.01155117, "auxiliary_loss_mlp": 0.01027204, "balance_loss_clip": 1.05475092, "balance_loss_mlp": 1.01930344, "epoch": 0.6960860939096976, "flos": 42814388759040.0, "grad_norm": 2.3540135153417907, "language_loss": 0.70971119, "learning_rate": 8.929919866320765e-07, "loss": 0.73153442, "num_input_tokens_seen": 124559355, "step": 5789, "time_per_iteration": 2.86749529838562 }, { "auxiliary_loss_clip": 0.01135624, "auxiliary_loss_mlp": 0.0071153, "balance_loss_clip": 1.05111098, "balance_loss_mlp": 1.00057924, "epoch": 0.6962063368003367, "flos": 17566566986880.0, "grad_norm": 2.4008620496872664, "language_loss": 0.81453836, "learning_rate": 8.923433067942385e-07, "loss": 0.83300996, "num_input_tokens_seen": 124577920, "step": 5790, "time_per_iteration": 2.6941356658935547 }, { "auxiliary_loss_clip": 0.0113648, "auxiliary_loss_mlp": 0.01024067, "balance_loss_clip": 1.05094409, "balance_loss_mlp": 1.01706386, "epoch": 0.6963265796909758, "flos": 21251648021760.0, "grad_norm": 2.0335395343291824, "language_loss": 0.68660581, "learning_rate": 8.916947949870417e-07, "loss": 0.7082113, "num_input_tokens_seen": 124597585, "step": 5791, "time_per_iteration": 2.6998026371002197 }, { "auxiliary_loss_clip": 0.01080091, "auxiliary_loss_mlp": 0.01001538, "balance_loss_clip": 1.03580475, "balance_loss_mlp": 1.00057209, "epoch": 0.6964468225816148, "flos": 68828295801600.0, "grad_norm": 0.7401910484517724, "language_loss": 0.58151829, "learning_rate": 8.910464513088615e-07, "loss": 0.60233456, "num_input_tokens_seen": 124661625, "step": 5792, "time_per_iteration": 3.277672052383423 }, { "auxiliary_loss_clip": 0.01134101, "auxiliary_loss_mlp": 0.01024632, "balance_loss_clip": 1.04716086, "balance_loss_mlp": 1.01733351, "epoch": 0.696567065472254, "flos": 18950887192320.0, "grad_norm": 2.6772579783535577, "language_loss": 0.78232616, "learning_rate": 8.903982758580542e-07, "loss": 0.80391347, "num_input_tokens_seen": 124680565, "step": 5793, "time_per_iteration": 2.627351999282837 }, { "auxiliary_loss_clip": 0.01139402, "auxiliary_loss_mlp": 0.01025422, "balance_loss_clip": 1.05031717, "balance_loss_mlp": 1.01807332, "epoch": 0.696687308362893, "flos": 22856675345280.0, "grad_norm": 3.2395107878106493, "language_loss": 0.80108511, "learning_rate": 8.897502687329457e-07, "loss": 0.8227334, "num_input_tokens_seen": 124700365, "step": 5794, "time_per_iteration": 2.6603288650512695 }, { "auxiliary_loss_clip": 0.01125973, "auxiliary_loss_mlp": 0.0102963, "balance_loss_clip": 1.05041897, "balance_loss_mlp": 1.02237034, "epoch": 0.6968075512535321, "flos": 24972926987520.0, "grad_norm": 2.6234358920348586, "language_loss": 0.8000409, "learning_rate": 8.891024300318382e-07, "loss": 0.82159698, "num_input_tokens_seen": 124718935, "step": 5795, "time_per_iteration": 2.7122466564178467 }, { "auxiliary_loss_clip": 0.01118387, "auxiliary_loss_mlp": 0.01028879, "balance_loss_clip": 1.04693496, "balance_loss_mlp": 1.021819, "epoch": 0.6969277941441713, "flos": 21030438113280.0, "grad_norm": 1.5910617428194764, "language_loss": 0.76228607, "learning_rate": 8.884547598530103e-07, "loss": 0.78375876, "num_input_tokens_seen": 124739505, "step": 5796, "time_per_iteration": 2.6707019805908203 }, { "auxiliary_loss_clip": 0.01069063, "auxiliary_loss_mlp": 0.01027789, "balance_loss_clip": 1.0426228, "balance_loss_mlp": 1.01996922, "epoch": 0.6970480370348103, "flos": 21579404647680.0, "grad_norm": 2.2354180372156796, "language_loss": 0.75375926, "learning_rate": 8.8780725829471e-07, "loss": 0.77472782, "num_input_tokens_seen": 124757410, "step": 5797, "time_per_iteration": 2.880186080932617 }, { "auxiliary_loss_clip": 0.01176661, "auxiliary_loss_mlp": 0.0102499, "balance_loss_clip": 1.05407, "balance_loss_mlp": 1.01791477, "epoch": 0.6971682799254494, "flos": 22419175691520.0, "grad_norm": 2.4414803668455973, "language_loss": 0.78270847, "learning_rate": 8.87159925455165e-07, "loss": 0.80472493, "num_input_tokens_seen": 124777240, "step": 5798, "time_per_iteration": 2.8122849464416504 }, { "auxiliary_loss_clip": 0.01123273, "auxiliary_loss_mlp": 0.01027683, "balance_loss_clip": 1.0485338, "balance_loss_mlp": 1.0205512, "epoch": 0.6972885228160886, "flos": 20005834659840.0, "grad_norm": 2.268799096025992, "language_loss": 0.73118305, "learning_rate": 8.865127614325738e-07, "loss": 0.75269258, "num_input_tokens_seen": 124795670, "step": 5799, "time_per_iteration": 2.7060325145721436 }, { "auxiliary_loss_clip": 0.01137051, "auxiliary_loss_mlp": 0.01023704, "balance_loss_clip": 1.04859078, "balance_loss_mlp": 1.0163517, "epoch": 0.6974087657067276, "flos": 37853437656960.0, "grad_norm": 1.7917917267722823, "language_loss": 0.66635907, "learning_rate": 8.85865766325113e-07, "loss": 0.68796659, "num_input_tokens_seen": 124819600, "step": 5800, "time_per_iteration": 2.786921739578247 }, { "auxiliary_loss_clip": 0.01140725, "auxiliary_loss_mlp": 0.01026111, "balance_loss_clip": 1.05031979, "balance_loss_mlp": 1.01912856, "epoch": 0.6975290085973667, "flos": 29489267543040.0, "grad_norm": 2.948954968879261, "language_loss": 0.72354728, "learning_rate": 8.852189402309287e-07, "loss": 0.74521559, "num_input_tokens_seen": 124838785, "step": 5801, "time_per_iteration": 2.6938531398773193 }, { "auxiliary_loss_clip": 0.0115989, "auxiliary_loss_mlp": 0.0103118, "balance_loss_clip": 1.05312407, "balance_loss_mlp": 1.0234108, "epoch": 0.6976492514880057, "flos": 12895630295040.0, "grad_norm": 2.7360618789916287, "language_loss": 0.74877, "learning_rate": 8.845722832481441e-07, "loss": 0.77068067, "num_input_tokens_seen": 124854215, "step": 5802, "time_per_iteration": 2.599684953689575 }, { "auxiliary_loss_clip": 0.01157052, "auxiliary_loss_mlp": 0.01026774, "balance_loss_clip": 1.05197906, "balance_loss_mlp": 1.01954412, "epoch": 0.6977694943786449, "flos": 24352929308160.0, "grad_norm": 2.101671982564735, "language_loss": 0.77415508, "learning_rate": 8.83925795474858e-07, "loss": 0.79599333, "num_input_tokens_seen": 124874340, "step": 5803, "time_per_iteration": 3.621168613433838 }, { "auxiliary_loss_clip": 0.01123005, "auxiliary_loss_mlp": 0.01027229, "balance_loss_clip": 1.04980767, "balance_loss_mlp": 1.0196594, "epoch": 0.6978897372692839, "flos": 29898470257920.0, "grad_norm": 2.4905373730537423, "language_loss": 0.59262061, "learning_rate": 8.832794770091414e-07, "loss": 0.61412293, "num_input_tokens_seen": 124895175, "step": 5804, "time_per_iteration": 3.6610257625579834 }, { "auxiliary_loss_clip": 0.01149292, "auxiliary_loss_mlp": 0.01022647, "balance_loss_clip": 1.05124056, "balance_loss_mlp": 1.01535714, "epoch": 0.698009980159923, "flos": 21761579450880.0, "grad_norm": 2.107720687768508, "language_loss": 0.82737553, "learning_rate": 8.826333279490401e-07, "loss": 0.84909499, "num_input_tokens_seen": 124915810, "step": 5805, "time_per_iteration": 3.56413197517395 }, { "auxiliary_loss_clip": 0.01145333, "auxiliary_loss_mlp": 0.01028878, "balance_loss_clip": 1.05064321, "balance_loss_mlp": 1.02220559, "epoch": 0.6981302230505622, "flos": 19857164267520.0, "grad_norm": 2.6281640447943597, "language_loss": 0.68011791, "learning_rate": 8.819873483925748e-07, "loss": 0.70186007, "num_input_tokens_seen": 124932930, "step": 5806, "time_per_iteration": 2.5979485511779785 }, { "auxiliary_loss_clip": 0.01134, "auxiliary_loss_mlp": 0.0071139, "balance_loss_clip": 1.05201936, "balance_loss_mlp": 1.0005734, "epoch": 0.6982504659412012, "flos": 22198648141440.0, "grad_norm": 2.0386054221380405, "language_loss": 0.74689424, "learning_rate": 8.81341538437739e-07, "loss": 0.76534808, "num_input_tokens_seen": 124951220, "step": 5807, "time_per_iteration": 3.6040561199188232 }, { "auxiliary_loss_clip": 0.01146133, "auxiliary_loss_mlp": 0.01025648, "balance_loss_clip": 1.04869461, "balance_loss_mlp": 1.01787806, "epoch": 0.6983707088318403, "flos": 35588479708800.0, "grad_norm": 3.634540980393114, "language_loss": 0.68303394, "learning_rate": 8.80695898182503e-07, "loss": 0.70475179, "num_input_tokens_seen": 124972200, "step": 5808, "time_per_iteration": 2.8312671184539795 }, { "auxiliary_loss_clip": 0.01077124, "auxiliary_loss_mlp": 0.01001311, "balance_loss_clip": 1.04117513, "balance_loss_mlp": 1.00033987, "epoch": 0.6984909517224794, "flos": 65440052760960.0, "grad_norm": 0.8392854231203671, "language_loss": 0.65064502, "learning_rate": 8.800504277248093e-07, "loss": 0.67142928, "num_input_tokens_seen": 125036950, "step": 5809, "time_per_iteration": 3.2612192630767822 }, { "auxiliary_loss_clip": 0.01124353, "auxiliary_loss_mlp": 0.00711149, "balance_loss_clip": 1.05195272, "balance_loss_mlp": 1.00064611, "epoch": 0.6986111946131185, "flos": 18546927863040.0, "grad_norm": 1.8619061161798016, "language_loss": 0.75101674, "learning_rate": 8.794051271625753e-07, "loss": 0.76937175, "num_input_tokens_seen": 125054585, "step": 5810, "time_per_iteration": 2.693756580352783 }, { "auxiliary_loss_clip": 0.01142741, "auxiliary_loss_mlp": 0.0102566, "balance_loss_clip": 1.05048299, "balance_loss_mlp": 1.01871622, "epoch": 0.6987314375037575, "flos": 23039173370880.0, "grad_norm": 1.5490063012640376, "language_loss": 0.8337459, "learning_rate": 8.787599965936925e-07, "loss": 0.85542989, "num_input_tokens_seen": 125075515, "step": 5811, "time_per_iteration": 2.657099485397339 }, { "auxiliary_loss_clip": 0.01118855, "auxiliary_loss_mlp": 0.01025411, "balance_loss_clip": 1.04863358, "balance_loss_mlp": 1.01798463, "epoch": 0.6988516803943967, "flos": 38400393029760.0, "grad_norm": 1.9569699317921656, "language_loss": 0.71848345, "learning_rate": 8.781150361160261e-07, "loss": 0.7399261, "num_input_tokens_seen": 125097425, "step": 5812, "time_per_iteration": 2.8206028938293457 }, { "auxiliary_loss_clip": 0.0112763, "auxiliary_loss_mlp": 0.0102364, "balance_loss_clip": 1.04734373, "balance_loss_mlp": 1.01634407, "epoch": 0.6989719232850358, "flos": 24096993926400.0, "grad_norm": 1.7292219309000922, "language_loss": 0.73693991, "learning_rate": 8.774702458274181e-07, "loss": 0.75845259, "num_input_tokens_seen": 125117830, "step": 5813, "time_per_iteration": 2.720316171646118 }, { "auxiliary_loss_clip": 0.01157385, "auxiliary_loss_mlp": 0.01022744, "balance_loss_clip": 1.05060101, "balance_loss_mlp": 1.01563573, "epoch": 0.6990921661756748, "flos": 14866838818560.0, "grad_norm": 2.80851310832457, "language_loss": 0.71106511, "learning_rate": 8.768256258256799e-07, "loss": 0.73286641, "num_input_tokens_seen": 125134455, "step": 5814, "time_per_iteration": 2.5865044593811035 }, { "auxiliary_loss_clip": 0.01158569, "auxiliary_loss_mlp": 0.01026584, "balance_loss_clip": 1.05037379, "balance_loss_mlp": 1.0194937, "epoch": 0.699212409066314, "flos": 20193719725440.0, "grad_norm": 1.8828823001117574, "language_loss": 0.74030048, "learning_rate": 8.76181176208602e-07, "loss": 0.76215202, "num_input_tokens_seen": 125152555, "step": 5815, "time_per_iteration": 2.62807559967041 }, { "auxiliary_loss_clip": 0.01096208, "auxiliary_loss_mlp": 0.01025703, "balance_loss_clip": 1.04166746, "balance_loss_mlp": 1.01867545, "epoch": 0.699332651956953, "flos": 19427888828160.0, "grad_norm": 2.2183514822104957, "language_loss": 0.73621571, "learning_rate": 8.755368970739461e-07, "loss": 0.75743484, "num_input_tokens_seen": 125171915, "step": 5816, "time_per_iteration": 2.704742670059204 }, { "auxiliary_loss_clip": 0.01133175, "auxiliary_loss_mlp": 0.01025754, "balance_loss_clip": 1.04820621, "balance_loss_mlp": 1.01791584, "epoch": 0.6994528948475921, "flos": 16143714466560.0, "grad_norm": 3.600712177112117, "language_loss": 0.61632907, "learning_rate": 8.748927885194479e-07, "loss": 0.63791835, "num_input_tokens_seen": 125190220, "step": 5817, "time_per_iteration": 2.689974784851074 }, { "auxiliary_loss_clip": 0.01044117, "auxiliary_loss_mlp": 0.01003226, "balance_loss_clip": 1.03244281, "balance_loss_mlp": 1.00225472, "epoch": 0.6995731377382313, "flos": 64952420699520.0, "grad_norm": 0.8034530470470806, "language_loss": 0.57291371, "learning_rate": 8.742488506428209e-07, "loss": 0.59338713, "num_input_tokens_seen": 125249310, "step": 5818, "time_per_iteration": 3.2094860076904297 }, { "auxiliary_loss_clip": 0.01146339, "auxiliary_loss_mlp": 0.00711208, "balance_loss_clip": 1.05041265, "balance_loss_mlp": 1.0005827, "epoch": 0.6996933806288703, "flos": 24900136076160.0, "grad_norm": 2.680665358286183, "language_loss": 0.78465843, "learning_rate": 8.736050835417466e-07, "loss": 0.80323398, "num_input_tokens_seen": 125269350, "step": 5819, "time_per_iteration": 2.6878745555877686 }, { "auxiliary_loss_clip": 0.01161743, "auxiliary_loss_mlp": 0.01030922, "balance_loss_clip": 1.05142331, "balance_loss_mlp": 1.02406716, "epoch": 0.6998136235195094, "flos": 20777806782720.0, "grad_norm": 1.97573548396418, "language_loss": 0.61185199, "learning_rate": 8.729614873138862e-07, "loss": 0.63377869, "num_input_tokens_seen": 125286985, "step": 5820, "time_per_iteration": 2.584390163421631 }, { "auxiliary_loss_clip": 0.01116505, "auxiliary_loss_mlp": 0.01024722, "balance_loss_clip": 1.04991674, "balance_loss_mlp": 1.01715779, "epoch": 0.6999338664101485, "flos": 23733470332800.0, "grad_norm": 2.3860366392904466, "language_loss": 0.7772668, "learning_rate": 8.723180620568716e-07, "loss": 0.79867911, "num_input_tokens_seen": 125306240, "step": 5821, "time_per_iteration": 2.753126382827759 }, { "auxiliary_loss_clip": 0.01146249, "auxiliary_loss_mlp": 0.01023514, "balance_loss_clip": 1.04959464, "balance_loss_mlp": 1.01685297, "epoch": 0.7000541093007876, "flos": 19864598382720.0, "grad_norm": 1.9698238382920366, "language_loss": 0.85118312, "learning_rate": 8.716748078683116e-07, "loss": 0.87288076, "num_input_tokens_seen": 125323015, "step": 5822, "time_per_iteration": 2.668119192123413 }, { "auxiliary_loss_clip": 0.01069984, "auxiliary_loss_mlp": 0.01029442, "balance_loss_clip": 1.04301035, "balance_loss_mlp": 1.02171123, "epoch": 0.7001743521914267, "flos": 29679056029440.0, "grad_norm": 2.2964206854537195, "language_loss": 0.68979418, "learning_rate": 8.710317248457855e-07, "loss": 0.71078843, "num_input_tokens_seen": 125342630, "step": 5823, "time_per_iteration": 2.8423807621002197 }, { "auxiliary_loss_clip": 0.01141171, "auxiliary_loss_mlp": 0.01022402, "balance_loss_clip": 1.05311298, "balance_loss_mlp": 1.01522291, "epoch": 0.7002945950820658, "flos": 27489762080640.0, "grad_norm": 2.2274002497543663, "language_loss": 0.72176981, "learning_rate": 8.703888130868482e-07, "loss": 0.74340558, "num_input_tokens_seen": 125364480, "step": 5824, "time_per_iteration": 2.780466318130493 }, { "auxiliary_loss_clip": 0.01125721, "auxiliary_loss_mlp": 0.01028735, "balance_loss_clip": 1.04895353, "balance_loss_mlp": 1.02146935, "epoch": 0.7004148379727049, "flos": 22158463800960.0, "grad_norm": 2.0140866774426756, "language_loss": 0.8248657, "learning_rate": 8.697460726890307e-07, "loss": 0.84641027, "num_input_tokens_seen": 125381625, "step": 5825, "time_per_iteration": 2.637913942337036 }, { "auxiliary_loss_clip": 0.01124221, "auxiliary_loss_mlp": 0.00711768, "balance_loss_clip": 1.04511166, "balance_loss_mlp": 1.00068498, "epoch": 0.7005350808633439, "flos": 19423758764160.0, "grad_norm": 3.1786577149941055, "language_loss": 0.90527654, "learning_rate": 8.691035037498354e-07, "loss": 0.92363644, "num_input_tokens_seen": 125397615, "step": 5826, "time_per_iteration": 2.7006070613861084 }, { "auxiliary_loss_clip": 0.01137083, "auxiliary_loss_mlp": 0.01026133, "balance_loss_clip": 1.04635644, "balance_loss_mlp": 1.01915383, "epoch": 0.7006553237539831, "flos": 23476708938240.0, "grad_norm": 3.5933761541603197, "language_loss": 0.72550404, "learning_rate": 8.684611063667391e-07, "loss": 0.74713624, "num_input_tokens_seen": 125418080, "step": 5827, "time_per_iteration": 2.651671886444092 }, { "auxiliary_loss_clip": 0.01155894, "auxiliary_loss_mlp": 0.01025223, "balance_loss_clip": 1.04958963, "balance_loss_mlp": 1.0177753, "epoch": 0.7007755666446221, "flos": 31212872640000.0, "grad_norm": 2.0765826800336296, "language_loss": 0.77029407, "learning_rate": 8.678188806371935e-07, "loss": 0.79210532, "num_input_tokens_seen": 125440115, "step": 5828, "time_per_iteration": 2.6852009296417236 }, { "auxiliary_loss_clip": 0.01155394, "auxiliary_loss_mlp": 0.01021812, "balance_loss_clip": 1.04841471, "balance_loss_mlp": 1.01516056, "epoch": 0.7008958095352612, "flos": 18149899858560.0, "grad_norm": 1.757769746014874, "language_loss": 0.85253501, "learning_rate": 8.671768266586228e-07, "loss": 0.87430704, "num_input_tokens_seen": 125458240, "step": 5829, "time_per_iteration": 3.6445107460021973 }, { "auxiliary_loss_clip": 0.0112358, "auxiliary_loss_mlp": 0.01034771, "balance_loss_clip": 1.04747665, "balance_loss_mlp": 1.02737963, "epoch": 0.7010160524259004, "flos": 27452307173760.0, "grad_norm": 2.4892427197147198, "language_loss": 0.78316319, "learning_rate": 8.665349445284275e-07, "loss": 0.80474675, "num_input_tokens_seen": 125477980, "step": 5830, "time_per_iteration": 3.6370856761932373 }, { "auxiliary_loss_clip": 0.01122364, "auxiliary_loss_mlp": 0.01023373, "balance_loss_clip": 1.04629493, "balance_loss_mlp": 1.01659918, "epoch": 0.7011362953165394, "flos": 23842064125440.0, "grad_norm": 1.5876900955480344, "language_loss": 0.80882502, "learning_rate": 8.658932343439799e-07, "loss": 0.83028233, "num_input_tokens_seen": 125497765, "step": 5831, "time_per_iteration": 2.760467529296875 }, { "auxiliary_loss_clip": 0.01174324, "auxiliary_loss_mlp": 0.01025013, "balance_loss_clip": 1.05315304, "balance_loss_mlp": 1.01799142, "epoch": 0.7012565382071785, "flos": 24823430582400.0, "grad_norm": 2.024589966432211, "language_loss": 0.77983618, "learning_rate": 8.65251696202627e-07, "loss": 0.80182958, "num_input_tokens_seen": 125514145, "step": 5832, "time_per_iteration": 2.585803985595703 }, { "auxiliary_loss_clip": 0.01126225, "auxiliary_loss_mlp": 0.01027541, "balance_loss_clip": 1.04828417, "balance_loss_mlp": 1.01989365, "epoch": 0.7013767810978175, "flos": 21397445326080.0, "grad_norm": 9.04529398917649, "language_loss": 0.87587881, "learning_rate": 8.646103302016896e-07, "loss": 0.89741647, "num_input_tokens_seen": 125533115, "step": 5833, "time_per_iteration": 3.615353584289551 }, { "auxiliary_loss_clip": 0.01121205, "auxiliary_loss_mlp": 0.01025828, "balance_loss_clip": 1.04738188, "balance_loss_mlp": 1.0183177, "epoch": 0.7014970239884567, "flos": 16687150306560.0, "grad_norm": 1.9528175603414462, "language_loss": 0.88551438, "learning_rate": 8.639691364384614e-07, "loss": 0.90698469, "num_input_tokens_seen": 125550740, "step": 5834, "time_per_iteration": 2.722984552383423 }, { "auxiliary_loss_clip": 0.01144033, "auxiliary_loss_mlp": 0.01028372, "balance_loss_clip": 1.04995537, "balance_loss_mlp": 1.02108836, "epoch": 0.7016172668790958, "flos": 12568268718720.0, "grad_norm": 4.374849601430092, "language_loss": 0.72689807, "learning_rate": 8.633281150102136e-07, "loss": 0.74862218, "num_input_tokens_seen": 125567590, "step": 5835, "time_per_iteration": 2.6164846420288086 }, { "auxiliary_loss_clip": 0.01145313, "auxiliary_loss_mlp": 0.01028224, "balance_loss_clip": 1.0523504, "balance_loss_mlp": 1.0208658, "epoch": 0.7017375097697348, "flos": 17452729808640.0, "grad_norm": 2.519138961266485, "language_loss": 0.67942286, "learning_rate": 8.626872660141855e-07, "loss": 0.70115829, "num_input_tokens_seen": 125585500, "step": 5836, "time_per_iteration": 2.642364025115967 }, { "auxiliary_loss_clip": 0.01112084, "auxiliary_loss_mlp": 0.01025532, "balance_loss_clip": 1.0481112, "balance_loss_mlp": 1.01854968, "epoch": 0.701857752660374, "flos": 18513028402560.0, "grad_norm": 1.9066974007865884, "language_loss": 0.74774802, "learning_rate": 8.620465895475957e-07, "loss": 0.76912415, "num_input_tokens_seen": 125603720, "step": 5837, "time_per_iteration": 2.687201976776123 }, { "auxiliary_loss_clip": 0.01105747, "auxiliary_loss_mlp": 0.01028003, "balance_loss_clip": 1.04694223, "balance_loss_mlp": 1.02081764, "epoch": 0.701977995551013, "flos": 24425971614720.0, "grad_norm": 1.6048757816883863, "language_loss": 0.75320411, "learning_rate": 8.614060857076333e-07, "loss": 0.77454162, "num_input_tokens_seen": 125624390, "step": 5838, "time_per_iteration": 2.7510719299316406 }, { "auxiliary_loss_clip": 0.01139242, "auxiliary_loss_mlp": 0.01031873, "balance_loss_clip": 1.04919291, "balance_loss_mlp": 1.02426767, "epoch": 0.7020982384416521, "flos": 23002759958400.0, "grad_norm": 2.329898658946449, "language_loss": 0.74921662, "learning_rate": 8.60765754591462e-07, "loss": 0.77092779, "num_input_tokens_seen": 125644085, "step": 5839, "time_per_iteration": 2.6160926818847656 }, { "auxiliary_loss_clip": 0.01174017, "auxiliary_loss_mlp": 0.01027231, "balance_loss_clip": 1.05258453, "balance_loss_mlp": 1.01964307, "epoch": 0.7022184813322913, "flos": 20449080489600.0, "grad_norm": 2.3561567029760977, "language_loss": 0.73029536, "learning_rate": 8.601255962962211e-07, "loss": 0.75230789, "num_input_tokens_seen": 125663095, "step": 5840, "time_per_iteration": 2.621500253677368 }, { "auxiliary_loss_clip": 0.01166606, "auxiliary_loss_mlp": 0.01029467, "balance_loss_clip": 1.05290091, "balance_loss_mlp": 1.02129531, "epoch": 0.7023387242229303, "flos": 19790514581760.0, "grad_norm": 2.299463248602676, "language_loss": 0.72773463, "learning_rate": 8.594856109190194e-07, "loss": 0.74969536, "num_input_tokens_seen": 125680125, "step": 5841, "time_per_iteration": 2.5749223232269287 }, { "auxiliary_loss_clip": 0.01172513, "auxiliary_loss_mlp": 0.01024516, "balance_loss_clip": 1.05157614, "balance_loss_mlp": 1.01722932, "epoch": 0.7024589671135694, "flos": 33259278286080.0, "grad_norm": 2.243606926732274, "language_loss": 0.69198781, "learning_rate": 8.588457985569446e-07, "loss": 0.71395808, "num_input_tokens_seen": 125703035, "step": 5842, "time_per_iteration": 2.723280191421509 }, { "auxiliary_loss_clip": 0.0117562, "auxiliary_loss_mlp": 0.01030703, "balance_loss_clip": 1.05401111, "balance_loss_mlp": 1.02260303, "epoch": 0.7025792100042085, "flos": 19098982967040.0, "grad_norm": 3.5229175720465538, "language_loss": 0.71809787, "learning_rate": 8.582061593070542e-07, "loss": 0.74016106, "num_input_tokens_seen": 125723765, "step": 5843, "time_per_iteration": 2.627631187438965 }, { "auxiliary_loss_clip": 0.0117571, "auxiliary_loss_mlp": 0.00711277, "balance_loss_clip": 1.05365276, "balance_loss_mlp": 1.00067997, "epoch": 0.7026994528948476, "flos": 18952611045120.0, "grad_norm": 4.430344181511128, "language_loss": 0.77380633, "learning_rate": 8.57566693266383e-07, "loss": 0.79267615, "num_input_tokens_seen": 125741455, "step": 5844, "time_per_iteration": 2.5876076221466064 }, { "auxiliary_loss_clip": 0.01146373, "auxiliary_loss_mlp": 0.00711454, "balance_loss_clip": 1.04823828, "balance_loss_mlp": 1.00060058, "epoch": 0.7028196957854866, "flos": 19536662188800.0, "grad_norm": 2.2845088228230326, "language_loss": 0.69536889, "learning_rate": 8.569274005319354e-07, "loss": 0.71394712, "num_input_tokens_seen": 125759855, "step": 5845, "time_per_iteration": 2.60662579536438 }, { "auxiliary_loss_clip": 0.01154996, "auxiliary_loss_mlp": 0.01029762, "balance_loss_clip": 1.05160046, "balance_loss_mlp": 1.02301812, "epoch": 0.7029399386761258, "flos": 20845318394880.0, "grad_norm": 2.5754636354865608, "language_loss": 0.80083799, "learning_rate": 8.562882812006913e-07, "loss": 0.82268548, "num_input_tokens_seen": 125777345, "step": 5846, "time_per_iteration": 2.6240508556365967 }, { "auxiliary_loss_clip": 0.01173171, "auxiliary_loss_mlp": 0.01032077, "balance_loss_clip": 1.05150747, "balance_loss_mlp": 1.02477837, "epoch": 0.7030601815667649, "flos": 22055005653120.0, "grad_norm": 1.8317989385358455, "language_loss": 0.77516437, "learning_rate": 8.556493353696066e-07, "loss": 0.79721683, "num_input_tokens_seen": 125796345, "step": 5847, "time_per_iteration": 2.5916433334350586 }, { "auxiliary_loss_clip": 0.01161325, "auxiliary_loss_mlp": 0.00711769, "balance_loss_clip": 1.05250072, "balance_loss_mlp": 1.00059617, "epoch": 0.7031804244574039, "flos": 27198742089600.0, "grad_norm": 2.234994298924337, "language_loss": 0.68118, "learning_rate": 8.550105631356077e-07, "loss": 0.69991094, "num_input_tokens_seen": 125816070, "step": 5848, "time_per_iteration": 2.662180185317993 }, { "auxiliary_loss_clip": 0.01122305, "auxiliary_loss_mlp": 0.01025833, "balance_loss_clip": 1.04640913, "balance_loss_mlp": 1.01756608, "epoch": 0.7033006673480431, "flos": 22379853277440.0, "grad_norm": 3.176245082270358, "language_loss": 0.77054834, "learning_rate": 8.543719645955961e-07, "loss": 0.79202974, "num_input_tokens_seen": 125834400, "step": 5849, "time_per_iteration": 2.6741831302642822 }, { "auxiliary_loss_clip": 0.01143042, "auxiliary_loss_mlp": 0.01028245, "balance_loss_clip": 1.04828858, "balance_loss_mlp": 1.02139926, "epoch": 0.7034209102386821, "flos": 24715986024960.0, "grad_norm": 1.6208818568469612, "language_loss": 0.74709541, "learning_rate": 8.537335398464467e-07, "loss": 0.76880825, "num_input_tokens_seen": 125854720, "step": 5850, "time_per_iteration": 2.6892473697662354 }, { "auxiliary_loss_clip": 0.01140095, "auxiliary_loss_mlp": 0.01027014, "balance_loss_clip": 1.0468663, "balance_loss_mlp": 1.01932549, "epoch": 0.7035411531293212, "flos": 22556174163840.0, "grad_norm": 2.8401287441539407, "language_loss": 0.84958732, "learning_rate": 8.53095288985007e-07, "loss": 0.87125838, "num_input_tokens_seen": 125868455, "step": 5851, "time_per_iteration": 2.6269161701202393 }, { "auxiliary_loss_clip": 0.01171741, "auxiliary_loss_mlp": 0.01028516, "balance_loss_clip": 1.05174136, "balance_loss_mlp": 1.02138686, "epoch": 0.7036613960199604, "flos": 22674967418880.0, "grad_norm": 1.836016600024044, "language_loss": 0.82243311, "learning_rate": 8.524572121081009e-07, "loss": 0.84443569, "num_input_tokens_seen": 125888555, "step": 5852, "time_per_iteration": 2.5853166580200195 }, { "auxiliary_loss_clip": 0.01161769, "auxiliary_loss_mlp": 0.0102419, "balance_loss_clip": 1.05183721, "balance_loss_mlp": 1.0163517, "epoch": 0.7037816389105994, "flos": 22492146170880.0, "grad_norm": 2.1493756242008106, "language_loss": 0.62794626, "learning_rate": 8.518193093125232e-07, "loss": 0.6498059, "num_input_tokens_seen": 125907610, "step": 5853, "time_per_iteration": 2.672945261001587 }, { "auxiliary_loss_clip": 0.01148246, "auxiliary_loss_mlp": 0.01032539, "balance_loss_clip": 1.0500958, "balance_loss_mlp": 1.02549052, "epoch": 0.7039018818012385, "flos": 27087490690560.0, "grad_norm": 3.1447865111184847, "language_loss": 0.811786, "learning_rate": 8.511815806950436e-07, "loss": 0.83359385, "num_input_tokens_seen": 125928640, "step": 5854, "time_per_iteration": 2.6872923374176025 }, { "auxiliary_loss_clip": 0.01152432, "auxiliary_loss_mlp": 0.01024435, "balance_loss_clip": 1.04657221, "balance_loss_mlp": 1.01746154, "epoch": 0.7040221246918776, "flos": 17749819198080.0, "grad_norm": 1.9299769995008036, "language_loss": 0.78068709, "learning_rate": 8.505440263524044e-07, "loss": 0.80245578, "num_input_tokens_seen": 125947485, "step": 5855, "time_per_iteration": 3.5567641258239746 }, { "auxiliary_loss_clip": 0.01163369, "auxiliary_loss_mlp": 0.01026831, "balance_loss_clip": 1.0524056, "balance_loss_mlp": 1.01902866, "epoch": 0.7041423675825167, "flos": 16279851012480.0, "grad_norm": 3.213791101042422, "language_loss": 0.88282382, "learning_rate": 8.49906646381322e-07, "loss": 0.90472591, "num_input_tokens_seen": 125960320, "step": 5856, "time_per_iteration": 4.55251407623291 }, { "auxiliary_loss_clip": 0.01127116, "auxiliary_loss_mlp": 0.0102482, "balance_loss_clip": 1.04884589, "balance_loss_mlp": 1.017977, "epoch": 0.7042626104731557, "flos": 25483181639040.0, "grad_norm": 1.9582818290887485, "language_loss": 0.72181785, "learning_rate": 8.492694408784884e-07, "loss": 0.74333721, "num_input_tokens_seen": 125980575, "step": 5857, "time_per_iteration": 2.687633514404297 }, { "auxiliary_loss_clip": 0.0115976, "auxiliary_loss_mlp": 0.01024624, "balance_loss_clip": 1.05037951, "balance_loss_mlp": 1.01700044, "epoch": 0.7043828533637949, "flos": 17857622891520.0, "grad_norm": 2.7236039708029396, "language_loss": 0.62014079, "learning_rate": 8.486324099405642e-07, "loss": 0.64198458, "num_input_tokens_seen": 125997420, "step": 5858, "time_per_iteration": 2.662217140197754 }, { "auxiliary_loss_clip": 0.01159606, "auxiliary_loss_mlp": 0.01026225, "balance_loss_clip": 1.05124629, "balance_loss_mlp": 1.01906061, "epoch": 0.704503096254434, "flos": 29494259533440.0, "grad_norm": 1.7115154644553445, "language_loss": 0.7481209, "learning_rate": 8.479955536641887e-07, "loss": 0.76997924, "num_input_tokens_seen": 126018915, "step": 5859, "time_per_iteration": 2.6566295623779297 }, { "auxiliary_loss_clip": 0.01133122, "auxiliary_loss_mlp": 0.01024339, "balance_loss_clip": 1.04489195, "balance_loss_mlp": 1.01697135, "epoch": 0.704623339145073, "flos": 30920739327360.0, "grad_norm": 2.0197679058163955, "language_loss": 0.66215825, "learning_rate": 8.473588721459716e-07, "loss": 0.68373281, "num_input_tokens_seen": 126038825, "step": 5860, "time_per_iteration": 3.6157147884368896 }, { "auxiliary_loss_clip": 0.0115997, "auxiliary_loss_mlp": 0.01032442, "balance_loss_clip": 1.05358434, "balance_loss_mlp": 1.02427626, "epoch": 0.7047435820357122, "flos": 23914747296000.0, "grad_norm": 2.4745976275106507, "language_loss": 0.70546985, "learning_rate": 8.467223654824967e-07, "loss": 0.72739398, "num_input_tokens_seen": 126058280, "step": 5861, "time_per_iteration": 2.652416706085205 }, { "auxiliary_loss_clip": 0.01149646, "auxiliary_loss_mlp": 0.01026289, "balance_loss_clip": 1.04831374, "balance_loss_mlp": 1.01897562, "epoch": 0.7048638249263512, "flos": 46494010926720.0, "grad_norm": 2.590714699155609, "language_loss": 0.62655377, "learning_rate": 8.460860337703233e-07, "loss": 0.64831316, "num_input_tokens_seen": 126078885, "step": 5862, "time_per_iteration": 2.8071587085723877 }, { "auxiliary_loss_clip": 0.01112194, "auxiliary_loss_mlp": 0.0102514, "balance_loss_clip": 1.04368472, "balance_loss_mlp": 1.01672339, "epoch": 0.7049840678169903, "flos": 21689219502720.0, "grad_norm": 1.846870449353771, "language_loss": 0.70195532, "learning_rate": 8.454498771059797e-07, "loss": 0.72332871, "num_input_tokens_seen": 126098260, "step": 5863, "time_per_iteration": 2.6952056884765625 }, { "auxiliary_loss_clip": 0.01104581, "auxiliary_loss_mlp": 0.01026927, "balance_loss_clip": 1.04548001, "balance_loss_mlp": 1.01954198, "epoch": 0.7051043107076294, "flos": 18405081054720.0, "grad_norm": 2.2959756877440443, "language_loss": 0.83069044, "learning_rate": 8.448138955859725e-07, "loss": 0.85200548, "num_input_tokens_seen": 126114845, "step": 5864, "time_per_iteration": 2.6419003009796143 }, { "auxiliary_loss_clip": 0.01143172, "auxiliary_loss_mlp": 0.01030341, "balance_loss_clip": 1.04968202, "balance_loss_mlp": 1.022771, "epoch": 0.7052245535982685, "flos": 19319043640320.0, "grad_norm": 2.3898207487597953, "language_loss": 0.89776123, "learning_rate": 8.44178089306778e-07, "loss": 0.9194963, "num_input_tokens_seen": 126132780, "step": 5865, "time_per_iteration": 2.689450979232788 }, { "auxiliary_loss_clip": 0.01174961, "auxiliary_loss_mlp": 0.01025045, "balance_loss_clip": 1.05374742, "balance_loss_mlp": 1.01753449, "epoch": 0.7053447964889076, "flos": 19062138591360.0, "grad_norm": 1.965605348709592, "language_loss": 0.77183032, "learning_rate": 8.4354245836485e-07, "loss": 0.79383034, "num_input_tokens_seen": 126151225, "step": 5866, "time_per_iteration": 2.5634124279022217 }, { "auxiliary_loss_clip": 0.01126912, "auxiliary_loss_mlp": 0.01030074, "balance_loss_clip": 1.04730058, "balance_loss_mlp": 1.02245092, "epoch": 0.7054650393795466, "flos": 27379228953600.0, "grad_norm": 1.6966014336366184, "language_loss": 0.72946489, "learning_rate": 8.429070028566108e-07, "loss": 0.75103474, "num_input_tokens_seen": 126172535, "step": 5867, "time_per_iteration": 2.785684108734131 }, { "auxiliary_loss_clip": 0.01155051, "auxiliary_loss_mlp": 0.01027252, "balance_loss_clip": 1.05016112, "balance_loss_mlp": 1.01998031, "epoch": 0.7055852822701858, "flos": 16102201322880.0, "grad_norm": 2.223799668388269, "language_loss": 0.74944955, "learning_rate": 8.422717228784586e-07, "loss": 0.77127254, "num_input_tokens_seen": 126189410, "step": 5868, "time_per_iteration": 2.55788516998291 }, { "auxiliary_loss_clip": 0.01110146, "auxiliary_loss_mlp": 0.01029648, "balance_loss_clip": 1.04897678, "balance_loss_mlp": 1.02191758, "epoch": 0.7057055251608249, "flos": 11692299744000.0, "grad_norm": 1.899352861228698, "language_loss": 0.69393528, "learning_rate": 8.416366185267663e-07, "loss": 0.71533322, "num_input_tokens_seen": 126206910, "step": 5869, "time_per_iteration": 2.670790910720825 }, { "auxiliary_loss_clip": 0.01155918, "auxiliary_loss_mlp": 0.01024801, "balance_loss_clip": 1.04791045, "balance_loss_mlp": 1.01785111, "epoch": 0.7058257680514639, "flos": 22711560399360.0, "grad_norm": 1.7961993894211707, "language_loss": 0.77820218, "learning_rate": 8.410016898978778e-07, "loss": 0.80000937, "num_input_tokens_seen": 126224385, "step": 5870, "time_per_iteration": 2.5937066078186035 }, { "auxiliary_loss_clip": 0.01108651, "auxiliary_loss_mlp": 0.01027067, "balance_loss_clip": 1.04781294, "balance_loss_mlp": 1.01983976, "epoch": 0.7059460109421031, "flos": 17529543043200.0, "grad_norm": 1.8439466711571055, "language_loss": 0.79118747, "learning_rate": 8.403669370881115e-07, "loss": 0.81254464, "num_input_tokens_seen": 126243120, "step": 5871, "time_per_iteration": 2.703435182571411 }, { "auxiliary_loss_clip": 0.01173535, "auxiliary_loss_mlp": 0.01022756, "balance_loss_clip": 1.05257165, "balance_loss_mlp": 1.01576996, "epoch": 0.7060662538327421, "flos": 23544687427200.0, "grad_norm": 1.928888421449217, "language_loss": 0.78577495, "learning_rate": 8.397323601937587e-07, "loss": 0.80773783, "num_input_tokens_seen": 126263020, "step": 5872, "time_per_iteration": 2.6031429767608643 }, { "auxiliary_loss_clip": 0.01122866, "auxiliary_loss_mlp": 0.01024904, "balance_loss_clip": 1.04778826, "balance_loss_mlp": 1.017838, "epoch": 0.7061864967233812, "flos": 30260736875520.0, "grad_norm": 2.8023188669191224, "language_loss": 0.77589893, "learning_rate": 8.390979593110838e-07, "loss": 0.79737663, "num_input_tokens_seen": 126285150, "step": 5873, "time_per_iteration": 2.7786483764648438 }, { "auxiliary_loss_clip": 0.01148288, "auxiliary_loss_mlp": 0.01025347, "balance_loss_clip": 1.0511198, "balance_loss_mlp": 1.01755023, "epoch": 0.7063067396140204, "flos": 20701460424960.0, "grad_norm": 1.907992284606095, "language_loss": 0.81566024, "learning_rate": 8.384637345363262e-07, "loss": 0.83739656, "num_input_tokens_seen": 126304340, "step": 5874, "time_per_iteration": 2.6742148399353027 }, { "auxiliary_loss_clip": 0.01134618, "auxiliary_loss_mlp": 0.01026505, "balance_loss_clip": 1.04603553, "balance_loss_mlp": 1.01896477, "epoch": 0.7064269825046594, "flos": 32266168081920.0, "grad_norm": 4.849776920997062, "language_loss": 0.76722538, "learning_rate": 8.378296859656964e-07, "loss": 0.7888366, "num_input_tokens_seen": 126325495, "step": 5875, "time_per_iteration": 2.7776808738708496 }, { "auxiliary_loss_clip": 0.01142678, "auxiliary_loss_mlp": 0.01020332, "balance_loss_clip": 1.05027008, "balance_loss_mlp": 1.013376, "epoch": 0.7065472253952985, "flos": 30227124723840.0, "grad_norm": 2.1667293733897472, "language_loss": 0.6788559, "learning_rate": 8.371958136953792e-07, "loss": 0.700486, "num_input_tokens_seen": 126345525, "step": 5876, "time_per_iteration": 2.769843339920044 }, { "auxiliary_loss_clip": 0.01130869, "auxiliary_loss_mlp": 0.01031713, "balance_loss_clip": 1.04748511, "balance_loss_mlp": 1.02409518, "epoch": 0.7066674682859376, "flos": 16216720859520.0, "grad_norm": 8.15955908798817, "language_loss": 0.66237724, "learning_rate": 8.365621178215326e-07, "loss": 0.684003, "num_input_tokens_seen": 126361995, "step": 5877, "time_per_iteration": 2.688819646835327 }, { "auxiliary_loss_clip": 0.01153099, "auxiliary_loss_mlp": 0.01034931, "balance_loss_clip": 1.05012584, "balance_loss_mlp": 1.02709329, "epoch": 0.7067877111765767, "flos": 14830461319680.0, "grad_norm": 2.8731460253107213, "language_loss": 0.75037527, "learning_rate": 8.359285984402871e-07, "loss": 0.77225554, "num_input_tokens_seen": 126379260, "step": 5878, "time_per_iteration": 2.6717681884765625 }, { "auxiliary_loss_clip": 0.01138441, "auxiliary_loss_mlp": 0.01023774, "balance_loss_clip": 1.04994488, "balance_loss_mlp": 1.01708651, "epoch": 0.7069079540672157, "flos": 25440196037760.0, "grad_norm": 2.367585291980994, "language_loss": 0.74330068, "learning_rate": 8.352952556477489e-07, "loss": 0.76492286, "num_input_tokens_seen": 126397170, "step": 5879, "time_per_iteration": 2.6530699729919434 }, { "auxiliary_loss_clip": 0.01157527, "auxiliary_loss_mlp": 0.0102988, "balance_loss_clip": 1.05146408, "balance_loss_mlp": 1.02235508, "epoch": 0.7070281969578549, "flos": 24607751368320.0, "grad_norm": 1.8923129203706073, "language_loss": 0.76639569, "learning_rate": 8.34662089539993e-07, "loss": 0.78826976, "num_input_tokens_seen": 126416680, "step": 5880, "time_per_iteration": 2.6762940883636475 }, { "auxiliary_loss_clip": 0.01169886, "auxiliary_loss_mlp": 0.01025601, "balance_loss_clip": 1.0516665, "balance_loss_mlp": 1.01873481, "epoch": 0.707148439848494, "flos": 26724469887360.0, "grad_norm": 5.576540982517425, "language_loss": 0.79647154, "learning_rate": 8.340291002130722e-07, "loss": 0.81842637, "num_input_tokens_seen": 126435870, "step": 5881, "time_per_iteration": 3.5889742374420166 }, { "auxiliary_loss_clip": 0.01175869, "auxiliary_loss_mlp": 0.01031181, "balance_loss_clip": 1.05319929, "balance_loss_mlp": 1.02349806, "epoch": 0.707268682739133, "flos": 15085750256640.0, "grad_norm": 3.5590684682910965, "language_loss": 0.79677188, "learning_rate": 8.3339628776301e-07, "loss": 0.81884235, "num_input_tokens_seen": 126454010, "step": 5882, "time_per_iteration": 3.4951956272125244 }, { "auxiliary_loss_clip": 0.01169445, "auxiliary_loss_mlp": 0.01026326, "balance_loss_clip": 1.04938889, "balance_loss_mlp": 1.01929832, "epoch": 0.7073889256297722, "flos": 34313148345600.0, "grad_norm": 2.5217689736872284, "language_loss": 0.57321858, "learning_rate": 8.327636522858033e-07, "loss": 0.59517634, "num_input_tokens_seen": 126473615, "step": 5883, "time_per_iteration": 2.697981119155884 }, { "auxiliary_loss_clip": 0.01113877, "auxiliary_loss_mlp": 0.01032232, "balance_loss_clip": 1.04952621, "balance_loss_mlp": 1.02454913, "epoch": 0.7075091685204112, "flos": 20083940784000.0, "grad_norm": 6.795877072959768, "language_loss": 0.77464259, "learning_rate": 8.321311938774225e-07, "loss": 0.7961036, "num_input_tokens_seen": 126492705, "step": 5884, "time_per_iteration": 2.6865315437316895 }, { "auxiliary_loss_clip": 0.01175411, "auxiliary_loss_mlp": 0.01025559, "balance_loss_clip": 1.05163872, "balance_loss_mlp": 1.01803696, "epoch": 0.7076294114110503, "flos": 20777124424320.0, "grad_norm": 2.6875229880385114, "language_loss": 0.79569221, "learning_rate": 8.314989126338104e-07, "loss": 0.81770188, "num_input_tokens_seen": 126512715, "step": 5885, "time_per_iteration": 2.581852674484253 }, { "auxiliary_loss_clip": 0.01157615, "auxiliary_loss_mlp": 0.01028682, "balance_loss_clip": 1.04892457, "balance_loss_mlp": 1.0212965, "epoch": 0.7077496543016895, "flos": 17967689141760.0, "grad_norm": 2.221101129630549, "language_loss": 0.84505582, "learning_rate": 8.308668086508847e-07, "loss": 0.8669188, "num_input_tokens_seen": 126530795, "step": 5886, "time_per_iteration": 3.553972005844116 }, { "auxiliary_loss_clip": 0.01125139, "auxiliary_loss_mlp": 0.01023837, "balance_loss_clip": 1.04319096, "balance_loss_mlp": 1.01610041, "epoch": 0.7078698971923285, "flos": 45478098564480.0, "grad_norm": 1.8958103337951437, "language_loss": 0.73709899, "learning_rate": 8.302348820245342e-07, "loss": 0.75858873, "num_input_tokens_seen": 126553360, "step": 5887, "time_per_iteration": 2.8989462852478027 }, { "auxiliary_loss_clip": 0.01124398, "auxiliary_loss_mlp": 0.01028676, "balance_loss_clip": 1.04707456, "balance_loss_mlp": 1.02061129, "epoch": 0.7079901400829676, "flos": 26943704547840.0, "grad_norm": 3.46646967532687, "language_loss": 0.70368433, "learning_rate": 8.296031328506232e-07, "loss": 0.72521508, "num_input_tokens_seen": 126573110, "step": 5888, "time_per_iteration": 2.8008346557617188 }, { "auxiliary_loss_clip": 0.01143606, "auxiliary_loss_mlp": 0.01022847, "balance_loss_clip": 1.05082321, "balance_loss_mlp": 1.01483059, "epoch": 0.7081103829736067, "flos": 24423206267520.0, "grad_norm": 7.098298016942434, "language_loss": 0.75564945, "learning_rate": 8.289715612249857e-07, "loss": 0.77731395, "num_input_tokens_seen": 126593725, "step": 5889, "time_per_iteration": 2.6404683589935303 }, { "auxiliary_loss_clip": 0.01142078, "auxiliary_loss_mlp": 0.01028942, "balance_loss_clip": 1.05114317, "balance_loss_mlp": 1.02112746, "epoch": 0.7082306258642458, "flos": 18543300589440.0, "grad_norm": 2.4139609894989036, "language_loss": 0.77540296, "learning_rate": 8.283401672434305e-07, "loss": 0.79711312, "num_input_tokens_seen": 126608950, "step": 5890, "time_per_iteration": 2.639285087585449 }, { "auxiliary_loss_clip": 0.01140709, "auxiliary_loss_mlp": 0.01020804, "balance_loss_clip": 1.05149758, "balance_loss_mlp": 1.01387763, "epoch": 0.7083508687548848, "flos": 23477534951040.0, "grad_norm": 2.4811860886344324, "language_loss": 0.7022956, "learning_rate": 8.277089510017412e-07, "loss": 0.72391075, "num_input_tokens_seen": 126629755, "step": 5891, "time_per_iteration": 2.6621108055114746 }, { "auxiliary_loss_clip": 0.01139681, "auxiliary_loss_mlp": 0.01025513, "balance_loss_clip": 1.05067623, "balance_loss_mlp": 1.01812196, "epoch": 0.708471111645524, "flos": 22419463000320.0, "grad_norm": 1.6899243074162023, "language_loss": 0.82345533, "learning_rate": 8.270779125956719e-07, "loss": 0.84510732, "num_input_tokens_seen": 126650135, "step": 5892, "time_per_iteration": 2.6787612438201904 }, { "auxiliary_loss_clip": 0.01106876, "auxiliary_loss_mlp": 0.01023799, "balance_loss_clip": 1.04744363, "balance_loss_mlp": 1.01644421, "epoch": 0.7085913545361631, "flos": 20922885815040.0, "grad_norm": 2.1494021535599703, "language_loss": 0.79982346, "learning_rate": 8.264470521209505e-07, "loss": 0.82113022, "num_input_tokens_seen": 126668500, "step": 5893, "time_per_iteration": 2.680107355117798 }, { "auxiliary_loss_clip": 0.01148256, "auxiliary_loss_mlp": 0.01025122, "balance_loss_clip": 1.04679012, "balance_loss_mlp": 1.0176239, "epoch": 0.7087115974268021, "flos": 15012384727680.0, "grad_norm": 3.669162532107461, "language_loss": 0.77130318, "learning_rate": 8.258163696732785e-07, "loss": 0.793037, "num_input_tokens_seen": 126686090, "step": 5894, "time_per_iteration": 2.585881233215332 }, { "auxiliary_loss_clip": 0.01152174, "auxiliary_loss_mlp": 0.01030391, "balance_loss_clip": 1.04920375, "balance_loss_mlp": 1.02283943, "epoch": 0.7088318403174413, "flos": 21539040739200.0, "grad_norm": 1.7306727169161074, "language_loss": 0.77194059, "learning_rate": 8.251858653483288e-07, "loss": 0.79376626, "num_input_tokens_seen": 126704255, "step": 5895, "time_per_iteration": 2.5974557399749756 }, { "auxiliary_loss_clip": 0.01158849, "auxiliary_loss_mlp": 0.01022713, "balance_loss_clip": 1.0528146, "balance_loss_mlp": 1.01506257, "epoch": 0.7089520832080803, "flos": 15516785462400.0, "grad_norm": 4.6988802052327, "language_loss": 0.85884273, "learning_rate": 8.245555392417501e-07, "loss": 0.88065827, "num_input_tokens_seen": 126718910, "step": 5896, "time_per_iteration": 2.574554681777954 }, { "auxiliary_loss_clip": 0.01095899, "auxiliary_loss_mlp": 0.01023884, "balance_loss_clip": 1.043311, "balance_loss_mlp": 1.01605773, "epoch": 0.7090723260987194, "flos": 20412667077120.0, "grad_norm": 1.9959390402930732, "language_loss": 0.79042667, "learning_rate": 8.239253914491613e-07, "loss": 0.81162447, "num_input_tokens_seen": 126737235, "step": 5897, "time_per_iteration": 2.670384407043457 }, { "auxiliary_loss_clip": 0.01120407, "auxiliary_loss_mlp": 0.01028054, "balance_loss_clip": 1.04752922, "balance_loss_mlp": 1.0208447, "epoch": 0.7091925689893585, "flos": 25668337271040.0, "grad_norm": 1.7886238721456413, "language_loss": 0.75247145, "learning_rate": 8.232954220661556e-07, "loss": 0.77395606, "num_input_tokens_seen": 126759970, "step": 5898, "time_per_iteration": 2.7835910320281982 }, { "auxiliary_loss_clip": 0.01174782, "auxiliary_loss_mlp": 0.01028561, "balance_loss_clip": 1.05568242, "balance_loss_mlp": 1.02099729, "epoch": 0.7093128118799976, "flos": 24206629213440.0, "grad_norm": 3.5393719802931867, "language_loss": 0.70829618, "learning_rate": 8.226656311882989e-07, "loss": 0.73032957, "num_input_tokens_seen": 126779280, "step": 5899, "time_per_iteration": 2.571772336959839 }, { "auxiliary_loss_clip": 0.0115489, "auxiliary_loss_mlp": 0.01026271, "balance_loss_clip": 1.05207467, "balance_loss_mlp": 1.01904953, "epoch": 0.7094330547706367, "flos": 16646786398080.0, "grad_norm": 2.644119451345632, "language_loss": 0.76973522, "learning_rate": 8.22036018911129e-07, "loss": 0.79154682, "num_input_tokens_seen": 126797310, "step": 5900, "time_per_iteration": 2.690915822982788 }, { "auxiliary_loss_clip": 0.01175739, "auxiliary_loss_mlp": 0.01024884, "balance_loss_clip": 1.05149055, "balance_loss_mlp": 1.01705801, "epoch": 0.7095532976612757, "flos": 16283370545280.0, "grad_norm": 2.7774048799464905, "language_loss": 0.80361748, "learning_rate": 8.214065853301599e-07, "loss": 0.82562369, "num_input_tokens_seen": 126812840, "step": 5901, "time_per_iteration": 2.549111843109131 }, { "auxiliary_loss_clip": 0.01079201, "auxiliary_loss_mlp": 0.0100141, "balance_loss_clip": 1.03567886, "balance_loss_mlp": 1.00045609, "epoch": 0.7096735405519149, "flos": 70722080559360.0, "grad_norm": 0.8221408249851706, "language_loss": 0.58195484, "learning_rate": 8.207773305408734e-07, "loss": 0.60276091, "num_input_tokens_seen": 126880060, "step": 5902, "time_per_iteration": 3.3285484313964844 }, { "auxiliary_loss_clip": 0.01121057, "auxiliary_loss_mlp": 0.01025026, "balance_loss_clip": 1.04746056, "balance_loss_mlp": 1.01745582, "epoch": 0.709793783442554, "flos": 23621500661760.0, "grad_norm": 2.8211213976735605, "language_loss": 0.80273944, "learning_rate": 8.201482546387288e-07, "loss": 0.82420027, "num_input_tokens_seen": 126899535, "step": 5903, "time_per_iteration": 2.7891790866851807 }, { "auxiliary_loss_clip": 0.01159565, "auxiliary_loss_mlp": 0.01035227, "balance_loss_clip": 1.05292773, "balance_loss_mlp": 1.02745497, "epoch": 0.709914026333193, "flos": 25993472204160.0, "grad_norm": 1.8921235162243466, "language_loss": 0.91865534, "learning_rate": 8.195193577191553e-07, "loss": 0.94060326, "num_input_tokens_seen": 126921365, "step": 5904, "time_per_iteration": 2.6893460750579834 }, { "auxiliary_loss_clip": 0.01148624, "auxiliary_loss_mlp": 0.00711359, "balance_loss_clip": 1.04958606, "balance_loss_mlp": 1.00058603, "epoch": 0.7100342692238322, "flos": 24861531934080.0, "grad_norm": 2.2418969923643104, "language_loss": 0.84572065, "learning_rate": 8.188906398775579e-07, "loss": 0.86432046, "num_input_tokens_seen": 126941910, "step": 5905, "time_per_iteration": 2.70078706741333 }, { "auxiliary_loss_clip": 0.01173987, "auxiliary_loss_mlp": 0.00712009, "balance_loss_clip": 1.05100155, "balance_loss_mlp": 1.00068283, "epoch": 0.7101545121144712, "flos": 24932203943040.0, "grad_norm": 1.8246258360841237, "language_loss": 0.68670064, "learning_rate": 8.18262101209311e-07, "loss": 0.70556062, "num_input_tokens_seen": 126961120, "step": 5906, "time_per_iteration": 2.594207763671875 }, { "auxiliary_loss_clip": 0.0116091, "auxiliary_loss_mlp": 0.01021892, "balance_loss_clip": 1.05134714, "balance_loss_mlp": 1.01446223, "epoch": 0.7102747550051103, "flos": 23768842250880.0, "grad_norm": 1.8311972742870901, "language_loss": 0.70160639, "learning_rate": 8.176337418097626e-07, "loss": 0.72343445, "num_input_tokens_seen": 126981590, "step": 5907, "time_per_iteration": 3.551187753677368 }, { "auxiliary_loss_clip": 0.011585, "auxiliary_loss_mlp": 0.00711606, "balance_loss_clip": 1.05316734, "balance_loss_mlp": 1.00073004, "epoch": 0.7103949978957494, "flos": 15303907509120.0, "grad_norm": 2.1363430120136977, "language_loss": 0.79484713, "learning_rate": 8.170055617742364e-07, "loss": 0.81354815, "num_input_tokens_seen": 126998870, "step": 5908, "time_per_iteration": 3.537527084350586 }, { "auxiliary_loss_clip": 0.01136127, "auxiliary_loss_mlp": 0.01023585, "balance_loss_clip": 1.04806781, "balance_loss_mlp": 1.01549029, "epoch": 0.7105152407863885, "flos": 22638805401600.0, "grad_norm": 2.0126853475478685, "language_loss": 0.70988399, "learning_rate": 8.163775611980252e-07, "loss": 0.73148108, "num_input_tokens_seen": 127017980, "step": 5909, "time_per_iteration": 3.374498128890991 }, { "auxiliary_loss_clip": 0.0114312, "auxiliary_loss_mlp": 0.01025653, "balance_loss_clip": 1.05083275, "balance_loss_mlp": 1.01855671, "epoch": 0.7106354836770276, "flos": 17238594879360.0, "grad_norm": 1.8648551781755902, "language_loss": 0.79125798, "learning_rate": 8.157497401763982e-07, "loss": 0.81294572, "num_input_tokens_seen": 127035645, "step": 5910, "time_per_iteration": 2.688891649246216 }, { "auxiliary_loss_clip": 0.01155581, "auxiliary_loss_mlp": 0.01025554, "balance_loss_clip": 1.05027151, "balance_loss_mlp": 1.01855397, "epoch": 0.7107557265676667, "flos": 20193647898240.0, "grad_norm": 1.838045715148228, "language_loss": 0.77602613, "learning_rate": 8.151220988045935e-07, "loss": 0.7978375, "num_input_tokens_seen": 127054900, "step": 5911, "time_per_iteration": 3.520496129989624 }, { "auxiliary_loss_clip": 0.01156189, "auxiliary_loss_mlp": 0.01022623, "balance_loss_clip": 1.05005085, "balance_loss_mlp": 1.01516032, "epoch": 0.7108759694583058, "flos": 21507080613120.0, "grad_norm": 1.821724840222053, "language_loss": 0.82468051, "learning_rate": 8.144946371778234e-07, "loss": 0.84646863, "num_input_tokens_seen": 127075010, "step": 5912, "time_per_iteration": 2.5949206352233887 }, { "auxiliary_loss_clip": 0.01144917, "auxiliary_loss_mlp": 0.00712555, "balance_loss_clip": 1.05258155, "balance_loss_mlp": 1.00072086, "epoch": 0.7109962123489448, "flos": 24061909317120.0, "grad_norm": 1.7733670436584437, "language_loss": 0.78068864, "learning_rate": 8.138673553912751e-07, "loss": 0.7992633, "num_input_tokens_seen": 127095570, "step": 5913, "time_per_iteration": 2.701425790786743 }, { "auxiliary_loss_clip": 0.01108211, "auxiliary_loss_mlp": 0.0102912, "balance_loss_clip": 1.04487181, "balance_loss_mlp": 1.02154398, "epoch": 0.711116455239584, "flos": 30480474326400.0, "grad_norm": 2.1405725177845136, "language_loss": 0.56932575, "learning_rate": 8.132402535401059e-07, "loss": 0.59069908, "num_input_tokens_seen": 127116825, "step": 5914, "time_per_iteration": 2.7462780475616455 }, { "auxiliary_loss_clip": 0.01154943, "auxiliary_loss_mlp": 0.01032818, "balance_loss_clip": 1.05012465, "balance_loss_mlp": 1.02574849, "epoch": 0.711236698130223, "flos": 25045610158080.0, "grad_norm": 2.001858854237275, "language_loss": 0.74277818, "learning_rate": 8.126133317194465e-07, "loss": 0.76465583, "num_input_tokens_seen": 127137015, "step": 5915, "time_per_iteration": 2.6737513542175293 }, { "auxiliary_loss_clip": 0.01102399, "auxiliary_loss_mlp": 0.01028498, "balance_loss_clip": 1.04405451, "balance_loss_mlp": 1.02056479, "epoch": 0.7113569410208621, "flos": 24206701040640.0, "grad_norm": 1.832553480566212, "language_loss": 0.74415535, "learning_rate": 8.11986590024401e-07, "loss": 0.76546431, "num_input_tokens_seen": 127156755, "step": 5916, "time_per_iteration": 2.7364065647125244 }, { "auxiliary_loss_clip": 0.01147942, "auxiliary_loss_mlp": 0.01035136, "balance_loss_clip": 1.05072606, "balance_loss_mlp": 1.0266304, "epoch": 0.7114771839115013, "flos": 35439306526080.0, "grad_norm": 1.6581238794591395, "language_loss": 0.69140458, "learning_rate": 8.113600285500442e-07, "loss": 0.71323538, "num_input_tokens_seen": 127176965, "step": 5917, "time_per_iteration": 2.7602014541625977 }, { "auxiliary_loss_clip": 0.0117483, "auxiliary_loss_mlp": 0.01029097, "balance_loss_clip": 1.05209279, "balance_loss_mlp": 1.02219737, "epoch": 0.7115974268021403, "flos": 21099458096640.0, "grad_norm": 1.7992143803374943, "language_loss": 0.74349791, "learning_rate": 8.107336473914268e-07, "loss": 0.76553714, "num_input_tokens_seen": 127195595, "step": 5918, "time_per_iteration": 2.627145528793335 }, { "auxiliary_loss_clip": 0.01062019, "auxiliary_loss_mlp": 0.01001436, "balance_loss_clip": 1.03225517, "balance_loss_mlp": 1.0004344, "epoch": 0.7117176696927794, "flos": 56752866616320.0, "grad_norm": 0.7710024300884001, "language_loss": 0.55731994, "learning_rate": 8.101074466435694e-07, "loss": 0.57795447, "num_input_tokens_seen": 127255070, "step": 5919, "time_per_iteration": 3.1885342597961426 }, { "auxiliary_loss_clip": 0.01152465, "auxiliary_loss_mlp": 0.0102232, "balance_loss_clip": 1.05020893, "balance_loss_mlp": 1.01486349, "epoch": 0.7118379125834186, "flos": 15925269905280.0, "grad_norm": 1.7657924199338435, "language_loss": 0.67972457, "learning_rate": 8.094814264014662e-07, "loss": 0.70147246, "num_input_tokens_seen": 127273825, "step": 5920, "time_per_iteration": 2.662550210952759 }, { "auxiliary_loss_clip": 0.01176454, "auxiliary_loss_mlp": 0.01029836, "balance_loss_clip": 1.05246258, "balance_loss_mlp": 1.0221889, "epoch": 0.7119581554740576, "flos": 20193360589440.0, "grad_norm": 2.1396612640958614, "language_loss": 0.816154, "learning_rate": 8.088555867600844e-07, "loss": 0.8382169, "num_input_tokens_seen": 127289990, "step": 5921, "time_per_iteration": 2.5390360355377197 }, { "auxiliary_loss_clip": 0.01123867, "auxiliary_loss_mlp": 0.01023597, "balance_loss_clip": 1.04911518, "balance_loss_mlp": 1.01623559, "epoch": 0.7120783983646967, "flos": 34715383822080.0, "grad_norm": 5.4412772090853405, "language_loss": 0.608132, "learning_rate": 8.08229927814362e-07, "loss": 0.62960672, "num_input_tokens_seen": 127312880, "step": 5922, "time_per_iteration": 2.8156800270080566 }, { "auxiliary_loss_clip": 0.0112588, "auxiliary_loss_mlp": 0.01024997, "balance_loss_clip": 1.04664302, "balance_loss_mlp": 1.01788282, "epoch": 0.7121986412553358, "flos": 26359114700160.0, "grad_norm": 1.834387981670228, "language_loss": 0.65046382, "learning_rate": 8.076044496592134e-07, "loss": 0.67197257, "num_input_tokens_seen": 127334730, "step": 5923, "time_per_iteration": 2.7328603267669678 }, { "auxiliary_loss_clip": 0.01139512, "auxiliary_loss_mlp": 0.01030491, "balance_loss_clip": 1.04913568, "balance_loss_mlp": 1.02311516, "epoch": 0.7123188841459749, "flos": 11145344371200.0, "grad_norm": 3.2966111933409565, "language_loss": 0.78401089, "learning_rate": 8.069791523895204e-07, "loss": 0.80571091, "num_input_tokens_seen": 127351180, "step": 5924, "time_per_iteration": 2.648378610610962 }, { "auxiliary_loss_clip": 0.01115917, "auxiliary_loss_mlp": 0.01028361, "balance_loss_clip": 1.04417813, "balance_loss_mlp": 1.02101779, "epoch": 0.7124391270366139, "flos": 20811670329600.0, "grad_norm": 1.884667012826093, "language_loss": 0.77696455, "learning_rate": 8.063540361001422e-07, "loss": 0.79840732, "num_input_tokens_seen": 127369750, "step": 5925, "time_per_iteration": 2.6739485263824463 }, { "auxiliary_loss_clip": 0.01122074, "auxiliary_loss_mlp": 0.01025957, "balance_loss_clip": 1.04857063, "balance_loss_mlp": 1.01852429, "epoch": 0.7125593699272531, "flos": 17603734584960.0, "grad_norm": 2.122217820769275, "language_loss": 0.79258323, "learning_rate": 8.057291008859069e-07, "loss": 0.81406343, "num_input_tokens_seen": 127387910, "step": 5926, "time_per_iteration": 2.698838233947754 }, { "auxiliary_loss_clip": 0.01154335, "auxiliary_loss_mlp": 0.01026758, "balance_loss_clip": 1.05072188, "balance_loss_mlp": 1.01983166, "epoch": 0.7126796128178922, "flos": 28654057526400.0, "grad_norm": 2.049367633966112, "language_loss": 0.68039244, "learning_rate": 8.051043468416187e-07, "loss": 0.70220333, "num_input_tokens_seen": 127409160, "step": 5927, "time_per_iteration": 2.6647377014160156 }, { "auxiliary_loss_clip": 0.01173575, "auxiliary_loss_mlp": 0.01022398, "balance_loss_clip": 1.05319881, "balance_loss_mlp": 1.01561522, "epoch": 0.7127998557085312, "flos": 16034438315520.0, "grad_norm": 2.4312312614432656, "language_loss": 0.81866997, "learning_rate": 8.044797740620506e-07, "loss": 0.8406297, "num_input_tokens_seen": 127427765, "step": 5928, "time_per_iteration": 2.612637519836426 }, { "auxiliary_loss_clip": 0.01105212, "auxiliary_loss_mlp": 0.01027744, "balance_loss_clip": 1.04783297, "balance_loss_mlp": 1.02007914, "epoch": 0.7129200985991703, "flos": 23403271582080.0, "grad_norm": 2.2189421596485968, "language_loss": 0.7861619, "learning_rate": 8.038553826419494e-07, "loss": 0.80749148, "num_input_tokens_seen": 127446475, "step": 5929, "time_per_iteration": 2.748328447341919 }, { "auxiliary_loss_clip": 0.01172535, "auxiliary_loss_mlp": 0.01027148, "balance_loss_clip": 1.05077076, "balance_loss_mlp": 1.02009702, "epoch": 0.7130403414898094, "flos": 21397445326080.0, "grad_norm": 1.7553637231015402, "language_loss": 0.81025553, "learning_rate": 8.032311726760364e-07, "loss": 0.83225232, "num_input_tokens_seen": 127467695, "step": 5930, "time_per_iteration": 2.6620936393737793 }, { "auxiliary_loss_clip": 0.01118731, "auxiliary_loss_mlp": 0.01023314, "balance_loss_clip": 1.04855514, "balance_loss_mlp": 1.01544607, "epoch": 0.7131605843804485, "flos": 74739045306240.0, "grad_norm": 1.837955229686902, "language_loss": 0.68784535, "learning_rate": 8.026071442590022e-07, "loss": 0.70926583, "num_input_tokens_seen": 127494590, "step": 5931, "time_per_iteration": 3.1343188285827637 }, { "auxiliary_loss_clip": 0.01161472, "auxiliary_loss_mlp": 0.01023071, "balance_loss_clip": 1.05484831, "balance_loss_mlp": 1.01573336, "epoch": 0.7132808272710875, "flos": 18368739469440.0, "grad_norm": 2.2016146247168304, "language_loss": 0.81066287, "learning_rate": 8.019832974855134e-07, "loss": 0.83250833, "num_input_tokens_seen": 127512550, "step": 5932, "time_per_iteration": 2.603508234024048 }, { "auxiliary_loss_clip": 0.01128137, "auxiliary_loss_mlp": 0.0102886, "balance_loss_clip": 1.05155516, "balance_loss_mlp": 1.02118587, "epoch": 0.7134010701617267, "flos": 23253380127360.0, "grad_norm": 5.641557959358076, "language_loss": 0.82471734, "learning_rate": 8.013596324502052e-07, "loss": 0.84628731, "num_input_tokens_seen": 127531015, "step": 5933, "time_per_iteration": 4.58283257484436 }, { "auxiliary_loss_clip": 0.01152744, "auxiliary_loss_mlp": 0.01021707, "balance_loss_clip": 1.05140007, "balance_loss_mlp": 1.01470637, "epoch": 0.7135213130523658, "flos": 23653137565440.0, "grad_norm": 1.9996743041185108, "language_loss": 0.78907126, "learning_rate": 8.007361492476872e-07, "loss": 0.81081581, "num_input_tokens_seen": 127550340, "step": 5934, "time_per_iteration": 2.6304688453674316 }, { "auxiliary_loss_clip": 0.01132667, "auxiliary_loss_mlp": 0.0102241, "balance_loss_clip": 1.0470624, "balance_loss_mlp": 1.01589239, "epoch": 0.7136415559430048, "flos": 24790644443520.0, "grad_norm": 1.6729139791115726, "language_loss": 0.78974843, "learning_rate": 8.001128479725426e-07, "loss": 0.8112992, "num_input_tokens_seen": 127572245, "step": 5935, "time_per_iteration": 3.6674933433532715 }, { "auxiliary_loss_clip": 0.01102491, "auxiliary_loss_mlp": 0.01033101, "balance_loss_clip": 1.04378486, "balance_loss_mlp": 1.02574253, "epoch": 0.713761798833644, "flos": 18296954138880.0, "grad_norm": 1.7680768633622492, "language_loss": 0.81167293, "learning_rate": 7.994897287193248e-07, "loss": 0.83302879, "num_input_tokens_seen": 127591625, "step": 5936, "time_per_iteration": 2.7699766159057617 }, { "auxiliary_loss_clip": 0.01160436, "auxiliary_loss_mlp": 0.01022596, "balance_loss_clip": 1.05041623, "balance_loss_mlp": 1.01540208, "epoch": 0.713882041724283, "flos": 15558262692480.0, "grad_norm": 3.6510397548268423, "language_loss": 0.83880317, "learning_rate": 7.988667915825605e-07, "loss": 0.86063349, "num_input_tokens_seen": 127608690, "step": 5937, "time_per_iteration": 3.5637805461883545 }, { "auxiliary_loss_clip": 0.01142939, "auxiliary_loss_mlp": 0.01026346, "balance_loss_clip": 1.04959607, "balance_loss_mlp": 1.01884151, "epoch": 0.7140022846149221, "flos": 24061011477120.0, "grad_norm": 3.2329678695379833, "language_loss": 0.76110578, "learning_rate": 7.982440366567491e-07, "loss": 0.78279859, "num_input_tokens_seen": 127627180, "step": 5938, "time_per_iteration": 2.6421797275543213 }, { "auxiliary_loss_clip": 0.01149694, "auxiliary_loss_mlp": 0.01022621, "balance_loss_clip": 1.04827857, "balance_loss_mlp": 1.01542997, "epoch": 0.7141225275055613, "flos": 27891710248320.0, "grad_norm": 1.7631111306962912, "language_loss": 0.75091648, "learning_rate": 7.97621464036361e-07, "loss": 0.77263957, "num_input_tokens_seen": 127648940, "step": 5939, "time_per_iteration": 2.67031192779541 }, { "auxiliary_loss_clip": 0.01160398, "auxiliary_loss_mlp": 0.01025053, "balance_loss_clip": 1.04969645, "balance_loss_mlp": 1.01760888, "epoch": 0.7142427703962003, "flos": 19682603147520.0, "grad_norm": 1.8966658582082656, "language_loss": 0.67871821, "learning_rate": 7.969990738158417e-07, "loss": 0.70057273, "num_input_tokens_seen": 127667350, "step": 5940, "time_per_iteration": 2.5885000228881836 }, { "auxiliary_loss_clip": 0.01161212, "auxiliary_loss_mlp": 0.01027745, "balance_loss_clip": 1.05468559, "balance_loss_mlp": 1.02007413, "epoch": 0.7143630132868394, "flos": 21032377447680.0, "grad_norm": 2.1187947579439936, "language_loss": 0.85182214, "learning_rate": 7.963768660896062e-07, "loss": 0.87371171, "num_input_tokens_seen": 127685760, "step": 5941, "time_per_iteration": 2.6616508960723877 }, { "auxiliary_loss_clip": 0.01160301, "auxiliary_loss_mlp": 0.01028891, "balance_loss_clip": 1.05170465, "balance_loss_mlp": 1.02143407, "epoch": 0.7144832561774785, "flos": 24129923719680.0, "grad_norm": 3.0357925605234612, "language_loss": 0.82288724, "learning_rate": 7.957548409520432e-07, "loss": 0.84477919, "num_input_tokens_seen": 127704985, "step": 5942, "time_per_iteration": 2.6546967029571533 }, { "auxiliary_loss_clip": 0.01126729, "auxiliary_loss_mlp": 0.01031695, "balance_loss_clip": 1.04753792, "balance_loss_mlp": 1.02466822, "epoch": 0.7146034990681176, "flos": 16325817442560.0, "grad_norm": 11.863713460821282, "language_loss": 0.84238917, "learning_rate": 7.951329984975135e-07, "loss": 0.86397338, "num_input_tokens_seen": 127721925, "step": 5943, "time_per_iteration": 2.716168165206909 }, { "auxiliary_loss_clip": 0.01054903, "auxiliary_loss_mlp": 0.01001847, "balance_loss_clip": 1.03299046, "balance_loss_mlp": 1.00102425, "epoch": 0.7147237419587567, "flos": 69627164232960.0, "grad_norm": 0.734885841204828, "language_loss": 0.54225719, "learning_rate": 7.94511338820349e-07, "loss": 0.56282467, "num_input_tokens_seen": 127784230, "step": 5944, "time_per_iteration": 3.248507022857666 }, { "auxiliary_loss_clip": 0.01141571, "auxiliary_loss_mlp": 0.00712166, "balance_loss_clip": 1.04983914, "balance_loss_mlp": 1.00067329, "epoch": 0.7148439848493958, "flos": 22266806198400.0, "grad_norm": 2.190349233633268, "language_loss": 0.78519636, "learning_rate": 7.938898620148575e-07, "loss": 0.80373371, "num_input_tokens_seen": 127801990, "step": 5945, "time_per_iteration": 2.6697168350219727 }, { "auxiliary_loss_clip": 0.01144916, "auxiliary_loss_mlp": 0.01031396, "balance_loss_clip": 1.05187738, "balance_loss_mlp": 1.02349234, "epoch": 0.7149642277400349, "flos": 17931383470080.0, "grad_norm": 2.029337992445838, "language_loss": 0.7129311, "learning_rate": 7.932685681753135e-07, "loss": 0.73469424, "num_input_tokens_seen": 127819270, "step": 5946, "time_per_iteration": 2.5892982482910156 }, { "auxiliary_loss_clip": 0.01171425, "auxiliary_loss_mlp": 0.01022749, "balance_loss_clip": 1.0529685, "balance_loss_mlp": 1.01555431, "epoch": 0.7150844706306739, "flos": 31681937370240.0, "grad_norm": 2.7541646500632724, "language_loss": 0.62587881, "learning_rate": 7.92647457395969e-07, "loss": 0.64782059, "num_input_tokens_seen": 127841095, "step": 5947, "time_per_iteration": 2.660008430480957 }, { "auxiliary_loss_clip": 0.01100101, "auxiliary_loss_mlp": 0.01029942, "balance_loss_clip": 1.04226911, "balance_loss_mlp": 1.02269983, "epoch": 0.7152047135213131, "flos": 10926217451520.0, "grad_norm": 2.7219287220690376, "language_loss": 0.74394894, "learning_rate": 7.920265297710444e-07, "loss": 0.76524937, "num_input_tokens_seen": 127858485, "step": 5948, "time_per_iteration": 2.7547037601470947 }, { "auxiliary_loss_clip": 0.01157474, "auxiliary_loss_mlp": 0.01027785, "balance_loss_clip": 1.04977155, "balance_loss_mlp": 1.0208236, "epoch": 0.7153249564119522, "flos": 20995640812800.0, "grad_norm": 2.1298014181537077, "language_loss": 0.73672605, "learning_rate": 7.914057853947363e-07, "loss": 0.75857866, "num_input_tokens_seen": 127877665, "step": 5949, "time_per_iteration": 2.6381654739379883 }, { "auxiliary_loss_clip": 0.01122538, "auxiliary_loss_mlp": 0.01030139, "balance_loss_clip": 1.0460465, "balance_loss_mlp": 1.02239645, "epoch": 0.7154451993025912, "flos": 24243114453120.0, "grad_norm": 2.5946582169412866, "language_loss": 0.62750208, "learning_rate": 7.907852243612089e-07, "loss": 0.6490289, "num_input_tokens_seen": 127898070, "step": 5950, "time_per_iteration": 2.7145185470581055 }, { "auxiliary_loss_clip": 0.01137952, "auxiliary_loss_mlp": 0.01024219, "balance_loss_clip": 1.04808736, "balance_loss_mlp": 1.01722145, "epoch": 0.7155654421932304, "flos": 23330947547520.0, "grad_norm": 8.171059358458535, "language_loss": 0.72589028, "learning_rate": 7.901648467646009e-07, "loss": 0.74751204, "num_input_tokens_seen": 127917010, "step": 5951, "time_per_iteration": 2.633249044418335 }, { "auxiliary_loss_clip": 0.01176519, "auxiliary_loss_mlp": 0.01025213, "balance_loss_clip": 1.05428457, "balance_loss_mlp": 1.01770902, "epoch": 0.7156856850838694, "flos": 22711883621760.0, "grad_norm": 6.365583324761511, "language_loss": 0.72401315, "learning_rate": 7.895446526990244e-07, "loss": 0.74603045, "num_input_tokens_seen": 127937025, "step": 5952, "time_per_iteration": 2.5797669887542725 }, { "auxiliary_loss_clip": 0.01117224, "auxiliary_loss_mlp": 0.01022755, "balance_loss_clip": 1.04540813, "balance_loss_mlp": 1.01575112, "epoch": 0.7158059279745085, "flos": 19865424395520.0, "grad_norm": 1.7547643641373403, "language_loss": 0.75801432, "learning_rate": 7.889246422585609e-07, "loss": 0.77941406, "num_input_tokens_seen": 127956410, "step": 5953, "time_per_iteration": 2.6817445755004883 }, { "auxiliary_loss_clip": 0.0117394, "auxiliary_loss_mlp": 0.01030201, "balance_loss_clip": 1.05217934, "balance_loss_mlp": 1.02306604, "epoch": 0.7159261708651476, "flos": 24134772055680.0, "grad_norm": 3.4496199914868564, "language_loss": 0.73424858, "learning_rate": 7.883048155372675e-07, "loss": 0.75628996, "num_input_tokens_seen": 127974925, "step": 5954, "time_per_iteration": 2.6462960243225098 }, { "auxiliary_loss_clip": 0.01148849, "auxiliary_loss_mlp": 0.0102549, "balance_loss_clip": 1.05158162, "balance_loss_mlp": 1.01837265, "epoch": 0.7160464137557867, "flos": 16983198201600.0, "grad_norm": 8.315512799597212, "language_loss": 0.71479881, "learning_rate": 7.876851726291698e-07, "loss": 0.73654222, "num_input_tokens_seen": 127993225, "step": 5955, "time_per_iteration": 2.6643354892730713 }, { "auxiliary_loss_clip": 0.01127114, "auxiliary_loss_mlp": 0.01030226, "balance_loss_clip": 1.04585075, "balance_loss_mlp": 1.0234195, "epoch": 0.7161666566464258, "flos": 25228251838080.0, "grad_norm": 2.238828174418278, "language_loss": 0.78288937, "learning_rate": 7.870657136282666e-07, "loss": 0.80446279, "num_input_tokens_seen": 128012085, "step": 5956, "time_per_iteration": 2.7613565921783447 }, { "auxiliary_loss_clip": 0.01152111, "auxiliary_loss_mlp": 0.01028759, "balance_loss_clip": 1.05011547, "balance_loss_mlp": 1.02133799, "epoch": 0.7162868995370649, "flos": 26468390851200.0, "grad_norm": 3.7041060003129935, "language_loss": 0.82114971, "learning_rate": 7.86446438628531e-07, "loss": 0.84295845, "num_input_tokens_seen": 128033155, "step": 5957, "time_per_iteration": 2.5940403938293457 }, { "auxiliary_loss_clip": 0.01087731, "auxiliary_loss_mlp": 0.01002754, "balance_loss_clip": 1.0337106, "balance_loss_mlp": 1.00179434, "epoch": 0.716407142427704, "flos": 69998912040960.0, "grad_norm": 0.7698217550925023, "language_loss": 0.56842363, "learning_rate": 7.858273477239059e-07, "loss": 0.58932841, "num_input_tokens_seen": 128101575, "step": 5958, "time_per_iteration": 3.197101354598999 }, { "auxiliary_loss_clip": 0.01096604, "auxiliary_loss_mlp": 0.01026162, "balance_loss_clip": 1.04372215, "balance_loss_mlp": 1.01924205, "epoch": 0.716527385318343, "flos": 20740459616640.0, "grad_norm": 2.011892363671228, "language_loss": 0.71328539, "learning_rate": 7.852084410083067e-07, "loss": 0.73451304, "num_input_tokens_seen": 128120395, "step": 5959, "time_per_iteration": 3.688115119934082 }, { "auxiliary_loss_clip": 0.01137283, "auxiliary_loss_mlp": 0.0102498, "balance_loss_clip": 1.04976022, "balance_loss_mlp": 1.01839328, "epoch": 0.7166476282089821, "flos": 25371966153600.0, "grad_norm": 1.699775402221769, "language_loss": 0.63528836, "learning_rate": 7.84589718575621e-07, "loss": 0.6569109, "num_input_tokens_seen": 128140840, "step": 5960, "time_per_iteration": 2.691884994506836 }, { "auxiliary_loss_clip": 0.01141072, "auxiliary_loss_mlp": 0.010186, "balance_loss_clip": 1.044631, "balance_loss_mlp": 1.0112623, "epoch": 0.7167678710996213, "flos": 24133730561280.0, "grad_norm": 2.210368929445392, "language_loss": 0.68978095, "learning_rate": 7.83971180519708e-07, "loss": 0.71137762, "num_input_tokens_seen": 128159695, "step": 5961, "time_per_iteration": 3.5334813594818115 }, { "auxiliary_loss_clip": 0.01174026, "auxiliary_loss_mlp": 0.01022792, "balance_loss_clip": 1.05264008, "balance_loss_mlp": 1.01485217, "epoch": 0.7168881139902603, "flos": 30226586019840.0, "grad_norm": 2.733697291066397, "language_loss": 0.75685346, "learning_rate": 7.833528269344008e-07, "loss": 0.77882159, "num_input_tokens_seen": 128179600, "step": 5962, "time_per_iteration": 2.601412534713745 }, { "auxiliary_loss_clip": 0.01122693, "auxiliary_loss_mlp": 0.01031521, "balance_loss_clip": 1.04754603, "balance_loss_mlp": 1.02396333, "epoch": 0.7170083568808994, "flos": 14606414236800.0, "grad_norm": 2.3165604497818224, "language_loss": 0.77439225, "learning_rate": 7.827346579135023e-07, "loss": 0.79593444, "num_input_tokens_seen": 128196940, "step": 5963, "time_per_iteration": 3.583674430847168 }, { "auxiliary_loss_clip": 0.01138932, "auxiliary_loss_mlp": 0.01028119, "balance_loss_clip": 1.04759467, "balance_loss_mlp": 1.02059078, "epoch": 0.7171285997715385, "flos": 23331091201920.0, "grad_norm": 1.869128998418626, "language_loss": 0.82730341, "learning_rate": 7.821166735507885e-07, "loss": 0.84897399, "num_input_tokens_seen": 128215970, "step": 5964, "time_per_iteration": 2.6944315433502197 }, { "auxiliary_loss_clip": 0.01171238, "auxiliary_loss_mlp": 0.01029449, "balance_loss_clip": 1.05257106, "balance_loss_mlp": 1.02198684, "epoch": 0.7172488426621776, "flos": 16543543731840.0, "grad_norm": 4.482790625757333, "language_loss": 0.68916392, "learning_rate": 7.81498873940007e-07, "loss": 0.71117073, "num_input_tokens_seen": 128233185, "step": 5965, "time_per_iteration": 2.5427310466766357 }, { "auxiliary_loss_clip": 0.01160168, "auxiliary_loss_mlp": 0.0103036, "balance_loss_clip": 1.04770696, "balance_loss_mlp": 1.02261722, "epoch": 0.7173690855528166, "flos": 26541612725760.0, "grad_norm": 2.2802423207357165, "language_loss": 0.77423894, "learning_rate": 7.808812591748768e-07, "loss": 0.79614425, "num_input_tokens_seen": 128253565, "step": 5966, "time_per_iteration": 2.69705867767334 }, { "auxiliary_loss_clip": 0.01122118, "auxiliary_loss_mlp": 0.01028174, "balance_loss_clip": 1.0458771, "balance_loss_mlp": 1.02049053, "epoch": 0.7174893284434558, "flos": 22784099915520.0, "grad_norm": 2.560168373956918, "language_loss": 0.65029931, "learning_rate": 7.802638293490915e-07, "loss": 0.67180222, "num_input_tokens_seen": 128273210, "step": 5967, "time_per_iteration": 2.679635763168335 }, { "auxiliary_loss_clip": 0.01142137, "auxiliary_loss_mlp": 0.01024374, "balance_loss_clip": 1.04811823, "balance_loss_mlp": 1.01705432, "epoch": 0.7176095713340949, "flos": 23293564467840.0, "grad_norm": 1.7683351405105965, "language_loss": 0.77199423, "learning_rate": 7.796465845563123e-07, "loss": 0.79365933, "num_input_tokens_seen": 128292085, "step": 5968, "time_per_iteration": 2.684216022491455 }, { "auxiliary_loss_clip": 0.0113543, "auxiliary_loss_mlp": 0.00711086, "balance_loss_clip": 1.04764915, "balance_loss_mlp": 1.00073683, "epoch": 0.7177298142247339, "flos": 25591631777280.0, "grad_norm": 2.143479227908201, "language_loss": 0.7910074, "learning_rate": 7.790295248901766e-07, "loss": 0.80947256, "num_input_tokens_seen": 128313215, "step": 5969, "time_per_iteration": 2.686183452606201 }, { "auxiliary_loss_clip": 0.01157872, "auxiliary_loss_mlp": 0.01028311, "balance_loss_clip": 1.05102992, "balance_loss_mlp": 1.02075338, "epoch": 0.7178500571153731, "flos": 31652778504960.0, "grad_norm": 2.151659098914134, "language_loss": 0.6230039, "learning_rate": 7.784126504442902e-07, "loss": 0.64486575, "num_input_tokens_seen": 128336445, "step": 5970, "time_per_iteration": 2.700984477996826 }, { "auxiliary_loss_clip": 0.01118071, "auxiliary_loss_mlp": 0.01025055, "balance_loss_clip": 1.04715157, "balance_loss_mlp": 1.01840913, "epoch": 0.7179703000060121, "flos": 19427242383360.0, "grad_norm": 1.4456884469353457, "language_loss": 0.67907506, "learning_rate": 7.777959613122351e-07, "loss": 0.70050633, "num_input_tokens_seen": 128356270, "step": 5971, "time_per_iteration": 2.689654588699341 }, { "auxiliary_loss_clip": 0.01137341, "auxiliary_loss_mlp": 0.01027815, "balance_loss_clip": 1.05061042, "balance_loss_mlp": 1.02108276, "epoch": 0.7180905428966512, "flos": 28839249072000.0, "grad_norm": 1.895868725413646, "language_loss": 0.78335851, "learning_rate": 7.771794575875604e-07, "loss": 0.80501008, "num_input_tokens_seen": 128378140, "step": 5972, "time_per_iteration": 2.7256321907043457 }, { "auxiliary_loss_clip": 0.01157412, "auxiliary_loss_mlp": 0.01029641, "balance_loss_clip": 1.0518117, "balance_loss_mlp": 1.02168357, "epoch": 0.7182107857872904, "flos": 20047563285120.0, "grad_norm": 3.0258216433906564, "language_loss": 0.77963972, "learning_rate": 7.765631393637888e-07, "loss": 0.80151021, "num_input_tokens_seen": 128396335, "step": 5973, "time_per_iteration": 2.6456522941589355 }, { "auxiliary_loss_clip": 0.01150001, "auxiliary_loss_mlp": 0.01025525, "balance_loss_clip": 1.04684138, "balance_loss_mlp": 1.01784837, "epoch": 0.7183310286779294, "flos": 22747686503040.0, "grad_norm": 3.5478417933146558, "language_loss": 0.48955503, "learning_rate": 7.75947006734417e-07, "loss": 0.51131034, "num_input_tokens_seen": 128414115, "step": 5974, "time_per_iteration": 2.598029851913452 }, { "auxiliary_loss_clip": 0.01171568, "auxiliary_loss_mlp": 0.01026167, "balance_loss_clip": 1.05030012, "balance_loss_mlp": 1.01843035, "epoch": 0.7184512715685685, "flos": 17158262112000.0, "grad_norm": 2.261594473693705, "language_loss": 0.83207321, "learning_rate": 7.753310597929101e-07, "loss": 0.85405052, "num_input_tokens_seen": 128430755, "step": 5975, "time_per_iteration": 2.6022400856018066 }, { "auxiliary_loss_clip": 0.01086062, "auxiliary_loss_mlp": 0.01002207, "balance_loss_clip": 1.0322237, "balance_loss_mlp": 1.00128925, "epoch": 0.7185715144592076, "flos": 65509611448320.0, "grad_norm": 0.7521339904482062, "language_loss": 0.5510608, "learning_rate": 7.747152986327095e-07, "loss": 0.57194352, "num_input_tokens_seen": 128491300, "step": 5976, "time_per_iteration": 3.0853679180145264 }, { "auxiliary_loss_clip": 0.01115204, "auxiliary_loss_mlp": 0.01029797, "balance_loss_clip": 1.04768538, "balance_loss_mlp": 1.02283859, "epoch": 0.7186917573498467, "flos": 16180522928640.0, "grad_norm": 2.18403320728565, "language_loss": 0.6820547, "learning_rate": 7.740997233472228e-07, "loss": 0.70350474, "num_input_tokens_seen": 128508920, "step": 5977, "time_per_iteration": 2.695284128189087 }, { "auxiliary_loss_clip": 0.0114268, "auxiliary_loss_mlp": 0.01022027, "balance_loss_clip": 1.05095387, "balance_loss_mlp": 1.01517534, "epoch": 0.7188120002404857, "flos": 29242274647680.0, "grad_norm": 5.396279250038809, "language_loss": 0.70489949, "learning_rate": 7.734843340298329e-07, "loss": 0.72654653, "num_input_tokens_seen": 128528745, "step": 5978, "time_per_iteration": 2.7424113750457764 }, { "auxiliary_loss_clip": 0.01145856, "auxiliary_loss_mlp": 0.01026597, "balance_loss_clip": 1.0473454, "balance_loss_mlp": 1.01899719, "epoch": 0.7189322431311249, "flos": 33401161008000.0, "grad_norm": 2.157362933458968, "language_loss": 0.75542331, "learning_rate": 7.72869130773895e-07, "loss": 0.77714789, "num_input_tokens_seen": 128549345, "step": 5979, "time_per_iteration": 2.782078742980957 }, { "auxiliary_loss_clip": 0.01074591, "auxiliary_loss_mlp": 0.01001915, "balance_loss_clip": 1.03107953, "balance_loss_mlp": 1.00102139, "epoch": 0.719052486021764, "flos": 61351263792000.0, "grad_norm": 0.8273919288790343, "language_loss": 0.59284651, "learning_rate": 7.722541136727343e-07, "loss": 0.61361158, "num_input_tokens_seen": 128605360, "step": 5980, "time_per_iteration": 3.0717546939849854 }, { "auxiliary_loss_clip": 0.01155213, "auxiliary_loss_mlp": 0.01028358, "balance_loss_clip": 1.04999232, "balance_loss_mlp": 1.02136099, "epoch": 0.719172728912403, "flos": 15596795007360.0, "grad_norm": 2.5300406007229914, "language_loss": 0.8041206, "learning_rate": 7.716392828196483e-07, "loss": 0.82595623, "num_input_tokens_seen": 128623160, "step": 5981, "time_per_iteration": 2.5706188678741455 }, { "auxiliary_loss_clip": 0.01158148, "auxiliary_loss_mlp": 0.01025232, "balance_loss_clip": 1.05161977, "balance_loss_mlp": 1.01730716, "epoch": 0.7192929718030422, "flos": 15553162961280.0, "grad_norm": 2.6814645356246922, "language_loss": 0.77047348, "learning_rate": 7.710246383079064e-07, "loss": 0.79230726, "num_input_tokens_seen": 128638545, "step": 5982, "time_per_iteration": 2.577960729598999 }, { "auxiliary_loss_clip": 0.0114289, "auxiliary_loss_mlp": 0.01026392, "balance_loss_clip": 1.04703999, "balance_loss_mlp": 1.01946855, "epoch": 0.7194132146936812, "flos": 21862487733120.0, "grad_norm": 3.8057100703494755, "language_loss": 0.91733176, "learning_rate": 7.704101802307492e-07, "loss": 0.93902457, "num_input_tokens_seen": 128650845, "step": 5983, "time_per_iteration": 2.6783955097198486 }, { "auxiliary_loss_clip": 0.01117367, "auxiliary_loss_mlp": 0.01030736, "balance_loss_clip": 1.04553485, "balance_loss_mlp": 1.02311838, "epoch": 0.7195334575843203, "flos": 27338900958720.0, "grad_norm": 2.2839170399774433, "language_loss": 0.87290931, "learning_rate": 7.697959086813912e-07, "loss": 0.89439034, "num_input_tokens_seen": 128667010, "step": 5984, "time_per_iteration": 2.7328219413757324 }, { "auxiliary_loss_clip": 0.01115128, "auxiliary_loss_mlp": 0.01031193, "balance_loss_clip": 1.04328406, "balance_loss_mlp": 1.0242312, "epoch": 0.7196537004749595, "flos": 18770615809920.0, "grad_norm": 1.7776436053554598, "language_loss": 0.80156147, "learning_rate": 7.691818237530145e-07, "loss": 0.82302469, "num_input_tokens_seen": 128685870, "step": 5985, "time_per_iteration": 4.5338568687438965 }, { "auxiliary_loss_clip": 0.01122295, "auxiliary_loss_mlp": 0.01022649, "balance_loss_clip": 1.04790735, "balance_loss_mlp": 1.01499569, "epoch": 0.7197739433655985, "flos": 24531009960960.0, "grad_norm": 1.9668054872871807, "language_loss": 0.77493048, "learning_rate": 7.685679255387774e-07, "loss": 0.79637986, "num_input_tokens_seen": 128704185, "step": 5986, "time_per_iteration": 2.7536985874176025 }, { "auxiliary_loss_clip": 0.01140038, "auxiliary_loss_mlp": 0.01027276, "balance_loss_clip": 1.05000556, "balance_loss_mlp": 1.01969409, "epoch": 0.7198941862562376, "flos": 18040587793920.0, "grad_norm": 2.195046120140389, "language_loss": 0.77170122, "learning_rate": 7.679542141318065e-07, "loss": 0.79337436, "num_input_tokens_seen": 128721290, "step": 5987, "time_per_iteration": 3.526695966720581 }, { "auxiliary_loss_clip": 0.0112775, "auxiliary_loss_mlp": 0.01028074, "balance_loss_clip": 1.04466319, "balance_loss_mlp": 1.02083182, "epoch": 0.7200144291468767, "flos": 29022393542400.0, "grad_norm": 1.7506583804694076, "language_loss": 0.75904822, "learning_rate": 7.673406896252013e-07, "loss": 0.78060651, "num_input_tokens_seen": 128742665, "step": 5988, "time_per_iteration": 2.766411781311035 }, { "auxiliary_loss_clip": 0.01121137, "auxiliary_loss_mlp": 0.01031554, "balance_loss_clip": 1.04452276, "balance_loss_mlp": 1.02359104, "epoch": 0.7201346720375158, "flos": 25374264624000.0, "grad_norm": 1.60980886742931, "language_loss": 0.78622866, "learning_rate": 7.667273521120347e-07, "loss": 0.80775559, "num_input_tokens_seen": 128762225, "step": 5989, "time_per_iteration": 3.660324811935425 }, { "auxiliary_loss_clip": 0.01126998, "auxiliary_loss_mlp": 0.01032737, "balance_loss_clip": 1.04628325, "balance_loss_mlp": 1.02551627, "epoch": 0.7202549149281549, "flos": 14355614499840.0, "grad_norm": 2.6430070146588585, "language_loss": 0.79743737, "learning_rate": 7.661142016853468e-07, "loss": 0.8190347, "num_input_tokens_seen": 128779585, "step": 5990, "time_per_iteration": 2.6956968307495117 }, { "auxiliary_loss_clip": 0.01111016, "auxiliary_loss_mlp": 0.01027024, "balance_loss_clip": 1.04837418, "balance_loss_mlp": 1.01992834, "epoch": 0.7203751578187939, "flos": 23001682550400.0, "grad_norm": 1.793765921059629, "language_loss": 0.75121176, "learning_rate": 7.655012384381543e-07, "loss": 0.77259213, "num_input_tokens_seen": 128799070, "step": 5991, "time_per_iteration": 2.716196060180664 }, { "auxiliary_loss_clip": 0.01141179, "auxiliary_loss_mlp": 0.01028855, "balance_loss_clip": 1.05257046, "balance_loss_mlp": 1.02120149, "epoch": 0.7204954007094331, "flos": 23692424065920.0, "grad_norm": 3.0085102835927184, "language_loss": 0.81909704, "learning_rate": 7.648884624634415e-07, "loss": 0.84079736, "num_input_tokens_seen": 128817620, "step": 5992, "time_per_iteration": 2.7011022567749023 }, { "auxiliary_loss_clip": 0.01155151, "auxiliary_loss_mlp": 0.01026725, "balance_loss_clip": 1.05021477, "balance_loss_mlp": 1.01913095, "epoch": 0.7206156436000721, "flos": 16253026531200.0, "grad_norm": 1.8350977990745945, "language_loss": 0.88846231, "learning_rate": 7.642758738541683e-07, "loss": 0.91028106, "num_input_tokens_seen": 128834200, "step": 5993, "time_per_iteration": 2.5830271244049072 }, { "auxiliary_loss_clip": 0.01073505, "auxiliary_loss_mlp": 0.01000761, "balance_loss_clip": 1.03133512, "balance_loss_mlp": 0.99970627, "epoch": 0.7207358864907112, "flos": 54377806504320.0, "grad_norm": 0.7549585713010374, "language_loss": 0.60763556, "learning_rate": 7.636634727032621e-07, "loss": 0.62837821, "num_input_tokens_seen": 128891305, "step": 5994, "time_per_iteration": 3.083916664123535 }, { "auxiliary_loss_clip": 0.01125157, "auxiliary_loss_mlp": 0.01029227, "balance_loss_clip": 1.04318547, "balance_loss_mlp": 1.02137733, "epoch": 0.7208561293813504, "flos": 19135540033920.0, "grad_norm": 11.915540545158132, "language_loss": 0.78807259, "learning_rate": 7.630512591036231e-07, "loss": 0.80961645, "num_input_tokens_seen": 128910615, "step": 5995, "time_per_iteration": 2.691251754760742 }, { "auxiliary_loss_clip": 0.0115793, "auxiliary_loss_mlp": 0.01026395, "balance_loss_clip": 1.05075014, "balance_loss_mlp": 1.01855731, "epoch": 0.7209763722719894, "flos": 17748526308480.0, "grad_norm": 3.1855419805849317, "language_loss": 0.65131712, "learning_rate": 7.624392331481255e-07, "loss": 0.67316031, "num_input_tokens_seen": 128928270, "step": 5996, "time_per_iteration": 2.6468684673309326 }, { "auxiliary_loss_clip": 0.01072068, "auxiliary_loss_mlp": 0.0100095, "balance_loss_clip": 1.03170776, "balance_loss_mlp": 0.99997807, "epoch": 0.7210966151626285, "flos": 66819488716800.0, "grad_norm": 0.7447407804780886, "language_loss": 0.51856625, "learning_rate": 7.618273949296115e-07, "loss": 0.53929639, "num_input_tokens_seen": 128987780, "step": 5997, "time_per_iteration": 3.093607187271118 }, { "auxiliary_loss_clip": 0.01135618, "auxiliary_loss_mlp": 0.01031291, "balance_loss_clip": 1.04686832, "balance_loss_mlp": 1.02388799, "epoch": 0.7212168580532676, "flos": 21141869080320.0, "grad_norm": 2.4933760598526304, "language_loss": 0.68904334, "learning_rate": 7.612157445408987e-07, "loss": 0.71071243, "num_input_tokens_seen": 129005590, "step": 5998, "time_per_iteration": 2.6981563568115234 }, { "auxiliary_loss_clip": 0.01147143, "auxiliary_loss_mlp": 0.01025992, "balance_loss_clip": 1.05167651, "balance_loss_mlp": 1.01842844, "epoch": 0.7213371009439067, "flos": 22345738335360.0, "grad_norm": 2.223136321945356, "language_loss": 0.74421, "learning_rate": 7.606042820747716e-07, "loss": 0.76594138, "num_input_tokens_seen": 129021995, "step": 5999, "time_per_iteration": 2.788408041000366 }, { "auxiliary_loss_clip": 0.01147243, "auxiliary_loss_mlp": 0.01026025, "balance_loss_clip": 1.05079877, "balance_loss_mlp": 1.01864612, "epoch": 0.7214573438345457, "flos": 18515901490560.0, "grad_norm": 1.8461602487668651, "language_loss": 0.85415286, "learning_rate": 7.599930076239889e-07, "loss": 0.87588549, "num_input_tokens_seen": 129039280, "step": 6000, "time_per_iteration": 2.6301867961883545 }, { "auxiliary_loss_clip": 0.01114029, "auxiliary_loss_mlp": 0.00712024, "balance_loss_clip": 1.04723871, "balance_loss_mlp": 1.00076652, "epoch": 0.7215775867251849, "flos": 35736108606720.0, "grad_norm": 3.6778047860872753, "language_loss": 0.70579195, "learning_rate": 7.593819212812818e-07, "loss": 0.72405243, "num_input_tokens_seen": 129060860, "step": 6001, "time_per_iteration": 2.8689088821411133 }, { "auxiliary_loss_clip": 0.01156095, "auxiliary_loss_mlp": 0.01026438, "balance_loss_clip": 1.05126691, "balance_loss_mlp": 1.01854634, "epoch": 0.721697829615824, "flos": 20372410909440.0, "grad_norm": 1.8570749556527077, "language_loss": 0.71844733, "learning_rate": 7.587710231393508e-07, "loss": 0.7402727, "num_input_tokens_seen": 129079215, "step": 6002, "time_per_iteration": 2.6020820140838623 }, { "auxiliary_loss_clip": 0.0106823, "auxiliary_loss_mlp": 0.01027155, "balance_loss_clip": 1.0410496, "balance_loss_mlp": 1.01954889, "epoch": 0.721818072506463, "flos": 20229809915520.0, "grad_norm": 2.452965210601432, "language_loss": 0.83862925, "learning_rate": 7.581603132908685e-07, "loss": 0.85958314, "num_input_tokens_seen": 129097185, "step": 6003, "time_per_iteration": 2.807637929916382 }, { "auxiliary_loss_clip": 0.01118229, "auxiliary_loss_mlp": 0.01023546, "balance_loss_clip": 1.046134, "balance_loss_mlp": 1.01627994, "epoch": 0.7219383153971022, "flos": 18186887888640.0, "grad_norm": 1.9359433976495801, "language_loss": 0.78791356, "learning_rate": 7.575497918284795e-07, "loss": 0.8093313, "num_input_tokens_seen": 129114730, "step": 6004, "time_per_iteration": 2.638570547103882 }, { "auxiliary_loss_clip": 0.0117586, "auxiliary_loss_mlp": 0.01033202, "balance_loss_clip": 1.0515337, "balance_loss_mlp": 1.02549505, "epoch": 0.7220585582877412, "flos": 17342124854400.0, "grad_norm": 2.1516616605294843, "language_loss": 0.74022299, "learning_rate": 7.569394588447984e-07, "loss": 0.7623136, "num_input_tokens_seen": 129131745, "step": 6005, "time_per_iteration": 2.5908803939819336 }, { "auxiliary_loss_clip": 0.01148036, "auxiliary_loss_mlp": 0.0102642, "balance_loss_clip": 1.04770219, "balance_loss_mlp": 1.01890731, "epoch": 0.7221788011783803, "flos": 16976338704000.0, "grad_norm": 3.855988807178053, "language_loss": 0.78029406, "learning_rate": 7.563293144324146e-07, "loss": 0.80203867, "num_input_tokens_seen": 129147295, "step": 6006, "time_per_iteration": 2.559764862060547 }, { "auxiliary_loss_clip": 0.01172297, "auxiliary_loss_mlp": 0.01027462, "balance_loss_clip": 1.05304956, "balance_loss_mlp": 1.02035999, "epoch": 0.7222990440690195, "flos": 26286359702400.0, "grad_norm": 2.083685845464714, "language_loss": 0.80160093, "learning_rate": 7.557193586838834e-07, "loss": 0.8235985, "num_input_tokens_seen": 129162660, "step": 6007, "time_per_iteration": 2.659329891204834 }, { "auxiliary_loss_clip": 0.01145136, "auxiliary_loss_mlp": 0.01024563, "balance_loss_clip": 1.04880309, "balance_loss_mlp": 1.01710057, "epoch": 0.7224192869596585, "flos": 17601687509760.0, "grad_norm": 2.481377134744629, "language_loss": 0.70289266, "learning_rate": 7.551095916917371e-07, "loss": 0.72458959, "num_input_tokens_seen": 129179990, "step": 6008, "time_per_iteration": 2.598253011703491 }, { "auxiliary_loss_clip": 0.01137547, "auxiliary_loss_mlp": 0.01028214, "balance_loss_clip": 1.04909325, "balance_loss_mlp": 1.01964915, "epoch": 0.7225395298502976, "flos": 12932331016320.0, "grad_norm": 5.7741723709983805, "language_loss": 0.66401374, "learning_rate": 7.545000135484758e-07, "loss": 0.68567145, "num_input_tokens_seen": 129197425, "step": 6009, "time_per_iteration": 2.7047455310821533 }, { "auxiliary_loss_clip": 0.0117569, "auxiliary_loss_mlp": 0.00712428, "balance_loss_clip": 1.05440235, "balance_loss_mlp": 1.0007515, "epoch": 0.7226597727409367, "flos": 29643899592960.0, "grad_norm": 4.0332497274759485, "language_loss": 0.63267124, "learning_rate": 7.538906243465714e-07, "loss": 0.65155244, "num_input_tokens_seen": 129217560, "step": 6010, "time_per_iteration": 2.644867181777954 }, { "auxiliary_loss_clip": 0.01173994, "auxiliary_loss_mlp": 0.01027047, "balance_loss_clip": 1.05236387, "balance_loss_mlp": 1.01976037, "epoch": 0.7227800156315758, "flos": 13771635183360.0, "grad_norm": 2.6596177886539363, "language_loss": 0.78600174, "learning_rate": 7.5328142417847e-07, "loss": 0.80801213, "num_input_tokens_seen": 129234325, "step": 6011, "time_per_iteration": 4.408808708190918 }, { "auxiliary_loss_clip": 0.01153806, "auxiliary_loss_mlp": 0.01023598, "balance_loss_clip": 1.04825163, "balance_loss_mlp": 1.01652014, "epoch": 0.7229002585222148, "flos": 20301882554880.0, "grad_norm": 1.7952508094695159, "language_loss": 0.69199646, "learning_rate": 7.526724131365838e-07, "loss": 0.71377045, "num_input_tokens_seen": 129255280, "step": 6012, "time_per_iteration": 2.661393880844116 }, { "auxiliary_loss_clip": 0.01141593, "auxiliary_loss_mlp": 0.0103422, "balance_loss_clip": 1.05014384, "balance_loss_mlp": 1.02595901, "epoch": 0.723020501412854, "flos": 16581250033920.0, "grad_norm": 1.8513335814073695, "language_loss": 0.70400792, "learning_rate": 7.520635913133017e-07, "loss": 0.72576606, "num_input_tokens_seen": 129273910, "step": 6013, "time_per_iteration": 3.5140247344970703 }, { "auxiliary_loss_clip": 0.0116457, "auxiliary_loss_mlp": 0.01030075, "balance_loss_clip": 1.05125129, "balance_loss_mlp": 1.02204061, "epoch": 0.7231407443034931, "flos": 28548300908160.0, "grad_norm": 3.9966307138029182, "language_loss": 0.82635462, "learning_rate": 7.514549588009798e-07, "loss": 0.84830111, "num_input_tokens_seen": 129294785, "step": 6014, "time_per_iteration": 2.669325351715088 }, { "auxiliary_loss_clip": 0.01146451, "auxiliary_loss_mlp": 0.01026387, "balance_loss_clip": 1.05042982, "balance_loss_mlp": 1.01891279, "epoch": 0.7232609871941321, "flos": 30008536508160.0, "grad_norm": 2.077721107282787, "language_loss": 0.70846796, "learning_rate": 7.508465156919492e-07, "loss": 0.73019636, "num_input_tokens_seen": 129318295, "step": 6015, "time_per_iteration": 3.560424566268921 }, { "auxiliary_loss_clip": 0.01141142, "auxiliary_loss_mlp": 0.01027076, "balance_loss_clip": 1.04810047, "balance_loss_mlp": 1.01957202, "epoch": 0.7233812300847713, "flos": 16654005031680.0, "grad_norm": 2.551857224128245, "language_loss": 0.61452663, "learning_rate": 7.502382620785083e-07, "loss": 0.63620877, "num_input_tokens_seen": 129334845, "step": 6016, "time_per_iteration": 2.6238598823547363 }, { "auxiliary_loss_clip": 0.0103896, "auxiliary_loss_mlp": 0.01004367, "balance_loss_clip": 1.02631307, "balance_loss_mlp": 1.00349677, "epoch": 0.7235014729754103, "flos": 67258784050560.0, "grad_norm": 0.8121834115583246, "language_loss": 0.62502956, "learning_rate": 7.496301980529289e-07, "loss": 0.64546287, "num_input_tokens_seen": 129398055, "step": 6017, "time_per_iteration": 3.2748379707336426 }, { "auxiliary_loss_clip": 0.01173957, "auxiliary_loss_mlp": 0.01030952, "balance_loss_clip": 1.05224156, "balance_loss_mlp": 1.0237931, "epoch": 0.7236217158660494, "flos": 26943237671040.0, "grad_norm": 2.1526259231736833, "language_loss": 0.74674058, "learning_rate": 7.490223237074547e-07, "loss": 0.76878965, "num_input_tokens_seen": 129417765, "step": 6018, "time_per_iteration": 2.6136465072631836 }, { "auxiliary_loss_clip": 0.01125058, "auxiliary_loss_mlp": 0.0102892, "balance_loss_clip": 1.04680634, "balance_loss_mlp": 1.02124262, "epoch": 0.7237419587566886, "flos": 29423372042880.0, "grad_norm": 1.8808997322229457, "language_loss": 0.66332716, "learning_rate": 7.484146391342989e-07, "loss": 0.68486691, "num_input_tokens_seen": 129437560, "step": 6019, "time_per_iteration": 2.749868869781494 }, { "auxiliary_loss_clip": 0.01136799, "auxiliary_loss_mlp": 0.01024593, "balance_loss_clip": 1.04886186, "balance_loss_mlp": 1.01724339, "epoch": 0.7238622016473276, "flos": 17821496787840.0, "grad_norm": 3.8605209281008244, "language_loss": 0.57157111, "learning_rate": 7.478071444256484e-07, "loss": 0.59318507, "num_input_tokens_seen": 129455320, "step": 6020, "time_per_iteration": 2.653698682785034 }, { "auxiliary_loss_clip": 0.01134785, "auxiliary_loss_mlp": 0.010252, "balance_loss_clip": 1.04762101, "balance_loss_mlp": 1.01762986, "epoch": 0.7239824445379667, "flos": 25739117020800.0, "grad_norm": 1.9311010375781807, "language_loss": 0.79280168, "learning_rate": 7.471998396736579e-07, "loss": 0.81440151, "num_input_tokens_seen": 129475700, "step": 6021, "time_per_iteration": 2.7200605869293213 }, { "auxiliary_loss_clip": 0.01127508, "auxiliary_loss_mlp": 0.01025567, "balance_loss_clip": 1.04800272, "balance_loss_mlp": 1.01870608, "epoch": 0.7241026874286057, "flos": 23148916398720.0, "grad_norm": 1.7229901744093372, "language_loss": 0.75990754, "learning_rate": 7.465927249704549e-07, "loss": 0.78143823, "num_input_tokens_seen": 129493585, "step": 6022, "time_per_iteration": 2.6554677486419678 }, { "auxiliary_loss_clip": 0.01155478, "auxiliary_loss_mlp": 0.0102491, "balance_loss_clip": 1.04989541, "balance_loss_mlp": 1.01777172, "epoch": 0.7242229303192449, "flos": 20266905686400.0, "grad_norm": 2.366433676861456, "language_loss": 0.77643478, "learning_rate": 7.459858004081398e-07, "loss": 0.79823864, "num_input_tokens_seen": 129511555, "step": 6023, "time_per_iteration": 2.5803234577178955 }, { "auxiliary_loss_clip": 0.01038398, "auxiliary_loss_mlp": 0.01001576, "balance_loss_clip": 1.02716613, "balance_loss_mlp": 1.00058079, "epoch": 0.724343173209884, "flos": 62311659684480.0, "grad_norm": 0.9389942256654062, "language_loss": 0.58037186, "learning_rate": 7.453790660787815e-07, "loss": 0.60077167, "num_input_tokens_seen": 129579650, "step": 6024, "time_per_iteration": 3.361593008041382 }, { "auxiliary_loss_clip": 0.01147433, "auxiliary_loss_mlp": 0.01027449, "balance_loss_clip": 1.05258322, "balance_loss_mlp": 1.01961064, "epoch": 0.724463416100523, "flos": 35006403813120.0, "grad_norm": 2.6814224716780677, "language_loss": 0.6364702, "learning_rate": 7.447725220744214e-07, "loss": 0.65821898, "num_input_tokens_seen": 129601895, "step": 6025, "time_per_iteration": 2.7768654823303223 }, { "auxiliary_loss_clip": 0.01174955, "auxiliary_loss_mlp": 0.01024619, "balance_loss_clip": 1.05252719, "balance_loss_mlp": 1.01674557, "epoch": 0.7245836589911622, "flos": 21871968923520.0, "grad_norm": 2.0765650761378542, "language_loss": 0.76861954, "learning_rate": 7.441661684870717e-07, "loss": 0.7906152, "num_input_tokens_seen": 129622150, "step": 6026, "time_per_iteration": 2.6351146697998047 }, { "auxiliary_loss_clip": 0.01175302, "auxiliary_loss_mlp": 0.01027185, "balance_loss_clip": 1.05287075, "balance_loss_mlp": 1.0196743, "epoch": 0.7247039018818012, "flos": 23006494972800.0, "grad_norm": 1.86808540521598, "language_loss": 0.81943703, "learning_rate": 7.435600054087152e-07, "loss": 0.8414619, "num_input_tokens_seen": 129644315, "step": 6027, "time_per_iteration": 2.6950225830078125 }, { "auxiliary_loss_clip": 0.01176696, "auxiliary_loss_mlp": 0.01025337, "balance_loss_clip": 1.05420709, "balance_loss_mlp": 1.01752901, "epoch": 0.7248241447724403, "flos": 31722588587520.0, "grad_norm": 2.7463105704668522, "language_loss": 0.74074626, "learning_rate": 7.42954032931308e-07, "loss": 0.76276666, "num_input_tokens_seen": 129665355, "step": 6028, "time_per_iteration": 2.6564393043518066 }, { "auxiliary_loss_clip": 0.01143472, "auxiliary_loss_mlp": 0.01024928, "balance_loss_clip": 1.04947793, "balance_loss_mlp": 1.01717973, "epoch": 0.7249443876630794, "flos": 34896984007680.0, "grad_norm": 3.3904085951525516, "language_loss": 0.74672836, "learning_rate": 7.423482511467733e-07, "loss": 0.76841241, "num_input_tokens_seen": 129686125, "step": 6029, "time_per_iteration": 2.7709615230560303 }, { "auxiliary_loss_clip": 0.01082565, "auxiliary_loss_mlp": 0.01029419, "balance_loss_clip": 1.04301596, "balance_loss_mlp": 1.0218854, "epoch": 0.7250646305537185, "flos": 26359294268160.0, "grad_norm": 2.3834694142073443, "language_loss": 0.64680177, "learning_rate": 7.417426601470099e-07, "loss": 0.66792166, "num_input_tokens_seen": 129706485, "step": 6030, "time_per_iteration": 2.766986846923828 }, { "auxiliary_loss_clip": 0.01162355, "auxiliary_loss_mlp": 0.01031111, "balance_loss_clip": 1.05343223, "balance_loss_mlp": 1.02321935, "epoch": 0.7251848734443576, "flos": 30081614728320.0, "grad_norm": 2.098764612221144, "language_loss": 0.78379381, "learning_rate": 7.411372600238841e-07, "loss": 0.80572844, "num_input_tokens_seen": 129727100, "step": 6031, "time_per_iteration": 2.699432373046875 }, { "auxiliary_loss_clip": 0.01174484, "auxiliary_loss_mlp": 0.01025204, "balance_loss_clip": 1.05320835, "balance_loss_mlp": 1.01787853, "epoch": 0.7253051163349967, "flos": 17785262943360.0, "grad_norm": 2.014508412123845, "language_loss": 0.73947835, "learning_rate": 7.405320508692346e-07, "loss": 0.76147521, "num_input_tokens_seen": 129745840, "step": 6032, "time_per_iteration": 2.540003538131714 }, { "auxiliary_loss_clip": 0.01172234, "auxiliary_loss_mlp": 0.01027236, "balance_loss_clip": 1.05379462, "balance_loss_mlp": 1.02033365, "epoch": 0.7254253592256358, "flos": 12641346938880.0, "grad_norm": 2.056956327618206, "language_loss": 0.75067496, "learning_rate": 7.399270327748727e-07, "loss": 0.77266967, "num_input_tokens_seen": 129763500, "step": 6033, "time_per_iteration": 2.6131033897399902 }, { "auxiliary_loss_clip": 0.0112854, "auxiliary_loss_mlp": 0.00711044, "balance_loss_clip": 1.0474416, "balance_loss_mlp": 1.00080359, "epoch": 0.7255456021162748, "flos": 27199208966400.0, "grad_norm": 2.6180523449863453, "language_loss": 0.74454534, "learning_rate": 7.39322205832577e-07, "loss": 0.76294118, "num_input_tokens_seen": 129784390, "step": 6034, "time_per_iteration": 2.7216997146606445 }, { "auxiliary_loss_clip": 0.01137434, "auxiliary_loss_mlp": 0.01028684, "balance_loss_clip": 1.04949844, "balance_loss_mlp": 1.02189517, "epoch": 0.725665845006914, "flos": 21288205088640.0, "grad_norm": 1.989647317783262, "language_loss": 0.80726379, "learning_rate": 7.387175701341009e-07, "loss": 0.82892495, "num_input_tokens_seen": 129803060, "step": 6035, "time_per_iteration": 2.6819050312042236 }, { "auxiliary_loss_clip": 0.0115802, "auxiliary_loss_mlp": 0.01027482, "balance_loss_clip": 1.04994774, "balance_loss_mlp": 1.01956689, "epoch": 0.7257860878975531, "flos": 16033684129920.0, "grad_norm": 2.283481525144165, "language_loss": 0.71780646, "learning_rate": 7.381131257711659e-07, "loss": 0.73966146, "num_input_tokens_seen": 129820165, "step": 6036, "time_per_iteration": 2.565450668334961 }, { "auxiliary_loss_clip": 0.01138856, "auxiliary_loss_mlp": 0.01028976, "balance_loss_clip": 1.05130911, "balance_loss_mlp": 1.02223754, "epoch": 0.7259063307881921, "flos": 12129943052160.0, "grad_norm": 1.7808248708076053, "language_loss": 0.83472133, "learning_rate": 7.375088728354677e-07, "loss": 0.85639966, "num_input_tokens_seen": 129835195, "step": 6037, "time_per_iteration": 3.537233829498291 }, { "auxiliary_loss_clip": 0.01132887, "auxiliary_loss_mlp": 0.01027276, "balance_loss_clip": 1.04886329, "balance_loss_mlp": 1.01975989, "epoch": 0.7260265736788313, "flos": 30443845432320.0, "grad_norm": 1.570580743285669, "language_loss": 0.67928523, "learning_rate": 7.369048114186691e-07, "loss": 0.70088685, "num_input_tokens_seen": 129856240, "step": 6038, "time_per_iteration": 2.7325499057769775 }, { "auxiliary_loss_clip": 0.01135274, "auxiliary_loss_mlp": 0.00710906, "balance_loss_clip": 1.04992306, "balance_loss_mlp": 1.00074267, "epoch": 0.7261468165694703, "flos": 21142264129920.0, "grad_norm": 2.002147606385763, "language_loss": 0.83411491, "learning_rate": 7.363009416124055e-07, "loss": 0.85257667, "num_input_tokens_seen": 129875565, "step": 6039, "time_per_iteration": 3.668692111968994 }, { "auxiliary_loss_clip": 0.01129672, "auxiliary_loss_mlp": 0.01026574, "balance_loss_clip": 1.04919207, "balance_loss_mlp": 1.0194397, "epoch": 0.7262670594601094, "flos": 22306308180480.0, "grad_norm": 7.025372188747725, "language_loss": 0.62773037, "learning_rate": 7.356972635082852e-07, "loss": 0.64929283, "num_input_tokens_seen": 129894420, "step": 6040, "time_per_iteration": 2.701538562774658 }, { "auxiliary_loss_clip": 0.01108635, "auxiliary_loss_mlp": 0.01030202, "balance_loss_clip": 1.0474999, "balance_loss_mlp": 1.02204776, "epoch": 0.7263873023507486, "flos": 25335049950720.0, "grad_norm": 1.7290442533220782, "language_loss": 0.75474358, "learning_rate": 7.35093777197884e-07, "loss": 0.77613199, "num_input_tokens_seen": 129914490, "step": 6041, "time_per_iteration": 3.6742019653320312 }, { "auxiliary_loss_clip": 0.01139095, "auxiliary_loss_mlp": 0.01025841, "balance_loss_clip": 1.04861784, "balance_loss_mlp": 1.01890898, "epoch": 0.7265075452413876, "flos": 23878621192320.0, "grad_norm": 2.6072688987002866, "language_loss": 0.85939264, "learning_rate": 7.344904827727525e-07, "loss": 0.88104206, "num_input_tokens_seen": 129931670, "step": 6042, "time_per_iteration": 2.693089485168457 }, { "auxiliary_loss_clip": 0.01122325, "auxiliary_loss_mlp": 0.01027814, "balance_loss_clip": 1.04321432, "balance_loss_mlp": 1.02013075, "epoch": 0.7266277881320267, "flos": 28724549967360.0, "grad_norm": 3.518930602922495, "language_loss": 0.73580039, "learning_rate": 7.338873803244076e-07, "loss": 0.75730181, "num_input_tokens_seen": 129946905, "step": 6043, "time_per_iteration": 2.726893424987793 }, { "auxiliary_loss_clip": 0.01137384, "auxiliary_loss_mlp": 0.01028448, "balance_loss_clip": 1.04838252, "balance_loss_mlp": 1.02152824, "epoch": 0.7267480310226658, "flos": 24863507182080.0, "grad_norm": 1.7739719492977377, "language_loss": 0.80935216, "learning_rate": 7.332844699443401e-07, "loss": 0.83101052, "num_input_tokens_seen": 129965505, "step": 6044, "time_per_iteration": 2.6668035984039307 }, { "auxiliary_loss_clip": 0.01101762, "auxiliary_loss_mlp": 0.01021002, "balance_loss_clip": 1.0430181, "balance_loss_mlp": 1.01413608, "epoch": 0.7268682739133049, "flos": 27198490694400.0, "grad_norm": 2.4320953326441, "language_loss": 0.75326097, "learning_rate": 7.326817517240121e-07, "loss": 0.77448863, "num_input_tokens_seen": 129987210, "step": 6045, "time_per_iteration": 2.7575132846832275 }, { "auxiliary_loss_clip": 0.01157513, "auxiliary_loss_mlp": 0.00711291, "balance_loss_clip": 1.04936492, "balance_loss_mlp": 1.00077653, "epoch": 0.7269885168039439, "flos": 33508138688640.0, "grad_norm": 2.056016197935266, "language_loss": 0.83386451, "learning_rate": 7.320792257548545e-07, "loss": 0.85255253, "num_input_tokens_seen": 130008385, "step": 6046, "time_per_iteration": 2.6883761882781982 }, { "auxiliary_loss_clip": 0.01147065, "auxiliary_loss_mlp": 0.01023081, "balance_loss_clip": 1.05020142, "balance_loss_mlp": 1.01541543, "epoch": 0.7271087596945831, "flos": 24313750548480.0, "grad_norm": 2.3809286307827513, "language_loss": 0.76469254, "learning_rate": 7.314768921282704e-07, "loss": 0.786394, "num_input_tokens_seen": 130029040, "step": 6047, "time_per_iteration": 2.709200143814087 }, { "auxiliary_loss_clip": 0.01159813, "auxiliary_loss_mlp": 0.01028298, "balance_loss_clip": 1.0503931, "balance_loss_mlp": 1.02100277, "epoch": 0.7272290025852222, "flos": 23805147922560.0, "grad_norm": 2.4426537638984787, "language_loss": 0.72155166, "learning_rate": 7.30874750935633e-07, "loss": 0.74343276, "num_input_tokens_seen": 130048725, "step": 6048, "time_per_iteration": 2.618360757827759 }, { "auxiliary_loss_clip": 0.01125649, "auxiliary_loss_mlp": 0.0103072, "balance_loss_clip": 1.04853344, "balance_loss_mlp": 1.02327585, "epoch": 0.7273492454758612, "flos": 16720367408640.0, "grad_norm": 2.280053351515818, "language_loss": 0.7921741, "learning_rate": 7.30272802268286e-07, "loss": 0.81373775, "num_input_tokens_seen": 130065720, "step": 6049, "time_per_iteration": 2.6825954914093018 }, { "auxiliary_loss_clip": 0.0106731, "auxiliary_loss_mlp": 0.01023719, "balance_loss_clip": 1.0391947, "balance_loss_mlp": 1.01716876, "epoch": 0.7274694883665004, "flos": 28031330413440.0, "grad_norm": 1.7057788084638652, "language_loss": 0.76386333, "learning_rate": 7.29671046217547e-07, "loss": 0.78477359, "num_input_tokens_seen": 130084830, "step": 6050, "time_per_iteration": 2.7451305389404297 }, { "auxiliary_loss_clip": 0.01125152, "auxiliary_loss_mlp": 0.01028024, "balance_loss_clip": 1.04664302, "balance_loss_mlp": 1.02074075, "epoch": 0.7275897312571394, "flos": 30372706546560.0, "grad_norm": 1.7855471863477776, "language_loss": 0.81993878, "learning_rate": 7.290694828746988e-07, "loss": 0.84147048, "num_input_tokens_seen": 130104495, "step": 6051, "time_per_iteration": 2.729492664337158 }, { "auxiliary_loss_clip": 0.01127581, "auxiliary_loss_mlp": 0.01024788, "balance_loss_clip": 1.0453006, "balance_loss_mlp": 1.01752257, "epoch": 0.7277099741477785, "flos": 19204775498880.0, "grad_norm": 2.0407038218750087, "language_loss": 0.85681975, "learning_rate": 7.284681123310004e-07, "loss": 0.87834346, "num_input_tokens_seen": 130123210, "step": 6052, "time_per_iteration": 2.679967164993286 }, { "auxiliary_loss_clip": 0.01157358, "auxiliary_loss_mlp": 0.01024853, "balance_loss_clip": 1.05024266, "balance_loss_mlp": 1.0175426, "epoch": 0.7278302170384175, "flos": 20667884186880.0, "grad_norm": 1.7917892687091193, "language_loss": 0.79722619, "learning_rate": 7.27866934677678e-07, "loss": 0.81904829, "num_input_tokens_seen": 130142880, "step": 6053, "time_per_iteration": 2.6331140995025635 }, { "auxiliary_loss_clip": 0.01107514, "auxiliary_loss_mlp": 0.01023905, "balance_loss_clip": 1.04559803, "balance_loss_mlp": 1.0163238, "epoch": 0.7279504599290567, "flos": 19093200877440.0, "grad_norm": 1.7641998727946906, "language_loss": 0.77938235, "learning_rate": 7.272659500059297e-07, "loss": 0.80069655, "num_input_tokens_seen": 130160220, "step": 6054, "time_per_iteration": 2.730268716812134 }, { "auxiliary_loss_clip": 0.01152344, "auxiliary_loss_mlp": 0.01032043, "balance_loss_clip": 1.05000865, "balance_loss_mlp": 1.02453828, "epoch": 0.7280707028196958, "flos": 19062174504960.0, "grad_norm": 3.165414664923291, "language_loss": 0.80277467, "learning_rate": 7.266651584069264e-07, "loss": 0.82461858, "num_input_tokens_seen": 130177885, "step": 6055, "time_per_iteration": 2.713480234146118 }, { "auxiliary_loss_clip": 0.01162488, "auxiliary_loss_mlp": 0.01028247, "balance_loss_clip": 1.0538044, "balance_loss_mlp": 1.02102292, "epoch": 0.7281909457103348, "flos": 37196308293120.0, "grad_norm": 1.8045122483113185, "language_loss": 0.57043993, "learning_rate": 7.260645599718045e-07, "loss": 0.59234732, "num_input_tokens_seen": 130204240, "step": 6056, "time_per_iteration": 2.7263331413269043 }, { "auxiliary_loss_clip": 0.01143101, "auxiliary_loss_mlp": 0.010267, "balance_loss_clip": 1.04927969, "balance_loss_mlp": 1.01824248, "epoch": 0.728311188600974, "flos": 20667094087680.0, "grad_norm": 2.6124766520673424, "language_loss": 0.67756164, "learning_rate": 7.254641547916767e-07, "loss": 0.69925964, "num_input_tokens_seen": 130221735, "step": 6057, "time_per_iteration": 2.6297340393066406 }, { "auxiliary_loss_clip": 0.01174877, "auxiliary_loss_mlp": 0.01023653, "balance_loss_clip": 1.05423403, "balance_loss_mlp": 1.01639318, "epoch": 0.728431431491613, "flos": 28840685616000.0, "grad_norm": 1.9785685994430566, "language_loss": 0.69147336, "learning_rate": 7.248639429576226e-07, "loss": 0.7134586, "num_input_tokens_seen": 130241190, "step": 6058, "time_per_iteration": 2.6413917541503906 }, { "auxiliary_loss_clip": 0.01160391, "auxiliary_loss_mlp": 0.01023177, "balance_loss_clip": 1.05075717, "balance_loss_mlp": 1.01545823, "epoch": 0.7285516743822521, "flos": 25991856092160.0, "grad_norm": 2.0013737335621005, "language_loss": 0.71752369, "learning_rate": 7.242639245606959e-07, "loss": 0.73935938, "num_input_tokens_seen": 130260980, "step": 6059, "time_per_iteration": 2.628683567047119 }, { "auxiliary_loss_clip": 0.01146534, "auxiliary_loss_mlp": 0.01025959, "balance_loss_clip": 1.04878402, "balance_loss_mlp": 1.01846027, "epoch": 0.7286719172728913, "flos": 16399721675520.0, "grad_norm": 2.423136564340705, "language_loss": 0.82229483, "learning_rate": 7.236640996919168e-07, "loss": 0.84401977, "num_input_tokens_seen": 130280025, "step": 6060, "time_per_iteration": 2.6159942150115967 }, { "auxiliary_loss_clip": 0.01159888, "auxiliary_loss_mlp": 0.01025619, "balance_loss_clip": 1.05068827, "balance_loss_mlp": 1.01890147, "epoch": 0.7287921601635303, "flos": 22018161277440.0, "grad_norm": 1.6549291850138925, "language_loss": 0.70751476, "learning_rate": 7.230644684422782e-07, "loss": 0.72936982, "num_input_tokens_seen": 130300255, "step": 6061, "time_per_iteration": 2.5917155742645264 }, { "auxiliary_loss_clip": 0.01122997, "auxiliary_loss_mlp": 0.01028015, "balance_loss_clip": 1.04746699, "balance_loss_mlp": 1.02029896, "epoch": 0.7289124030541694, "flos": 24600927784320.0, "grad_norm": 2.043916945563521, "language_loss": 0.8155086, "learning_rate": 7.224650309027451e-07, "loss": 0.83701873, "num_input_tokens_seen": 130320005, "step": 6062, "time_per_iteration": 2.741605281829834 }, { "auxiliary_loss_clip": 0.01159556, "auxiliary_loss_mlp": 0.01025206, "balance_loss_clip": 1.05148947, "balance_loss_mlp": 1.01813674, "epoch": 0.7290326459448085, "flos": 21393638484480.0, "grad_norm": 2.511410656054169, "language_loss": 0.6892274, "learning_rate": 7.218657871642506e-07, "loss": 0.71107507, "num_input_tokens_seen": 130338810, "step": 6063, "time_per_iteration": 4.459387302398682 }, { "auxiliary_loss_clip": 0.0117605, "auxiliary_loss_mlp": 0.01026831, "balance_loss_clip": 1.05327606, "balance_loss_mlp": 1.01930881, "epoch": 0.7291528888354476, "flos": 18587686821120.0, "grad_norm": 2.128010583049504, "language_loss": 0.62670505, "learning_rate": 7.212667373177012e-07, "loss": 0.64873385, "num_input_tokens_seen": 130353805, "step": 6064, "time_per_iteration": 2.5321691036224365 }, { "auxiliary_loss_clip": 0.01125353, "auxiliary_loss_mlp": 0.01030354, "balance_loss_clip": 1.04638958, "balance_loss_mlp": 1.02301669, "epoch": 0.7292731317260867, "flos": 18951066760320.0, "grad_norm": 6.451020493723503, "language_loss": 0.75207824, "learning_rate": 7.206678814539704e-07, "loss": 0.77363527, "num_input_tokens_seen": 130372105, "step": 6065, "time_per_iteration": 3.5802149772644043 }, { "auxiliary_loss_clip": 0.01120583, "auxiliary_loss_mlp": 0.01023631, "balance_loss_clip": 1.04794097, "balance_loss_mlp": 1.01683891, "epoch": 0.7293933746167258, "flos": 21067569797760.0, "grad_norm": 1.8256121040131472, "language_loss": 0.72752321, "learning_rate": 7.20069219663904e-07, "loss": 0.74896538, "num_input_tokens_seen": 130391990, "step": 6066, "time_per_iteration": 2.7434399127960205 }, { "auxiliary_loss_clip": 0.01157835, "auxiliary_loss_mlp": 0.01026612, "balance_loss_clip": 1.04780209, "balance_loss_mlp": 1.01954293, "epoch": 0.7295136175073649, "flos": 22453326547200.0, "grad_norm": 1.810684345304477, "language_loss": 0.796646, "learning_rate": 7.1947075203832e-07, "loss": 0.81849051, "num_input_tokens_seen": 130411970, "step": 6067, "time_per_iteration": 3.5275776386260986 }, { "auxiliary_loss_clip": 0.01080917, "auxiliary_loss_mlp": 0.01001949, "balance_loss_clip": 1.02749753, "balance_loss_mlp": 1.00097108, "epoch": 0.7296338603980039, "flos": 56125506648960.0, "grad_norm": 0.8646768323562347, "language_loss": 0.6010592, "learning_rate": 7.188724786680049e-07, "loss": 0.62188786, "num_input_tokens_seen": 130472440, "step": 6068, "time_per_iteration": 3.199930429458618 }, { "auxiliary_loss_clip": 0.01140472, "auxiliary_loss_mlp": 0.01029083, "balance_loss_clip": 1.04787552, "balance_loss_mlp": 1.0214715, "epoch": 0.7297541032886431, "flos": 25228287751680.0, "grad_norm": 1.6859086081983383, "language_loss": 0.76047939, "learning_rate": 7.182743996437162e-07, "loss": 0.78217494, "num_input_tokens_seen": 130491975, "step": 6069, "time_per_iteration": 2.6613597869873047 }, { "auxiliary_loss_clip": 0.01132632, "auxiliary_loss_mlp": 0.01029658, "balance_loss_clip": 1.04690671, "balance_loss_mlp": 1.02221918, "epoch": 0.7298743461792822, "flos": 26467600752000.0, "grad_norm": 1.9891065561260306, "language_loss": 0.68952334, "learning_rate": 7.176765150561819e-07, "loss": 0.71114624, "num_input_tokens_seen": 130510580, "step": 6070, "time_per_iteration": 2.7733230590820312 }, { "auxiliary_loss_clip": 0.01173603, "auxiliary_loss_mlp": 0.01028141, "balance_loss_clip": 1.0502162, "balance_loss_mlp": 1.02068162, "epoch": 0.7299945890699212, "flos": 19569053278080.0, "grad_norm": 2.5450361411514937, "language_loss": 0.80025673, "learning_rate": 7.170788249961002e-07, "loss": 0.82227415, "num_input_tokens_seen": 130529090, "step": 6071, "time_per_iteration": 2.5922656059265137 }, { "auxiliary_loss_clip": 0.0117191, "auxiliary_loss_mlp": 0.0103063, "balance_loss_clip": 1.05164254, "balance_loss_mlp": 1.02337611, "epoch": 0.7301148319605604, "flos": 22928963466240.0, "grad_norm": 1.898960695952764, "language_loss": 0.88433403, "learning_rate": 7.164813295541418e-07, "loss": 0.90635943, "num_input_tokens_seen": 130548655, "step": 6072, "time_per_iteration": 2.595933437347412 }, { "auxiliary_loss_clip": 0.01145425, "auxiliary_loss_mlp": 0.01025845, "balance_loss_clip": 1.04964793, "balance_loss_mlp": 1.01800656, "epoch": 0.7302350748511994, "flos": 25369703596800.0, "grad_norm": 1.7072945651822522, "language_loss": 0.70369917, "learning_rate": 7.15884028820944e-07, "loss": 0.72541189, "num_input_tokens_seen": 130567710, "step": 6073, "time_per_iteration": 2.666888952255249 }, { "auxiliary_loss_clip": 0.01120319, "auxiliary_loss_mlp": 0.01028221, "balance_loss_clip": 1.04401481, "balance_loss_mlp": 1.02091098, "epoch": 0.7303553177418385, "flos": 27819170732160.0, "grad_norm": 2.228478343310099, "language_loss": 0.60280305, "learning_rate": 7.152869228871185e-07, "loss": 0.62428844, "num_input_tokens_seen": 130590195, "step": 6074, "time_per_iteration": 2.7137856483459473 }, { "auxiliary_loss_clip": 0.01138409, "auxiliary_loss_mlp": 0.01027484, "balance_loss_clip": 1.04896545, "balance_loss_mlp": 1.01998532, "epoch": 0.7304755606324776, "flos": 24426510318720.0, "grad_norm": 3.286968137077264, "language_loss": 0.72346044, "learning_rate": 7.146900118432457e-07, "loss": 0.74511939, "num_input_tokens_seen": 130609940, "step": 6075, "time_per_iteration": 2.6678261756896973 }, { "auxiliary_loss_clip": 0.01073913, "auxiliary_loss_mlp": 0.01027301, "balance_loss_clip": 1.03855121, "balance_loss_mlp": 1.02058601, "epoch": 0.7305958035231167, "flos": 23840483927040.0, "grad_norm": 1.7593582979236235, "language_loss": 0.86014748, "learning_rate": 7.140932957798753e-07, "loss": 0.88115966, "num_input_tokens_seen": 130628380, "step": 6076, "time_per_iteration": 2.791332483291626 }, { "auxiliary_loss_clip": 0.01145296, "auxiliary_loss_mlp": 0.01023866, "balance_loss_clip": 1.04836702, "balance_loss_mlp": 1.01679063, "epoch": 0.7307160464137558, "flos": 16726939597440.0, "grad_norm": 2.257694086058829, "language_loss": 0.71620721, "learning_rate": 7.134967747875309e-07, "loss": 0.73789883, "num_input_tokens_seen": 130646590, "step": 6077, "time_per_iteration": 2.654447078704834 }, { "auxiliary_loss_clip": 0.01151918, "auxiliary_loss_mlp": 0.0103188, "balance_loss_clip": 1.04914033, "balance_loss_mlp": 1.02376485, "epoch": 0.7308362893043949, "flos": 21798280172160.0, "grad_norm": 2.109431521957987, "language_loss": 0.81728309, "learning_rate": 7.129004489567014e-07, "loss": 0.83912104, "num_input_tokens_seen": 130664070, "step": 6078, "time_per_iteration": 2.564950466156006 }, { "auxiliary_loss_clip": 0.01129632, "auxiliary_loss_mlp": 0.01028188, "balance_loss_clip": 1.04740059, "balance_loss_mlp": 1.02058256, "epoch": 0.730956532195034, "flos": 10707377840640.0, "grad_norm": 2.596112930027824, "language_loss": 0.77911723, "learning_rate": 7.123043183778512e-07, "loss": 0.80069542, "num_input_tokens_seen": 130681400, "step": 6079, "time_per_iteration": 2.675403118133545 }, { "auxiliary_loss_clip": 0.01133377, "auxiliary_loss_mlp": 0.01028493, "balance_loss_clip": 1.050246, "balance_loss_mlp": 1.0211736, "epoch": 0.731076775085673, "flos": 19791987039360.0, "grad_norm": 1.6439758288465347, "language_loss": 0.65278101, "learning_rate": 7.117083831414114e-07, "loss": 0.67439967, "num_input_tokens_seen": 130700675, "step": 6080, "time_per_iteration": 2.651538848876953 }, { "auxiliary_loss_clip": 0.01171212, "auxiliary_loss_mlp": 0.01023549, "balance_loss_clip": 1.05130434, "balance_loss_mlp": 1.01628637, "epoch": 0.7311970179763122, "flos": 20447033414400.0, "grad_norm": 1.8590088287222246, "language_loss": 0.69869745, "learning_rate": 7.11112643337787e-07, "loss": 0.72064501, "num_input_tokens_seen": 130719720, "step": 6081, "time_per_iteration": 2.5944406986236572 }, { "auxiliary_loss_clip": 0.01143726, "auxiliary_loss_mlp": 0.01028844, "balance_loss_clip": 1.05026853, "balance_loss_mlp": 1.02143252, "epoch": 0.7313172608669513, "flos": 18513818501760.0, "grad_norm": 2.5218250476391764, "language_loss": 0.76380849, "learning_rate": 7.10517099057349e-07, "loss": 0.7855342, "num_input_tokens_seen": 130736670, "step": 6082, "time_per_iteration": 2.614783763885498 }, { "auxiliary_loss_clip": 0.0114471, "auxiliary_loss_mlp": 0.01024356, "balance_loss_clip": 1.05021858, "balance_loss_mlp": 1.01682758, "epoch": 0.7314375037575903, "flos": 16180738410240.0, "grad_norm": 2.3252059945906467, "language_loss": 0.61515701, "learning_rate": 7.099217503904411e-07, "loss": 0.63684767, "num_input_tokens_seen": 130754525, "step": 6083, "time_per_iteration": 2.6638169288635254 }, { "auxiliary_loss_clip": 0.01146005, "auxiliary_loss_mlp": 0.01022212, "balance_loss_clip": 1.04943514, "balance_loss_mlp": 1.0152142, "epoch": 0.7315577466482295, "flos": 17967940536960.0, "grad_norm": 2.851106839332794, "language_loss": 0.90016758, "learning_rate": 7.093265974273788e-07, "loss": 0.92184973, "num_input_tokens_seen": 130772420, "step": 6084, "time_per_iteration": 2.59458589553833 }, { "auxiliary_loss_clip": 0.01158918, "auxiliary_loss_mlp": 0.01023174, "balance_loss_clip": 1.05035615, "balance_loss_mlp": 1.01602125, "epoch": 0.7316779895388685, "flos": 18405440190720.0, "grad_norm": 2.7772026142914217, "language_loss": 0.71872473, "learning_rate": 7.087316402584447e-07, "loss": 0.74054563, "num_input_tokens_seen": 130791245, "step": 6085, "time_per_iteration": 2.665057897567749 }, { "auxiliary_loss_clip": 0.01172562, "auxiliary_loss_mlp": 0.01029452, "balance_loss_clip": 1.05172122, "balance_loss_mlp": 1.02203095, "epoch": 0.7317982324295076, "flos": 17928294900480.0, "grad_norm": 2.2426916086713837, "language_loss": 0.8644166, "learning_rate": 7.081368789738953e-07, "loss": 0.88643682, "num_input_tokens_seen": 130808445, "step": 6086, "time_per_iteration": 2.534250020980835 }, { "auxiliary_loss_clip": 0.01135866, "auxiliary_loss_mlp": 0.01024058, "balance_loss_clip": 1.04614902, "balance_loss_mlp": 1.01687229, "epoch": 0.7319184753201466, "flos": 27229840289280.0, "grad_norm": 2.020686737116716, "language_loss": 0.77780318, "learning_rate": 7.075423136639537e-07, "loss": 0.79940236, "num_input_tokens_seen": 130827700, "step": 6087, "time_per_iteration": 2.676150321960449 }, { "auxiliary_loss_clip": 0.01119674, "auxiliary_loss_mlp": 0.01028913, "balance_loss_clip": 1.04518843, "balance_loss_mlp": 1.02141476, "epoch": 0.7320387182107858, "flos": 37448544574080.0, "grad_norm": 1.7541085510571774, "language_loss": 0.74674469, "learning_rate": 7.069479444188149e-07, "loss": 0.76823056, "num_input_tokens_seen": 130848290, "step": 6088, "time_per_iteration": 2.8846378326416016 }, { "auxiliary_loss_clip": 0.01135347, "auxiliary_loss_mlp": 0.01031939, "balance_loss_clip": 1.04906821, "balance_loss_mlp": 1.02444077, "epoch": 0.7321589611014249, "flos": 17859023521920.0, "grad_norm": 1.699906336702942, "language_loss": 0.82102418, "learning_rate": 7.063537713286453e-07, "loss": 0.84269702, "num_input_tokens_seen": 130865970, "step": 6089, "time_per_iteration": 4.510860204696655 }, { "auxiliary_loss_clip": 0.01145388, "auxiliary_loss_mlp": 0.01028912, "balance_loss_clip": 1.04794776, "balance_loss_mlp": 1.021402, "epoch": 0.7322792039920639, "flos": 26100593539200.0, "grad_norm": 1.858730524786445, "language_loss": 0.8091892, "learning_rate": 7.057597944835803e-07, "loss": 0.8309322, "num_input_tokens_seen": 130885245, "step": 6090, "time_per_iteration": 3.6472833156585693 }, { "auxiliary_loss_clip": 0.0113041, "auxiliary_loss_mlp": 0.01027597, "balance_loss_clip": 1.04517269, "balance_loss_mlp": 1.02007532, "epoch": 0.7323994468827031, "flos": 25369093065600.0, "grad_norm": 2.148745043012603, "language_loss": 0.74918973, "learning_rate": 7.051660139737253e-07, "loss": 0.77076977, "num_input_tokens_seen": 130903465, "step": 6091, "time_per_iteration": 2.762997627258301 }, { "auxiliary_loss_clip": 0.01155262, "auxiliary_loss_mlp": 0.00711588, "balance_loss_clip": 1.05136466, "balance_loss_mlp": 1.00072718, "epoch": 0.7325196897733421, "flos": 26907075653760.0, "grad_norm": 2.5010027276658398, "language_loss": 0.76681817, "learning_rate": 7.045724298891565e-07, "loss": 0.7854867, "num_input_tokens_seen": 130922935, "step": 6092, "time_per_iteration": 3.5395700931549072 }, { "auxiliary_loss_clip": 0.01154565, "auxiliary_loss_mlp": 0.01023318, "balance_loss_clip": 1.04917288, "balance_loss_mlp": 1.01571894, "epoch": 0.7326399326639812, "flos": 25775781828480.0, "grad_norm": 2.1163766975094545, "language_loss": 0.69491911, "learning_rate": 7.039790423199192e-07, "loss": 0.71669787, "num_input_tokens_seen": 130942575, "step": 6093, "time_per_iteration": 2.6595098972320557 }, { "auxiliary_loss_clip": 0.01144267, "auxiliary_loss_mlp": 0.01029082, "balance_loss_clip": 1.04847121, "balance_loss_mlp": 1.02126193, "epoch": 0.7327601755546204, "flos": 21032269706880.0, "grad_norm": 2.7488461121026844, "language_loss": 0.77874911, "learning_rate": 7.033858513560322e-07, "loss": 0.80048257, "num_input_tokens_seen": 130958870, "step": 6094, "time_per_iteration": 2.594082832336426 }, { "auxiliary_loss_clip": 0.01160423, "auxiliary_loss_mlp": 0.01030332, "balance_loss_clip": 1.05297482, "balance_loss_mlp": 1.02291739, "epoch": 0.7328804184452594, "flos": 16289224462080.0, "grad_norm": 2.3903032901686023, "language_loss": 0.76763213, "learning_rate": 7.027928570874794e-07, "loss": 0.78953964, "num_input_tokens_seen": 130977060, "step": 6095, "time_per_iteration": 2.6194519996643066 }, { "auxiliary_loss_clip": 0.01170532, "auxiliary_loss_mlp": 0.01024079, "balance_loss_clip": 1.05018067, "balance_loss_mlp": 1.01686049, "epoch": 0.7330006613358985, "flos": 17858233422720.0, "grad_norm": 2.5259734278239896, "language_loss": 0.85542595, "learning_rate": 7.022000596042194e-07, "loss": 0.87737203, "num_input_tokens_seen": 130994160, "step": 6096, "time_per_iteration": 2.563383102416992 }, { "auxiliary_loss_clip": 0.01125801, "auxiliary_loss_mlp": 0.01019758, "balance_loss_clip": 1.04440784, "balance_loss_mlp": 1.01273656, "epoch": 0.7331209042265376, "flos": 22492074343680.0, "grad_norm": 2.5765671693621974, "language_loss": 0.82246292, "learning_rate": 7.016074589961784e-07, "loss": 0.84391856, "num_input_tokens_seen": 131012725, "step": 6097, "time_per_iteration": 2.6733503341674805 }, { "auxiliary_loss_clip": 0.01138283, "auxiliary_loss_mlp": 0.01028798, "balance_loss_clip": 1.0481236, "balance_loss_mlp": 1.02176166, "epoch": 0.7332411471171767, "flos": 33072757937280.0, "grad_norm": 1.9123434557271988, "language_loss": 0.67281777, "learning_rate": 7.01015055353253e-07, "loss": 0.69448853, "num_input_tokens_seen": 131035150, "step": 6098, "time_per_iteration": 2.7372617721557617 }, { "auxiliary_loss_clip": 0.01099521, "auxiliary_loss_mlp": 0.01029838, "balance_loss_clip": 1.04529059, "balance_loss_mlp": 1.02266133, "epoch": 0.7333613900078157, "flos": 22743017735040.0, "grad_norm": 3.7378090724123614, "language_loss": 0.77926588, "learning_rate": 7.004228487653123e-07, "loss": 0.80055952, "num_input_tokens_seen": 131055955, "step": 6099, "time_per_iteration": 2.7529265880584717 }, { "auxiliary_loss_clip": 0.01122642, "auxiliary_loss_mlp": 0.01026639, "balance_loss_clip": 1.0433234, "balance_loss_mlp": 1.01941776, "epoch": 0.7334816328984549, "flos": 22346133384960.0, "grad_norm": 4.487991295614777, "language_loss": 0.78443146, "learning_rate": 6.998308393221906e-07, "loss": 0.8059243, "num_input_tokens_seen": 131074360, "step": 6100, "time_per_iteration": 2.911789655685425 }, { "auxiliary_loss_clip": 0.01128673, "auxiliary_loss_mlp": 0.01025697, "balance_loss_clip": 1.04863667, "balance_loss_mlp": 1.01855612, "epoch": 0.733601875789094, "flos": 20736149984640.0, "grad_norm": 2.165517620985722, "language_loss": 0.71066177, "learning_rate": 6.992390271136977e-07, "loss": 0.73220545, "num_input_tokens_seen": 131090070, "step": 6101, "time_per_iteration": 2.663693428039551 }, { "auxiliary_loss_clip": 0.01148801, "auxiliary_loss_mlp": 0.01029125, "balance_loss_clip": 1.04755926, "balance_loss_mlp": 1.02162087, "epoch": 0.733722118679733, "flos": 22564362464640.0, "grad_norm": 2.0377499708437625, "language_loss": 0.85770273, "learning_rate": 6.986474122296094e-07, "loss": 0.87948203, "num_input_tokens_seen": 131109185, "step": 6102, "time_per_iteration": 2.638334274291992 }, { "auxiliary_loss_clip": 0.01175995, "auxiliary_loss_mlp": 0.01027147, "balance_loss_clip": 1.05317569, "balance_loss_mlp": 1.01976204, "epoch": 0.7338423615703722, "flos": 20084192179200.0, "grad_norm": 1.9976653143166136, "language_loss": 0.72315371, "learning_rate": 6.980559947596751e-07, "loss": 0.74518514, "num_input_tokens_seen": 131127725, "step": 6103, "time_per_iteration": 2.583207130432129 }, { "auxiliary_loss_clip": 0.01110598, "auxiliary_loss_mlp": 0.01029943, "balance_loss_clip": 1.04576612, "balance_loss_mlp": 1.0223372, "epoch": 0.7339626044610112, "flos": 21687675217920.0, "grad_norm": 3.527598405624068, "language_loss": 0.76222378, "learning_rate": 6.974647747936109e-07, "loss": 0.78362918, "num_input_tokens_seen": 131146110, "step": 6104, "time_per_iteration": 2.7479138374328613 }, { "auxiliary_loss_clip": 0.01172436, "auxiliary_loss_mlp": 0.00711688, "balance_loss_clip": 1.05213201, "balance_loss_mlp": 1.00085378, "epoch": 0.7340828473516503, "flos": 15268248282240.0, "grad_norm": 1.9140791821150844, "language_loss": 0.82772893, "learning_rate": 6.968737524211039e-07, "loss": 0.84657013, "num_input_tokens_seen": 131162920, "step": 6105, "time_per_iteration": 2.597656726837158 }, { "auxiliary_loss_clip": 0.01154715, "auxiliary_loss_mlp": 0.0102727, "balance_loss_clip": 1.05049467, "balance_loss_mlp": 1.02007008, "epoch": 0.7342030902422895, "flos": 22930112701440.0, "grad_norm": 2.378732546526946, "language_loss": 0.79953313, "learning_rate": 6.962829277318132e-07, "loss": 0.82135296, "num_input_tokens_seen": 131182515, "step": 6106, "time_per_iteration": 2.6496565341949463 }, { "auxiliary_loss_clip": 0.01159142, "auxiliary_loss_mlp": 0.01030912, "balance_loss_clip": 1.05206847, "balance_loss_mlp": 1.02380157, "epoch": 0.7343233331329285, "flos": 25847890381440.0, "grad_norm": 2.030960975334753, "language_loss": 0.83635366, "learning_rate": 6.956923008153652e-07, "loss": 0.85825413, "num_input_tokens_seen": 131202280, "step": 6107, "time_per_iteration": 2.6339619159698486 }, { "auxiliary_loss_clip": 0.01158438, "auxiliary_loss_mlp": 0.01024749, "balance_loss_clip": 1.04974091, "balance_loss_mlp": 1.01779008, "epoch": 0.7344435760235676, "flos": 18478985287680.0, "grad_norm": 2.7289912422040845, "language_loss": 0.84771007, "learning_rate": 6.951018717613593e-07, "loss": 0.869542, "num_input_tokens_seen": 131221295, "step": 6108, "time_per_iteration": 2.688246011734009 }, { "auxiliary_loss_clip": 0.01155837, "auxiliary_loss_mlp": 0.01025499, "balance_loss_clip": 1.05103064, "balance_loss_mlp": 1.01814079, "epoch": 0.7345638189142067, "flos": 17640040256640.0, "grad_norm": 2.2511883773209527, "language_loss": 0.7846697, "learning_rate": 6.945116406593614e-07, "loss": 0.80648303, "num_input_tokens_seen": 131240150, "step": 6109, "time_per_iteration": 2.570456027984619 }, { "auxiliary_loss_clip": 0.01113546, "auxiliary_loss_mlp": 0.01026096, "balance_loss_clip": 1.0474304, "balance_loss_mlp": 1.01877105, "epoch": 0.7346840618048458, "flos": 20260225756800.0, "grad_norm": 4.269016740298267, "language_loss": 0.74365199, "learning_rate": 6.939216075989089e-07, "loss": 0.76504844, "num_input_tokens_seen": 131258080, "step": 6110, "time_per_iteration": 2.758150100708008 }, { "auxiliary_loss_clip": 0.01138649, "auxiliary_loss_mlp": 0.01024213, "balance_loss_clip": 1.0469799, "balance_loss_mlp": 1.01714396, "epoch": 0.7348043046954849, "flos": 29023183641600.0, "grad_norm": 1.7370510742476521, "language_loss": 0.65956569, "learning_rate": 6.933317726695109e-07, "loss": 0.68119437, "num_input_tokens_seen": 131279310, "step": 6111, "time_per_iteration": 2.6812314987182617 }, { "auxiliary_loss_clip": 0.01123429, "auxiliary_loss_mlp": 0.01021268, "balance_loss_clip": 1.04719067, "balance_loss_mlp": 1.01413298, "epoch": 0.734924547586124, "flos": 17931203902080.0, "grad_norm": 3.8671260715574074, "language_loss": 0.80027395, "learning_rate": 6.92742135960644e-07, "loss": 0.8217209, "num_input_tokens_seen": 131297010, "step": 6112, "time_per_iteration": 2.652750253677368 }, { "auxiliary_loss_clip": 0.01069652, "auxiliary_loss_mlp": 0.0100248, "balance_loss_clip": 1.02736056, "balance_loss_mlp": 1.00152016, "epoch": 0.7350447904767631, "flos": 63588319850880.0, "grad_norm": 0.8115426290636046, "language_loss": 0.55652034, "learning_rate": 6.921526975617556e-07, "loss": 0.57724166, "num_input_tokens_seen": 131356470, "step": 6113, "time_per_iteration": 3.220627784729004 }, { "auxiliary_loss_clip": 0.01142159, "auxiliary_loss_mlp": 0.01029544, "balance_loss_clip": 1.04676974, "balance_loss_mlp": 1.02157736, "epoch": 0.7351650333674021, "flos": 21580015178880.0, "grad_norm": 2.5363741195702687, "language_loss": 0.75196236, "learning_rate": 6.915634575622631e-07, "loss": 0.77367938, "num_input_tokens_seen": 131374985, "step": 6114, "time_per_iteration": 2.6024506092071533 }, { "auxiliary_loss_clip": 0.01170411, "auxiliary_loss_mlp": 0.01028162, "balance_loss_clip": 1.05029583, "balance_loss_mlp": 1.02097929, "epoch": 0.7352852762580413, "flos": 18186349184640.0, "grad_norm": 2.285396984028594, "language_loss": 0.70957899, "learning_rate": 6.909744160515532e-07, "loss": 0.7315647, "num_input_tokens_seen": 131393125, "step": 6115, "time_per_iteration": 4.456208229064941 }, { "auxiliary_loss_clip": 0.01138583, "auxiliary_loss_mlp": 0.0102453, "balance_loss_clip": 1.04945409, "balance_loss_mlp": 1.01679039, "epoch": 0.7354055191486804, "flos": 38910073063680.0, "grad_norm": 3.2004570592620074, "language_loss": 0.69518113, "learning_rate": 6.903855731189849e-07, "loss": 0.71681225, "num_input_tokens_seen": 131415760, "step": 6116, "time_per_iteration": 3.679591655731201 }, { "auxiliary_loss_clip": 0.01149028, "auxiliary_loss_mlp": 0.01028731, "balance_loss_clip": 1.05029356, "balance_loss_mlp": 1.02165604, "epoch": 0.7355257620393194, "flos": 16289978647680.0, "grad_norm": 2.1956175320654205, "language_loss": 0.81805575, "learning_rate": 6.897969288538825e-07, "loss": 0.83983338, "num_input_tokens_seen": 131433705, "step": 6117, "time_per_iteration": 2.6147379875183105 }, { "auxiliary_loss_clip": 0.01136192, "auxiliary_loss_mlp": 0.010286, "balance_loss_clip": 1.04795337, "balance_loss_mlp": 1.0214299, "epoch": 0.7356460049299585, "flos": 18114240631680.0, "grad_norm": 1.9940374587102327, "language_loss": 0.81397432, "learning_rate": 6.892084833455452e-07, "loss": 0.83562225, "num_input_tokens_seen": 131453275, "step": 6118, "time_per_iteration": 3.576677083969116 }, { "auxiliary_loss_clip": 0.0115251, "auxiliary_loss_mlp": 0.01023571, "balance_loss_clip": 1.04944241, "balance_loss_mlp": 1.01659417, "epoch": 0.7357662478205976, "flos": 21325193118720.0, "grad_norm": 2.417042408203778, "language_loss": 0.84365821, "learning_rate": 6.886202366832384e-07, "loss": 0.86541903, "num_input_tokens_seen": 131474960, "step": 6119, "time_per_iteration": 2.657475709915161 }, { "auxiliary_loss_clip": 0.01107868, "auxiliary_loss_mlp": 0.01027389, "balance_loss_clip": 1.04680717, "balance_loss_mlp": 1.02002192, "epoch": 0.7358864907112367, "flos": 14246841139200.0, "grad_norm": 1.8978337619785495, "language_loss": 0.73652053, "learning_rate": 6.880321889561987e-07, "loss": 0.75787318, "num_input_tokens_seen": 131492935, "step": 6120, "time_per_iteration": 2.6802146434783936 }, { "auxiliary_loss_clip": 0.01120475, "auxiliary_loss_mlp": 0.01023653, "balance_loss_clip": 1.04657888, "balance_loss_mlp": 1.01506996, "epoch": 0.7360067336018757, "flos": 22309684058880.0, "grad_norm": 2.116134779410906, "language_loss": 0.65380859, "learning_rate": 6.874443402536338e-07, "loss": 0.67524993, "num_input_tokens_seen": 131512025, "step": 6121, "time_per_iteration": 2.7077584266662598 }, { "auxiliary_loss_clip": 0.01142012, "auxiliary_loss_mlp": 0.01034559, "balance_loss_clip": 1.04881835, "balance_loss_mlp": 1.02702522, "epoch": 0.7361269764925149, "flos": 25554607833600.0, "grad_norm": 3.325526214115959, "language_loss": 0.80081636, "learning_rate": 6.868566906647177e-07, "loss": 0.82258201, "num_input_tokens_seen": 131532975, "step": 6122, "time_per_iteration": 2.6959035396575928 }, { "auxiliary_loss_clip": 0.01157824, "auxiliary_loss_mlp": 0.01025798, "balance_loss_clip": 1.05010569, "balance_loss_mlp": 1.01883626, "epoch": 0.736247219383154, "flos": 20376505059840.0, "grad_norm": 1.7529997414142635, "language_loss": 0.83644021, "learning_rate": 6.862692402785984e-07, "loss": 0.85827637, "num_input_tokens_seen": 131553225, "step": 6123, "time_per_iteration": 2.6662886142730713 }, { "auxiliary_loss_clip": 0.01050535, "auxiliary_loss_mlp": 0.01000914, "balance_loss_clip": 1.038715, "balance_loss_mlp": 1.00000215, "epoch": 0.736367462273793, "flos": 70339525735680.0, "grad_norm": 0.7118862385161578, "language_loss": 0.49614406, "learning_rate": 6.856819891843899e-07, "loss": 0.51665854, "num_input_tokens_seen": 131617930, "step": 6124, "time_per_iteration": 3.3225624561309814 }, { "auxiliary_loss_clip": 0.01093362, "auxiliary_loss_mlp": 0.01028444, "balance_loss_clip": 1.0457046, "balance_loss_mlp": 1.02111232, "epoch": 0.7364877051644322, "flos": 22412711243520.0, "grad_norm": 2.1175261271372525, "language_loss": 0.72050345, "learning_rate": 6.8509493747118e-07, "loss": 0.74172151, "num_input_tokens_seen": 131636740, "step": 6125, "time_per_iteration": 2.7901673316955566 }, { "auxiliary_loss_clip": 0.01175903, "auxiliary_loss_mlp": 0.01022706, "balance_loss_clip": 1.05505323, "balance_loss_mlp": 1.01488566, "epoch": 0.7366079480550712, "flos": 12130266274560.0, "grad_norm": 5.099530595141654, "language_loss": 0.88491476, "learning_rate": 6.845080852280221e-07, "loss": 0.90690088, "num_input_tokens_seen": 131653810, "step": 6126, "time_per_iteration": 2.551908493041992 }, { "auxiliary_loss_clip": 0.01124964, "auxiliary_loss_mlp": 0.01027581, "balance_loss_clip": 1.04646254, "balance_loss_mlp": 1.02051175, "epoch": 0.7367281909457103, "flos": 15049336844160.0, "grad_norm": 2.656808957182529, "language_loss": 0.74895966, "learning_rate": 6.839214325439409e-07, "loss": 0.77048504, "num_input_tokens_seen": 131671505, "step": 6127, "time_per_iteration": 2.668955087661743 }, { "auxiliary_loss_clip": 0.01133152, "auxiliary_loss_mlp": 0.01024741, "balance_loss_clip": 1.04814792, "balance_loss_mlp": 1.01753736, "epoch": 0.7368484338363495, "flos": 23510752053120.0, "grad_norm": 1.7321881047191061, "language_loss": 0.71841151, "learning_rate": 6.833349795079327e-07, "loss": 0.73999047, "num_input_tokens_seen": 131690615, "step": 6128, "time_per_iteration": 2.63016676902771 }, { "auxiliary_loss_clip": 0.01125255, "auxiliary_loss_mlp": 0.0102662, "balance_loss_clip": 1.04786229, "balance_loss_mlp": 1.0188117, "epoch": 0.7369686767269885, "flos": 27417833095680.0, "grad_norm": 1.9164503523040135, "language_loss": 0.68952793, "learning_rate": 6.827487262089613e-07, "loss": 0.7110467, "num_input_tokens_seen": 131711120, "step": 6129, "time_per_iteration": 2.8000762462615967 }, { "auxiliary_loss_clip": 0.01053178, "auxiliary_loss_mlp": 0.01000683, "balance_loss_clip": 1.02700424, "balance_loss_mlp": 0.99970001, "epoch": 0.7370889196176276, "flos": 70293343824000.0, "grad_norm": 0.8825999002335698, "language_loss": 0.56790054, "learning_rate": 6.821626727359606e-07, "loss": 0.58843923, "num_input_tokens_seen": 131776680, "step": 6130, "time_per_iteration": 3.3074803352355957 }, { "auxiliary_loss_clip": 0.01140781, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.05071926, "balance_loss_mlp": 1.02187324, "epoch": 0.7372091625082667, "flos": 18040839189120.0, "grad_norm": 2.313828340544764, "language_loss": 0.77462876, "learning_rate": 6.815768191778348e-07, "loss": 0.79633242, "num_input_tokens_seen": 131794760, "step": 6131, "time_per_iteration": 2.597351312637329 }, { "auxiliary_loss_clip": 0.01150711, "auxiliary_loss_mlp": 0.01028071, "balance_loss_clip": 1.0480572, "balance_loss_mlp": 1.0205673, "epoch": 0.7373294053989058, "flos": 33726331854720.0, "grad_norm": 1.7950157617840399, "language_loss": 0.73087537, "learning_rate": 6.809911656234569e-07, "loss": 0.75266314, "num_input_tokens_seen": 131816735, "step": 6132, "time_per_iteration": 2.768094062805176 }, { "auxiliary_loss_clip": 0.01126686, "auxiliary_loss_mlp": 0.01025817, "balance_loss_clip": 1.04479444, "balance_loss_mlp": 1.01834571, "epoch": 0.7374496482895448, "flos": 21506326427520.0, "grad_norm": 1.9673634555526123, "language_loss": 0.78267765, "learning_rate": 6.804057121616707e-07, "loss": 0.80420274, "num_input_tokens_seen": 131834940, "step": 6133, "time_per_iteration": 2.6808905601501465 }, { "auxiliary_loss_clip": 0.01158243, "auxiliary_loss_mlp": 0.01025255, "balance_loss_clip": 1.04992795, "balance_loss_mlp": 1.01784635, "epoch": 0.737569891180184, "flos": 24936908624640.0, "grad_norm": 3.699931310454921, "language_loss": 0.72290361, "learning_rate": 6.798204588812888e-07, "loss": 0.74473858, "num_input_tokens_seen": 131854355, "step": 6134, "time_per_iteration": 2.756700038909912 }, { "auxiliary_loss_clip": 0.01084629, "auxiliary_loss_mlp": 0.00711761, "balance_loss_clip": 1.04176092, "balance_loss_mlp": 1.00076652, "epoch": 0.7376901340708231, "flos": 20664544222080.0, "grad_norm": 1.7514254449703004, "language_loss": 0.75640756, "learning_rate": 6.792354058710937e-07, "loss": 0.7743715, "num_input_tokens_seen": 131871825, "step": 6135, "time_per_iteration": 2.759873390197754 }, { "auxiliary_loss_clip": 0.01166475, "auxiliary_loss_mlp": 0.01027296, "balance_loss_clip": 1.05095792, "balance_loss_mlp": 1.02021539, "epoch": 0.7378103769614621, "flos": 23805794367360.0, "grad_norm": 1.8678824583163078, "language_loss": 0.65329641, "learning_rate": 6.786505532198374e-07, "loss": 0.67523408, "num_input_tokens_seen": 131890770, "step": 6136, "time_per_iteration": 2.5932505130767822 }, { "auxiliary_loss_clip": 0.01171413, "auxiliary_loss_mlp": 0.01025811, "balance_loss_clip": 1.05121589, "balance_loss_mlp": 1.01846445, "epoch": 0.7379306198521013, "flos": 22237216369920.0, "grad_norm": 2.1370484543810626, "language_loss": 0.85529649, "learning_rate": 6.780659010162411e-07, "loss": 0.87726867, "num_input_tokens_seen": 131909720, "step": 6137, "time_per_iteration": 2.5898568630218506 }, { "auxiliary_loss_clip": 0.01128239, "auxiliary_loss_mlp": 0.01025472, "balance_loss_clip": 1.0479449, "balance_loss_mlp": 1.01875424, "epoch": 0.7380508627427403, "flos": 14903108576640.0, "grad_norm": 2.1261238449786353, "language_loss": 0.83108366, "learning_rate": 6.774814493489975e-07, "loss": 0.85262078, "num_input_tokens_seen": 131927395, "step": 6138, "time_per_iteration": 2.7238659858703613 }, { "auxiliary_loss_clip": 0.01152715, "auxiliary_loss_mlp": 0.01024642, "balance_loss_clip": 1.04877758, "balance_loss_mlp": 1.01742363, "epoch": 0.7381711056333794, "flos": 21685843624320.0, "grad_norm": 2.0787217945346637, "language_loss": 0.66095817, "learning_rate": 6.768971983067655e-07, "loss": 0.68273175, "num_input_tokens_seen": 131947725, "step": 6139, "time_per_iteration": 2.635550022125244 }, { "auxiliary_loss_clip": 0.01080614, "auxiliary_loss_mlp": 0.01001333, "balance_loss_clip": 1.02736115, "balance_loss_mlp": 1.0003916, "epoch": 0.7382913485240186, "flos": 52404263596800.0, "grad_norm": 1.0096397768575147, "language_loss": 0.67774987, "learning_rate": 6.763131479781772e-07, "loss": 0.69856942, "num_input_tokens_seen": 131997485, "step": 6140, "time_per_iteration": 3.0315678119659424 }, { "auxiliary_loss_clip": 0.01130631, "auxiliary_loss_mlp": 0.01023889, "balance_loss_clip": 1.04729414, "balance_loss_mlp": 1.01663542, "epoch": 0.7384115914146576, "flos": 21798818876160.0, "grad_norm": 2.053435603424915, "language_loss": 0.76443517, "learning_rate": 6.757292984518316e-07, "loss": 0.78598034, "num_input_tokens_seen": 132016885, "step": 6141, "time_per_iteration": 3.6210129261016846 }, { "auxiliary_loss_clip": 0.01067198, "auxiliary_loss_mlp": 0.01000498, "balance_loss_clip": 1.02573466, "balance_loss_mlp": 0.99956846, "epoch": 0.7385318343052967, "flos": 61494331662720.0, "grad_norm": 0.7397500898051346, "language_loss": 0.56378311, "learning_rate": 6.751456498162981e-07, "loss": 0.58446008, "num_input_tokens_seen": 132075920, "step": 6142, "time_per_iteration": 3.9610543251037598 }, { "auxiliary_loss_clip": 0.01151019, "auxiliary_loss_mlp": 0.01022683, "balance_loss_clip": 1.04533195, "balance_loss_mlp": 1.01562047, "epoch": 0.7386520771959358, "flos": 17013757697280.0, "grad_norm": 1.8951995227792202, "language_loss": 0.85713249, "learning_rate": 6.745622021601174e-07, "loss": 0.87886953, "num_input_tokens_seen": 132092945, "step": 6143, "time_per_iteration": 2.5641989707946777 }, { "auxiliary_loss_clip": 0.01124964, "auxiliary_loss_mlp": 0.01024142, "balance_loss_clip": 1.04558241, "balance_loss_mlp": 1.01699281, "epoch": 0.7387723200865749, "flos": 18770759464320.0, "grad_norm": 1.813275152606135, "language_loss": 0.6980449, "learning_rate": 6.739789555717954e-07, "loss": 0.71953595, "num_input_tokens_seen": 132109920, "step": 6144, "time_per_iteration": 2.742860794067383 }, { "auxiliary_loss_clip": 0.01170882, "auxiliary_loss_mlp": 0.01023996, "balance_loss_clip": 1.0506804, "balance_loss_mlp": 1.01708734, "epoch": 0.738892562977214, "flos": 22525542840960.0, "grad_norm": 2.514111215235625, "language_loss": 0.77420199, "learning_rate": 6.733959101398124e-07, "loss": 0.7961508, "num_input_tokens_seen": 132128050, "step": 6145, "time_per_iteration": 3.519660711288452 }, { "auxiliary_loss_clip": 0.01136509, "auxiliary_loss_mlp": 0.01026921, "balance_loss_clip": 1.04606676, "balance_loss_mlp": 1.019804, "epoch": 0.7390128058678531, "flos": 21501478091520.0, "grad_norm": 1.6538245563860392, "language_loss": 0.81715727, "learning_rate": 6.728130659526143e-07, "loss": 0.83879155, "num_input_tokens_seen": 132145860, "step": 6146, "time_per_iteration": 2.688554048538208 }, { "auxiliary_loss_clip": 0.01141917, "auxiliary_loss_mlp": 0.01027733, "balance_loss_clip": 1.04870915, "balance_loss_mlp": 1.02061963, "epoch": 0.7391330487584922, "flos": 25776176878080.0, "grad_norm": 3.5093367582800363, "language_loss": 0.71782327, "learning_rate": 6.7223042309862e-07, "loss": 0.73951977, "num_input_tokens_seen": 132166060, "step": 6147, "time_per_iteration": 2.6992621421813965 }, { "auxiliary_loss_clip": 0.01151179, "auxiliary_loss_mlp": 0.01029006, "balance_loss_clip": 1.04661393, "balance_loss_mlp": 1.02193666, "epoch": 0.7392532916491312, "flos": 28366736636160.0, "grad_norm": 2.8248568029340064, "language_loss": 0.7369138, "learning_rate": 6.716479816662144e-07, "loss": 0.75871563, "num_input_tokens_seen": 132187790, "step": 6148, "time_per_iteration": 2.7046308517456055 }, { "auxiliary_loss_clip": 0.01143051, "auxiliary_loss_mlp": 0.01026754, "balance_loss_clip": 1.0475291, "balance_loss_mlp": 1.02005982, "epoch": 0.7393735345397703, "flos": 23585877348480.0, "grad_norm": 1.943207899947194, "language_loss": 0.72939122, "learning_rate": 6.710657417437531e-07, "loss": 0.75108927, "num_input_tokens_seen": 132207495, "step": 6149, "time_per_iteration": 2.713555335998535 }, { "auxiliary_loss_clip": 0.01139047, "auxiliary_loss_mlp": 0.01024554, "balance_loss_clip": 1.04791439, "balance_loss_mlp": 1.01735353, "epoch": 0.7394937774304094, "flos": 19974772373760.0, "grad_norm": 2.1801382327759176, "language_loss": 0.80176395, "learning_rate": 6.704837034195628e-07, "loss": 0.82340002, "num_input_tokens_seen": 132225960, "step": 6150, "time_per_iteration": 2.704359769821167 }, { "auxiliary_loss_clip": 0.01149589, "auxiliary_loss_mlp": 0.01040405, "balance_loss_clip": 1.04889894, "balance_loss_mlp": 1.0325073, "epoch": 0.7396140203210485, "flos": 23478037741440.0, "grad_norm": 1.993606811454867, "language_loss": 0.85069782, "learning_rate": 6.699018667819376e-07, "loss": 0.87259769, "num_input_tokens_seen": 132245360, "step": 6151, "time_per_iteration": 2.685176372528076 }, { "auxiliary_loss_clip": 0.01149947, "auxiliary_loss_mlp": 0.01022625, "balance_loss_clip": 1.04675698, "balance_loss_mlp": 1.01556206, "epoch": 0.7397342632116876, "flos": 25555433846400.0, "grad_norm": 1.8809447854107078, "language_loss": 0.72923195, "learning_rate": 6.693202319191415e-07, "loss": 0.75095773, "num_input_tokens_seen": 132267095, "step": 6152, "time_per_iteration": 2.6498591899871826 }, { "auxiliary_loss_clip": 0.01171661, "auxiliary_loss_mlp": 0.0103033, "balance_loss_clip": 1.05482459, "balance_loss_mlp": 1.02299547, "epoch": 0.7398545061023267, "flos": 24755021130240.0, "grad_norm": 2.119538945224614, "language_loss": 0.75053728, "learning_rate": 6.687387989194084e-07, "loss": 0.77255714, "num_input_tokens_seen": 132286610, "step": 6153, "time_per_iteration": 2.779646873474121 }, { "auxiliary_loss_clip": 0.01135249, "auxiliary_loss_mlp": 0.01028205, "balance_loss_clip": 1.04867363, "balance_loss_mlp": 1.0207243, "epoch": 0.7399747489929658, "flos": 16508602776960.0, "grad_norm": 1.957534881265153, "language_loss": 0.79508686, "learning_rate": 6.681575678709404e-07, "loss": 0.81672132, "num_input_tokens_seen": 132305300, "step": 6154, "time_per_iteration": 2.6756951808929443 }, { "auxiliary_loss_clip": 0.01154855, "auxiliary_loss_mlp": 0.01027407, "balance_loss_clip": 1.0498811, "balance_loss_mlp": 1.02034092, "epoch": 0.7400949918836048, "flos": 24097065753600.0, "grad_norm": 3.9000725860306926, "language_loss": 0.71129924, "learning_rate": 6.67576538861911e-07, "loss": 0.73312187, "num_input_tokens_seen": 132323875, "step": 6155, "time_per_iteration": 2.69706130027771 }, { "auxiliary_loss_clip": 0.01135343, "auxiliary_loss_mlp": 0.01023951, "balance_loss_clip": 1.04760373, "balance_loss_mlp": 1.01716542, "epoch": 0.740215234774244, "flos": 21802517976960.0, "grad_norm": 1.5161123672676171, "language_loss": 0.82197332, "learning_rate": 6.669957119804612e-07, "loss": 0.84356624, "num_input_tokens_seen": 132345510, "step": 6156, "time_per_iteration": 2.6418356895446777 }, { "auxiliary_loss_clip": 0.01147461, "auxiliary_loss_mlp": 0.01021106, "balance_loss_clip": 1.0491631, "balance_loss_mlp": 1.01364315, "epoch": 0.7403354776648831, "flos": 18733196816640.0, "grad_norm": 2.8992009990916245, "language_loss": 0.72825092, "learning_rate": 6.66415087314702e-07, "loss": 0.74993658, "num_input_tokens_seen": 132360465, "step": 6157, "time_per_iteration": 2.688110589981079 }, { "auxiliary_loss_clip": 0.0114193, "auxiliary_loss_mlp": 0.01024018, "balance_loss_clip": 1.04873872, "balance_loss_mlp": 1.0167582, "epoch": 0.7404557205555221, "flos": 16909581277440.0, "grad_norm": 2.1242042162085273, "language_loss": 0.73319662, "learning_rate": 6.65834664952714e-07, "loss": 0.75485611, "num_input_tokens_seen": 132377915, "step": 6158, "time_per_iteration": 2.6207404136657715 }, { "auxiliary_loss_clip": 0.01124569, "auxiliary_loss_mlp": 0.01021469, "balance_loss_clip": 1.04506588, "balance_loss_mlp": 1.01452494, "epoch": 0.7405759634461613, "flos": 21214408596480.0, "grad_norm": 1.778054753495674, "language_loss": 0.75812042, "learning_rate": 6.652544449825457e-07, "loss": 0.77958083, "num_input_tokens_seen": 132398170, "step": 6159, "time_per_iteration": 2.822023391723633 }, { "auxiliary_loss_clip": 0.01145262, "auxiliary_loss_mlp": 0.01027981, "balance_loss_clip": 1.04718721, "balance_loss_mlp": 1.02060807, "epoch": 0.7406962063368003, "flos": 20480106862080.0, "grad_norm": 1.7642968831389925, "language_loss": 0.76432991, "learning_rate": 6.646744274922182e-07, "loss": 0.78606236, "num_input_tokens_seen": 132416615, "step": 6160, "time_per_iteration": 2.6706931591033936 }, { "auxiliary_loss_clip": 0.01140007, "auxiliary_loss_mlp": 0.01025532, "balance_loss_clip": 1.04704428, "balance_loss_mlp": 1.01861167, "epoch": 0.7408164492274394, "flos": 19791915212160.0, "grad_norm": 5.542252334397595, "language_loss": 0.75078505, "learning_rate": 6.640946125697171e-07, "loss": 0.77244043, "num_input_tokens_seen": 132434145, "step": 6161, "time_per_iteration": 2.6771981716156006 }, { "auxiliary_loss_clip": 0.01157172, "auxiliary_loss_mlp": 0.01027952, "balance_loss_clip": 1.04876459, "balance_loss_mlp": 1.02060294, "epoch": 0.7409366921180786, "flos": 29204855654400.0, "grad_norm": 3.9320682118943284, "language_loss": 0.75725543, "learning_rate": 6.635150003030017e-07, "loss": 0.77910662, "num_input_tokens_seen": 132452670, "step": 6162, "time_per_iteration": 2.7253541946411133 }, { "auxiliary_loss_clip": 0.01107226, "auxiliary_loss_mlp": 0.01030989, "balance_loss_clip": 1.04124224, "balance_loss_mlp": 1.02383685, "epoch": 0.7410569350087176, "flos": 22930004960640.0, "grad_norm": 2.302475852316833, "language_loss": 0.86019897, "learning_rate": 6.629355907799981e-07, "loss": 0.88158113, "num_input_tokens_seen": 132472475, "step": 6163, "time_per_iteration": 2.749242067337036 }, { "auxiliary_loss_clip": 0.01157889, "auxiliary_loss_mlp": 0.01030563, "balance_loss_clip": 1.04916775, "balance_loss_mlp": 1.02310658, "epoch": 0.7411771778993567, "flos": 30440397726720.0, "grad_norm": 2.053510997499577, "language_loss": 0.69298589, "learning_rate": 6.623563840886015e-07, "loss": 0.71487045, "num_input_tokens_seen": 132493400, "step": 6164, "time_per_iteration": 2.733980894088745 }, { "auxiliary_loss_clip": 0.01150284, "auxiliary_loss_mlp": 0.010275, "balance_loss_clip": 1.04775548, "balance_loss_mlp": 1.02063918, "epoch": 0.7412974207899958, "flos": 20522050968960.0, "grad_norm": 2.5229812653065036, "language_loss": 0.69437218, "learning_rate": 6.617773803166795e-07, "loss": 0.71615005, "num_input_tokens_seen": 132511725, "step": 6165, "time_per_iteration": 2.63637113571167 }, { "auxiliary_loss_clip": 0.01143844, "auxiliary_loss_mlp": 0.00711695, "balance_loss_clip": 1.04955268, "balance_loss_mlp": 1.00080299, "epoch": 0.7414176636806349, "flos": 22090700793600.0, "grad_norm": 3.066547296055129, "language_loss": 0.8238802, "learning_rate": 6.611985795520634e-07, "loss": 0.8424356, "num_input_tokens_seen": 132530270, "step": 6166, "time_per_iteration": 2.727961778640747 }, { "auxiliary_loss_clip": 0.01133916, "auxiliary_loss_mlp": 0.01024341, "balance_loss_clip": 1.04915655, "balance_loss_mlp": 1.01693249, "epoch": 0.7415379065712739, "flos": 25155245445120.0, "grad_norm": 2.2769332413161045, "language_loss": 0.77526957, "learning_rate": 6.606199818825588e-07, "loss": 0.79685211, "num_input_tokens_seen": 132550725, "step": 6167, "time_per_iteration": 4.613282203674316 }, { "auxiliary_loss_clip": 0.01140991, "auxiliary_loss_mlp": 0.01024401, "balance_loss_clip": 1.04467821, "balance_loss_mlp": 1.01754069, "epoch": 0.7416581494619131, "flos": 16871731320960.0, "grad_norm": 3.06317377082726, "language_loss": 0.81718767, "learning_rate": 6.600415873959377e-07, "loss": 0.83884156, "num_input_tokens_seen": 132568600, "step": 6168, "time_per_iteration": 3.5404648780822754 }, { "auxiliary_loss_clip": 0.01088694, "auxiliary_loss_mlp": 0.0071056, "balance_loss_clip": 1.03989446, "balance_loss_mlp": 1.00060856, "epoch": 0.7417783923525522, "flos": 28438881102720.0, "grad_norm": 2.160607006352499, "language_loss": 0.64688158, "learning_rate": 6.594633961799437e-07, "loss": 0.66487414, "num_input_tokens_seen": 132587640, "step": 6169, "time_per_iteration": 2.7733917236328125 }, { "auxiliary_loss_clip": 0.01132679, "auxiliary_loss_mlp": 0.01028179, "balance_loss_clip": 1.04829729, "balance_loss_mlp": 1.02155077, "epoch": 0.7418986352431912, "flos": 20084299920000.0, "grad_norm": 1.668020906444252, "language_loss": 0.81584531, "learning_rate": 6.588854083222857e-07, "loss": 0.8374539, "num_input_tokens_seen": 132607075, "step": 6170, "time_per_iteration": 3.5456764698028564 }, { "auxiliary_loss_clip": 0.01141204, "auxiliary_loss_mlp": 0.0102547, "balance_loss_clip": 1.04734194, "balance_loss_mlp": 1.01819873, "epoch": 0.7420188781338304, "flos": 18259571059200.0, "grad_norm": 2.5209583104084534, "language_loss": 0.80382961, "learning_rate": 6.583076239106444e-07, "loss": 0.82549632, "num_input_tokens_seen": 132625580, "step": 6171, "time_per_iteration": 2.6388983726501465 }, { "auxiliary_loss_clip": 0.01144749, "auxiliary_loss_mlp": 0.01023941, "balance_loss_clip": 1.04833889, "balance_loss_mlp": 1.01677012, "epoch": 0.7421391210244694, "flos": 13771994319360.0, "grad_norm": 3.652288963979642, "language_loss": 0.75498092, "learning_rate": 6.577300430326707e-07, "loss": 0.77666783, "num_input_tokens_seen": 132640525, "step": 6172, "time_per_iteration": 2.6577584743499756 }, { "auxiliary_loss_clip": 0.01117824, "auxiliary_loss_mlp": 0.01028147, "balance_loss_clip": 1.045573, "balance_loss_mlp": 1.02100098, "epoch": 0.7422593639151085, "flos": 15961683317760.0, "grad_norm": 2.71975499879491, "language_loss": 0.72474545, "learning_rate": 6.571526657759821e-07, "loss": 0.74620521, "num_input_tokens_seen": 132656265, "step": 6173, "time_per_iteration": 2.6472928524017334 }, { "auxiliary_loss_clip": 0.01149366, "auxiliary_loss_mlp": 0.01024211, "balance_loss_clip": 1.04721439, "balance_loss_mlp": 1.01715052, "epoch": 0.7423796068057477, "flos": 30114400867200.0, "grad_norm": 1.9589574558882084, "language_loss": 0.70736074, "learning_rate": 6.565754922281663e-07, "loss": 0.72909647, "num_input_tokens_seen": 132678510, "step": 6174, "time_per_iteration": 2.7141613960266113 }, { "auxiliary_loss_clip": 0.01138862, "auxiliary_loss_mlp": 0.01025299, "balance_loss_clip": 1.04702413, "balance_loss_mlp": 1.0184418, "epoch": 0.7424998496963867, "flos": 20521907314560.0, "grad_norm": 1.8126696392647887, "language_loss": 0.78481948, "learning_rate": 6.559985224767801e-07, "loss": 0.80646104, "num_input_tokens_seen": 132696385, "step": 6175, "time_per_iteration": 2.6166458129882812 }, { "auxiliary_loss_clip": 0.0112929, "auxiliary_loss_mlp": 0.0102562, "balance_loss_clip": 1.0470686, "balance_loss_mlp": 1.01828289, "epoch": 0.7426200925870258, "flos": 21871573873920.0, "grad_norm": 2.399051384695823, "language_loss": 0.75454438, "learning_rate": 6.55421756609349e-07, "loss": 0.77609348, "num_input_tokens_seen": 132714640, "step": 6176, "time_per_iteration": 2.709036111831665 }, { "auxiliary_loss_clip": 0.01152435, "auxiliary_loss_mlp": 0.0102187, "balance_loss_clip": 1.05041766, "balance_loss_mlp": 1.01464283, "epoch": 0.7427403354776649, "flos": 26432049265920.0, "grad_norm": 2.430913733187067, "language_loss": 0.7912367, "learning_rate": 6.54845194713369e-07, "loss": 0.81297976, "num_input_tokens_seen": 132735590, "step": 6177, "time_per_iteration": 2.6615357398986816 }, { "auxiliary_loss_clip": 0.01150259, "auxiliary_loss_mlp": 0.0102698, "balance_loss_clip": 1.04868269, "balance_loss_mlp": 1.01957083, "epoch": 0.742860578368304, "flos": 19898390102400.0, "grad_norm": 2.541593711676108, "language_loss": 0.79841536, "learning_rate": 6.542688368763034e-07, "loss": 0.82018781, "num_input_tokens_seen": 132753995, "step": 6178, "time_per_iteration": 2.6261146068573 }, { "auxiliary_loss_clip": 0.01153611, "auxiliary_loss_mlp": 0.01024862, "balance_loss_clip": 1.05035448, "balance_loss_mlp": 1.01764393, "epoch": 0.742980821258943, "flos": 24827201510400.0, "grad_norm": 2.057344590334953, "language_loss": 0.77047789, "learning_rate": 6.536926831855854e-07, "loss": 0.79226267, "num_input_tokens_seen": 132773160, "step": 6179, "time_per_iteration": 2.6337599754333496 }, { "auxiliary_loss_clip": 0.01136145, "auxiliary_loss_mlp": 0.010231, "balance_loss_clip": 1.04763496, "balance_loss_mlp": 1.01639175, "epoch": 0.7431010641495821, "flos": 25228646887680.0, "grad_norm": 2.287259141644716, "language_loss": 0.73242503, "learning_rate": 6.531167337286165e-07, "loss": 0.75401747, "num_input_tokens_seen": 132793180, "step": 6180, "time_per_iteration": 2.819960117340088 }, { "auxiliary_loss_clip": 0.01137871, "auxiliary_loss_mlp": 0.01024466, "balance_loss_clip": 1.05007911, "balance_loss_mlp": 1.01693153, "epoch": 0.7432213070402213, "flos": 21762369550080.0, "grad_norm": 13.82546207432684, "language_loss": 0.79975897, "learning_rate": 6.52540988592768e-07, "loss": 0.82138234, "num_input_tokens_seen": 132814200, "step": 6181, "time_per_iteration": 2.6803128719329834 }, { "auxiliary_loss_clip": 0.01142175, "auxiliary_loss_mlp": 0.01025312, "balance_loss_clip": 1.04896724, "balance_loss_mlp": 1.01874959, "epoch": 0.7433415499308603, "flos": 14793832425600.0, "grad_norm": 2.881958832058643, "language_loss": 0.83508956, "learning_rate": 6.519654478653814e-07, "loss": 0.85676444, "num_input_tokens_seen": 132832565, "step": 6182, "time_per_iteration": 2.6649348735809326 }, { "auxiliary_loss_clip": 0.01057127, "auxiliary_loss_mlp": 0.01002346, "balance_loss_clip": 1.02605104, "balance_loss_mlp": 1.00140405, "epoch": 0.7434617928214994, "flos": 67155577297920.0, "grad_norm": 0.7421876695514018, "language_loss": 0.56057221, "learning_rate": 6.51390111633763e-07, "loss": 0.58116698, "num_input_tokens_seen": 132897840, "step": 6183, "time_per_iteration": 3.2654547691345215 }, { "auxiliary_loss_clip": 0.0109226, "auxiliary_loss_mlp": 0.01023067, "balance_loss_clip": 1.04263747, "balance_loss_mlp": 1.01633143, "epoch": 0.7435820357121385, "flos": 27377576928000.0, "grad_norm": 1.6714410047235149, "language_loss": 0.76104569, "learning_rate": 6.508149799851932e-07, "loss": 0.78219891, "num_input_tokens_seen": 132919505, "step": 6184, "time_per_iteration": 2.788416624069214 }, { "auxiliary_loss_clip": 0.0113475, "auxiliary_loss_mlp": 0.01027382, "balance_loss_clip": 1.0473907, "balance_loss_mlp": 1.02041125, "epoch": 0.7437022786027776, "flos": 23987645948160.0, "grad_norm": 3.0258943235670928, "language_loss": 0.61523324, "learning_rate": 6.502400530069183e-07, "loss": 0.63685453, "num_input_tokens_seen": 132939390, "step": 6185, "time_per_iteration": 2.7197322845458984 }, { "auxiliary_loss_clip": 0.01124997, "auxiliary_loss_mlp": 0.01028759, "balance_loss_clip": 1.04746771, "balance_loss_mlp": 1.02079904, "epoch": 0.7438225214934167, "flos": 21866761451520.0, "grad_norm": 2.5321903821334324, "language_loss": 0.68389654, "learning_rate": 6.496653307861535e-07, "loss": 0.70543402, "num_input_tokens_seen": 132960060, "step": 6186, "time_per_iteration": 2.66672682762146 }, { "auxiliary_loss_clip": 0.0115979, "auxiliary_loss_mlp": 0.01023496, "balance_loss_clip": 1.04871202, "balance_loss_mlp": 1.01636112, "epoch": 0.7439427643840558, "flos": 20230097224320.0, "grad_norm": 1.9435836826115624, "language_loss": 0.66016871, "learning_rate": 6.490908134100857e-07, "loss": 0.68200153, "num_input_tokens_seen": 132978525, "step": 6187, "time_per_iteration": 2.646984100341797 }, { "auxiliary_loss_clip": 0.01158841, "auxiliary_loss_mlp": 0.01024892, "balance_loss_clip": 1.04946685, "balance_loss_mlp": 1.0178107, "epoch": 0.7440630072746949, "flos": 20849915335680.0, "grad_norm": 2.790099341628695, "language_loss": 0.69326562, "learning_rate": 6.48516500965866e-07, "loss": 0.71510297, "num_input_tokens_seen": 132998460, "step": 6188, "time_per_iteration": 2.6306567192077637 }, { "auxiliary_loss_clip": 0.01156854, "auxiliary_loss_mlp": 0.01029433, "balance_loss_clip": 1.04631424, "balance_loss_mlp": 1.02224517, "epoch": 0.7441832501653339, "flos": 26503762769280.0, "grad_norm": 1.7325778291899225, "language_loss": 0.81542408, "learning_rate": 6.479423935406192e-07, "loss": 0.83728695, "num_input_tokens_seen": 133018445, "step": 6189, "time_per_iteration": 2.697110891342163 }, { "auxiliary_loss_clip": 0.01044285, "auxiliary_loss_mlp": 0.01000926, "balance_loss_clip": 1.02036846, "balance_loss_mlp": 0.9999302, "epoch": 0.7443034930559731, "flos": 68602848088320.0, "grad_norm": 0.8098312157267468, "language_loss": 0.61914349, "learning_rate": 6.473684912214357e-07, "loss": 0.63959557, "num_input_tokens_seen": 133082005, "step": 6190, "time_per_iteration": 3.3400943279266357 }, { "auxiliary_loss_clip": 0.01154843, "auxiliary_loss_mlp": 0.01028241, "balance_loss_clip": 1.04952204, "balance_loss_mlp": 1.02119589, "epoch": 0.7444237359466122, "flos": 18654982951680.0, "grad_norm": 2.6456319912479445, "language_loss": 0.69556272, "learning_rate": 6.467947940953778e-07, "loss": 0.71739358, "num_input_tokens_seen": 133100530, "step": 6191, "time_per_iteration": 2.601475238800049 }, { "auxiliary_loss_clip": 0.01137346, "auxiliary_loss_mlp": 0.01021618, "balance_loss_clip": 1.04740453, "balance_loss_mlp": 1.01515949, "epoch": 0.7445439788372512, "flos": 22817604326400.0, "grad_norm": 2.3144411327642773, "language_loss": 0.72482997, "learning_rate": 6.462213022494732e-07, "loss": 0.74641955, "num_input_tokens_seen": 133119775, "step": 6192, "time_per_iteration": 3.6855216026306152 }, { "auxiliary_loss_clip": 0.01067717, "auxiliary_loss_mlp": 0.01003194, "balance_loss_clip": 1.02584481, "balance_loss_mlp": 1.0022229, "epoch": 0.7446642217278904, "flos": 67045690615680.0, "grad_norm": 0.768838601445905, "language_loss": 0.6103611, "learning_rate": 6.456480157707201e-07, "loss": 0.63107026, "num_input_tokens_seen": 133184550, "step": 6193, "time_per_iteration": 4.084123611450195 }, { "auxiliary_loss_clip": 0.01114726, "auxiliary_loss_mlp": 0.01025193, "balance_loss_clip": 1.04364681, "balance_loss_mlp": 1.01850533, "epoch": 0.7447844646185294, "flos": 17417465631360.0, "grad_norm": 2.424555601933411, "language_loss": 0.85473013, "learning_rate": 6.450749347460866e-07, "loss": 0.87612933, "num_input_tokens_seen": 133201525, "step": 6194, "time_per_iteration": 2.638737678527832 }, { "auxiliary_loss_clip": 0.0116964, "auxiliary_loss_mlp": 0.01029056, "balance_loss_clip": 1.04945993, "balance_loss_mlp": 1.02156377, "epoch": 0.7449047075091685, "flos": 26615876094720.0, "grad_norm": 2.7282705186876814, "language_loss": 0.79012752, "learning_rate": 6.445020592625083e-07, "loss": 0.81211442, "num_input_tokens_seen": 133222175, "step": 6195, "time_per_iteration": 2.690241813659668 }, { "auxiliary_loss_clip": 0.01169261, "auxiliary_loss_mlp": 0.01033121, "balance_loss_clip": 1.04981017, "balance_loss_mlp": 1.02600992, "epoch": 0.7450249503998077, "flos": 14170458867840.0, "grad_norm": 2.3847627407312233, "language_loss": 0.80064422, "learning_rate": 6.4392938940689e-07, "loss": 0.82266802, "num_input_tokens_seen": 133237590, "step": 6196, "time_per_iteration": 3.469294786453247 }, { "auxiliary_loss_clip": 0.01102935, "auxiliary_loss_mlp": 0.00711906, "balance_loss_clip": 1.04491162, "balance_loss_mlp": 1.00074673, "epoch": 0.7451451932904467, "flos": 19606687752960.0, "grad_norm": 7.56013534564943, "language_loss": 0.71342856, "learning_rate": 6.433569252661049e-07, "loss": 0.73157692, "num_input_tokens_seen": 133255590, "step": 6197, "time_per_iteration": 2.7246687412261963 }, { "auxiliary_loss_clip": 0.01113473, "auxiliary_loss_mlp": 0.01018814, "balance_loss_clip": 1.0429709, "balance_loss_mlp": 1.01207554, "epoch": 0.7452654361810858, "flos": 12495405980160.0, "grad_norm": 2.1747806623948764, "language_loss": 0.71659213, "learning_rate": 6.427846669269952e-07, "loss": 0.73791498, "num_input_tokens_seen": 133273210, "step": 6198, "time_per_iteration": 2.6627163887023926 }, { "auxiliary_loss_clip": 0.0117496, "auxiliary_loss_mlp": 0.0102882, "balance_loss_clip": 1.05428004, "balance_loss_mlp": 1.02174151, "epoch": 0.7453856790717249, "flos": 22127329687680.0, "grad_norm": 3.386239812529431, "language_loss": 0.82617271, "learning_rate": 6.422126144763729e-07, "loss": 0.84821051, "num_input_tokens_seen": 133292600, "step": 6199, "time_per_iteration": 2.602506637573242 }, { "auxiliary_loss_clip": 0.01117566, "auxiliary_loss_mlp": 0.00712156, "balance_loss_clip": 1.04137659, "balance_loss_mlp": 1.00058639, "epoch": 0.745505921962364, "flos": 20010682995840.0, "grad_norm": 3.6624204244422702, "language_loss": 0.77028954, "learning_rate": 6.416407680010174e-07, "loss": 0.78858674, "num_input_tokens_seen": 133306960, "step": 6200, "time_per_iteration": 2.704601287841797 }, { "auxiliary_loss_clip": 0.01122981, "auxiliary_loss_mlp": 0.01029254, "balance_loss_clip": 1.04753506, "balance_loss_mlp": 1.02209854, "epoch": 0.745626164853003, "flos": 24677884673280.0, "grad_norm": 1.9304635723032828, "language_loss": 0.81225246, "learning_rate": 6.410691275876774e-07, "loss": 0.83377481, "num_input_tokens_seen": 133326380, "step": 6201, "time_per_iteration": 2.74904203414917 }, { "auxiliary_loss_clip": 0.01147928, "auxiliary_loss_mlp": 0.0102682, "balance_loss_clip": 1.0500679, "balance_loss_mlp": 1.01953626, "epoch": 0.7457464077436422, "flos": 14538830797440.0, "grad_norm": 3.334195200508876, "language_loss": 0.76907575, "learning_rate": 6.404976933230704e-07, "loss": 0.79082322, "num_input_tokens_seen": 133342900, "step": 6202, "time_per_iteration": 2.6365785598754883 }, { "auxiliary_loss_clip": 0.01144024, "auxiliary_loss_mlp": 0.01027939, "balance_loss_clip": 1.04887128, "balance_loss_mlp": 1.02037525, "epoch": 0.7458666506342813, "flos": 34021194600960.0, "grad_norm": 2.044433221059131, "language_loss": 0.72524679, "learning_rate": 6.399264652938813e-07, "loss": 0.74696636, "num_input_tokens_seen": 133363805, "step": 6203, "time_per_iteration": 2.838297128677368 }, { "auxiliary_loss_clip": 0.01138852, "auxiliary_loss_mlp": 0.01028381, "balance_loss_clip": 1.04727829, "balance_loss_mlp": 1.02154422, "epoch": 0.7459868935249203, "flos": 24279025075200.0, "grad_norm": 2.564751015809529, "language_loss": 0.74500877, "learning_rate": 6.393554435867679e-07, "loss": 0.76668108, "num_input_tokens_seen": 133384655, "step": 6204, "time_per_iteration": 2.711724042892456 }, { "auxiliary_loss_clip": 0.01118264, "auxiliary_loss_mlp": 0.0102439, "balance_loss_clip": 1.04448533, "balance_loss_mlp": 1.01734531, "epoch": 0.7461071364155595, "flos": 21908777385600.0, "grad_norm": 2.374907017599763, "language_loss": 0.8394196, "learning_rate": 6.387846282883502e-07, "loss": 0.86084616, "num_input_tokens_seen": 133401185, "step": 6205, "time_per_iteration": 2.6557230949401855 }, { "auxiliary_loss_clip": 0.01168744, "auxiliary_loss_mlp": 0.0102227, "balance_loss_clip": 1.0500164, "balance_loss_mlp": 1.01492691, "epoch": 0.7462273793061985, "flos": 22889712879360.0, "grad_norm": 2.0411952091316206, "language_loss": 0.76844901, "learning_rate": 6.38214019485223e-07, "loss": 0.79035914, "num_input_tokens_seen": 133420010, "step": 6206, "time_per_iteration": 2.6144111156463623 }, { "auxiliary_loss_clip": 0.01084164, "auxiliary_loss_mlp": 0.0102866, "balance_loss_clip": 1.04036093, "balance_loss_mlp": 1.02159679, "epoch": 0.7463476221968376, "flos": 19968451580160.0, "grad_norm": 1.7752932478396553, "language_loss": 0.71576488, "learning_rate": 6.376436172639461e-07, "loss": 0.73689318, "num_input_tokens_seen": 133437855, "step": 6207, "time_per_iteration": 2.7155206203460693 }, { "auxiliary_loss_clip": 0.0107843, "auxiliary_loss_mlp": 0.01034196, "balance_loss_clip": 1.04170024, "balance_loss_mlp": 1.02692711, "epoch": 0.7464678650874768, "flos": 16836610798080.0, "grad_norm": 2.715425507203847, "language_loss": 0.6525774, "learning_rate": 6.370734217110487e-07, "loss": 0.67370367, "num_input_tokens_seen": 133456600, "step": 6208, "time_per_iteration": 2.804309129714966 }, { "auxiliary_loss_clip": 0.01145058, "auxiliary_loss_mlp": 0.01029688, "balance_loss_clip": 1.05134392, "balance_loss_mlp": 1.02207637, "epoch": 0.7465881079781158, "flos": 48100869843840.0, "grad_norm": 1.809459542463636, "language_loss": 0.639925, "learning_rate": 6.36503432913031e-07, "loss": 0.66167247, "num_input_tokens_seen": 133479745, "step": 6209, "time_per_iteration": 2.9158127307891846 }, { "auxiliary_loss_clip": 0.01151897, "auxiliary_loss_mlp": 0.01024072, "balance_loss_clip": 1.04802454, "balance_loss_mlp": 1.01641321, "epoch": 0.7467083508687549, "flos": 19677359761920.0, "grad_norm": 1.9680455973276947, "language_loss": 0.6909045, "learning_rate": 6.359336509563569e-07, "loss": 0.71266425, "num_input_tokens_seen": 133495765, "step": 6210, "time_per_iteration": 2.610002040863037 }, { "auxiliary_loss_clip": 0.01111524, "auxiliary_loss_mlp": 0.0102505, "balance_loss_clip": 1.04435205, "balance_loss_mlp": 1.01701546, "epoch": 0.7468285937593939, "flos": 17895436934400.0, "grad_norm": 5.195844870864138, "language_loss": 0.80699986, "learning_rate": 6.353640759274641e-07, "loss": 0.82836556, "num_input_tokens_seen": 133514655, "step": 6211, "time_per_iteration": 2.6920697689056396 }, { "auxiliary_loss_clip": 0.01150704, "auxiliary_loss_mlp": 0.0102885, "balance_loss_clip": 1.04643583, "balance_loss_mlp": 1.0214653, "epoch": 0.7469488366500331, "flos": 23141446369920.0, "grad_norm": 5.3237762771261, "language_loss": 0.74762195, "learning_rate": 6.347947079127556e-07, "loss": 0.76941752, "num_input_tokens_seen": 133532555, "step": 6212, "time_per_iteration": 2.6132194995880127 }, { "auxiliary_loss_clip": 0.01133892, "auxiliary_loss_mlp": 0.01025433, "balance_loss_clip": 1.04769683, "balance_loss_mlp": 1.01837015, "epoch": 0.7470690795406721, "flos": 16690849407360.0, "grad_norm": 2.160817745226543, "language_loss": 0.77558804, "learning_rate": 6.342255469986053e-07, "loss": 0.79718137, "num_input_tokens_seen": 133551300, "step": 6213, "time_per_iteration": 2.6177804470062256 }, { "auxiliary_loss_clip": 0.011701, "auxiliary_loss_mlp": 0.01023621, "balance_loss_clip": 1.05112028, "balance_loss_mlp": 1.01625443, "epoch": 0.7471893224313112, "flos": 25192700352000.0, "grad_norm": 1.8330191022209839, "language_loss": 0.76626456, "learning_rate": 6.336565932713533e-07, "loss": 0.78820169, "num_input_tokens_seen": 133570725, "step": 6214, "time_per_iteration": 2.614520311355591 }, { "auxiliary_loss_clip": 0.01138878, "auxiliary_loss_mlp": 0.01027926, "balance_loss_clip": 1.04999626, "balance_loss_mlp": 1.02023137, "epoch": 0.7473095653219504, "flos": 22526225199360.0, "grad_norm": 2.833127947539292, "language_loss": 0.77785873, "learning_rate": 6.330878468173088e-07, "loss": 0.79952675, "num_input_tokens_seen": 133590790, "step": 6215, "time_per_iteration": 2.712291955947876 }, { "auxiliary_loss_clip": 0.0114612, "auxiliary_loss_mlp": 0.01024223, "balance_loss_clip": 1.04609215, "balance_loss_mlp": 1.01680183, "epoch": 0.7474298082125894, "flos": 18113989236480.0, "grad_norm": 2.2353930612707167, "language_loss": 0.72586262, "learning_rate": 6.32519307722752e-07, "loss": 0.74756598, "num_input_tokens_seen": 133608685, "step": 6216, "time_per_iteration": 2.6170849800109863 }, { "auxiliary_loss_clip": 0.01046837, "auxiliary_loss_mlp": 0.01000981, "balance_loss_clip": 1.03559184, "balance_loss_mlp": 1.00002694, "epoch": 0.7475500511032285, "flos": 62086535193600.0, "grad_norm": 0.8465714888717176, "language_loss": 0.54915643, "learning_rate": 6.31950976073929e-07, "loss": 0.56963456, "num_input_tokens_seen": 133662775, "step": 6217, "time_per_iteration": 3.2591042518615723 }, { "auxiliary_loss_clip": 0.01103848, "auxiliary_loss_mlp": 0.01028762, "balance_loss_clip": 1.04468751, "balance_loss_mlp": 1.02151656, "epoch": 0.7476702939938676, "flos": 17785586165760.0, "grad_norm": 7.577272126426443, "language_loss": 0.80954099, "learning_rate": 6.31382851957055e-07, "loss": 0.83086717, "num_input_tokens_seen": 133679595, "step": 6218, "time_per_iteration": 3.6511855125427246 }, { "auxiliary_loss_clip": 0.01119456, "auxiliary_loss_mlp": 0.00711343, "balance_loss_clip": 1.04647326, "balance_loss_mlp": 1.00072646, "epoch": 0.7477905368845067, "flos": 27927944092800.0, "grad_norm": 2.149576752880385, "language_loss": 0.71897209, "learning_rate": 6.308149354583143e-07, "loss": 0.73728013, "num_input_tokens_seen": 133699000, "step": 6219, "time_per_iteration": 3.7333426475524902 }, { "auxiliary_loss_clip": 0.01157389, "auxiliary_loss_mlp": 0.01024644, "balance_loss_clip": 1.04917991, "balance_loss_mlp": 1.01731849, "epoch": 0.7479107797751458, "flos": 26870374932480.0, "grad_norm": 2.01333571965354, "language_loss": 0.81798565, "learning_rate": 6.302472266638586e-07, "loss": 0.83980596, "num_input_tokens_seen": 133719540, "step": 6220, "time_per_iteration": 2.680981397628784 }, { "auxiliary_loss_clip": 0.0117556, "auxiliary_loss_mlp": 0.01029024, "balance_loss_clip": 1.05159163, "balance_loss_mlp": 1.02016711, "epoch": 0.7480310226657849, "flos": 33943375785600.0, "grad_norm": 2.341937858768519, "language_loss": 0.70179188, "learning_rate": 6.296797256598101e-07, "loss": 0.72383773, "num_input_tokens_seen": 133741020, "step": 6221, "time_per_iteration": 2.674616575241089 }, { "auxiliary_loss_clip": 0.01114046, "auxiliary_loss_mlp": 0.01028202, "balance_loss_clip": 1.04373229, "balance_loss_mlp": 1.02116299, "epoch": 0.748151265556424, "flos": 24826555065600.0, "grad_norm": 1.815234165692481, "language_loss": 0.81548685, "learning_rate": 6.291124325322576e-07, "loss": 0.83690935, "num_input_tokens_seen": 133761145, "step": 6222, "time_per_iteration": 3.6442806720733643 }, { "auxiliary_loss_clip": 0.01144216, "auxiliary_loss_mlp": 0.01022977, "balance_loss_clip": 1.04865313, "balance_loss_mlp": 1.0161643, "epoch": 0.748271508447063, "flos": 38399351535360.0, "grad_norm": 1.6660122749394424, "language_loss": 0.62552989, "learning_rate": 6.285453473672595e-07, "loss": 0.64720178, "num_input_tokens_seen": 133783715, "step": 6223, "time_per_iteration": 2.829677104949951 }, { "auxiliary_loss_clip": 0.01166847, "auxiliary_loss_mlp": 0.01022721, "balance_loss_clip": 1.04811478, "balance_loss_mlp": 1.01540136, "epoch": 0.7483917513377022, "flos": 21541842000000.0, "grad_norm": 2.699314804917054, "language_loss": 0.75560635, "learning_rate": 6.279784702508415e-07, "loss": 0.777502, "num_input_tokens_seen": 133804465, "step": 6224, "time_per_iteration": 2.6431217193603516 }, { "auxiliary_loss_clip": 0.01043238, "auxiliary_loss_mlp": 0.01001595, "balance_loss_clip": 1.02257228, "balance_loss_mlp": 1.00063503, "epoch": 0.7485119942283412, "flos": 62314532772480.0, "grad_norm": 0.7861078196667733, "language_loss": 0.58493686, "learning_rate": 6.274118012689979e-07, "loss": 0.60538518, "num_input_tokens_seen": 133866365, "step": 6225, "time_per_iteration": 3.4509925842285156 }, { "auxiliary_loss_clip": 0.01130192, "auxiliary_loss_mlp": 0.01022174, "balance_loss_clip": 1.04525805, "balance_loss_mlp": 1.01499152, "epoch": 0.7486322371189803, "flos": 29937613104000.0, "grad_norm": 1.672472086576375, "language_loss": 0.68598682, "learning_rate": 6.268453405076943e-07, "loss": 0.70751053, "num_input_tokens_seen": 133888760, "step": 6226, "time_per_iteration": 2.7243573665618896 }, { "auxiliary_loss_clip": 0.0113956, "auxiliary_loss_mlp": 0.01036591, "balance_loss_clip": 1.04707122, "balance_loss_mlp": 1.02944493, "epoch": 0.7487524800096195, "flos": 18949414734720.0, "grad_norm": 1.97392123813508, "language_loss": 0.82396376, "learning_rate": 6.262790880528592e-07, "loss": 0.84572524, "num_input_tokens_seen": 133906380, "step": 6227, "time_per_iteration": 2.651554822921753 }, { "auxiliary_loss_clip": 0.01130997, "auxiliary_loss_mlp": 0.01027193, "balance_loss_clip": 1.04349756, "balance_loss_mlp": 1.01937854, "epoch": 0.7488727229002585, "flos": 18697393935360.0, "grad_norm": 2.512413849883989, "language_loss": 0.79648864, "learning_rate": 6.257130439903951e-07, "loss": 0.81807053, "num_input_tokens_seen": 133922875, "step": 6228, "time_per_iteration": 2.6735482215881348 }, { "auxiliary_loss_clip": 0.01174901, "auxiliary_loss_mlp": 0.01025462, "balance_loss_clip": 1.05408072, "balance_loss_mlp": 1.01792848, "epoch": 0.7489929657908976, "flos": 23623368168960.0, "grad_norm": 2.4153391588501294, "language_loss": 0.81189913, "learning_rate": 6.251472084061695e-07, "loss": 0.83390272, "num_input_tokens_seen": 133941795, "step": 6229, "time_per_iteration": 2.5880181789398193 }, { "auxiliary_loss_clip": 0.01153557, "auxiliary_loss_mlp": 0.01025155, "balance_loss_clip": 1.0490694, "balance_loss_mlp": 1.01803803, "epoch": 0.7491132086815367, "flos": 20551533056640.0, "grad_norm": 2.0971192436587187, "language_loss": 0.89048922, "learning_rate": 6.245815813860191e-07, "loss": 0.91227639, "num_input_tokens_seen": 133957305, "step": 6230, "time_per_iteration": 2.648817777633667 }, { "auxiliary_loss_clip": 0.01171731, "auxiliary_loss_mlp": 0.01025707, "balance_loss_clip": 1.04925203, "balance_loss_mlp": 1.01791394, "epoch": 0.7492334515721758, "flos": 23003011353600.0, "grad_norm": 2.3668910177022386, "language_loss": 0.7044813, "learning_rate": 6.240161630157495e-07, "loss": 0.72645569, "num_input_tokens_seen": 133976660, "step": 6231, "time_per_iteration": 2.5678787231445312 }, { "auxiliary_loss_clip": 0.0117145, "auxiliary_loss_mlp": 0.01029553, "balance_loss_clip": 1.05014229, "balance_loss_mlp": 1.02157819, "epoch": 0.7493536944628149, "flos": 16398823835520.0, "grad_norm": 2.377648083098951, "language_loss": 0.70861197, "learning_rate": 6.23450953381133e-07, "loss": 0.73062199, "num_input_tokens_seen": 133994750, "step": 6232, "time_per_iteration": 2.641383647918701 }, { "auxiliary_loss_clip": 0.01133007, "auxiliary_loss_mlp": 0.01020952, "balance_loss_clip": 1.04511738, "balance_loss_mlp": 1.01396072, "epoch": 0.749473937353454, "flos": 15338561155200.0, "grad_norm": 3.1985591007269116, "language_loss": 0.68299043, "learning_rate": 6.228859525679131e-07, "loss": 0.70453, "num_input_tokens_seen": 134009165, "step": 6233, "time_per_iteration": 2.606473684310913 }, { "auxiliary_loss_clip": 0.01157362, "auxiliary_loss_mlp": 0.01024062, "balance_loss_clip": 1.05010724, "balance_loss_mlp": 1.0173384, "epoch": 0.7495941802440931, "flos": 18951138587520.0, "grad_norm": 3.6877777878557385, "language_loss": 0.7957257, "learning_rate": 6.223211606617986e-07, "loss": 0.81754005, "num_input_tokens_seen": 134027585, "step": 6234, "time_per_iteration": 2.6671507358551025 }, { "auxiliary_loss_clip": 0.0115234, "auxiliary_loss_mlp": 0.01026026, "balance_loss_clip": 1.05022323, "balance_loss_mlp": 1.01964498, "epoch": 0.7497144231347321, "flos": 22492469393280.0, "grad_norm": 2.347452414073662, "language_loss": 0.84076196, "learning_rate": 6.217565777484701e-07, "loss": 0.86254561, "num_input_tokens_seen": 134046680, "step": 6235, "time_per_iteration": 2.656730890274048 }, { "auxiliary_loss_clip": 0.01138645, "auxiliary_loss_mlp": 0.0071158, "balance_loss_clip": 1.04901409, "balance_loss_mlp": 1.00070703, "epoch": 0.7498346660253713, "flos": 24243509502720.0, "grad_norm": 2.1880562376891195, "language_loss": 0.80148703, "learning_rate": 6.211922039135722e-07, "loss": 0.81998932, "num_input_tokens_seen": 134066825, "step": 6236, "time_per_iteration": 2.7713773250579834 }, { "auxiliary_loss_clip": 0.0117009, "auxiliary_loss_mlp": 0.01029609, "balance_loss_clip": 1.04932082, "balance_loss_mlp": 1.02211666, "epoch": 0.7499549089160104, "flos": 24387080163840.0, "grad_norm": 1.9585867834587374, "language_loss": 0.80905122, "learning_rate": 6.206280392427201e-07, "loss": 0.83104825, "num_input_tokens_seen": 134086410, "step": 6237, "time_per_iteration": 2.613642930984497 }, { "auxiliary_loss_clip": 0.01146182, "auxiliary_loss_mlp": 0.01020667, "balance_loss_clip": 1.04615235, "balance_loss_mlp": 1.01350868, "epoch": 0.7500751518066494, "flos": 34057320704640.0, "grad_norm": 1.6792272757131739, "language_loss": 0.73810709, "learning_rate": 6.200640838214983e-07, "loss": 0.75977552, "num_input_tokens_seen": 134109185, "step": 6238, "time_per_iteration": 2.7330682277679443 }, { "auxiliary_loss_clip": 0.01169216, "auxiliary_loss_mlp": 0.01022317, "balance_loss_clip": 1.04976845, "balance_loss_mlp": 1.01574838, "epoch": 0.7501953946972886, "flos": 18843586289280.0, "grad_norm": 2.321153000506659, "language_loss": 0.6648277, "learning_rate": 6.195003377354578e-07, "loss": 0.68674302, "num_input_tokens_seen": 134128455, "step": 6239, "time_per_iteration": 2.5465738773345947 }, { "auxiliary_loss_clip": 0.01149199, "auxiliary_loss_mlp": 0.01028612, "balance_loss_clip": 1.04551816, "balance_loss_mlp": 1.02103937, "epoch": 0.7503156375879276, "flos": 20257675891200.0, "grad_norm": 3.184107167977896, "language_loss": 0.73437023, "learning_rate": 6.189368010701183e-07, "loss": 0.75614834, "num_input_tokens_seen": 134145515, "step": 6240, "time_per_iteration": 2.624211549758911 }, { "auxiliary_loss_clip": 0.01160507, "auxiliary_loss_mlp": 0.01023116, "balance_loss_clip": 1.04880083, "balance_loss_mlp": 1.01596904, "epoch": 0.7504358804785667, "flos": 13480040574720.0, "grad_norm": 2.0587805998510764, "language_loss": 0.76524305, "learning_rate": 6.183734739109683e-07, "loss": 0.78707922, "num_input_tokens_seen": 134163335, "step": 6241, "time_per_iteration": 2.552830696105957 }, { "auxiliary_loss_clip": 0.01164615, "auxiliary_loss_mlp": 0.01027937, "balance_loss_clip": 1.05144405, "balance_loss_mlp": 1.02037907, "epoch": 0.7505561233692057, "flos": 29461042431360.0, "grad_norm": 2.5439522406004365, "language_loss": 0.69099617, "learning_rate": 6.178103563434629e-07, "loss": 0.71292162, "num_input_tokens_seen": 134182335, "step": 6242, "time_per_iteration": 2.6586992740631104 }, { "auxiliary_loss_clip": 0.01168483, "auxiliary_loss_mlp": 0.01023836, "balance_loss_clip": 1.04906619, "balance_loss_mlp": 1.01653421, "epoch": 0.7506763662598449, "flos": 20302457172480.0, "grad_norm": 1.7026510742389325, "language_loss": 0.83684814, "learning_rate": 6.172474484530283e-07, "loss": 0.85877138, "num_input_tokens_seen": 134201070, "step": 6243, "time_per_iteration": 2.5547237396240234 }, { "auxiliary_loss_clip": 0.01127886, "auxiliary_loss_mlp": 0.0102294, "balance_loss_clip": 1.04420805, "balance_loss_mlp": 1.01564407, "epoch": 0.750796609150484, "flos": 37230961939200.0, "grad_norm": 1.9003817214218421, "language_loss": 0.75762409, "learning_rate": 6.166847503250563e-07, "loss": 0.77913237, "num_input_tokens_seen": 134223310, "step": 6244, "time_per_iteration": 2.7987558841705322 }, { "auxiliary_loss_clip": 0.01136993, "auxiliary_loss_mlp": 0.01022795, "balance_loss_clip": 1.04600167, "balance_loss_mlp": 1.0156846, "epoch": 0.750916852041123, "flos": 19609417186560.0, "grad_norm": 2.6359407264863486, "language_loss": 0.79314578, "learning_rate": 6.161222620449078e-07, "loss": 0.8147437, "num_input_tokens_seen": 134242085, "step": 6245, "time_per_iteration": 4.491199493408203 }, { "auxiliary_loss_clip": 0.0112678, "auxiliary_loss_mlp": 0.0102905, "balance_loss_clip": 1.04822338, "balance_loss_mlp": 1.02154541, "epoch": 0.7510370949317622, "flos": 25112690807040.0, "grad_norm": 7.947052034024928, "language_loss": 0.80018783, "learning_rate": 6.155599836979117e-07, "loss": 0.82174617, "num_input_tokens_seen": 134260770, "step": 6246, "time_per_iteration": 2.813185691833496 }, { "auxiliary_loss_clip": 0.01108117, "auxiliary_loss_mlp": 0.01028063, "balance_loss_clip": 1.04468298, "balance_loss_mlp": 1.02020454, "epoch": 0.7511573378224012, "flos": 19062282245760.0, "grad_norm": 2.4254436105663415, "language_loss": 0.82179356, "learning_rate": 6.149979153693649e-07, "loss": 0.84315526, "num_input_tokens_seen": 134278025, "step": 6247, "time_per_iteration": 2.7428460121154785 }, { "auxiliary_loss_clip": 0.01149741, "auxiliary_loss_mlp": 0.01029861, "balance_loss_clip": 1.04766464, "balance_loss_mlp": 1.02228546, "epoch": 0.7512775807130403, "flos": 19937676602880.0, "grad_norm": 2.116765767447862, "language_loss": 0.76755011, "learning_rate": 6.144360571445343e-07, "loss": 0.7893461, "num_input_tokens_seen": 134297170, "step": 6248, "time_per_iteration": 3.534935235977173 }, { "auxiliary_loss_clip": 0.01154765, "auxiliary_loss_mlp": 0.01026651, "balance_loss_clip": 1.05089569, "balance_loss_mlp": 1.0193671, "epoch": 0.7513978236036795, "flos": 20739920912640.0, "grad_norm": 1.8034536653455373, "language_loss": 0.80184913, "learning_rate": 6.138744091086509e-07, "loss": 0.82366335, "num_input_tokens_seen": 134316755, "step": 6249, "time_per_iteration": 2.638369560241699 }, { "auxiliary_loss_clip": 0.01125608, "auxiliary_loss_mlp": 0.01027882, "balance_loss_clip": 1.04795861, "balance_loss_mlp": 1.01991606, "epoch": 0.7515180664943185, "flos": 27563163523200.0, "grad_norm": 2.3769881628850382, "language_loss": 0.73063469, "learning_rate": 6.133129713469183e-07, "loss": 0.75216955, "num_input_tokens_seen": 134335960, "step": 6250, "time_per_iteration": 2.742480754852295 }, { "auxiliary_loss_clip": 0.01131104, "auxiliary_loss_mlp": 0.01027746, "balance_loss_clip": 1.04516876, "balance_loss_mlp": 1.0202235, "epoch": 0.7516383093849576, "flos": 33803181002880.0, "grad_norm": 1.897068020240368, "language_loss": 0.64420831, "learning_rate": 6.127517439445053e-07, "loss": 0.66579682, "num_input_tokens_seen": 134356805, "step": 6251, "time_per_iteration": 2.740584373474121 }, { "auxiliary_loss_clip": 0.01101203, "auxiliary_loss_mlp": 0.01023541, "balance_loss_clip": 1.04427099, "balance_loss_mlp": 1.01676393, "epoch": 0.7517585522755967, "flos": 29746172592000.0, "grad_norm": 2.0006320757838085, "language_loss": 0.82119167, "learning_rate": 6.121907269865498e-07, "loss": 0.84243912, "num_input_tokens_seen": 134376295, "step": 6252, "time_per_iteration": 2.771494150161743 }, { "auxiliary_loss_clip": 0.01034563, "auxiliary_loss_mlp": 0.01001416, "balance_loss_clip": 1.02167487, "balance_loss_mlp": 1.00044465, "epoch": 0.7518787951662358, "flos": 69807974319360.0, "grad_norm": 0.9228527802685239, "language_loss": 0.67248917, "learning_rate": 6.116299205581577e-07, "loss": 0.69284904, "num_input_tokens_seen": 134431125, "step": 6253, "time_per_iteration": 3.209184408187866 }, { "auxiliary_loss_clip": 0.01175757, "auxiliary_loss_mlp": 0.0103257, "balance_loss_clip": 1.05314255, "balance_loss_mlp": 1.02488661, "epoch": 0.7519990380568748, "flos": 34203225749760.0, "grad_norm": 2.5955605652670624, "language_loss": 0.68416619, "learning_rate": 6.110693247444018e-07, "loss": 0.70624942, "num_input_tokens_seen": 134452960, "step": 6254, "time_per_iteration": 2.765641212463379 }, { "auxiliary_loss_clip": 0.01111543, "auxiliary_loss_mlp": 0.01025589, "balance_loss_clip": 1.04342043, "balance_loss_mlp": 1.01859164, "epoch": 0.752119280947514, "flos": 21725704742400.0, "grad_norm": 2.1879581476331915, "language_loss": 0.82536578, "learning_rate": 6.105089396303258e-07, "loss": 0.84673715, "num_input_tokens_seen": 134471350, "step": 6255, "time_per_iteration": 2.761998414993286 }, { "auxiliary_loss_clip": 0.01139905, "auxiliary_loss_mlp": 0.01024717, "balance_loss_clip": 1.04698741, "balance_loss_mlp": 1.0168674, "epoch": 0.7522395238381531, "flos": 32742774668160.0, "grad_norm": 2.203531878695988, "language_loss": 0.75569236, "learning_rate": 6.099487653009383e-07, "loss": 0.77733862, "num_input_tokens_seen": 134490695, "step": 6256, "time_per_iteration": 2.7566447257995605 }, { "auxiliary_loss_clip": 0.01150467, "auxiliary_loss_mlp": 0.01024783, "balance_loss_clip": 1.04586601, "balance_loss_mlp": 1.01839995, "epoch": 0.7523597667287921, "flos": 23476026579840.0, "grad_norm": 2.346093860932657, "language_loss": 0.83482474, "learning_rate": 6.093888018412192e-07, "loss": 0.85657722, "num_input_tokens_seen": 134506885, "step": 6257, "time_per_iteration": 2.6355106830596924 }, { "auxiliary_loss_clip": 0.010674, "auxiliary_loss_mlp": 0.01001923, "balance_loss_clip": 1.02536237, "balance_loss_mlp": 1.00096917, "epoch": 0.7524800096194313, "flos": 67346730501120.0, "grad_norm": 0.7480428067621648, "language_loss": 0.54668295, "learning_rate": 6.088290493361125e-07, "loss": 0.56737626, "num_input_tokens_seen": 134571770, "step": 6258, "time_per_iteration": 3.3642024993896484 }, { "auxiliary_loss_clip": 0.0109991, "auxiliary_loss_mlp": 0.0102912, "balance_loss_clip": 1.04356849, "balance_loss_mlp": 1.02163029, "epoch": 0.7526002525100703, "flos": 13006055681280.0, "grad_norm": 2.273432720578952, "language_loss": 0.71645105, "learning_rate": 6.082695078705322e-07, "loss": 0.73774135, "num_input_tokens_seen": 134589250, "step": 6259, "time_per_iteration": 2.749316692352295 }, { "auxiliary_loss_clip": 0.0114884, "auxiliary_loss_mlp": 0.01025572, "balance_loss_clip": 1.04820204, "balance_loss_mlp": 1.01813006, "epoch": 0.7527204954007094, "flos": 21397229844480.0, "grad_norm": 2.2468344024159967, "language_loss": 0.68744445, "learning_rate": 6.077101775293618e-07, "loss": 0.70918858, "num_input_tokens_seen": 134608075, "step": 6260, "time_per_iteration": 2.6600306034088135 }, { "auxiliary_loss_clip": 0.01154704, "auxiliary_loss_mlp": 0.01023655, "balance_loss_clip": 1.04788184, "balance_loss_mlp": 1.01617718, "epoch": 0.7528407382913486, "flos": 18947188091520.0, "grad_norm": 3.0559306046475596, "language_loss": 0.8247869, "learning_rate": 6.071510583974504e-07, "loss": 0.84657043, "num_input_tokens_seen": 134623260, "step": 6261, "time_per_iteration": 2.6357853412628174 }, { "auxiliary_loss_clip": 0.0117217, "auxiliary_loss_mlp": 0.01033668, "balance_loss_clip": 1.05119419, "balance_loss_mlp": 1.02606857, "epoch": 0.7529609811819876, "flos": 15231798956160.0, "grad_norm": 2.1678995812704143, "language_loss": 0.72172099, "learning_rate": 6.065921505596161e-07, "loss": 0.74377936, "num_input_tokens_seen": 134641540, "step": 6262, "time_per_iteration": 2.5736162662506104 }, { "auxiliary_loss_clip": 0.01120878, "auxiliary_loss_mlp": 0.01028553, "balance_loss_clip": 1.04480934, "balance_loss_mlp": 1.02140379, "epoch": 0.7530812240726267, "flos": 19354487385600.0, "grad_norm": 2.3455036851241706, "language_loss": 0.77365768, "learning_rate": 6.060334541006445e-07, "loss": 0.79515195, "num_input_tokens_seen": 134660035, "step": 6263, "time_per_iteration": 2.7429451942443848 }, { "auxiliary_loss_clip": 0.01124564, "auxiliary_loss_mlp": 0.01022754, "balance_loss_clip": 1.04332495, "balance_loss_mlp": 1.01570868, "epoch": 0.7532014669632658, "flos": 27748247328000.0, "grad_norm": 1.634943869245585, "language_loss": 0.69148189, "learning_rate": 6.05474969105289e-07, "loss": 0.71295512, "num_input_tokens_seen": 134683025, "step": 6264, "time_per_iteration": 2.7906012535095215 }, { "auxiliary_loss_clip": 0.0115576, "auxiliary_loss_mlp": 0.01028112, "balance_loss_clip": 1.04948497, "balance_loss_mlp": 1.02064347, "epoch": 0.7533217098539049, "flos": 14137421333760.0, "grad_norm": 2.7703554270926984, "language_loss": 0.73671579, "learning_rate": 6.049166956582725e-07, "loss": 0.75855446, "num_input_tokens_seen": 134701290, "step": 6265, "time_per_iteration": 2.557624578475952 }, { "auxiliary_loss_clip": 0.0115053, "auxiliary_loss_mlp": 0.01021395, "balance_loss_clip": 1.04774022, "balance_loss_mlp": 1.01403952, "epoch": 0.753441952744544, "flos": 26429068437120.0, "grad_norm": 2.2308200286690107, "language_loss": 0.87597585, "learning_rate": 6.043586338442841e-07, "loss": 0.89769506, "num_input_tokens_seen": 134720345, "step": 6266, "time_per_iteration": 2.715510845184326 }, { "auxiliary_loss_clip": 0.01164823, "auxiliary_loss_mlp": 0.01021545, "balance_loss_clip": 1.04893613, "balance_loss_mlp": 1.01460993, "epoch": 0.7535621956351831, "flos": 23878621192320.0, "grad_norm": 1.4465408935585324, "language_loss": 0.73149782, "learning_rate": 6.038007837479815e-07, "loss": 0.75336152, "num_input_tokens_seen": 134741450, "step": 6267, "time_per_iteration": 2.6057891845703125 }, { "auxiliary_loss_clip": 0.01153166, "auxiliary_loss_mlp": 0.01027124, "balance_loss_clip": 1.04959762, "balance_loss_mlp": 1.02063274, "epoch": 0.7536824385258222, "flos": 21795873960960.0, "grad_norm": 4.206958598169935, "language_loss": 0.63936293, "learning_rate": 6.032431454539897e-07, "loss": 0.66116583, "num_input_tokens_seen": 134760295, "step": 6268, "time_per_iteration": 2.653191328048706 }, { "auxiliary_loss_clip": 0.0112176, "auxiliary_loss_mlp": 0.01025771, "balance_loss_clip": 1.04527891, "balance_loss_mlp": 1.01859164, "epoch": 0.7538026814164612, "flos": 28911644933760.0, "grad_norm": 1.77782172484795, "language_loss": 0.81376278, "learning_rate": 6.026857190469014e-07, "loss": 0.8352381, "num_input_tokens_seen": 134782050, "step": 6269, "time_per_iteration": 2.772308588027954 }, { "auxiliary_loss_clip": 0.01139641, "auxiliary_loss_mlp": 0.01022307, "balance_loss_clip": 1.0463053, "balance_loss_mlp": 1.01528573, "epoch": 0.7539229243071004, "flos": 21104701482240.0, "grad_norm": 2.20154633578895, "language_loss": 0.74340355, "learning_rate": 6.0212850461128e-07, "loss": 0.76502299, "num_input_tokens_seen": 134801170, "step": 6270, "time_per_iteration": 3.7554872035980225 }, { "auxiliary_loss_clip": 0.01139334, "auxiliary_loss_mlp": 0.01023962, "balance_loss_clip": 1.04636669, "balance_loss_mlp": 1.01630855, "epoch": 0.7540431671977395, "flos": 15158469340800.0, "grad_norm": 2.2383547755159316, "language_loss": 0.74423033, "learning_rate": 6.015715022316516e-07, "loss": 0.7658633, "num_input_tokens_seen": 134819150, "step": 6271, "time_per_iteration": 3.7081761360168457 }, { "auxiliary_loss_clip": 0.01106242, "auxiliary_loss_mlp": 0.01026077, "balance_loss_clip": 1.04205918, "balance_loss_mlp": 1.01900506, "epoch": 0.7541634100883785, "flos": 18770579896320.0, "grad_norm": 2.6087195840837936, "language_loss": 0.78173923, "learning_rate": 6.010147119925154e-07, "loss": 0.80306238, "num_input_tokens_seen": 134836905, "step": 6272, "time_per_iteration": 2.825260877609253 }, { "auxiliary_loss_clip": 0.01114734, "auxiliary_loss_mlp": 0.01024285, "balance_loss_clip": 1.04471684, "balance_loss_mlp": 1.01676297, "epoch": 0.7542836529790176, "flos": 20594770053120.0, "grad_norm": 2.8649315938265723, "language_loss": 0.66410619, "learning_rate": 6.004581339783348e-07, "loss": 0.68549633, "num_input_tokens_seen": 134855225, "step": 6273, "time_per_iteration": 2.760059356689453 }, { "auxiliary_loss_clip": 0.01158312, "auxiliary_loss_mlp": 0.01031695, "balance_loss_clip": 1.04986739, "balance_loss_mlp": 1.02349353, "epoch": 0.7544038958696567, "flos": 19095104298240.0, "grad_norm": 2.639004401526252, "language_loss": 0.69313598, "learning_rate": 5.999017682735425e-07, "loss": 0.71503603, "num_input_tokens_seen": 134871615, "step": 6274, "time_per_iteration": 3.648463010787964 }, { "auxiliary_loss_clip": 0.01100975, "auxiliary_loss_mlp": 0.0103051, "balance_loss_clip": 1.04446054, "balance_loss_mlp": 1.02284479, "epoch": 0.7545241387602958, "flos": 31723306859520.0, "grad_norm": 1.9512193278658976, "language_loss": 0.6696173, "learning_rate": 5.993456149625387e-07, "loss": 0.69093221, "num_input_tokens_seen": 134892765, "step": 6275, "time_per_iteration": 2.8157639503479004 }, { "auxiliary_loss_clip": 0.01115163, "auxiliary_loss_mlp": 0.0102913, "balance_loss_clip": 1.04470968, "balance_loss_mlp": 1.02176309, "epoch": 0.7546443816509348, "flos": 20296495514880.0, "grad_norm": 1.8887453371082628, "language_loss": 0.82393068, "learning_rate": 5.987896741296909e-07, "loss": 0.84537357, "num_input_tokens_seen": 134910505, "step": 6276, "time_per_iteration": 2.712554693222046 }, { "auxiliary_loss_clip": 0.01140624, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.05054903, "balance_loss_mlp": 1.02629781, "epoch": 0.754764624541574, "flos": 23696159080320.0, "grad_norm": 2.8191472437820337, "language_loss": 0.78410959, "learning_rate": 5.982339458593361e-07, "loss": 0.80584812, "num_input_tokens_seen": 134930445, "step": 6277, "time_per_iteration": 2.6651973724365234 }, { "auxiliary_loss_clip": 0.01149377, "auxiliary_loss_mlp": 0.00711405, "balance_loss_clip": 1.04859996, "balance_loss_mlp": 1.00070047, "epoch": 0.7548848674322131, "flos": 25337204766720.0, "grad_norm": 1.5723216801048303, "language_loss": 0.84185416, "learning_rate": 5.976784302357767e-07, "loss": 0.86046201, "num_input_tokens_seen": 134951010, "step": 6278, "time_per_iteration": 2.724182367324829 }, { "auxiliary_loss_clip": 0.01156784, "auxiliary_loss_mlp": 0.01022897, "balance_loss_clip": 1.04880047, "balance_loss_mlp": 1.01607227, "epoch": 0.7550051103228521, "flos": 19573147428480.0, "grad_norm": 2.082741249787157, "language_loss": 0.73178804, "learning_rate": 5.971231273432855e-07, "loss": 0.75358486, "num_input_tokens_seen": 134970495, "step": 6279, "time_per_iteration": 2.5734477043151855 }, { "auxiliary_loss_clip": 0.01066222, "auxiliary_loss_mlp": 0.01001757, "balance_loss_clip": 1.02457809, "balance_loss_mlp": 1.00068426, "epoch": 0.7551253532134913, "flos": 64150068648960.0, "grad_norm": 0.8100672906262452, "language_loss": 0.54511011, "learning_rate": 5.965680372661e-07, "loss": 0.56578994, "num_input_tokens_seen": 135028060, "step": 6280, "time_per_iteration": 3.11002254486084 }, { "auxiliary_loss_clip": 0.01136129, "auxiliary_loss_mlp": 0.01023932, "balance_loss_clip": 1.04682469, "balance_loss_mlp": 1.01724112, "epoch": 0.7552455961041303, "flos": 26067986968320.0, "grad_norm": 2.3698505640043797, "language_loss": 0.55922967, "learning_rate": 5.960131600884266e-07, "loss": 0.58083034, "num_input_tokens_seen": 135047330, "step": 6281, "time_per_iteration": 2.659832000732422 }, { "auxiliary_loss_clip": 0.01124507, "auxiliary_loss_mlp": 0.01023825, "balance_loss_clip": 1.04630101, "balance_loss_mlp": 1.01711679, "epoch": 0.7553658389947694, "flos": 24498223822080.0, "grad_norm": 1.7539868015003384, "language_loss": 0.76188284, "learning_rate": 5.954584958944413e-07, "loss": 0.78336614, "num_input_tokens_seen": 135065995, "step": 6282, "time_per_iteration": 2.7350480556488037 }, { "auxiliary_loss_clip": 0.01124447, "auxiliary_loss_mlp": 0.00711598, "balance_loss_clip": 1.0445919, "balance_loss_mlp": 1.00073826, "epoch": 0.7554860818854086, "flos": 21799465320960.0, "grad_norm": 3.8032439705251258, "language_loss": 0.81619096, "learning_rate": 5.949040447682854e-07, "loss": 0.83455139, "num_input_tokens_seen": 135085820, "step": 6283, "time_per_iteration": 2.678255558013916 }, { "auxiliary_loss_clip": 0.01144314, "auxiliary_loss_mlp": 0.01025841, "balance_loss_clip": 1.04855025, "balance_loss_mlp": 1.01810479, "epoch": 0.7556063247760476, "flos": 16362123114240.0, "grad_norm": 2.4012429672602433, "language_loss": 0.6871205, "learning_rate": 5.943498067940686e-07, "loss": 0.70882207, "num_input_tokens_seen": 135102845, "step": 6284, "time_per_iteration": 2.5886430740356445 }, { "auxiliary_loss_clip": 0.01136022, "auxiliary_loss_mlp": 0.01027659, "balance_loss_clip": 1.05255055, "balance_loss_mlp": 1.02070296, "epoch": 0.7557265676666867, "flos": 27235155502080.0, "grad_norm": 1.782175030394846, "language_loss": 0.81474894, "learning_rate": 5.937957820558686e-07, "loss": 0.83638579, "num_input_tokens_seen": 135122190, "step": 6285, "time_per_iteration": 2.7518515586853027 }, { "auxiliary_loss_clip": 0.0105476, "auxiliary_loss_mlp": 0.01002468, "balance_loss_clip": 1.02342916, "balance_loss_mlp": 1.00147867, "epoch": 0.7558468105573258, "flos": 62189131415040.0, "grad_norm": 0.8527819082990798, "language_loss": 0.65306741, "learning_rate": 5.932419706377296e-07, "loss": 0.67363966, "num_input_tokens_seen": 135180495, "step": 6286, "time_per_iteration": 3.1889357566833496 }, { "auxiliary_loss_clip": 0.01117171, "auxiliary_loss_mlp": 0.01022267, "balance_loss_clip": 1.04680324, "balance_loss_mlp": 1.01485765, "epoch": 0.7559670534479649, "flos": 33249078823680.0, "grad_norm": 1.8442949695404316, "language_loss": 0.74244243, "learning_rate": 5.92688372623666e-07, "loss": 0.7638368, "num_input_tokens_seen": 135199200, "step": 6287, "time_per_iteration": 2.780602216720581 }, { "auxiliary_loss_clip": 0.01154195, "auxiliary_loss_mlp": 0.01030861, "balance_loss_clip": 1.04794621, "balance_loss_mlp": 1.02342534, "epoch": 0.7560872963386039, "flos": 14064379027200.0, "grad_norm": 2.5543270510729728, "language_loss": 0.73895842, "learning_rate": 5.921349880976574e-07, "loss": 0.760809, "num_input_tokens_seen": 135217035, "step": 6288, "time_per_iteration": 2.588547468185425 }, { "auxiliary_loss_clip": 0.01141593, "auxiliary_loss_mlp": 0.00711931, "balance_loss_clip": 1.04667783, "balance_loss_mlp": 1.00078773, "epoch": 0.7562075392292431, "flos": 20412307941120.0, "grad_norm": 1.7753446547223395, "language_loss": 0.82148397, "learning_rate": 5.915818171436515e-07, "loss": 0.84001923, "num_input_tokens_seen": 135236370, "step": 6289, "time_per_iteration": 2.6565399169921875 }, { "auxiliary_loss_clip": 0.01135094, "auxiliary_loss_mlp": 0.01029164, "balance_loss_clip": 1.0425868, "balance_loss_mlp": 1.02155268, "epoch": 0.7563277821198822, "flos": 20376792368640.0, "grad_norm": 1.880813475158201, "language_loss": 0.7479232, "learning_rate": 5.910288598455642e-07, "loss": 0.76956582, "num_input_tokens_seen": 135255720, "step": 6290, "time_per_iteration": 2.614645481109619 }, { "auxiliary_loss_clip": 0.01161486, "auxiliary_loss_mlp": 0.01029776, "balance_loss_clip": 1.05043042, "balance_loss_mlp": 1.02184224, "epoch": 0.7564480250105212, "flos": 18588261438720.0, "grad_norm": 2.241762401099263, "language_loss": 0.74437928, "learning_rate": 5.90476116287278e-07, "loss": 0.76629186, "num_input_tokens_seen": 135273320, "step": 6291, "time_per_iteration": 2.6103851795196533 }, { "auxiliary_loss_clip": 0.01143397, "auxiliary_loss_mlp": 0.01024029, "balance_loss_clip": 1.05217171, "balance_loss_mlp": 1.01690078, "epoch": 0.7565682679011604, "flos": 21215521918080.0, "grad_norm": 2.1940354058744274, "language_loss": 0.6788981, "learning_rate": 5.899235865526456e-07, "loss": 0.70057237, "num_input_tokens_seen": 135292615, "step": 6292, "time_per_iteration": 2.6723556518554688 }, { "auxiliary_loss_clip": 0.01115549, "auxiliary_loss_mlp": 0.01020887, "balance_loss_clip": 1.04504061, "balance_loss_mlp": 1.01399922, "epoch": 0.7566885107917994, "flos": 20449008662400.0, "grad_norm": 1.9886343796807593, "language_loss": 0.82823974, "learning_rate": 5.893712707254825e-07, "loss": 0.84960413, "num_input_tokens_seen": 135310075, "step": 6293, "time_per_iteration": 2.7962257862091064 }, { "auxiliary_loss_clip": 0.0110279, "auxiliary_loss_mlp": 0.01023505, "balance_loss_clip": 1.04152393, "balance_loss_mlp": 1.01588178, "epoch": 0.7568087536824385, "flos": 19025832919680.0, "grad_norm": 3.5968302878824274, "language_loss": 0.66499853, "learning_rate": 5.888191688895769e-07, "loss": 0.68626148, "num_input_tokens_seen": 135327335, "step": 6294, "time_per_iteration": 2.7107162475585938 }, { "auxiliary_loss_clip": 0.01170545, "auxiliary_loss_mlp": 0.01026194, "balance_loss_clip": 1.04809868, "balance_loss_mlp": 1.01875854, "epoch": 0.7569289965730777, "flos": 15225442248960.0, "grad_norm": 2.497638418454557, "language_loss": 0.625911, "learning_rate": 5.882672811286813e-07, "loss": 0.64787835, "num_input_tokens_seen": 135343615, "step": 6295, "time_per_iteration": 2.5396227836608887 }, { "auxiliary_loss_clip": 0.01172501, "auxiliary_loss_mlp": 0.01023868, "balance_loss_clip": 1.05057704, "balance_loss_mlp": 1.0165844, "epoch": 0.7570492394637167, "flos": 20769367086720.0, "grad_norm": 2.3437344257811534, "language_loss": 0.69531035, "learning_rate": 5.877156075265166e-07, "loss": 0.71727401, "num_input_tokens_seen": 135359880, "step": 6296, "time_per_iteration": 3.5457377433776855 }, { "auxiliary_loss_clip": 0.01134899, "auxiliary_loss_mlp": 0.01025018, "balance_loss_clip": 1.04417944, "balance_loss_mlp": 1.01795459, "epoch": 0.7571694823543558, "flos": 15664091137920.0, "grad_norm": 3.762767932034609, "language_loss": 0.69934523, "learning_rate": 5.871641481667715e-07, "loss": 0.7209444, "num_input_tokens_seen": 135374325, "step": 6297, "time_per_iteration": 4.438681602478027 }, { "auxiliary_loss_clip": 0.01110326, "auxiliary_loss_mlp": 0.01025672, "balance_loss_clip": 1.04453278, "balance_loss_mlp": 1.01847172, "epoch": 0.7572897252449949, "flos": 25409241492480.0, "grad_norm": 3.478321170516053, "language_loss": 0.84233868, "learning_rate": 5.866129031331011e-07, "loss": 0.8636986, "num_input_tokens_seen": 135393980, "step": 6298, "time_per_iteration": 2.8250675201416016 }, { "auxiliary_loss_clip": 0.01138458, "auxiliary_loss_mlp": 0.0102899, "balance_loss_clip": 1.04463768, "balance_loss_mlp": 1.02178431, "epoch": 0.757409968135634, "flos": 24279348297600.0, "grad_norm": 6.488446157119565, "language_loss": 0.83386683, "learning_rate": 5.8606187250913e-07, "loss": 0.85554135, "num_input_tokens_seen": 135412030, "step": 6299, "time_per_iteration": 3.6734066009521484 }, { "auxiliary_loss_clip": 0.01154624, "auxiliary_loss_mlp": 0.00711627, "balance_loss_clip": 1.05110264, "balance_loss_mlp": 1.00076747, "epoch": 0.757530211026273, "flos": 24133766474880.0, "grad_norm": 1.9076255495247794, "language_loss": 0.8418923, "learning_rate": 5.855110563784482e-07, "loss": 0.86055481, "num_input_tokens_seen": 135430565, "step": 6300, "time_per_iteration": 2.7274367809295654 }, { "auxiliary_loss_clip": 0.01147294, "auxiliary_loss_mlp": 0.00711351, "balance_loss_clip": 1.04591179, "balance_loss_mlp": 1.00080228, "epoch": 0.7576504539169122, "flos": 23951807153280.0, "grad_norm": 3.3383670894863284, "language_loss": 0.64339733, "learning_rate": 5.849604548246156e-07, "loss": 0.66198379, "num_input_tokens_seen": 135451675, "step": 6301, "time_per_iteration": 2.74814772605896 }, { "auxiliary_loss_clip": 0.01146732, "auxiliary_loss_mlp": 0.00711823, "balance_loss_clip": 1.0497874, "balance_loss_mlp": 1.00077653, "epoch": 0.7577706968075513, "flos": 21251360712960.0, "grad_norm": 3.9546184716629535, "language_loss": 0.80330354, "learning_rate": 5.844100679311565e-07, "loss": 0.8218891, "num_input_tokens_seen": 135470635, "step": 6302, "time_per_iteration": 2.708876848220825 }, { "auxiliary_loss_clip": 0.01136893, "auxiliary_loss_mlp": 0.01022378, "balance_loss_clip": 1.04691267, "balance_loss_mlp": 1.01499903, "epoch": 0.7578909396981903, "flos": 18296595002880.0, "grad_norm": 2.5734281995723562, "language_loss": 0.76363909, "learning_rate": 5.838598957815637e-07, "loss": 0.78523183, "num_input_tokens_seen": 135487865, "step": 6303, "time_per_iteration": 2.670513391494751 }, { "auxiliary_loss_clip": 0.01133979, "auxiliary_loss_mlp": 0.0102643, "balance_loss_clip": 1.04719329, "balance_loss_mlp": 1.01916695, "epoch": 0.7580111825888295, "flos": 25373869574400.0, "grad_norm": 1.994455358477874, "language_loss": 0.85544729, "learning_rate": 5.833099384592996e-07, "loss": 0.87705135, "num_input_tokens_seen": 135508440, "step": 6304, "time_per_iteration": 2.7200827598571777 }, { "auxiliary_loss_clip": 0.01135131, "auxiliary_loss_mlp": 0.01027332, "balance_loss_clip": 1.04603124, "balance_loss_mlp": 1.01967835, "epoch": 0.7581314254794685, "flos": 23768662682880.0, "grad_norm": 2.0842957084784515, "language_loss": 0.71709031, "learning_rate": 5.827601960477913e-07, "loss": 0.73871493, "num_input_tokens_seen": 135526365, "step": 6305, "time_per_iteration": 2.6760079860687256 }, { "auxiliary_loss_clip": 0.01150816, "auxiliary_loss_mlp": 0.01024803, "balance_loss_clip": 1.04719973, "balance_loss_mlp": 1.0180794, "epoch": 0.7582516683701076, "flos": 22054610603520.0, "grad_norm": 2.1570924720253313, "language_loss": 0.70632827, "learning_rate": 5.822106686304344e-07, "loss": 0.72808444, "num_input_tokens_seen": 135545655, "step": 6306, "time_per_iteration": 2.7069616317749023 }, { "auxiliary_loss_clip": 0.01125922, "auxiliary_loss_mlp": 0.01031578, "balance_loss_clip": 1.04491544, "balance_loss_mlp": 1.02433014, "epoch": 0.7583719112607467, "flos": 31649725848960.0, "grad_norm": 1.699180020215592, "language_loss": 0.58070397, "learning_rate": 5.816613562905919e-07, "loss": 0.60227901, "num_input_tokens_seen": 135566840, "step": 6307, "time_per_iteration": 2.776074171066284 }, { "auxiliary_loss_clip": 0.01119698, "auxiliary_loss_mlp": 0.01025892, "balance_loss_clip": 1.04802966, "balance_loss_mlp": 1.01909983, "epoch": 0.7584921541513858, "flos": 33068376478080.0, "grad_norm": 1.7801577939528415, "language_loss": 0.70087361, "learning_rate": 5.811122591115933e-07, "loss": 0.72232956, "num_input_tokens_seen": 135587825, "step": 6308, "time_per_iteration": 2.785893678665161 }, { "auxiliary_loss_clip": 0.01124519, "auxiliary_loss_mlp": 0.0103046, "balance_loss_clip": 1.04938459, "balance_loss_mlp": 1.0228126, "epoch": 0.7586123970420249, "flos": 23326350606720.0, "grad_norm": 2.6126234233827, "language_loss": 0.71669263, "learning_rate": 5.805633771767376e-07, "loss": 0.73824239, "num_input_tokens_seen": 135605220, "step": 6309, "time_per_iteration": 2.644073009490967 }, { "auxiliary_loss_clip": 0.01134708, "auxiliary_loss_mlp": 0.01026642, "balance_loss_clip": 1.04584682, "balance_loss_mlp": 1.02021956, "epoch": 0.7587326399326639, "flos": 18334229477760.0, "grad_norm": 2.7390497831476592, "language_loss": 0.77439433, "learning_rate": 5.800147105692888e-07, "loss": 0.79600787, "num_input_tokens_seen": 135624795, "step": 6310, "time_per_iteration": 2.6725921630859375 }, { "auxiliary_loss_clip": 0.01153234, "auxiliary_loss_mlp": 0.01027228, "balance_loss_clip": 1.04550731, "balance_loss_mlp": 1.02048719, "epoch": 0.7588528828233031, "flos": 17275080119040.0, "grad_norm": 1.7492110434949784, "language_loss": 0.79198146, "learning_rate": 5.794662593724795e-07, "loss": 0.81378609, "num_input_tokens_seen": 135643800, "step": 6311, "time_per_iteration": 2.555227518081665 }, { "auxiliary_loss_clip": 0.0117155, "auxiliary_loss_mlp": 0.01025899, "balance_loss_clip": 1.05223453, "balance_loss_mlp": 1.01827598, "epoch": 0.7589731257139422, "flos": 17713621267200.0, "grad_norm": 2.6493359746229883, "language_loss": 0.74893832, "learning_rate": 5.789180236695091e-07, "loss": 0.77091289, "num_input_tokens_seen": 135660655, "step": 6312, "time_per_iteration": 2.5811173915863037 }, { "auxiliary_loss_clip": 0.01149182, "auxiliary_loss_mlp": 0.01020492, "balance_loss_clip": 1.0488565, "balance_loss_mlp": 1.01366448, "epoch": 0.7590933686045812, "flos": 15961072786560.0, "grad_norm": 1.9861878070134376, "language_loss": 0.85182124, "learning_rate": 5.78370003543544e-07, "loss": 0.87351793, "num_input_tokens_seen": 135679410, "step": 6313, "time_per_iteration": 2.5563158988952637 }, { "auxiliary_loss_clip": 0.01155743, "auxiliary_loss_mlp": 0.0071119, "balance_loss_clip": 1.04855919, "balance_loss_mlp": 1.00070763, "epoch": 0.7592136114952204, "flos": 21068072588160.0, "grad_norm": 1.983616732300648, "language_loss": 0.83539027, "learning_rate": 5.778221990777203e-07, "loss": 0.85405958, "num_input_tokens_seen": 135697150, "step": 6314, "time_per_iteration": 2.624255895614624 }, { "auxiliary_loss_clip": 0.01140331, "auxiliary_loss_mlp": 0.01028398, "balance_loss_clip": 1.04926264, "balance_loss_mlp": 1.02099562, "epoch": 0.7593338543858594, "flos": 25297666871040.0, "grad_norm": 2.22489427101533, "language_loss": 0.82953751, "learning_rate": 5.772746103551372e-07, "loss": 0.85122478, "num_input_tokens_seen": 135712545, "step": 6315, "time_per_iteration": 2.654843807220459 }, { "auxiliary_loss_clip": 0.01134445, "auxiliary_loss_mlp": 0.01023847, "balance_loss_clip": 1.04651201, "balance_loss_mlp": 1.01651835, "epoch": 0.7594540972764985, "flos": 31832367528960.0, "grad_norm": 3.5137299105420094, "language_loss": 0.7197094, "learning_rate": 5.767272374588648e-07, "loss": 0.74129236, "num_input_tokens_seen": 135733950, "step": 6316, "time_per_iteration": 2.727969169616699 }, { "auxiliary_loss_clip": 0.01154806, "auxiliary_loss_mlp": 0.01029132, "balance_loss_clip": 1.05033636, "balance_loss_mlp": 1.02204478, "epoch": 0.7595743401671377, "flos": 37597250880000.0, "grad_norm": 1.7423453361312058, "language_loss": 0.78210473, "learning_rate": 5.76180080471939e-07, "loss": 0.80394405, "num_input_tokens_seen": 135757120, "step": 6317, "time_per_iteration": 2.763275384902954 }, { "auxiliary_loss_clip": 0.01172954, "auxiliary_loss_mlp": 0.01027278, "balance_loss_clip": 1.04991639, "balance_loss_mlp": 1.01979184, "epoch": 0.7596945830577767, "flos": 18287724343680.0, "grad_norm": 2.1405214606330563, "language_loss": 0.72595668, "learning_rate": 5.756331394773631e-07, "loss": 0.74795902, "num_input_tokens_seen": 135773335, "step": 6318, "time_per_iteration": 2.5405263900756836 }, { "auxiliary_loss_clip": 0.01090462, "auxiliary_loss_mlp": 0.00711787, "balance_loss_clip": 1.04092586, "balance_loss_mlp": 1.00074458, "epoch": 0.7598148259484158, "flos": 22233122219520.0, "grad_norm": 1.8614383651792756, "language_loss": 0.76334816, "learning_rate": 5.750864145581071e-07, "loss": 0.78137064, "num_input_tokens_seen": 135792555, "step": 6319, "time_per_iteration": 2.7510452270507812 }, { "auxiliary_loss_clip": 0.01171167, "auxiliary_loss_mlp": 0.01023641, "balance_loss_clip": 1.05102909, "balance_loss_mlp": 1.0168879, "epoch": 0.7599350688390549, "flos": 27161718145920.0, "grad_norm": 6.019804662904669, "language_loss": 0.85797971, "learning_rate": 5.745399057971085e-07, "loss": 0.87992781, "num_input_tokens_seen": 135813690, "step": 6320, "time_per_iteration": 2.6211137771606445 }, { "auxiliary_loss_clip": 0.0115731, "auxiliary_loss_mlp": 0.0102593, "balance_loss_clip": 1.04894674, "balance_loss_mlp": 1.01865804, "epoch": 0.760055311729694, "flos": 15560704817280.0, "grad_norm": 2.0095714115193113, "language_loss": 0.75440693, "learning_rate": 5.739936132772738e-07, "loss": 0.77623928, "num_input_tokens_seen": 135832255, "step": 6321, "time_per_iteration": 2.6087193489074707 }, { "auxiliary_loss_clip": 0.01168211, "auxiliary_loss_mlp": 0.0103018, "balance_loss_clip": 1.04897094, "balance_loss_mlp": 1.02245498, "epoch": 0.760175554620333, "flos": 25155496840320.0, "grad_norm": 2.9636447759334628, "language_loss": 0.74253809, "learning_rate": 5.734475370814733e-07, "loss": 0.76452196, "num_input_tokens_seen": 135851935, "step": 6322, "time_per_iteration": 3.528944969177246 }, { "auxiliary_loss_clip": 0.01156255, "auxiliary_loss_mlp": 0.01022781, "balance_loss_clip": 1.04753137, "balance_loss_mlp": 1.01572967, "epoch": 0.7602957975109722, "flos": 24353791234560.0, "grad_norm": 1.86970346749941, "language_loss": 0.78625786, "learning_rate": 5.729016772925483e-07, "loss": 0.80804825, "num_input_tokens_seen": 135873510, "step": 6323, "time_per_iteration": 4.523828506469727 }, { "auxiliary_loss_clip": 0.01102561, "auxiliary_loss_mlp": 0.01025498, "balance_loss_clip": 1.04283535, "balance_loss_mlp": 1.01800001, "epoch": 0.7604160404016113, "flos": 25192664438400.0, "grad_norm": 2.895067811459626, "language_loss": 0.71007681, "learning_rate": 5.723560339933038e-07, "loss": 0.7313574, "num_input_tokens_seen": 135893845, "step": 6324, "time_per_iteration": 2.8013906478881836 }, { "auxiliary_loss_clip": 0.01151012, "auxiliary_loss_mlp": 0.0071189, "balance_loss_clip": 1.04637265, "balance_loss_mlp": 1.00075817, "epoch": 0.7605362832922503, "flos": 29861841363840.0, "grad_norm": 2.124472972696273, "language_loss": 0.6547913, "learning_rate": 5.71810607266513e-07, "loss": 0.67342031, "num_input_tokens_seen": 135912430, "step": 6325, "time_per_iteration": 3.5937132835388184 }, { "auxiliary_loss_clip": 0.01155695, "auxiliary_loss_mlp": 0.01022927, "balance_loss_clip": 1.0477097, "balance_loss_mlp": 1.01585829, "epoch": 0.7606565261828895, "flos": 13917935278080.0, "grad_norm": 1.9590291868639769, "language_loss": 0.60343611, "learning_rate": 5.712653971949184e-07, "loss": 0.62522233, "num_input_tokens_seen": 135930550, "step": 6326, "time_per_iteration": 2.5957624912261963 }, { "auxiliary_loss_clip": 0.01148733, "auxiliary_loss_mlp": 0.0102385, "balance_loss_clip": 1.04701328, "balance_loss_mlp": 1.01651263, "epoch": 0.7607767690735285, "flos": 18551273408640.0, "grad_norm": 2.462369472717495, "language_loss": 0.7546339, "learning_rate": 5.707204038612268e-07, "loss": 0.77635974, "num_input_tokens_seen": 135947980, "step": 6327, "time_per_iteration": 2.603469133377075 }, { "auxiliary_loss_clip": 0.01149879, "auxiliary_loss_mlp": 0.01029667, "balance_loss_clip": 1.05604982, "balance_loss_mlp": 1.02164459, "epoch": 0.7608970119641676, "flos": 20922993555840.0, "grad_norm": 2.3836288732939104, "language_loss": 0.7355386, "learning_rate": 5.701756273481138e-07, "loss": 0.75733411, "num_input_tokens_seen": 135965400, "step": 6328, "time_per_iteration": 2.5928306579589844 }, { "auxiliary_loss_clip": 0.01141567, "auxiliary_loss_mlp": 0.01023583, "balance_loss_clip": 1.04645288, "balance_loss_mlp": 1.016559, "epoch": 0.7610172548548068, "flos": 23807302738560.0, "grad_norm": 1.6374184127333076, "language_loss": 0.74135715, "learning_rate": 5.696310677382212e-07, "loss": 0.76300865, "num_input_tokens_seen": 135986795, "step": 6329, "time_per_iteration": 2.675144672393799 }, { "auxiliary_loss_clip": 0.01039529, "auxiliary_loss_mlp": 0.01004499, "balance_loss_clip": 1.02519417, "balance_loss_mlp": 1.00337827, "epoch": 0.7611374977454458, "flos": 66496580426880.0, "grad_norm": 0.8571276957105034, "language_loss": 0.61693424, "learning_rate": 5.690867251141576e-07, "loss": 0.63737452, "num_input_tokens_seen": 136053450, "step": 6330, "time_per_iteration": 3.373344659805298 }, { "auxiliary_loss_clip": 0.01159851, "auxiliary_loss_mlp": 0.01026615, "balance_loss_clip": 1.04782295, "balance_loss_mlp": 1.01909864, "epoch": 0.7612577406360849, "flos": 15633136592640.0, "grad_norm": 3.720468192376637, "language_loss": 0.91887093, "learning_rate": 5.685425995585013e-07, "loss": 0.94073558, "num_input_tokens_seen": 136071375, "step": 6331, "time_per_iteration": 2.6277530193328857 }, { "auxiliary_loss_clip": 0.01052932, "auxiliary_loss_mlp": 0.01000996, "balance_loss_clip": 1.02179384, "balance_loss_mlp": 0.99997634, "epoch": 0.761377983526724, "flos": 60526253237760.0, "grad_norm": 0.7601246107066153, "language_loss": 0.58921552, "learning_rate": 5.679986911537935e-07, "loss": 0.6097548, "num_input_tokens_seen": 136138905, "step": 6332, "time_per_iteration": 3.389810562133789 }, { "auxiliary_loss_clip": 0.01096896, "auxiliary_loss_mlp": 0.01025971, "balance_loss_clip": 1.04434097, "balance_loss_mlp": 1.01841962, "epoch": 0.7614982264173631, "flos": 35772522019200.0, "grad_norm": 1.9732577233459534, "language_loss": 0.67078084, "learning_rate": 5.674549999825462e-07, "loss": 0.69200951, "num_input_tokens_seen": 136161720, "step": 6333, "time_per_iteration": 2.810640335083008 }, { "auxiliary_loss_clip": 0.01066954, "auxiliary_loss_mlp": 0.01005148, "balance_loss_clip": 1.02346635, "balance_loss_mlp": 1.00395036, "epoch": 0.7616184693080021, "flos": 67925502345600.0, "grad_norm": 1.060474012309751, "language_loss": 0.71384561, "learning_rate": 5.669115261272363e-07, "loss": 0.73456663, "num_input_tokens_seen": 136222040, "step": 6334, "time_per_iteration": 3.207794189453125 }, { "auxiliary_loss_clip": 0.01155218, "auxiliary_loss_mlp": 0.01028176, "balance_loss_clip": 1.04924273, "balance_loss_mlp": 1.02028477, "epoch": 0.7617387121986413, "flos": 20521979141760.0, "grad_norm": 2.4293011581044603, "language_loss": 0.72806871, "learning_rate": 5.663682696703081e-07, "loss": 0.74990267, "num_input_tokens_seen": 136240305, "step": 6335, "time_per_iteration": 2.5697391033172607 }, { "auxiliary_loss_clip": 0.01170532, "auxiliary_loss_mlp": 0.0102577, "balance_loss_clip": 1.05130887, "balance_loss_mlp": 1.0191505, "epoch": 0.7618589550892804, "flos": 18624495283200.0, "grad_norm": 2.2333564615536345, "language_loss": 0.8220709, "learning_rate": 5.658252306941746e-07, "loss": 0.84403396, "num_input_tokens_seen": 136259625, "step": 6336, "time_per_iteration": 2.5775270462036133 }, { "auxiliary_loss_clip": 0.01112876, "auxiliary_loss_mlp": 0.01033882, "balance_loss_clip": 1.04696393, "balance_loss_mlp": 1.02607965, "epoch": 0.7619791979799194, "flos": 17453735389440.0, "grad_norm": 2.05532331184893, "language_loss": 0.75437498, "learning_rate": 5.65282409281212e-07, "loss": 0.77584255, "num_input_tokens_seen": 136277090, "step": 6337, "time_per_iteration": 2.638930559158325 }, { "auxiliary_loss_clip": 0.01135493, "auxiliary_loss_mlp": 0.01028188, "balance_loss_clip": 1.04677618, "balance_loss_mlp": 1.0204277, "epoch": 0.7620994408705585, "flos": 14137421333760.0, "grad_norm": 2.2449861097031523, "language_loss": 0.70183414, "learning_rate": 5.64739805513768e-07, "loss": 0.72347093, "num_input_tokens_seen": 136294635, "step": 6338, "time_per_iteration": 2.699159860610962 }, { "auxiliary_loss_clip": 0.01059931, "auxiliary_loss_mlp": 0.00701902, "balance_loss_clip": 1.01948237, "balance_loss_mlp": 1.00012386, "epoch": 0.7622196837611976, "flos": 70708792527360.0, "grad_norm": 0.7899646961377099, "language_loss": 0.55712211, "learning_rate": 5.641974194741541e-07, "loss": 0.57474053, "num_input_tokens_seen": 136350320, "step": 6339, "time_per_iteration": 3.111583948135376 }, { "auxiliary_loss_clip": 0.01053061, "auxiliary_loss_mlp": 0.01005784, "balance_loss_clip": 1.03370142, "balance_loss_mlp": 1.0048784, "epoch": 0.7623399266518367, "flos": 60684150447360.0, "grad_norm": 0.7861399125657302, "language_loss": 0.63728791, "learning_rate": 5.636552512446502e-07, "loss": 0.65787637, "num_input_tokens_seen": 136411375, "step": 6340, "time_per_iteration": 3.183398962020874 }, { "auxiliary_loss_clip": 0.01149468, "auxiliary_loss_mlp": 0.0102307, "balance_loss_clip": 1.0465765, "balance_loss_mlp": 1.01607203, "epoch": 0.7624601695424758, "flos": 26468893641600.0, "grad_norm": 2.6205530481237833, "language_loss": 0.78215533, "learning_rate": 5.631133009075027e-07, "loss": 0.80388075, "num_input_tokens_seen": 136430560, "step": 6341, "time_per_iteration": 2.7126760482788086 }, { "auxiliary_loss_clip": 0.01154308, "auxiliary_loss_mlp": 0.0071128, "balance_loss_clip": 1.04833841, "balance_loss_mlp": 1.00077379, "epoch": 0.7625804124331149, "flos": 19135755515520.0, "grad_norm": 2.1035068896905127, "language_loss": 0.68892574, "learning_rate": 5.625715685449242e-07, "loss": 0.70758158, "num_input_tokens_seen": 136448665, "step": 6342, "time_per_iteration": 2.6656670570373535 }, { "auxiliary_loss_clip": 0.01121623, "auxiliary_loss_mlp": 0.01031404, "balance_loss_clip": 1.04976273, "balance_loss_mlp": 1.02469254, "epoch": 0.762700655323754, "flos": 26213101914240.0, "grad_norm": 1.7143641136807786, "language_loss": 0.71507972, "learning_rate": 5.620300542390966e-07, "loss": 0.73661005, "num_input_tokens_seen": 136469710, "step": 6343, "time_per_iteration": 2.8817834854125977 }, { "auxiliary_loss_clip": 0.01133769, "auxiliary_loss_mlp": 0.01026028, "balance_loss_clip": 1.04505944, "balance_loss_mlp": 1.01939976, "epoch": 0.762820898214393, "flos": 22382582711040.0, "grad_norm": 2.586178134068807, "language_loss": 0.85489464, "learning_rate": 5.614887580721659e-07, "loss": 0.87649256, "num_input_tokens_seen": 136489855, "step": 6344, "time_per_iteration": 2.6799049377441406 }, { "auxiliary_loss_clip": 0.01117589, "auxiliary_loss_mlp": 0.01026766, "balance_loss_clip": 1.04774261, "balance_loss_mlp": 1.01962221, "epoch": 0.7629411411050322, "flos": 15700504550400.0, "grad_norm": 2.3526423234903917, "language_loss": 0.74038792, "learning_rate": 5.609476801262481e-07, "loss": 0.7618314, "num_input_tokens_seen": 136504715, "step": 6345, "time_per_iteration": 2.7157368659973145 }, { "auxiliary_loss_clip": 0.01117082, "auxiliary_loss_mlp": 0.0102604, "balance_loss_clip": 1.04449809, "balance_loss_mlp": 1.01875651, "epoch": 0.7630613839956712, "flos": 13770342293760.0, "grad_norm": 3.652691965303602, "language_loss": 0.64670807, "learning_rate": 5.604068204834223e-07, "loss": 0.66813922, "num_input_tokens_seen": 136521610, "step": 6346, "time_per_iteration": 2.6479380130767822 }, { "auxiliary_loss_clip": 0.01104773, "auxiliary_loss_mlp": 0.00711972, "balance_loss_clip": 1.0431869, "balance_loss_mlp": 1.00073791, "epoch": 0.7631816268863103, "flos": 14569569861120.0, "grad_norm": 2.664208002194778, "language_loss": 0.76830959, "learning_rate": 5.598661792257367e-07, "loss": 0.78647709, "num_input_tokens_seen": 136538655, "step": 6347, "time_per_iteration": 2.774571657180786 }, { "auxiliary_loss_clip": 0.01154957, "auxiliary_loss_mlp": 0.01018827, "balance_loss_clip": 1.04889488, "balance_loss_mlp": 1.01221085, "epoch": 0.7633018697769495, "flos": 19062210418560.0, "grad_norm": 2.042065835358883, "language_loss": 0.75973332, "learning_rate": 5.593257564352071e-07, "loss": 0.78147113, "num_input_tokens_seen": 136557095, "step": 6348, "time_per_iteration": 3.593777656555176 }, { "auxiliary_loss_clip": 0.01154039, "auxiliary_loss_mlp": 0.01021722, "balance_loss_clip": 1.04995358, "balance_loss_mlp": 1.01487339, "epoch": 0.7634221126675885, "flos": 22052958577920.0, "grad_norm": 1.5901755633014045, "language_loss": 0.75785768, "learning_rate": 5.58785552193815e-07, "loss": 0.77961528, "num_input_tokens_seen": 136577340, "step": 6349, "time_per_iteration": 4.403176307678223 }, { "auxiliary_loss_clip": 0.01171016, "auxiliary_loss_mlp": 0.01028896, "balance_loss_clip": 1.05075824, "balance_loss_mlp": 1.02199626, "epoch": 0.7635423555582276, "flos": 29382720825600.0, "grad_norm": 1.8667268468501805, "language_loss": 0.75568068, "learning_rate": 5.582455665835086e-07, "loss": 0.7776798, "num_input_tokens_seen": 136597635, "step": 6350, "time_per_iteration": 2.674502372741699 }, { "auxiliary_loss_clip": 0.01146771, "auxiliary_loss_mlp": 0.01028832, "balance_loss_clip": 1.04617739, "balance_loss_mlp": 1.0209229, "epoch": 0.7636625984488667, "flos": 17784903807360.0, "grad_norm": 3.257508369510131, "language_loss": 0.72879195, "learning_rate": 5.577057996862036e-07, "loss": 0.75054801, "num_input_tokens_seen": 136615260, "step": 6351, "time_per_iteration": 3.574474334716797 }, { "auxiliary_loss_clip": 0.01166826, "auxiliary_loss_mlp": 0.01019224, "balance_loss_clip": 1.04960656, "balance_loss_mlp": 1.01218474, "epoch": 0.7637828413395058, "flos": 23734583654400.0, "grad_norm": 2.1161734199159095, "language_loss": 0.76080191, "learning_rate": 5.571662515837814e-07, "loss": 0.78266239, "num_input_tokens_seen": 136637220, "step": 6352, "time_per_iteration": 2.6258130073547363 }, { "auxiliary_loss_clip": 0.01140068, "auxiliary_loss_mlp": 0.01024233, "balance_loss_clip": 1.04954135, "balance_loss_mlp": 1.01728868, "epoch": 0.7639030842301449, "flos": 36283279461120.0, "grad_norm": 1.8078657462951375, "language_loss": 0.83750522, "learning_rate": 5.566269223580926e-07, "loss": 0.85914826, "num_input_tokens_seen": 136658930, "step": 6353, "time_per_iteration": 2.8298072814941406 }, { "auxiliary_loss_clip": 0.01155424, "auxiliary_loss_mlp": 0.01027959, "balance_loss_clip": 1.04898012, "balance_loss_mlp": 1.02102685, "epoch": 0.764023327120784, "flos": 28878104609280.0, "grad_norm": 1.8830918213092942, "language_loss": 0.75449264, "learning_rate": 5.560878120909511e-07, "loss": 0.77632642, "num_input_tokens_seen": 136681530, "step": 6354, "time_per_iteration": 2.66068959236145 }, { "auxiliary_loss_clip": 0.01063242, "auxiliary_loss_mlp": 0.01005108, "balance_loss_clip": 1.02139544, "balance_loss_mlp": 1.00414228, "epoch": 0.7641435700114231, "flos": 64789711067520.0, "grad_norm": 0.8453430052564104, "language_loss": 0.58507478, "learning_rate": 5.55548920864141e-07, "loss": 0.60575831, "num_input_tokens_seen": 136742185, "step": 6355, "time_per_iteration": 3.2510409355163574 }, { "auxiliary_loss_clip": 0.01151973, "auxiliary_loss_mlp": 0.0103372, "balance_loss_clip": 1.04944408, "balance_loss_mlp": 1.02606368, "epoch": 0.7642638129020621, "flos": 16835784785280.0, "grad_norm": 1.6852027115902546, "language_loss": 0.78037477, "learning_rate": 5.550102487594113e-07, "loss": 0.80223173, "num_input_tokens_seen": 136760855, "step": 6356, "time_per_iteration": 2.60296893119812 }, { "auxiliary_loss_clip": 0.01109576, "auxiliary_loss_mlp": 0.00711598, "balance_loss_clip": 1.04266763, "balance_loss_mlp": 1.00071502, "epoch": 0.7643840557927013, "flos": 30408940391040.0, "grad_norm": 1.694018103274495, "language_loss": 0.71778667, "learning_rate": 5.54471795858477e-07, "loss": 0.73599833, "num_input_tokens_seen": 136780925, "step": 6357, "time_per_iteration": 2.7666726112365723 }, { "auxiliary_loss_clip": 0.01120295, "auxiliary_loss_mlp": 0.01021933, "balance_loss_clip": 1.0421468, "balance_loss_mlp": 1.01486719, "epoch": 0.7645042986833404, "flos": 16983234115200.0, "grad_norm": 2.356534606106758, "language_loss": 0.83014393, "learning_rate": 5.539335622430235e-07, "loss": 0.8515662, "num_input_tokens_seen": 136799545, "step": 6358, "time_per_iteration": 2.608832359313965 }, { "auxiliary_loss_clip": 0.01147079, "auxiliary_loss_mlp": 0.01024624, "balance_loss_clip": 1.04548752, "balance_loss_mlp": 1.01731658, "epoch": 0.7646245415739794, "flos": 17311493531520.0, "grad_norm": 2.562734096816823, "language_loss": 0.74663734, "learning_rate": 5.533955479946975e-07, "loss": 0.7683543, "num_input_tokens_seen": 136818325, "step": 6359, "time_per_iteration": 2.478938579559326 }, { "auxiliary_loss_clip": 0.01042389, "auxiliary_loss_mlp": 0.00701638, "balance_loss_clip": 1.03417039, "balance_loss_mlp": 1.0002383, "epoch": 0.7647447844646186, "flos": 70402332666240.0, "grad_norm": 1.1870365095024105, "language_loss": 0.65673655, "learning_rate": 5.528577531951173e-07, "loss": 0.67417681, "num_input_tokens_seen": 136878730, "step": 6360, "time_per_iteration": 3.1766932010650635 }, { "auxiliary_loss_clip": 0.01142553, "auxiliary_loss_mlp": 0.010321, "balance_loss_clip": 1.04794049, "balance_loss_mlp": 1.02537942, "epoch": 0.7648650273552576, "flos": 17675914965120.0, "grad_norm": 2.060966803144167, "language_loss": 0.73951316, "learning_rate": 5.523201779258653e-07, "loss": 0.76125973, "num_input_tokens_seen": 136897705, "step": 6361, "time_per_iteration": 2.630544900894165 }, { "auxiliary_loss_clip": 0.01168572, "auxiliary_loss_mlp": 0.0102498, "balance_loss_clip": 1.047539, "balance_loss_mlp": 1.01761246, "epoch": 0.7649852702458967, "flos": 22162019247360.0, "grad_norm": 2.022063048082634, "language_loss": 0.83868766, "learning_rate": 5.517828222684912e-07, "loss": 0.86062318, "num_input_tokens_seen": 136918360, "step": 6362, "time_per_iteration": 2.5739221572875977 }, { "auxiliary_loss_clip": 0.01049278, "auxiliary_loss_mlp": 0.01002932, "balance_loss_clip": 1.02092206, "balance_loss_mlp": 1.00191867, "epoch": 0.7651055131365359, "flos": 69848338227840.0, "grad_norm": 0.7692727208157807, "language_loss": 0.59002215, "learning_rate": 5.512456863045117e-07, "loss": 0.61054426, "num_input_tokens_seen": 136979050, "step": 6363, "time_per_iteration": 3.212327718734741 }, { "auxiliary_loss_clip": 0.0116911, "auxiliary_loss_mlp": 0.01024739, "balance_loss_clip": 1.04826152, "balance_loss_mlp": 1.01774168, "epoch": 0.7652257560271749, "flos": 19464014931840.0, "grad_norm": 1.7310864905105097, "language_loss": 0.74254346, "learning_rate": 5.507087701154089e-07, "loss": 0.7644819, "num_input_tokens_seen": 136998970, "step": 6364, "time_per_iteration": 2.574071168899536 }, { "auxiliary_loss_clip": 0.01109485, "auxiliary_loss_mlp": 0.01026584, "balance_loss_clip": 1.04529071, "balance_loss_mlp": 1.01942515, "epoch": 0.765345998917814, "flos": 15961108700160.0, "grad_norm": 3.8752405853692453, "language_loss": 0.75803101, "learning_rate": 5.50172073782634e-07, "loss": 0.77939177, "num_input_tokens_seen": 137016950, "step": 6365, "time_per_iteration": 2.7403554916381836 }, { "auxiliary_loss_clip": 0.01121664, "auxiliary_loss_mlp": 0.01027105, "balance_loss_clip": 1.04759741, "balance_loss_mlp": 1.0200839, "epoch": 0.7654662418084531, "flos": 23659853408640.0, "grad_norm": 1.7493095885527004, "language_loss": 0.87933791, "learning_rate": 5.496355973876023e-07, "loss": 0.90082562, "num_input_tokens_seen": 137036205, "step": 6366, "time_per_iteration": 2.6497511863708496 }, { "auxiliary_loss_clip": 0.01118726, "auxiliary_loss_mlp": 0.00711946, "balance_loss_clip": 1.04338193, "balance_loss_mlp": 1.00066948, "epoch": 0.7655864846990922, "flos": 41463608878080.0, "grad_norm": 1.6820388260306345, "language_loss": 0.70857638, "learning_rate": 5.490993410116984e-07, "loss": 0.72688305, "num_input_tokens_seen": 137059195, "step": 6367, "time_per_iteration": 2.9435572624206543 }, { "auxiliary_loss_clip": 0.01117886, "auxiliary_loss_mlp": 0.01026764, "balance_loss_clip": 1.0459466, "balance_loss_mlp": 1.01979661, "epoch": 0.7657067275897312, "flos": 43142684088960.0, "grad_norm": 1.8899657065763842, "language_loss": 0.69814217, "learning_rate": 5.485633047362704e-07, "loss": 0.71958864, "num_input_tokens_seen": 137081200, "step": 6368, "time_per_iteration": 2.9500155448913574 }, { "auxiliary_loss_clip": 0.01177449, "auxiliary_loss_mlp": 0.010322, "balance_loss_clip": 1.05497348, "balance_loss_mlp": 1.02445745, "epoch": 0.7658269704803703, "flos": 17311780840320.0, "grad_norm": 2.2646499409976237, "language_loss": 0.79115462, "learning_rate": 5.480274886426341e-07, "loss": 0.81325114, "num_input_tokens_seen": 137097840, "step": 6369, "time_per_iteration": 2.596735715866089 }, { "auxiliary_loss_clip": 0.01152785, "auxiliary_loss_mlp": 0.01021727, "balance_loss_clip": 1.05065894, "balance_loss_mlp": 1.01512885, "epoch": 0.7659472133710095, "flos": 12568160977920.0, "grad_norm": 2.301417436087754, "language_loss": 0.7787199, "learning_rate": 5.474918928120744e-07, "loss": 0.80046499, "num_input_tokens_seen": 137114335, "step": 6370, "time_per_iteration": 2.5546348094940186 }, { "auxiliary_loss_clip": 0.01152064, "auxiliary_loss_mlp": 0.01023621, "balance_loss_clip": 1.0489862, "balance_loss_mlp": 1.01624739, "epoch": 0.7660674562616485, "flos": 22707430335360.0, "grad_norm": 2.21279733136507, "language_loss": 0.87324274, "learning_rate": 5.469565173258392e-07, "loss": 0.89499962, "num_input_tokens_seen": 137132850, "step": 6371, "time_per_iteration": 2.6085970401763916 }, { "auxiliary_loss_clip": 0.01174314, "auxiliary_loss_mlp": 0.01027408, "balance_loss_clip": 1.05111241, "balance_loss_mlp": 1.01960611, "epoch": 0.7661876991522876, "flos": 17056455989760.0, "grad_norm": 2.1204251249606787, "language_loss": 0.63918895, "learning_rate": 5.464213622651454e-07, "loss": 0.66120613, "num_input_tokens_seen": 137150665, "step": 6372, "time_per_iteration": 2.5150325298309326 }, { "auxiliary_loss_clip": 0.01126257, "auxiliary_loss_mlp": 0.01027121, "balance_loss_clip": 1.04555249, "balance_loss_mlp": 1.01993263, "epoch": 0.7663079420429267, "flos": 20084228092800.0, "grad_norm": 7.821077787742629, "language_loss": 0.84331548, "learning_rate": 5.458864277111753e-07, "loss": 0.86484921, "num_input_tokens_seen": 137168500, "step": 6373, "time_per_iteration": 2.753274917602539 }, { "auxiliary_loss_clip": 0.01130678, "auxiliary_loss_mlp": 0.0071077, "balance_loss_clip": 1.04387259, "balance_loss_mlp": 1.00060642, "epoch": 0.7664281849335658, "flos": 12677473042560.0, "grad_norm": 3.1442729321574707, "language_loss": 0.68928885, "learning_rate": 5.453517137450769e-07, "loss": 0.70770329, "num_input_tokens_seen": 137185075, "step": 6374, "time_per_iteration": 3.5890321731567383 }, { "auxiliary_loss_clip": 0.01156739, "auxiliary_loss_mlp": 0.01024876, "balance_loss_clip": 1.05133653, "balance_loss_mlp": 1.01774716, "epoch": 0.7665484278242048, "flos": 22345271458560.0, "grad_norm": 1.700315507970363, "language_loss": 0.76123291, "learning_rate": 5.448172204479684e-07, "loss": 0.78304905, "num_input_tokens_seen": 137204355, "step": 6375, "time_per_iteration": 4.469582796096802 }, { "auxiliary_loss_clip": 0.01168559, "auxiliary_loss_mlp": 0.01031958, "balance_loss_clip": 1.05020118, "balance_loss_mlp": 1.02494526, "epoch": 0.766668670714844, "flos": 23617909301760.0, "grad_norm": 1.9931210958955332, "language_loss": 0.74616981, "learning_rate": 5.442829479009294e-07, "loss": 0.76817501, "num_input_tokens_seen": 137223135, "step": 6376, "time_per_iteration": 2.5956809520721436 }, { "auxiliary_loss_clip": 0.01160956, "auxiliary_loss_mlp": 0.01024774, "balance_loss_clip": 1.0495286, "balance_loss_mlp": 1.01695371, "epoch": 0.7667889136054831, "flos": 19427134642560.0, "grad_norm": 4.523702951651932, "language_loss": 0.71898174, "learning_rate": 5.437488961850103e-07, "loss": 0.74083906, "num_input_tokens_seen": 137242935, "step": 6377, "time_per_iteration": 3.5379791259765625 }, { "auxiliary_loss_clip": 0.01101714, "auxiliary_loss_mlp": 0.0102482, "balance_loss_clip": 1.04215503, "balance_loss_mlp": 1.01742053, "epoch": 0.7669091564961221, "flos": 26866352609280.0, "grad_norm": 1.8074631790838025, "language_loss": 0.75583816, "learning_rate": 5.432150653812258e-07, "loss": 0.77710348, "num_input_tokens_seen": 137262970, "step": 6378, "time_per_iteration": 2.7092015743255615 }, { "auxiliary_loss_clip": 0.01150762, "auxiliary_loss_mlp": 0.01019899, "balance_loss_clip": 1.04934752, "balance_loss_mlp": 1.01241899, "epoch": 0.7670293993867613, "flos": 12385303816320.0, "grad_norm": 2.209687031125716, "language_loss": 0.82917476, "learning_rate": 5.42681455570557e-07, "loss": 0.85088134, "num_input_tokens_seen": 137279500, "step": 6379, "time_per_iteration": 2.5902884006500244 }, { "auxiliary_loss_clip": 0.01165416, "auxiliary_loss_mlp": 0.01027536, "balance_loss_clip": 1.04760766, "balance_loss_mlp": 1.02067256, "epoch": 0.7671496422774003, "flos": 21762944167680.0, "grad_norm": 3.8696571838958653, "language_loss": 0.6478852, "learning_rate": 5.42148066833954e-07, "loss": 0.66981477, "num_input_tokens_seen": 137298745, "step": 6380, "time_per_iteration": 2.549283981323242 }, { "auxiliary_loss_clip": 0.01167174, "auxiliary_loss_mlp": 0.01027657, "balance_loss_clip": 1.04895532, "balance_loss_mlp": 1.02053142, "epoch": 0.7672698851680394, "flos": 21069221823360.0, "grad_norm": 2.5664203474855634, "language_loss": 0.75458288, "learning_rate": 5.416148992523289e-07, "loss": 0.77653122, "num_input_tokens_seen": 137317320, "step": 6381, "time_per_iteration": 2.603851318359375 }, { "auxiliary_loss_clip": 0.01075026, "auxiliary_loss_mlp": 0.01027104, "balance_loss_clip": 1.04282665, "balance_loss_mlp": 1.02005291, "epoch": 0.7673901280586786, "flos": 16976697840000.0, "grad_norm": 1.8948553491976612, "language_loss": 0.78805453, "learning_rate": 5.410819529065644e-07, "loss": 0.80907583, "num_input_tokens_seen": 137335275, "step": 6382, "time_per_iteration": 2.726529598236084 }, { "auxiliary_loss_clip": 0.01105734, "auxiliary_loss_mlp": 0.01025987, "balance_loss_clip": 1.04279482, "balance_loss_mlp": 1.01855731, "epoch": 0.7675103709493176, "flos": 29242669697280.0, "grad_norm": 3.371167217987255, "language_loss": 0.65385759, "learning_rate": 5.405492278775079e-07, "loss": 0.67517483, "num_input_tokens_seen": 137355055, "step": 6383, "time_per_iteration": 2.817431926727295 }, { "auxiliary_loss_clip": 0.01136932, "auxiliary_loss_mlp": 0.01029478, "balance_loss_clip": 1.04446077, "balance_loss_mlp": 1.02183056, "epoch": 0.7676306138399567, "flos": 29023004073600.0, "grad_norm": 2.594570359971464, "language_loss": 0.79652935, "learning_rate": 5.400167242459732e-07, "loss": 0.81819344, "num_input_tokens_seen": 137374015, "step": 6384, "time_per_iteration": 2.7130236625671387 }, { "auxiliary_loss_clip": 0.01151759, "auxiliary_loss_mlp": 0.01023677, "balance_loss_clip": 1.04874992, "balance_loss_mlp": 1.01700103, "epoch": 0.7677508567305958, "flos": 22565116650240.0, "grad_norm": 1.7449836021326537, "language_loss": 0.80762041, "learning_rate": 5.394844420927405e-07, "loss": 0.82937479, "num_input_tokens_seen": 137393625, "step": 6385, "time_per_iteration": 2.6941823959350586 }, { "auxiliary_loss_clip": 0.01168127, "auxiliary_loss_mlp": 0.01024832, "balance_loss_clip": 1.04968464, "balance_loss_mlp": 1.01749206, "epoch": 0.7678710996212349, "flos": 25411432222080.0, "grad_norm": 2.3303530516153885, "language_loss": 0.7330997, "learning_rate": 5.389523814985562e-07, "loss": 0.75502932, "num_input_tokens_seen": 137413045, "step": 6386, "time_per_iteration": 2.575230360031128 }, { "auxiliary_loss_clip": 0.01104593, "auxiliary_loss_mlp": 0.01025989, "balance_loss_clip": 1.04259467, "balance_loss_mlp": 1.01874757, "epoch": 0.767991342511874, "flos": 26756825063040.0, "grad_norm": 1.8845326990009794, "language_loss": 0.76381683, "learning_rate": 5.384205425441344e-07, "loss": 0.78512263, "num_input_tokens_seen": 137433955, "step": 6387, "time_per_iteration": 2.7516183853149414 }, { "auxiliary_loss_clip": 0.01135136, "auxiliary_loss_mlp": 0.01022799, "balance_loss_clip": 1.04518509, "balance_loss_mlp": 1.01532197, "epoch": 0.7681115854025131, "flos": 26359509749760.0, "grad_norm": 1.6372847552234246, "language_loss": 0.84181917, "learning_rate": 5.378889253101537e-07, "loss": 0.86339855, "num_input_tokens_seen": 137454510, "step": 6388, "time_per_iteration": 2.688734531402588 }, { "auxiliary_loss_clip": 0.01152758, "auxiliary_loss_mlp": 0.01026101, "balance_loss_clip": 1.04621267, "balance_loss_mlp": 1.01904416, "epoch": 0.7682318282931522, "flos": 23257043314560.0, "grad_norm": 1.6927817226914572, "language_loss": 0.80960631, "learning_rate": 5.373575298772617e-07, "loss": 0.83139491, "num_input_tokens_seen": 137473630, "step": 6389, "time_per_iteration": 2.589984893798828 }, { "auxiliary_loss_clip": 0.0106213, "auxiliary_loss_mlp": 0.01000881, "balance_loss_clip": 1.02015352, "balance_loss_mlp": 0.9999513, "epoch": 0.7683520711837912, "flos": 70072457137920.0, "grad_norm": 0.7648816399709744, "language_loss": 0.61277342, "learning_rate": 5.368263563260689e-07, "loss": 0.63340354, "num_input_tokens_seen": 137538765, "step": 6390, "time_per_iteration": 3.2436397075653076 }, { "auxiliary_loss_clip": 0.01152347, "auxiliary_loss_mlp": 0.01025297, "balance_loss_clip": 1.04667974, "balance_loss_mlp": 1.01814473, "epoch": 0.7684723140744304, "flos": 18624890332800.0, "grad_norm": 2.80709317023322, "language_loss": 0.64019096, "learning_rate": 5.362954047371537e-07, "loss": 0.6619674, "num_input_tokens_seen": 137557875, "step": 6391, "time_per_iteration": 2.5349020957946777 }, { "auxiliary_loss_clip": 0.01124342, "auxiliary_loss_mlp": 0.01023778, "balance_loss_clip": 1.05203879, "balance_loss_mlp": 1.0163815, "epoch": 0.7685925569650695, "flos": 27452989532160.0, "grad_norm": 2.0941445370466463, "language_loss": 0.7197243, "learning_rate": 5.357646751910627e-07, "loss": 0.74120557, "num_input_tokens_seen": 137579055, "step": 6392, "time_per_iteration": 2.7348544597625732 }, { "auxiliary_loss_clip": 0.01134034, "auxiliary_loss_mlp": 0.01035033, "balance_loss_clip": 1.04504251, "balance_loss_mlp": 1.02731395, "epoch": 0.7687127998557085, "flos": 24535714642560.0, "grad_norm": 6.05579327898346, "language_loss": 0.80104768, "learning_rate": 5.352341677683061e-07, "loss": 0.82273829, "num_input_tokens_seen": 137600355, "step": 6393, "time_per_iteration": 2.619889259338379 }, { "auxiliary_loss_clip": 0.01128623, "auxiliary_loss_mlp": 0.01026518, "balance_loss_clip": 1.04502344, "balance_loss_mlp": 1.01938009, "epoch": 0.7688330427463477, "flos": 25155963717120.0, "grad_norm": 2.4229629327282924, "language_loss": 0.78890932, "learning_rate": 5.347038825493617e-07, "loss": 0.81046081, "num_input_tokens_seen": 137621885, "step": 6394, "time_per_iteration": 2.7045276165008545 }, { "auxiliary_loss_clip": 0.01133564, "auxiliary_loss_mlp": 0.01027786, "balance_loss_clip": 1.04841232, "balance_loss_mlp": 1.02053833, "epoch": 0.7689532856369867, "flos": 21211284113280.0, "grad_norm": 2.0547266512490134, "language_loss": 0.68776107, "learning_rate": 5.341738196146732e-07, "loss": 0.70937455, "num_input_tokens_seen": 137640230, "step": 6395, "time_per_iteration": 2.60616135597229 }, { "auxiliary_loss_clip": 0.01149388, "auxiliary_loss_mlp": 0.01023849, "balance_loss_clip": 1.04706836, "balance_loss_mlp": 1.01656568, "epoch": 0.7690735285276258, "flos": 25119083427840.0, "grad_norm": 2.1514030577132286, "language_loss": 0.7379092, "learning_rate": 5.336439790446503e-07, "loss": 0.75964165, "num_input_tokens_seen": 137659330, "step": 6396, "time_per_iteration": 2.6107900142669678 }, { "auxiliary_loss_clip": 0.0111569, "auxiliary_loss_mlp": 0.01026713, "balance_loss_clip": 1.04117298, "balance_loss_mlp": 1.01953709, "epoch": 0.769193771418265, "flos": 54744020640000.0, "grad_norm": 1.8508280764748335, "language_loss": 0.62466455, "learning_rate": 5.331143609196711e-07, "loss": 0.64608854, "num_input_tokens_seen": 137683145, "step": 6397, "time_per_iteration": 2.9297406673431396 }, { "auxiliary_loss_clip": 0.01153079, "auxiliary_loss_mlp": 0.01023627, "balance_loss_clip": 1.05042386, "balance_loss_mlp": 1.0158937, "epoch": 0.769314014308904, "flos": 37341890115840.0, "grad_norm": 1.8002173323263402, "language_loss": 0.7728399, "learning_rate": 5.325849653200758e-07, "loss": 0.79460692, "num_input_tokens_seen": 137707095, "step": 6398, "time_per_iteration": 2.7675743103027344 }, { "auxiliary_loss_clip": 0.01168924, "auxiliary_loss_mlp": 0.01026252, "balance_loss_clip": 1.05000341, "balance_loss_mlp": 1.01906335, "epoch": 0.7694342571995431, "flos": 20631686256000.0, "grad_norm": 2.047242390151088, "language_loss": 0.76481831, "learning_rate": 5.32055792326175e-07, "loss": 0.78677005, "num_input_tokens_seen": 137725520, "step": 6399, "time_per_iteration": 2.5955536365509033 }, { "auxiliary_loss_clip": 0.01139159, "auxiliary_loss_mlp": 0.01025656, "balance_loss_clip": 1.04852331, "balance_loss_mlp": 1.01827073, "epoch": 0.7695545000901821, "flos": 24207706621440.0, "grad_norm": 2.5613206649194966, "language_loss": 0.72916019, "learning_rate": 5.315268420182437e-07, "loss": 0.75080836, "num_input_tokens_seen": 137744195, "step": 6400, "time_per_iteration": 3.552492618560791 }, { "auxiliary_loss_clip": 0.01126084, "auxiliary_loss_mlp": 0.00712091, "balance_loss_clip": 1.04518747, "balance_loss_mlp": 1.0006634, "epoch": 0.7696747429808213, "flos": 28001273708160.0, "grad_norm": 2.864796095474511, "language_loss": 0.76603806, "learning_rate": 5.309981144765221e-07, "loss": 0.78441983, "num_input_tokens_seen": 137764340, "step": 6401, "time_per_iteration": 4.5623619556427 }, { "auxiliary_loss_clip": 0.0111031, "auxiliary_loss_mlp": 0.01025014, "balance_loss_clip": 1.04319978, "balance_loss_mlp": 1.01828802, "epoch": 0.7697949858714603, "flos": 11509550323200.0, "grad_norm": 2.3295136791351783, "language_loss": 0.75562656, "learning_rate": 5.304696097812196e-07, "loss": 0.7769798, "num_input_tokens_seen": 137780940, "step": 6402, "time_per_iteration": 3.592839479446411 }, { "auxiliary_loss_clip": 0.01135227, "auxiliary_loss_mlp": 0.01028982, "balance_loss_clip": 1.04587221, "balance_loss_mlp": 1.02097678, "epoch": 0.7699152287620994, "flos": 26688271956480.0, "grad_norm": 34.146980136709004, "language_loss": 0.60422564, "learning_rate": 5.299413280125078e-07, "loss": 0.62586772, "num_input_tokens_seen": 137799250, "step": 6403, "time_per_iteration": 2.705631971359253 }, { "auxiliary_loss_clip": 0.01134499, "auxiliary_loss_mlp": 0.01023462, "balance_loss_clip": 1.04403985, "balance_loss_mlp": 1.01626158, "epoch": 0.7700354716527386, "flos": 16544944362240.0, "grad_norm": 2.2636764773361073, "language_loss": 0.72795409, "learning_rate": 5.294132692505284e-07, "loss": 0.74953365, "num_input_tokens_seen": 137817660, "step": 6404, "time_per_iteration": 2.606705665588379 }, { "auxiliary_loss_clip": 0.01095346, "auxiliary_loss_mlp": 0.01026987, "balance_loss_clip": 1.04161215, "balance_loss_mlp": 1.0195123, "epoch": 0.7701557145433776, "flos": 19242733196160.0, "grad_norm": 2.2524029498498703, "language_loss": 0.79892647, "learning_rate": 5.288854335753861e-07, "loss": 0.82014978, "num_input_tokens_seen": 137835920, "step": 6405, "time_per_iteration": 2.748465061187744 }, { "auxiliary_loss_clip": 0.01156273, "auxiliary_loss_mlp": 0.01023529, "balance_loss_clip": 1.04854274, "balance_loss_mlp": 1.01656389, "epoch": 0.7702759574340167, "flos": 31685744211840.0, "grad_norm": 1.7695203087314109, "language_loss": 0.75621045, "learning_rate": 5.283578210671551e-07, "loss": 0.77800846, "num_input_tokens_seen": 137858160, "step": 6406, "time_per_iteration": 2.69535231590271 }, { "auxiliary_loss_clip": 0.01143104, "auxiliary_loss_mlp": 0.01030396, "balance_loss_clip": 1.04931927, "balance_loss_mlp": 1.02346694, "epoch": 0.7703962003246558, "flos": 16800089644800.0, "grad_norm": 2.5067889182173295, "language_loss": 0.76690829, "learning_rate": 5.278304318058719e-07, "loss": 0.78864336, "num_input_tokens_seen": 137876015, "step": 6407, "time_per_iteration": 2.6973202228546143 }, { "auxiliary_loss_clip": 0.01085979, "auxiliary_loss_mlp": 0.0102764, "balance_loss_clip": 1.0400033, "balance_loss_mlp": 1.02058578, "epoch": 0.7705164432152949, "flos": 35736072693120.0, "grad_norm": 5.576609346870077, "language_loss": 0.79235303, "learning_rate": 5.273032658715411e-07, "loss": 0.8134892, "num_input_tokens_seen": 137898825, "step": 6408, "time_per_iteration": 2.892794370651245 }, { "auxiliary_loss_clip": 0.01099721, "auxiliary_loss_mlp": 0.01028303, "balance_loss_clip": 1.04218817, "balance_loss_mlp": 1.02163601, "epoch": 0.7706366861059339, "flos": 23365960329600.0, "grad_norm": 2.548223384609378, "language_loss": 0.76625907, "learning_rate": 5.267763233441347e-07, "loss": 0.7875393, "num_input_tokens_seen": 137919455, "step": 6409, "time_per_iteration": 2.7634124755859375 }, { "auxiliary_loss_clip": 0.01156934, "auxiliary_loss_mlp": 0.01028245, "balance_loss_clip": 1.0506264, "balance_loss_mlp": 1.02009726, "epoch": 0.7707569289965731, "flos": 22929897219840.0, "grad_norm": 2.5009443003354077, "language_loss": 0.69578344, "learning_rate": 5.26249604303588e-07, "loss": 0.71763527, "num_input_tokens_seen": 137937960, "step": 6410, "time_per_iteration": 2.7345633506774902 }, { "auxiliary_loss_clip": 0.01169719, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.04978275, "balance_loss_mlp": 1.02189517, "epoch": 0.7708771718872122, "flos": 17420661941760.0, "grad_norm": 6.78943993983714, "language_loss": 0.78369671, "learning_rate": 5.257231088298057e-07, "loss": 0.80568576, "num_input_tokens_seen": 137956370, "step": 6411, "time_per_iteration": 2.5979602336883545 }, { "auxiliary_loss_clip": 0.01035425, "auxiliary_loss_mlp": 0.01001791, "balance_loss_clip": 1.02016211, "balance_loss_mlp": 1.00085497, "epoch": 0.7709974147778512, "flos": 72241316248320.0, "grad_norm": 0.7976330235604564, "language_loss": 0.53944206, "learning_rate": 5.25196837002655e-07, "loss": 0.55981421, "num_input_tokens_seen": 138016080, "step": 6412, "time_per_iteration": 3.4069230556488037 }, { "auxiliary_loss_clip": 0.01135014, "auxiliary_loss_mlp": 0.01027275, "balance_loss_clip": 1.04655313, "balance_loss_mlp": 1.01973534, "epoch": 0.7711176576684904, "flos": 39859694876160.0, "grad_norm": 2.150725467563254, "language_loss": 0.68044937, "learning_rate": 5.24670788901971e-07, "loss": 0.70207226, "num_input_tokens_seen": 138039170, "step": 6413, "time_per_iteration": 2.7823989391326904 }, { "auxiliary_loss_clip": 0.01136418, "auxiliary_loss_mlp": 0.01031204, "balance_loss_clip": 1.04789495, "balance_loss_mlp": 1.02284694, "epoch": 0.7712379005591294, "flos": 36976391274240.0, "grad_norm": 2.909392093572806, "language_loss": 0.68411624, "learning_rate": 5.241449646075557e-07, "loss": 0.70579243, "num_input_tokens_seen": 138062395, "step": 6414, "time_per_iteration": 2.8663787841796875 }, { "auxiliary_loss_clip": 0.0115846, "auxiliary_loss_mlp": 0.01029073, "balance_loss_clip": 1.04726887, "balance_loss_mlp": 1.0223639, "epoch": 0.7713581434497685, "flos": 22776773541120.0, "grad_norm": 2.8757267667912085, "language_loss": 0.72822762, "learning_rate": 5.236193641991762e-07, "loss": 0.75010294, "num_input_tokens_seen": 138080325, "step": 6415, "time_per_iteration": 2.6891345977783203 }, { "auxiliary_loss_clip": 0.01136455, "auxiliary_loss_mlp": 0.01025282, "balance_loss_clip": 1.04827762, "balance_loss_mlp": 1.01832354, "epoch": 0.7714783863404077, "flos": 24097460803200.0, "grad_norm": 2.1310434135545853, "language_loss": 0.69952738, "learning_rate": 5.23093987756565e-07, "loss": 0.7211448, "num_input_tokens_seen": 138099020, "step": 6416, "time_per_iteration": 2.7112019062042236 }, { "auxiliary_loss_clip": 0.01129172, "auxiliary_loss_mlp": 0.01028255, "balance_loss_clip": 1.0438211, "balance_loss_mlp": 1.02065575, "epoch": 0.7715986292310467, "flos": 21063655215360.0, "grad_norm": 2.054995701608067, "language_loss": 0.75190908, "learning_rate": 5.225688353594217e-07, "loss": 0.77348334, "num_input_tokens_seen": 138118650, "step": 6417, "time_per_iteration": 2.686088800430298 }, { "auxiliary_loss_clip": 0.01141908, "auxiliary_loss_mlp": 0.00710746, "balance_loss_clip": 1.04809868, "balance_loss_mlp": 1.00059688, "epoch": 0.7717188721216858, "flos": 20594877793920.0, "grad_norm": 2.5795227475527773, "language_loss": 0.77719605, "learning_rate": 5.220439070874108e-07, "loss": 0.7957226, "num_input_tokens_seen": 138137890, "step": 6418, "time_per_iteration": 2.6360671520233154 }, { "auxiliary_loss_clip": 0.01152307, "auxiliary_loss_mlp": 0.01023682, "balance_loss_clip": 1.04960513, "balance_loss_mlp": 1.01615953, "epoch": 0.7718391150123249, "flos": 26250951870720.0, "grad_norm": 1.7646637201814857, "language_loss": 0.71493924, "learning_rate": 5.215192030201652e-07, "loss": 0.7366991, "num_input_tokens_seen": 138158880, "step": 6419, "time_per_iteration": 2.629445791244507 }, { "auxiliary_loss_clip": 0.01107085, "auxiliary_loss_mlp": 0.0102344, "balance_loss_clip": 1.04056525, "balance_loss_mlp": 1.0164597, "epoch": 0.771959357902964, "flos": 22049762267520.0, "grad_norm": 1.9362308540106452, "language_loss": 0.85912752, "learning_rate": 5.209947232372798e-07, "loss": 0.88043278, "num_input_tokens_seen": 138176370, "step": 6420, "time_per_iteration": 2.7353458404541016 }, { "auxiliary_loss_clip": 0.01154736, "auxiliary_loss_mlp": 0.00711505, "balance_loss_clip": 1.04690337, "balance_loss_mlp": 1.00053895, "epoch": 0.772079600793603, "flos": 30446000248320.0, "grad_norm": 1.8971481315257344, "language_loss": 0.81362593, "learning_rate": 5.204704678183196e-07, "loss": 0.83228832, "num_input_tokens_seen": 138195105, "step": 6421, "time_per_iteration": 2.675032377243042 }, { "auxiliary_loss_clip": 0.01171636, "auxiliary_loss_mlp": 0.01023265, "balance_loss_clip": 1.05201232, "balance_loss_mlp": 1.01637757, "epoch": 0.7721998436842422, "flos": 12969857750400.0, "grad_norm": 1.886145070593391, "language_loss": 0.85406303, "learning_rate": 5.19946436842813e-07, "loss": 0.87601209, "num_input_tokens_seen": 138212235, "step": 6422, "time_per_iteration": 2.5724239349365234 }, { "auxiliary_loss_clip": 0.01124161, "auxiliary_loss_mlp": 0.01027561, "balance_loss_clip": 1.04700089, "balance_loss_mlp": 1.02047384, "epoch": 0.7723200865748813, "flos": 32635509678720.0, "grad_norm": 1.6174709988208424, "language_loss": 0.68285263, "learning_rate": 5.194226303902546e-07, "loss": 0.70436984, "num_input_tokens_seen": 138231970, "step": 6423, "time_per_iteration": 2.7014801502227783 }, { "auxiliary_loss_clip": 0.01132163, "auxiliary_loss_mlp": 0.0102525, "balance_loss_clip": 1.04400241, "balance_loss_mlp": 1.01799369, "epoch": 0.7724403294655203, "flos": 21105707063040.0, "grad_norm": 2.3530279086678676, "language_loss": 0.70753521, "learning_rate": 5.188990485401072e-07, "loss": 0.72910935, "num_input_tokens_seen": 138251175, "step": 6424, "time_per_iteration": 2.6416804790496826 }, { "auxiliary_loss_clip": 0.01154023, "auxiliary_loss_mlp": 0.01024763, "balance_loss_clip": 1.04889441, "balance_loss_mlp": 1.01784551, "epoch": 0.7725605723561595, "flos": 22090736707200.0, "grad_norm": 1.8616685396215398, "language_loss": 0.86465716, "learning_rate": 5.183756913717954e-07, "loss": 0.88644505, "num_input_tokens_seen": 138270950, "step": 6425, "time_per_iteration": 2.610591173171997 }, { "auxiliary_loss_clip": 0.01132994, "auxiliary_loss_mlp": 0.01024645, "balance_loss_clip": 1.04698479, "balance_loss_mlp": 1.01791823, "epoch": 0.7726808152467985, "flos": 34495610457600.0, "grad_norm": 1.7931657577077311, "language_loss": 0.73278618, "learning_rate": 5.178525589647136e-07, "loss": 0.75436258, "num_input_tokens_seen": 138292590, "step": 6426, "time_per_iteration": 4.5462775230407715 }, { "auxiliary_loss_clip": 0.01140785, "auxiliary_loss_mlp": 0.01024703, "balance_loss_clip": 1.04574776, "balance_loss_mlp": 1.01779139, "epoch": 0.7728010581374376, "flos": 22306344094080.0, "grad_norm": 4.99190431135823, "language_loss": 0.78771287, "learning_rate": 5.173296513982197e-07, "loss": 0.80936772, "num_input_tokens_seen": 138311115, "step": 6427, "time_per_iteration": 3.540743112564087 }, { "auxiliary_loss_clip": 0.01132589, "auxiliary_loss_mlp": 0.01030814, "balance_loss_clip": 1.04919696, "balance_loss_mlp": 1.02337849, "epoch": 0.7729213010280768, "flos": 27126453968640.0, "grad_norm": 2.579522404873011, "language_loss": 0.65163314, "learning_rate": 5.168069687516398e-07, "loss": 0.67326725, "num_input_tokens_seen": 138330885, "step": 6428, "time_per_iteration": 2.7087209224700928 }, { "auxiliary_loss_clip": 0.01138232, "auxiliary_loss_mlp": 0.01029159, "balance_loss_clip": 1.04861999, "balance_loss_mlp": 1.02190804, "epoch": 0.7730415439187158, "flos": 18150223080960.0, "grad_norm": 2.308321468186408, "language_loss": 0.71671027, "learning_rate": 5.16284511104263e-07, "loss": 0.73838419, "num_input_tokens_seen": 138350020, "step": 6429, "time_per_iteration": 3.45757794380188 }, { "auxiliary_loss_clip": 0.01135501, "auxiliary_loss_mlp": 0.01028161, "balance_loss_clip": 1.04589915, "balance_loss_mlp": 1.02080011, "epoch": 0.7731617868093549, "flos": 11947480940160.0, "grad_norm": 3.025281609249227, "language_loss": 0.80529547, "learning_rate": 5.157622785353457e-07, "loss": 0.82693207, "num_input_tokens_seen": 138368135, "step": 6430, "time_per_iteration": 2.6500792503356934 }, { "auxiliary_loss_clip": 0.01063516, "auxiliary_loss_mlp": 0.01000958, "balance_loss_clip": 1.02153957, "balance_loss_mlp": 0.99993896, "epoch": 0.7732820296999939, "flos": 64201027069440.0, "grad_norm": 0.6491977404650158, "language_loss": 0.60342634, "learning_rate": 5.152402711241113e-07, "loss": 0.624071, "num_input_tokens_seen": 138436040, "step": 6431, "time_per_iteration": 3.249922513961792 }, { "auxiliary_loss_clip": 0.01116738, "auxiliary_loss_mlp": 0.01021155, "balance_loss_clip": 1.04270768, "balance_loss_mlp": 1.01470625, "epoch": 0.7734022725906331, "flos": 25302191984640.0, "grad_norm": 1.7598400047328657, "language_loss": 0.83245426, "learning_rate": 5.147184889497465e-07, "loss": 0.8538332, "num_input_tokens_seen": 138455510, "step": 6432, "time_per_iteration": 2.714503288269043 }, { "auxiliary_loss_clip": 0.01111322, "auxiliary_loss_mlp": 0.01023327, "balance_loss_clip": 1.04387951, "balance_loss_mlp": 1.01632333, "epoch": 0.7735225154812722, "flos": 17347440067200.0, "grad_norm": 2.5992225693677704, "language_loss": 0.80325735, "learning_rate": 5.141969320914072e-07, "loss": 0.82460386, "num_input_tokens_seen": 138473015, "step": 6433, "time_per_iteration": 2.643151044845581 }, { "auxiliary_loss_clip": 0.01172023, "auxiliary_loss_mlp": 0.01025881, "balance_loss_clip": 1.04972506, "balance_loss_mlp": 1.018013, "epoch": 0.7736427583719112, "flos": 32630086725120.0, "grad_norm": 2.4820931729925353, "language_loss": 0.62655175, "learning_rate": 5.136756006282113e-07, "loss": 0.64853078, "num_input_tokens_seen": 138491680, "step": 6434, "time_per_iteration": 2.6131441593170166 }, { "auxiliary_loss_clip": 0.01171918, "auxiliary_loss_mlp": 0.01027282, "balance_loss_clip": 1.0508765, "balance_loss_mlp": 1.02009058, "epoch": 0.7737630012625504, "flos": 19860073269120.0, "grad_norm": 2.7704294884707967, "language_loss": 0.84893692, "learning_rate": 5.131544946392446e-07, "loss": 0.87092888, "num_input_tokens_seen": 138506960, "step": 6435, "time_per_iteration": 2.56961989402771 }, { "auxiliary_loss_clip": 0.0113822, "auxiliary_loss_mlp": 0.0102759, "balance_loss_clip": 1.04936182, "balance_loss_mlp": 1.02067327, "epoch": 0.7738832441531894, "flos": 36022639397760.0, "grad_norm": 2.9371928105937064, "language_loss": 0.64137167, "learning_rate": 5.126336142035592e-07, "loss": 0.66302979, "num_input_tokens_seen": 138526995, "step": 6436, "time_per_iteration": 2.719541311264038 }, { "auxiliary_loss_clip": 0.0113847, "auxiliary_loss_mlp": 0.01025619, "balance_loss_clip": 1.0466094, "balance_loss_mlp": 1.01817131, "epoch": 0.7740034870438285, "flos": 13405274415360.0, "grad_norm": 5.161229070380165, "language_loss": 0.7245121, "learning_rate": 5.121129594001721e-07, "loss": 0.746153, "num_input_tokens_seen": 138541260, "step": 6437, "time_per_iteration": 2.610322952270508 }, { "auxiliary_loss_clip": 0.01151519, "auxiliary_loss_mlp": 0.01026048, "balance_loss_clip": 1.04889655, "balance_loss_mlp": 1.01891041, "epoch": 0.7741237299344677, "flos": 22086714384000.0, "grad_norm": 1.8889701442462767, "language_loss": 0.8111105, "learning_rate": 5.115925303080661e-07, "loss": 0.83288622, "num_input_tokens_seen": 138560970, "step": 6438, "time_per_iteration": 2.576758861541748 }, { "auxiliary_loss_clip": 0.01139639, "auxiliary_loss_mlp": 0.01025124, "balance_loss_clip": 1.04793417, "balance_loss_mlp": 1.01796544, "epoch": 0.7742439728251067, "flos": 19864777950720.0, "grad_norm": 2.3318359483450735, "language_loss": 0.79463875, "learning_rate": 5.110723270061899e-07, "loss": 0.81628639, "num_input_tokens_seen": 138577460, "step": 6439, "time_per_iteration": 2.639394998550415 }, { "auxiliary_loss_clip": 0.01167274, "auxiliary_loss_mlp": 0.01023834, "balance_loss_clip": 1.04831004, "balance_loss_mlp": 1.01745915, "epoch": 0.7743642157157458, "flos": 16690167048960.0, "grad_norm": 2.0825255847637805, "language_loss": 0.79725242, "learning_rate": 5.105523495734572e-07, "loss": 0.81916344, "num_input_tokens_seen": 138594860, "step": 6440, "time_per_iteration": 2.508399486541748 }, { "auxiliary_loss_clip": 0.01170396, "auxiliary_loss_mlp": 0.01029307, "balance_loss_clip": 1.049685, "balance_loss_mlp": 1.02196121, "epoch": 0.7744844586063849, "flos": 20304360593280.0, "grad_norm": 1.5930467621715176, "language_loss": 0.7528339, "learning_rate": 5.100325980887499e-07, "loss": 0.77483094, "num_input_tokens_seen": 138614785, "step": 6441, "time_per_iteration": 2.6036956310272217 }, { "auxiliary_loss_clip": 0.01146167, "auxiliary_loss_mlp": 0.01024054, "balance_loss_clip": 1.04688084, "balance_loss_mlp": 1.01707458, "epoch": 0.774604701497024, "flos": 22966705681920.0, "grad_norm": 1.791142091428416, "language_loss": 0.82957792, "learning_rate": 5.095130726309116e-07, "loss": 0.85128009, "num_input_tokens_seen": 138634960, "step": 6442, "time_per_iteration": 2.642596960067749 }, { "auxiliary_loss_clip": 0.01071619, "auxiliary_loss_mlp": 0.01001736, "balance_loss_clip": 1.01935101, "balance_loss_mlp": 1.00086546, "epoch": 0.774724944387663, "flos": 60288523073280.0, "grad_norm": 1.907089641767066, "language_loss": 0.58993363, "learning_rate": 5.089937732787559e-07, "loss": 0.61066711, "num_input_tokens_seen": 138699520, "step": 6443, "time_per_iteration": 3.2528584003448486 }, { "auxiliary_loss_clip": 0.01121124, "auxiliary_loss_mlp": 0.01029773, "balance_loss_clip": 1.04488564, "balance_loss_mlp": 1.0221734, "epoch": 0.7748451872783022, "flos": 26761026954240.0, "grad_norm": 2.958152338884435, "language_loss": 0.66417944, "learning_rate": 5.084747001110592e-07, "loss": 0.68568838, "num_input_tokens_seen": 138719145, "step": 6444, "time_per_iteration": 2.762486696243286 }, { "auxiliary_loss_clip": 0.01152191, "auxiliary_loss_mlp": 0.00711063, "balance_loss_clip": 1.0511682, "balance_loss_mlp": 1.00067139, "epoch": 0.7749654301689413, "flos": 30338627518080.0, "grad_norm": 1.7940946100764115, "language_loss": 0.70504022, "learning_rate": 5.07955853206564e-07, "loss": 0.72367275, "num_input_tokens_seen": 138743850, "step": 6445, "time_per_iteration": 2.755450963973999 }, { "auxiliary_loss_clip": 0.01158188, "auxiliary_loss_mlp": 0.01027664, "balance_loss_clip": 1.04986382, "balance_loss_mlp": 1.02009392, "epoch": 0.7750856730595803, "flos": 43179851687040.0, "grad_norm": 2.2677018640933846, "language_loss": 0.71152461, "learning_rate": 5.074372326439807e-07, "loss": 0.73338318, "num_input_tokens_seen": 138766860, "step": 6446, "time_per_iteration": 2.9906206130981445 }, { "auxiliary_loss_clip": 0.01121577, "auxiliary_loss_mlp": 0.01025415, "balance_loss_clip": 1.04418969, "balance_loss_mlp": 1.01809585, "epoch": 0.7752059159502195, "flos": 17640040256640.0, "grad_norm": 3.097377149746801, "language_loss": 0.73820424, "learning_rate": 5.069188385019814e-07, "loss": 0.75967419, "num_input_tokens_seen": 138784560, "step": 6447, "time_per_iteration": 2.745598077774048 }, { "auxiliary_loss_clip": 0.01111718, "auxiliary_loss_mlp": 0.01023405, "balance_loss_clip": 1.04218614, "balance_loss_mlp": 1.01606178, "epoch": 0.7753261588408585, "flos": 12677688524160.0, "grad_norm": 2.8293807875570796, "language_loss": 0.60977066, "learning_rate": 5.064006708592077e-07, "loss": 0.63112193, "num_input_tokens_seen": 138800805, "step": 6448, "time_per_iteration": 2.7234585285186768 }, { "auxiliary_loss_clip": 0.01132652, "auxiliary_loss_mlp": 0.01029365, "balance_loss_clip": 1.0485574, "balance_loss_mlp": 1.02248645, "epoch": 0.7754464017314976, "flos": 16690741666560.0, "grad_norm": 2.6455539109132005, "language_loss": 0.75576067, "learning_rate": 5.058827297942641e-07, "loss": 0.77738094, "num_input_tokens_seen": 138815910, "step": 6449, "time_per_iteration": 2.8105204105377197 }, { "auxiliary_loss_clip": 0.01143653, "auxiliary_loss_mlp": 0.01022371, "balance_loss_clip": 1.04779482, "balance_loss_mlp": 1.01502502, "epoch": 0.7755666446221368, "flos": 19718944732800.0, "grad_norm": 2.7926561498960907, "language_loss": 0.75064296, "learning_rate": 5.053650153857237e-07, "loss": 0.77230322, "num_input_tokens_seen": 138834920, "step": 6450, "time_per_iteration": 2.7801132202148438 }, { "auxiliary_loss_clip": 0.0115398, "auxiliary_loss_mlp": 0.01023685, "balance_loss_clip": 1.05056763, "balance_loss_mlp": 1.01604652, "epoch": 0.7756868875127758, "flos": 18693623007360.0, "grad_norm": 1.9213274761387378, "language_loss": 0.69841969, "learning_rate": 5.048475277121214e-07, "loss": 0.72019637, "num_input_tokens_seen": 138852135, "step": 6451, "time_per_iteration": 3.564821481704712 }, { "auxiliary_loss_clip": 0.01156132, "auxiliary_loss_mlp": 0.0102452, "balance_loss_clip": 1.04809213, "balance_loss_mlp": 1.01665783, "epoch": 0.7758071304034149, "flos": 28404191543040.0, "grad_norm": 6.610934929112008, "language_loss": 0.77354628, "learning_rate": 5.043302668519598e-07, "loss": 0.79535276, "num_input_tokens_seen": 138871470, "step": 6452, "time_per_iteration": 2.6664934158325195 }, { "auxiliary_loss_clip": 0.01154694, "auxiliary_loss_mlp": 0.01024696, "balance_loss_clip": 1.0467782, "balance_loss_mlp": 1.01719511, "epoch": 0.775927373294054, "flos": 20595344670720.0, "grad_norm": 1.7909560854582762, "language_loss": 0.72154987, "learning_rate": 5.038132328837079e-07, "loss": 0.74334371, "num_input_tokens_seen": 138889860, "step": 6453, "time_per_iteration": 4.455316066741943 }, { "auxiliary_loss_clip": 0.01156637, "auxiliary_loss_mlp": 0.01026489, "balance_loss_clip": 1.04893208, "balance_loss_mlp": 1.01972115, "epoch": 0.7760476161846931, "flos": 22526368853760.0, "grad_norm": 1.9665561698934864, "language_loss": 0.7419374, "learning_rate": 5.032964258857993e-07, "loss": 0.76376867, "num_input_tokens_seen": 138909955, "step": 6454, "time_per_iteration": 3.67356276512146 }, { "auxiliary_loss_clip": 0.01150232, "auxiliary_loss_mlp": 0.01025749, "balance_loss_clip": 1.04436815, "balance_loss_mlp": 1.01811647, "epoch": 0.7761678590753321, "flos": 48651488403840.0, "grad_norm": 1.6447373643055854, "language_loss": 0.68405175, "learning_rate": 5.027798459366329e-07, "loss": 0.7058115, "num_input_tokens_seen": 138935320, "step": 6455, "time_per_iteration": 2.9028289318084717 }, { "auxiliary_loss_clip": 0.01158121, "auxiliary_loss_mlp": 0.01021728, "balance_loss_clip": 1.04882896, "balance_loss_mlp": 1.01465535, "epoch": 0.7762881019659713, "flos": 26177047637760.0, "grad_norm": 1.4641114815720366, "language_loss": 0.63699478, "learning_rate": 5.02263493114573e-07, "loss": 0.65879321, "num_input_tokens_seen": 138957115, "step": 6456, "time_per_iteration": 2.8174681663513184 }, { "auxiliary_loss_clip": 0.01167284, "auxiliary_loss_mlp": 0.01023495, "balance_loss_clip": 1.04830694, "balance_loss_mlp": 1.01624095, "epoch": 0.7764083448566104, "flos": 20588341518720.0, "grad_norm": 4.532192543133554, "language_loss": 0.77212626, "learning_rate": 5.017473674979502e-07, "loss": 0.794034, "num_input_tokens_seen": 138973140, "step": 6457, "time_per_iteration": 2.6506664752960205 }, { "auxiliary_loss_clip": 0.01026241, "auxiliary_loss_mlp": 0.01005149, "balance_loss_clip": 1.01828516, "balance_loss_mlp": 1.00412405, "epoch": 0.7765285877472494, "flos": 67293078560640.0, "grad_norm": 0.7481623650902129, "language_loss": 0.58278543, "learning_rate": 5.01231469165061e-07, "loss": 0.60309935, "num_input_tokens_seen": 139028965, "step": 6458, "time_per_iteration": 3.1972663402557373 }, { "auxiliary_loss_clip": 0.01060308, "auxiliary_loss_mlp": 0.0100195, "balance_loss_clip": 1.01923656, "balance_loss_mlp": 1.00093699, "epoch": 0.7766488306378886, "flos": 61344476121600.0, "grad_norm": 0.9302629111541455, "language_loss": 0.56890124, "learning_rate": 5.007157981941663e-07, "loss": 0.58952385, "num_input_tokens_seen": 139094325, "step": 6459, "time_per_iteration": 3.292654514312744 }, { "auxiliary_loss_clip": 0.01048906, "auxiliary_loss_mlp": 0.01002836, "balance_loss_clip": 1.01977873, "balance_loss_mlp": 1.00188279, "epoch": 0.7767690735285276, "flos": 62946199393920.0, "grad_norm": 0.8801305871326727, "language_loss": 0.67431569, "learning_rate": 5.002003546634928e-07, "loss": 0.69483304, "num_input_tokens_seen": 139150425, "step": 6460, "time_per_iteration": 3.1637187004089355 }, { "auxiliary_loss_clip": 0.01104832, "auxiliary_loss_mlp": 0.01025182, "balance_loss_clip": 1.04642916, "balance_loss_mlp": 1.01819074, "epoch": 0.7768893164191667, "flos": 20886400575360.0, "grad_norm": 1.958769449042134, "language_loss": 0.76188266, "learning_rate": 4.996851386512331e-07, "loss": 0.78318286, "num_input_tokens_seen": 139169130, "step": 6461, "time_per_iteration": 2.6946799755096436 }, { "auxiliary_loss_clip": 0.0113955, "auxiliary_loss_mlp": 0.01031291, "balance_loss_clip": 1.04814541, "balance_loss_mlp": 1.02330375, "epoch": 0.7770095593098058, "flos": 20704584908160.0, "grad_norm": 2.028130077753732, "language_loss": 0.83140874, "learning_rate": 4.991701502355444e-07, "loss": 0.85311711, "num_input_tokens_seen": 139189595, "step": 6462, "time_per_iteration": 2.6237192153930664 }, { "auxiliary_loss_clip": 0.01157196, "auxiliary_loss_mlp": 0.0102491, "balance_loss_clip": 1.04868793, "balance_loss_mlp": 1.01822495, "epoch": 0.7771298022004449, "flos": 24717709877760.0, "grad_norm": 1.5739226802778157, "language_loss": 0.76038337, "learning_rate": 4.986553894945518e-07, "loss": 0.78220439, "num_input_tokens_seen": 139210805, "step": 6463, "time_per_iteration": 2.6947174072265625 }, { "auxiliary_loss_clip": 0.01108074, "auxiliary_loss_mlp": 0.01020378, "balance_loss_clip": 1.04319584, "balance_loss_mlp": 1.01381278, "epoch": 0.777250045091084, "flos": 25009232659200.0, "grad_norm": 2.185142519617957, "language_loss": 0.86343205, "learning_rate": 4.981408565063416e-07, "loss": 0.88471663, "num_input_tokens_seen": 139230750, "step": 6464, "time_per_iteration": 2.7375919818878174 }, { "auxiliary_loss_clip": 0.01170108, "auxiliary_loss_mlp": 0.01027811, "balance_loss_clip": 1.04983079, "balance_loss_mlp": 1.02041435, "epoch": 0.777370287981723, "flos": 20119887319680.0, "grad_norm": 2.3977765842150123, "language_loss": 0.7608161, "learning_rate": 4.976265513489701e-07, "loss": 0.78279525, "num_input_tokens_seen": 139250720, "step": 6465, "time_per_iteration": 2.579509735107422 }, { "auxiliary_loss_clip": 0.01150716, "auxiliary_loss_mlp": 0.01026524, "balance_loss_clip": 1.04663205, "balance_loss_mlp": 1.01943135, "epoch": 0.7774905308723622, "flos": 21718809331200.0, "grad_norm": 2.4928424946811885, "language_loss": 0.80508769, "learning_rate": 4.971124741004562e-07, "loss": 0.82686001, "num_input_tokens_seen": 139269720, "step": 6466, "time_per_iteration": 2.564700126647949 }, { "auxiliary_loss_clip": 0.01149793, "auxiliary_loss_mlp": 0.01027505, "balance_loss_clip": 1.04600346, "balance_loss_mlp": 1.02053988, "epoch": 0.7776107737630013, "flos": 16034115093120.0, "grad_norm": 1.8128234181105207, "language_loss": 0.76284873, "learning_rate": 4.965986248387846e-07, "loss": 0.78462166, "num_input_tokens_seen": 139288035, "step": 6467, "time_per_iteration": 2.608443260192871 }, { "auxiliary_loss_clip": 0.01139043, "auxiliary_loss_mlp": 0.010216, "balance_loss_clip": 1.04556394, "balance_loss_mlp": 1.01473022, "epoch": 0.7777310166536403, "flos": 24790895838720.0, "grad_norm": 3.435081001939822, "language_loss": 0.77204233, "learning_rate": 4.960850036419073e-07, "loss": 0.79364878, "num_input_tokens_seen": 139307135, "step": 6468, "time_per_iteration": 2.6344616413116455 }, { "auxiliary_loss_clip": 0.01133425, "auxiliary_loss_mlp": 0.01020978, "balance_loss_clip": 1.04505348, "balance_loss_mlp": 1.01417673, "epoch": 0.7778512595442795, "flos": 17272530253440.0, "grad_norm": 2.072739008766625, "language_loss": 0.78707868, "learning_rate": 4.955716105877378e-07, "loss": 0.80862272, "num_input_tokens_seen": 139325905, "step": 6469, "time_per_iteration": 2.6488776206970215 }, { "auxiliary_loss_clip": 0.01157923, "auxiliary_loss_mlp": 0.00711842, "balance_loss_clip": 1.04843414, "balance_loss_mlp": 1.00070071, "epoch": 0.7779715024349185, "flos": 17748418567680.0, "grad_norm": 1.765973896804256, "language_loss": 0.83323288, "learning_rate": 4.950584457541598e-07, "loss": 0.85193056, "num_input_tokens_seen": 139344370, "step": 6470, "time_per_iteration": 2.6329970359802246 }, { "auxiliary_loss_clip": 0.01156606, "auxiliary_loss_mlp": 0.01025361, "balance_loss_clip": 1.04921055, "balance_loss_mlp": 1.01836944, "epoch": 0.7780917453255576, "flos": 24316875031680.0, "grad_norm": 1.4468461461169322, "language_loss": 0.8191368, "learning_rate": 4.945455092190183e-07, "loss": 0.84095645, "num_input_tokens_seen": 139365625, "step": 6471, "time_per_iteration": 2.5796000957489014 }, { "auxiliary_loss_clip": 0.01074152, "auxiliary_loss_mlp": 0.01004504, "balance_loss_clip": 1.020684, "balance_loss_mlp": 1.00344884, "epoch": 0.7782119882161967, "flos": 56364601530240.0, "grad_norm": 0.6859322313745296, "language_loss": 0.55984342, "learning_rate": 4.940328010601271e-07, "loss": 0.58063006, "num_input_tokens_seen": 139430540, "step": 6472, "time_per_iteration": 3.1746294498443604 }, { "auxiliary_loss_clip": 0.01145846, "auxiliary_loss_mlp": 0.01028495, "balance_loss_clip": 1.05111957, "balance_loss_mlp": 1.02123523, "epoch": 0.7783322311068358, "flos": 46789986994560.0, "grad_norm": 1.9564473497801038, "language_loss": 0.76748711, "learning_rate": 4.935203213552621e-07, "loss": 0.78923053, "num_input_tokens_seen": 139454280, "step": 6473, "time_per_iteration": 2.807692527770996 }, { "auxiliary_loss_clip": 0.01140388, "auxiliary_loss_mlp": 0.01024336, "balance_loss_clip": 1.04756272, "balance_loss_mlp": 1.01674819, "epoch": 0.7784524739974749, "flos": 19057864872960.0, "grad_norm": 2.3748061402140612, "language_loss": 0.66871732, "learning_rate": 4.930080701821662e-07, "loss": 0.6903646, "num_input_tokens_seen": 139471745, "step": 6474, "time_per_iteration": 2.6518678665161133 }, { "auxiliary_loss_clip": 0.01138479, "auxiliary_loss_mlp": 0.01032116, "balance_loss_clip": 1.0461514, "balance_loss_mlp": 1.0249989, "epoch": 0.778572716888114, "flos": 24791111320320.0, "grad_norm": 1.9889475069531148, "language_loss": 0.76969707, "learning_rate": 4.92496047618548e-07, "loss": 0.791403, "num_input_tokens_seen": 139491505, "step": 6475, "time_per_iteration": 2.703956127166748 }, { "auxiliary_loss_clip": 0.0115429, "auxiliary_loss_mlp": 0.01023758, "balance_loss_clip": 1.05057395, "balance_loss_mlp": 1.01662302, "epoch": 0.7786929597787531, "flos": 20078086867200.0, "grad_norm": 2.1859911254385995, "language_loss": 0.77567804, "learning_rate": 4.919842537420811e-07, "loss": 0.79745847, "num_input_tokens_seen": 139508620, "step": 6476, "time_per_iteration": 2.5975749492645264 }, { "auxiliary_loss_clip": 0.01141384, "auxiliary_loss_mlp": 0.01024449, "balance_loss_clip": 1.04948473, "balance_loss_mlp": 1.01729631, "epoch": 0.7788132026693921, "flos": 21872220318720.0, "grad_norm": 1.9331535621418472, "language_loss": 0.79583669, "learning_rate": 4.91472688630404e-07, "loss": 0.81749499, "num_input_tokens_seen": 139529360, "step": 6477, "time_per_iteration": 3.6142055988311768 }, { "auxiliary_loss_clip": 0.01169232, "auxiliary_loss_mlp": 0.01026733, "balance_loss_clip": 1.05065334, "balance_loss_mlp": 1.01995575, "epoch": 0.7789334455600313, "flos": 11181937351680.0, "grad_norm": 2.0844209289801428, "language_loss": 0.73823071, "learning_rate": 4.909613523611202e-07, "loss": 0.76019031, "num_input_tokens_seen": 139546240, "step": 6478, "time_per_iteration": 3.499282121658325 }, { "auxiliary_loss_clip": 0.01102924, "auxiliary_loss_mlp": 0.00711669, "balance_loss_clip": 1.04196882, "balance_loss_mlp": 1.00065589, "epoch": 0.7790536884506704, "flos": 28695427015680.0, "grad_norm": 1.7702196850878475, "language_loss": 0.74781626, "learning_rate": 4.904502450117991e-07, "loss": 0.76596218, "num_input_tokens_seen": 139567200, "step": 6479, "time_per_iteration": 3.7871463298797607 }, { "auxiliary_loss_clip": 0.0113614, "auxiliary_loss_mlp": 0.01025931, "balance_loss_clip": 1.04970574, "balance_loss_mlp": 1.01845634, "epoch": 0.7791739313413094, "flos": 11072302064640.0, "grad_norm": 2.661393150487659, "language_loss": 0.72725934, "learning_rate": 4.899393666599762e-07, "loss": 0.74888009, "num_input_tokens_seen": 139583775, "step": 6480, "time_per_iteration": 3.519474983215332 }, { "auxiliary_loss_clip": 0.01165386, "auxiliary_loss_mlp": 0.01026121, "balance_loss_clip": 1.04626799, "balance_loss_mlp": 1.01903152, "epoch": 0.7792941742319486, "flos": 14679276975360.0, "grad_norm": 2.2849082942571695, "language_loss": 0.72504914, "learning_rate": 4.894287173831506e-07, "loss": 0.74696422, "num_input_tokens_seen": 139599735, "step": 6481, "time_per_iteration": 2.663423538208008 }, { "auxiliary_loss_clip": 0.01140476, "auxiliary_loss_mlp": 0.01029145, "balance_loss_clip": 1.04660475, "balance_loss_mlp": 1.02200389, "epoch": 0.7794144171225876, "flos": 23258874908160.0, "grad_norm": 2.5735831800620317, "language_loss": 0.84569132, "learning_rate": 4.889182972587877e-07, "loss": 0.86738753, "num_input_tokens_seen": 139619030, "step": 6482, "time_per_iteration": 2.6605141162872314 }, { "auxiliary_loss_clip": 0.01130572, "auxiliary_loss_mlp": 0.01025768, "balance_loss_clip": 1.04682159, "balance_loss_mlp": 1.0190146, "epoch": 0.7795346600132267, "flos": 21507080613120.0, "grad_norm": 1.7898691972102445, "language_loss": 0.66301668, "learning_rate": 4.884081063643177e-07, "loss": 0.68458009, "num_input_tokens_seen": 139637690, "step": 6483, "time_per_iteration": 2.652482748031616 }, { "auxiliary_loss_clip": 0.0103961, "auxiliary_loss_mlp": 0.01001247, "balance_loss_clip": 1.01763844, "balance_loss_mlp": 1.00028169, "epoch": 0.7796549029038659, "flos": 70052273694720.0, "grad_norm": 0.8652886043336034, "language_loss": 0.52529132, "learning_rate": 4.878981447771353e-07, "loss": 0.54569989, "num_input_tokens_seen": 139692070, "step": 6484, "time_per_iteration": 3.195613384246826 }, { "auxiliary_loss_clip": 0.01117832, "auxiliary_loss_mlp": 0.01029725, "balance_loss_clip": 1.04619884, "balance_loss_mlp": 1.02215266, "epoch": 0.7797751457945049, "flos": 23989405714560.0, "grad_norm": 1.4790012936534533, "language_loss": 0.72951663, "learning_rate": 4.873884125746035e-07, "loss": 0.75099218, "num_input_tokens_seen": 139713745, "step": 6485, "time_per_iteration": 2.755439043045044 }, { "auxiliary_loss_clip": 0.01132254, "auxiliary_loss_mlp": 0.01022445, "balance_loss_clip": 1.0455997, "balance_loss_mlp": 1.01518536, "epoch": 0.779895388685144, "flos": 22674751937280.0, "grad_norm": 4.346733356827455, "language_loss": 0.72403955, "learning_rate": 4.868789098340456e-07, "loss": 0.74558651, "num_input_tokens_seen": 139731650, "step": 6486, "time_per_iteration": 2.6304824352264404 }, { "auxiliary_loss_clip": 0.01124845, "auxiliary_loss_mlp": 0.01026842, "balance_loss_clip": 1.04617834, "balance_loss_mlp": 1.01943326, "epoch": 0.7800156315757831, "flos": 23768698596480.0, "grad_norm": 2.676916501708683, "language_loss": 0.73225683, "learning_rate": 4.863696366327543e-07, "loss": 0.75377363, "num_input_tokens_seen": 139750820, "step": 6487, "time_per_iteration": 2.7139320373535156 }, { "auxiliary_loss_clip": 0.01151553, "auxiliary_loss_mlp": 0.01028069, "balance_loss_clip": 1.04506862, "balance_loss_mlp": 1.02095532, "epoch": 0.7801358744664222, "flos": 26429714881920.0, "grad_norm": 1.7452899822594494, "language_loss": 0.7839632, "learning_rate": 4.85860593047986e-07, "loss": 0.80575931, "num_input_tokens_seen": 139770885, "step": 6488, "time_per_iteration": 2.616553544998169 }, { "auxiliary_loss_clip": 0.01116126, "auxiliary_loss_mlp": 0.01024728, "balance_loss_clip": 1.04223132, "balance_loss_mlp": 1.01753724, "epoch": 0.7802561173570612, "flos": 26322162583680.0, "grad_norm": 1.6087499249644903, "language_loss": 0.74822295, "learning_rate": 4.853517791569613e-07, "loss": 0.76963151, "num_input_tokens_seen": 139793065, "step": 6489, "time_per_iteration": 2.7345309257507324 }, { "auxiliary_loss_clip": 0.01140129, "auxiliary_loss_mlp": 0.00711756, "balance_loss_clip": 1.04499602, "balance_loss_mlp": 1.00062275, "epoch": 0.7803763602477004, "flos": 40333751596800.0, "grad_norm": 1.850853477865058, "language_loss": 0.66152591, "learning_rate": 4.848431950368684e-07, "loss": 0.68004477, "num_input_tokens_seen": 139815625, "step": 6490, "time_per_iteration": 2.7775204181671143 }, { "auxiliary_loss_clip": 0.01072456, "auxiliary_loss_mlp": 0.00701455, "balance_loss_clip": 1.01980495, "balance_loss_mlp": 1.00004089, "epoch": 0.7804966031383395, "flos": 67001448038400.0, "grad_norm": 0.722612169815857, "language_loss": 0.5562067, "learning_rate": 4.843348407648569e-07, "loss": 0.57394576, "num_input_tokens_seen": 139876905, "step": 6491, "time_per_iteration": 3.141148090362549 }, { "auxiliary_loss_clip": 0.01152381, "auxiliary_loss_mlp": 0.0102361, "balance_loss_clip": 1.04304361, "balance_loss_mlp": 1.01607633, "epoch": 0.7806168460289785, "flos": 17740733057280.0, "grad_norm": 2.3372841109980826, "language_loss": 0.83369672, "learning_rate": 4.838267164180457e-07, "loss": 0.85545665, "num_input_tokens_seen": 139892575, "step": 6492, "time_per_iteration": 2.5453145503997803 }, { "auxiliary_loss_clip": 0.0117182, "auxiliary_loss_mlp": 0.01033793, "balance_loss_clip": 1.04957175, "balance_loss_mlp": 1.02655733, "epoch": 0.7807370889196176, "flos": 23946240545280.0, "grad_norm": 2.365078452878682, "language_loss": 0.83803344, "learning_rate": 4.833188220735156e-07, "loss": 0.8600896, "num_input_tokens_seen": 139912245, "step": 6493, "time_per_iteration": 2.6059205532073975 }, { "auxiliary_loss_clip": 0.01152801, "auxiliary_loss_mlp": 0.01023892, "balance_loss_clip": 1.04780006, "balance_loss_mlp": 1.01697755, "epoch": 0.7808573318102567, "flos": 18989024457600.0, "grad_norm": 2.1909127221281763, "language_loss": 0.74837106, "learning_rate": 4.828111578083152e-07, "loss": 0.77013803, "num_input_tokens_seen": 139929150, "step": 6494, "time_per_iteration": 2.5490121841430664 }, { "auxiliary_loss_clip": 0.01136709, "auxiliary_loss_mlp": 0.01022714, "balance_loss_clip": 1.04735494, "balance_loss_mlp": 1.01512098, "epoch": 0.7809775747008958, "flos": 23980750536960.0, "grad_norm": 2.045308160009075, "language_loss": 0.81374586, "learning_rate": 4.823037236994556e-07, "loss": 0.83534008, "num_input_tokens_seen": 139947315, "step": 6495, "time_per_iteration": 2.687640905380249 }, { "auxiliary_loss_clip": 0.01060706, "auxiliary_loss_mlp": 0.01001219, "balance_loss_clip": 1.01897287, "balance_loss_mlp": 1.00031936, "epoch": 0.7810978175915348, "flos": 68535875180160.0, "grad_norm": 0.714764711999872, "language_loss": 0.56303287, "learning_rate": 4.817965198239136e-07, "loss": 0.58365208, "num_input_tokens_seen": 140013775, "step": 6496, "time_per_iteration": 3.207753896713257 }, { "auxiliary_loss_clip": 0.01118885, "auxiliary_loss_mlp": 0.0103066, "balance_loss_clip": 1.04303861, "balance_loss_mlp": 1.02338588, "epoch": 0.781218060482174, "flos": 19642131498240.0, "grad_norm": 4.13174948516567, "language_loss": 0.74625218, "learning_rate": 4.812895462586331e-07, "loss": 0.76774764, "num_input_tokens_seen": 140031600, "step": 6497, "time_per_iteration": 2.638638973236084 }, { "auxiliary_loss_clip": 0.01126756, "auxiliary_loss_mlp": 0.010241, "balance_loss_clip": 1.04756474, "balance_loss_mlp": 1.01736808, "epoch": 0.7813383033728131, "flos": 25627865621760.0, "grad_norm": 1.8664825919716794, "language_loss": 0.82022709, "learning_rate": 4.807828030805207e-07, "loss": 0.84173566, "num_input_tokens_seen": 140050590, "step": 6498, "time_per_iteration": 2.741076946258545 }, { "auxiliary_loss_clip": 0.0115268, "auxiliary_loss_mlp": 0.01025955, "balance_loss_clip": 1.04854274, "balance_loss_mlp": 1.01879644, "epoch": 0.7814585462634521, "flos": 20485924865280.0, "grad_norm": 1.9277374746770746, "language_loss": 0.68245149, "learning_rate": 4.802762903664495e-07, "loss": 0.70423782, "num_input_tokens_seen": 140069770, "step": 6499, "time_per_iteration": 2.579131841659546 }, { "auxiliary_loss_clip": 0.01143698, "auxiliary_loss_mlp": 0.01029698, "balance_loss_clip": 1.04792929, "balance_loss_mlp": 1.02267361, "epoch": 0.7815787891540913, "flos": 22304297018880.0, "grad_norm": 2.2990856938327746, "language_loss": 0.73843992, "learning_rate": 4.797700081932565e-07, "loss": 0.76017392, "num_input_tokens_seen": 140087635, "step": 6500, "time_per_iteration": 2.645744562149048 }, { "auxiliary_loss_clip": 0.01085659, "auxiliary_loss_mlp": 0.01029788, "balance_loss_clip": 1.04023099, "balance_loss_mlp": 1.02286184, "epoch": 0.7816990320447303, "flos": 22600668136320.0, "grad_norm": 3.4322129497734815, "language_loss": 0.8204819, "learning_rate": 4.792639566377442e-07, "loss": 0.84163642, "num_input_tokens_seen": 140105045, "step": 6501, "time_per_iteration": 2.703437328338623 }, { "auxiliary_loss_clip": 0.01146026, "auxiliary_loss_mlp": 0.01024835, "balance_loss_clip": 1.04462767, "balance_loss_mlp": 1.01779568, "epoch": 0.7818192749353694, "flos": 24935974871040.0, "grad_norm": 1.9587362279047513, "language_loss": 0.7773478, "learning_rate": 4.78758135776681e-07, "loss": 0.79905641, "num_input_tokens_seen": 140124900, "step": 6502, "time_per_iteration": 2.6346542835235596 }, { "auxiliary_loss_clip": 0.01140356, "auxiliary_loss_mlp": 0.01029438, "balance_loss_clip": 1.04888535, "balance_loss_mlp": 1.02239251, "epoch": 0.7819395178260086, "flos": 23733039369600.0, "grad_norm": 7.064617807019376, "language_loss": 0.79031456, "learning_rate": 4.782525456867989e-07, "loss": 0.81201255, "num_input_tokens_seen": 140143755, "step": 6503, "time_per_iteration": 3.6354727745056152 }, { "auxiliary_loss_clip": 0.01124816, "auxiliary_loss_mlp": 0.01028932, "balance_loss_clip": 1.04622102, "balance_loss_mlp": 1.02113616, "epoch": 0.7820597607166476, "flos": 23221671396480.0, "grad_norm": 1.6109398888873137, "language_loss": 0.83127517, "learning_rate": 4.777471864447959e-07, "loss": 0.85281259, "num_input_tokens_seen": 140164495, "step": 6504, "time_per_iteration": 3.7095937728881836 }, { "auxiliary_loss_clip": 0.01136984, "auxiliary_loss_mlp": 0.01029267, "balance_loss_clip": 1.04361987, "balance_loss_mlp": 1.02201009, "epoch": 0.7821800036072867, "flos": 22309540404480.0, "grad_norm": 2.1921140422895338, "language_loss": 0.80680043, "learning_rate": 4.772420581273344e-07, "loss": 0.82846296, "num_input_tokens_seen": 140181980, "step": 6505, "time_per_iteration": 3.6777565479278564 }, { "auxiliary_loss_clip": 0.01150026, "auxiliary_loss_mlp": 0.01021616, "balance_loss_clip": 1.0486989, "balance_loss_mlp": 1.01457071, "epoch": 0.7823002464979258, "flos": 21544176384000.0, "grad_norm": 3.1562015161985055, "language_loss": 0.7636627, "learning_rate": 4.7673716081104134e-07, "loss": 0.78537917, "num_input_tokens_seen": 140202155, "step": 6506, "time_per_iteration": 3.6361336708068848 }, { "auxiliary_loss_clip": 0.01151897, "auxiliary_loss_mlp": 0.01027954, "balance_loss_clip": 1.04909229, "balance_loss_mlp": 1.02063107, "epoch": 0.7824204893885649, "flos": 24535642815360.0, "grad_norm": 2.6728812310441192, "language_loss": 0.84675574, "learning_rate": 4.762324945725109e-07, "loss": 0.86855423, "num_input_tokens_seen": 140221600, "step": 6507, "time_per_iteration": 2.827033519744873 }, { "auxiliary_loss_clip": 0.01135498, "auxiliary_loss_mlp": 0.01033664, "balance_loss_clip": 1.0495882, "balance_loss_mlp": 1.02706015, "epoch": 0.782540732279204, "flos": 27415211402880.0, "grad_norm": 3.332590502170967, "language_loss": 0.75827193, "learning_rate": 4.7572805948829844e-07, "loss": 0.77996361, "num_input_tokens_seen": 140241860, "step": 6508, "time_per_iteration": 2.780102252960205 }, { "auxiliary_loss_clip": 0.01111375, "auxiliary_loss_mlp": 0.01020962, "balance_loss_clip": 1.04395485, "balance_loss_mlp": 1.01423216, "epoch": 0.7826609751698431, "flos": 24353216616960.0, "grad_norm": 1.8674351996352174, "language_loss": 0.7103408, "learning_rate": 4.7522385563492795e-07, "loss": 0.73166418, "num_input_tokens_seen": 140262160, "step": 6509, "time_per_iteration": 2.810579776763916 }, { "auxiliary_loss_clip": 0.01124071, "auxiliary_loss_mlp": 0.01024274, "balance_loss_clip": 1.04523396, "balance_loss_mlp": 1.01704395, "epoch": 0.7827812180604822, "flos": 23988543788160.0, "grad_norm": 2.3027146577611317, "language_loss": 0.70449358, "learning_rate": 4.747198830888863e-07, "loss": 0.72597706, "num_input_tokens_seen": 140282030, "step": 6510, "time_per_iteration": 2.6941981315612793 }, { "auxiliary_loss_clip": 0.01133518, "auxiliary_loss_mlp": 0.01026915, "balance_loss_clip": 1.04589367, "balance_loss_mlp": 1.01969957, "epoch": 0.7829014609511212, "flos": 27454318335360.0, "grad_norm": 2.7030557692855237, "language_loss": 0.68565738, "learning_rate": 4.742161419266251e-07, "loss": 0.70726168, "num_input_tokens_seen": 140301190, "step": 6511, "time_per_iteration": 2.6784069538116455 }, { "auxiliary_loss_clip": 0.01157745, "auxiliary_loss_mlp": 0.01027699, "balance_loss_clip": 1.04947591, "balance_loss_mlp": 1.02020669, "epoch": 0.7830217038417604, "flos": 29204532432000.0, "grad_norm": 3.9226509479644807, "language_loss": 0.65547603, "learning_rate": 4.7371263222456304e-07, "loss": 0.67733049, "num_input_tokens_seen": 140318510, "step": 6512, "time_per_iteration": 2.683912754058838 }, { "auxiliary_loss_clip": 0.0105668, "auxiliary_loss_mlp": 0.01003003, "balance_loss_clip": 1.01785767, "balance_loss_mlp": 1.00204945, "epoch": 0.7831419467323995, "flos": 60950895822720.0, "grad_norm": 1.0092694037051075, "language_loss": 0.613585, "learning_rate": 4.7320935405908004e-07, "loss": 0.63418174, "num_input_tokens_seen": 140379380, "step": 6513, "time_per_iteration": 3.1741931438446045 }, { "auxiliary_loss_clip": 0.01171083, "auxiliary_loss_mlp": 0.01029795, "balance_loss_clip": 1.04873145, "balance_loss_mlp": 1.02224946, "epoch": 0.7832621896230385, "flos": 19682531320320.0, "grad_norm": 2.144643199515694, "language_loss": 0.84201324, "learning_rate": 4.7270630750652475e-07, "loss": 0.86402202, "num_input_tokens_seen": 140395335, "step": 6514, "time_per_iteration": 2.5825424194335938 }, { "auxiliary_loss_clip": 0.01148927, "auxiliary_loss_mlp": 0.01027739, "balance_loss_clip": 1.04633033, "balance_loss_mlp": 1.02052641, "epoch": 0.7833824325136777, "flos": 25009232659200.0, "grad_norm": 1.910295285657371, "language_loss": 0.80326307, "learning_rate": 4.7220349264320746e-07, "loss": 0.82502973, "num_input_tokens_seen": 140414420, "step": 6515, "time_per_iteration": 2.6526732444763184 }, { "auxiliary_loss_clip": 0.01058665, "auxiliary_loss_mlp": 0.01000905, "balance_loss_clip": 1.01887918, "balance_loss_mlp": 0.99994582, "epoch": 0.7835026754043167, "flos": 68800142517120.0, "grad_norm": 0.7328146382778923, "language_loss": 0.54955792, "learning_rate": 4.71700909545407e-07, "loss": 0.57015365, "num_input_tokens_seen": 140477365, "step": 6516, "time_per_iteration": 3.1793692111968994 }, { "auxiliary_loss_clip": 0.01153681, "auxiliary_loss_mlp": 0.01028053, "balance_loss_clip": 1.04673362, "balance_loss_mlp": 1.0209924, "epoch": 0.7836229182949558, "flos": 19864598382720.0, "grad_norm": 6.189919663340605, "language_loss": 0.772789, "learning_rate": 4.711985582893627e-07, "loss": 0.79460633, "num_input_tokens_seen": 140495885, "step": 6517, "time_per_iteration": 2.6107547283172607 }, { "auxiliary_loss_clip": 0.01109308, "auxiliary_loss_mlp": 0.01028359, "balance_loss_clip": 1.04411745, "balance_loss_mlp": 1.02131724, "epoch": 0.783743161185595, "flos": 22965843755520.0, "grad_norm": 3.756991610493369, "language_loss": 0.71704936, "learning_rate": 4.706964389512811e-07, "loss": 0.73842609, "num_input_tokens_seen": 140515920, "step": 6518, "time_per_iteration": 2.6912221908569336 }, { "auxiliary_loss_clip": 0.01167335, "auxiliary_loss_mlp": 0.01023368, "balance_loss_clip": 1.05034137, "balance_loss_mlp": 1.01650739, "epoch": 0.783863404076234, "flos": 12458489777280.0, "grad_norm": 2.277053521228233, "language_loss": 0.87620842, "learning_rate": 4.701945516073345e-07, "loss": 0.89811546, "num_input_tokens_seen": 140533395, "step": 6519, "time_per_iteration": 2.5989606380462646 }, { "auxiliary_loss_clip": 0.01118735, "auxiliary_loss_mlp": 0.01021525, "balance_loss_clip": 1.04545748, "balance_loss_mlp": 1.0145638, "epoch": 0.7839836469668731, "flos": 24243940465920.0, "grad_norm": 1.9993952878839476, "language_loss": 0.75280434, "learning_rate": 4.696928963336577e-07, "loss": 0.774207, "num_input_tokens_seen": 140552825, "step": 6520, "time_per_iteration": 2.6826577186584473 }, { "auxiliary_loss_clip": 0.01056606, "auxiliary_loss_mlp": 0.01003917, "balance_loss_clip": 1.01780307, "balance_loss_mlp": 1.00295734, "epoch": 0.7841038898575122, "flos": 62121978938880.0, "grad_norm": 0.8603675030765275, "language_loss": 0.6091975, "learning_rate": 4.6919147320635224e-07, "loss": 0.6298027, "num_input_tokens_seen": 140615535, "step": 6521, "time_per_iteration": 3.1412084102630615 }, { "auxiliary_loss_clip": 0.01152184, "auxiliary_loss_mlp": 0.01030374, "balance_loss_clip": 1.04577506, "balance_loss_mlp": 1.02313244, "epoch": 0.7842241327481513, "flos": 20193899293440.0, "grad_norm": 2.6107910620523946, "language_loss": 0.73435122, "learning_rate": 4.6869028230148286e-07, "loss": 0.75617683, "num_input_tokens_seen": 140633330, "step": 6522, "time_per_iteration": 2.7165544033050537 }, { "auxiliary_loss_clip": 0.01116825, "auxiliary_loss_mlp": 0.01023209, "balance_loss_clip": 1.04306936, "balance_loss_mlp": 1.01555872, "epoch": 0.7843443756387903, "flos": 28074531496320.0, "grad_norm": 2.209286244187161, "language_loss": 0.60309702, "learning_rate": 4.6818932369507957e-07, "loss": 0.62449741, "num_input_tokens_seen": 140652830, "step": 6523, "time_per_iteration": 2.642657995223999 }, { "auxiliary_loss_clip": 0.01152741, "auxiliary_loss_mlp": 0.01030003, "balance_loss_clip": 1.04954648, "balance_loss_mlp": 1.0228709, "epoch": 0.7844646185294295, "flos": 21323397438720.0, "grad_norm": 2.839989365945675, "language_loss": 0.88768047, "learning_rate": 4.676885974631386e-07, "loss": 0.90950787, "num_input_tokens_seen": 140671190, "step": 6524, "time_per_iteration": 2.6246907711029053 }, { "auxiliary_loss_clip": 0.01154171, "auxiliary_loss_mlp": 0.01028546, "balance_loss_clip": 1.04893637, "balance_loss_mlp": 1.02122962, "epoch": 0.7845848614200686, "flos": 23656585271040.0, "grad_norm": 3.0315318680395946, "language_loss": 0.81214035, "learning_rate": 4.67188103681619e-07, "loss": 0.83396757, "num_input_tokens_seen": 140690975, "step": 6525, "time_per_iteration": 2.577214002609253 }, { "auxiliary_loss_clip": 0.01150296, "auxiliary_loss_mlp": 0.00711268, "balance_loss_clip": 1.04889059, "balance_loss_mlp": 1.00065136, "epoch": 0.7847051043107076, "flos": 23402194174080.0, "grad_norm": 2.136249388642537, "language_loss": 0.68741685, "learning_rate": 4.666878424264453e-07, "loss": 0.70603251, "num_input_tokens_seen": 140710930, "step": 6526, "time_per_iteration": 2.608041286468506 }, { "auxiliary_loss_clip": 0.0112891, "auxiliary_loss_mlp": 0.01021431, "balance_loss_clip": 1.04519963, "balance_loss_mlp": 1.01478529, "epoch": 0.7848253472013467, "flos": 19022277473280.0, "grad_norm": 2.697369687719048, "language_loss": 0.73775029, "learning_rate": 4.661878137735069e-07, "loss": 0.75925374, "num_input_tokens_seen": 140729120, "step": 6527, "time_per_iteration": 2.629000425338745 }, { "auxiliary_loss_clip": 0.01138484, "auxiliary_loss_mlp": 0.01025607, "balance_loss_clip": 1.04764771, "balance_loss_mlp": 1.01867759, "epoch": 0.7849455900919858, "flos": 21179180332800.0, "grad_norm": 1.769091152331228, "language_loss": 0.74494755, "learning_rate": 4.656880177986571e-07, "loss": 0.76658839, "num_input_tokens_seen": 140747665, "step": 6528, "time_per_iteration": 2.6107404232025146 }, { "auxiliary_loss_clip": 0.01140195, "auxiliary_loss_mlp": 0.01024005, "balance_loss_clip": 1.0455811, "balance_loss_mlp": 1.01715088, "epoch": 0.7850658329826249, "flos": 19536482620800.0, "grad_norm": 2.3907596039338905, "language_loss": 0.81910014, "learning_rate": 4.6518845457771607e-07, "loss": 0.84074217, "num_input_tokens_seen": 140766525, "step": 6529, "time_per_iteration": 3.475425958633423 }, { "auxiliary_loss_clip": 0.0114864, "auxiliary_loss_mlp": 0.00711754, "balance_loss_clip": 1.04761851, "balance_loss_mlp": 1.00069189, "epoch": 0.7851860758732639, "flos": 12495334152960.0, "grad_norm": 1.7363618511321932, "language_loss": 0.79156053, "learning_rate": 4.646891241864652e-07, "loss": 0.81016445, "num_input_tokens_seen": 140785090, "step": 6530, "time_per_iteration": 3.528881788253784 }, { "auxiliary_loss_clip": 0.01149481, "auxiliary_loss_mlp": 0.01033413, "balance_loss_clip": 1.04607272, "balance_loss_mlp": 1.02597094, "epoch": 0.7853063187639031, "flos": 22960959505920.0, "grad_norm": 2.04578400632738, "language_loss": 0.73061967, "learning_rate": 4.6419002670065397e-07, "loss": 0.75244862, "num_input_tokens_seen": 140804670, "step": 6531, "time_per_iteration": 3.5922093391418457 }, { "auxiliary_loss_clip": 0.01127347, "auxiliary_loss_mlp": 0.01028489, "balance_loss_clip": 1.04807758, "balance_loss_mlp": 1.02130103, "epoch": 0.7854265616545422, "flos": 17347260499200.0, "grad_norm": 3.661339861827687, "language_loss": 0.86640674, "learning_rate": 4.6369116219599445e-07, "loss": 0.88796502, "num_input_tokens_seen": 140820655, "step": 6532, "time_per_iteration": 3.6278631687164307 }, { "auxiliary_loss_clip": 0.0112074, "auxiliary_loss_mlp": 0.01022369, "balance_loss_clip": 1.04402435, "balance_loss_mlp": 1.01581562, "epoch": 0.7855468045451812, "flos": 23838293197440.0, "grad_norm": 1.6382833302715352, "language_loss": 0.79277372, "learning_rate": 4.631925307481637e-07, "loss": 0.81420481, "num_input_tokens_seen": 140840470, "step": 6533, "time_per_iteration": 2.6583287715911865 }, { "auxiliary_loss_clip": 0.01139937, "auxiliary_loss_mlp": 0.01024224, "balance_loss_clip": 1.05045915, "balance_loss_mlp": 1.01727426, "epoch": 0.7856670474358204, "flos": 25666792986240.0, "grad_norm": 2.1197838807888325, "language_loss": 0.75772691, "learning_rate": 4.6269413243280533e-07, "loss": 0.77936852, "num_input_tokens_seen": 140859890, "step": 6534, "time_per_iteration": 2.7088029384613037 }, { "auxiliary_loss_clip": 0.01143532, "auxiliary_loss_mlp": 0.01023643, "balance_loss_clip": 1.04973483, "balance_loss_mlp": 1.01617408, "epoch": 0.7857872903264594, "flos": 18144656472960.0, "grad_norm": 3.9620788987951348, "language_loss": 0.73744828, "learning_rate": 4.621959673255236e-07, "loss": 0.75911999, "num_input_tokens_seen": 140876190, "step": 6535, "time_per_iteration": 2.5649349689483643 }, { "auxiliary_loss_clip": 0.01104804, "auxiliary_loss_mlp": 0.01023288, "balance_loss_clip": 1.04338014, "balance_loss_mlp": 1.01604569, "epoch": 0.7859075332170985, "flos": 14386138081920.0, "grad_norm": 2.3739090928606563, "language_loss": 0.90423584, "learning_rate": 4.6169803550189135e-07, "loss": 0.92551672, "num_input_tokens_seen": 140891885, "step": 6536, "time_per_iteration": 2.659029483795166 }, { "auxiliary_loss_clip": 0.01101377, "auxiliary_loss_mlp": 0.0102615, "balance_loss_clip": 1.04593635, "balance_loss_mlp": 1.01862228, "epoch": 0.7860277761077377, "flos": 19864059678720.0, "grad_norm": 1.8755923485075408, "language_loss": 0.77408576, "learning_rate": 4.6120033703744355e-07, "loss": 0.79536104, "num_input_tokens_seen": 140910780, "step": 6537, "time_per_iteration": 2.679548978805542 }, { "auxiliary_loss_clip": 0.01127656, "auxiliary_loss_mlp": 0.01025678, "balance_loss_clip": 1.04483461, "balance_loss_mlp": 1.0185076, "epoch": 0.7861480189983767, "flos": 26396174557440.0, "grad_norm": 3.1256670952229295, "language_loss": 0.78564036, "learning_rate": 4.607028720076822e-07, "loss": 0.80717373, "num_input_tokens_seen": 140927460, "step": 6538, "time_per_iteration": 2.707071542739868 }, { "auxiliary_loss_clip": 0.01155395, "auxiliary_loss_mlp": 0.01030699, "balance_loss_clip": 1.04984784, "balance_loss_mlp": 1.02386248, "epoch": 0.7862682618890158, "flos": 24236578177920.0, "grad_norm": 1.9406845221588451, "language_loss": 0.73440993, "learning_rate": 4.6020564048807074e-07, "loss": 0.75627089, "num_input_tokens_seen": 140945135, "step": 6539, "time_per_iteration": 2.6147186756134033 }, { "auxiliary_loss_clip": 0.01156445, "auxiliary_loss_mlp": 0.01029166, "balance_loss_clip": 1.05010259, "balance_loss_mlp": 1.02150095, "epoch": 0.7863885047796549, "flos": 47551508259840.0, "grad_norm": 2.9558618073454928, "language_loss": 0.72234625, "learning_rate": 4.5970864255403883e-07, "loss": 0.74420238, "num_input_tokens_seen": 140966660, "step": 6540, "time_per_iteration": 2.8479647636413574 }, { "auxiliary_loss_clip": 0.01142584, "auxiliary_loss_mlp": 0.01024769, "balance_loss_clip": 1.0460155, "balance_loss_mlp": 1.01778662, "epoch": 0.786508747670294, "flos": 24389234979840.0, "grad_norm": 2.360187513040167, "language_loss": 0.82503915, "learning_rate": 4.59211878280982e-07, "loss": 0.84671271, "num_input_tokens_seen": 140986175, "step": 6541, "time_per_iteration": 2.686311721801758 }, { "auxiliary_loss_clip": 0.01140336, "auxiliary_loss_mlp": 0.01024399, "balance_loss_clip": 1.04718447, "balance_loss_mlp": 1.01776242, "epoch": 0.786628990560933, "flos": 18041234238720.0, "grad_norm": 3.321106927942314, "language_loss": 0.70080936, "learning_rate": 4.587153477442578e-07, "loss": 0.72245669, "num_input_tokens_seen": 141002490, "step": 6542, "time_per_iteration": 2.6659131050109863 }, { "auxiliary_loss_clip": 0.01173073, "auxiliary_loss_mlp": 0.01029371, "balance_loss_clip": 1.05045319, "balance_loss_mlp": 1.0215714, "epoch": 0.7867492334515722, "flos": 25848860048640.0, "grad_norm": 2.954130982144279, "language_loss": 0.81529897, "learning_rate": 4.582190510191899e-07, "loss": 0.83732343, "num_input_tokens_seen": 141021150, "step": 6543, "time_per_iteration": 2.5658938884735107 }, { "auxiliary_loss_clip": 0.01122026, "auxiliary_loss_mlp": 0.01029143, "balance_loss_clip": 1.04754913, "balance_loss_mlp": 1.02184415, "epoch": 0.7868694763422113, "flos": 16580819070720.0, "grad_norm": 2.211013034670224, "language_loss": 0.87201047, "learning_rate": 4.5772298818106625e-07, "loss": 0.89352214, "num_input_tokens_seen": 141036940, "step": 6544, "time_per_iteration": 2.616563320159912 }, { "auxiliary_loss_clip": 0.01127938, "auxiliary_loss_mlp": 0.01026011, "balance_loss_clip": 1.04625726, "balance_loss_mlp": 1.0184474, "epoch": 0.7869897192328503, "flos": 29386276272000.0, "grad_norm": 5.3779696531005285, "language_loss": 0.71906221, "learning_rate": 4.572271593051384e-07, "loss": 0.74060166, "num_input_tokens_seen": 141054295, "step": 6545, "time_per_iteration": 2.728990077972412 }, { "auxiliary_loss_clip": 0.01097655, "auxiliary_loss_mlp": 0.0102657, "balance_loss_clip": 1.04361296, "balance_loss_mlp": 1.01955462, "epoch": 0.7871099621234895, "flos": 17128923678720.0, "grad_norm": 2.2652457821344143, "language_loss": 0.78552347, "learning_rate": 4.567315644666245e-07, "loss": 0.80676574, "num_input_tokens_seen": 141073090, "step": 6546, "time_per_iteration": 2.6747374534606934 }, { "auxiliary_loss_clip": 0.01117288, "auxiliary_loss_mlp": 0.01028412, "balance_loss_clip": 1.04639792, "balance_loss_mlp": 1.0217905, "epoch": 0.7872302050141285, "flos": 23440187784960.0, "grad_norm": 2.0206197473508687, "language_loss": 0.84809566, "learning_rate": 4.5623620374070507e-07, "loss": 0.86955261, "num_input_tokens_seen": 141092405, "step": 6547, "time_per_iteration": 2.6830692291259766 }, { "auxiliary_loss_clip": 0.01034059, "auxiliary_loss_mlp": 0.01000899, "balance_loss_clip": 1.01804304, "balance_loss_mlp": 1.00005877, "epoch": 0.7873504479047676, "flos": 65959752689280.0, "grad_norm": 0.7728478556903742, "language_loss": 0.58351225, "learning_rate": 4.557410772025263e-07, "loss": 0.60386181, "num_input_tokens_seen": 141154355, "step": 6548, "time_per_iteration": 3.312706708908081 }, { "auxiliary_loss_clip": 0.0113482, "auxiliary_loss_mlp": 0.01029636, "balance_loss_clip": 1.04666519, "balance_loss_mlp": 1.02177751, "epoch": 0.7874706907954068, "flos": 23258336204160.0, "grad_norm": 1.9581426205244594, "language_loss": 0.66253686, "learning_rate": 4.5524618492719803e-07, "loss": 0.68418145, "num_input_tokens_seen": 141173575, "step": 6549, "time_per_iteration": 2.7629730701446533 }, { "auxiliary_loss_clip": 0.01154298, "auxiliary_loss_mlp": 0.01020449, "balance_loss_clip": 1.04725516, "balance_loss_mlp": 1.01373172, "epoch": 0.7875909336860458, "flos": 28767786963840.0, "grad_norm": 1.6325782377134495, "language_loss": 0.78879201, "learning_rate": 4.54751526989795e-07, "loss": 0.81053948, "num_input_tokens_seen": 141195415, "step": 6550, "time_per_iteration": 2.7720983028411865 }, { "auxiliary_loss_clip": 0.01155214, "auxiliary_loss_mlp": 0.01027379, "balance_loss_clip": 1.04757118, "balance_loss_mlp": 1.02042294, "epoch": 0.7877111765766849, "flos": 18697286194560.0, "grad_norm": 3.0699650444307447, "language_loss": 0.79361153, "learning_rate": 4.5425710346535775e-07, "loss": 0.81543744, "num_input_tokens_seen": 141213360, "step": 6551, "time_per_iteration": 2.762568235397339 }, { "auxiliary_loss_clip": 0.01153545, "auxiliary_loss_mlp": 0.01024165, "balance_loss_clip": 1.04730392, "balance_loss_mlp": 1.01686645, "epoch": 0.787831419467324, "flos": 27592968833280.0, "grad_norm": 2.499165707309677, "language_loss": 0.81891817, "learning_rate": 4.537629144288877e-07, "loss": 0.84069526, "num_input_tokens_seen": 141230815, "step": 6552, "time_per_iteration": 2.9275405406951904 }, { "auxiliary_loss_clip": 0.01112618, "auxiliary_loss_mlp": 0.01022826, "balance_loss_clip": 1.0425272, "balance_loss_mlp": 1.01544738, "epoch": 0.7879516623579631, "flos": 18150187167360.0, "grad_norm": 2.287765014747768, "language_loss": 0.74914491, "learning_rate": 4.5326895995535477e-07, "loss": 0.77049935, "num_input_tokens_seen": 141249715, "step": 6553, "time_per_iteration": 2.791428565979004 }, { "auxiliary_loss_clip": 0.01150963, "auxiliary_loss_mlp": 0.01024027, "balance_loss_clip": 1.0471189, "balance_loss_mlp": 1.01673746, "epoch": 0.7880719052486022, "flos": 20339193807360.0, "grad_norm": 2.670011845312656, "language_loss": 0.84561908, "learning_rate": 4.527752401196907e-07, "loss": 0.867369, "num_input_tokens_seen": 141267730, "step": 6554, "time_per_iteration": 2.652559757232666 }, { "auxiliary_loss_clip": 0.01131506, "auxiliary_loss_mlp": 0.01022246, "balance_loss_clip": 1.04609728, "balance_loss_mlp": 1.01494455, "epoch": 0.7881921481392413, "flos": 21653237053440.0, "grad_norm": 2.109545419309949, "language_loss": 0.67006743, "learning_rate": 4.5228175499679254e-07, "loss": 0.69160497, "num_input_tokens_seen": 141287315, "step": 6555, "time_per_iteration": 3.6118311882019043 }, { "auxiliary_loss_clip": 0.01058011, "auxiliary_loss_mlp": 0.01000885, "balance_loss_clip": 1.01711679, "balance_loss_mlp": 0.99998504, "epoch": 0.7883123910298804, "flos": 68565860058240.0, "grad_norm": 0.8297720138352572, "language_loss": 0.54495943, "learning_rate": 4.5178850466152174e-07, "loss": 0.56554836, "num_input_tokens_seen": 141346145, "step": 6556, "time_per_iteration": 4.995608806610107 }, { "auxiliary_loss_clip": 0.01133715, "auxiliary_loss_mlp": 0.01019734, "balance_loss_clip": 1.04574156, "balance_loss_mlp": 1.01292145, "epoch": 0.7884326339205194, "flos": 19318217627520.0, "grad_norm": 1.884340299145309, "language_loss": 0.81516588, "learning_rate": 4.512954891887031e-07, "loss": 0.83670032, "num_input_tokens_seen": 141364445, "step": 6557, "time_per_iteration": 3.581284761428833 }, { "auxiliary_loss_clip": 0.0113227, "auxiliary_loss_mlp": 0.01027018, "balance_loss_clip": 1.04630733, "balance_loss_mlp": 1.01965725, "epoch": 0.7885528768111585, "flos": 17784903807360.0, "grad_norm": 3.0406262478068373, "language_loss": 0.8376292, "learning_rate": 4.5080270865312806e-07, "loss": 0.85922205, "num_input_tokens_seen": 141381640, "step": 6558, "time_per_iteration": 2.691082239151001 }, { "auxiliary_loss_clip": 0.01153871, "auxiliary_loss_mlp": 0.0102372, "balance_loss_clip": 1.04966211, "balance_loss_mlp": 1.01706219, "epoch": 0.7886731197017977, "flos": 18807639753600.0, "grad_norm": 3.068459007080081, "language_loss": 0.71175379, "learning_rate": 4.5031016312954985e-07, "loss": 0.73352969, "num_input_tokens_seen": 141399955, "step": 6559, "time_per_iteration": 2.615731954574585 }, { "auxiliary_loss_clip": 0.01162261, "auxiliary_loss_mlp": 0.01026388, "balance_loss_clip": 1.05037403, "balance_loss_mlp": 1.01893187, "epoch": 0.7887933625924367, "flos": 33365358126720.0, "grad_norm": 2.005423190925602, "language_loss": 0.74850786, "learning_rate": 4.498178526926886e-07, "loss": 0.77039433, "num_input_tokens_seen": 141420820, "step": 6560, "time_per_iteration": 2.7570812702178955 }, { "auxiliary_loss_clip": 0.01168342, "auxiliary_loss_mlp": 0.01021709, "balance_loss_clip": 1.05091739, "balance_loss_mlp": 1.01493764, "epoch": 0.7889136054830758, "flos": 17019360218880.0, "grad_norm": 2.504424251475727, "language_loss": 0.72050703, "learning_rate": 4.4932577741722635e-07, "loss": 0.7424075, "num_input_tokens_seen": 141439350, "step": 6561, "time_per_iteration": 2.582557201385498 }, { "auxiliary_loss_clip": 0.01136086, "auxiliary_loss_mlp": 0.01026579, "balance_loss_clip": 1.04603398, "balance_loss_mlp": 1.01970625, "epoch": 0.7890338483737149, "flos": 29424629018880.0, "grad_norm": 1.9699809890793065, "language_loss": 0.74324393, "learning_rate": 4.4883393737780985e-07, "loss": 0.76487058, "num_input_tokens_seen": 141460300, "step": 6562, "time_per_iteration": 2.7130188941955566 }, { "auxiliary_loss_clip": 0.01146955, "auxiliary_loss_mlp": 0.01025798, "balance_loss_clip": 1.04565668, "balance_loss_mlp": 1.01913083, "epoch": 0.789154091264354, "flos": 19971576063360.0, "grad_norm": 2.401181198191292, "language_loss": 0.78233951, "learning_rate": 4.4834233264905254e-07, "loss": 0.80406708, "num_input_tokens_seen": 141477315, "step": 6563, "time_per_iteration": 2.597179889678955 }, { "auxiliary_loss_clip": 0.01114858, "auxiliary_loss_mlp": 0.01026954, "balance_loss_clip": 1.04419518, "balance_loss_mlp": 1.01941991, "epoch": 0.789274334154993, "flos": 14537825216640.0, "grad_norm": 2.541171513966014, "language_loss": 0.71714681, "learning_rate": 4.478509633055294e-07, "loss": 0.73856497, "num_input_tokens_seen": 141495025, "step": 6564, "time_per_iteration": 2.65606427192688 }, { "auxiliary_loss_clip": 0.01141787, "auxiliary_loss_mlp": 0.01029956, "balance_loss_clip": 1.04695511, "balance_loss_mlp": 1.02227926, "epoch": 0.7893945770456322, "flos": 21827403123840.0, "grad_norm": 2.618100386408604, "language_loss": 0.80235887, "learning_rate": 4.473598294217813e-07, "loss": 0.82407629, "num_input_tokens_seen": 141510450, "step": 6565, "time_per_iteration": 2.614628314971924 }, { "auxiliary_loss_clip": 0.01152025, "auxiliary_loss_mlp": 0.01028078, "balance_loss_clip": 1.04939532, "balance_loss_mlp": 1.02139938, "epoch": 0.7895148199362713, "flos": 20740639184640.0, "grad_norm": 2.5256455827215185, "language_loss": 0.71680284, "learning_rate": 4.468689310723124e-07, "loss": 0.73860389, "num_input_tokens_seen": 141528265, "step": 6566, "time_per_iteration": 2.6685056686401367 }, { "auxiliary_loss_clip": 0.01126681, "auxiliary_loss_mlp": 0.01026906, "balance_loss_clip": 1.0460279, "balance_loss_mlp": 1.02003396, "epoch": 0.7896350628269103, "flos": 16690669839360.0, "grad_norm": 1.7711620281976992, "language_loss": 0.7832669, "learning_rate": 4.463782683315913e-07, "loss": 0.80480278, "num_input_tokens_seen": 141547270, "step": 6567, "time_per_iteration": 2.6738953590393066 }, { "auxiliary_loss_clip": 0.01168626, "auxiliary_loss_mlp": 0.0102663, "balance_loss_clip": 1.05000067, "balance_loss_mlp": 1.01974845, "epoch": 0.7897553057175495, "flos": 22638374438400.0, "grad_norm": 5.197110155481517, "language_loss": 0.73338008, "learning_rate": 4.458878412740523e-07, "loss": 0.75533259, "num_input_tokens_seen": 141566050, "step": 6568, "time_per_iteration": 2.7229931354522705 }, { "auxiliary_loss_clip": 0.01152228, "auxiliary_loss_mlp": 0.01023101, "balance_loss_clip": 1.04954362, "balance_loss_mlp": 1.0160979, "epoch": 0.7898755486081885, "flos": 14537573821440.0, "grad_norm": 3.385871284205846, "language_loss": 0.77929395, "learning_rate": 4.453976499740919e-07, "loss": 0.80104721, "num_input_tokens_seen": 141583695, "step": 6569, "time_per_iteration": 2.63403582572937 }, { "auxiliary_loss_clip": 0.01152363, "auxiliary_loss_mlp": 0.01026138, "balance_loss_clip": 1.04887426, "balance_loss_mlp": 1.01871705, "epoch": 0.7899957914988276, "flos": 17238487138560.0, "grad_norm": 1.9673894558522684, "language_loss": 0.77938849, "learning_rate": 4.4490769450607215e-07, "loss": 0.80117345, "num_input_tokens_seen": 141601320, "step": 6570, "time_per_iteration": 2.5651683807373047 }, { "auxiliary_loss_clip": 0.01115754, "auxiliary_loss_mlp": 0.01025733, "balance_loss_clip": 1.04037702, "balance_loss_mlp": 1.01850581, "epoch": 0.7901160343894668, "flos": 41279351086080.0, "grad_norm": 2.2550993537250688, "language_loss": 0.72591656, "learning_rate": 4.4441797494431845e-07, "loss": 0.74733144, "num_input_tokens_seen": 141623125, "step": 6571, "time_per_iteration": 2.8510100841522217 }, { "auxiliary_loss_clip": 0.01151318, "auxiliary_loss_mlp": 0.01022761, "balance_loss_clip": 1.04773986, "balance_loss_mlp": 1.01587093, "epoch": 0.7902362772801058, "flos": 16837005847680.0, "grad_norm": 2.4400792591775167, "language_loss": 0.77562678, "learning_rate": 4.439284913631207e-07, "loss": 0.79736757, "num_input_tokens_seen": 141640335, "step": 6572, "time_per_iteration": 2.5795323848724365 }, { "auxiliary_loss_clip": 0.01126764, "auxiliary_loss_mlp": 0.01028316, "balance_loss_clip": 1.04701328, "balance_loss_mlp": 1.02109194, "epoch": 0.7903565201707449, "flos": 27125987091840.0, "grad_norm": 2.6431756280233976, "language_loss": 0.83698583, "learning_rate": 4.434392438367347e-07, "loss": 0.85853666, "num_input_tokens_seen": 141659760, "step": 6573, "time_per_iteration": 2.714090585708618 }, { "auxiliary_loss_clip": 0.01158565, "auxiliary_loss_mlp": 0.0102323, "balance_loss_clip": 1.04848814, "balance_loss_mlp": 1.01622629, "epoch": 0.790476763061384, "flos": 31025167142400.0, "grad_norm": 3.6741269195192987, "language_loss": 0.74370193, "learning_rate": 4.4295023243937677e-07, "loss": 0.76551998, "num_input_tokens_seen": 141679965, "step": 6574, "time_per_iteration": 2.68658185005188 }, { "auxiliary_loss_clip": 0.0115663, "auxiliary_loss_mlp": 0.01023562, "balance_loss_clip": 1.0505172, "balance_loss_mlp": 1.01562846, "epoch": 0.7905970059520231, "flos": 22089084681600.0, "grad_norm": 2.167584207746486, "language_loss": 0.80462813, "learning_rate": 4.4246145724523123e-07, "loss": 0.82643002, "num_input_tokens_seen": 141697710, "step": 6575, "time_per_iteration": 2.696437120437622 }, { "auxiliary_loss_clip": 0.0111967, "auxiliary_loss_mlp": 0.01024417, "balance_loss_clip": 1.04611492, "balance_loss_mlp": 1.01742291, "epoch": 0.7907172488426621, "flos": 20558141159040.0, "grad_norm": 2.4134014003202187, "language_loss": 0.77052486, "learning_rate": 4.41972918328444e-07, "loss": 0.79196566, "num_input_tokens_seen": 141715145, "step": 6576, "time_per_iteration": 2.6299266815185547 }, { "auxiliary_loss_clip": 0.01152593, "auxiliary_loss_mlp": 0.01031591, "balance_loss_clip": 1.05003428, "balance_loss_mlp": 1.02421772, "epoch": 0.7908374917333013, "flos": 30081542901120.0, "grad_norm": 2.282551557611438, "language_loss": 0.77394086, "learning_rate": 4.4148461576312646e-07, "loss": 0.79578269, "num_input_tokens_seen": 141734810, "step": 6577, "time_per_iteration": 2.686990261077881 }, { "auxiliary_loss_clip": 0.01154041, "auxiliary_loss_mlp": 0.01022073, "balance_loss_clip": 1.04980683, "balance_loss_mlp": 1.01558852, "epoch": 0.7909577346239404, "flos": 20996359084800.0, "grad_norm": 1.5419260668279289, "language_loss": 0.74561799, "learning_rate": 4.4099654962335343e-07, "loss": 0.76737916, "num_input_tokens_seen": 141755260, "step": 6578, "time_per_iteration": 2.6306979656219482 }, { "auxiliary_loss_clip": 0.01146975, "auxiliary_loss_mlp": 0.01026519, "balance_loss_clip": 1.04964495, "balance_loss_mlp": 1.01958072, "epoch": 0.7910779775145794, "flos": 26247935128320.0, "grad_norm": 1.8812416983981561, "language_loss": 0.75296628, "learning_rate": 4.405087199831636e-07, "loss": 0.77470124, "num_input_tokens_seen": 141775500, "step": 6579, "time_per_iteration": 2.6841726303100586 }, { "auxiliary_loss_clip": 0.01137371, "auxiliary_loss_mlp": 0.00711781, "balance_loss_clip": 1.0447222, "balance_loss_mlp": 1.00068021, "epoch": 0.7911982204052186, "flos": 22564434291840.0, "grad_norm": 3.199576982704768, "language_loss": 0.66965306, "learning_rate": 4.400211269165619e-07, "loss": 0.68814456, "num_input_tokens_seen": 141791955, "step": 6580, "time_per_iteration": 3.5219438076019287 }, { "auxiliary_loss_clip": 0.01170525, "auxiliary_loss_mlp": 0.010279, "balance_loss_clip": 1.05197918, "balance_loss_mlp": 1.02109051, "epoch": 0.7913184632958576, "flos": 23112538899840.0, "grad_norm": 1.6639734141153257, "language_loss": 0.77011395, "learning_rate": 4.3953377049751416e-07, "loss": 0.79209816, "num_input_tokens_seen": 141812380, "step": 6581, "time_per_iteration": 2.6006486415863037 }, { "auxiliary_loss_clip": 0.01142508, "auxiliary_loss_mlp": 0.01026059, "balance_loss_clip": 1.04816246, "balance_loss_mlp": 1.01895452, "epoch": 0.7914387061864967, "flos": 12311758719360.0, "grad_norm": 2.2562952071541202, "language_loss": 0.77926683, "learning_rate": 4.390466507999537e-07, "loss": 0.80095255, "num_input_tokens_seen": 141828130, "step": 6582, "time_per_iteration": 3.608961343765259 }, { "auxiliary_loss_clip": 0.01117075, "auxiliary_loss_mlp": 0.01025455, "balance_loss_clip": 1.04368031, "balance_loss_mlp": 1.01823974, "epoch": 0.7915589490771359, "flos": 17603267708160.0, "grad_norm": 2.560505382122229, "language_loss": 0.76331007, "learning_rate": 4.385597678977748e-07, "loss": 0.78473532, "num_input_tokens_seen": 141846965, "step": 6583, "time_per_iteration": 4.401703357696533 }, { "auxiliary_loss_clip": 0.01134792, "auxiliary_loss_mlp": 0.01023815, "balance_loss_clip": 1.04478109, "balance_loss_mlp": 1.0161742, "epoch": 0.7916791919677749, "flos": 25591272641280.0, "grad_norm": 2.0084349765951726, "language_loss": 0.75683117, "learning_rate": 4.3807312186483726e-07, "loss": 0.77841723, "num_input_tokens_seen": 141867685, "step": 6584, "time_per_iteration": 2.6397879123687744 }, { "auxiliary_loss_clip": 0.01153002, "auxiliary_loss_mlp": 0.01026269, "balance_loss_clip": 1.05132675, "balance_loss_mlp": 1.0190568, "epoch": 0.791799434858414, "flos": 18844340474880.0, "grad_norm": 2.40657789878473, "language_loss": 0.78278291, "learning_rate": 4.375867127749655e-07, "loss": 0.80457562, "num_input_tokens_seen": 141885960, "step": 6585, "time_per_iteration": 2.630378246307373 }, { "auxiliary_loss_clip": 0.01122883, "auxiliary_loss_mlp": 0.01025591, "balance_loss_clip": 1.04763556, "balance_loss_mlp": 1.01849771, "epoch": 0.7919196777490531, "flos": 25812015672960.0, "grad_norm": 1.8518546276466665, "language_loss": 0.67210293, "learning_rate": 4.3710054070194744e-07, "loss": 0.69358754, "num_input_tokens_seen": 141905655, "step": 6586, "time_per_iteration": 2.713977813720703 }, { "auxiliary_loss_clip": 0.01169269, "auxiliary_loss_mlp": 0.0071164, "balance_loss_clip": 1.04881382, "balance_loss_mlp": 1.00070524, "epoch": 0.7920399206396922, "flos": 11947624594560.0, "grad_norm": 3.442648402889075, "language_loss": 0.66735053, "learning_rate": 4.3661460571953455e-07, "loss": 0.68615961, "num_input_tokens_seen": 141922390, "step": 6587, "time_per_iteration": 2.553812265396118 }, { "auxiliary_loss_clip": 0.01153008, "auxiliary_loss_mlp": 0.01022868, "balance_loss_clip": 1.04595971, "balance_loss_mlp": 1.01550066, "epoch": 0.7921601635303313, "flos": 21579907438080.0, "grad_norm": 2.0184232178915487, "language_loss": 0.68506783, "learning_rate": 4.36128907901443e-07, "loss": 0.70682657, "num_input_tokens_seen": 141941985, "step": 6588, "time_per_iteration": 2.5783140659332275 }, { "auxiliary_loss_clip": 0.01123977, "auxiliary_loss_mlp": 0.01024391, "balance_loss_clip": 1.04574597, "balance_loss_mlp": 1.01728654, "epoch": 0.7922804064209703, "flos": 18113989236480.0, "grad_norm": 1.974253577942296, "language_loss": 0.72735, "learning_rate": 4.356434473213519e-07, "loss": 0.74883366, "num_input_tokens_seen": 141959435, "step": 6589, "time_per_iteration": 2.647240400314331 }, { "auxiliary_loss_clip": 0.01137351, "auxiliary_loss_mlp": 0.0102439, "balance_loss_clip": 1.04919589, "balance_loss_mlp": 1.01720142, "epoch": 0.7924006493116095, "flos": 21652806090240.0, "grad_norm": 2.6139004991827384, "language_loss": 0.80021065, "learning_rate": 4.351582240529068e-07, "loss": 0.82182807, "num_input_tokens_seen": 141980265, "step": 6590, "time_per_iteration": 2.6292383670806885 }, { "auxiliary_loss_clip": 0.01049385, "auxiliary_loss_mlp": 0.01000497, "balance_loss_clip": 1.01859152, "balance_loss_mlp": 0.99962693, "epoch": 0.7925208922022485, "flos": 64242755694720.0, "grad_norm": 0.6753509175762528, "language_loss": 0.58156657, "learning_rate": 4.346732381697149e-07, "loss": 0.60206544, "num_input_tokens_seen": 142044395, "step": 6591, "time_per_iteration": 3.251767158508301 }, { "auxiliary_loss_clip": 0.01133722, "auxiliary_loss_mlp": 0.01023252, "balance_loss_clip": 1.04804885, "balance_loss_mlp": 1.01645994, "epoch": 0.7926411350928876, "flos": 16941541403520.0, "grad_norm": 1.9769065354523683, "language_loss": 0.8112042, "learning_rate": 4.3418848974534825e-07, "loss": 0.83277392, "num_input_tokens_seen": 142061335, "step": 6592, "time_per_iteration": 2.5598154067993164 }, { "auxiliary_loss_clip": 0.01127055, "auxiliary_loss_mlp": 0.01028628, "balance_loss_clip": 1.04673445, "balance_loss_mlp": 1.02165473, "epoch": 0.7927613779835267, "flos": 34459987144320.0, "grad_norm": 1.7574297582149283, "language_loss": 0.69118363, "learning_rate": 4.3370397885334276e-07, "loss": 0.71274048, "num_input_tokens_seen": 142081965, "step": 6593, "time_per_iteration": 2.763350486755371 }, { "auxiliary_loss_clip": 0.01147748, "auxiliary_loss_mlp": 0.01029228, "balance_loss_clip": 1.04836202, "balance_loss_mlp": 1.02208161, "epoch": 0.7928816208741658, "flos": 18951174501120.0, "grad_norm": 1.8779819571098333, "language_loss": 0.75597161, "learning_rate": 4.3321970556719777e-07, "loss": 0.77774137, "num_input_tokens_seen": 142100260, "step": 6594, "time_per_iteration": 2.5511062145233154 }, { "auxiliary_loss_clip": 0.01171374, "auxiliary_loss_mlp": 0.01028248, "balance_loss_clip": 1.05189085, "balance_loss_mlp": 1.02036834, "epoch": 0.7930018637648049, "flos": 18623022825600.0, "grad_norm": 2.4099584181644813, "language_loss": 0.72864014, "learning_rate": 4.3273566996037856e-07, "loss": 0.75063634, "num_input_tokens_seen": 142116955, "step": 6595, "time_per_iteration": 2.587583303451538 }, { "auxiliary_loss_clip": 0.0113664, "auxiliary_loss_mlp": 0.0102442, "balance_loss_clip": 1.04671955, "balance_loss_mlp": 1.01730585, "epoch": 0.793122106655444, "flos": 24530650824960.0, "grad_norm": 2.7145050519179, "language_loss": 0.80608273, "learning_rate": 4.322518721063113e-07, "loss": 0.82769334, "num_input_tokens_seen": 142135505, "step": 6596, "time_per_iteration": 2.668768882751465 }, { "auxiliary_loss_clip": 0.01154219, "auxiliary_loss_mlp": 0.01028388, "balance_loss_clip": 1.04925847, "balance_loss_mlp": 1.02088141, "epoch": 0.7932423495460831, "flos": 34421203434240.0, "grad_norm": 2.3155012511174577, "language_loss": 0.70185226, "learning_rate": 4.3176831207838906e-07, "loss": 0.72367835, "num_input_tokens_seen": 142158915, "step": 6597, "time_per_iteration": 2.755187511444092 }, { "auxiliary_loss_clip": 0.01153, "auxiliary_loss_mlp": 0.01026303, "balance_loss_clip": 1.05104291, "balance_loss_mlp": 1.0194279, "epoch": 0.7933625924367221, "flos": 26980333441920.0, "grad_norm": 1.8389221848247073, "language_loss": 0.74768978, "learning_rate": 4.3128498994996685e-07, "loss": 0.76948285, "num_input_tokens_seen": 142178390, "step": 6598, "time_per_iteration": 2.6485278606414795 }, { "auxiliary_loss_clip": 0.0115757, "auxiliary_loss_mlp": 0.01026281, "balance_loss_clip": 1.04845488, "balance_loss_mlp": 1.01908088, "epoch": 0.7934828353273613, "flos": 29568630643200.0, "grad_norm": 3.6808270994652528, "language_loss": 0.71483994, "learning_rate": 4.308019057943646e-07, "loss": 0.73667842, "num_input_tokens_seen": 142200115, "step": 6599, "time_per_iteration": 2.606463670730591 }, { "auxiliary_loss_clip": 0.01114063, "auxiliary_loss_mlp": 0.0102805, "balance_loss_clip": 1.04459071, "balance_loss_mlp": 1.02108777, "epoch": 0.7936030782180004, "flos": 28615381557120.0, "grad_norm": 1.8329948409674952, "language_loss": 0.7432133, "learning_rate": 4.3031905968486535e-07, "loss": 0.76463449, "num_input_tokens_seen": 142220945, "step": 6600, "time_per_iteration": 2.7549238204956055 }, { "auxiliary_loss_clip": 0.01106649, "auxiliary_loss_mlp": 0.01028698, "balance_loss_clip": 1.04752755, "balance_loss_mlp": 1.021626, "epoch": 0.7937233211086394, "flos": 16392574869120.0, "grad_norm": 2.0727582523614374, "language_loss": 0.68798906, "learning_rate": 4.298364516947162e-07, "loss": 0.70934248, "num_input_tokens_seen": 142238175, "step": 6601, "time_per_iteration": 2.6349849700927734 }, { "auxiliary_loss_clip": 0.01104472, "auxiliary_loss_mlp": 0.01030117, "balance_loss_clip": 1.04428649, "balance_loss_mlp": 1.0227741, "epoch": 0.7938435639992786, "flos": 22013420682240.0, "grad_norm": 5.448951397178814, "language_loss": 0.65787756, "learning_rate": 4.293540818971295e-07, "loss": 0.67922348, "num_input_tokens_seen": 142255980, "step": 6602, "time_per_iteration": 2.698711395263672 }, { "auxiliary_loss_clip": 0.01157907, "auxiliary_loss_mlp": 0.01025355, "balance_loss_clip": 1.04823291, "balance_loss_mlp": 1.01848578, "epoch": 0.7939638068899176, "flos": 22197032029440.0, "grad_norm": 3.637561132500278, "language_loss": 0.767717, "learning_rate": 4.2887195036527934e-07, "loss": 0.78954959, "num_input_tokens_seen": 142274785, "step": 6603, "time_per_iteration": 2.5769240856170654 }, { "auxiliary_loss_clip": 0.01144329, "auxiliary_loss_mlp": 0.01025532, "balance_loss_clip": 1.0443964, "balance_loss_mlp": 1.01809978, "epoch": 0.7940840497805567, "flos": 17745186343680.0, "grad_norm": 2.8914706352481616, "language_loss": 0.73540664, "learning_rate": 4.28390057172306e-07, "loss": 0.75710523, "num_input_tokens_seen": 142291290, "step": 6604, "time_per_iteration": 2.58579683303833 }, { "auxiliary_loss_clip": 0.01116429, "auxiliary_loss_mlp": 0.01021365, "balance_loss_clip": 1.04308248, "balance_loss_mlp": 1.01410508, "epoch": 0.7942042926711959, "flos": 23805435231360.0, "grad_norm": 28.165031515548232, "language_loss": 0.71910298, "learning_rate": 4.279084023913111e-07, "loss": 0.74048096, "num_input_tokens_seen": 142309165, "step": 6605, "time_per_iteration": 2.664017915725708 }, { "auxiliary_loss_clip": 0.01153186, "auxiliary_loss_mlp": 0.01023613, "balance_loss_clip": 1.0493387, "balance_loss_mlp": 1.01631784, "epoch": 0.7943245355618349, "flos": 19244959839360.0, "grad_norm": 5.151421573210266, "language_loss": 0.69411683, "learning_rate": 4.2742698609536096e-07, "loss": 0.7158848, "num_input_tokens_seen": 142327475, "step": 6606, "time_per_iteration": 3.600651741027832 }, { "auxiliary_loss_clip": 0.01142732, "auxiliary_loss_mlp": 0.01021923, "balance_loss_clip": 1.04907084, "balance_loss_mlp": 1.0149312, "epoch": 0.794444778452474, "flos": 25007616547200.0, "grad_norm": 2.056939161255601, "language_loss": 0.78579259, "learning_rate": 4.2694580835748706e-07, "loss": 0.80743915, "num_input_tokens_seen": 142347335, "step": 6607, "time_per_iteration": 2.660454750061035 }, { "auxiliary_loss_clip": 0.01136755, "auxiliary_loss_mlp": 0.01027079, "balance_loss_clip": 1.04585302, "balance_loss_mlp": 1.02015638, "epoch": 0.7945650213431131, "flos": 23221491828480.0, "grad_norm": 2.19054756561207, "language_loss": 0.74199295, "learning_rate": 4.264648692506836e-07, "loss": 0.76363128, "num_input_tokens_seen": 142366125, "step": 6608, "time_per_iteration": 4.495037794113159 }, { "auxiliary_loss_clip": 0.01132884, "auxiliary_loss_mlp": 0.01031798, "balance_loss_clip": 1.04709554, "balance_loss_mlp": 1.02442777, "epoch": 0.7946852642337522, "flos": 26062887237120.0, "grad_norm": 3.0019168873922943, "language_loss": 0.72081208, "learning_rate": 4.2598416884790824e-07, "loss": 0.74245888, "num_input_tokens_seen": 142385175, "step": 6609, "time_per_iteration": 3.5165469646453857 }, { "auxiliary_loss_clip": 0.01147772, "auxiliary_loss_mlp": 0.01028558, "balance_loss_clip": 1.04708755, "balance_loss_mlp": 1.02074397, "epoch": 0.7948055071243912, "flos": 23769704177280.0, "grad_norm": 2.5672747398306544, "language_loss": 0.80907857, "learning_rate": 4.255037072220828e-07, "loss": 0.8308419, "num_input_tokens_seen": 142406545, "step": 6610, "time_per_iteration": 2.6685054302215576 }, { "auxiliary_loss_clip": 0.01166166, "auxiliary_loss_mlp": 0.01023858, "balance_loss_clip": 1.04804838, "balance_loss_mlp": 1.01693773, "epoch": 0.7949257500150304, "flos": 21980814111360.0, "grad_norm": 1.7007780869673295, "language_loss": 0.7191571, "learning_rate": 4.2502348444609293e-07, "loss": 0.74105734, "num_input_tokens_seen": 142426165, "step": 6611, "time_per_iteration": 2.559838056564331 }, { "auxiliary_loss_clip": 0.01103749, "auxiliary_loss_mlp": 0.01027173, "balance_loss_clip": 1.04241633, "balance_loss_mlp": 1.0201664, "epoch": 0.7950459929056695, "flos": 25774129802880.0, "grad_norm": 2.4213958585361635, "language_loss": 0.69846988, "learning_rate": 4.2454350059278844e-07, "loss": 0.71977913, "num_input_tokens_seen": 142447225, "step": 6612, "time_per_iteration": 2.690706729888916 }, { "auxiliary_loss_clip": 0.01131464, "auxiliary_loss_mlp": 0.01025745, "balance_loss_clip": 1.042871, "balance_loss_mlp": 1.0184021, "epoch": 0.7951662357963085, "flos": 22158068751360.0, "grad_norm": 1.7647081666409523, "language_loss": 0.84585178, "learning_rate": 4.240637557349824e-07, "loss": 0.86742389, "num_input_tokens_seen": 142464440, "step": 6613, "time_per_iteration": 2.623934507369995 }, { "auxiliary_loss_clip": 0.01124587, "auxiliary_loss_mlp": 0.01023981, "balance_loss_clip": 1.04465842, "balance_loss_mlp": 1.01689076, "epoch": 0.7952864786869477, "flos": 24641938137600.0, "grad_norm": 2.126558964825417, "language_loss": 0.66479039, "learning_rate": 4.235842499454516e-07, "loss": 0.68627608, "num_input_tokens_seen": 142484355, "step": 6614, "time_per_iteration": 2.7063894271850586 }, { "auxiliary_loss_clip": 0.01138439, "auxiliary_loss_mlp": 0.0102585, "balance_loss_clip": 1.04846621, "balance_loss_mlp": 1.01897502, "epoch": 0.7954067215775867, "flos": 21830922656640.0, "grad_norm": 1.8517276948794326, "language_loss": 0.82756615, "learning_rate": 4.2310498329693687e-07, "loss": 0.84920901, "num_input_tokens_seen": 142505255, "step": 6615, "time_per_iteration": 2.691051721572876 }, { "auxiliary_loss_clip": 0.01155279, "auxiliary_loss_mlp": 0.01028884, "balance_loss_clip": 1.04828942, "balance_loss_mlp": 1.02057493, "epoch": 0.7955269644682258, "flos": 24060652341120.0, "grad_norm": 1.5962079421171034, "language_loss": 0.80923337, "learning_rate": 4.2262595586214164e-07, "loss": 0.83107501, "num_input_tokens_seen": 142526350, "step": 6616, "time_per_iteration": 2.7019588947296143 }, { "auxiliary_loss_clip": 0.01159468, "auxiliary_loss_mlp": 0.01026165, "balance_loss_clip": 1.05004835, "balance_loss_mlp": 1.01863647, "epoch": 0.795647207358865, "flos": 25010741030400.0, "grad_norm": 1.6037682125923098, "language_loss": 0.76953971, "learning_rate": 4.221471677137358e-07, "loss": 0.79139602, "num_input_tokens_seen": 142547165, "step": 6617, "time_per_iteration": 2.726064682006836 }, { "auxiliary_loss_clip": 0.0112605, "auxiliary_loss_mlp": 0.01022856, "balance_loss_clip": 1.04437685, "balance_loss_mlp": 1.01601887, "epoch": 0.795767450249504, "flos": 14648358343680.0, "grad_norm": 1.6640161275261063, "language_loss": 0.69864416, "learning_rate": 4.216686189243492e-07, "loss": 0.72013319, "num_input_tokens_seen": 142565955, "step": 6618, "time_per_iteration": 2.75671124458313 }, { "auxiliary_loss_clip": 0.0111677, "auxiliary_loss_mlp": 0.01025005, "balance_loss_clip": 1.0448761, "balance_loss_mlp": 1.01788568, "epoch": 0.7958876931401431, "flos": 18547897530240.0, "grad_norm": 1.866830288135807, "language_loss": 0.73041981, "learning_rate": 4.211903095665785e-07, "loss": 0.75183755, "num_input_tokens_seen": 142585340, "step": 6619, "time_per_iteration": 2.8690185546875 }, { "auxiliary_loss_clip": 0.01149496, "auxiliary_loss_mlp": 0.010289, "balance_loss_clip": 1.04858565, "balance_loss_mlp": 1.02180457, "epoch": 0.7960079360307821, "flos": 21543960902400.0, "grad_norm": 1.8412552107193014, "language_loss": 0.75518024, "learning_rate": 4.2071223971298277e-07, "loss": 0.77696425, "num_input_tokens_seen": 142602525, "step": 6620, "time_per_iteration": 2.666146993637085 }, { "auxiliary_loss_clip": 0.01153363, "auxiliary_loss_mlp": 0.01033827, "balance_loss_clip": 1.04685235, "balance_loss_mlp": 1.02609074, "epoch": 0.7961281789214213, "flos": 25481745095040.0, "grad_norm": 2.9186334332426016, "language_loss": 0.60823083, "learning_rate": 4.2023440943608433e-07, "loss": 0.63010269, "num_input_tokens_seen": 142622490, "step": 6621, "time_per_iteration": 2.748781204223633 }, { "auxiliary_loss_clip": 0.01153804, "auxiliary_loss_mlp": 0.01027091, "balance_loss_clip": 1.04755676, "balance_loss_mlp": 1.02041817, "epoch": 0.7962484218120603, "flos": 21944436612480.0, "grad_norm": 1.7046462771246507, "language_loss": 0.78122103, "learning_rate": 4.1975681880837023e-07, "loss": 0.80302995, "num_input_tokens_seen": 142642495, "step": 6622, "time_per_iteration": 2.7829208374023438 }, { "auxiliary_loss_clip": 0.01115958, "auxiliary_loss_mlp": 0.01019653, "balance_loss_clip": 1.04172266, "balance_loss_mlp": 1.01261711, "epoch": 0.7963686647026994, "flos": 18876264687360.0, "grad_norm": 1.7354335763703714, "language_loss": 0.82631212, "learning_rate": 4.192794679022895e-07, "loss": 0.84766823, "num_input_tokens_seen": 142660820, "step": 6623, "time_per_iteration": 2.688035488128662 }, { "auxiliary_loss_clip": 0.01154502, "auxiliary_loss_mlp": 0.01022366, "balance_loss_clip": 1.0473516, "balance_loss_mlp": 1.01519012, "epoch": 0.7964889075933386, "flos": 29716582763520.0, "grad_norm": 1.9925890476446921, "language_loss": 0.72155035, "learning_rate": 4.1880235679025743e-07, "loss": 0.74331897, "num_input_tokens_seen": 142680915, "step": 6624, "time_per_iteration": 2.6414718627929688 }, { "auxiliary_loss_clip": 0.01090915, "auxiliary_loss_mlp": 0.01029359, "balance_loss_clip": 1.04212046, "balance_loss_mlp": 1.02144337, "epoch": 0.7966091504839776, "flos": 29491458272640.0, "grad_norm": 1.9862219473857463, "language_loss": 0.639099, "learning_rate": 4.1832548554464986e-07, "loss": 0.66030169, "num_input_tokens_seen": 142699210, "step": 6625, "time_per_iteration": 2.818605422973633 }, { "auxiliary_loss_clip": 0.01057012, "auxiliary_loss_mlp": 0.01001871, "balance_loss_clip": 1.01950908, "balance_loss_mlp": 1.00101268, "epoch": 0.7967293933746167, "flos": 67288697101440.0, "grad_norm": 0.7404826625495692, "language_loss": 0.58705747, "learning_rate": 4.178488542378098e-07, "loss": 0.60764623, "num_input_tokens_seen": 142756790, "step": 6626, "time_per_iteration": 3.087916374206543 }, { "auxiliary_loss_clip": 0.01172643, "auxiliary_loss_mlp": 0.01028705, "balance_loss_clip": 1.05015004, "balance_loss_mlp": 1.0210222, "epoch": 0.7968496362652558, "flos": 25554679660800.0, "grad_norm": 1.6319170572550619, "language_loss": 0.88875425, "learning_rate": 4.173724629420401e-07, "loss": 0.91076779, "num_input_tokens_seen": 142778150, "step": 6627, "time_per_iteration": 2.619262456893921 }, { "auxiliary_loss_clip": 0.01140751, "auxiliary_loss_mlp": 0.01027109, "balance_loss_clip": 1.04637599, "balance_loss_mlp": 1.01963115, "epoch": 0.7969698791558949, "flos": 14501088581760.0, "grad_norm": 2.33050147256005, "language_loss": 0.68419927, "learning_rate": 4.168963117296087e-07, "loss": 0.70587784, "num_input_tokens_seen": 142795485, "step": 6628, "time_per_iteration": 2.5914952754974365 }, { "auxiliary_loss_clip": 0.01170664, "auxiliary_loss_mlp": 0.01027552, "balance_loss_clip": 1.05141532, "balance_loss_mlp": 1.02051914, "epoch": 0.797090122046534, "flos": 22127545169280.0, "grad_norm": 2.199965190794434, "language_loss": 0.75614297, "learning_rate": 4.1642040067274876e-07, "loss": 0.77812505, "num_input_tokens_seen": 142815155, "step": 6629, "time_per_iteration": 2.572049856185913 }, { "auxiliary_loss_clip": 0.01143131, "auxiliary_loss_mlp": 0.01022898, "balance_loss_clip": 1.04678035, "balance_loss_mlp": 1.01570034, "epoch": 0.7972103649371731, "flos": 19897671830400.0, "grad_norm": 2.0747854325823014, "language_loss": 0.72470868, "learning_rate": 4.1594472984365493e-07, "loss": 0.746369, "num_input_tokens_seen": 142833840, "step": 6630, "time_per_iteration": 2.652209997177124 }, { "auxiliary_loss_clip": 0.01149598, "auxiliary_loss_mlp": 0.01028017, "balance_loss_clip": 1.04763746, "balance_loss_mlp": 1.02110291, "epoch": 0.7973306078278122, "flos": 36058621847040.0, "grad_norm": 2.511280480465031, "language_loss": 0.77937007, "learning_rate": 4.154692993144862e-07, "loss": 0.80114615, "num_input_tokens_seen": 142853610, "step": 6631, "time_per_iteration": 2.738544464111328 }, { "auxiliary_loss_clip": 0.01169381, "auxiliary_loss_mlp": 0.00711201, "balance_loss_clip": 1.05052209, "balance_loss_mlp": 1.00081646, "epoch": 0.7974508507184512, "flos": 21360600950400.0, "grad_norm": 2.00231451468593, "language_loss": 0.71523267, "learning_rate": 4.1499410915736476e-07, "loss": 0.73403847, "num_input_tokens_seen": 142872540, "step": 6632, "time_per_iteration": 3.4627928733825684 }, { "auxiliary_loss_clip": 0.01060204, "auxiliary_loss_mlp": 0.01002014, "balance_loss_clip": 1.01901329, "balance_loss_mlp": 1.00108457, "epoch": 0.7975710936090904, "flos": 68253115317120.0, "grad_norm": 0.7770086260610872, "language_loss": 0.64276826, "learning_rate": 4.145191594443762e-07, "loss": 0.6633904, "num_input_tokens_seen": 142936895, "step": 6633, "time_per_iteration": 3.336726427078247 }, { "auxiliary_loss_clip": 0.01116537, "auxiliary_loss_mlp": 0.01030408, "balance_loss_clip": 1.04547644, "balance_loss_mlp": 1.02303243, "epoch": 0.7976913364997295, "flos": 22492433479680.0, "grad_norm": 2.070218358171074, "language_loss": 0.70501846, "learning_rate": 4.140444502475713e-07, "loss": 0.72648787, "num_input_tokens_seen": 142956445, "step": 6634, "time_per_iteration": 3.608452081680298 }, { "auxiliary_loss_clip": 0.01148709, "auxiliary_loss_mlp": 0.01029015, "balance_loss_clip": 1.04609895, "balance_loss_mlp": 1.02175498, "epoch": 0.7978115793903685, "flos": 15263220378240.0, "grad_norm": 1.968710934738966, "language_loss": 0.69995743, "learning_rate": 4.1356998163896216e-07, "loss": 0.72173464, "num_input_tokens_seen": 142973495, "step": 6635, "time_per_iteration": 4.394735097885132 }, { "auxiliary_loss_clip": 0.01127419, "auxiliary_loss_mlp": 0.01027593, "balance_loss_clip": 1.04746425, "balance_loss_mlp": 1.02041698, "epoch": 0.7979318222810077, "flos": 19719232041600.0, "grad_norm": 2.060576042391598, "language_loss": 0.74866921, "learning_rate": 4.130957536905255e-07, "loss": 0.77021933, "num_input_tokens_seen": 142991510, "step": 6636, "time_per_iteration": 2.7802066802978516 }, { "auxiliary_loss_clip": 0.01146995, "auxiliary_loss_mlp": 0.01031985, "balance_loss_clip": 1.04799151, "balance_loss_mlp": 1.02412891, "epoch": 0.7980520651716467, "flos": 15560273854080.0, "grad_norm": 2.2729613287098727, "language_loss": 0.71310973, "learning_rate": 4.1262176647420134e-07, "loss": 0.73489952, "num_input_tokens_seen": 143009675, "step": 6637, "time_per_iteration": 2.5711400508880615 }, { "auxiliary_loss_clip": 0.01146697, "auxiliary_loss_mlp": 0.01027616, "balance_loss_clip": 1.0494318, "balance_loss_mlp": 1.02072871, "epoch": 0.7981723080622858, "flos": 22309432663680.0, "grad_norm": 2.0527699363405034, "language_loss": 0.80107462, "learning_rate": 4.121480200618923e-07, "loss": 0.8228178, "num_input_tokens_seen": 143029330, "step": 6638, "time_per_iteration": 2.6394503116607666 }, { "auxiliary_loss_clip": 0.01129504, "auxiliary_loss_mlp": 0.01026698, "balance_loss_clip": 1.04438663, "balance_loss_mlp": 1.01951265, "epoch": 0.798292550952925, "flos": 22929573997440.0, "grad_norm": 2.5524618613119774, "language_loss": 0.80212855, "learning_rate": 4.116745145254674e-07, "loss": 0.82369053, "num_input_tokens_seen": 143048865, "step": 6639, "time_per_iteration": 2.607969045639038 }, { "auxiliary_loss_clip": 0.01040562, "auxiliary_loss_mlp": 0.01002069, "balance_loss_clip": 1.01713812, "balance_loss_mlp": 1.00113893, "epoch": 0.798412793843564, "flos": 64497936890880.0, "grad_norm": 0.7706704759547588, "language_loss": 0.58037043, "learning_rate": 4.1120124993675476e-07, "loss": 0.6007967, "num_input_tokens_seen": 143113295, "step": 6640, "time_per_iteration": 3.2567930221557617 }, { "auxiliary_loss_clip": 0.01146008, "auxiliary_loss_mlp": 0.01029327, "balance_loss_clip": 1.0466181, "balance_loss_mlp": 1.02204871, "epoch": 0.7985330367342031, "flos": 13586910514560.0, "grad_norm": 7.437341520420707, "language_loss": 0.6207056, "learning_rate": 4.107282263675498e-07, "loss": 0.64245892, "num_input_tokens_seen": 143130965, "step": 6641, "time_per_iteration": 2.61810302734375 }, { "auxiliary_loss_clip": 0.01042087, "auxiliary_loss_mlp": 0.00701171, "balance_loss_clip": 1.01896751, "balance_loss_mlp": 1.00009632, "epoch": 0.7986532796248422, "flos": 67698797656320.0, "grad_norm": 0.7666143039444175, "language_loss": 0.52447504, "learning_rate": 4.1025544388960907e-07, "loss": 0.54190761, "num_input_tokens_seen": 143192005, "step": 6642, "time_per_iteration": 3.1894545555114746 }, { "auxiliary_loss_clip": 0.0114883, "auxiliary_loss_mlp": 0.01026872, "balance_loss_clip": 1.04721689, "balance_loss_mlp": 1.02032483, "epoch": 0.7987735225154813, "flos": 22455373622400.0, "grad_norm": 2.1286741353503063, "language_loss": 0.71640992, "learning_rate": 4.097829025746538e-07, "loss": 0.73816693, "num_input_tokens_seen": 143213550, "step": 6643, "time_per_iteration": 2.6534042358398438 }, { "auxiliary_loss_clip": 0.01057667, "auxiliary_loss_mlp": 0.01001189, "balance_loss_clip": 1.01876998, "balance_loss_mlp": 1.00034285, "epoch": 0.7988937654061203, "flos": 68864098682880.0, "grad_norm": 0.6592043468374381, "language_loss": 0.6099025, "learning_rate": 4.0931060249436757e-07, "loss": 0.63049102, "num_input_tokens_seen": 143277390, "step": 6644, "time_per_iteration": 3.217193841934204 }, { "auxiliary_loss_clip": 0.01152327, "auxiliary_loss_mlp": 0.01030093, "balance_loss_clip": 1.0507623, "balance_loss_mlp": 1.02281523, "epoch": 0.7990140082967595, "flos": 20806893820800.0, "grad_norm": 1.981296378147469, "language_loss": 0.70008314, "learning_rate": 4.088385437203978e-07, "loss": 0.72190732, "num_input_tokens_seen": 143294400, "step": 6645, "time_per_iteration": 2.639737606048584 }, { "auxiliary_loss_clip": 0.01169147, "auxiliary_loss_mlp": 0.01025615, "balance_loss_clip": 1.04851151, "balance_loss_mlp": 1.01871288, "epoch": 0.7991342511873986, "flos": 18985289443200.0, "grad_norm": 2.4510256056060107, "language_loss": 0.77943707, "learning_rate": 4.083667263243564e-07, "loss": 0.80138469, "num_input_tokens_seen": 143312745, "step": 6646, "time_per_iteration": 2.53383469581604 }, { "auxiliary_loss_clip": 0.01150751, "auxiliary_loss_mlp": 0.01023866, "balance_loss_clip": 1.04932415, "balance_loss_mlp": 1.01707411, "epoch": 0.7992544940780376, "flos": 20816805974400.0, "grad_norm": 1.7601248917114507, "language_loss": 0.71810257, "learning_rate": 4.0789515037781653e-07, "loss": 0.73984873, "num_input_tokens_seen": 143333470, "step": 6647, "time_per_iteration": 2.6269614696502686 }, { "auxiliary_loss_clip": 0.01157316, "auxiliary_loss_mlp": 0.01026162, "balance_loss_clip": 1.04926372, "balance_loss_mlp": 1.01917648, "epoch": 0.7993747369686768, "flos": 12640772321280.0, "grad_norm": 3.2941896444889034, "language_loss": 0.82709545, "learning_rate": 4.0742381595231755e-07, "loss": 0.84893024, "num_input_tokens_seen": 143350195, "step": 6648, "time_per_iteration": 2.5761396884918213 }, { "auxiliary_loss_clip": 0.0112844, "auxiliary_loss_mlp": 0.01022747, "balance_loss_clip": 1.04774785, "balance_loss_mlp": 1.0160352, "epoch": 0.7994949798593158, "flos": 20078769225600.0, "grad_norm": 1.9896928994504073, "language_loss": 0.78115308, "learning_rate": 4.06952723119359e-07, "loss": 0.802665, "num_input_tokens_seen": 143370070, "step": 6649, "time_per_iteration": 2.7260141372680664 }, { "auxiliary_loss_clip": 0.01130998, "auxiliary_loss_mlp": 0.01027837, "balance_loss_clip": 1.04755497, "balance_loss_mlp": 1.02103639, "epoch": 0.7996152227499549, "flos": 38654209509120.0, "grad_norm": 2.07744051055506, "language_loss": 0.67359066, "learning_rate": 4.0648187195040504e-07, "loss": 0.69517905, "num_input_tokens_seen": 143392275, "step": 6650, "time_per_iteration": 2.8037877082824707 }, { "auxiliary_loss_clip": 0.01056757, "auxiliary_loss_mlp": 0.01000595, "balance_loss_clip": 1.01935506, "balance_loss_mlp": 0.99975491, "epoch": 0.799735465640594, "flos": 70243821947520.0, "grad_norm": 0.815906068407652, "language_loss": 0.6753819, "learning_rate": 4.060112625168848e-07, "loss": 0.6959554, "num_input_tokens_seen": 143457385, "step": 6651, "time_per_iteration": 3.2508466243743896 }, { "auxiliary_loss_clip": 0.01169666, "auxiliary_loss_mlp": 0.01025125, "balance_loss_clip": 1.05038786, "balance_loss_mlp": 1.01780868, "epoch": 0.7998557085312331, "flos": 24240995550720.0, "grad_norm": 3.6170636602410546, "language_loss": 0.739434, "learning_rate": 4.055408948901886e-07, "loss": 0.76138192, "num_input_tokens_seen": 143478785, "step": 6652, "time_per_iteration": 2.638972520828247 }, { "auxiliary_loss_clip": 0.01158905, "auxiliary_loss_mlp": 0.01024803, "balance_loss_clip": 1.05097413, "balance_loss_mlp": 1.01736736, "epoch": 0.7999759514218722, "flos": 27564025449600.0, "grad_norm": 2.2170863331840085, "language_loss": 0.71773827, "learning_rate": 4.050707691416708e-07, "loss": 0.73957527, "num_input_tokens_seen": 143500095, "step": 6653, "time_per_iteration": 2.706064462661743 }, { "auxiliary_loss_clip": 0.01056317, "auxiliary_loss_mlp": 0.01001053, "balance_loss_clip": 1.01904225, "balance_loss_mlp": 1.00021231, "epoch": 0.8000961943125112, "flos": 67337428878720.0, "grad_norm": 0.6784165638452383, "language_loss": 0.5970248, "learning_rate": 4.046008853426495e-07, "loss": 0.61759853, "num_input_tokens_seen": 143563410, "step": 6654, "time_per_iteration": 3.3545219898223877 }, { "auxiliary_loss_clip": 0.01114243, "auxiliary_loss_mlp": 0.01029562, "balance_loss_clip": 1.04294562, "balance_loss_mlp": 1.0221771, "epoch": 0.8002164372031504, "flos": 28733815676160.0, "grad_norm": 2.6048311209966957, "language_loss": 0.62755072, "learning_rate": 4.0413124356440464e-07, "loss": 0.64898878, "num_input_tokens_seen": 143587455, "step": 6655, "time_per_iteration": 2.751891613006592 }, { "auxiliary_loss_clip": 0.0110807, "auxiliary_loss_mlp": 0.01027141, "balance_loss_clip": 1.04324186, "balance_loss_mlp": 1.01990747, "epoch": 0.8003366800937894, "flos": 17639429725440.0, "grad_norm": 1.8476155778237373, "language_loss": 0.82385933, "learning_rate": 4.0366184387818223e-07, "loss": 0.84521139, "num_input_tokens_seen": 143605915, "step": 6656, "time_per_iteration": 2.6897313594818115 }, { "auxiliary_loss_clip": 0.01175542, "auxiliary_loss_mlp": 0.01026914, "balance_loss_clip": 1.05143666, "balance_loss_mlp": 1.01924849, "epoch": 0.8004569229844285, "flos": 25995303797760.0, "grad_norm": 1.8649224967384062, "language_loss": 0.85403615, "learning_rate": 4.0319268635518797e-07, "loss": 0.87606066, "num_input_tokens_seen": 143626490, "step": 6657, "time_per_iteration": 213.69413924217224 }, { "auxiliary_loss_clip": 0.01152506, "auxiliary_loss_mlp": 0.01021647, "balance_loss_clip": 1.04726148, "balance_loss_mlp": 1.01465821, "epoch": 0.8005771658750677, "flos": 20812352688000.0, "grad_norm": 1.7055529731978805, "language_loss": 0.75333321, "learning_rate": 4.027237710665943e-07, "loss": 0.77507472, "num_input_tokens_seen": 143644955, "step": 6658, "time_per_iteration": 3.524895429611206 }, { "auxiliary_loss_clip": 0.01126275, "auxiliary_loss_mlp": 0.01023988, "balance_loss_clip": 1.0451417, "balance_loss_mlp": 1.01643896, "epoch": 0.8006974087657067, "flos": 25812626204160.0, "grad_norm": 2.1046434332310606, "language_loss": 0.69459176, "learning_rate": 4.022550980835344e-07, "loss": 0.71609443, "num_input_tokens_seen": 143667200, "step": 6659, "time_per_iteration": 3.703817129135132 }, { "auxiliary_loss_clip": 0.01116332, "auxiliary_loss_mlp": 0.01024998, "balance_loss_clip": 1.04114819, "balance_loss_mlp": 1.01746392, "epoch": 0.8008176516563458, "flos": 17164690646400.0, "grad_norm": 2.269902656627892, "language_loss": 0.79954898, "learning_rate": 4.017866674771051e-07, "loss": 0.82096225, "num_input_tokens_seen": 143684685, "step": 6660, "time_per_iteration": 4.63840651512146 }, { "auxiliary_loss_clip": 0.01100934, "auxiliary_loss_mlp": 0.01026169, "balance_loss_clip": 1.04312849, "balance_loss_mlp": 1.01842082, "epoch": 0.8009378945469849, "flos": 24207311571840.0, "grad_norm": 1.7506446614487197, "language_loss": 0.74496472, "learning_rate": 4.013184793183688e-07, "loss": 0.76623583, "num_input_tokens_seen": 143706780, "step": 6661, "time_per_iteration": 2.7105884552001953 }, { "auxiliary_loss_clip": 0.01152407, "auxiliary_loss_mlp": 0.0102848, "balance_loss_clip": 1.04607666, "balance_loss_mlp": 1.0216434, "epoch": 0.801058137437624, "flos": 19787318271360.0, "grad_norm": 2.036439791826605, "language_loss": 0.7279675, "learning_rate": 4.008505336783472e-07, "loss": 0.74977636, "num_input_tokens_seen": 143724505, "step": 6662, "time_per_iteration": 2.5996387004852295 }, { "auxiliary_loss_clip": 0.01143967, "auxiliary_loss_mlp": 0.01022725, "balance_loss_clip": 1.0465672, "balance_loss_mlp": 1.01624858, "epoch": 0.801178380328263, "flos": 18659400324480.0, "grad_norm": 2.066870754225071, "language_loss": 0.80666631, "learning_rate": 4.003828306280284e-07, "loss": 0.82833326, "num_input_tokens_seen": 143742180, "step": 6663, "time_per_iteration": 2.586925983428955 }, { "auxiliary_loss_clip": 0.01153673, "auxiliary_loss_mlp": 0.01025744, "balance_loss_clip": 1.04794562, "balance_loss_mlp": 1.01895773, "epoch": 0.8012986232189022, "flos": 15706573948800.0, "grad_norm": 1.9075963952187265, "language_loss": 0.78183949, "learning_rate": 3.999153702383626e-07, "loss": 0.80363369, "num_input_tokens_seen": 143760070, "step": 6664, "time_per_iteration": 2.6121246814727783 }, { "auxiliary_loss_clip": 0.01160342, "auxiliary_loss_mlp": 0.01025721, "balance_loss_clip": 1.04981875, "balance_loss_mlp": 1.01859856, "epoch": 0.8014188661095413, "flos": 28584139703040.0, "grad_norm": 1.926758036658096, "language_loss": 0.74159569, "learning_rate": 3.9944815258026263e-07, "loss": 0.76345628, "num_input_tokens_seen": 143781890, "step": 6665, "time_per_iteration": 2.689671277999878 }, { "auxiliary_loss_clip": 0.01157599, "auxiliary_loss_mlp": 0.01026333, "balance_loss_clip": 1.04959083, "balance_loss_mlp": 1.0186919, "epoch": 0.8015391090001803, "flos": 29310360877440.0, "grad_norm": 2.4290160195341604, "language_loss": 0.83204901, "learning_rate": 3.989811777246057e-07, "loss": 0.85388833, "num_input_tokens_seen": 143802060, "step": 6666, "time_per_iteration": 2.695364236831665 }, { "auxiliary_loss_clip": 0.01069981, "auxiliary_loss_mlp": 0.01000502, "balance_loss_clip": 1.01837301, "balance_loss_mlp": 0.99965584, "epoch": 0.8016593518908195, "flos": 70397340675840.0, "grad_norm": 0.8530511016004884, "language_loss": 0.66157281, "learning_rate": 3.985144457422305e-07, "loss": 0.68227768, "num_input_tokens_seen": 143856345, "step": 6667, "time_per_iteration": 3.08990478515625 }, { "auxiliary_loss_clip": 0.01172101, "auxiliary_loss_mlp": 0.010272, "balance_loss_clip": 1.05077243, "balance_loss_mlp": 1.02015471, "epoch": 0.8017795947814585, "flos": 26026114688640.0, "grad_norm": 2.4899225182746916, "language_loss": 0.76929826, "learning_rate": 3.9804795670394096e-07, "loss": 0.7912913, "num_input_tokens_seen": 143876470, "step": 6668, "time_per_iteration": 2.6009929180145264 }, { "auxiliary_loss_clip": 0.0112915, "auxiliary_loss_mlp": 0.01032917, "balance_loss_clip": 1.04530287, "balance_loss_mlp": 1.02543676, "epoch": 0.8018998376720976, "flos": 22087181260800.0, "grad_norm": 2.135619778875568, "language_loss": 0.70773125, "learning_rate": 3.975817106805022e-07, "loss": 0.72935188, "num_input_tokens_seen": 143895170, "step": 6669, "time_per_iteration": 2.7058708667755127 }, { "auxiliary_loss_clip": 0.01123424, "auxiliary_loss_mlp": 0.0102861, "balance_loss_clip": 1.04637122, "balance_loss_mlp": 1.0216068, "epoch": 0.8020200805627368, "flos": 34568545023360.0, "grad_norm": 2.3600714626615007, "language_loss": 0.64823234, "learning_rate": 3.97115707742645e-07, "loss": 0.66975272, "num_input_tokens_seen": 143915845, "step": 6670, "time_per_iteration": 2.821582317352295 }, { "auxiliary_loss_clip": 0.01141814, "auxiliary_loss_mlp": 0.01029312, "balance_loss_clip": 1.050179, "balance_loss_mlp": 1.02214456, "epoch": 0.8021403234533758, "flos": 20120354196480.0, "grad_norm": 2.064303915219861, "language_loss": 0.64830005, "learning_rate": 3.966499479610599e-07, "loss": 0.67001128, "num_input_tokens_seen": 143933940, "step": 6671, "time_per_iteration": 2.6648218631744385 }, { "auxiliary_loss_clip": 0.01120856, "auxiliary_loss_mlp": 0.01025561, "balance_loss_clip": 1.04865742, "balance_loss_mlp": 1.01834559, "epoch": 0.8022605663440149, "flos": 27746200252800.0, "grad_norm": 3.1626046748550314, "language_loss": 0.65388536, "learning_rate": 3.9618443140640225e-07, "loss": 0.67534947, "num_input_tokens_seen": 143952850, "step": 6672, "time_per_iteration": 2.71114182472229 }, { "auxiliary_loss_clip": 0.01019654, "auxiliary_loss_mlp": 0.01001173, "balance_loss_clip": 1.01799345, "balance_loss_mlp": 1.00021982, "epoch": 0.802380809234654, "flos": 60244998768000.0, "grad_norm": 0.6866377509822201, "language_loss": 0.51303053, "learning_rate": 3.957191581492918e-07, "loss": 0.53323883, "num_input_tokens_seen": 144013610, "step": 6673, "time_per_iteration": 3.2953219413757324 }, { "auxiliary_loss_clip": 0.01132941, "auxiliary_loss_mlp": 0.01019875, "balance_loss_clip": 1.04594731, "balance_loss_mlp": 1.01260662, "epoch": 0.8025010521252931, "flos": 15080722352640.0, "grad_norm": 3.7480098320637154, "language_loss": 0.70992076, "learning_rate": 3.952541282603097e-07, "loss": 0.73144901, "num_input_tokens_seen": 144028715, "step": 6674, "time_per_iteration": 2.6614294052124023 }, { "auxiliary_loss_clip": 0.0114927, "auxiliary_loss_mlp": 0.01025675, "balance_loss_clip": 1.04720044, "balance_loss_mlp": 1.01864171, "epoch": 0.8026212950159322, "flos": 22163527618560.0, "grad_norm": 1.9788970239814265, "language_loss": 0.83575022, "learning_rate": 3.9478934181000013e-07, "loss": 0.85749972, "num_input_tokens_seen": 144048740, "step": 6675, "time_per_iteration": 2.618159055709839 }, { "auxiliary_loss_clip": 0.01173522, "auxiliary_loss_mlp": 0.01032485, "balance_loss_clip": 1.05032468, "balance_loss_mlp": 1.02464736, "epoch": 0.8027415379065713, "flos": 17675986792320.0, "grad_norm": 2.982865730369195, "language_loss": 0.84251243, "learning_rate": 3.943247988688714e-07, "loss": 0.86457253, "num_input_tokens_seen": 144067435, "step": 6676, "time_per_iteration": 2.602694511413574 }, { "auxiliary_loss_clip": 0.01154397, "auxiliary_loss_mlp": 0.01025991, "balance_loss_clip": 1.04786527, "balance_loss_mlp": 1.0195601, "epoch": 0.8028617807972104, "flos": 21979593048960.0, "grad_norm": 1.9739321477819431, "language_loss": 0.71791732, "learning_rate": 3.938604995073933e-07, "loss": 0.73972118, "num_input_tokens_seen": 144085905, "step": 6677, "time_per_iteration": 2.635695457458496 }, { "auxiliary_loss_clip": 0.01141938, "auxiliary_loss_mlp": 0.01025773, "balance_loss_clip": 1.04831386, "balance_loss_mlp": 1.01878774, "epoch": 0.8029820236878494, "flos": 26428457905920.0, "grad_norm": 3.038037096071451, "language_loss": 0.65427089, "learning_rate": 3.9339644379600157e-07, "loss": 0.67594796, "num_input_tokens_seen": 144105735, "step": 6678, "time_per_iteration": 3.024228096008301 }, { "auxiliary_loss_clip": 0.01156556, "auxiliary_loss_mlp": 0.01030453, "balance_loss_clip": 1.05009604, "balance_loss_mlp": 1.02324688, "epoch": 0.8031022665784886, "flos": 17676489582720.0, "grad_norm": 2.251023937658301, "language_loss": 0.7108137, "learning_rate": 3.929326318050907e-07, "loss": 0.73268378, "num_input_tokens_seen": 144123405, "step": 6679, "time_per_iteration": 2.562361478805542 }, { "auxiliary_loss_clip": 0.01167026, "auxiliary_loss_mlp": 0.01022029, "balance_loss_clip": 1.04840863, "balance_loss_mlp": 1.01469207, "epoch": 0.8032225094691277, "flos": 15450279431040.0, "grad_norm": 1.9829342427244274, "language_loss": 0.78995889, "learning_rate": 3.924690636050225e-07, "loss": 0.81184947, "num_input_tokens_seen": 144140815, "step": 6680, "time_per_iteration": 2.6177666187286377 }, { "auxiliary_loss_clip": 0.01154534, "auxiliary_loss_mlp": 0.01023736, "balance_loss_clip": 1.04902911, "balance_loss_mlp": 1.01662254, "epoch": 0.8033427523597667, "flos": 26179202453760.0, "grad_norm": 4.1155628373265145, "language_loss": 0.73246515, "learning_rate": 3.9200573926611915e-07, "loss": 0.75424778, "num_input_tokens_seen": 144162230, "step": 6681, "time_per_iteration": 2.6678566932678223 }, { "auxiliary_loss_clip": 0.01151993, "auxiliary_loss_mlp": 0.01027125, "balance_loss_clip": 1.04925036, "balance_loss_mlp": 1.02025294, "epoch": 0.8034629952504058, "flos": 21324905809920.0, "grad_norm": 2.127806663205187, "language_loss": 0.73452473, "learning_rate": 3.9154265885866613e-07, "loss": 0.75631589, "num_input_tokens_seen": 144181540, "step": 6682, "time_per_iteration": 2.6473655700683594 }, { "auxiliary_loss_clip": 0.01151101, "auxiliary_loss_mlp": 0.01026621, "balance_loss_clip": 1.04805779, "balance_loss_mlp": 1.01885188, "epoch": 0.8035832381410449, "flos": 21651585027840.0, "grad_norm": 5.736677957364852, "language_loss": 0.74617362, "learning_rate": 3.9107982245291394e-07, "loss": 0.76795077, "num_input_tokens_seen": 144199665, "step": 6683, "time_per_iteration": 2.58906888961792 }, { "auxiliary_loss_clip": 0.0112217, "auxiliary_loss_mlp": 0.01022744, "balance_loss_clip": 1.04672813, "balance_loss_mlp": 1.01568341, "epoch": 0.803703481031684, "flos": 20518818744960.0, "grad_norm": 1.8940223152373772, "language_loss": 0.77594107, "learning_rate": 3.9061723011907245e-07, "loss": 0.79739022, "num_input_tokens_seen": 144219020, "step": 6684, "time_per_iteration": 3.682718515396118 }, { "auxiliary_loss_clip": 0.01134295, "auxiliary_loss_mlp": 0.01026523, "balance_loss_clip": 1.0460608, "balance_loss_mlp": 1.0185535, "epoch": 0.803823723922323, "flos": 22854807838080.0, "grad_norm": 1.888891851446994, "language_loss": 0.79208064, "learning_rate": 3.901548819273179e-07, "loss": 0.81368881, "num_input_tokens_seen": 144239035, "step": 6685, "time_per_iteration": 3.5723350048065186 }, { "auxiliary_loss_clip": 0.01156077, "auxiliary_loss_mlp": 0.01028261, "balance_loss_clip": 1.05045438, "balance_loss_mlp": 1.020715, "epoch": 0.8039439668129622, "flos": 21362145235200.0, "grad_norm": 3.538076080284294, "language_loss": 0.69662905, "learning_rate": 3.896927779477881e-07, "loss": 0.71847248, "num_input_tokens_seen": 144258295, "step": 6686, "time_per_iteration": 4.415302276611328 }, { "auxiliary_loss_clip": 0.01122568, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.04492259, "balance_loss_mlp": 1.02383375, "epoch": 0.8040642097036013, "flos": 23802382575360.0, "grad_norm": 2.5613520527203555, "language_loss": 0.66931248, "learning_rate": 3.892309182505833e-07, "loss": 0.69084752, "num_input_tokens_seen": 144276110, "step": 6687, "time_per_iteration": 2.702571392059326 }, { "auxiliary_loss_clip": 0.01168122, "auxiliary_loss_mlp": 0.01025043, "balance_loss_clip": 1.04856014, "balance_loss_mlp": 1.01801872, "epoch": 0.8041844525942403, "flos": 25922046009600.0, "grad_norm": 2.246616620280872, "language_loss": 0.86599511, "learning_rate": 3.887693029057675e-07, "loss": 0.88792676, "num_input_tokens_seen": 144295620, "step": 6688, "time_per_iteration": 2.6766130924224854 }, { "auxiliary_loss_clip": 0.01141255, "auxiliary_loss_mlp": 0.01022174, "balance_loss_clip": 1.04846203, "balance_loss_mlp": 1.01583838, "epoch": 0.8043046954848795, "flos": 25191120153600.0, "grad_norm": 1.7348370853865114, "language_loss": 0.81464583, "learning_rate": 3.8830793198336684e-07, "loss": 0.83628011, "num_input_tokens_seen": 144315210, "step": 6689, "time_per_iteration": 2.6283977031707764 }, { "auxiliary_loss_clip": 0.01160442, "auxiliary_loss_mlp": 0.01025231, "balance_loss_clip": 1.04936385, "balance_loss_mlp": 1.0182364, "epoch": 0.8044249383755185, "flos": 41719185123840.0, "grad_norm": 1.7953286346664137, "language_loss": 0.70357609, "learning_rate": 3.878468055533721e-07, "loss": 0.72543281, "num_input_tokens_seen": 144337750, "step": 6690, "time_per_iteration": 2.7919869422912598 }, { "auxiliary_loss_clip": 0.01131362, "auxiliary_loss_mlp": 0.01032656, "balance_loss_clip": 1.0484041, "balance_loss_mlp": 1.02489889, "epoch": 0.8045451812661576, "flos": 20631434860800.0, "grad_norm": 3.218021386139445, "language_loss": 0.85169709, "learning_rate": 3.8738592368573464e-07, "loss": 0.87333727, "num_input_tokens_seen": 144355305, "step": 6691, "time_per_iteration": 2.6398167610168457 }, { "auxiliary_loss_clip": 0.01114094, "auxiliary_loss_mlp": 0.01025887, "balance_loss_clip": 1.04542923, "balance_loss_mlp": 1.01898158, "epoch": 0.8046654241567968, "flos": 29711806254720.0, "grad_norm": 3.1650380923374146, "language_loss": 0.88178003, "learning_rate": 3.8692528645037137e-07, "loss": 0.90317982, "num_input_tokens_seen": 144374485, "step": 6692, "time_per_iteration": 2.6921749114990234 }, { "auxiliary_loss_clip": 0.01169002, "auxiliary_loss_mlp": 0.01026777, "balance_loss_clip": 1.05020678, "balance_loss_mlp": 1.01955247, "epoch": 0.8047856670474358, "flos": 17671389851520.0, "grad_norm": 2.6066596629651104, "language_loss": 0.77770233, "learning_rate": 3.8646489391715907e-07, "loss": 0.79966009, "num_input_tokens_seen": 144388780, "step": 6693, "time_per_iteration": 2.536039352416992 }, { "auxiliary_loss_clip": 0.01140658, "auxiliary_loss_mlp": 0.01028244, "balance_loss_clip": 1.04857254, "balance_loss_mlp": 1.02082872, "epoch": 0.8049059099380749, "flos": 17120699464320.0, "grad_norm": 3.3375972188719345, "language_loss": 0.88108647, "learning_rate": 3.8600474615593903e-07, "loss": 0.90277559, "num_input_tokens_seen": 144403395, "step": 6694, "time_per_iteration": 2.5895392894744873 }, { "auxiliary_loss_clip": 0.01036652, "auxiliary_loss_mlp": 0.010011, "balance_loss_clip": 1.01980567, "balance_loss_mlp": 1.00023007, "epoch": 0.805026152828714, "flos": 62212903240320.0, "grad_norm": 0.7823140115308251, "language_loss": 0.59626871, "learning_rate": 3.8554484323651605e-07, "loss": 0.61664623, "num_input_tokens_seen": 144465265, "step": 6695, "time_per_iteration": 3.2606728076934814 }, { "auxiliary_loss_clip": 0.01151412, "auxiliary_loss_mlp": 0.00711371, "balance_loss_clip": 1.04893136, "balance_loss_mlp": 1.00065541, "epoch": 0.8051463957193531, "flos": 21688608971520.0, "grad_norm": 1.520780711804715, "language_loss": 0.79201901, "learning_rate": 3.85085185228657e-07, "loss": 0.81064677, "num_input_tokens_seen": 144484235, "step": 6696, "time_per_iteration": 2.6235311031341553 }, { "auxiliary_loss_clip": 0.0113294, "auxiliary_loss_mlp": 0.01025458, "balance_loss_clip": 1.04643059, "balance_loss_mlp": 1.01836467, "epoch": 0.8052666386099921, "flos": 32051458535040.0, "grad_norm": 1.8337922008068368, "language_loss": 0.73045683, "learning_rate": 3.8462577220209114e-07, "loss": 0.75204086, "num_input_tokens_seen": 144504610, "step": 6697, "time_per_iteration": 2.762892484664917 }, { "auxiliary_loss_clip": 0.01071349, "auxiliary_loss_mlp": 0.01001307, "balance_loss_clip": 1.01957655, "balance_loss_mlp": 1.00045419, "epoch": 0.8053868815006313, "flos": 67157875768320.0, "grad_norm": 0.716996876307391, "language_loss": 0.58931303, "learning_rate": 3.8416660422651127e-07, "loss": 0.61003953, "num_input_tokens_seen": 144574260, "step": 6698, "time_per_iteration": 3.272383213043213 }, { "auxiliary_loss_clip": 0.01125014, "auxiliary_loss_mlp": 0.01024462, "balance_loss_clip": 1.04371035, "balance_loss_mlp": 1.01723838, "epoch": 0.8055071243912704, "flos": 23837000307840.0, "grad_norm": 1.901609345146889, "language_loss": 0.68266249, "learning_rate": 3.837076813715723e-07, "loss": 0.70415723, "num_input_tokens_seen": 144594145, "step": 6699, "time_per_iteration": 2.7011544704437256 }, { "auxiliary_loss_clip": 0.0112048, "auxiliary_loss_mlp": 0.01019646, "balance_loss_clip": 1.04425275, "balance_loss_mlp": 1.01244318, "epoch": 0.8056273672819094, "flos": 21324510760320.0, "grad_norm": 2.4877497129912065, "language_loss": 0.75043142, "learning_rate": 3.832490037068941e-07, "loss": 0.7718327, "num_input_tokens_seen": 144612935, "step": 6700, "time_per_iteration": 2.691577196121216 }, { "auxiliary_loss_clip": 0.01084451, "auxiliary_loss_mlp": 0.0102567, "balance_loss_clip": 1.04096293, "balance_loss_mlp": 1.01854146, "epoch": 0.8057476101725486, "flos": 25768383626880.0, "grad_norm": 2.0008460591438504, "language_loss": 0.76383281, "learning_rate": 3.827905713020554e-07, "loss": 0.78493404, "num_input_tokens_seen": 144630580, "step": 6701, "time_per_iteration": 2.7736992835998535 }, { "auxiliary_loss_clip": 0.01124215, "auxiliary_loss_mlp": 0.01028202, "balance_loss_clip": 1.04350364, "balance_loss_mlp": 1.0200448, "epoch": 0.8058678530631876, "flos": 24535283679360.0, "grad_norm": 2.646471725848557, "language_loss": 0.69466639, "learning_rate": 3.823323842266017e-07, "loss": 0.71619058, "num_input_tokens_seen": 144649975, "step": 6702, "time_per_iteration": 2.761091470718384 }, { "auxiliary_loss_clip": 0.01153367, "auxiliary_loss_mlp": 0.01024919, "balance_loss_clip": 1.04538941, "balance_loss_mlp": 1.01768279, "epoch": 0.8059880959538267, "flos": 24753728240640.0, "grad_norm": 2.743355802786915, "language_loss": 0.72937191, "learning_rate": 3.818744425500393e-07, "loss": 0.75115478, "num_input_tokens_seen": 144667990, "step": 6703, "time_per_iteration": 2.590034246444702 }, { "auxiliary_loss_clip": 0.01115119, "auxiliary_loss_mlp": 0.01032248, "balance_loss_clip": 1.04227221, "balance_loss_mlp": 1.02435613, "epoch": 0.8061083388444659, "flos": 22196349671040.0, "grad_norm": 2.4079826155639874, "language_loss": 0.80538511, "learning_rate": 3.8141674634183675e-07, "loss": 0.82685876, "num_input_tokens_seen": 144687020, "step": 6704, "time_per_iteration": 2.7127912044525146 }, { "auxiliary_loss_clip": 0.01105889, "auxiliary_loss_mlp": 0.01022785, "balance_loss_clip": 1.04648483, "balance_loss_mlp": 1.01630855, "epoch": 0.8062285817351049, "flos": 30044195735040.0, "grad_norm": 1.8631484920376298, "language_loss": 0.66416276, "learning_rate": 3.809592956714278e-07, "loss": 0.68544948, "num_input_tokens_seen": 144710255, "step": 6705, "time_per_iteration": 2.7628817558288574 }, { "auxiliary_loss_clip": 0.01161808, "auxiliary_loss_mlp": 0.01026839, "balance_loss_clip": 1.05229783, "balance_loss_mlp": 1.01988304, "epoch": 0.806348824625744, "flos": 22782591544320.0, "grad_norm": 2.5462110397183566, "language_loss": 0.74788249, "learning_rate": 3.805020906082057e-07, "loss": 0.76976895, "num_input_tokens_seen": 144728830, "step": 6706, "time_per_iteration": 2.6478335857391357 }, { "auxiliary_loss_clip": 0.01141254, "auxiliary_loss_mlp": 0.01026054, "balance_loss_clip": 1.04796696, "balance_loss_mlp": 1.01856148, "epoch": 0.8064690675163831, "flos": 23404600385280.0, "grad_norm": 3.264325149824604, "language_loss": 0.80957502, "learning_rate": 3.8004513122152917e-07, "loss": 0.8312481, "num_input_tokens_seen": 144747140, "step": 6707, "time_per_iteration": 2.612626791000366 }, { "auxiliary_loss_clip": 0.0112815, "auxiliary_loss_mlp": 0.01025889, "balance_loss_clip": 1.04628968, "balance_loss_mlp": 1.01934695, "epoch": 0.8065893104070222, "flos": 24060903736320.0, "grad_norm": 1.7675912930992193, "language_loss": 0.67549527, "learning_rate": 3.79588417580718e-07, "loss": 0.69703567, "num_input_tokens_seen": 144765250, "step": 6708, "time_per_iteration": 2.6945724487304688 }, { "auxiliary_loss_clip": 0.01157141, "auxiliary_loss_mlp": 0.01030509, "balance_loss_clip": 1.05130172, "balance_loss_mlp": 1.02311254, "epoch": 0.8067095532976613, "flos": 22305410340480.0, "grad_norm": 2.0963198189388734, "language_loss": 0.76673383, "learning_rate": 3.791319497550558e-07, "loss": 0.78861034, "num_input_tokens_seen": 144783080, "step": 6709, "time_per_iteration": 2.587266445159912 }, { "auxiliary_loss_clip": 0.01128931, "auxiliary_loss_mlp": 0.00711109, "balance_loss_clip": 1.04831421, "balance_loss_mlp": 1.00064683, "epoch": 0.8068297961883004, "flos": 17129498296320.0, "grad_norm": 2.1630002769762497, "language_loss": 0.70701933, "learning_rate": 3.78675727813788e-07, "loss": 0.72541976, "num_input_tokens_seen": 144800645, "step": 6710, "time_per_iteration": 3.6712112426757812 }, { "auxiliary_loss_clip": 0.01139163, "auxiliary_loss_mlp": 0.01028407, "balance_loss_clip": 1.04983449, "balance_loss_mlp": 1.02095604, "epoch": 0.8069500390789395, "flos": 22018843635840.0, "grad_norm": 1.80402970735391, "language_loss": 0.73359829, "learning_rate": 3.782197518261225e-07, "loss": 0.755274, "num_input_tokens_seen": 144820085, "step": 6711, "time_per_iteration": 3.554657220840454 }, { "auxiliary_loss_clip": 0.0114406, "auxiliary_loss_mlp": 0.01023171, "balance_loss_clip": 1.04788399, "balance_loss_mlp": 1.01639962, "epoch": 0.8070702819695785, "flos": 19244241567360.0, "grad_norm": 2.310800424114206, "language_loss": 0.95839548, "learning_rate": 3.777640218612319e-07, "loss": 0.98006773, "num_input_tokens_seen": 144838070, "step": 6712, "time_per_iteration": 4.510067939758301 }, { "auxiliary_loss_clip": 0.01150935, "auxiliary_loss_mlp": 0.01025602, "balance_loss_clip": 1.04950643, "balance_loss_mlp": 1.01834536, "epoch": 0.8071905248602176, "flos": 21544320038400.0, "grad_norm": 2.515715431816915, "language_loss": 0.71963435, "learning_rate": 3.773085379882488e-07, "loss": 0.74139977, "num_input_tokens_seen": 144857125, "step": 6713, "time_per_iteration": 2.5822536945343018 }, { "auxiliary_loss_clip": 0.0115357, "auxiliary_loss_mlp": 0.00711904, "balance_loss_clip": 1.04639888, "balance_loss_mlp": 1.00073755, "epoch": 0.8073107677508568, "flos": 37268309105280.0, "grad_norm": 1.744058045562616, "language_loss": 0.75681925, "learning_rate": 3.768533002762715e-07, "loss": 0.77547395, "num_input_tokens_seen": 144880660, "step": 6714, "time_per_iteration": 2.780813217163086 }, { "auxiliary_loss_clip": 0.01136679, "auxiliary_loss_mlp": 0.01025058, "balance_loss_clip": 1.04445052, "balance_loss_mlp": 1.01794732, "epoch": 0.8074310106414958, "flos": 28366269759360.0, "grad_norm": 1.767479723601473, "language_loss": 0.77108729, "learning_rate": 3.763983087943572e-07, "loss": 0.79270464, "num_input_tokens_seen": 144900050, "step": 6715, "time_per_iteration": 2.6575310230255127 }, { "auxiliary_loss_clip": 0.01144644, "auxiliary_loss_mlp": 0.00711341, "balance_loss_clip": 1.04572427, "balance_loss_mlp": 1.00068891, "epoch": 0.8075512535321349, "flos": 24281646768000.0, "grad_norm": 1.6032754060176975, "language_loss": 0.81211299, "learning_rate": 3.759435636115282e-07, "loss": 0.83067286, "num_input_tokens_seen": 144920835, "step": 6716, "time_per_iteration": 2.669968605041504 }, { "auxiliary_loss_clip": 0.01086321, "auxiliary_loss_mlp": 0.00711482, "balance_loss_clip": 1.0444901, "balance_loss_mlp": 1.00069332, "epoch": 0.807671496422774, "flos": 26030855283840.0, "grad_norm": 1.8917703749230876, "language_loss": 0.73081493, "learning_rate": 3.7548906479676967e-07, "loss": 0.74879301, "num_input_tokens_seen": 144940430, "step": 6717, "time_per_iteration": 2.7466866970062256 }, { "auxiliary_loss_clip": 0.01156216, "auxiliary_loss_mlp": 0.01028405, "balance_loss_clip": 1.04679585, "balance_loss_mlp": 1.02086234, "epoch": 0.8077917393134131, "flos": 23730740899200.0, "grad_norm": 1.9982663887823555, "language_loss": 0.71690357, "learning_rate": 3.7503481241902855e-07, "loss": 0.7387498, "num_input_tokens_seen": 144960405, "step": 6718, "time_per_iteration": 2.651386260986328 }, { "auxiliary_loss_clip": 0.01136181, "auxiliary_loss_mlp": 0.00711121, "balance_loss_clip": 1.04564786, "balance_loss_mlp": 1.00064015, "epoch": 0.8079119822040521, "flos": 18402028398720.0, "grad_norm": 1.9654439350208823, "language_loss": 0.80387032, "learning_rate": 3.745808065472145e-07, "loss": 0.82234335, "num_input_tokens_seen": 144977700, "step": 6719, "time_per_iteration": 2.638274908065796 }, { "auxiliary_loss_clip": 0.01150916, "auxiliary_loss_mlp": 0.01027342, "balance_loss_clip": 1.05173576, "balance_loss_mlp": 1.02035892, "epoch": 0.8080322250946913, "flos": 23621787970560.0, "grad_norm": 2.4898375704041737, "language_loss": 0.76247907, "learning_rate": 3.741270472501994e-07, "loss": 0.78426164, "num_input_tokens_seen": 144998340, "step": 6720, "time_per_iteration": 2.6103360652923584 }, { "auxiliary_loss_clip": 0.01138175, "auxiliary_loss_mlp": 0.01028875, "balance_loss_clip": 1.04866779, "balance_loss_mlp": 1.02195466, "epoch": 0.8081524679853304, "flos": 22820692896000.0, "grad_norm": 1.8207277057361442, "language_loss": 0.72758269, "learning_rate": 3.736735345968183e-07, "loss": 0.74925321, "num_input_tokens_seen": 145017950, "step": 6721, "time_per_iteration": 2.6280996799468994 }, { "auxiliary_loss_clip": 0.01154906, "auxiliary_loss_mlp": 0.01022094, "balance_loss_clip": 1.04897988, "balance_loss_mlp": 1.01543951, "epoch": 0.8082727108759694, "flos": 17640004343040.0, "grad_norm": 2.162442257406868, "language_loss": 0.78854573, "learning_rate": 3.7322026865586986e-07, "loss": 0.81031573, "num_input_tokens_seen": 145036985, "step": 6722, "time_per_iteration": 2.6316909790039062 }, { "auxiliary_loss_clip": 0.01162432, "auxiliary_loss_mlp": 0.01025627, "balance_loss_clip": 1.05284858, "balance_loss_mlp": 1.01845086, "epoch": 0.8083929537666086, "flos": 25958172113280.0, "grad_norm": 1.8422126494215336, "language_loss": 0.73495138, "learning_rate": 3.7276724949611206e-07, "loss": 0.75683194, "num_input_tokens_seen": 145057095, "step": 6723, "time_per_iteration": 2.6101930141448975 }, { "auxiliary_loss_clip": 0.01140138, "auxiliary_loss_mlp": 0.01021421, "balance_loss_clip": 1.04814029, "balance_loss_mlp": 1.01368427, "epoch": 0.8085131966572476, "flos": 27089178629760.0, "grad_norm": 1.9303965509418197, "language_loss": 0.75443447, "learning_rate": 3.723144771862694e-07, "loss": 0.77605003, "num_input_tokens_seen": 145077735, "step": 6724, "time_per_iteration": 2.772521495819092 }, { "auxiliary_loss_clip": 0.0112609, "auxiliary_loss_mlp": 0.01028561, "balance_loss_clip": 1.0448029, "balance_loss_mlp": 1.02111685, "epoch": 0.8086334395478867, "flos": 23988543788160.0, "grad_norm": 1.871672629627573, "language_loss": 0.77098012, "learning_rate": 3.718619517950263e-07, "loss": 0.7925266, "num_input_tokens_seen": 145098330, "step": 6725, "time_per_iteration": 2.7395389080047607 }, { "auxiliary_loss_clip": 0.01170747, "auxiliary_loss_mlp": 0.01028186, "balance_loss_clip": 1.05263209, "balance_loss_mlp": 1.02094102, "epoch": 0.8087536824385259, "flos": 20405879406720.0, "grad_norm": 2.3995283701827734, "language_loss": 0.76787883, "learning_rate": 3.714096733910301e-07, "loss": 0.78986812, "num_input_tokens_seen": 145115855, "step": 6726, "time_per_iteration": 2.5272860527038574 }, { "auxiliary_loss_clip": 0.01163086, "auxiliary_loss_mlp": 0.01029608, "balance_loss_clip": 1.05203617, "balance_loss_mlp": 1.02205014, "epoch": 0.8088739253291649, "flos": 25919639798400.0, "grad_norm": 2.5660177907313133, "language_loss": 0.70547819, "learning_rate": 3.709576420428926e-07, "loss": 0.72740513, "num_input_tokens_seen": 145136655, "step": 6727, "time_per_iteration": 2.665275812149048 }, { "auxiliary_loss_clip": 0.01138247, "auxiliary_loss_mlp": 0.01023766, "balance_loss_clip": 1.04439831, "balance_loss_mlp": 1.01685524, "epoch": 0.808994168219804, "flos": 28402072640640.0, "grad_norm": 4.281652211979833, "language_loss": 0.73352814, "learning_rate": 3.7050585781918463e-07, "loss": 0.75514829, "num_input_tokens_seen": 145156955, "step": 6728, "time_per_iteration": 2.6762051582336426 }, { "auxiliary_loss_clip": 0.01157306, "auxiliary_loss_mlp": 0.01027063, "balance_loss_clip": 1.04823565, "balance_loss_mlp": 1.01965427, "epoch": 0.8091144111104431, "flos": 17421056991360.0, "grad_norm": 2.178562794572237, "language_loss": 0.68841046, "learning_rate": 3.700543207884428e-07, "loss": 0.71025419, "num_input_tokens_seen": 145173865, "step": 6729, "time_per_iteration": 2.592437505722046 }, { "auxiliary_loss_clip": 0.01150466, "auxiliary_loss_mlp": 0.0102725, "balance_loss_clip": 1.04806733, "balance_loss_mlp": 1.02007937, "epoch": 0.8092346540010822, "flos": 32153803361280.0, "grad_norm": 1.9051411671481369, "language_loss": 0.71100467, "learning_rate": 3.6960303101916466e-07, "loss": 0.73278189, "num_input_tokens_seen": 145193780, "step": 6730, "time_per_iteration": 2.660703182220459 }, { "auxiliary_loss_clip": 0.01070649, "auxiliary_loss_mlp": 0.0070128, "balance_loss_clip": 1.01913548, "balance_loss_mlp": 1.00006509, "epoch": 0.8093548968917212, "flos": 58035093390720.0, "grad_norm": 0.7368648167712142, "language_loss": 0.55503714, "learning_rate": 3.6915198857981047e-07, "loss": 0.57275641, "num_input_tokens_seen": 145258980, "step": 6731, "time_per_iteration": 3.219383716583252 }, { "auxiliary_loss_clip": 0.01119887, "auxiliary_loss_mlp": 0.01026556, "balance_loss_clip": 1.04504573, "balance_loss_mlp": 1.0189923, "epoch": 0.8094751397823604, "flos": 27381599251200.0, "grad_norm": 2.2541870643823487, "language_loss": 0.68440711, "learning_rate": 3.687011935388027e-07, "loss": 0.70587152, "num_input_tokens_seen": 145281875, "step": 6732, "time_per_iteration": 2.6684041023254395 }, { "auxiliary_loss_clip": 0.01153582, "auxiliary_loss_mlp": 0.0102814, "balance_loss_clip": 1.04974556, "balance_loss_mlp": 1.02084708, "epoch": 0.8095953826729995, "flos": 24061083304320.0, "grad_norm": 2.1105666753667562, "language_loss": 0.72838211, "learning_rate": 3.6825064596452646e-07, "loss": 0.75019944, "num_input_tokens_seen": 145302220, "step": 6733, "time_per_iteration": 2.671180486679077 }, { "auxiliary_loss_clip": 0.01153509, "auxiliary_loss_mlp": 0.01025213, "balance_loss_clip": 1.04807663, "balance_loss_mlp": 1.01815581, "epoch": 0.8097156255636385, "flos": 23951412103680.0, "grad_norm": 2.0160361702498317, "language_loss": 0.70622969, "learning_rate": 3.678003459253305e-07, "loss": 0.72801691, "num_input_tokens_seen": 145323070, "step": 6734, "time_per_iteration": 2.638162851333618 }, { "auxiliary_loss_clip": 0.01119279, "auxiliary_loss_mlp": 0.01025957, "balance_loss_clip": 1.04507256, "balance_loss_mlp": 1.01873577, "epoch": 0.8098358684542777, "flos": 21799142098560.0, "grad_norm": 1.9014048414200164, "language_loss": 0.74051094, "learning_rate": 3.673502934895236e-07, "loss": 0.76196337, "num_input_tokens_seen": 145342575, "step": 6735, "time_per_iteration": 2.732029914855957 }, { "auxiliary_loss_clip": 0.01070439, "auxiliary_loss_mlp": 0.01000542, "balance_loss_clip": 1.0190742, "balance_loss_mlp": 0.99961865, "epoch": 0.8099561113449167, "flos": 68809515966720.0, "grad_norm": 0.6874432615440826, "language_loss": 0.57918847, "learning_rate": 3.669004887253802e-07, "loss": 0.59989822, "num_input_tokens_seen": 145408865, "step": 6736, "time_per_iteration": 4.198696136474609 }, { "auxiliary_loss_clip": 0.01143862, "auxiliary_loss_mlp": 0.01026491, "balance_loss_clip": 1.04921389, "balance_loss_mlp": 1.01956248, "epoch": 0.8100763542355558, "flos": 23586056916480.0, "grad_norm": 1.8231275352159324, "language_loss": 0.78833961, "learning_rate": 3.664509317011335e-07, "loss": 0.81004316, "num_input_tokens_seen": 145429200, "step": 6737, "time_per_iteration": 3.6175944805145264 }, { "auxiliary_loss_clip": 0.01153755, "auxiliary_loss_mlp": 0.01030772, "balance_loss_clip": 1.05156267, "balance_loss_mlp": 1.02311873, "epoch": 0.810196597126195, "flos": 31650408207360.0, "grad_norm": 2.89558725733652, "language_loss": 0.74146354, "learning_rate": 3.6600162248498134e-07, "loss": 0.76330876, "num_input_tokens_seen": 145452830, "step": 6738, "time_per_iteration": 4.4754414558410645 }, { "auxiliary_loss_clip": 0.0106961, "auxiliary_loss_mlp": 0.01025578, "balance_loss_clip": 1.0374136, "balance_loss_mlp": 1.01897705, "epoch": 0.810316840016834, "flos": 24900459298560.0, "grad_norm": 1.9897027967299457, "language_loss": 0.76081765, "learning_rate": 3.6555256114508426e-07, "loss": 0.78176951, "num_input_tokens_seen": 145472625, "step": 6739, "time_per_iteration": 2.8165273666381836 }, { "auxiliary_loss_clip": 0.01136437, "auxiliary_loss_mlp": 0.01027489, "balance_loss_clip": 1.043468, "balance_loss_mlp": 1.02050281, "epoch": 0.8104370829074731, "flos": 27965003950080.0, "grad_norm": 2.0561298532071692, "language_loss": 0.73311472, "learning_rate": 3.651037477495642e-07, "loss": 0.75475395, "num_input_tokens_seen": 145494075, "step": 6740, "time_per_iteration": 2.713444232940674 }, { "auxiliary_loss_clip": 0.01169159, "auxiliary_loss_mlp": 0.01026446, "balance_loss_clip": 1.04918623, "balance_loss_mlp": 1.01888835, "epoch": 0.8105573257981122, "flos": 24640752988800.0, "grad_norm": 2.6104081476520946, "language_loss": 0.68010557, "learning_rate": 3.6465518236650584e-07, "loss": 0.70206159, "num_input_tokens_seen": 145514220, "step": 6741, "time_per_iteration": 2.6050708293914795 }, { "auxiliary_loss_clip": 0.01120701, "auxiliary_loss_mlp": 0.0102576, "balance_loss_clip": 1.04426503, "balance_loss_mlp": 1.01891148, "epoch": 0.8106775686887513, "flos": 26358935132160.0, "grad_norm": 1.7616666072323033, "language_loss": 0.7859242, "learning_rate": 3.642068650639558e-07, "loss": 0.80738884, "num_input_tokens_seen": 145533965, "step": 6742, "time_per_iteration": 2.7865805625915527 }, { "auxiliary_loss_clip": 0.01130217, "auxiliary_loss_mlp": 0.01028474, "balance_loss_clip": 1.04234338, "balance_loss_mlp": 1.021155, "epoch": 0.8107978115793903, "flos": 27271892136960.0, "grad_norm": 2.292968003556532, "language_loss": 0.64589572, "learning_rate": 3.6375879590992334e-07, "loss": 0.66748255, "num_input_tokens_seen": 145554310, "step": 6743, "time_per_iteration": 2.7193431854248047 }, { "auxiliary_loss_clip": 0.01133323, "auxiliary_loss_mlp": 0.01025053, "balance_loss_clip": 1.04571831, "balance_loss_mlp": 1.01804984, "epoch": 0.8109180544700295, "flos": 24934322845440.0, "grad_norm": 1.8510053451681254, "language_loss": 0.81201577, "learning_rate": 3.6331097497238173e-07, "loss": 0.83359957, "num_input_tokens_seen": 145573755, "step": 6744, "time_per_iteration": 2.6966803073883057 }, { "auxiliary_loss_clip": 0.01123585, "auxiliary_loss_mlp": 0.01027802, "balance_loss_clip": 1.0463264, "balance_loss_mlp": 1.02052999, "epoch": 0.8110382973606686, "flos": 21105383840640.0, "grad_norm": 2.666806253496364, "language_loss": 0.80640399, "learning_rate": 3.628634023192627e-07, "loss": 0.82791793, "num_input_tokens_seen": 145594000, "step": 6745, "time_per_iteration": 2.6784961223602295 }, { "auxiliary_loss_clip": 0.01154994, "auxiliary_loss_mlp": 0.01021038, "balance_loss_clip": 1.04877222, "balance_loss_mlp": 1.01354861, "epoch": 0.8111585402513076, "flos": 15414081500160.0, "grad_norm": 2.502860333942604, "language_loss": 0.75455713, "learning_rate": 3.624160780184644e-07, "loss": 0.77631742, "num_input_tokens_seen": 145611215, "step": 6746, "time_per_iteration": 2.6170132160186768 }, { "auxiliary_loss_clip": 0.0113142, "auxiliary_loss_mlp": 0.0102175, "balance_loss_clip": 1.04639482, "balance_loss_mlp": 1.01474047, "epoch": 0.8112787831419467, "flos": 24095736950400.0, "grad_norm": 2.0192149571901647, "language_loss": 0.7431531, "learning_rate": 3.6196900213784496e-07, "loss": 0.7646848, "num_input_tokens_seen": 145630530, "step": 6747, "time_per_iteration": 2.752206563949585 }, { "auxiliary_loss_clip": 0.01155866, "auxiliary_loss_mlp": 0.01023123, "balance_loss_clip": 1.04855728, "balance_loss_mlp": 1.01618469, "epoch": 0.8113990260325858, "flos": 20483374999680.0, "grad_norm": 1.8798243125945602, "language_loss": 0.86633736, "learning_rate": 3.6152217474522527e-07, "loss": 0.88812727, "num_input_tokens_seen": 145647345, "step": 6748, "time_per_iteration": 2.7684528827667236 }, { "auxiliary_loss_clip": 0.0115455, "auxiliary_loss_mlp": 0.0102522, "balance_loss_clip": 1.0510999, "balance_loss_mlp": 1.01842511, "epoch": 0.8115192689232249, "flos": 24901141656960.0, "grad_norm": 2.463900485146771, "language_loss": 0.72992551, "learning_rate": 3.6107559590838975e-07, "loss": 0.75172317, "num_input_tokens_seen": 145666330, "step": 6749, "time_per_iteration": 2.7143304347991943 }, { "auxiliary_loss_clip": 0.01087632, "auxiliary_loss_mlp": 0.0103049, "balance_loss_clip": 1.0426327, "balance_loss_mlp": 1.02297926, "epoch": 0.811639511813864, "flos": 24057204635520.0, "grad_norm": 2.5602669344587574, "language_loss": 0.6658957, "learning_rate": 3.606292656950822e-07, "loss": 0.68707687, "num_input_tokens_seen": 145684740, "step": 6750, "time_per_iteration": 2.796515464782715 }, { "auxiliary_loss_clip": 0.01131438, "auxiliary_loss_mlp": 0.01030408, "balance_loss_clip": 1.04391766, "balance_loss_mlp": 1.02293956, "epoch": 0.8117597547045031, "flos": 23185150243200.0, "grad_norm": 2.071650827984737, "language_loss": 0.86826581, "learning_rate": 3.601831841730121e-07, "loss": 0.88988423, "num_input_tokens_seen": 145702660, "step": 6751, "time_per_iteration": 2.7054495811462402 }, { "auxiliary_loss_clip": 0.01151766, "auxiliary_loss_mlp": 0.01025424, "balance_loss_clip": 1.04834461, "balance_loss_mlp": 1.01816416, "epoch": 0.8118799975951422, "flos": 23040250778880.0, "grad_norm": 2.588700217946974, "language_loss": 0.72725213, "learning_rate": 3.5973735140984916e-07, "loss": 0.74902403, "num_input_tokens_seen": 145722830, "step": 6752, "time_per_iteration": 2.663121223449707 }, { "auxiliary_loss_clip": 0.01104299, "auxiliary_loss_mlp": 0.00711068, "balance_loss_clip": 1.04228187, "balance_loss_mlp": 1.00064516, "epoch": 0.8120002404857812, "flos": 24639962889600.0, "grad_norm": 2.776923623953182, "language_loss": 0.79307288, "learning_rate": 3.5929176747322607e-07, "loss": 0.81122661, "num_input_tokens_seen": 145741935, "step": 6753, "time_per_iteration": 2.8707332611083984 }, { "auxiliary_loss_clip": 0.01049409, "auxiliary_loss_mlp": 0.01001887, "balance_loss_clip": 1.01945698, "balance_loss_mlp": 1.00094497, "epoch": 0.8121204833764204, "flos": 57415742156160.0, "grad_norm": 0.8082342271156876, "language_loss": 0.56112027, "learning_rate": 3.588464324307372e-07, "loss": 0.58163321, "num_input_tokens_seen": 145805560, "step": 6754, "time_per_iteration": 3.3710293769836426 }, { "auxiliary_loss_clip": 0.01154308, "auxiliary_loss_mlp": 0.01021623, "balance_loss_clip": 1.04667354, "balance_loss_mlp": 1.01481903, "epoch": 0.8122407262670595, "flos": 19464589549440.0, "grad_norm": 1.903593056747876, "language_loss": 0.75556803, "learning_rate": 3.584013463499391e-07, "loss": 0.77732736, "num_input_tokens_seen": 145824180, "step": 6755, "time_per_iteration": 2.801312208175659 }, { "auxiliary_loss_clip": 0.01049063, "auxiliary_loss_mlp": 0.01000802, "balance_loss_clip": 1.02142024, "balance_loss_mlp": 0.99989015, "epoch": 0.8123609691576985, "flos": 56425325472000.0, "grad_norm": 0.7370847552333836, "language_loss": 0.64382195, "learning_rate": 3.579565092983521e-07, "loss": 0.66432059, "num_input_tokens_seen": 145885300, "step": 6756, "time_per_iteration": 3.2114150524139404 }, { "auxiliary_loss_clip": 0.01169072, "auxiliary_loss_mlp": 0.01025381, "balance_loss_clip": 1.05057096, "balance_loss_mlp": 1.01843131, "epoch": 0.8124812120483377, "flos": 20631973564800.0, "grad_norm": 2.3017107819594615, "language_loss": 0.84273887, "learning_rate": 3.575119213434565e-07, "loss": 0.86468339, "num_input_tokens_seen": 145903815, "step": 6757, "time_per_iteration": 2.6802632808685303 }, { "auxiliary_loss_clip": 0.01148522, "auxiliary_loss_mlp": 0.01031873, "balance_loss_clip": 1.0463438, "balance_loss_mlp": 1.02482152, "epoch": 0.8126014549389767, "flos": 22492397566080.0, "grad_norm": 2.049759614280312, "language_loss": 0.81843007, "learning_rate": 3.5706758255269765e-07, "loss": 0.84023398, "num_input_tokens_seen": 145922270, "step": 6758, "time_per_iteration": 2.738953113555908 }, { "auxiliary_loss_clip": 0.0114248, "auxiliary_loss_mlp": 0.01026059, "balance_loss_clip": 1.04869914, "balance_loss_mlp": 1.01882601, "epoch": 0.8127216978296158, "flos": 23287961946240.0, "grad_norm": 1.690900667819407, "language_loss": 0.69765437, "learning_rate": 3.566234929934795e-07, "loss": 0.71933973, "num_input_tokens_seen": 145941470, "step": 6759, "time_per_iteration": 2.754732370376587 }, { "auxiliary_loss_clip": 0.01150497, "auxiliary_loss_mlp": 0.0102523, "balance_loss_clip": 1.04919493, "balance_loss_mlp": 1.0181489, "epoch": 0.812841940720255, "flos": 25154994049920.0, "grad_norm": 1.6929124682566885, "language_loss": 0.71811944, "learning_rate": 3.561796527331706e-07, "loss": 0.73987675, "num_input_tokens_seen": 145963145, "step": 6760, "time_per_iteration": 2.7235372066497803 }, { "auxiliary_loss_clip": 0.0112448, "auxiliary_loss_mlp": 0.01025591, "balance_loss_clip": 1.04548335, "balance_loss_mlp": 1.0181222, "epoch": 0.812962183610894, "flos": 26648446752000.0, "grad_norm": 2.103369885375184, "language_loss": 0.77673876, "learning_rate": 3.5573606183910163e-07, "loss": 0.79823953, "num_input_tokens_seen": 145983150, "step": 6761, "time_per_iteration": 2.713852643966675 }, { "auxiliary_loss_clip": 0.0115885, "auxiliary_loss_mlp": 0.01029551, "balance_loss_clip": 1.04724336, "balance_loss_mlp": 1.02203786, "epoch": 0.8130824265015331, "flos": 24966965329920.0, "grad_norm": 2.015841275933619, "language_loss": 0.78642553, "learning_rate": 3.5529272037856493e-07, "loss": 0.80830956, "num_input_tokens_seen": 146001365, "step": 6762, "time_per_iteration": 3.574723243713379 }, { "auxiliary_loss_clip": 0.01013397, "auxiliary_loss_mlp": 0.01001375, "balance_loss_clip": 1.01682544, "balance_loss_mlp": 1.00041568, "epoch": 0.8132026693921722, "flos": 67622918175360.0, "grad_norm": 0.7242880266533276, "language_loss": 0.53856742, "learning_rate": 3.548496284188149e-07, "loss": 0.55871511, "num_input_tokens_seen": 146061570, "step": 6763, "time_per_iteration": 4.377364158630371 }, { "auxiliary_loss_clip": 0.01097086, "auxiliary_loss_mlp": 0.0102632, "balance_loss_clip": 1.04396796, "balance_loss_mlp": 1.01946235, "epoch": 0.8133229122828113, "flos": 19495149045120.0, "grad_norm": 1.8677213433262962, "language_loss": 0.79342067, "learning_rate": 3.544067860270681e-07, "loss": 0.81465471, "num_input_tokens_seen": 146079145, "step": 6764, "time_per_iteration": 4.913619756698608 }, { "auxiliary_loss_clip": 0.01124714, "auxiliary_loss_mlp": 0.01027884, "balance_loss_clip": 1.04446268, "balance_loss_mlp": 1.0206182, "epoch": 0.8134431551734503, "flos": 20668135582080.0, "grad_norm": 2.2667747318093476, "language_loss": 0.71380174, "learning_rate": 3.539641932705029e-07, "loss": 0.73532766, "num_input_tokens_seen": 146097625, "step": 6765, "time_per_iteration": 2.6945102214813232 }, { "auxiliary_loss_clip": 0.01172336, "auxiliary_loss_mlp": 0.01025527, "balance_loss_clip": 1.05044258, "balance_loss_mlp": 1.01815391, "epoch": 0.8135633980640895, "flos": 21507332008320.0, "grad_norm": 2.233255637920953, "language_loss": 0.77362472, "learning_rate": 3.53521850216262e-07, "loss": 0.79560333, "num_input_tokens_seen": 146117195, "step": 6766, "time_per_iteration": 2.611021041870117 }, { "auxiliary_loss_clip": 0.01169029, "auxiliary_loss_mlp": 0.01027386, "balance_loss_clip": 1.04943228, "balance_loss_mlp": 1.01987576, "epoch": 0.8136836409547286, "flos": 20554442058240.0, "grad_norm": 3.082558366883709, "language_loss": 0.76725423, "learning_rate": 3.530797569314461e-07, "loss": 0.78921843, "num_input_tokens_seen": 146136220, "step": 6767, "time_per_iteration": 2.5759663581848145 }, { "auxiliary_loss_clip": 0.01169701, "auxiliary_loss_mlp": 0.01030917, "balance_loss_clip": 1.05105996, "balance_loss_mlp": 1.02370524, "epoch": 0.8138038838453676, "flos": 20299045380480.0, "grad_norm": 2.294053471993704, "language_loss": 0.78089011, "learning_rate": 3.5263791348312235e-07, "loss": 0.80289632, "num_input_tokens_seen": 146155415, "step": 6768, "time_per_iteration": 2.642148017883301 }, { "auxiliary_loss_clip": 0.01134515, "auxiliary_loss_mlp": 0.01022763, "balance_loss_clip": 1.04399979, "balance_loss_mlp": 1.01586342, "epoch": 0.8139241267360068, "flos": 29789840551680.0, "grad_norm": 2.586695082232031, "language_loss": 0.70535403, "learning_rate": 3.521963199383171e-07, "loss": 0.7269268, "num_input_tokens_seen": 146178370, "step": 6769, "time_per_iteration": 2.706319808959961 }, { "auxiliary_loss_clip": 0.01106714, "auxiliary_loss_mlp": 0.0102864, "balance_loss_clip": 1.04455137, "balance_loss_mlp": 1.02156448, "epoch": 0.8140443696266458, "flos": 19713270384000.0, "grad_norm": 2.1169640580438953, "language_loss": 0.76897359, "learning_rate": 3.517549763640197e-07, "loss": 0.79032707, "num_input_tokens_seen": 146196010, "step": 6770, "time_per_iteration": 2.738288402557373 }, { "auxiliary_loss_clip": 0.01151606, "auxiliary_loss_mlp": 0.00710915, "balance_loss_clip": 1.04959059, "balance_loss_mlp": 1.00058675, "epoch": 0.8141646125172849, "flos": 27160568910720.0, "grad_norm": 2.0429341548937185, "language_loss": 0.71394229, "learning_rate": 3.513138828271829e-07, "loss": 0.73256749, "num_input_tokens_seen": 146215880, "step": 6771, "time_per_iteration": 2.648834466934204 }, { "auxiliary_loss_clip": 0.01116945, "auxiliary_loss_mlp": 0.01021998, "balance_loss_clip": 1.04430699, "balance_loss_mlp": 1.01516402, "epoch": 0.8142848554079241, "flos": 39673102700160.0, "grad_norm": 2.5638798255640483, "language_loss": 0.70267344, "learning_rate": 3.508730393947179e-07, "loss": 0.7240628, "num_input_tokens_seen": 146239135, "step": 6772, "time_per_iteration": 2.831684112548828 }, { "auxiliary_loss_clip": 0.0112017, "auxiliary_loss_mlp": 0.01028723, "balance_loss_clip": 1.0471611, "balance_loss_mlp": 1.02187777, "epoch": 0.8144050982985631, "flos": 22237288197120.0, "grad_norm": 2.48094012976917, "language_loss": 0.72317255, "learning_rate": 3.504324461335024e-07, "loss": 0.74466145, "num_input_tokens_seen": 146259245, "step": 6773, "time_per_iteration": 2.710350751876831 }, { "auxiliary_loss_clip": 0.01096662, "auxiliary_loss_mlp": 0.01032701, "balance_loss_clip": 1.0400486, "balance_loss_mlp": 1.02536988, "epoch": 0.8145253411892022, "flos": 23038239617280.0, "grad_norm": 1.6525568144667566, "language_loss": 0.88351125, "learning_rate": 3.499921031103732e-07, "loss": 0.90480494, "num_input_tokens_seen": 146280015, "step": 6774, "time_per_iteration": 2.708282709121704 }, { "auxiliary_loss_clip": 0.01131027, "auxiliary_loss_mlp": 0.01022945, "balance_loss_clip": 1.04335332, "balance_loss_mlp": 1.01551843, "epoch": 0.8146455840798413, "flos": 24827668387200.0, "grad_norm": 1.8135182132193741, "language_loss": 0.78914922, "learning_rate": 3.4955201039212987e-07, "loss": 0.81068897, "num_input_tokens_seen": 146300935, "step": 6775, "time_per_iteration": 2.744337320327759 }, { "auxiliary_loss_clip": 0.01159132, "auxiliary_loss_mlp": 0.01026027, "balance_loss_clip": 1.04911637, "balance_loss_mlp": 1.01899958, "epoch": 0.8147658269704804, "flos": 19974520978560.0, "grad_norm": 2.3209241542055157, "language_loss": 0.65398377, "learning_rate": 3.4911216804553465e-07, "loss": 0.67583531, "num_input_tokens_seen": 146319835, "step": 6776, "time_per_iteration": 2.6111948490142822 }, { "auxiliary_loss_clip": 0.0113697, "auxiliary_loss_mlp": 0.01033696, "balance_loss_clip": 1.04670739, "balance_loss_mlp": 1.02582788, "epoch": 0.8148860698611194, "flos": 21178031097600.0, "grad_norm": 2.437863309855023, "language_loss": 0.7066533, "learning_rate": 3.4867257613731017e-07, "loss": 0.72836, "num_input_tokens_seen": 146339030, "step": 6777, "time_per_iteration": 2.6508891582489014 }, { "auxiliary_loss_clip": 0.01139461, "auxiliary_loss_mlp": 0.01022157, "balance_loss_clip": 1.04700518, "balance_loss_mlp": 1.01515317, "epoch": 0.8150063127517585, "flos": 19606903234560.0, "grad_norm": 2.6840757098145254, "language_loss": 0.85838491, "learning_rate": 3.4823323473414343e-07, "loss": 0.88000107, "num_input_tokens_seen": 146358550, "step": 6778, "time_per_iteration": 2.631624460220337 }, { "auxiliary_loss_clip": 0.01128196, "auxiliary_loss_mlp": 0.01026223, "balance_loss_clip": 1.04546726, "balance_loss_mlp": 1.01827478, "epoch": 0.8151265556423977, "flos": 22638374438400.0, "grad_norm": 2.1701131859117853, "language_loss": 0.76007926, "learning_rate": 3.477941439026812e-07, "loss": 0.78162348, "num_input_tokens_seen": 146376770, "step": 6779, "time_per_iteration": 2.712040901184082 }, { "auxiliary_loss_clip": 0.01136884, "auxiliary_loss_mlp": 0.01026517, "balance_loss_clip": 1.04729867, "balance_loss_mlp": 1.01951325, "epoch": 0.8152467985330367, "flos": 17968048277760.0, "grad_norm": 1.8254139411104542, "language_loss": 0.73020446, "learning_rate": 3.473553037095349e-07, "loss": 0.75183839, "num_input_tokens_seen": 146395795, "step": 6780, "time_per_iteration": 2.5718235969543457 }, { "auxiliary_loss_clip": 0.01128955, "auxiliary_loss_mlp": 0.01024837, "balance_loss_clip": 1.04398465, "balance_loss_mlp": 1.01862013, "epoch": 0.8153670414236758, "flos": 24969012405120.0, "grad_norm": 2.7610192061681462, "language_loss": 0.83353174, "learning_rate": 3.469167142212743e-07, "loss": 0.85506964, "num_input_tokens_seen": 146417640, "step": 6781, "time_per_iteration": 2.7035958766937256 }, { "auxiliary_loss_clip": 0.01155433, "auxiliary_loss_mlp": 0.01030638, "balance_loss_clip": 1.04940796, "balance_loss_mlp": 1.02311611, "epoch": 0.8154872843143149, "flos": 31066069754880.0, "grad_norm": 2.655127803012257, "language_loss": 0.62900877, "learning_rate": 3.4647837550443337e-07, "loss": 0.65086943, "num_input_tokens_seen": 146436205, "step": 6782, "time_per_iteration": 2.66526198387146 }, { "auxiliary_loss_clip": 0.0112408, "auxiliary_loss_mlp": 0.01029449, "balance_loss_clip": 1.04605412, "balance_loss_mlp": 1.02219796, "epoch": 0.815607527204954, "flos": 19391654983680.0, "grad_norm": 1.9627804746228834, "language_loss": 0.74642253, "learning_rate": 3.460402876255086e-07, "loss": 0.76795781, "num_input_tokens_seen": 146453595, "step": 6783, "time_per_iteration": 2.6776089668273926 }, { "auxiliary_loss_clip": 0.01154111, "auxiliary_loss_mlp": 0.01024104, "balance_loss_clip": 1.04693031, "balance_loss_mlp": 1.01722264, "epoch": 0.815727770095593, "flos": 26140418743680.0, "grad_norm": 2.486813946494876, "language_loss": 0.71911967, "learning_rate": 3.456024506509574e-07, "loss": 0.74090183, "num_input_tokens_seen": 146474515, "step": 6784, "time_per_iteration": 2.6577987670898438 }, { "auxiliary_loss_clip": 0.01156204, "auxiliary_loss_mlp": 0.00710818, "balance_loss_clip": 1.05265045, "balance_loss_mlp": 1.00058985, "epoch": 0.8158480129862322, "flos": 25337527989120.0, "grad_norm": 1.9256325341464295, "language_loss": 0.74344361, "learning_rate": 3.4516486464719873e-07, "loss": 0.76211381, "num_input_tokens_seen": 146493905, "step": 6785, "time_per_iteration": 2.710805654525757 }, { "auxiliary_loss_clip": 0.0109708, "auxiliary_loss_mlp": 0.01024388, "balance_loss_clip": 1.04054236, "balance_loss_mlp": 1.01714599, "epoch": 0.8159682558768713, "flos": 34423645559040.0, "grad_norm": 4.650538602130934, "language_loss": 0.61810243, "learning_rate": 3.4472752968061445e-07, "loss": 0.63931704, "num_input_tokens_seen": 146518335, "step": 6786, "time_per_iteration": 2.922780990600586 }, { "auxiliary_loss_clip": 0.01151981, "auxiliary_loss_mlp": 0.01024871, "balance_loss_clip": 1.04608512, "balance_loss_mlp": 1.01797211, "epoch": 0.8160884987675103, "flos": 18653223185280.0, "grad_norm": 2.092011584901851, "language_loss": 0.74344319, "learning_rate": 3.442904458175475e-07, "loss": 0.76521176, "num_input_tokens_seen": 146535655, "step": 6787, "time_per_iteration": 3.558345317840576 }, { "auxiliary_loss_clip": 0.01150562, "auxiliary_loss_mlp": 0.0102552, "balance_loss_clip": 1.04599011, "balance_loss_mlp": 1.01834965, "epoch": 0.8162087416581495, "flos": 31430527102080.0, "grad_norm": 1.6674488857589758, "language_loss": 0.76214683, "learning_rate": 3.438536131243044e-07, "loss": 0.78390765, "num_input_tokens_seen": 146556815, "step": 6788, "time_per_iteration": 3.6291344165802 }, { "auxiliary_loss_clip": 0.01141328, "auxiliary_loss_mlp": 0.01025258, "balance_loss_clip": 1.04710555, "balance_loss_mlp": 1.01731229, "epoch": 0.8163289845487885, "flos": 37593910915200.0, "grad_norm": 2.1334210552252744, "language_loss": 0.62127125, "learning_rate": 3.434170316671503e-07, "loss": 0.64293712, "num_input_tokens_seen": 146581845, "step": 6789, "time_per_iteration": 3.717918634414673 }, { "auxiliary_loss_clip": 0.01116174, "auxiliary_loss_mlp": 0.01023756, "balance_loss_clip": 1.04728019, "balance_loss_mlp": 1.01722622, "epoch": 0.8164492274394276, "flos": 13953989554560.0, "grad_norm": 3.7386217860359223, "language_loss": 0.89091235, "learning_rate": 3.4298070151231583e-07, "loss": 0.91231167, "num_input_tokens_seen": 146597245, "step": 6790, "time_per_iteration": 3.5736496448516846 }, { "auxiliary_loss_clip": 0.01143437, "auxiliary_loss_mlp": 0.01026931, "balance_loss_clip": 1.04700077, "balance_loss_mlp": 1.01962924, "epoch": 0.8165694703300668, "flos": 28986554747520.0, "grad_norm": 1.8486307934538255, "language_loss": 0.60235608, "learning_rate": 3.425446227259916e-07, "loss": 0.62405968, "num_input_tokens_seen": 146618210, "step": 6791, "time_per_iteration": 2.6823925971984863 }, { "auxiliary_loss_clip": 0.01139224, "auxiliary_loss_mlp": 0.01021669, "balance_loss_clip": 1.04586887, "balance_loss_mlp": 1.01510012, "epoch": 0.8166897132207058, "flos": 25118365155840.0, "grad_norm": 8.024684280842374, "language_loss": 0.82619298, "learning_rate": 3.421087953743296e-07, "loss": 0.84780192, "num_input_tokens_seen": 146637975, "step": 6792, "time_per_iteration": 2.66576886177063 }, { "auxiliary_loss_clip": 0.01152435, "auxiliary_loss_mlp": 0.01024846, "balance_loss_clip": 1.04657674, "balance_loss_mlp": 1.01784503, "epoch": 0.8168099561113449, "flos": 23148593176320.0, "grad_norm": 2.4881242963406973, "language_loss": 0.80509216, "learning_rate": 3.416732195234464e-07, "loss": 0.82686496, "num_input_tokens_seen": 146658030, "step": 6793, "time_per_iteration": 2.5926053524017334 }, { "auxiliary_loss_clip": 0.01153084, "auxiliary_loss_mlp": 0.01020985, "balance_loss_clip": 1.0456208, "balance_loss_mlp": 1.01406455, "epoch": 0.816930199001984, "flos": 18407666833920.0, "grad_norm": 1.5647762824535871, "language_loss": 0.79503095, "learning_rate": 3.4123789523941613e-07, "loss": 0.81677163, "num_input_tokens_seen": 146677855, "step": 6794, "time_per_iteration": 2.6139798164367676 }, { "auxiliary_loss_clip": 0.01145298, "auxiliary_loss_mlp": 0.01025707, "balance_loss_clip": 1.0445447, "balance_loss_mlp": 1.0184536, "epoch": 0.8170504418926231, "flos": 21251324799360.0, "grad_norm": 3.348465977407638, "language_loss": 0.63151103, "learning_rate": 3.4080282258827884e-07, "loss": 0.65322107, "num_input_tokens_seen": 146696230, "step": 6795, "time_per_iteration": 2.6259753704071045 }, { "auxiliary_loss_clip": 0.0115502, "auxiliary_loss_mlp": 0.0102766, "balance_loss_clip": 1.04790664, "balance_loss_mlp": 1.02099299, "epoch": 0.8171706847832622, "flos": 19099234362240.0, "grad_norm": 2.3154724576213366, "language_loss": 0.72441733, "learning_rate": 3.403680016360342e-07, "loss": 0.74624413, "num_input_tokens_seen": 146714835, "step": 6796, "time_per_iteration": 2.622298002243042 }, { "auxiliary_loss_clip": 0.01147619, "auxiliary_loss_mlp": 0.01023695, "balance_loss_clip": 1.04826856, "balance_loss_mlp": 1.01651859, "epoch": 0.8172909276739013, "flos": 21470128496640.0, "grad_norm": 1.4664685191438596, "language_loss": 0.67546451, "learning_rate": 3.3993343244864403e-07, "loss": 0.69717765, "num_input_tokens_seen": 146734425, "step": 6797, "time_per_iteration": 2.6039466857910156 }, { "auxiliary_loss_clip": 0.01150326, "auxiliary_loss_mlp": 0.01022034, "balance_loss_clip": 1.04729807, "balance_loss_mlp": 1.01532865, "epoch": 0.8174111705645404, "flos": 27599792417280.0, "grad_norm": 1.6148169349719925, "language_loss": 0.72888947, "learning_rate": 3.394991150920323e-07, "loss": 0.75061303, "num_input_tokens_seen": 146757545, "step": 6798, "time_per_iteration": 2.710663318634033 }, { "auxiliary_loss_clip": 0.01107563, "auxiliary_loss_mlp": 0.00712462, "balance_loss_clip": 1.04315412, "balance_loss_mlp": 1.00063872, "epoch": 0.8175314134551794, "flos": 14064594508800.0, "grad_norm": 2.6474787957706725, "language_loss": 0.74545687, "learning_rate": 3.3906504963208396e-07, "loss": 0.76365709, "num_input_tokens_seen": 146774240, "step": 6799, "time_per_iteration": 2.6389379501342773 }, { "auxiliary_loss_clip": 0.01101884, "auxiliary_loss_mlp": 0.01026727, "balance_loss_clip": 1.04549456, "balance_loss_mlp": 1.01917505, "epoch": 0.8176516563458186, "flos": 22708076780160.0, "grad_norm": 2.2064539428317818, "language_loss": 0.66563725, "learning_rate": 3.3863123613464774e-07, "loss": 0.68692338, "num_input_tokens_seen": 146793140, "step": 6800, "time_per_iteration": 2.7647531032562256 }, { "auxiliary_loss_clip": 0.01138648, "auxiliary_loss_mlp": 0.01024068, "balance_loss_clip": 1.04311991, "balance_loss_mlp": 1.01696014, "epoch": 0.8177718992364577, "flos": 21945406279680.0, "grad_norm": 3.1410803958141194, "language_loss": 0.74829376, "learning_rate": 3.381976746655317e-07, "loss": 0.76992089, "num_input_tokens_seen": 146812895, "step": 6801, "time_per_iteration": 2.6268420219421387 }, { "auxiliary_loss_clip": 0.01097373, "auxiliary_loss_mlp": 0.01020878, "balance_loss_clip": 1.04417491, "balance_loss_mlp": 1.0132966, "epoch": 0.8178921421270967, "flos": 22017443005440.0, "grad_norm": 2.1002654137445913, "language_loss": 0.67094928, "learning_rate": 3.3776436529050756e-07, "loss": 0.69213182, "num_input_tokens_seen": 146832445, "step": 6802, "time_per_iteration": 2.68646240234375 }, { "auxiliary_loss_clip": 0.01165859, "auxiliary_loss_mlp": 0.01023546, "balance_loss_clip": 1.04751289, "balance_loss_mlp": 1.01634002, "epoch": 0.8180123850177359, "flos": 33183111496320.0, "grad_norm": 1.6635189017181236, "language_loss": 0.72630644, "learning_rate": 3.373313080753073e-07, "loss": 0.74820048, "num_input_tokens_seen": 146856505, "step": 6803, "time_per_iteration": 2.660935401916504 }, { "auxiliary_loss_clip": 0.01148193, "auxiliary_loss_mlp": 0.01024084, "balance_loss_clip": 1.04601765, "balance_loss_mlp": 1.01695848, "epoch": 0.8181326279083749, "flos": 22091167670400.0, "grad_norm": 1.7323416114264931, "language_loss": 0.77751642, "learning_rate": 3.3689850308562527e-07, "loss": 0.79923916, "num_input_tokens_seen": 146876950, "step": 6804, "time_per_iteration": 2.583094358444214 }, { "auxiliary_loss_clip": 0.01100058, "auxiliary_loss_mlp": 0.01024359, "balance_loss_clip": 1.04599094, "balance_loss_mlp": 1.01737952, "epoch": 0.818252870799014, "flos": 15705747936000.0, "grad_norm": 2.0488476659987165, "language_loss": 0.77499825, "learning_rate": 3.364659503871183e-07, "loss": 0.79624248, "num_input_tokens_seen": 146894885, "step": 6805, "time_per_iteration": 2.657792091369629 }, { "auxiliary_loss_clip": 0.01117264, "auxiliary_loss_mlp": 0.01025118, "balance_loss_clip": 1.04257834, "balance_loss_mlp": 1.01860917, "epoch": 0.8183731136896532, "flos": 18770687637120.0, "grad_norm": 2.0263907314276457, "language_loss": 0.83895481, "learning_rate": 3.3603365004540417e-07, "loss": 0.86037868, "num_input_tokens_seen": 146913180, "step": 6806, "time_per_iteration": 2.7166547775268555 }, { "auxiliary_loss_clip": 0.01169575, "auxiliary_loss_mlp": 0.01026066, "balance_loss_clip": 1.05117381, "balance_loss_mlp": 1.01899993, "epoch": 0.8184933565802922, "flos": 26541792293760.0, "grad_norm": 1.8859625329763994, "language_loss": 0.77228272, "learning_rate": 3.356016021260624e-07, "loss": 0.79423916, "num_input_tokens_seen": 146933510, "step": 6807, "time_per_iteration": 2.6434786319732666 }, { "auxiliary_loss_clip": 0.01151874, "auxiliary_loss_mlp": 0.01030647, "balance_loss_clip": 1.04716027, "balance_loss_mlp": 1.02319002, "epoch": 0.8186135994709313, "flos": 17530117660800.0, "grad_norm": 4.634700263756824, "language_loss": 0.65764254, "learning_rate": 3.35169806694634e-07, "loss": 0.6794678, "num_input_tokens_seen": 146951760, "step": 6808, "time_per_iteration": 2.5592334270477295 }, { "auxiliary_loss_clip": 0.01031541, "auxiliary_loss_mlp": 0.01001406, "balance_loss_clip": 1.01915598, "balance_loss_mlp": 1.00047612, "epoch": 0.8187338423615703, "flos": 63480300675840.0, "grad_norm": 0.71733088158616, "language_loss": 0.60636675, "learning_rate": 3.3473826381662186e-07, "loss": 0.62669623, "num_input_tokens_seen": 147022900, "step": 6809, "time_per_iteration": 3.380267858505249 }, { "auxiliary_loss_clip": 0.01147696, "auxiliary_loss_mlp": 0.01029019, "balance_loss_clip": 1.04803586, "balance_loss_mlp": 1.02206337, "epoch": 0.8188540852522095, "flos": 17529974006400.0, "grad_norm": 2.020854680813391, "language_loss": 0.81686825, "learning_rate": 3.3430697355749216e-07, "loss": 0.83863539, "num_input_tokens_seen": 147040590, "step": 6810, "time_per_iteration": 2.564176559448242 }, { "auxiliary_loss_clip": 0.01101236, "auxiliary_loss_mlp": 0.01022535, "balance_loss_clip": 1.04288709, "balance_loss_mlp": 1.01498902, "epoch": 0.8189743281428485, "flos": 14392530702720.0, "grad_norm": 2.4642571810545237, "language_loss": 0.75266194, "learning_rate": 3.3387593598266907e-07, "loss": 0.77389973, "num_input_tokens_seen": 147057200, "step": 6811, "time_per_iteration": 2.65790057182312 }, { "auxiliary_loss_clip": 0.01113256, "auxiliary_loss_mlp": 0.01022939, "balance_loss_clip": 1.04166186, "balance_loss_mlp": 1.01586986, "epoch": 0.8190945710334876, "flos": 25080479285760.0, "grad_norm": 2.4760211630898383, "language_loss": 0.78516251, "learning_rate": 3.3344515115754225e-07, "loss": 0.80652446, "num_input_tokens_seen": 147076180, "step": 6812, "time_per_iteration": 2.674781322479248 }, { "auxiliary_loss_clip": 0.0112577, "auxiliary_loss_mlp": 0.01025117, "balance_loss_clip": 1.04396605, "balance_loss_mlp": 1.0176127, "epoch": 0.8192148139241268, "flos": 21507152440320.0, "grad_norm": 5.334685882188104, "language_loss": 0.79967558, "learning_rate": 3.33014619147461e-07, "loss": 0.8211844, "num_input_tokens_seen": 147094205, "step": 6813, "time_per_iteration": 3.6051812171936035 }, { "auxiliary_loss_clip": 0.01137745, "auxiliary_loss_mlp": 0.01027407, "balance_loss_clip": 1.04844189, "balance_loss_mlp": 1.0203979, "epoch": 0.8193350568147658, "flos": 23952166289280.0, "grad_norm": 2.924348517866467, "language_loss": 0.71768355, "learning_rate": 3.325843400177362e-07, "loss": 0.73933512, "num_input_tokens_seen": 147115545, "step": 6814, "time_per_iteration": 3.5900657176971436 }, { "auxiliary_loss_clip": 0.01158492, "auxiliary_loss_mlp": 0.00711671, "balance_loss_clip": 1.04950047, "balance_loss_mlp": 1.00064158, "epoch": 0.8194552997054049, "flos": 20559469962240.0, "grad_norm": 1.7710674060272213, "language_loss": 0.74159312, "learning_rate": 3.32154313833642e-07, "loss": 0.7602948, "num_input_tokens_seen": 147135700, "step": 6815, "time_per_iteration": 3.765821933746338 }, { "auxiliary_loss_clip": 0.01167886, "auxiliary_loss_mlp": 0.01024708, "balance_loss_clip": 1.04748416, "balance_loss_mlp": 1.01730764, "epoch": 0.819575542596044, "flos": 26031753123840.0, "grad_norm": 2.246156449573734, "language_loss": 0.59527409, "learning_rate": 3.3172454066041164e-07, "loss": 0.61719996, "num_input_tokens_seen": 147155205, "step": 6816, "time_per_iteration": 3.5643346309661865 }, { "auxiliary_loss_clip": 0.01090132, "auxiliary_loss_mlp": 0.00710587, "balance_loss_clip": 1.04325271, "balance_loss_mlp": 1.00058079, "epoch": 0.8196957854866831, "flos": 29096944220160.0, "grad_norm": 1.9155449615623237, "language_loss": 0.76563078, "learning_rate": 3.3129502056324234e-07, "loss": 0.783638, "num_input_tokens_seen": 147176570, "step": 6817, "time_per_iteration": 2.789825916290283 }, { "auxiliary_loss_clip": 0.00996677, "auxiliary_loss_mlp": 0.01001986, "balance_loss_clip": 1.01683283, "balance_loss_mlp": 1.00102603, "epoch": 0.8198160283773221, "flos": 69033631898880.0, "grad_norm": 0.8030902562244416, "language_loss": 0.59771568, "learning_rate": 3.3086575360729165e-07, "loss": 0.61770231, "num_input_tokens_seen": 147234105, "step": 6818, "time_per_iteration": 3.25518798828125 }, { "auxiliary_loss_clip": 0.01134749, "auxiliary_loss_mlp": 0.01030317, "balance_loss_clip": 1.04610515, "balance_loss_mlp": 1.02249706, "epoch": 0.8199362712679613, "flos": 16618058496000.0, "grad_norm": 1.7795246573852574, "language_loss": 0.71255255, "learning_rate": 3.3043673985767906e-07, "loss": 0.73420322, "num_input_tokens_seen": 147253170, "step": 6819, "time_per_iteration": 2.8741180896759033 }, { "auxiliary_loss_clip": 0.01108844, "auxiliary_loss_mlp": 0.01026917, "balance_loss_clip": 1.04131007, "balance_loss_mlp": 1.01980889, "epoch": 0.8200565141586004, "flos": 21757664868480.0, "grad_norm": 2.0212622910433278, "language_loss": 0.77758062, "learning_rate": 3.3000797937948564e-07, "loss": 0.79893827, "num_input_tokens_seen": 147271465, "step": 6820, "time_per_iteration": 2.8626465797424316 }, { "auxiliary_loss_clip": 0.01034426, "auxiliary_loss_mlp": 0.01001684, "balance_loss_clip": 1.01801157, "balance_loss_mlp": 1.00072432, "epoch": 0.8201767570492394, "flos": 69807112392960.0, "grad_norm": 0.9365045732750945, "language_loss": 0.64941168, "learning_rate": 3.295794722377534e-07, "loss": 0.66977274, "num_input_tokens_seen": 147335070, "step": 6821, "time_per_iteration": 3.275491714477539 }, { "auxiliary_loss_clip": 0.01166452, "auxiliary_loss_mlp": 0.01026086, "balance_loss_clip": 1.04907179, "balance_loss_mlp": 1.01872468, "epoch": 0.8202969999398786, "flos": 23111892455040.0, "grad_norm": 3.049216661139392, "language_loss": 0.7998687, "learning_rate": 3.291512184974876e-07, "loss": 0.82179409, "num_input_tokens_seen": 147355460, "step": 6822, "time_per_iteration": 2.6142308712005615 }, { "auxiliary_loss_clip": 0.01132727, "auxiliary_loss_mlp": 0.01027426, "balance_loss_clip": 1.04298806, "balance_loss_mlp": 1.02023721, "epoch": 0.8204172428305176, "flos": 28220616109440.0, "grad_norm": 5.707582170388541, "language_loss": 0.66341949, "learning_rate": 3.2872321822365346e-07, "loss": 0.68502098, "num_input_tokens_seen": 147375675, "step": 6823, "time_per_iteration": 2.7051899433135986 }, { "auxiliary_loss_clip": 0.01151919, "auxiliary_loss_mlp": 0.01025176, "balance_loss_clip": 1.04888356, "balance_loss_mlp": 1.0181669, "epoch": 0.8205374857211567, "flos": 20887011106560.0, "grad_norm": 2.187154272173444, "language_loss": 0.73587334, "learning_rate": 3.282954714811783e-07, "loss": 0.7576443, "num_input_tokens_seen": 147394580, "step": 6824, "time_per_iteration": 2.6581361293792725 }, { "auxiliary_loss_clip": 0.01123123, "auxiliary_loss_mlp": 0.01024774, "balance_loss_clip": 1.04198837, "balance_loss_mlp": 1.01726651, "epoch": 0.8206577286117959, "flos": 13152140294400.0, "grad_norm": 2.5766215414128433, "language_loss": 0.71058631, "learning_rate": 3.2786797833495093e-07, "loss": 0.73206532, "num_input_tokens_seen": 147409935, "step": 6825, "time_per_iteration": 2.6041595935821533 }, { "auxiliary_loss_clip": 0.01166498, "auxiliary_loss_mlp": 0.01023372, "balance_loss_clip": 1.04861188, "balance_loss_mlp": 1.0164938, "epoch": 0.8207779715024349, "flos": 25265634917760.0, "grad_norm": 1.9298138325574437, "language_loss": 0.73151863, "learning_rate": 3.274407388498213e-07, "loss": 0.75341725, "num_input_tokens_seen": 147428065, "step": 6826, "time_per_iteration": 2.6241984367370605 }, { "auxiliary_loss_clip": 0.01115328, "auxiliary_loss_mlp": 0.01027584, "balance_loss_clip": 1.04323912, "balance_loss_mlp": 1.02059257, "epoch": 0.820898214393074, "flos": 19610243199360.0, "grad_norm": 1.8188450254579018, "language_loss": 0.74527156, "learning_rate": 3.270137530906021e-07, "loss": 0.76670074, "num_input_tokens_seen": 147447300, "step": 6827, "time_per_iteration": 2.621889591217041 }, { "auxiliary_loss_clip": 0.01099629, "auxiliary_loss_mlp": 0.0102521, "balance_loss_clip": 1.04605627, "balance_loss_mlp": 1.01821268, "epoch": 0.8210184572837131, "flos": 15596615439360.0, "grad_norm": 2.124390330538584, "language_loss": 0.84044814, "learning_rate": 3.265870211220665e-07, "loss": 0.86169654, "num_input_tokens_seen": 147465135, "step": 6828, "time_per_iteration": 2.6856460571289062 }, { "auxiliary_loss_clip": 0.0111908, "auxiliary_loss_mlp": 0.0102588, "balance_loss_clip": 1.04676604, "balance_loss_mlp": 1.01791644, "epoch": 0.8211387001743522, "flos": 20813932886400.0, "grad_norm": 2.100723658161377, "language_loss": 0.82192206, "learning_rate": 3.2616054300894934e-07, "loss": 0.84337175, "num_input_tokens_seen": 147484585, "step": 6829, "time_per_iteration": 2.6714365482330322 }, { "auxiliary_loss_clip": 0.01128634, "auxiliary_loss_mlp": 0.01034377, "balance_loss_clip": 1.04607892, "balance_loss_mlp": 1.02732587, "epoch": 0.8212589430649913, "flos": 27704579368320.0, "grad_norm": 2.5327117164438078, "language_loss": 0.84780186, "learning_rate": 3.2573431881594693e-07, "loss": 0.86943197, "num_input_tokens_seen": 147504130, "step": 6830, "time_per_iteration": 2.7067062854766846 }, { "auxiliary_loss_clip": 0.01087876, "auxiliary_loss_mlp": 0.01024258, "balance_loss_clip": 1.04111147, "balance_loss_mlp": 1.01762128, "epoch": 0.8213791859556304, "flos": 22455625017600.0, "grad_norm": 3.351569033874439, "language_loss": 0.66000974, "learning_rate": 3.2530834860771663e-07, "loss": 0.68113112, "num_input_tokens_seen": 147523510, "step": 6831, "time_per_iteration": 2.826357841491699 }, { "auxiliary_loss_clip": 0.01154408, "auxiliary_loss_mlp": 0.01029202, "balance_loss_clip": 1.04777217, "balance_loss_mlp": 1.02221298, "epoch": 0.8214994288462695, "flos": 16654471908480.0, "grad_norm": 1.9896582380839118, "language_loss": 0.74159825, "learning_rate": 3.248826324488794e-07, "loss": 0.76343435, "num_input_tokens_seen": 147540805, "step": 6832, "time_per_iteration": 2.5815064907073975 }, { "auxiliary_loss_clip": 0.01168192, "auxiliary_loss_mlp": 0.01025712, "balance_loss_clip": 1.0517683, "balance_loss_mlp": 1.01908672, "epoch": 0.8216196717369085, "flos": 25221787390080.0, "grad_norm": 1.8956992775119494, "language_loss": 0.88227707, "learning_rate": 3.244571704040138e-07, "loss": 0.90421605, "num_input_tokens_seen": 147560965, "step": 6833, "time_per_iteration": 2.63154673576355 }, { "auxiliary_loss_clip": 0.01149102, "auxiliary_loss_mlp": 0.01026808, "balance_loss_clip": 1.04577565, "balance_loss_mlp": 1.01897645, "epoch": 0.8217399146275477, "flos": 25371930240000.0, "grad_norm": 1.949651510797053, "language_loss": 0.73960733, "learning_rate": 3.2403196253766374e-07, "loss": 0.76136643, "num_input_tokens_seen": 147580045, "step": 6834, "time_per_iteration": 2.6124563217163086 }, { "auxiliary_loss_clip": 0.01151202, "auxiliary_loss_mlp": 0.01028167, "balance_loss_clip": 1.04806018, "balance_loss_mlp": 1.02021289, "epoch": 0.8218601575181868, "flos": 25629625388160.0, "grad_norm": 2.4197416531589466, "language_loss": 0.79267126, "learning_rate": 3.2360700891433254e-07, "loss": 0.81446499, "num_input_tokens_seen": 147599070, "step": 6835, "time_per_iteration": 2.7243292331695557 }, { "auxiliary_loss_clip": 0.01022986, "auxiliary_loss_mlp": 0.01002433, "balance_loss_clip": 1.01852667, "balance_loss_mlp": 1.00144374, "epoch": 0.8219804004088258, "flos": 67660229427840.0, "grad_norm": 0.7927838631461159, "language_loss": 0.57252574, "learning_rate": 3.231823095984847e-07, "loss": 0.59277993, "num_input_tokens_seen": 147653710, "step": 6836, "time_per_iteration": 3.168984889984131 }, { "auxiliary_loss_clip": 0.01134426, "auxiliary_loss_mlp": 0.01023175, "balance_loss_clip": 1.04457164, "balance_loss_mlp": 1.01636839, "epoch": 0.822100643299465, "flos": 19464266327040.0, "grad_norm": 2.153981062397388, "language_loss": 0.7629146, "learning_rate": 3.2275786465454814e-07, "loss": 0.78449059, "num_input_tokens_seen": 147670360, "step": 6837, "time_per_iteration": 2.6489148139953613 }, { "auxiliary_loss_clip": 0.01117611, "auxiliary_loss_mlp": 0.01026224, "balance_loss_clip": 1.0437665, "balance_loss_mlp": 1.01919079, "epoch": 0.822220886190104, "flos": 24681368292480.0, "grad_norm": 2.3144576005938435, "language_loss": 0.75906706, "learning_rate": 3.2233367414690917e-07, "loss": 0.78050542, "num_input_tokens_seen": 147692550, "step": 6838, "time_per_iteration": 2.6810126304626465 }, { "auxiliary_loss_clip": 0.01116971, "auxiliary_loss_mlp": 0.01026103, "balance_loss_clip": 1.04310405, "balance_loss_mlp": 1.01953149, "epoch": 0.8223411290807431, "flos": 27819062991360.0, "grad_norm": 2.272554084958892, "language_loss": 0.84820813, "learning_rate": 3.219097381399183e-07, "loss": 0.86963886, "num_input_tokens_seen": 147709725, "step": 6839, "time_per_iteration": 3.5507688522338867 }, { "auxiliary_loss_clip": 0.01143843, "auxiliary_loss_mlp": 0.01025901, "balance_loss_clip": 1.04700994, "balance_loss_mlp": 1.01912355, "epoch": 0.8224613719713821, "flos": 23218546913280.0, "grad_norm": 2.0030486651674466, "language_loss": 0.8114118, "learning_rate": 3.2148605669788584e-07, "loss": 0.83310926, "num_input_tokens_seen": 147729615, "step": 6840, "time_per_iteration": 3.7302651405334473 }, { "auxiliary_loss_clip": 0.01139472, "auxiliary_loss_mlp": 0.01027779, "balance_loss_clip": 1.04869366, "balance_loss_mlp": 1.02119875, "epoch": 0.8225816148620213, "flos": 15706250726400.0, "grad_norm": 2.6956350504568034, "language_loss": 0.77613372, "learning_rate": 3.2106262988508405e-07, "loss": 0.7978062, "num_input_tokens_seen": 147747665, "step": 6841, "time_per_iteration": 3.5884854793548584 }, { "auxiliary_loss_clip": 0.01136732, "auxiliary_loss_mlp": 0.01025109, "balance_loss_clip": 1.04682565, "balance_loss_mlp": 1.0177474, "epoch": 0.8227018577526604, "flos": 18515111391360.0, "grad_norm": 2.6320123857850324, "language_loss": 0.73917091, "learning_rate": 3.206394577657465e-07, "loss": 0.76078933, "num_input_tokens_seen": 147765445, "step": 6842, "time_per_iteration": 3.533830165863037 }, { "auxiliary_loss_clip": 0.01157613, "auxiliary_loss_mlp": 0.01030746, "balance_loss_clip": 1.04913449, "balance_loss_mlp": 1.02301502, "epoch": 0.8228221006432994, "flos": 22236785406720.0, "grad_norm": 2.670809668709447, "language_loss": 0.72945809, "learning_rate": 3.202165404040675e-07, "loss": 0.7513417, "num_input_tokens_seen": 147783365, "step": 6843, "time_per_iteration": 2.6273410320281982 }, { "auxiliary_loss_clip": 0.01090037, "auxiliary_loss_mlp": 0.01029874, "balance_loss_clip": 1.04385734, "balance_loss_mlp": 1.02226865, "epoch": 0.8229423435339386, "flos": 24097532630400.0, "grad_norm": 2.2496009143454634, "language_loss": 0.75066417, "learning_rate": 3.1979387786420396e-07, "loss": 0.77186322, "num_input_tokens_seen": 147803605, "step": 6844, "time_per_iteration": 2.7495288848876953 }, { "auxiliary_loss_clip": 0.01138196, "auxiliary_loss_mlp": 0.01029225, "balance_loss_clip": 1.04506767, "balance_loss_mlp": 1.02200651, "epoch": 0.8230625864245776, "flos": 23878549365120.0, "grad_norm": 2.0340297593052115, "language_loss": 0.81945968, "learning_rate": 3.1937147021027346e-07, "loss": 0.84113383, "num_input_tokens_seen": 147822060, "step": 6845, "time_per_iteration": 2.6800625324249268 }, { "auxiliary_loss_clip": 0.01151387, "auxiliary_loss_mlp": 0.01018941, "balance_loss_clip": 1.04905581, "balance_loss_mlp": 1.01262546, "epoch": 0.8231828293152167, "flos": 16581106379520.0, "grad_norm": 2.5135932390735003, "language_loss": 0.76343405, "learning_rate": 3.189493175063547e-07, "loss": 0.7851373, "num_input_tokens_seen": 147839295, "step": 6846, "time_per_iteration": 2.5408589839935303 }, { "auxiliary_loss_clip": 0.01142518, "auxiliary_loss_mlp": 0.01025022, "balance_loss_clip": 1.05046535, "balance_loss_mlp": 1.01805997, "epoch": 0.8233030722058559, "flos": 18880071528960.0, "grad_norm": 1.9273592923835774, "language_loss": 0.67707908, "learning_rate": 3.1852741981648776e-07, "loss": 0.69875443, "num_input_tokens_seen": 147857945, "step": 6847, "time_per_iteration": 2.644036293029785 }, { "auxiliary_loss_clip": 0.01110182, "auxiliary_loss_mlp": 0.01024924, "balance_loss_clip": 1.04399562, "balance_loss_mlp": 1.01775932, "epoch": 0.8234233150964949, "flos": 28439024757120.0, "grad_norm": 2.078508893711827, "language_loss": 0.700275, "learning_rate": 3.1810577720467404e-07, "loss": 0.72162604, "num_input_tokens_seen": 147879675, "step": 6848, "time_per_iteration": 2.688185930252075 }, { "auxiliary_loss_clip": 0.01140443, "auxiliary_loss_mlp": 0.01024936, "balance_loss_clip": 1.04649019, "balance_loss_mlp": 1.01795948, "epoch": 0.823543557987134, "flos": 33765941577600.0, "grad_norm": 2.447516852980891, "language_loss": 0.56162691, "learning_rate": 3.176843897348769e-07, "loss": 0.58328074, "num_input_tokens_seen": 147902870, "step": 6849, "time_per_iteration": 2.75254225730896 }, { "auxiliary_loss_clip": 0.01134135, "auxiliary_loss_mlp": 0.01024097, "balance_loss_clip": 1.04648542, "balance_loss_mlp": 1.01695609, "epoch": 0.8236638008777731, "flos": 17092366611840.0, "grad_norm": 2.6213895835328795, "language_loss": 0.75982487, "learning_rate": 3.1726325747102034e-07, "loss": 0.78140718, "num_input_tokens_seen": 147921245, "step": 6850, "time_per_iteration": 2.588078260421753 }, { "auxiliary_loss_clip": 0.01098667, "auxiliary_loss_mlp": 0.01028491, "balance_loss_clip": 1.04019558, "balance_loss_mlp": 1.02079582, "epoch": 0.8237840437684122, "flos": 61639982334720.0, "grad_norm": 1.5565673766987436, "language_loss": 0.6404714, "learning_rate": 3.1684238047698974e-07, "loss": 0.66174299, "num_input_tokens_seen": 147949515, "step": 6851, "time_per_iteration": 3.052532196044922 }, { "auxiliary_loss_clip": 0.01141047, "auxiliary_loss_mlp": 0.01023884, "balance_loss_clip": 1.04743147, "balance_loss_mlp": 1.01620698, "epoch": 0.8239042866590512, "flos": 27309023821440.0, "grad_norm": 4.107727704176172, "language_loss": 0.52848446, "learning_rate": 3.1642175881663155e-07, "loss": 0.55013382, "num_input_tokens_seen": 147969245, "step": 6852, "time_per_iteration": 2.7198963165283203 }, { "auxiliary_loss_clip": 0.01166308, "auxiliary_loss_mlp": 0.01024479, "balance_loss_clip": 1.04797626, "balance_loss_mlp": 1.01789021, "epoch": 0.8240245295496904, "flos": 21726351187200.0, "grad_norm": 3.0126924031544293, "language_loss": 0.83606666, "learning_rate": 3.160013925537537e-07, "loss": 0.85797453, "num_input_tokens_seen": 147990080, "step": 6853, "time_per_iteration": 2.5667827129364014 }, { "auxiliary_loss_clip": 0.01124891, "auxiliary_loss_mlp": 0.01021512, "balance_loss_clip": 1.04457545, "balance_loss_mlp": 1.01456201, "epoch": 0.8241447724403295, "flos": 20009318279040.0, "grad_norm": 2.560132041545911, "language_loss": 0.75632036, "learning_rate": 3.155812817521266e-07, "loss": 0.77778441, "num_input_tokens_seen": 148010455, "step": 6854, "time_per_iteration": 2.720794916152954 }, { "auxiliary_loss_clip": 0.01142572, "auxiliary_loss_mlp": 0.01030641, "balance_loss_clip": 1.0498147, "balance_loss_mlp": 1.02327418, "epoch": 0.8242650153309685, "flos": 22272983337600.0, "grad_norm": 2.1652812041911123, "language_loss": 0.77989715, "learning_rate": 3.151614264754787e-07, "loss": 0.8016293, "num_input_tokens_seen": 148028400, "step": 6855, "time_per_iteration": 2.6114916801452637 }, { "auxiliary_loss_clip": 0.01167362, "auxiliary_loss_mlp": 0.01023636, "balance_loss_clip": 1.04741955, "balance_loss_mlp": 1.01568222, "epoch": 0.8243852582216077, "flos": 22309971367680.0, "grad_norm": 2.336199217022041, "language_loss": 0.79312038, "learning_rate": 3.147418267875035e-07, "loss": 0.8150304, "num_input_tokens_seen": 148046530, "step": 6856, "time_per_iteration": 2.573906421661377 }, { "auxiliary_loss_clip": 0.01084944, "auxiliary_loss_mlp": 0.00711408, "balance_loss_clip": 1.03888559, "balance_loss_mlp": 1.00059569, "epoch": 0.8245055011122467, "flos": 24645421756800.0, "grad_norm": 2.801909194753442, "language_loss": 0.65536976, "learning_rate": 3.1432248275185315e-07, "loss": 0.67333323, "num_input_tokens_seen": 148067040, "step": 6857, "time_per_iteration": 2.7428512573242188 }, { "auxiliary_loss_clip": 0.01151414, "auxiliary_loss_mlp": 0.01029981, "balance_loss_clip": 1.04870331, "balance_loss_mlp": 1.02311182, "epoch": 0.8246257440028858, "flos": 17487275713920.0, "grad_norm": 1.9814865358917102, "language_loss": 0.77304763, "learning_rate": 3.139033944321412e-07, "loss": 0.79486156, "num_input_tokens_seen": 148084400, "step": 6858, "time_per_iteration": 2.641627073287964 }, { "auxiliary_loss_clip": 0.01154928, "auxiliary_loss_mlp": 0.0102531, "balance_loss_clip": 1.04721439, "balance_loss_mlp": 1.01815104, "epoch": 0.824745986893525, "flos": 25010130499200.0, "grad_norm": 1.8019665750319573, "language_loss": 0.79013324, "learning_rate": 3.1348456189194507e-07, "loss": 0.81193554, "num_input_tokens_seen": 148104860, "step": 6859, "time_per_iteration": 2.6211252212524414 }, { "auxiliary_loss_clip": 0.01112147, "auxiliary_loss_mlp": 0.01022471, "balance_loss_clip": 1.04265499, "balance_loss_mlp": 1.01566434, "epoch": 0.824866229784164, "flos": 18772698798720.0, "grad_norm": 1.7849257456995582, "language_loss": 0.82960904, "learning_rate": 3.1306598519479876e-07, "loss": 0.85095525, "num_input_tokens_seen": 148124680, "step": 6860, "time_per_iteration": 2.6579596996307373 }, { "auxiliary_loss_clip": 0.01133892, "auxiliary_loss_mlp": 0.01021926, "balance_loss_clip": 1.04699159, "balance_loss_mlp": 1.01500034, "epoch": 0.8249864726748031, "flos": 23842171866240.0, "grad_norm": 2.2275250708823724, "language_loss": 0.78008133, "learning_rate": 3.1264766440420177e-07, "loss": 0.8016395, "num_input_tokens_seen": 148147150, "step": 6861, "time_per_iteration": 2.7239716053009033 }, { "auxiliary_loss_clip": 0.01152648, "auxiliary_loss_mlp": 0.01024916, "balance_loss_clip": 1.05110335, "balance_loss_mlp": 1.01785922, "epoch": 0.8251067155654422, "flos": 20303103617280.0, "grad_norm": 2.1485582474162865, "language_loss": 0.69331044, "learning_rate": 3.122295995836124e-07, "loss": 0.7150861, "num_input_tokens_seen": 148167020, "step": 6862, "time_per_iteration": 2.623565912246704 }, { "auxiliary_loss_clip": 0.01154503, "auxiliary_loss_mlp": 0.01027928, "balance_loss_clip": 1.04523993, "balance_loss_mlp": 1.02081466, "epoch": 0.8252269584560813, "flos": 25009699536000.0, "grad_norm": 1.8169779854109203, "language_loss": 0.77581501, "learning_rate": 3.118117907964508e-07, "loss": 0.79763931, "num_input_tokens_seen": 148188965, "step": 6863, "time_per_iteration": 2.6448049545288086 }, { "auxiliary_loss_clip": 0.01125923, "auxiliary_loss_mlp": 0.01026256, "balance_loss_clip": 1.04479945, "balance_loss_mlp": 1.01977742, "epoch": 0.8253472013467203, "flos": 17128564542720.0, "grad_norm": 2.7580372555784267, "language_loss": 0.80532801, "learning_rate": 3.1139423810609856e-07, "loss": 0.82684982, "num_input_tokens_seen": 148205660, "step": 6864, "time_per_iteration": 2.763542652130127 }, { "auxiliary_loss_clip": 0.01168552, "auxiliary_loss_mlp": 0.01023828, "balance_loss_clip": 1.04793525, "balance_loss_mlp": 1.01687765, "epoch": 0.8254674442373595, "flos": 22414794232320.0, "grad_norm": 1.9041560195277443, "language_loss": 0.75773221, "learning_rate": 3.1097694157589714e-07, "loss": 0.77965599, "num_input_tokens_seen": 148225545, "step": 6865, "time_per_iteration": 3.481367588043213 }, { "auxiliary_loss_clip": 0.0114982, "auxiliary_loss_mlp": 0.01025692, "balance_loss_clip": 1.04919291, "balance_loss_mlp": 1.0183692, "epoch": 0.8255876871279986, "flos": 24786765774720.0, "grad_norm": 3.83964609140478, "language_loss": 0.75731248, "learning_rate": 3.105599012691511e-07, "loss": 0.77906764, "num_input_tokens_seen": 148243975, "step": 6866, "time_per_iteration": 3.4947142601013184 }, { "auxiliary_loss_clip": 0.01149958, "auxiliary_loss_mlp": 0.01025346, "balance_loss_clip": 1.04759514, "balance_loss_mlp": 1.01895976, "epoch": 0.8257079300186376, "flos": 27455431656960.0, "grad_norm": 2.1154758690787308, "language_loss": 0.82442498, "learning_rate": 3.101431172491249e-07, "loss": 0.84617805, "num_input_tokens_seen": 148265520, "step": 6867, "time_per_iteration": 3.5693695545196533 }, { "auxiliary_loss_clip": 0.01122887, "auxiliary_loss_mlp": 0.0071149, "balance_loss_clip": 1.04307163, "balance_loss_mlp": 1.00062442, "epoch": 0.8258281729092768, "flos": 16471866142080.0, "grad_norm": 2.9452125878269775, "language_loss": 0.72262096, "learning_rate": 3.097265895790444e-07, "loss": 0.74096477, "num_input_tokens_seen": 148283730, "step": 6868, "time_per_iteration": 3.5093326568603516 }, { "auxiliary_loss_clip": 0.01120355, "auxiliary_loss_mlp": 0.01026124, "balance_loss_clip": 1.04466987, "balance_loss_mlp": 1.01827979, "epoch": 0.8259484157999158, "flos": 21433822824960.0, "grad_norm": 2.237827586862529, "language_loss": 0.83043379, "learning_rate": 3.093103183220962e-07, "loss": 0.85189861, "num_input_tokens_seen": 148303775, "step": 6869, "time_per_iteration": 2.685659408569336 }, { "auxiliary_loss_clip": 0.01057644, "auxiliary_loss_mlp": 0.0100049, "balance_loss_clip": 1.01779222, "balance_loss_mlp": 0.99959624, "epoch": 0.8260686586905549, "flos": 58322342453760.0, "grad_norm": 0.8163271555485633, "language_loss": 0.59309065, "learning_rate": 3.0889430354142796e-07, "loss": 0.61367196, "num_input_tokens_seen": 148365285, "step": 6870, "time_per_iteration": 3.1773452758789062 }, { "auxiliary_loss_clip": 0.01122327, "auxiliary_loss_mlp": 0.01020261, "balance_loss_clip": 1.04333007, "balance_loss_mlp": 1.01297092, "epoch": 0.826188901581194, "flos": 27527288814720.0, "grad_norm": 4.053336118863719, "language_loss": 0.7038669, "learning_rate": 3.084785453001497e-07, "loss": 0.72529274, "num_input_tokens_seen": 148386200, "step": 6871, "time_per_iteration": 2.741267442703247 }, { "auxiliary_loss_clip": 0.01136301, "auxiliary_loss_mlp": 0.00711458, "balance_loss_clip": 1.04733086, "balance_loss_mlp": 1.00063968, "epoch": 0.8263091444718331, "flos": 23696051339520.0, "grad_norm": 2.1551746599651014, "language_loss": 0.82195359, "learning_rate": 3.080630436613314e-07, "loss": 0.84043121, "num_input_tokens_seen": 148403970, "step": 6872, "time_per_iteration": 2.6548047065734863 }, { "auxiliary_loss_clip": 0.01144399, "auxiliary_loss_mlp": 0.01025237, "balance_loss_clip": 1.04585505, "balance_loss_mlp": 1.01827228, "epoch": 0.8264293873624722, "flos": 17165157523200.0, "grad_norm": 2.1783417698559124, "language_loss": 0.8620218, "learning_rate": 3.076477986880039e-07, "loss": 0.88371813, "num_input_tokens_seen": 148421765, "step": 6873, "time_per_iteration": 2.6095290184020996 }, { "auxiliary_loss_clip": 0.01138948, "auxiliary_loss_mlp": 0.01023745, "balance_loss_clip": 1.04954767, "balance_loss_mlp": 1.01671195, "epoch": 0.8265496302531112, "flos": 24098645952000.0, "grad_norm": 2.451613349445368, "language_loss": 0.69433761, "learning_rate": 3.0723281044315986e-07, "loss": 0.7159645, "num_input_tokens_seen": 148443720, "step": 6874, "time_per_iteration": 2.6271681785583496 }, { "auxiliary_loss_clip": 0.01164129, "auxiliary_loss_mlp": 0.01022032, "balance_loss_clip": 1.04644442, "balance_loss_mlp": 1.01576138, "epoch": 0.8266698731437504, "flos": 14099894599680.0, "grad_norm": 2.5350525240240165, "language_loss": 0.76491362, "learning_rate": 3.068180789897521e-07, "loss": 0.78677523, "num_input_tokens_seen": 148462130, "step": 6875, "time_per_iteration": 2.5638558864593506 }, { "auxiliary_loss_clip": 0.01156322, "auxiliary_loss_mlp": 0.01023277, "balance_loss_clip": 1.04860425, "balance_loss_mlp": 1.01585031, "epoch": 0.8267901160343895, "flos": 30777563715840.0, "grad_norm": 2.012715296118382, "language_loss": 0.81500274, "learning_rate": 3.064036043906966e-07, "loss": 0.83679873, "num_input_tokens_seen": 148485570, "step": 6876, "time_per_iteration": 2.7183024883270264 }, { "auxiliary_loss_clip": 0.01130069, "auxiliary_loss_mlp": 0.01027838, "balance_loss_clip": 1.0462091, "balance_loss_mlp": 1.02045918, "epoch": 0.8269103589250285, "flos": 40624915242240.0, "grad_norm": 1.9948020571334266, "language_loss": 0.67792535, "learning_rate": 3.059893867088668e-07, "loss": 0.69950438, "num_input_tokens_seen": 148509715, "step": 6877, "time_per_iteration": 2.845229148864746 }, { "auxiliary_loss_clip": 0.01152732, "auxiliary_loss_mlp": 0.0102544, "balance_loss_clip": 1.04860973, "balance_loss_mlp": 1.01849556, "epoch": 0.8270306018156677, "flos": 30263645877120.0, "grad_norm": 2.2951458448164477, "language_loss": 0.67384851, "learning_rate": 3.055754260071004e-07, "loss": 0.69563025, "num_input_tokens_seen": 148532010, "step": 6878, "time_per_iteration": 2.642789125442505 }, { "auxiliary_loss_clip": 0.01153308, "auxiliary_loss_mlp": 0.01026006, "balance_loss_clip": 1.04719019, "balance_loss_mlp": 1.01892185, "epoch": 0.8271508447063067, "flos": 25226599812480.0, "grad_norm": 2.1256188635821105, "language_loss": 0.73539114, "learning_rate": 3.051617223481948e-07, "loss": 0.75718427, "num_input_tokens_seen": 148553330, "step": 6879, "time_per_iteration": 2.666477680206299 }, { "auxiliary_loss_clip": 0.01131234, "auxiliary_loss_mlp": 0.01034967, "balance_loss_clip": 1.044698, "balance_loss_mlp": 1.02754068, "epoch": 0.8272710875969458, "flos": 17566602900480.0, "grad_norm": 1.9600924189956024, "language_loss": 0.75366163, "learning_rate": 3.047482757949078e-07, "loss": 0.77532369, "num_input_tokens_seen": 148570960, "step": 6880, "time_per_iteration": 2.604329824447632 }, { "auxiliary_loss_clip": 0.01117329, "auxiliary_loss_mlp": 0.00710995, "balance_loss_clip": 1.04231727, "balance_loss_mlp": 1.0006454, "epoch": 0.827391330487585, "flos": 19755465886080.0, "grad_norm": 2.0233923852406277, "language_loss": 0.85838073, "learning_rate": 3.043350864099605e-07, "loss": 0.87666398, "num_input_tokens_seen": 148589520, "step": 6881, "time_per_iteration": 2.7371346950531006 }, { "auxiliary_loss_clip": 0.01152912, "auxiliary_loss_mlp": 0.01024104, "balance_loss_clip": 1.0451051, "balance_loss_mlp": 1.01723409, "epoch": 0.827511573378224, "flos": 16835174254080.0, "grad_norm": 2.2764722142323386, "language_loss": 0.80985749, "learning_rate": 3.039221542560315e-07, "loss": 0.83162761, "num_input_tokens_seen": 148606085, "step": 6882, "time_per_iteration": 2.5573890209198 }, { "auxiliary_loss_clip": 0.01149024, "auxiliary_loss_mlp": 0.01025369, "balance_loss_clip": 1.04691494, "balance_loss_mlp": 1.01847839, "epoch": 0.8276318162688631, "flos": 18369242259840.0, "grad_norm": 1.9792386861778382, "language_loss": 0.73891604, "learning_rate": 3.0350947939576356e-07, "loss": 0.76065999, "num_input_tokens_seen": 148625240, "step": 6883, "time_per_iteration": 2.5683224201202393 }, { "auxiliary_loss_clip": 0.01157739, "auxiliary_loss_mlp": 0.01024543, "balance_loss_clip": 1.04888344, "balance_loss_mlp": 1.01742625, "epoch": 0.8277520591595022, "flos": 19352691705600.0, "grad_norm": 1.8542820931128416, "language_loss": 0.7245422, "learning_rate": 3.0309706189175876e-07, "loss": 0.74636495, "num_input_tokens_seen": 148645075, "step": 6884, "time_per_iteration": 2.5537991523742676 }, { "auxiliary_loss_clip": 0.01045998, "auxiliary_loss_mlp": 0.0100154, "balance_loss_clip": 1.01745844, "balance_loss_mlp": 1.00063455, "epoch": 0.8278723020501413, "flos": 67918858329600.0, "grad_norm": 0.7569522577591308, "language_loss": 0.57340121, "learning_rate": 3.0268490180658045e-07, "loss": 0.5938766, "num_input_tokens_seen": 148707855, "step": 6885, "time_per_iteration": 3.2188403606414795 }, { "auxiliary_loss_clip": 0.01172819, "auxiliary_loss_mlp": 0.0102805, "balance_loss_clip": 1.05150127, "balance_loss_mlp": 1.02045608, "epoch": 0.8279925449407803, "flos": 18185738653440.0, "grad_norm": 2.439519475711947, "language_loss": 0.79123056, "learning_rate": 3.0227299920275305e-07, "loss": 0.81323922, "num_input_tokens_seen": 148724170, "step": 6886, "time_per_iteration": 2.5932981967926025 }, { "auxiliary_loss_clip": 0.01125396, "auxiliary_loss_mlp": 0.01024983, "balance_loss_clip": 1.04778993, "balance_loss_mlp": 1.01690114, "epoch": 0.8281127878314195, "flos": 20631434860800.0, "grad_norm": 2.978620596548364, "language_loss": 0.85777545, "learning_rate": 3.018613541427613e-07, "loss": 0.87927926, "num_input_tokens_seen": 148743690, "step": 6887, "time_per_iteration": 2.702393054962158 }, { "auxiliary_loss_clip": 0.01168299, "auxiliary_loss_mlp": 0.0102213, "balance_loss_clip": 1.04810321, "balance_loss_mlp": 1.01491761, "epoch": 0.8282330307220586, "flos": 18004282122240.0, "grad_norm": 1.7355701692460392, "language_loss": 0.73634255, "learning_rate": 3.0144996668905243e-07, "loss": 0.7582469, "num_input_tokens_seen": 148761070, "step": 6888, "time_per_iteration": 2.6118016242980957 }, { "auxiliary_loss_clip": 0.01089821, "auxiliary_loss_mlp": 0.00711468, "balance_loss_clip": 1.0384922, "balance_loss_mlp": 1.00061226, "epoch": 0.8283532736126976, "flos": 20084120352000.0, "grad_norm": 5.1232443589578285, "language_loss": 0.82087731, "learning_rate": 3.010388369040331e-07, "loss": 0.83889019, "num_input_tokens_seen": 148779730, "step": 6889, "time_per_iteration": 2.729692220687866 }, { "auxiliary_loss_clip": 0.01154638, "auxiliary_loss_mlp": 0.01027312, "balance_loss_clip": 1.04998398, "balance_loss_mlp": 1.01977158, "epoch": 0.8284735165033368, "flos": 31868421805440.0, "grad_norm": 1.667911253694247, "language_loss": 0.82899928, "learning_rate": 3.0062796485007156e-07, "loss": 0.85081875, "num_input_tokens_seen": 148800670, "step": 6890, "time_per_iteration": 2.7017440795898438 }, { "auxiliary_loss_clip": 0.01169742, "auxiliary_loss_mlp": 0.00711397, "balance_loss_clip": 1.04953051, "balance_loss_mlp": 1.0006026, "epoch": 0.8285937593939758, "flos": 26651319840000.0, "grad_norm": 3.894427331938547, "language_loss": 0.65961927, "learning_rate": 3.002173505894965e-07, "loss": 0.67843062, "num_input_tokens_seen": 148819820, "step": 6891, "time_per_iteration": 3.4987354278564453 }, { "auxiliary_loss_clip": 0.01157628, "auxiliary_loss_mlp": 0.01028094, "balance_loss_clip": 1.04751444, "balance_loss_mlp": 1.02082181, "epoch": 0.8287140022846149, "flos": 20193683811840.0, "grad_norm": 2.8754155584210657, "language_loss": 0.62775123, "learning_rate": 2.998069941845973e-07, "loss": 0.64960843, "num_input_tokens_seen": 148838890, "step": 6892, "time_per_iteration": 3.535520076751709 }, { "auxiliary_loss_clip": 0.01069288, "auxiliary_loss_mlp": 0.01001055, "balance_loss_clip": 1.01796603, "balance_loss_mlp": 1.00018489, "epoch": 0.8288342451752541, "flos": 70755980019840.0, "grad_norm": 0.7076321531861867, "language_loss": 0.57478327, "learning_rate": 2.993968956976258e-07, "loss": 0.5954867, "num_input_tokens_seen": 148906635, "step": 6893, "time_per_iteration": 4.174325942993164 }, { "auxiliary_loss_clip": 0.01175351, "auxiliary_loss_mlp": 0.01030372, "balance_loss_clip": 1.05199313, "balance_loss_mlp": 1.02274251, "epoch": 0.8289544880658931, "flos": 24572235795840.0, "grad_norm": 2.1532147168484084, "language_loss": 0.70189333, "learning_rate": 2.9898705519079313e-07, "loss": 0.72395051, "num_input_tokens_seen": 148925740, "step": 6894, "time_per_iteration": 3.525637626647949 }, { "auxiliary_loss_clip": 0.01130174, "auxiliary_loss_mlp": 0.01025785, "balance_loss_clip": 1.04577982, "balance_loss_mlp": 1.01876342, "epoch": 0.8290747309565322, "flos": 22273378387200.0, "grad_norm": 1.7688566711394522, "language_loss": 0.75019675, "learning_rate": 2.985774727262715e-07, "loss": 0.77175629, "num_input_tokens_seen": 148944585, "step": 6895, "time_per_iteration": 2.6362617015838623 }, { "auxiliary_loss_clip": 0.01165275, "auxiliary_loss_mlp": 0.01026423, "balance_loss_clip": 1.04786527, "balance_loss_mlp": 1.0197289, "epoch": 0.8291949738471713, "flos": 23255570856960.0, "grad_norm": 2.051803395133399, "language_loss": 0.82039928, "learning_rate": 2.981681483661949e-07, "loss": 0.84231627, "num_input_tokens_seen": 148964170, "step": 6896, "time_per_iteration": 2.6307218074798584 }, { "auxiliary_loss_clip": 0.01155728, "auxiliary_loss_mlp": 0.01032172, "balance_loss_clip": 1.05184174, "balance_loss_mlp": 1.0251956, "epoch": 0.8293152167378104, "flos": 52555768185600.0, "grad_norm": 1.8046139560784047, "language_loss": 0.7117883, "learning_rate": 2.9775908217265633e-07, "loss": 0.73366731, "num_input_tokens_seen": 148989405, "step": 6897, "time_per_iteration": 2.8877766132354736 }, { "auxiliary_loss_clip": 0.01008317, "auxiliary_loss_mlp": 0.0100118, "balance_loss_clip": 1.01910067, "balance_loss_mlp": 1.00033343, "epoch": 0.8294354596284494, "flos": 63356156294400.0, "grad_norm": 0.8271460776360009, "language_loss": 0.50370765, "learning_rate": 2.9735027420771253e-07, "loss": 0.52380258, "num_input_tokens_seen": 149049740, "step": 6898, "time_per_iteration": 3.319310188293457 }, { "auxiliary_loss_clip": 0.01131583, "auxiliary_loss_mlp": 0.01023855, "balance_loss_clip": 1.04766321, "balance_loss_mlp": 1.01712012, "epoch": 0.8295557025190886, "flos": 24827021942400.0, "grad_norm": 1.9405948446966292, "language_loss": 0.71566308, "learning_rate": 2.969417245333774e-07, "loss": 0.73721755, "num_input_tokens_seen": 149069120, "step": 6899, "time_per_iteration": 3.3319358825683594 }, { "auxiliary_loss_clip": 0.01117148, "auxiliary_loss_mlp": 0.01021929, "balance_loss_clip": 1.04634619, "balance_loss_mlp": 1.01502967, "epoch": 0.8296759454097277, "flos": 25118580637440.0, "grad_norm": 2.218316938555747, "language_loss": 0.78040564, "learning_rate": 2.9653343321162915e-07, "loss": 0.80179644, "num_input_tokens_seen": 149088630, "step": 6900, "time_per_iteration": 2.653310775756836 }, { "auxiliary_loss_clip": 0.01122288, "auxiliary_loss_mlp": 0.01028021, "balance_loss_clip": 1.04789591, "balance_loss_mlp": 1.02068996, "epoch": 0.8297961883003667, "flos": 24132581326080.0, "grad_norm": 2.018335044525828, "language_loss": 0.64911187, "learning_rate": 2.9612540030440446e-07, "loss": 0.67061496, "num_input_tokens_seen": 149109175, "step": 6901, "time_per_iteration": 2.712956666946411 }, { "auxiliary_loss_clip": 0.01043217, "auxiliary_loss_mlp": 0.01001495, "balance_loss_clip": 1.0171771, "balance_loss_mlp": 1.00062466, "epoch": 0.8299164311910058, "flos": 67446561375360.0, "grad_norm": 0.8527391949347034, "language_loss": 0.64010429, "learning_rate": 2.9571762587360206e-07, "loss": 0.66055143, "num_input_tokens_seen": 149165560, "step": 6902, "time_per_iteration": 3.178870677947998 }, { "auxiliary_loss_clip": 0.01102473, "auxiliary_loss_mlp": 0.01026801, "balance_loss_clip": 1.03823006, "balance_loss_mlp": 1.01929092, "epoch": 0.8300366740816449, "flos": 25228682801280.0, "grad_norm": 1.698347610462262, "language_loss": 0.7414487, "learning_rate": 2.953101099810806e-07, "loss": 0.76274145, "num_input_tokens_seen": 149185165, "step": 6903, "time_per_iteration": 2.713555097579956 }, { "auxiliary_loss_clip": 0.01148883, "auxiliary_loss_mlp": 0.01027789, "balance_loss_clip": 1.04876077, "balance_loss_mlp": 1.0209074, "epoch": 0.830156916972284, "flos": 18041018757120.0, "grad_norm": 1.947115939931832, "language_loss": 0.83068562, "learning_rate": 2.9490285268865965e-07, "loss": 0.85245234, "num_input_tokens_seen": 149202655, "step": 6904, "time_per_iteration": 2.5659589767456055 }, { "auxiliary_loss_clip": 0.01158761, "auxiliary_loss_mlp": 0.01025635, "balance_loss_clip": 1.05048633, "balance_loss_mlp": 1.01813412, "epoch": 0.830277159862923, "flos": 26322485806080.0, "grad_norm": 2.1801141158689177, "language_loss": 0.79865927, "learning_rate": 2.9449585405812085e-07, "loss": 0.82050323, "num_input_tokens_seen": 149220035, "step": 6905, "time_per_iteration": 2.6526262760162354 }, { "auxiliary_loss_clip": 0.0112474, "auxiliary_loss_mlp": 0.01030538, "balance_loss_clip": 1.04541183, "balance_loss_mlp": 1.02327847, "epoch": 0.8303974027535622, "flos": 19938861751680.0, "grad_norm": 2.2158734558520585, "language_loss": 0.74303699, "learning_rate": 2.940891141512043e-07, "loss": 0.76458973, "num_input_tokens_seen": 149238055, "step": 6906, "time_per_iteration": 2.638715982437134 }, { "auxiliary_loss_clip": 0.01136149, "auxiliary_loss_mlp": 0.01028504, "balance_loss_clip": 1.04573667, "balance_loss_mlp": 1.02146506, "epoch": 0.8305176456442013, "flos": 17165552572800.0, "grad_norm": 2.4040542478489892, "language_loss": 0.72000426, "learning_rate": 2.9368263302961385e-07, "loss": 0.74165076, "num_input_tokens_seen": 149256755, "step": 6907, "time_per_iteration": 2.69389271736145 }, { "auxiliary_loss_clip": 0.01085699, "auxiliary_loss_mlp": 0.01026635, "balance_loss_clip": 1.03935921, "balance_loss_mlp": 1.01963758, "epoch": 0.8306378885348403, "flos": 25627614226560.0, "grad_norm": 2.2377524232104418, "language_loss": 0.79892743, "learning_rate": 2.9327641075501075e-07, "loss": 0.82005078, "num_input_tokens_seen": 149275745, "step": 6908, "time_per_iteration": 2.8088746070861816 }, { "auxiliary_loss_clip": 0.01130016, "auxiliary_loss_mlp": 0.01026999, "balance_loss_clip": 1.04391193, "balance_loss_mlp": 1.02014399, "epoch": 0.8307581314254795, "flos": 33947864985600.0, "grad_norm": 2.910177403562513, "language_loss": 0.67134279, "learning_rate": 2.9287044738901866e-07, "loss": 0.69291294, "num_input_tokens_seen": 149293730, "step": 6909, "time_per_iteration": 2.7702455520629883 }, { "auxiliary_loss_clip": 0.01154874, "auxiliary_loss_mlp": 0.00710924, "balance_loss_clip": 1.04838145, "balance_loss_mlp": 1.00058675, "epoch": 0.8308783743161186, "flos": 17562724231680.0, "grad_norm": 4.781145694793384, "language_loss": 0.9067874, "learning_rate": 2.9246474299322274e-07, "loss": 0.92544532, "num_input_tokens_seen": 149309290, "step": 6910, "time_per_iteration": 2.6185617446899414 }, { "auxiliary_loss_clip": 0.01031077, "auxiliary_loss_mlp": 0.0100121, "balance_loss_clip": 1.01705003, "balance_loss_mlp": 1.00035143, "epoch": 0.8309986172067576, "flos": 69412885649280.0, "grad_norm": 0.8898912401899687, "language_loss": 0.6310184, "learning_rate": 2.920592976291678e-07, "loss": 0.65134132, "num_input_tokens_seen": 149366620, "step": 6911, "time_per_iteration": 3.1676673889160156 }, { "auxiliary_loss_clip": 0.01151843, "auxiliary_loss_mlp": 0.01027047, "balance_loss_clip": 1.04818726, "balance_loss_mlp": 1.01990092, "epoch": 0.8311188600973968, "flos": 22309755886080.0, "grad_norm": 2.6161771783274963, "language_loss": 0.80408287, "learning_rate": 2.916541113583595e-07, "loss": 0.82587183, "num_input_tokens_seen": 149385120, "step": 6912, "time_per_iteration": 2.6641061305999756 }, { "auxiliary_loss_clip": 0.01124439, "auxiliary_loss_mlp": 0.01023769, "balance_loss_clip": 1.04732907, "balance_loss_mlp": 1.01654816, "epoch": 0.8312391029880358, "flos": 18770077105920.0, "grad_norm": 2.269833995621929, "language_loss": 0.66922224, "learning_rate": 2.912491842422642e-07, "loss": 0.69070435, "num_input_tokens_seen": 149402825, "step": 6913, "time_per_iteration": 2.634720802307129 }, { "auxiliary_loss_clip": 0.01154999, "auxiliary_loss_mlp": 0.01028725, "balance_loss_clip": 1.04884243, "balance_loss_mlp": 1.02165627, "epoch": 0.8313593458786749, "flos": 20376648714240.0, "grad_norm": 1.7163978568439733, "language_loss": 0.71092093, "learning_rate": 2.9084451634230857e-07, "loss": 0.73275816, "num_input_tokens_seen": 149422125, "step": 6914, "time_per_iteration": 2.634572744369507 }, { "auxiliary_loss_clip": 0.01120461, "auxiliary_loss_mlp": 0.01027954, "balance_loss_clip": 1.04400134, "balance_loss_mlp": 1.02115023, "epoch": 0.831479588769314, "flos": 32124069878400.0, "grad_norm": 2.554836117506502, "language_loss": 0.7144562, "learning_rate": 2.9044010771988125e-07, "loss": 0.73594034, "num_input_tokens_seen": 149441940, "step": 6915, "time_per_iteration": 2.701998233795166 }, { "auxiliary_loss_clip": 0.01129522, "auxiliary_loss_mlp": 0.01025259, "balance_loss_clip": 1.04427755, "balance_loss_mlp": 1.01823497, "epoch": 0.8315998316599531, "flos": 45185929338240.0, "grad_norm": 2.255352622549232, "language_loss": 0.72124553, "learning_rate": 2.900359584363303e-07, "loss": 0.74279332, "num_input_tokens_seen": 149465045, "step": 6916, "time_per_iteration": 2.8486883640289307 }, { "auxiliary_loss_clip": 0.01103476, "auxiliary_loss_mlp": 0.01028179, "balance_loss_clip": 1.04677403, "balance_loss_mlp": 1.02105308, "epoch": 0.8317200745505922, "flos": 18363747479040.0, "grad_norm": 2.3527936672504577, "language_loss": 0.84636736, "learning_rate": 2.8963206855296494e-07, "loss": 0.86768389, "num_input_tokens_seen": 149481285, "step": 6917, "time_per_iteration": 3.5647504329681396 }, { "auxiliary_loss_clip": 0.01152518, "auxiliary_loss_mlp": 0.01027226, "balance_loss_clip": 1.04627681, "balance_loss_mlp": 1.02001405, "epoch": 0.8318403174412313, "flos": 24206557386240.0, "grad_norm": 1.643164623415379, "language_loss": 0.77087379, "learning_rate": 2.892284381310548e-07, "loss": 0.7926712, "num_input_tokens_seen": 149502700, "step": 6918, "time_per_iteration": 4.726649761199951 }, { "auxiliary_loss_clip": 0.01135528, "auxiliary_loss_mlp": 0.01025549, "balance_loss_clip": 1.04623842, "balance_loss_mlp": 1.01805699, "epoch": 0.8319605603318704, "flos": 22418780641920.0, "grad_norm": 5.194552100143023, "language_loss": 0.72711658, "learning_rate": 2.888250672318302e-07, "loss": 0.74872732, "num_input_tokens_seen": 149520100, "step": 6919, "time_per_iteration": 2.6239230632781982 }, { "auxiliary_loss_clip": 0.01172368, "auxiliary_loss_mlp": 0.0102548, "balance_loss_clip": 1.05217516, "balance_loss_mlp": 1.0183152, "epoch": 0.8320808032225094, "flos": 37414501459200.0, "grad_norm": 1.5854637755320635, "language_loss": 0.68731654, "learning_rate": 2.884219559164831e-07, "loss": 0.70929503, "num_input_tokens_seen": 149543245, "step": 6920, "time_per_iteration": 2.780775785446167 }, { "auxiliary_loss_clip": 0.01152564, "auxiliary_loss_mlp": 0.01023843, "balance_loss_clip": 1.04882812, "balance_loss_mlp": 1.0167737, "epoch": 0.8322010461131486, "flos": 12787395638400.0, "grad_norm": 2.302991891799594, "language_loss": 0.81234229, "learning_rate": 2.880191042461635e-07, "loss": 0.83410633, "num_input_tokens_seen": 149559185, "step": 6921, "time_per_iteration": 3.5007100105285645 }, { "auxiliary_loss_clip": 0.01113465, "auxiliary_loss_mlp": 0.01029023, "balance_loss_clip": 1.04426479, "balance_loss_mlp": 1.02254641, "epoch": 0.8323212890037877, "flos": 15815455050240.0, "grad_norm": 1.8152421399235692, "language_loss": 0.80331445, "learning_rate": 2.876165122819849e-07, "loss": 0.82473934, "num_input_tokens_seen": 149577165, "step": 6922, "time_per_iteration": 2.783191442489624 }, { "auxiliary_loss_clip": 0.01165465, "auxiliary_loss_mlp": 0.01027938, "balance_loss_clip": 1.0477891, "balance_loss_mlp": 1.02094364, "epoch": 0.8324415318944267, "flos": 21719276208000.0, "grad_norm": 1.6495763447120058, "language_loss": 0.79006231, "learning_rate": 2.872141800850201e-07, "loss": 0.8119964, "num_input_tokens_seen": 149594340, "step": 6923, "time_per_iteration": 2.5761568546295166 }, { "auxiliary_loss_clip": 0.01166982, "auxiliary_loss_mlp": 0.0103153, "balance_loss_clip": 1.0490762, "balance_loss_mlp": 1.02448738, "epoch": 0.8325617747850659, "flos": 34198700636160.0, "grad_norm": 1.7402430778711224, "language_loss": 0.73534018, "learning_rate": 2.868121077163024e-07, "loss": 0.75732529, "num_input_tokens_seen": 149613895, "step": 6924, "time_per_iteration": 2.6250391006469727 }, { "auxiliary_loss_clip": 0.01154056, "auxiliary_loss_mlp": 0.01024645, "balance_loss_clip": 1.04603231, "balance_loss_mlp": 1.01747453, "epoch": 0.8326820176757049, "flos": 18369457741440.0, "grad_norm": 2.0394845840540445, "language_loss": 0.72279477, "learning_rate": 2.864102952368257e-07, "loss": 0.74458176, "num_input_tokens_seen": 149631820, "step": 6925, "time_per_iteration": 2.5932815074920654 }, { "auxiliary_loss_clip": 0.01093346, "auxiliary_loss_mlp": 0.01024961, "balance_loss_clip": 1.03866577, "balance_loss_mlp": 1.01814854, "epoch": 0.832802260566344, "flos": 35991325716480.0, "grad_norm": 2.222100476723113, "language_loss": 0.59327537, "learning_rate": 2.860087427075444e-07, "loss": 0.61445844, "num_input_tokens_seen": 149656070, "step": 6926, "time_per_iteration": 2.8336472511291504 }, { "auxiliary_loss_clip": 0.0113239, "auxiliary_loss_mlp": 0.01025294, "balance_loss_clip": 1.04563022, "balance_loss_mlp": 1.01823354, "epoch": 0.8329225034569832, "flos": 14244434928000.0, "grad_norm": 2.6392008007600944, "language_loss": 0.86205202, "learning_rate": 2.856074501893744e-07, "loss": 0.88362885, "num_input_tokens_seen": 149671270, "step": 6927, "time_per_iteration": 2.6246912479400635 }, { "auxiliary_loss_clip": 0.01156543, "auxiliary_loss_mlp": 0.01029408, "balance_loss_clip": 1.05051482, "balance_loss_mlp": 1.02205825, "epoch": 0.8330427463476222, "flos": 18077468083200.0, "grad_norm": 1.822640355993204, "language_loss": 0.81477875, "learning_rate": 2.8520641774319054e-07, "loss": 0.83663821, "num_input_tokens_seen": 149689360, "step": 6928, "time_per_iteration": 2.5706470012664795 }, { "auxiliary_loss_clip": 0.01138498, "auxiliary_loss_mlp": 0.01025863, "balance_loss_clip": 1.04399621, "balance_loss_mlp": 1.01863933, "epoch": 0.8331629892382613, "flos": 18040839189120.0, "grad_norm": 2.2391542360118213, "language_loss": 0.75631523, "learning_rate": 2.848056454298309e-07, "loss": 0.77795881, "num_input_tokens_seen": 149706685, "step": 6929, "time_per_iteration": 2.6405534744262695 }, { "auxiliary_loss_clip": 0.01138886, "auxiliary_loss_mlp": 0.01024544, "balance_loss_clip": 1.05075693, "balance_loss_mlp": 1.0170753, "epoch": 0.8332832321289004, "flos": 17457398576640.0, "grad_norm": 2.4416155506294053, "language_loss": 0.65074611, "learning_rate": 2.844051333100905e-07, "loss": 0.67238045, "num_input_tokens_seen": 149724230, "step": 6930, "time_per_iteration": 2.6041359901428223 }, { "auxiliary_loss_clip": 0.01140963, "auxiliary_loss_mlp": 0.01029446, "balance_loss_clip": 1.05065966, "balance_loss_mlp": 1.02304721, "epoch": 0.8334034750195395, "flos": 15084852416640.0, "grad_norm": 1.78366269199941, "language_loss": 0.83865756, "learning_rate": 2.840048814447269e-07, "loss": 0.86036164, "num_input_tokens_seen": 149742395, "step": 6931, "time_per_iteration": 2.6684041023254395 }, { "auxiliary_loss_clip": 0.01126446, "auxiliary_loss_mlp": 0.01026289, "balance_loss_clip": 1.04245222, "balance_loss_mlp": 1.01914859, "epoch": 0.8335237179101785, "flos": 19427170556160.0, "grad_norm": 2.5112516084654524, "language_loss": 0.73944449, "learning_rate": 2.836048898944587e-07, "loss": 0.7609719, "num_input_tokens_seen": 149760820, "step": 6932, "time_per_iteration": 2.6198573112487793 }, { "auxiliary_loss_clip": 0.01134725, "auxiliary_loss_mlp": 0.01022919, "balance_loss_clip": 1.04545748, "balance_loss_mlp": 1.01586199, "epoch": 0.8336439608008177, "flos": 21762046327680.0, "grad_norm": 2.4675287668747443, "language_loss": 0.72418028, "learning_rate": 2.832051587199642e-07, "loss": 0.74575669, "num_input_tokens_seen": 149778075, "step": 6933, "time_per_iteration": 2.703028678894043 }, { "auxiliary_loss_clip": 0.01060495, "auxiliary_loss_mlp": 0.01003888, "balance_loss_clip": 1.01993883, "balance_loss_mlp": 1.00298178, "epoch": 0.8337642036914568, "flos": 59702783990400.0, "grad_norm": 0.8101025488825148, "language_loss": 0.5771004, "learning_rate": 2.828056879818821e-07, "loss": 0.59774423, "num_input_tokens_seen": 149837150, "step": 6934, "time_per_iteration": 3.1251838207244873 }, { "auxiliary_loss_clip": 0.01118221, "auxiliary_loss_mlp": 0.01024224, "balance_loss_clip": 1.04098403, "balance_loss_mlp": 1.01771486, "epoch": 0.8338844465820958, "flos": 27162185022720.0, "grad_norm": 1.9048342063367882, "language_loss": 0.83392715, "learning_rate": 2.824064777408117e-07, "loss": 0.85535157, "num_input_tokens_seen": 149856940, "step": 6935, "time_per_iteration": 2.7385470867156982 }, { "auxiliary_loss_clip": 0.01150381, "auxiliary_loss_mlp": 0.01029605, "balance_loss_clip": 1.04786134, "balance_loss_mlp": 1.02198148, "epoch": 0.8340046894727349, "flos": 30481264425600.0, "grad_norm": 2.0089731948158778, "language_loss": 0.75716102, "learning_rate": 2.8200752805731263e-07, "loss": 0.77896082, "num_input_tokens_seen": 149879930, "step": 6936, "time_per_iteration": 2.6830074787139893 }, { "auxiliary_loss_clip": 0.01154263, "auxiliary_loss_mlp": 0.01027931, "balance_loss_clip": 1.04990005, "balance_loss_mlp": 1.0207721, "epoch": 0.834124932363374, "flos": 27126166659840.0, "grad_norm": 2.022890050555038, "language_loss": 0.81096977, "learning_rate": 2.8160883899190625e-07, "loss": 0.83279169, "num_input_tokens_seen": 149903200, "step": 6937, "time_per_iteration": 2.6396801471710205 }, { "auxiliary_loss_clip": 0.01110087, "auxiliary_loss_mlp": 0.01024244, "balance_loss_clip": 1.04364908, "balance_loss_mlp": 1.01744914, "epoch": 0.8342451752540131, "flos": 24569865498240.0, "grad_norm": 3.042172900708205, "language_loss": 0.7325207, "learning_rate": 2.8121041060507234e-07, "loss": 0.75386393, "num_input_tokens_seen": 149922230, "step": 6938, "time_per_iteration": 2.716867208480835 }, { "auxiliary_loss_clip": 0.01157518, "auxiliary_loss_mlp": 0.01028939, "balance_loss_clip": 1.04786766, "balance_loss_mlp": 1.02148795, "epoch": 0.8343654181446521, "flos": 26615085995520.0, "grad_norm": 1.6043025075450563, "language_loss": 0.71455669, "learning_rate": 2.808122429572528e-07, "loss": 0.73642129, "num_input_tokens_seen": 149942435, "step": 6939, "time_per_iteration": 2.6245784759521484 }, { "auxiliary_loss_clip": 0.01130193, "auxiliary_loss_mlp": 0.01025981, "balance_loss_clip": 1.04604149, "balance_loss_mlp": 1.01851821, "epoch": 0.8344856610352913, "flos": 20777268078720.0, "grad_norm": 2.650246271289091, "language_loss": 0.76231682, "learning_rate": 2.804143361088489e-07, "loss": 0.78387856, "num_input_tokens_seen": 149961615, "step": 6940, "time_per_iteration": 2.6764190196990967 }, { "auxiliary_loss_clip": 0.01131113, "auxiliary_loss_mlp": 0.01026152, "balance_loss_clip": 1.04549694, "balance_loss_mlp": 1.01911247, "epoch": 0.8346059039259304, "flos": 26095960684800.0, "grad_norm": 2.2003503036149272, "language_loss": 0.77917069, "learning_rate": 2.8001669012022277e-07, "loss": 0.80074334, "num_input_tokens_seen": 149979585, "step": 6941, "time_per_iteration": 2.6430516242980957 }, { "auxiliary_loss_clip": 0.0115288, "auxiliary_loss_mlp": 0.01024182, "balance_loss_clip": 1.04986715, "balance_loss_mlp": 1.0162009, "epoch": 0.8347261468165694, "flos": 29027708755200.0, "grad_norm": 1.8381704925694038, "language_loss": 0.69303453, "learning_rate": 2.7961930505169795e-07, "loss": 0.71480513, "num_input_tokens_seen": 150003830, "step": 6942, "time_per_iteration": 2.6917386054992676 }, { "auxiliary_loss_clip": 0.01157396, "auxiliary_loss_mlp": 0.00711623, "balance_loss_clip": 1.05060148, "balance_loss_mlp": 1.00056291, "epoch": 0.8348463897072086, "flos": 26396461866240.0, "grad_norm": 1.9013118511508809, "language_loss": 0.76350552, "learning_rate": 2.792221809635558e-07, "loss": 0.78219569, "num_input_tokens_seen": 150024460, "step": 6943, "time_per_iteration": 3.584550619125366 }, { "auxiliary_loss_clip": 0.01075852, "auxiliary_loss_mlp": 0.010226, "balance_loss_clip": 1.04325616, "balance_loss_mlp": 1.01544118, "epoch": 0.8349666325978476, "flos": 23367720096000.0, "grad_norm": 2.1515793376827865, "language_loss": 0.75058484, "learning_rate": 2.788253179160411e-07, "loss": 0.77156937, "num_input_tokens_seen": 150045620, "step": 6944, "time_per_iteration": 3.879770517349243 }, { "auxiliary_loss_clip": 0.01135836, "auxiliary_loss_mlp": 0.010267, "balance_loss_clip": 1.04629517, "balance_loss_mlp": 1.02031064, "epoch": 0.8350868754884867, "flos": 12896528135040.0, "grad_norm": 2.648522347058799, "language_loss": 0.64611804, "learning_rate": 2.7842871596935725e-07, "loss": 0.66774338, "num_input_tokens_seen": 150064135, "step": 6945, "time_per_iteration": 2.8534369468688965 }, { "auxiliary_loss_clip": 0.01157413, "auxiliary_loss_mlp": 0.01024688, "balance_loss_clip": 1.04726028, "balance_loss_mlp": 1.01742852, "epoch": 0.8352071183791259, "flos": 26505522535680.0, "grad_norm": 1.8914454997621664, "language_loss": 0.6940366, "learning_rate": 2.780323751836682e-07, "loss": 0.71585763, "num_input_tokens_seen": 150085350, "step": 6946, "time_per_iteration": 2.6356234550476074 }, { "auxiliary_loss_clip": 0.01134962, "auxiliary_loss_mlp": 0.00711249, "balance_loss_clip": 1.04336512, "balance_loss_mlp": 1.00061226, "epoch": 0.8353273612697649, "flos": 20668063754880.0, "grad_norm": 1.5584397209053447, "language_loss": 0.78640628, "learning_rate": 2.7763629561909876e-07, "loss": 0.80486834, "num_input_tokens_seen": 150106180, "step": 6947, "time_per_iteration": 3.7381367683410645 }, { "auxiliary_loss_clip": 0.0116551, "auxiliary_loss_mlp": 0.01023537, "balance_loss_clip": 1.04743314, "balance_loss_mlp": 1.01672959, "epoch": 0.835447604160404, "flos": 19754137082880.0, "grad_norm": 9.119900167239988, "language_loss": 0.7718336, "learning_rate": 2.772404773357335e-07, "loss": 0.79372412, "num_input_tokens_seen": 150125585, "step": 6948, "time_per_iteration": 2.5918736457824707 }, { "auxiliary_loss_clip": 0.01113799, "auxiliary_loss_mlp": 0.01028156, "balance_loss_clip": 1.04393995, "balance_loss_mlp": 1.02070808, "epoch": 0.8355678470510431, "flos": 23435842239360.0, "grad_norm": 1.891879978625405, "language_loss": 0.7843864, "learning_rate": 2.7684492039361853e-07, "loss": 0.80580592, "num_input_tokens_seen": 150144810, "step": 6949, "time_per_iteration": 2.6614603996276855 }, { "auxiliary_loss_clip": 0.0116941, "auxiliary_loss_mlp": 0.01024635, "balance_loss_clip": 1.05045462, "balance_loss_mlp": 1.01711917, "epoch": 0.8356880899416822, "flos": 21214588164480.0, "grad_norm": 2.2549378284304638, "language_loss": 0.83552831, "learning_rate": 2.764496248527586e-07, "loss": 0.85746878, "num_input_tokens_seen": 150163785, "step": 6950, "time_per_iteration": 2.5851316452026367 }, { "auxiliary_loss_clip": 0.01130331, "auxiliary_loss_mlp": 0.01027917, "balance_loss_clip": 1.04472542, "balance_loss_mlp": 1.02000713, "epoch": 0.8358083328323213, "flos": 28037543466240.0, "grad_norm": 2.1558864744092125, "language_loss": 0.78656447, "learning_rate": 2.760545907731211e-07, "loss": 0.80814695, "num_input_tokens_seen": 150184360, "step": 6951, "time_per_iteration": 2.6963050365448 }, { "auxiliary_loss_clip": 0.01153424, "auxiliary_loss_mlp": 0.01024821, "balance_loss_clip": 1.0460465, "balance_loss_mlp": 1.01773083, "epoch": 0.8359285757229604, "flos": 27783655159680.0, "grad_norm": 2.623916621452808, "language_loss": 0.68106222, "learning_rate": 2.75659818214631e-07, "loss": 0.70284462, "num_input_tokens_seen": 150205465, "step": 6952, "time_per_iteration": 2.6933069229125977 }, { "auxiliary_loss_clip": 0.01140769, "auxiliary_loss_mlp": 0.01025949, "balance_loss_clip": 1.04681873, "balance_loss_mlp": 1.01882589, "epoch": 0.8360488186135995, "flos": 21435115714560.0, "grad_norm": 1.8462220452515337, "language_loss": 0.78000695, "learning_rate": 2.752653072371749e-07, "loss": 0.80167413, "num_input_tokens_seen": 150224900, "step": 6953, "time_per_iteration": 2.6572515964508057 }, { "auxiliary_loss_clip": 0.01116428, "auxiliary_loss_mlp": 0.01024106, "balance_loss_clip": 1.04605889, "balance_loss_mlp": 1.01715565, "epoch": 0.8361690615042385, "flos": 27632327160960.0, "grad_norm": 1.7788380883472354, "language_loss": 0.74759895, "learning_rate": 2.7487105790060105e-07, "loss": 0.76900423, "num_input_tokens_seen": 150244310, "step": 6954, "time_per_iteration": 2.7354440689086914 }, { "auxiliary_loss_clip": 0.01153145, "auxiliary_loss_mlp": 0.01029369, "balance_loss_clip": 1.04617631, "balance_loss_mlp": 1.02269626, "epoch": 0.8362893043948777, "flos": 39202529598720.0, "grad_norm": 1.9715821998173633, "language_loss": 0.69095802, "learning_rate": 2.7447707026471587e-07, "loss": 0.71278316, "num_input_tokens_seen": 150267285, "step": 6955, "time_per_iteration": 2.7574679851531982 }, { "auxiliary_loss_clip": 0.01120107, "auxiliary_loss_mlp": 0.01028253, "balance_loss_clip": 1.0436064, "balance_loss_mlp": 1.02119899, "epoch": 0.8364095472855168, "flos": 24785329230720.0, "grad_norm": 2.204200037960776, "language_loss": 0.79854876, "learning_rate": 2.740833443892874e-07, "loss": 0.82003236, "num_input_tokens_seen": 150285455, "step": 6956, "time_per_iteration": 2.714989185333252 }, { "auxiliary_loss_clip": 0.0113813, "auxiliary_loss_mlp": 0.01027924, "balance_loss_clip": 1.04721642, "balance_loss_mlp": 1.02084923, "epoch": 0.8365297901761558, "flos": 22743412784640.0, "grad_norm": 1.8856107713320334, "language_loss": 0.79532635, "learning_rate": 2.7368988033404327e-07, "loss": 0.81698686, "num_input_tokens_seen": 150302970, "step": 6957, "time_per_iteration": 2.61965274810791 }, { "auxiliary_loss_clip": 0.01123375, "auxiliary_loss_mlp": 0.01021878, "balance_loss_clip": 1.0450182, "balance_loss_mlp": 1.01462126, "epoch": 0.836650033066795, "flos": 28396003242240.0, "grad_norm": 1.5524252578358835, "language_loss": 0.84730709, "learning_rate": 2.732966781586712e-07, "loss": 0.86875963, "num_input_tokens_seen": 150322715, "step": 6958, "time_per_iteration": 2.6847355365753174 }, { "auxiliary_loss_clip": 0.01146351, "auxiliary_loss_mlp": 0.01023043, "balance_loss_clip": 1.04516804, "balance_loss_mlp": 1.01608086, "epoch": 0.836770275957434, "flos": 22236857233920.0, "grad_norm": 1.728968536720314, "language_loss": 0.66840911, "learning_rate": 2.729037379228205e-07, "loss": 0.69010305, "num_input_tokens_seen": 150342900, "step": 6959, "time_per_iteration": 2.584808349609375 }, { "auxiliary_loss_clip": 0.01138335, "auxiliary_loss_mlp": 0.01032336, "balance_loss_clip": 1.04994178, "balance_loss_mlp": 1.02518642, "epoch": 0.8368905188480731, "flos": 22491930689280.0, "grad_norm": 1.695674388481875, "language_loss": 0.80425978, "learning_rate": 2.725110596860998e-07, "loss": 0.82596648, "num_input_tokens_seen": 150363580, "step": 6960, "time_per_iteration": 2.5996756553649902 }, { "auxiliary_loss_clip": 0.01103414, "auxiliary_loss_mlp": 0.01021679, "balance_loss_clip": 1.04510665, "balance_loss_mlp": 1.01473188, "epoch": 0.8370107617387123, "flos": 13370405287680.0, "grad_norm": 1.991528865943765, "language_loss": 0.70202225, "learning_rate": 2.7211864350807776e-07, "loss": 0.72327316, "num_input_tokens_seen": 150381780, "step": 6961, "time_per_iteration": 2.5991673469543457 }, { "auxiliary_loss_clip": 0.01170488, "auxiliary_loss_mlp": 0.01026281, "balance_loss_clip": 1.05000687, "balance_loss_mlp": 1.01890516, "epoch": 0.8371310046293513, "flos": 25261289372160.0, "grad_norm": 1.7125403811332462, "language_loss": 0.73960578, "learning_rate": 2.717264894482836e-07, "loss": 0.76157349, "num_input_tokens_seen": 150402120, "step": 6962, "time_per_iteration": 2.552205801010132 }, { "auxiliary_loss_clip": 0.01158162, "auxiliary_loss_mlp": 0.01025018, "balance_loss_clip": 1.05127335, "balance_loss_mlp": 1.01796651, "epoch": 0.8372512475199904, "flos": 19792705311360.0, "grad_norm": 2.315313426064069, "language_loss": 0.81262195, "learning_rate": 2.7133459756620646e-07, "loss": 0.83445376, "num_input_tokens_seen": 150419315, "step": 6963, "time_per_iteration": 2.490786552429199 }, { "auxiliary_loss_clip": 0.01147848, "auxiliary_loss_mlp": 0.01024466, "balance_loss_clip": 1.04754353, "balance_loss_mlp": 1.01763844, "epoch": 0.8373714904106295, "flos": 19391224020480.0, "grad_norm": 2.081721443760367, "language_loss": 0.73990953, "learning_rate": 2.7094296792129733e-07, "loss": 0.76163268, "num_input_tokens_seen": 150438915, "step": 6964, "time_per_iteration": 2.6142215728759766 }, { "auxiliary_loss_clip": 0.0115248, "auxiliary_loss_mlp": 0.01023868, "balance_loss_clip": 1.04742312, "balance_loss_mlp": 1.01710343, "epoch": 0.8374917333012686, "flos": 14975935401600.0, "grad_norm": 1.7995957553043413, "language_loss": 0.75280637, "learning_rate": 2.7055160057296424e-07, "loss": 0.77456987, "num_input_tokens_seen": 150456155, "step": 6965, "time_per_iteration": 2.566145896911621 }, { "auxiliary_loss_clip": 0.01123065, "auxiliary_loss_mlp": 0.01026029, "balance_loss_clip": 1.04645538, "balance_loss_mlp": 1.01864386, "epoch": 0.8376119761919076, "flos": 30331839847680.0, "grad_norm": 1.7755562420098474, "language_loss": 0.727633, "learning_rate": 2.7016049558057896e-07, "loss": 0.74912393, "num_input_tokens_seen": 150478115, "step": 6966, "time_per_iteration": 2.708483934402466 }, { "auxiliary_loss_clip": 0.01152868, "auxiliary_loss_mlp": 0.01024139, "balance_loss_clip": 1.04903793, "balance_loss_mlp": 1.0171473, "epoch": 0.8377322190825467, "flos": 29423336129280.0, "grad_norm": 1.98125660869287, "language_loss": 0.70657206, "learning_rate": 2.6976965300347074e-07, "loss": 0.72834206, "num_input_tokens_seen": 150500725, "step": 6967, "time_per_iteration": 2.7907752990722656 }, { "auxiliary_loss_clip": 0.01133186, "auxiliary_loss_mlp": 0.010217, "balance_loss_clip": 1.0445596, "balance_loss_mlp": 1.014938, "epoch": 0.8378524619731859, "flos": 26687086807680.0, "grad_norm": 2.7066082234624087, "language_loss": 0.69826889, "learning_rate": 2.693790729009309e-07, "loss": 0.71981782, "num_input_tokens_seen": 150522335, "step": 6968, "time_per_iteration": 2.6827542781829834 }, { "auxiliary_loss_clip": 0.01138048, "auxiliary_loss_mlp": 0.01023868, "balance_loss_clip": 1.04714274, "balance_loss_mlp": 1.01686716, "epoch": 0.8379727048638249, "flos": 20703866636160.0, "grad_norm": 1.8908458662548442, "language_loss": 0.88705599, "learning_rate": 2.6898875533220946e-07, "loss": 0.90867507, "num_input_tokens_seen": 150541640, "step": 6969, "time_per_iteration": 4.357943534851074 }, { "auxiliary_loss_clip": 0.01164292, "auxiliary_loss_mlp": 0.01027162, "balance_loss_clip": 1.04897463, "balance_loss_mlp": 1.02087665, "epoch": 0.838092947754464, "flos": 20084084438400.0, "grad_norm": 1.7776280653724585, "language_loss": 0.81975341, "learning_rate": 2.685987003565171e-07, "loss": 0.84166789, "num_input_tokens_seen": 150559680, "step": 6970, "time_per_iteration": 3.5215296745300293 }, { "auxiliary_loss_clip": 0.01115999, "auxiliary_loss_mlp": 0.01025186, "balance_loss_clip": 1.04904902, "balance_loss_mlp": 1.0182184, "epoch": 0.8382131906451031, "flos": 18113270964480.0, "grad_norm": 3.679452891503334, "language_loss": 0.75090849, "learning_rate": 2.6820890803302566e-07, "loss": 0.77232039, "num_input_tokens_seen": 150575205, "step": 6971, "time_per_iteration": 2.637524366378784 }, { "auxiliary_loss_clip": 0.0113767, "auxiliary_loss_mlp": 0.01029597, "balance_loss_clip": 1.04827106, "balance_loss_mlp": 1.02218199, "epoch": 0.8383334335357422, "flos": 17092653920640.0, "grad_norm": 2.145155291289586, "language_loss": 0.81805086, "learning_rate": 2.6781937842086557e-07, "loss": 0.83972353, "num_input_tokens_seen": 150593995, "step": 6972, "time_per_iteration": 2.642840623855591 }, { "auxiliary_loss_clip": 0.01154097, "auxiliary_loss_mlp": 0.01022772, "balance_loss_clip": 1.04815865, "balance_loss_mlp": 1.01608682, "epoch": 0.8384536764263812, "flos": 20704728562560.0, "grad_norm": 2.402593063922324, "language_loss": 0.67263865, "learning_rate": 2.6743011157912933e-07, "loss": 0.69440734, "num_input_tokens_seen": 150613715, "step": 6973, "time_per_iteration": 3.615976095199585 }, { "auxiliary_loss_clip": 0.01103778, "auxiliary_loss_mlp": 0.01024534, "balance_loss_clip": 1.04049504, "balance_loss_mlp": 1.01716065, "epoch": 0.8385739193170204, "flos": 28986842056320.0, "grad_norm": 1.9595142272600208, "language_loss": 0.65342999, "learning_rate": 2.6704110756686725e-07, "loss": 0.67471308, "num_input_tokens_seen": 150634540, "step": 6974, "time_per_iteration": 2.7574291229248047 }, { "auxiliary_loss_clip": 0.01131883, "auxiliary_loss_mlp": 0.00711398, "balance_loss_clip": 1.0432353, "balance_loss_mlp": 1.00048149, "epoch": 0.8386941622076595, "flos": 23438068882560.0, "grad_norm": 2.250235852492453, "language_loss": 0.84008706, "learning_rate": 2.6665236644309085e-07, "loss": 0.85851991, "num_input_tokens_seen": 150654850, "step": 6975, "time_per_iteration": 2.6685330867767334 }, { "auxiliary_loss_clip": 0.01152543, "auxiliary_loss_mlp": 0.01024993, "balance_loss_clip": 1.04715645, "balance_loss_mlp": 1.01842809, "epoch": 0.8388144050982985, "flos": 23002724044800.0, "grad_norm": 2.5203324774266282, "language_loss": 0.79626656, "learning_rate": 2.662638882667727e-07, "loss": 0.81804192, "num_input_tokens_seen": 150673790, "step": 6976, "time_per_iteration": 2.6750237941741943 }, { "auxiliary_loss_clip": 0.01170582, "auxiliary_loss_mlp": 0.01029038, "balance_loss_clip": 1.04912806, "balance_loss_mlp": 1.0216527, "epoch": 0.8389346479889377, "flos": 24280353878400.0, "grad_norm": 2.379505430142216, "language_loss": 0.73238879, "learning_rate": 2.658756730968443e-07, "loss": 0.75438499, "num_input_tokens_seen": 150692255, "step": 6977, "time_per_iteration": 2.5958409309387207 }, { "auxiliary_loss_clip": 0.01141687, "auxiliary_loss_mlp": 0.0102537, "balance_loss_clip": 1.04966068, "balance_loss_mlp": 1.018152, "epoch": 0.8390548908795767, "flos": 21215019127680.0, "grad_norm": 2.6888439808974756, "language_loss": 0.88680488, "learning_rate": 2.654877209921975e-07, "loss": 0.90847552, "num_input_tokens_seen": 150709790, "step": 6978, "time_per_iteration": 2.637108087539673 }, { "auxiliary_loss_clip": 0.01111783, "auxiliary_loss_mlp": 0.01030338, "balance_loss_clip": 1.04309464, "balance_loss_mlp": 1.0227325, "epoch": 0.8391751337702158, "flos": 35627299332480.0, "grad_norm": 6.390054397691262, "language_loss": 0.63122594, "learning_rate": 2.651000320116843e-07, "loss": 0.65264714, "num_input_tokens_seen": 150730675, "step": 6979, "time_per_iteration": 2.829740047454834 }, { "auxiliary_loss_clip": 0.01118261, "auxiliary_loss_mlp": 0.00711579, "balance_loss_clip": 1.04388058, "balance_loss_mlp": 1.0005734, "epoch": 0.839295376660855, "flos": 21325229032320.0, "grad_norm": 1.9420111956287784, "language_loss": 0.76055086, "learning_rate": 2.647126062141163e-07, "loss": 0.77884924, "num_input_tokens_seen": 150749750, "step": 6980, "time_per_iteration": 2.631232976913452 }, { "auxiliary_loss_clip": 0.01138623, "auxiliary_loss_mlp": 0.01020544, "balance_loss_clip": 1.04393578, "balance_loss_mlp": 1.01323688, "epoch": 0.839415619551494, "flos": 18442535961600.0, "grad_norm": 2.346180919179995, "language_loss": 0.84269673, "learning_rate": 2.643254436582669e-07, "loss": 0.86428833, "num_input_tokens_seen": 150769240, "step": 6981, "time_per_iteration": 2.6374266147613525 }, { "auxiliary_loss_clip": 0.01108865, "auxiliary_loss_mlp": 0.01024187, "balance_loss_clip": 1.04426837, "balance_loss_mlp": 1.01727319, "epoch": 0.8395358624421331, "flos": 23221958705280.0, "grad_norm": 2.0878826808088276, "language_loss": 0.82653284, "learning_rate": 2.6393854440286743e-07, "loss": 0.84786338, "num_input_tokens_seen": 150788410, "step": 6982, "time_per_iteration": 2.6606404781341553 }, { "auxiliary_loss_clip": 0.01170439, "auxiliary_loss_mlp": 0.01029785, "balance_loss_clip": 1.05208182, "balance_loss_mlp": 1.02299905, "epoch": 0.8396561053327722, "flos": 24381657210240.0, "grad_norm": 1.927760939494712, "language_loss": 0.70910794, "learning_rate": 2.6355190850661045e-07, "loss": 0.7311101, "num_input_tokens_seen": 150805245, "step": 6983, "time_per_iteration": 2.580270290374756 }, { "auxiliary_loss_clip": 0.01138323, "auxiliary_loss_mlp": 0.01023769, "balance_loss_clip": 1.04867733, "balance_loss_mlp": 1.01647353, "epoch": 0.8397763482234113, "flos": 22237755073920.0, "grad_norm": 1.5676352775848488, "language_loss": 0.86552447, "learning_rate": 2.631655360281486e-07, "loss": 0.8871454, "num_input_tokens_seen": 150824920, "step": 6984, "time_per_iteration": 2.640990972518921 }, { "auxiliary_loss_clip": 0.01158091, "auxiliary_loss_mlp": 0.00711708, "balance_loss_clip": 1.04787111, "balance_loss_mlp": 1.00064898, "epoch": 0.8398965911140504, "flos": 22163743100160.0, "grad_norm": 1.9784227468444442, "language_loss": 0.66223973, "learning_rate": 2.6277942702609323e-07, "loss": 0.68093777, "num_input_tokens_seen": 150844400, "step": 6985, "time_per_iteration": 2.6268038749694824 }, { "auxiliary_loss_clip": 0.01124129, "auxiliary_loss_mlp": 0.01027474, "balance_loss_clip": 1.04695654, "balance_loss_mlp": 1.02055669, "epoch": 0.8400168340046895, "flos": 21542775753600.0, "grad_norm": 2.3110818164883606, "language_loss": 0.8764267, "learning_rate": 2.623935815590186e-07, "loss": 0.89794278, "num_input_tokens_seen": 150862780, "step": 6986, "time_per_iteration": 2.638775110244751 }, { "auxiliary_loss_clip": 0.01139372, "auxiliary_loss_mlp": 0.0102581, "balance_loss_clip": 1.04864264, "balance_loss_mlp": 1.01884198, "epoch": 0.8401370768953286, "flos": 22491966602880.0, "grad_norm": 1.919319337132204, "language_loss": 0.81482649, "learning_rate": 2.6200799968545516e-07, "loss": 0.83647829, "num_input_tokens_seen": 150883075, "step": 6987, "time_per_iteration": 2.684105396270752 }, { "auxiliary_loss_clip": 0.0103972, "auxiliary_loss_mlp": 0.01001549, "balance_loss_clip": 1.01762199, "balance_loss_mlp": 1.00059557, "epoch": 0.8402573197859676, "flos": 59238890818560.0, "grad_norm": 0.9468453779034643, "language_loss": 0.56425899, "learning_rate": 2.616226814638969e-07, "loss": 0.58467168, "num_input_tokens_seen": 150948180, "step": 6988, "time_per_iteration": 3.2601051330566406 }, { "auxiliary_loss_clip": 0.01139046, "auxiliary_loss_mlp": 0.01031585, "balance_loss_clip": 1.04639053, "balance_loss_mlp": 1.02453113, "epoch": 0.8403775626766068, "flos": 22674608282880.0, "grad_norm": 2.629362533187054, "language_loss": 0.77272505, "learning_rate": 2.612376269527954e-07, "loss": 0.79443133, "num_input_tokens_seen": 150967885, "step": 6989, "time_per_iteration": 2.620479106903076 }, { "auxiliary_loss_clip": 0.01136903, "auxiliary_loss_mlp": 0.01023826, "balance_loss_clip": 1.04904437, "balance_loss_mlp": 1.01642299, "epoch": 0.8404978055672458, "flos": 19609704495360.0, "grad_norm": 2.037159278459587, "language_loss": 0.67583656, "learning_rate": 2.608528362105635e-07, "loss": 0.6974439, "num_input_tokens_seen": 150987255, "step": 6990, "time_per_iteration": 2.6379311084747314 }, { "auxiliary_loss_clip": 0.0112129, "auxiliary_loss_mlp": 0.01021384, "balance_loss_clip": 1.04229307, "balance_loss_mlp": 1.01440418, "epoch": 0.8406180484578849, "flos": 27526929678720.0, "grad_norm": 1.7944407911354243, "language_loss": 0.73276573, "learning_rate": 2.6046830929557374e-07, "loss": 0.75419247, "num_input_tokens_seen": 151006905, "step": 6991, "time_per_iteration": 2.7438063621520996 }, { "auxiliary_loss_clip": 0.01116336, "auxiliary_loss_mlp": 0.01026954, "balance_loss_clip": 1.04349995, "balance_loss_mlp": 1.01936674, "epoch": 0.8407382913485241, "flos": 22127473342080.0, "grad_norm": 2.9845904056411956, "language_loss": 0.84982342, "learning_rate": 2.6008404626615776e-07, "loss": 0.87125629, "num_input_tokens_seen": 151025405, "step": 6992, "time_per_iteration": 2.7183616161346436 }, { "auxiliary_loss_clip": 0.01155405, "auxiliary_loss_mlp": 0.010261, "balance_loss_clip": 1.04947877, "balance_loss_mlp": 1.01859605, "epoch": 0.8408585342391631, "flos": 13918473982080.0, "grad_norm": 3.869599618184447, "language_loss": 0.74548101, "learning_rate": 2.597000471806092e-07, "loss": 0.76729608, "num_input_tokens_seen": 151041970, "step": 6993, "time_per_iteration": 2.5710439682006836 }, { "auxiliary_loss_clip": 0.01135608, "auxiliary_loss_mlp": 0.01026576, "balance_loss_clip": 1.04902732, "balance_loss_mlp": 1.01891685, "epoch": 0.8409787771298022, "flos": 20187865808640.0, "grad_norm": 2.000739332545438, "language_loss": 0.73468179, "learning_rate": 2.593163120971793e-07, "loss": 0.75630361, "num_input_tokens_seen": 151060835, "step": 6994, "time_per_iteration": 3.6028196811676025 }, { "auxiliary_loss_clip": 0.01095215, "auxiliary_loss_mlp": 0.01022152, "balance_loss_clip": 1.0399363, "balance_loss_mlp": 1.01431966, "epoch": 0.8410990200204413, "flos": 23142523777920.0, "grad_norm": 1.9793335203126088, "language_loss": 0.6897136, "learning_rate": 2.5893284107408165e-07, "loss": 0.71088731, "num_input_tokens_seen": 151078205, "step": 6995, "time_per_iteration": 3.661442756652832 }, { "auxiliary_loss_clip": 0.01107823, "auxiliary_loss_mlp": 0.0103339, "balance_loss_clip": 1.04636502, "balance_loss_mlp": 1.02639806, "epoch": 0.8412192629110804, "flos": 24027219757440.0, "grad_norm": 2.5538726461101007, "language_loss": 0.78188288, "learning_rate": 2.5854963416948726e-07, "loss": 0.80329502, "num_input_tokens_seen": 151100470, "step": 6996, "time_per_iteration": 3.6395604610443115 }, { "auxiliary_loss_clip": 0.01102211, "auxiliary_loss_mlp": 0.01030724, "balance_loss_clip": 1.03810072, "balance_loss_mlp": 1.02302647, "epoch": 0.8413395058017195, "flos": 25591703604480.0, "grad_norm": 1.7817426691103888, "language_loss": 0.69551975, "learning_rate": 2.5816669144152816e-07, "loss": 0.71684909, "num_input_tokens_seen": 151121650, "step": 6997, "time_per_iteration": 2.7537357807159424 }, { "auxiliary_loss_clip": 0.01068564, "auxiliary_loss_mlp": 0.01001099, "balance_loss_clip": 1.01716638, "balance_loss_mlp": 1.00022912, "epoch": 0.8414597486923585, "flos": 63635396624640.0, "grad_norm": 0.8511867706660963, "language_loss": 0.66265988, "learning_rate": 2.5778401294829777e-07, "loss": 0.68335652, "num_input_tokens_seen": 151180390, "step": 6998, "time_per_iteration": 3.2267701625823975 }, { "auxiliary_loss_clip": 0.01152895, "auxiliary_loss_mlp": 0.00711041, "balance_loss_clip": 1.0498817, "balance_loss_mlp": 1.00054932, "epoch": 0.8415799915829977, "flos": 19098731571840.0, "grad_norm": 2.4376321131434553, "language_loss": 0.65203965, "learning_rate": 2.574015987478473e-07, "loss": 0.67067897, "num_input_tokens_seen": 151198520, "step": 6999, "time_per_iteration": 3.6615376472473145 }, { "auxiliary_loss_clip": 0.01142271, "auxiliary_loss_mlp": 0.01025707, "balance_loss_clip": 1.04692495, "balance_loss_mlp": 1.01791644, "epoch": 0.8417002344736367, "flos": 19821612781440.0, "grad_norm": 2.2615448745579854, "language_loss": 0.867975, "learning_rate": 2.570194488981887e-07, "loss": 0.88965487, "num_input_tokens_seen": 151215065, "step": 7000, "time_per_iteration": 2.5941455364227295 }, { "auxiliary_loss_clip": 0.01068968, "auxiliary_loss_mlp": 0.01000801, "balance_loss_clip": 1.01756918, "balance_loss_mlp": 0.99988919, "epoch": 0.8418204773642758, "flos": 62161516834560.0, "grad_norm": 0.8469346419540315, "language_loss": 0.60255837, "learning_rate": 2.566375634572939e-07, "loss": 0.62325609, "num_input_tokens_seen": 151275705, "step": 7001, "time_per_iteration": 3.141427993774414 }, { "auxiliary_loss_clip": 0.0112491, "auxiliary_loss_mlp": 0.01021729, "balance_loss_clip": 1.0425508, "balance_loss_mlp": 1.01474357, "epoch": 0.841940720254915, "flos": 17092905315840.0, "grad_norm": 2.2986945909904253, "language_loss": 0.7589643, "learning_rate": 2.562559424830943e-07, "loss": 0.78043067, "num_input_tokens_seen": 151293665, "step": 7002, "time_per_iteration": 2.6407697200775146 }, { "auxiliary_loss_clip": 0.01132125, "auxiliary_loss_mlp": 0.01026012, "balance_loss_clip": 1.04443955, "balance_loss_mlp": 1.01903176, "epoch": 0.842060963145554, "flos": 16283586026880.0, "grad_norm": 2.212987654407493, "language_loss": 0.70441395, "learning_rate": 2.5587458603348256e-07, "loss": 0.7259953, "num_input_tokens_seen": 151310955, "step": 7003, "time_per_iteration": 2.650336503982544 }, { "auxiliary_loss_clip": 0.0111581, "auxiliary_loss_mlp": 0.0102771, "balance_loss_clip": 1.04393554, "balance_loss_mlp": 1.02006817, "epoch": 0.8421812060361931, "flos": 21908238681600.0, "grad_norm": 2.1361175860597803, "language_loss": 0.84214032, "learning_rate": 2.554934941663085e-07, "loss": 0.86357552, "num_input_tokens_seen": 151328490, "step": 7004, "time_per_iteration": 2.6716737747192383 }, { "auxiliary_loss_clip": 0.0111965, "auxiliary_loss_mlp": 0.01026624, "balance_loss_clip": 1.04370451, "balance_loss_mlp": 1.01879835, "epoch": 0.8423014489268322, "flos": 27777693502080.0, "grad_norm": 5.533912145577112, "language_loss": 0.73425174, "learning_rate": 2.5511266693938484e-07, "loss": 0.75571454, "num_input_tokens_seen": 151346950, "step": 7005, "time_per_iteration": 2.727163314819336 }, { "auxiliary_loss_clip": 0.01136419, "auxiliary_loss_mlp": 0.01026256, "balance_loss_clip": 1.04906249, "balance_loss_mlp": 1.01856709, "epoch": 0.8424216918174713, "flos": 25117610970240.0, "grad_norm": 3.1604137434511266, "language_loss": 0.77897471, "learning_rate": 2.547321044104822e-07, "loss": 0.80060142, "num_input_tokens_seen": 151368445, "step": 7006, "time_per_iteration": 2.6718530654907227 }, { "auxiliary_loss_clip": 0.01175522, "auxiliary_loss_mlp": 0.0102665, "balance_loss_clip": 1.05202341, "balance_loss_mlp": 1.01950645, "epoch": 0.8425419347081103, "flos": 24748448941440.0, "grad_norm": 1.977554634637497, "language_loss": 0.77142465, "learning_rate": 2.5435180663733113e-07, "loss": 0.79344642, "num_input_tokens_seen": 151388745, "step": 7007, "time_per_iteration": 2.641411781311035 }, { "auxiliary_loss_clip": 0.01112849, "auxiliary_loss_mlp": 0.01028575, "balance_loss_clip": 1.04274654, "balance_loss_mlp": 1.02133322, "epoch": 0.8426621775987495, "flos": 24820916630400.0, "grad_norm": 2.396986969370604, "language_loss": 0.71765083, "learning_rate": 2.539717736776241e-07, "loss": 0.73906505, "num_input_tokens_seen": 151404970, "step": 7008, "time_per_iteration": 2.6883387565612793 }, { "auxiliary_loss_clip": 0.01152168, "auxiliary_loss_mlp": 0.01023833, "balance_loss_clip": 1.05006921, "balance_loss_mlp": 1.01680315, "epoch": 0.8427824204893886, "flos": 23550074467200.0, "grad_norm": 1.6997319277780607, "language_loss": 0.76537466, "learning_rate": 2.535920055890097e-07, "loss": 0.78713471, "num_input_tokens_seen": 151426265, "step": 7009, "time_per_iteration": 2.727932929992676 }, { "auxiliary_loss_clip": 0.01100332, "auxiliary_loss_mlp": 0.01026647, "balance_loss_clip": 1.04222035, "balance_loss_mlp": 1.01932192, "epoch": 0.8429026633800276, "flos": 16143858120960.0, "grad_norm": 2.626842224428805, "language_loss": 0.64693362, "learning_rate": 2.5321250242910006e-07, "loss": 0.66820341, "num_input_tokens_seen": 151444180, "step": 7010, "time_per_iteration": 2.6815452575683594 }, { "auxiliary_loss_clip": 0.01167759, "auxiliary_loss_mlp": 0.01022877, "balance_loss_clip": 1.05040503, "balance_loss_mlp": 1.0156703, "epoch": 0.8430229062706668, "flos": 22198540400640.0, "grad_norm": 3.5882258077569635, "language_loss": 0.86544603, "learning_rate": 2.5283326425546493e-07, "loss": 0.88735235, "num_input_tokens_seen": 151463290, "step": 7011, "time_per_iteration": 2.6423068046569824 }, { "auxiliary_loss_clip": 0.01115241, "auxiliary_loss_mlp": 0.01028526, "balance_loss_clip": 1.04692888, "balance_loss_mlp": 1.02098036, "epoch": 0.8431431491613058, "flos": 35330317683840.0, "grad_norm": 2.565529189170172, "language_loss": 0.69343734, "learning_rate": 2.5245429112563443e-07, "loss": 0.71487504, "num_input_tokens_seen": 151483965, "step": 7012, "time_per_iteration": 2.767143964767456 }, { "auxiliary_loss_clip": 0.01152672, "auxiliary_loss_mlp": 0.01029005, "balance_loss_clip": 1.04909027, "balance_loss_mlp": 1.02147412, "epoch": 0.8432633920519449, "flos": 25812374808960.0, "grad_norm": 2.527621125097873, "language_loss": 0.81805754, "learning_rate": 2.5207558309709865e-07, "loss": 0.83987427, "num_input_tokens_seen": 151503700, "step": 7013, "time_per_iteration": 2.608116388320923 }, { "auxiliary_loss_clip": 0.01035661, "auxiliary_loss_mlp": 0.00701457, "balance_loss_clip": 1.01621044, "balance_loss_mlp": 0.99994642, "epoch": 0.8433836349425841, "flos": 64959531592320.0, "grad_norm": 0.6588140123285945, "language_loss": 0.56309903, "learning_rate": 2.516971402273065e-07, "loss": 0.5804702, "num_input_tokens_seen": 151569765, "step": 7014, "time_per_iteration": 3.335390329360962 }, { "auxiliary_loss_clip": 0.01136159, "auxiliary_loss_mlp": 0.01022426, "balance_loss_clip": 1.04415989, "balance_loss_mlp": 1.01508832, "epoch": 0.8435038778332231, "flos": 20229989483520.0, "grad_norm": 1.9008557019891534, "language_loss": 0.67402601, "learning_rate": 2.513189625736687e-07, "loss": 0.69561183, "num_input_tokens_seen": 151586660, "step": 7015, "time_per_iteration": 2.7120201587677 }, { "auxiliary_loss_clip": 0.01128294, "auxiliary_loss_mlp": 0.01027094, "balance_loss_clip": 1.04676294, "balance_loss_mlp": 1.01955998, "epoch": 0.8436241207238622, "flos": 20992229020800.0, "grad_norm": 2.319641604500551, "language_loss": 0.71681356, "learning_rate": 2.509410501935534e-07, "loss": 0.73836744, "num_input_tokens_seen": 151602295, "step": 7016, "time_per_iteration": 2.689664363861084 }, { "auxiliary_loss_clip": 0.01141705, "auxiliary_loss_mlp": 0.01027814, "balance_loss_clip": 1.04741478, "balance_loss_mlp": 1.0202142, "epoch": 0.8437443636145013, "flos": 14682257804160.0, "grad_norm": 2.624905765292672, "language_loss": 0.75798124, "learning_rate": 2.5056340314429116e-07, "loss": 0.77967644, "num_input_tokens_seen": 151619760, "step": 7017, "time_per_iteration": 2.6387758255004883 }, { "auxiliary_loss_clip": 0.01107156, "auxiliary_loss_mlp": 0.01022455, "balance_loss_clip": 1.04189181, "balance_loss_mlp": 1.01486766, "epoch": 0.8438646065051404, "flos": 21608814908160.0, "grad_norm": 2.7510105048938835, "language_loss": 0.80289876, "learning_rate": 2.5018602148316904e-07, "loss": 0.82419491, "num_input_tokens_seen": 151635795, "step": 7018, "time_per_iteration": 2.7156734466552734 }, { "auxiliary_loss_clip": 0.01167914, "auxiliary_loss_mlp": 0.01028905, "balance_loss_clip": 1.05072403, "balance_loss_mlp": 1.02182961, "epoch": 0.8439848493957794, "flos": 23289937194240.0, "grad_norm": 1.75774634163242, "language_loss": 0.80455041, "learning_rate": 2.498089052674359e-07, "loss": 0.8265186, "num_input_tokens_seen": 151653770, "step": 7019, "time_per_iteration": 2.579453229904175 }, { "auxiliary_loss_clip": 0.01154207, "auxiliary_loss_mlp": 0.01026604, "balance_loss_clip": 1.04951847, "balance_loss_mlp": 1.01889718, "epoch": 0.8441050922864186, "flos": 19719339782400.0, "grad_norm": 5.5539733979502035, "language_loss": 0.75594938, "learning_rate": 2.494320545543007e-07, "loss": 0.77775753, "num_input_tokens_seen": 151673340, "step": 7020, "time_per_iteration": 3.6039743423461914 }, { "auxiliary_loss_clip": 0.01172127, "auxiliary_loss_mlp": 0.01025342, "balance_loss_clip": 1.04997373, "balance_loss_mlp": 1.01771832, "epoch": 0.8442253351770577, "flos": 21835268202240.0, "grad_norm": 1.9739782219179227, "language_loss": 0.66635811, "learning_rate": 2.490554694009308e-07, "loss": 0.6883328, "num_input_tokens_seen": 151694205, "step": 7021, "time_per_iteration": 3.577657461166382 }, { "auxiliary_loss_clip": 0.01157554, "auxiliary_loss_mlp": 0.01026865, "balance_loss_clip": 1.04785979, "balance_loss_mlp": 1.019593, "epoch": 0.8443455780676967, "flos": 34346365447680.0, "grad_norm": 1.8082070720604781, "language_loss": 0.78374356, "learning_rate": 2.4867914986445426e-07, "loss": 0.80558777, "num_input_tokens_seen": 151716595, "step": 7022, "time_per_iteration": 3.6258537769317627 }, { "auxiliary_loss_clip": 0.01139292, "auxiliary_loss_mlp": 0.01024171, "balance_loss_clip": 1.0439713, "balance_loss_mlp": 1.01725698, "epoch": 0.8444658209583359, "flos": 48214599281280.0, "grad_norm": 2.431771492051577, "language_loss": 0.71134657, "learning_rate": 2.483030960019581e-07, "loss": 0.7329812, "num_input_tokens_seen": 151740525, "step": 7023, "time_per_iteration": 2.8584203720092773 }, { "auxiliary_loss_clip": 0.01018674, "auxiliary_loss_mlp": 0.01002052, "balance_loss_clip": 1.0142808, "balance_loss_mlp": 1.00106847, "epoch": 0.8445860638489749, "flos": 68484773105280.0, "grad_norm": 0.7266290213920338, "language_loss": 0.55443144, "learning_rate": 2.479273078704891e-07, "loss": 0.57463866, "num_input_tokens_seen": 151793890, "step": 7024, "time_per_iteration": 3.1027677059173584 }, { "auxiliary_loss_clip": 0.01013652, "auxiliary_loss_mlp": 0.01002792, "balance_loss_clip": 1.01904297, "balance_loss_mlp": 1.00183821, "epoch": 0.844706306739614, "flos": 62833331882880.0, "grad_norm": 0.7831192797097231, "language_loss": 0.64681631, "learning_rate": 2.475517855270552e-07, "loss": 0.66698074, "num_input_tokens_seen": 151853970, "step": 7025, "time_per_iteration": 4.139306545257568 }, { "auxiliary_loss_clip": 0.01167332, "auxiliary_loss_mlp": 0.01022493, "balance_loss_clip": 1.04895735, "balance_loss_mlp": 1.01546597, "epoch": 0.8448265496302532, "flos": 14976114969600.0, "grad_norm": 2.129789339216211, "language_loss": 0.7307995, "learning_rate": 2.4717652902862143e-07, "loss": 0.75269783, "num_input_tokens_seen": 151872945, "step": 7026, "time_per_iteration": 2.5822935104370117 }, { "auxiliary_loss_clip": 0.0114067, "auxiliary_loss_mlp": 0.01027057, "balance_loss_clip": 1.04657102, "balance_loss_mlp": 1.01977968, "epoch": 0.8449467925208922, "flos": 23441265192960.0, "grad_norm": 2.778721583014014, "language_loss": 0.8162992, "learning_rate": 2.4680153843211495e-07, "loss": 0.83797646, "num_input_tokens_seen": 151892875, "step": 7027, "time_per_iteration": 2.6396923065185547 }, { "auxiliary_loss_clip": 0.01137974, "auxiliary_loss_mlp": 0.01027545, "balance_loss_clip": 1.0498507, "balance_loss_mlp": 1.02030587, "epoch": 0.8450670354115313, "flos": 22748045639040.0, "grad_norm": 1.7663205810079865, "language_loss": 0.72280502, "learning_rate": 2.464268137944212e-07, "loss": 0.74446023, "num_input_tokens_seen": 151914170, "step": 7028, "time_per_iteration": 2.692377805709839 }, { "auxiliary_loss_clip": 0.01091624, "auxiliary_loss_mlp": 0.01030703, "balance_loss_clip": 1.04006314, "balance_loss_mlp": 1.02306199, "epoch": 0.8451872783021703, "flos": 29825571605760.0, "grad_norm": 2.3619478465280053, "language_loss": 0.783912, "learning_rate": 2.46052355172385e-07, "loss": 0.80513525, "num_input_tokens_seen": 151932210, "step": 7029, "time_per_iteration": 2.7822790145874023 }, { "auxiliary_loss_clip": 0.01171224, "auxiliary_loss_mlp": 0.01026919, "balance_loss_clip": 1.04976809, "balance_loss_mlp": 1.01953125, "epoch": 0.8453075211928095, "flos": 21870029589120.0, "grad_norm": 1.9797601020407634, "language_loss": 0.74800062, "learning_rate": 2.456781626228128e-07, "loss": 0.7699821, "num_input_tokens_seen": 151951715, "step": 7030, "time_per_iteration": 2.5593650341033936 }, { "auxiliary_loss_clip": 0.01020505, "auxiliary_loss_mlp": 0.0070144, "balance_loss_clip": 1.01889741, "balance_loss_mlp": 0.99990332, "epoch": 0.8454277640834486, "flos": 58751869288320.0, "grad_norm": 1.028280703529923, "language_loss": 0.66253567, "learning_rate": 2.453042362024675e-07, "loss": 0.67975509, "num_input_tokens_seen": 152004960, "step": 7031, "time_per_iteration": 3.3595998287200928 }, { "auxiliary_loss_clip": 0.01167573, "auxiliary_loss_mlp": 0.01025998, "balance_loss_clip": 1.04925537, "balance_loss_mlp": 1.01882744, "epoch": 0.8455480069740876, "flos": 27090076469760.0, "grad_norm": 1.5505120195738737, "language_loss": 0.73372698, "learning_rate": 2.449305759680751e-07, "loss": 0.75566268, "num_input_tokens_seen": 152026285, "step": 7032, "time_per_iteration": 2.6333272457122803 }, { "auxiliary_loss_clip": 0.01117268, "auxiliary_loss_mlp": 0.01022004, "balance_loss_clip": 1.04491305, "balance_loss_mlp": 1.01462841, "epoch": 0.8456682498647268, "flos": 27198670262400.0, "grad_norm": 1.4911965063006845, "language_loss": 0.7558248, "learning_rate": 2.445571819763188e-07, "loss": 0.77721751, "num_input_tokens_seen": 152048585, "step": 7033, "time_per_iteration": 2.7462751865386963 }, { "auxiliary_loss_clip": 0.01170764, "auxiliary_loss_mlp": 0.01023372, "balance_loss_clip": 1.05161762, "balance_loss_mlp": 1.01596916, "epoch": 0.8457884927553658, "flos": 20631901737600.0, "grad_norm": 1.722905440108279, "language_loss": 0.58191562, "learning_rate": 2.4418405428384227e-07, "loss": 0.60385698, "num_input_tokens_seen": 152068795, "step": 7034, "time_per_iteration": 2.5614211559295654 }, { "auxiliary_loss_clip": 0.01167626, "auxiliary_loss_mlp": 0.00711606, "balance_loss_clip": 1.04916644, "balance_loss_mlp": 1.00057387, "epoch": 0.8459087356460049, "flos": 15299023259520.0, "grad_norm": 1.675338728191355, "language_loss": 0.7174291, "learning_rate": 2.4381119294724864e-07, "loss": 0.73622143, "num_input_tokens_seen": 152086240, "step": 7035, "time_per_iteration": 2.6048688888549805 }, { "auxiliary_loss_clip": 0.01169952, "auxiliary_loss_mlp": 0.01026198, "balance_loss_clip": 1.05038667, "balance_loss_mlp": 1.01949883, "epoch": 0.846028978536644, "flos": 18843155326080.0, "grad_norm": 2.15472599951979, "language_loss": 0.54305577, "learning_rate": 2.434385980231004e-07, "loss": 0.56501734, "num_input_tokens_seen": 152105080, "step": 7036, "time_per_iteration": 2.5531680583953857 }, { "auxiliary_loss_clip": 0.01152266, "auxiliary_loss_mlp": 0.01021681, "balance_loss_clip": 1.04751015, "balance_loss_mlp": 1.01455784, "epoch": 0.8461492214272831, "flos": 52661740285440.0, "grad_norm": 1.633429068514964, "language_loss": 0.65739596, "learning_rate": 2.4306626956792043e-07, "loss": 0.67913544, "num_input_tokens_seen": 152130025, "step": 7037, "time_per_iteration": 2.8827965259552 }, { "auxiliary_loss_clip": 0.01151905, "auxiliary_loss_mlp": 0.01024793, "balance_loss_clip": 1.04563653, "balance_loss_mlp": 1.01817656, "epoch": 0.8462694643179222, "flos": 18588405093120.0, "grad_norm": 1.6458998889172003, "language_loss": 0.75801921, "learning_rate": 2.4269420763819017e-07, "loss": 0.77978617, "num_input_tokens_seen": 152148070, "step": 7038, "time_per_iteration": 2.5944325923919678 }, { "auxiliary_loss_clip": 0.01149673, "auxiliary_loss_mlp": 0.01023425, "balance_loss_clip": 1.04724288, "balance_loss_mlp": 1.01605821, "epoch": 0.8463897072085613, "flos": 24387080163840.0, "grad_norm": 2.700117046817582, "language_loss": 0.83687794, "learning_rate": 2.4232241229035223e-07, "loss": 0.85860896, "num_input_tokens_seen": 152165825, "step": 7039, "time_per_iteration": 2.6369104385375977 }, { "auxiliary_loss_clip": 0.01060352, "auxiliary_loss_mlp": 0.01000659, "balance_loss_clip": 1.01920462, "balance_loss_mlp": 0.99975884, "epoch": 0.8465099500992004, "flos": 68702140258560.0, "grad_norm": 0.7461002037361629, "language_loss": 0.56718427, "learning_rate": 2.419508835808064e-07, "loss": 0.58779436, "num_input_tokens_seen": 152222380, "step": 7040, "time_per_iteration": 3.156611442565918 }, { "auxiliary_loss_clip": 0.01138644, "auxiliary_loss_mlp": 0.01024951, "balance_loss_clip": 1.0477432, "balance_loss_mlp": 1.01757777, "epoch": 0.8466301929898394, "flos": 13735724561280.0, "grad_norm": 6.028670005108096, "language_loss": 0.63182378, "learning_rate": 2.415796215659134e-07, "loss": 0.65345973, "num_input_tokens_seen": 152239085, "step": 7041, "time_per_iteration": 2.608649969100952 }, { "auxiliary_loss_clip": 0.01124845, "auxiliary_loss_mlp": 0.01029663, "balance_loss_clip": 1.04357004, "balance_loss_mlp": 1.02210188, "epoch": 0.8467504358804786, "flos": 19241260738560.0, "grad_norm": 2.1042770324597386, "language_loss": 0.77016711, "learning_rate": 2.412086263019939e-07, "loss": 0.79171216, "num_input_tokens_seen": 152257110, "step": 7042, "time_per_iteration": 2.686099052429199 }, { "auxiliary_loss_clip": 0.01166033, "auxiliary_loss_mlp": 0.01025404, "balance_loss_clip": 1.05125487, "balance_loss_mlp": 1.01883245, "epoch": 0.8468706787711177, "flos": 21324115710720.0, "grad_norm": 2.105541773645798, "language_loss": 0.80061197, "learning_rate": 2.408378978453276e-07, "loss": 0.82252634, "num_input_tokens_seen": 152277230, "step": 7043, "time_per_iteration": 2.59062123298645 }, { "auxiliary_loss_clip": 0.01058676, "auxiliary_loss_mlp": 0.01002535, "balance_loss_clip": 1.01803052, "balance_loss_mlp": 1.00170088, "epoch": 0.8469909216617567, "flos": 64877439058560.0, "grad_norm": 0.812787659974132, "language_loss": 0.64001435, "learning_rate": 2.404674362521533e-07, "loss": 0.66062647, "num_input_tokens_seen": 152335725, "step": 7044, "time_per_iteration": 3.1293716430664062 }, { "auxiliary_loss_clip": 0.01149631, "auxiliary_loss_mlp": 0.01029959, "balance_loss_clip": 1.04814029, "balance_loss_mlp": 1.02273226, "epoch": 0.8471111645523959, "flos": 19280583152640.0, "grad_norm": 2.411252334133554, "language_loss": 0.7433024, "learning_rate": 2.4009724157866997e-07, "loss": 0.76509827, "num_input_tokens_seen": 152352785, "step": 7045, "time_per_iteration": 2.648113965988159 }, { "auxiliary_loss_clip": 0.01166686, "auxiliary_loss_mlp": 0.01024945, "balance_loss_clip": 1.04916143, "balance_loss_mlp": 1.01828408, "epoch": 0.8472314074430349, "flos": 22015826893440.0, "grad_norm": 2.1400783044386573, "language_loss": 0.7654379, "learning_rate": 2.3972731388103564e-07, "loss": 0.78735417, "num_input_tokens_seen": 152371265, "step": 7046, "time_per_iteration": 2.598475456237793 }, { "auxiliary_loss_clip": 0.01000261, "auxiliary_loss_mlp": 0.01000596, "balance_loss_clip": 1.01715183, "balance_loss_mlp": 0.99968988, "epoch": 0.847351650333674, "flos": 57882580243200.0, "grad_norm": 0.7993515251610093, "language_loss": 0.62324572, "learning_rate": 2.393576532153687e-07, "loss": 0.64325428, "num_input_tokens_seen": 152435050, "step": 7047, "time_per_iteration": 5.294727802276611 }, { "auxiliary_loss_clip": 0.01055142, "auxiliary_loss_mlp": 0.01000644, "balance_loss_clip": 1.01711583, "balance_loss_mlp": 0.99977362, "epoch": 0.8474718932243132, "flos": 41284238313600.0, "grad_norm": 0.9278162783274857, "language_loss": 0.57729411, "learning_rate": 2.389882596377453e-07, "loss": 0.59785199, "num_input_tokens_seen": 152489315, "step": 7048, "time_per_iteration": 4.65644383430481 }, { "auxiliary_loss_clip": 0.01165991, "auxiliary_loss_mlp": 0.0102719, "balance_loss_clip": 1.04754758, "balance_loss_mlp": 1.01972699, "epoch": 0.8475921361149522, "flos": 38180906974080.0, "grad_norm": 1.9513506117741277, "language_loss": 0.76427591, "learning_rate": 2.386191332042031e-07, "loss": 0.78620774, "num_input_tokens_seen": 152511210, "step": 7049, "time_per_iteration": 2.7309203147888184 }, { "auxiliary_loss_clip": 0.01172835, "auxiliary_loss_mlp": 0.01025139, "balance_loss_clip": 1.0506773, "balance_loss_mlp": 1.01700342, "epoch": 0.8477123790055913, "flos": 25375054723200.0, "grad_norm": 1.8073382939174183, "language_loss": 0.72933573, "learning_rate": 2.3825027397073794e-07, "loss": 0.75131547, "num_input_tokens_seen": 152531685, "step": 7050, "time_per_iteration": 2.6012768745422363 }, { "auxiliary_loss_clip": 0.01148775, "auxiliary_loss_mlp": 0.01023039, "balance_loss_clip": 1.04907942, "balance_loss_mlp": 1.01620579, "epoch": 0.8478326218962304, "flos": 30225185389440.0, "grad_norm": 2.7938091767553135, "language_loss": 0.66654801, "learning_rate": 2.3788168199330515e-07, "loss": 0.68826616, "num_input_tokens_seen": 152553245, "step": 7051, "time_per_iteration": 3.6137516498565674 }, { "auxiliary_loss_clip": 0.01121927, "auxiliary_loss_mlp": 0.01023918, "balance_loss_clip": 1.04181743, "balance_loss_mlp": 1.01725423, "epoch": 0.8479528647868695, "flos": 38213800853760.0, "grad_norm": 1.8742559081968484, "language_loss": 0.72563839, "learning_rate": 2.3751335732782074e-07, "loss": 0.7470969, "num_input_tokens_seen": 152574505, "step": 7052, "time_per_iteration": 2.775362014770508 }, { "auxiliary_loss_clip": 0.01152446, "auxiliary_loss_mlp": 0.01026946, "balance_loss_clip": 1.04989648, "balance_loss_mlp": 1.01949871, "epoch": 0.8480731076775085, "flos": 20957790856320.0, "grad_norm": 2.2394254153027324, "language_loss": 0.79551601, "learning_rate": 2.371453000301582e-07, "loss": 0.81730998, "num_input_tokens_seen": 152593190, "step": 7053, "time_per_iteration": 2.6130290031433105 }, { "auxiliary_loss_clip": 0.01119562, "auxiliary_loss_mlp": 0.01023058, "balance_loss_clip": 1.04626954, "balance_loss_mlp": 1.01615846, "epoch": 0.8481933505681477, "flos": 32596510487040.0, "grad_norm": 8.160182143610864, "language_loss": 0.74612916, "learning_rate": 2.3677751015615222e-07, "loss": 0.76755536, "num_input_tokens_seen": 152615265, "step": 7054, "time_per_iteration": 2.7865443229675293 }, { "auxiliary_loss_clip": 0.01125414, "auxiliary_loss_mlp": 0.01022287, "balance_loss_clip": 1.04302132, "balance_loss_mlp": 1.01520562, "epoch": 0.8483135934587868, "flos": 20741177888640.0, "grad_norm": 1.985299213643856, "language_loss": 0.855564, "learning_rate": 2.3640998776159593e-07, "loss": 0.87704098, "num_input_tokens_seen": 152632770, "step": 7055, "time_per_iteration": 2.6549031734466553 }, { "auxiliary_loss_clip": 0.01139667, "auxiliary_loss_mlp": 0.01027326, "balance_loss_clip": 1.04778874, "balance_loss_mlp": 1.02061784, "epoch": 0.8484338363494258, "flos": 21653057485440.0, "grad_norm": 1.7176716584259222, "language_loss": 0.81506217, "learning_rate": 2.3604273290224253e-07, "loss": 0.83673215, "num_input_tokens_seen": 152653485, "step": 7056, "time_per_iteration": 2.6275153160095215 }, { "auxiliary_loss_clip": 0.01140137, "auxiliary_loss_mlp": 0.01034521, "balance_loss_clip": 1.04930484, "balance_loss_mlp": 1.02697492, "epoch": 0.848554079240065, "flos": 15013964926080.0, "grad_norm": 1.8221249983200767, "language_loss": 0.74685466, "learning_rate": 2.356757456338039e-07, "loss": 0.7686013, "num_input_tokens_seen": 152670970, "step": 7057, "time_per_iteration": 2.6470556259155273 }, { "auxiliary_loss_clip": 0.01040203, "auxiliary_loss_mlp": 0.01002493, "balance_loss_clip": 1.0155499, "balance_loss_mlp": 1.00161695, "epoch": 0.848674322130704, "flos": 68060453742720.0, "grad_norm": 0.755894255680084, "language_loss": 0.59027857, "learning_rate": 2.3530902601195147e-07, "loss": 0.61070549, "num_input_tokens_seen": 152739460, "step": 7058, "time_per_iteration": 3.3302299976348877 }, { "auxiliary_loss_clip": 0.01151365, "auxiliary_loss_mlp": 0.01027382, "balance_loss_clip": 1.04828072, "balance_loss_mlp": 1.02012181, "epoch": 0.8487945650213431, "flos": 18475788977280.0, "grad_norm": 3.9233810419112696, "language_loss": 0.78764427, "learning_rate": 2.34942574092317e-07, "loss": 0.80943179, "num_input_tokens_seen": 152754710, "step": 7059, "time_per_iteration": 2.583956718444824 }, { "auxiliary_loss_clip": 0.01158258, "auxiliary_loss_mlp": 0.0103181, "balance_loss_clip": 1.04918981, "balance_loss_mlp": 1.02452683, "epoch": 0.8489148079119821, "flos": 23473189405440.0, "grad_norm": 2.318610212896562, "language_loss": 0.7632919, "learning_rate": 2.3457638993049045e-07, "loss": 0.78519261, "num_input_tokens_seen": 152772700, "step": 7060, "time_per_iteration": 2.660627603530884 }, { "auxiliary_loss_clip": 0.01095833, "auxiliary_loss_mlp": 0.01028333, "balance_loss_clip": 1.04664183, "balance_loss_mlp": 1.02038753, "epoch": 0.8490350508026213, "flos": 19937604775680.0, "grad_norm": 1.922853701501111, "language_loss": 0.63971496, "learning_rate": 2.3421047358202252e-07, "loss": 0.66095662, "num_input_tokens_seen": 152791550, "step": 7061, "time_per_iteration": 2.7132632732391357 }, { "auxiliary_loss_clip": 0.01156333, "auxiliary_loss_mlp": 0.01026945, "balance_loss_clip": 1.04951334, "balance_loss_mlp": 1.0196259, "epoch": 0.8491552936932604, "flos": 24279958828800.0, "grad_norm": 2.5902646592952623, "language_loss": 0.83369726, "learning_rate": 2.3384482510242144e-07, "loss": 0.85553002, "num_input_tokens_seen": 152809410, "step": 7062, "time_per_iteration": 2.670380115509033 }, { "auxiliary_loss_clip": 0.0116946, "auxiliary_loss_mlp": 0.01025434, "balance_loss_clip": 1.04880774, "balance_loss_mlp": 1.01773942, "epoch": 0.8492755365838994, "flos": 22522526098560.0, "grad_norm": 2.1925786910980594, "language_loss": 0.77378154, "learning_rate": 2.3347944454715575e-07, "loss": 0.79573047, "num_input_tokens_seen": 152825800, "step": 7063, "time_per_iteration": 2.547727584838867 }, { "auxiliary_loss_clip": 0.01171726, "auxiliary_loss_mlp": 0.01021046, "balance_loss_clip": 1.05007827, "balance_loss_mlp": 1.01386356, "epoch": 0.8493957794745386, "flos": 26980441182720.0, "grad_norm": 1.9738755464731177, "language_loss": 0.67334771, "learning_rate": 2.331143319716542e-07, "loss": 0.69527543, "num_input_tokens_seen": 152845330, "step": 7064, "time_per_iteration": 2.7526144981384277 }, { "auxiliary_loss_clip": 0.01128684, "auxiliary_loss_mlp": 0.01027566, "balance_loss_clip": 1.0474416, "balance_loss_mlp": 1.01984429, "epoch": 0.8495160223651776, "flos": 29861985018240.0, "grad_norm": 2.2000561491671338, "language_loss": 0.65763712, "learning_rate": 2.3274948743130363e-07, "loss": 0.67919964, "num_input_tokens_seen": 152865165, "step": 7065, "time_per_iteration": 2.694145679473877 }, { "auxiliary_loss_clip": 0.0117056, "auxiliary_loss_mlp": 0.01029795, "balance_loss_clip": 1.04910827, "balance_loss_mlp": 1.02232027, "epoch": 0.8496362652558167, "flos": 23075443128960.0, "grad_norm": 1.7136796201710616, "language_loss": 0.79605919, "learning_rate": 2.3238491098145085e-07, "loss": 0.81806278, "num_input_tokens_seen": 152884695, "step": 7066, "time_per_iteration": 2.5912837982177734 }, { "auxiliary_loss_clip": 0.0115343, "auxiliary_loss_mlp": 0.01024871, "balance_loss_clip": 1.04840493, "balance_loss_mlp": 1.01762354, "epoch": 0.8497565081464559, "flos": 14609107756800.0, "grad_norm": 2.5594835787537598, "language_loss": 0.7304697, "learning_rate": 2.3202060267740141e-07, "loss": 0.7522527, "num_input_tokens_seen": 152902220, "step": 7067, "time_per_iteration": 2.5756261348724365 }, { "auxiliary_loss_clip": 0.01099422, "auxiliary_loss_mlp": 0.010254, "balance_loss_clip": 1.04018354, "balance_loss_mlp": 1.01857805, "epoch": 0.8498767510370949, "flos": 21136446126720.0, "grad_norm": 2.429227727057168, "language_loss": 0.76920438, "learning_rate": 2.3165656257442044e-07, "loss": 0.7904526, "num_input_tokens_seen": 152920740, "step": 7068, "time_per_iteration": 2.7618649005889893 }, { "auxiliary_loss_clip": 0.01150531, "auxiliary_loss_mlp": 0.01027328, "balance_loss_clip": 1.04819465, "balance_loss_mlp": 1.02038455, "epoch": 0.849996993927734, "flos": 23654538195840.0, "grad_norm": 2.8932165344429195, "language_loss": 0.90500641, "learning_rate": 2.31292790727734e-07, "loss": 0.92678499, "num_input_tokens_seen": 152938305, "step": 7069, "time_per_iteration": 2.595524311065674 }, { "auxiliary_loss_clip": 0.01164723, "auxiliary_loss_mlp": 0.01026275, "balance_loss_clip": 1.04637969, "balance_loss_mlp": 1.01956928, "epoch": 0.8501172368183731, "flos": 20558069331840.0, "grad_norm": 2.4402666816582954, "language_loss": 0.80601889, "learning_rate": 2.3092928719252392e-07, "loss": 0.82792884, "num_input_tokens_seen": 152956705, "step": 7070, "time_per_iteration": 2.619694948196411 }, { "auxiliary_loss_clip": 0.0115144, "auxiliary_loss_mlp": 0.01026549, "balance_loss_clip": 1.04760253, "balance_loss_mlp": 1.01925671, "epoch": 0.8502374797090122, "flos": 22272624201600.0, "grad_norm": 2.3416528397046137, "language_loss": 0.78540951, "learning_rate": 2.3056605202393475e-07, "loss": 0.80718946, "num_input_tokens_seen": 152974265, "step": 7071, "time_per_iteration": 2.5901827812194824 }, { "auxiliary_loss_clip": 0.01147489, "auxiliary_loss_mlp": 0.00711808, "balance_loss_clip": 1.0448277, "balance_loss_mlp": 1.00055885, "epoch": 0.8503577225996513, "flos": 23659817495040.0, "grad_norm": 1.9049511640122796, "language_loss": 0.66864932, "learning_rate": 2.3020308527706888e-07, "loss": 0.68724227, "num_input_tokens_seen": 152993680, "step": 7072, "time_per_iteration": 2.619182825088501 }, { "auxiliary_loss_clip": 0.01141463, "auxiliary_loss_mlp": 0.01027801, "balance_loss_clip": 1.04479742, "balance_loss_mlp": 1.02131867, "epoch": 0.8504779654902904, "flos": 26758513002240.0, "grad_norm": 1.808071705270611, "language_loss": 0.8917135, "learning_rate": 2.2984038700698715e-07, "loss": 0.91340613, "num_input_tokens_seen": 153012990, "step": 7073, "time_per_iteration": 4.580150842666626 }, { "auxiliary_loss_clip": 0.01152136, "auxiliary_loss_mlp": 0.01022532, "balance_loss_clip": 1.05084372, "balance_loss_mlp": 1.01549315, "epoch": 0.8505982083809295, "flos": 26468247196800.0, "grad_norm": 1.8580132870995287, "language_loss": 0.79180384, "learning_rate": 2.2947795726871222e-07, "loss": 0.81355053, "num_input_tokens_seen": 153034015, "step": 7074, "time_per_iteration": 3.615010976791382 }, { "auxiliary_loss_clip": 0.01153301, "auxiliary_loss_mlp": 0.00711463, "balance_loss_clip": 1.05149484, "balance_loss_mlp": 1.00065875, "epoch": 0.8507184512715685, "flos": 20303390926080.0, "grad_norm": 1.8655276058369887, "language_loss": 0.85894573, "learning_rate": 2.2911579611722253e-07, "loss": 0.87759334, "num_input_tokens_seen": 153053160, "step": 7075, "time_per_iteration": 2.6424400806427 }, { "auxiliary_loss_clip": 0.01134655, "auxiliary_loss_mlp": 0.01024806, "balance_loss_clip": 1.04643023, "balance_loss_mlp": 1.01802921, "epoch": 0.8508386941622077, "flos": 19025186474880.0, "grad_norm": 1.8952723997179972, "language_loss": 0.87439442, "learning_rate": 2.2875390360745905e-07, "loss": 0.89598906, "num_input_tokens_seen": 153072565, "step": 7076, "time_per_iteration": 2.67035174369812 }, { "auxiliary_loss_clip": 0.0112903, "auxiliary_loss_mlp": 0.01024984, "balance_loss_clip": 1.04789639, "balance_loss_mlp": 1.01789451, "epoch": 0.8509589370528468, "flos": 16433405654400.0, "grad_norm": 1.9184655836228774, "language_loss": 0.77536201, "learning_rate": 2.2839227979432008e-07, "loss": 0.79690218, "num_input_tokens_seen": 153090215, "step": 7077, "time_per_iteration": 3.5348615646362305 }, { "auxiliary_loss_clip": 0.01139188, "auxiliary_loss_mlp": 0.01023474, "balance_loss_clip": 1.04522693, "balance_loss_mlp": 1.01623774, "epoch": 0.8510791799434858, "flos": 18259714713600.0, "grad_norm": 1.9764089118104027, "language_loss": 0.84962392, "learning_rate": 2.2803092473266373e-07, "loss": 0.87125051, "num_input_tokens_seen": 153107740, "step": 7078, "time_per_iteration": 2.628018617630005 }, { "auxiliary_loss_clip": 0.01171389, "auxiliary_loss_mlp": 0.01027125, "balance_loss_clip": 1.05079281, "balance_loss_mlp": 1.02017856, "epoch": 0.851199422834125, "flos": 23441372933760.0, "grad_norm": 2.2680208038243093, "language_loss": 0.86815983, "learning_rate": 2.2766983847730724e-07, "loss": 0.89014494, "num_input_tokens_seen": 153127410, "step": 7079, "time_per_iteration": 2.5816168785095215 }, { "auxiliary_loss_clip": 0.01131711, "auxiliary_loss_mlp": 0.0102419, "balance_loss_clip": 1.04625511, "balance_loss_mlp": 1.01654863, "epoch": 0.851319665724764, "flos": 16289404030080.0, "grad_norm": 2.9165095450240703, "language_loss": 0.66708106, "learning_rate": 2.2730902108302663e-07, "loss": 0.68864006, "num_input_tokens_seen": 153144325, "step": 7080, "time_per_iteration": 2.6713430881500244 }, { "auxiliary_loss_clip": 0.01131053, "auxiliary_loss_mlp": 0.01028672, "balance_loss_clip": 1.04380107, "balance_loss_mlp": 1.02133453, "epoch": 0.8514399086154031, "flos": 18989347680000.0, "grad_norm": 1.5953089323840004, "language_loss": 0.68450308, "learning_rate": 2.269484726045583e-07, "loss": 0.70610034, "num_input_tokens_seen": 153163240, "step": 7081, "time_per_iteration": 2.667404890060425 }, { "auxiliary_loss_clip": 0.01124557, "auxiliary_loss_mlp": 0.01028193, "balance_loss_clip": 1.04553914, "balance_loss_mlp": 1.02154076, "epoch": 0.8515601515060423, "flos": 24571194301440.0, "grad_norm": 1.7558250966883917, "language_loss": 0.79243207, "learning_rate": 2.2658819309659672e-07, "loss": 0.81395954, "num_input_tokens_seen": 153183440, "step": 7082, "time_per_iteration": 2.7269439697265625 }, { "auxiliary_loss_clip": 0.01135059, "auxiliary_loss_mlp": 0.01023938, "balance_loss_clip": 1.04803801, "balance_loss_mlp": 1.01737869, "epoch": 0.8516803943966813, "flos": 19529443555200.0, "grad_norm": 1.9028046136961145, "language_loss": 0.84907985, "learning_rate": 2.2622818261379706e-07, "loss": 0.87066984, "num_input_tokens_seen": 153200460, "step": 7083, "time_per_iteration": 2.662860155105591 }, { "auxiliary_loss_clip": 0.0113604, "auxiliary_loss_mlp": 0.01026802, "balance_loss_clip": 1.04610181, "balance_loss_mlp": 1.01950896, "epoch": 0.8518006372873204, "flos": 20265792364800.0, "grad_norm": 2.339578247341032, "language_loss": 0.74781632, "learning_rate": 2.2586844121077142e-07, "loss": 0.76944482, "num_input_tokens_seen": 153218970, "step": 7084, "time_per_iteration": 2.622319459915161 }, { "auxiliary_loss_clip": 0.01105684, "auxiliary_loss_mlp": 0.01024611, "balance_loss_clip": 1.04056156, "balance_loss_mlp": 1.01748276, "epoch": 0.8519208801779595, "flos": 24133227770880.0, "grad_norm": 2.0844508632560075, "language_loss": 0.71929365, "learning_rate": 2.2550896894209215e-07, "loss": 0.74059665, "num_input_tokens_seen": 153238485, "step": 7085, "time_per_iteration": 2.790332078933716 }, { "auxiliary_loss_clip": 0.01008914, "auxiliary_loss_mlp": 0.01000687, "balance_loss_clip": 1.01676393, "balance_loss_mlp": 0.99976939, "epoch": 0.8520411230685986, "flos": 63035223252480.0, "grad_norm": 0.6789984729983506, "language_loss": 0.56553727, "learning_rate": 2.2514976586229184e-07, "loss": 0.58563334, "num_input_tokens_seen": 153306430, "step": 7086, "time_per_iteration": 3.494302749633789 }, { "auxiliary_loss_clip": 0.01057089, "auxiliary_loss_mlp": 0.01002081, "balance_loss_clip": 1.0169332, "balance_loss_mlp": 1.00121105, "epoch": 0.8521613659592376, "flos": 65836865283840.0, "grad_norm": 0.7525330706973135, "language_loss": 0.54646736, "learning_rate": 2.247908320258609e-07, "loss": 0.56705904, "num_input_tokens_seen": 153366520, "step": 7087, "time_per_iteration": 3.2617738246917725 }, { "auxiliary_loss_clip": 0.01101111, "auxiliary_loss_mlp": 0.01031867, "balance_loss_clip": 1.04480267, "balance_loss_mlp": 1.0246315, "epoch": 0.8522816088498768, "flos": 23112323418240.0, "grad_norm": 2.6042936809305233, "language_loss": 0.79803073, "learning_rate": 2.2443216748724914e-07, "loss": 0.81936055, "num_input_tokens_seen": 153387230, "step": 7088, "time_per_iteration": 2.737955331802368 }, { "auxiliary_loss_clip": 0.01158314, "auxiliary_loss_mlp": 0.0071161, "balance_loss_clip": 1.05133891, "balance_loss_mlp": 1.00053084, "epoch": 0.8524018517405159, "flos": 31758140073600.0, "grad_norm": 1.9815922846174487, "language_loss": 0.74083674, "learning_rate": 2.2407377230086588e-07, "loss": 0.75953603, "num_input_tokens_seen": 153409585, "step": 7089, "time_per_iteration": 2.6894822120666504 }, { "auxiliary_loss_clip": 0.01115759, "auxiliary_loss_mlp": 0.01027677, "balance_loss_clip": 1.04361534, "balance_loss_mlp": 1.02005339, "epoch": 0.8525220946311549, "flos": 18690318956160.0, "grad_norm": 2.246050520971906, "language_loss": 0.83553362, "learning_rate": 2.23715646521079e-07, "loss": 0.85696793, "num_input_tokens_seen": 153427105, "step": 7090, "time_per_iteration": 2.7888810634613037 }, { "auxiliary_loss_clip": 0.01157825, "auxiliary_loss_mlp": 0.00711941, "balance_loss_clip": 1.04829979, "balance_loss_mlp": 1.00051725, "epoch": 0.852642337521794, "flos": 21793216354560.0, "grad_norm": 2.088292222231757, "language_loss": 0.84080124, "learning_rate": 2.2335779020221724e-07, "loss": 0.85949886, "num_input_tokens_seen": 153443725, "step": 7091, "time_per_iteration": 2.6058456897735596 }, { "auxiliary_loss_clip": 0.01065401, "auxiliary_loss_mlp": 0.01000615, "balance_loss_clip": 1.0304687, "balance_loss_mlp": 0.99970943, "epoch": 0.8527625804124331, "flos": 69040132260480.0, "grad_norm": 0.8029022204028522, "language_loss": 0.56417722, "learning_rate": 2.2300020339856497e-07, "loss": 0.58483738, "num_input_tokens_seen": 153506410, "step": 7092, "time_per_iteration": 3.285015344619751 }, { "auxiliary_loss_clip": 0.01133001, "auxiliary_loss_mlp": 0.01020556, "balance_loss_clip": 1.04425132, "balance_loss_mlp": 1.01357293, "epoch": 0.8528828233030722, "flos": 26979399688320.0, "grad_norm": 2.7077771980078493, "language_loss": 0.77879751, "learning_rate": 2.2264288616436966e-07, "loss": 0.80033302, "num_input_tokens_seen": 153526665, "step": 7093, "time_per_iteration": 2.674546718597412 }, { "auxiliary_loss_clip": 0.01130615, "auxiliary_loss_mlp": 0.01024496, "balance_loss_clip": 1.04543924, "balance_loss_mlp": 1.01668167, "epoch": 0.8530030661937112, "flos": 17487598936320.0, "grad_norm": 1.9249433281368955, "language_loss": 0.72857308, "learning_rate": 2.222858385538351e-07, "loss": 0.75012416, "num_input_tokens_seen": 153543465, "step": 7094, "time_per_iteration": 2.635683059692383 }, { "auxiliary_loss_clip": 0.01151002, "auxiliary_loss_mlp": 0.01022638, "balance_loss_clip": 1.04743075, "balance_loss_mlp": 1.01575327, "epoch": 0.8531233090843504, "flos": 22160798184960.0, "grad_norm": 2.6448325635006924, "language_loss": 0.67907745, "learning_rate": 2.2192906062112527e-07, "loss": 0.70081383, "num_input_tokens_seen": 153563340, "step": 7095, "time_per_iteration": 2.6078710556030273 }, { "auxiliary_loss_clip": 0.01167196, "auxiliary_loss_mlp": 0.01026179, "balance_loss_clip": 1.04823506, "balance_loss_mlp": 1.0187763, "epoch": 0.8532435519749895, "flos": 37635388145280.0, "grad_norm": 1.8998417302721196, "language_loss": 0.71183431, "learning_rate": 2.2157255242036377e-07, "loss": 0.73376799, "num_input_tokens_seen": 153587005, "step": 7096, "time_per_iteration": 2.7071046829223633 }, { "auxiliary_loss_clip": 0.01118108, "auxiliary_loss_mlp": 0.01027028, "balance_loss_clip": 1.04455101, "balance_loss_mlp": 1.02023053, "epoch": 0.8533637948656285, "flos": 21398163598080.0, "grad_norm": 1.8241337678798524, "language_loss": 0.74764836, "learning_rate": 2.2121631400563135e-07, "loss": 0.76909971, "num_input_tokens_seen": 153606835, "step": 7097, "time_per_iteration": 2.6950175762176514 }, { "auxiliary_loss_clip": 0.01052355, "auxiliary_loss_mlp": 0.01003879, "balance_loss_clip": 1.01616836, "balance_loss_mlp": 1.0030266, "epoch": 0.8534840377562677, "flos": 53345122490880.0, "grad_norm": 0.7628297558891334, "language_loss": 0.52917117, "learning_rate": 2.208603454309701e-07, "loss": 0.54973352, "num_input_tokens_seen": 153664925, "step": 7098, "time_per_iteration": 3.174194812774658 }, { "auxiliary_loss_clip": 0.01105623, "auxiliary_loss_mlp": 0.01022655, "balance_loss_clip": 1.04570043, "balance_loss_mlp": 1.01574719, "epoch": 0.8536042806469067, "flos": 20814148368000.0, "grad_norm": 2.0987845539280325, "language_loss": 0.71252966, "learning_rate": 2.2050464675037994e-07, "loss": 0.73381239, "num_input_tokens_seen": 153683550, "step": 7099, "time_per_iteration": 4.579834461212158 }, { "auxiliary_loss_clip": 0.01136784, "auxiliary_loss_mlp": 0.01026155, "balance_loss_clip": 1.04547644, "balance_loss_mlp": 1.01849294, "epoch": 0.8537245235375458, "flos": 24681368292480.0, "grad_norm": 2.487915471670925, "language_loss": 0.73108625, "learning_rate": 2.2014921801782016e-07, "loss": 0.75271571, "num_input_tokens_seen": 153703040, "step": 7100, "time_per_iteration": 3.6230056285858154 }, { "auxiliary_loss_clip": 0.01138081, "auxiliary_loss_mlp": 0.01025472, "balance_loss_clip": 1.04396963, "balance_loss_mlp": 1.01804519, "epoch": 0.853844766428185, "flos": 24384817607040.0, "grad_norm": 2.703138170092071, "language_loss": 0.74333823, "learning_rate": 2.1979405928720872e-07, "loss": 0.76497376, "num_input_tokens_seen": 153722695, "step": 7101, "time_per_iteration": 2.6720597743988037 }, { "auxiliary_loss_clip": 0.01138953, "auxiliary_loss_mlp": 0.01024157, "balance_loss_clip": 1.04587853, "balance_loss_mlp": 1.01728463, "epoch": 0.853965009318824, "flos": 20955707867520.0, "grad_norm": 1.5154361918699726, "language_loss": 0.79623735, "learning_rate": 2.1943917061242257e-07, "loss": 0.81786847, "num_input_tokens_seen": 153742550, "step": 7102, "time_per_iteration": 2.6998159885406494 }, { "auxiliary_loss_clip": 0.01159734, "auxiliary_loss_mlp": 0.00712378, "balance_loss_clip": 1.04855132, "balance_loss_mlp": 1.00059962, "epoch": 0.8540852522094631, "flos": 24201816791040.0, "grad_norm": 1.672621213302767, "language_loss": 0.66722393, "learning_rate": 2.1908455204729903e-07, "loss": 0.68594503, "num_input_tokens_seen": 153761700, "step": 7103, "time_per_iteration": 3.5708565711975098 }, { "auxiliary_loss_clip": 0.01137196, "auxiliary_loss_mlp": 0.0102771, "balance_loss_clip": 1.04531479, "balance_loss_mlp": 1.02064693, "epoch": 0.8542054951001022, "flos": 25082921410560.0, "grad_norm": 2.1480997083387208, "language_loss": 0.78743875, "learning_rate": 2.1873020364563265e-07, "loss": 0.80908787, "num_input_tokens_seen": 153780765, "step": 7104, "time_per_iteration": 2.7114431858062744 }, { "auxiliary_loss_clip": 0.01152618, "auxiliary_loss_mlp": 0.01026511, "balance_loss_clip": 1.04888332, "balance_loss_mlp": 1.0193789, "epoch": 0.8543257379907413, "flos": 24316551809280.0, "grad_norm": 2.3741261758824455, "language_loss": 0.7625078, "learning_rate": 2.183761254611789e-07, "loss": 0.78429908, "num_input_tokens_seen": 153801090, "step": 7105, "time_per_iteration": 2.8574020862579346 }, { "auxiliary_loss_clip": 0.01153287, "auxiliary_loss_mlp": 0.01027078, "balance_loss_clip": 1.050246, "balance_loss_mlp": 1.0201726, "epoch": 0.8544459808813804, "flos": 55286630467200.0, "grad_norm": 2.035152674241665, "language_loss": 0.70441759, "learning_rate": 2.1802231754764987e-07, "loss": 0.7262212, "num_input_tokens_seen": 153826530, "step": 7106, "time_per_iteration": 2.8935325145721436 }, { "auxiliary_loss_clip": 0.01137954, "auxiliary_loss_mlp": 0.0102702, "balance_loss_clip": 1.0445292, "balance_loss_mlp": 1.01919425, "epoch": 0.8545662237720195, "flos": 25776248705280.0, "grad_norm": 2.3442272155151205, "language_loss": 0.76689839, "learning_rate": 2.17668779958718e-07, "loss": 0.78854811, "num_input_tokens_seen": 153849110, "step": 7107, "time_per_iteration": 2.7276182174682617 }, { "auxiliary_loss_clip": 0.01169569, "auxiliary_loss_mlp": 0.01026649, "balance_loss_clip": 1.05071366, "balance_loss_mlp": 1.01963699, "epoch": 0.8546864666626586, "flos": 11108320427520.0, "grad_norm": 2.2189881511638947, "language_loss": 0.80199212, "learning_rate": 2.1731551274801553e-07, "loss": 0.82395422, "num_input_tokens_seen": 153865550, "step": 7108, "time_per_iteration": 2.5415804386138916 }, { "auxiliary_loss_clip": 0.01140339, "auxiliary_loss_mlp": 0.01026554, "balance_loss_clip": 1.04747188, "balance_loss_mlp": 1.01958036, "epoch": 0.8548067095532976, "flos": 25520169669120.0, "grad_norm": 2.0937731148470755, "language_loss": 0.6220209, "learning_rate": 2.169625159691324e-07, "loss": 0.64368987, "num_input_tokens_seen": 153885425, "step": 7109, "time_per_iteration": 2.7280986309051514 }, { "auxiliary_loss_clip": 0.01115308, "auxiliary_loss_mlp": 0.01027483, "balance_loss_clip": 1.04500473, "balance_loss_mlp": 1.01981783, "epoch": 0.8549269524439368, "flos": 24717853532160.0, "grad_norm": 2.0932001472395108, "language_loss": 0.74370438, "learning_rate": 2.1660978967561784e-07, "loss": 0.76513225, "num_input_tokens_seen": 153904760, "step": 7110, "time_per_iteration": 2.7150604724884033 }, { "auxiliary_loss_clip": 0.01167018, "auxiliary_loss_mlp": 0.01025732, "balance_loss_clip": 1.04770076, "balance_loss_mlp": 1.018574, "epoch": 0.8550471953345758, "flos": 19825599191040.0, "grad_norm": 2.6114641131578904, "language_loss": 0.79230976, "learning_rate": 2.1625733392098035e-07, "loss": 0.81423724, "num_input_tokens_seen": 153920370, "step": 7111, "time_per_iteration": 2.591592788696289 }, { "auxiliary_loss_clip": 0.01166072, "auxiliary_loss_mlp": 0.01021718, "balance_loss_clip": 1.04769933, "balance_loss_mlp": 1.01466084, "epoch": 0.8551674382252149, "flos": 22820441500800.0, "grad_norm": 1.792200416885641, "language_loss": 0.79720879, "learning_rate": 2.159051487586867e-07, "loss": 0.81908673, "num_input_tokens_seen": 153940500, "step": 7112, "time_per_iteration": 2.5717718601226807 }, { "auxiliary_loss_clip": 0.01142921, "auxiliary_loss_mlp": 0.01023474, "balance_loss_clip": 1.0499022, "balance_loss_mlp": 1.01612163, "epoch": 0.8552876811158541, "flos": 20631255292800.0, "grad_norm": 4.250895638222077, "language_loss": 0.72725797, "learning_rate": 2.155532342421642e-07, "loss": 0.74892187, "num_input_tokens_seen": 153958500, "step": 7113, "time_per_iteration": 2.6872825622558594 }, { "auxiliary_loss_clip": 0.01154609, "auxiliary_loss_mlp": 0.01028918, "balance_loss_clip": 1.04700828, "balance_loss_mlp": 1.02168512, "epoch": 0.8554079240064931, "flos": 23112359331840.0, "grad_norm": 1.7760716369403322, "language_loss": 0.78355044, "learning_rate": 2.1520159042479636e-07, "loss": 0.80538565, "num_input_tokens_seen": 153976790, "step": 7114, "time_per_iteration": 2.6261024475097656 }, { "auxiliary_loss_clip": 0.01152488, "auxiliary_loss_mlp": 0.01023193, "balance_loss_clip": 1.04835045, "balance_loss_mlp": 1.01642537, "epoch": 0.8555281668971322, "flos": 22128047959680.0, "grad_norm": 2.021362652857518, "language_loss": 0.7087146, "learning_rate": 2.148502173599287e-07, "loss": 0.73047143, "num_input_tokens_seen": 153994930, "step": 7115, "time_per_iteration": 2.615957498550415 }, { "auxiliary_loss_clip": 0.01131097, "auxiliary_loss_mlp": 0.0102527, "balance_loss_clip": 1.04483485, "balance_loss_mlp": 1.01819777, "epoch": 0.8556484097877713, "flos": 31139040234240.0, "grad_norm": 1.7136555157759974, "language_loss": 0.65564483, "learning_rate": 2.1449911510086372e-07, "loss": 0.67720848, "num_input_tokens_seen": 154014400, "step": 7116, "time_per_iteration": 2.7483713626861572 }, { "auxiliary_loss_clip": 0.01152076, "auxiliary_loss_mlp": 0.01025904, "balance_loss_clip": 1.0490762, "balance_loss_mlp": 1.01863253, "epoch": 0.8557686526784104, "flos": 24316551809280.0, "grad_norm": 1.925330885796717, "language_loss": 0.77064306, "learning_rate": 2.141482837008628e-07, "loss": 0.79242289, "num_input_tokens_seen": 154034940, "step": 7117, "time_per_iteration": 2.659044027328491 }, { "auxiliary_loss_clip": 0.01144638, "auxiliary_loss_mlp": 0.0103004, "balance_loss_clip": 1.04551458, "balance_loss_mlp": 1.02280128, "epoch": 0.8558888955690495, "flos": 17712723427200.0, "grad_norm": 1.9274867010522907, "language_loss": 0.71953791, "learning_rate": 2.1379772321314826e-07, "loss": 0.74128473, "num_input_tokens_seen": 154052985, "step": 7118, "time_per_iteration": 2.6211442947387695 }, { "auxiliary_loss_clip": 0.01081534, "auxiliary_loss_mlp": 0.01030464, "balance_loss_clip": 1.04073668, "balance_loss_mlp": 1.02359807, "epoch": 0.8560091384596886, "flos": 19171702051200.0, "grad_norm": 2.9512545233478003, "language_loss": 0.81435627, "learning_rate": 2.1344743369089802e-07, "loss": 0.83547628, "num_input_tokens_seen": 154068765, "step": 7119, "time_per_iteration": 2.728926658630371 }, { "auxiliary_loss_clip": 0.0113754, "auxiliary_loss_mlp": 0.01021918, "balance_loss_clip": 1.04700398, "balance_loss_mlp": 1.01508093, "epoch": 0.8561293813503277, "flos": 23914855036800.0, "grad_norm": 1.7476071619961895, "language_loss": 0.82480943, "learning_rate": 2.130974151872522e-07, "loss": 0.84640396, "num_input_tokens_seen": 154089100, "step": 7120, "time_per_iteration": 2.69484281539917 }, { "auxiliary_loss_clip": 0.01121061, "auxiliary_loss_mlp": 0.01036148, "balance_loss_clip": 1.04514492, "balance_loss_mlp": 1.02869785, "epoch": 0.8562496242409667, "flos": 22529206028160.0, "grad_norm": 2.1153157656373223, "language_loss": 0.78499317, "learning_rate": 2.1274766775530773e-07, "loss": 0.80656523, "num_input_tokens_seen": 154108965, "step": 7121, "time_per_iteration": 2.6815342903137207 }, { "auxiliary_loss_clip": 0.01173049, "auxiliary_loss_mlp": 0.01030389, "balance_loss_clip": 1.05084276, "balance_loss_mlp": 1.02309978, "epoch": 0.8563698671316058, "flos": 14712745472640.0, "grad_norm": 2.6197935702916175, "language_loss": 0.79508471, "learning_rate": 2.1239819144812077e-07, "loss": 0.81711912, "num_input_tokens_seen": 154123425, "step": 7122, "time_per_iteration": 2.645542860031128 }, { "auxiliary_loss_clip": 0.01110501, "auxiliary_loss_mlp": 0.01028558, "balance_loss_clip": 1.04097033, "balance_loss_mlp": 1.02147424, "epoch": 0.856490110022245, "flos": 39167768211840.0, "grad_norm": 1.7545538456579488, "language_loss": 0.7009905, "learning_rate": 2.1204898631870716e-07, "loss": 0.722381, "num_input_tokens_seen": 154148315, "step": 7123, "time_per_iteration": 2.836790084838867 }, { "auxiliary_loss_clip": 0.01135924, "auxiliary_loss_mlp": 0.01025847, "balance_loss_clip": 1.0467366, "balance_loss_mlp": 1.01859283, "epoch": 0.856610352912884, "flos": 29059345658880.0, "grad_norm": 2.056263197529022, "language_loss": 0.7631436, "learning_rate": 2.1170005242004006e-07, "loss": 0.78476131, "num_input_tokens_seen": 154169665, "step": 7124, "time_per_iteration": 2.6937668323516846 }, { "auxiliary_loss_clip": 0.01139004, "auxiliary_loss_mlp": 0.01023022, "balance_loss_clip": 1.04371524, "balance_loss_mlp": 1.01624775, "epoch": 0.8567305958035231, "flos": 23878333883520.0, "grad_norm": 2.223469236290646, "language_loss": 0.78274989, "learning_rate": 2.1135138980505384e-07, "loss": 0.80437016, "num_input_tokens_seen": 154190335, "step": 7125, "time_per_iteration": 3.6292624473571777 }, { "auxiliary_loss_clip": 0.0113185, "auxiliary_loss_mlp": 0.01023388, "balance_loss_clip": 1.04624248, "balance_loss_mlp": 1.01709712, "epoch": 0.8568508386941622, "flos": 22200120599040.0, "grad_norm": 1.9726049794529574, "language_loss": 0.72551835, "learning_rate": 2.110029985266395e-07, "loss": 0.74707079, "num_input_tokens_seen": 154210040, "step": 7126, "time_per_iteration": 2.6255345344543457 }, { "auxiliary_loss_clip": 0.01140575, "auxiliary_loss_mlp": 0.01025352, "balance_loss_clip": 1.04603338, "balance_loss_mlp": 1.01840758, "epoch": 0.8569710815848013, "flos": 17307507121920.0, "grad_norm": 1.8294573998182633, "language_loss": 0.73904037, "learning_rate": 2.1065487863764787e-07, "loss": 0.76069963, "num_input_tokens_seen": 154228385, "step": 7127, "time_per_iteration": 3.650573253631592 }, { "auxiliary_loss_clip": 0.01094695, "auxiliary_loss_mlp": 0.01024484, "balance_loss_clip": 1.03785133, "balance_loss_mlp": 1.01712918, "epoch": 0.8570913244754403, "flos": 23732285184000.0, "grad_norm": 1.5257323677865655, "language_loss": 0.85816443, "learning_rate": 2.1030703019088846e-07, "loss": 0.87935615, "num_input_tokens_seen": 154249015, "step": 7128, "time_per_iteration": 3.6435391902923584 }, { "auxiliary_loss_clip": 0.01146902, "auxiliary_loss_mlp": 0.01026541, "balance_loss_clip": 1.04609275, "balance_loss_mlp": 1.01906061, "epoch": 0.8572115673660795, "flos": 20048748433920.0, "grad_norm": 2.4284606478326762, "language_loss": 0.70612133, "learning_rate": 2.099594532391291e-07, "loss": 0.7278558, "num_input_tokens_seen": 154267700, "step": 7129, "time_per_iteration": 2.6052134037017822 }, { "auxiliary_loss_clip": 0.011441, "auxiliary_loss_mlp": 0.01022275, "balance_loss_clip": 1.04679692, "balance_loss_mlp": 1.01526833, "epoch": 0.8573318102567186, "flos": 27160389342720.0, "grad_norm": 1.5885163823356685, "language_loss": 0.79300916, "learning_rate": 2.0961214783509806e-07, "loss": 0.81467289, "num_input_tokens_seen": 154290580, "step": 7130, "time_per_iteration": 2.7203726768493652 }, { "auxiliary_loss_clip": 0.01139108, "auxiliary_loss_mlp": 0.01025598, "balance_loss_clip": 1.04431605, "balance_loss_mlp": 1.01866591, "epoch": 0.8574520531473576, "flos": 24936585402240.0, "grad_norm": 2.35798573928129, "language_loss": 0.74848098, "learning_rate": 2.0926511403148051e-07, "loss": 0.77012801, "num_input_tokens_seen": 154309545, "step": 7131, "time_per_iteration": 2.644397735595703 }, { "auxiliary_loss_clip": 0.01129892, "auxiliary_loss_mlp": 0.01030095, "balance_loss_clip": 1.04794419, "balance_loss_mlp": 1.02284753, "epoch": 0.8575722960379968, "flos": 18771154513920.0, "grad_norm": 2.1281542298526914, "language_loss": 0.75556362, "learning_rate": 2.0891835188092143e-07, "loss": 0.77716351, "num_input_tokens_seen": 154326545, "step": 7132, "time_per_iteration": 2.6818065643310547 }, { "auxiliary_loss_clip": 0.01125438, "auxiliary_loss_mlp": 0.01027937, "balance_loss_clip": 1.0434823, "balance_loss_mlp": 1.0211544, "epoch": 0.8576925389286358, "flos": 22200300167040.0, "grad_norm": 2.0513708803569726, "language_loss": 0.81895798, "learning_rate": 2.0857186143602434e-07, "loss": 0.84049177, "num_input_tokens_seen": 154345190, "step": 7133, "time_per_iteration": 2.638641357421875 }, { "auxiliary_loss_clip": 0.01112079, "auxiliary_loss_mlp": 0.01026215, "balance_loss_clip": 1.04206824, "balance_loss_mlp": 1.01918197, "epoch": 0.8578127818192749, "flos": 22894345733760.0, "grad_norm": 6.125189610520335, "language_loss": 0.67712754, "learning_rate": 2.0822564274935094e-07, "loss": 0.69851047, "num_input_tokens_seen": 154364615, "step": 7134, "time_per_iteration": 2.662816286087036 }, { "auxiliary_loss_clip": 0.01138687, "auxiliary_loss_mlp": 0.01025221, "balance_loss_clip": 1.04825044, "balance_loss_mlp": 1.01764846, "epoch": 0.8579330247099141, "flos": 34824839541120.0, "grad_norm": 1.912266818871089, "language_loss": 0.6719532, "learning_rate": 2.078796958734239e-07, "loss": 0.69359231, "num_input_tokens_seen": 154387335, "step": 7135, "time_per_iteration": 2.7488162517547607 }, { "auxiliary_loss_clip": 0.01152808, "auxiliary_loss_mlp": 0.01028569, "balance_loss_clip": 1.04824829, "balance_loss_mlp": 1.02080274, "epoch": 0.8580532676005531, "flos": 19755681367680.0, "grad_norm": 1.8863591702858624, "language_loss": 0.75260794, "learning_rate": 2.0753402086072124e-07, "loss": 0.77442169, "num_input_tokens_seen": 154405965, "step": 7136, "time_per_iteration": 2.6852970123291016 }, { "auxiliary_loss_clip": 0.01086926, "auxiliary_loss_mlp": 0.01028349, "balance_loss_clip": 1.04371929, "balance_loss_mlp": 1.02162576, "epoch": 0.8581735104911922, "flos": 22739318634240.0, "grad_norm": 3.189747415036407, "language_loss": 0.75437427, "learning_rate": 2.071886177636828e-07, "loss": 0.775527, "num_input_tokens_seen": 154422750, "step": 7137, "time_per_iteration": 2.8783578872680664 }, { "auxiliary_loss_clip": 0.01148298, "auxiliary_loss_mlp": 0.01025469, "balance_loss_clip": 1.04633236, "balance_loss_mlp": 1.0181613, "epoch": 0.8582937533818313, "flos": 23149131880320.0, "grad_norm": 1.8780786426164018, "language_loss": 0.83190691, "learning_rate": 2.0684348663470575e-07, "loss": 0.85364461, "num_input_tokens_seen": 154442930, "step": 7138, "time_per_iteration": 2.597027540206909 }, { "auxiliary_loss_clip": 0.0113483, "auxiliary_loss_mlp": 0.01028256, "balance_loss_clip": 1.04363728, "balance_loss_mlp": 1.02081156, "epoch": 0.8584139962724704, "flos": 19498668577920.0, "grad_norm": 1.944954029179894, "language_loss": 0.61502814, "learning_rate": 2.0649862752614555e-07, "loss": 0.63665903, "num_input_tokens_seen": 154461640, "step": 7139, "time_per_iteration": 2.608551502227783 }, { "auxiliary_loss_clip": 0.01048199, "auxiliary_loss_mlp": 0.01006142, "balance_loss_clip": 1.0184114, "balance_loss_mlp": 1.00508678, "epoch": 0.8585342391631094, "flos": 71276577788160.0, "grad_norm": 0.7515657266093718, "language_loss": 0.56966114, "learning_rate": 2.0615404049031838e-07, "loss": 0.59020454, "num_input_tokens_seen": 154518610, "step": 7140, "time_per_iteration": 3.2277584075927734 }, { "auxiliary_loss_clip": 0.01154227, "auxiliary_loss_mlp": 0.01025399, "balance_loss_clip": 1.04900432, "balance_loss_mlp": 1.01781714, "epoch": 0.8586544820537486, "flos": 10815432929280.0, "grad_norm": 3.1111422491800247, "language_loss": 0.77958727, "learning_rate": 2.0580972557949616e-07, "loss": 0.80138355, "num_input_tokens_seen": 154533700, "step": 7141, "time_per_iteration": 2.6491689682006836 }, { "auxiliary_loss_clip": 0.0106059, "auxiliary_loss_mlp": 0.01006524, "balance_loss_clip": 1.01842403, "balance_loss_mlp": 1.00546932, "epoch": 0.8587747249443877, "flos": 64811184422400.0, "grad_norm": 0.7981108915778378, "language_loss": 0.54234546, "learning_rate": 2.054656828459125e-07, "loss": 0.56301659, "num_input_tokens_seen": 154597810, "step": 7142, "time_per_iteration": 3.2456462383270264 }, { "auxiliary_loss_clip": 0.01101081, "auxiliary_loss_mlp": 0.01028031, "balance_loss_clip": 1.0414567, "balance_loss_mlp": 1.02058637, "epoch": 0.8588949678350267, "flos": 26834607964800.0, "grad_norm": 1.8989882176525028, "language_loss": 0.77628475, "learning_rate": 2.051219123417578e-07, "loss": 0.79757589, "num_input_tokens_seen": 154617870, "step": 7143, "time_per_iteration": 2.7745139598846436 }, { "auxiliary_loss_clip": 0.01168951, "auxiliary_loss_mlp": 0.01022354, "balance_loss_clip": 1.04838991, "balance_loss_mlp": 1.01479053, "epoch": 0.8590152107256659, "flos": 26104256726400.0, "grad_norm": 2.0708457712694996, "language_loss": 0.60400164, "learning_rate": 2.0477841411918196e-07, "loss": 0.62591469, "num_input_tokens_seen": 154637395, "step": 7144, "time_per_iteration": 2.589210033416748 }, { "auxiliary_loss_clip": 0.01148629, "auxiliary_loss_mlp": 0.01023034, "balance_loss_clip": 1.04725313, "balance_loss_mlp": 1.01617908, "epoch": 0.859135453616305, "flos": 26140885620480.0, "grad_norm": 2.0674541755229954, "language_loss": 0.74520147, "learning_rate": 2.0443518823029326e-07, "loss": 0.76691812, "num_input_tokens_seen": 154657935, "step": 7145, "time_per_iteration": 2.6839852333068848 }, { "auxiliary_loss_clip": 0.01113815, "auxiliary_loss_mlp": 0.01022488, "balance_loss_clip": 1.0431565, "balance_loss_mlp": 1.01482916, "epoch": 0.859255696506944, "flos": 12969319046400.0, "grad_norm": 2.0625486965738773, "language_loss": 0.76660651, "learning_rate": 2.0409223472715854e-07, "loss": 0.78796953, "num_input_tokens_seen": 154675080, "step": 7146, "time_per_iteration": 2.6168811321258545 }, { "auxiliary_loss_clip": 0.01122001, "auxiliary_loss_mlp": 0.00711282, "balance_loss_clip": 1.04537797, "balance_loss_mlp": 1.00067961, "epoch": 0.8593759393975832, "flos": 18475753063680.0, "grad_norm": 2.4378883446382287, "language_loss": 0.75064075, "learning_rate": 2.0374955366180434e-07, "loss": 0.76897359, "num_input_tokens_seen": 154692720, "step": 7147, "time_per_iteration": 2.6996958255767822 }, { "auxiliary_loss_clip": 0.01124637, "auxiliary_loss_mlp": 0.01026628, "balance_loss_clip": 1.04396081, "balance_loss_mlp": 1.0194664, "epoch": 0.8594961822882222, "flos": 22200156512640.0, "grad_norm": 1.8034437250583912, "language_loss": 0.72552288, "learning_rate": 2.034071450862147e-07, "loss": 0.7470355, "num_input_tokens_seen": 154710190, "step": 7148, "time_per_iteration": 2.692143678665161 }, { "auxiliary_loss_clip": 0.01138341, "auxiliary_loss_mlp": 0.01022523, "balance_loss_clip": 1.04392469, "balance_loss_mlp": 1.01505136, "epoch": 0.8596164251788613, "flos": 23294749616640.0, "grad_norm": 1.6636967797535462, "language_loss": 0.77050591, "learning_rate": 2.030650090523327e-07, "loss": 0.79211462, "num_input_tokens_seen": 154729380, "step": 7149, "time_per_iteration": 2.647796630859375 }, { "auxiliary_loss_clip": 0.01119268, "auxiliary_loss_mlp": 0.01026666, "balance_loss_clip": 1.04480457, "balance_loss_mlp": 1.0191257, "epoch": 0.8597366680695004, "flos": 31649905416960.0, "grad_norm": 1.839611762895994, "language_loss": 0.5967856, "learning_rate": 2.0272314561205995e-07, "loss": 0.61824495, "num_input_tokens_seen": 154749775, "step": 7150, "time_per_iteration": 2.744022846221924 }, { "auxiliary_loss_clip": 0.01113858, "auxiliary_loss_mlp": 0.01023366, "balance_loss_clip": 1.04247284, "balance_loss_mlp": 1.01638353, "epoch": 0.8598569109601395, "flos": 21287738211840.0, "grad_norm": 4.368904991311825, "language_loss": 0.73158121, "learning_rate": 2.023815548172567e-07, "loss": 0.75295341, "num_input_tokens_seen": 154769845, "step": 7151, "time_per_iteration": 3.5642199516296387 }, { "auxiliary_loss_clip": 0.01153609, "auxiliary_loss_mlp": 0.01025688, "balance_loss_clip": 1.04704487, "balance_loss_mlp": 1.01818717, "epoch": 0.8599771538507786, "flos": 25447809720960.0, "grad_norm": 1.5862274273497938, "language_loss": 0.6598013, "learning_rate": 2.0204023671974267e-07, "loss": 0.68159431, "num_input_tokens_seen": 154789230, "step": 7152, "time_per_iteration": 2.6439130306243896 }, { "auxiliary_loss_clip": 0.0114768, "auxiliary_loss_mlp": 0.01028433, "balance_loss_clip": 1.04571438, "balance_loss_mlp": 1.02094054, "epoch": 0.8600973967414177, "flos": 16723958768640.0, "grad_norm": 2.1339723290566583, "language_loss": 0.81042039, "learning_rate": 2.0169919137129532e-07, "loss": 0.83218151, "num_input_tokens_seen": 154807670, "step": 7153, "time_per_iteration": 3.532163381576538 }, { "auxiliary_loss_clip": 0.01155497, "auxiliary_loss_mlp": 0.01029389, "balance_loss_clip": 1.04993391, "balance_loss_mlp": 1.02180111, "epoch": 0.8602176396320568, "flos": 25227928615680.0, "grad_norm": 2.423080888554196, "language_loss": 0.71113712, "learning_rate": 2.013584188236508e-07, "loss": 0.73298597, "num_input_tokens_seen": 154825575, "step": 7154, "time_per_iteration": 3.530945062637329 }, { "auxiliary_loss_clip": 0.01171199, "auxiliary_loss_mlp": 0.01028888, "balance_loss_clip": 1.05012345, "balance_loss_mlp": 1.02159214, "epoch": 0.8603378825226958, "flos": 20412236113920.0, "grad_norm": 1.8542088008083235, "language_loss": 0.79475713, "learning_rate": 2.0101791912850396e-07, "loss": 0.81675792, "num_input_tokens_seen": 154845115, "step": 7155, "time_per_iteration": 2.6023247241973877 }, { "auxiliary_loss_clip": 0.01139715, "auxiliary_loss_mlp": 0.01026314, "balance_loss_clip": 1.04843748, "balance_loss_mlp": 1.01909614, "epoch": 0.8604581254133349, "flos": 34930201109760.0, "grad_norm": 2.011769221875739, "language_loss": 0.63863504, "learning_rate": 2.006776923375082e-07, "loss": 0.66029531, "num_input_tokens_seen": 154866770, "step": 7156, "time_per_iteration": 2.8606956005096436 }, { "auxiliary_loss_clip": 0.01168345, "auxiliary_loss_mlp": 0.01022887, "balance_loss_clip": 1.04900169, "balance_loss_mlp": 1.01577306, "epoch": 0.860578368303974, "flos": 22596538072320.0, "grad_norm": 1.8326128461283624, "language_loss": 0.71337312, "learning_rate": 2.003377385022764e-07, "loss": 0.73528546, "num_input_tokens_seen": 154885595, "step": 7157, "time_per_iteration": 2.6061151027679443 }, { "auxiliary_loss_clip": 0.01138665, "auxiliary_loss_mlp": 0.01028284, "balance_loss_clip": 1.04625416, "balance_loss_mlp": 1.02104175, "epoch": 0.8606986111946131, "flos": 21324331192320.0, "grad_norm": 2.119499274787899, "language_loss": 0.77720213, "learning_rate": 1.9999805767437826e-07, "loss": 0.79887164, "num_input_tokens_seen": 154904485, "step": 7158, "time_per_iteration": 2.619612693786621 }, { "auxiliary_loss_clip": 0.01128819, "auxiliary_loss_mlp": 0.01022606, "balance_loss_clip": 1.04347301, "balance_loss_mlp": 1.01542044, "epoch": 0.8608188540852522, "flos": 28877206769280.0, "grad_norm": 2.3745546169493683, "language_loss": 0.71619105, "learning_rate": 1.9965864990534386e-07, "loss": 0.73770535, "num_input_tokens_seen": 154925010, "step": 7159, "time_per_iteration": 2.666973352432251 }, { "auxiliary_loss_clip": 0.01113542, "auxiliary_loss_mlp": 0.01025619, "balance_loss_clip": 1.04117966, "balance_loss_mlp": 1.01860976, "epoch": 0.8609390969758913, "flos": 29716187713920.0, "grad_norm": 1.702443356337646, "language_loss": 0.77888048, "learning_rate": 1.9931951524666092e-07, "loss": 0.80027211, "num_input_tokens_seen": 154946100, "step": 7160, "time_per_iteration": 2.7635393142700195 }, { "auxiliary_loss_clip": 0.01156077, "auxiliary_loss_mlp": 0.00711387, "balance_loss_clip": 1.04786205, "balance_loss_mlp": 1.00059557, "epoch": 0.8610593398665304, "flos": 21249349551360.0, "grad_norm": 1.8926846858361799, "language_loss": 0.81391883, "learning_rate": 1.9898065374977534e-07, "loss": 0.83259344, "num_input_tokens_seen": 154966305, "step": 7161, "time_per_iteration": 2.6623151302337646 }, { "auxiliary_loss_clip": 0.01115696, "auxiliary_loss_mlp": 0.01019645, "balance_loss_clip": 1.04112518, "balance_loss_mlp": 1.01325274, "epoch": 0.8611795827571694, "flos": 14830102183680.0, "grad_norm": 1.8883564708338108, "language_loss": 0.73284215, "learning_rate": 1.9864206546609342e-07, "loss": 0.75419557, "num_input_tokens_seen": 154985145, "step": 7162, "time_per_iteration": 2.647000789642334 }, { "auxiliary_loss_clip": 0.01168329, "auxiliary_loss_mlp": 0.01023253, "balance_loss_clip": 1.04820204, "balance_loss_mlp": 1.01636243, "epoch": 0.8612998256478086, "flos": 24243258107520.0, "grad_norm": 2.9032324989277427, "language_loss": 0.84035313, "learning_rate": 1.983037504469771e-07, "loss": 0.86226898, "num_input_tokens_seen": 155003855, "step": 7163, "time_per_iteration": 2.5919575691223145 }, { "auxiliary_loss_clip": 0.01155394, "auxiliary_loss_mlp": 0.01031471, "balance_loss_clip": 1.05010223, "balance_loss_mlp": 1.02405357, "epoch": 0.8614200685384477, "flos": 21252653602560.0, "grad_norm": 1.6879415701606326, "language_loss": 0.66761446, "learning_rate": 1.9796570874374984e-07, "loss": 0.68948305, "num_input_tokens_seen": 155023960, "step": 7164, "time_per_iteration": 2.624819755554199 }, { "auxiliary_loss_clip": 0.0114144, "auxiliary_loss_mlp": 0.01028732, "balance_loss_clip": 1.04842162, "balance_loss_mlp": 1.02146018, "epoch": 0.8615403114290867, "flos": 20007738080640.0, "grad_norm": 1.7542396519965588, "language_loss": 0.77519345, "learning_rate": 1.976279404076917e-07, "loss": 0.79689515, "num_input_tokens_seen": 155043360, "step": 7165, "time_per_iteration": 2.631812810897827 }, { "auxiliary_loss_clip": 0.01120159, "auxiliary_loss_mlp": 0.01026637, "balance_loss_clip": 1.04442632, "balance_loss_mlp": 1.0192821, "epoch": 0.8616605543197259, "flos": 29789373674880.0, "grad_norm": 1.8777157858263083, "language_loss": 0.76618052, "learning_rate": 1.9729044549004193e-07, "loss": 0.78764844, "num_input_tokens_seen": 155064745, "step": 7166, "time_per_iteration": 2.7200214862823486 }, { "auxiliary_loss_clip": 0.01148154, "auxiliary_loss_mlp": 0.01022535, "balance_loss_clip": 1.04657269, "balance_loss_mlp": 1.01558805, "epoch": 0.8617807972103649, "flos": 28911609020160.0, "grad_norm": 1.801608006381255, "language_loss": 0.70109648, "learning_rate": 1.9695322404199822e-07, "loss": 0.72280335, "num_input_tokens_seen": 155086790, "step": 7167, "time_per_iteration": 2.6319997310638428 }, { "auxiliary_loss_clip": 0.01137457, "auxiliary_loss_mlp": 0.01026193, "balance_loss_clip": 1.04631901, "balance_loss_mlp": 1.01889443, "epoch": 0.861901040101004, "flos": 27673804391040.0, "grad_norm": 1.927842222354129, "language_loss": 0.82119393, "learning_rate": 1.9661627611471654e-07, "loss": 0.84283048, "num_input_tokens_seen": 155106585, "step": 7168, "time_per_iteration": 2.675440549850464 }, { "auxiliary_loss_clip": 0.011435, "auxiliary_loss_mlp": 0.01026543, "balance_loss_clip": 1.04705727, "balance_loss_mlp": 1.01900864, "epoch": 0.8620212829916432, "flos": 49748056755840.0, "grad_norm": 2.2888926016794353, "language_loss": 0.69814193, "learning_rate": 1.9627960175931246e-07, "loss": 0.71984243, "num_input_tokens_seen": 155131285, "step": 7169, "time_per_iteration": 2.86445689201355 }, { "auxiliary_loss_clip": 0.01152031, "auxiliary_loss_mlp": 0.01029371, "balance_loss_clip": 1.04833698, "balance_loss_mlp": 1.02271616, "epoch": 0.8621415258822822, "flos": 21138672769920.0, "grad_norm": 2.4653744764888663, "language_loss": 0.74224037, "learning_rate": 1.9594320102685847e-07, "loss": 0.76405442, "num_input_tokens_seen": 155150555, "step": 7170, "time_per_iteration": 2.62036395072937 }, { "auxiliary_loss_clip": 0.01127193, "auxiliary_loss_mlp": 0.00710691, "balance_loss_clip": 1.04419327, "balance_loss_mlp": 1.00055397, "epoch": 0.8622617687729213, "flos": 21689039934720.0, "grad_norm": 2.296926695872685, "language_loss": 0.6393609, "learning_rate": 1.956070739683864e-07, "loss": 0.6577397, "num_input_tokens_seen": 155169890, "step": 7171, "time_per_iteration": 2.653398036956787 }, { "auxiliary_loss_clip": 0.01105936, "auxiliary_loss_mlp": 0.01026373, "balance_loss_clip": 1.04088807, "balance_loss_mlp": 1.02003109, "epoch": 0.8623820116635604, "flos": 26250592734720.0, "grad_norm": 1.562799579736547, "language_loss": 0.74314523, "learning_rate": 1.9527122063488678e-07, "loss": 0.76446837, "num_input_tokens_seen": 155191005, "step": 7172, "time_per_iteration": 2.7312722206115723 }, { "auxiliary_loss_clip": 0.01132345, "auxiliary_loss_mlp": 0.01019681, "balance_loss_clip": 1.04161692, "balance_loss_mlp": 1.01295161, "epoch": 0.8625022545541995, "flos": 19647554451840.0, "grad_norm": 1.6685882270647578, "language_loss": 0.80515778, "learning_rate": 1.9493564107730755e-07, "loss": 0.82667804, "num_input_tokens_seen": 155211005, "step": 7173, "time_per_iteration": 2.5888831615448 }, { "auxiliary_loss_clip": 0.01126512, "auxiliary_loss_mlp": 0.01024286, "balance_loss_clip": 1.04115105, "balance_loss_mlp": 1.0169487, "epoch": 0.8626224974448385, "flos": 21908382336000.0, "grad_norm": 2.2012277091706656, "language_loss": 0.6118964, "learning_rate": 1.9460033534655684e-07, "loss": 0.63340437, "num_input_tokens_seen": 155230365, "step": 7174, "time_per_iteration": 2.669459819793701 }, { "auxiliary_loss_clip": 0.01127944, "auxiliary_loss_mlp": 0.01025469, "balance_loss_clip": 1.04050803, "balance_loss_mlp": 1.01836729, "epoch": 0.8627427403354777, "flos": 23331198942720.0, "grad_norm": 1.703934472541485, "language_loss": 0.84352565, "learning_rate": 1.9426530349349978e-07, "loss": 0.86505973, "num_input_tokens_seen": 155250815, "step": 7175, "time_per_iteration": 2.6455228328704834 }, { "auxiliary_loss_clip": 0.01150481, "auxiliary_loss_mlp": 0.00711088, "balance_loss_clip": 1.04637182, "balance_loss_mlp": 1.00052738, "epoch": 0.8628629832261168, "flos": 16362877299840.0, "grad_norm": 2.704751094478071, "language_loss": 0.6509462, "learning_rate": 1.9393054556896038e-07, "loss": 0.66956192, "num_input_tokens_seen": 155268515, "step": 7176, "time_per_iteration": 3.552522897720337 }, { "auxiliary_loss_clip": 0.01118007, "auxiliary_loss_mlp": 0.01023328, "balance_loss_clip": 1.04312849, "balance_loss_mlp": 1.01528478, "epoch": 0.8629832261167558, "flos": 28103941756800.0, "grad_norm": 3.2554945486326687, "language_loss": 0.69171, "learning_rate": 1.9359606162372133e-07, "loss": 0.71312344, "num_input_tokens_seen": 155290120, "step": 7177, "time_per_iteration": 3.7425622940063477 }, { "auxiliary_loss_clip": 0.01168224, "auxiliary_loss_mlp": 0.01022411, "balance_loss_clip": 1.0500381, "balance_loss_mlp": 1.01539898, "epoch": 0.863103469007395, "flos": 20230061310720.0, "grad_norm": 1.8425792733687054, "language_loss": 0.70615709, "learning_rate": 1.9326185170852293e-07, "loss": 0.72806346, "num_input_tokens_seen": 155309085, "step": 7178, "time_per_iteration": 3.453502655029297 }, { "auxiliary_loss_clip": 0.01151322, "auxiliary_loss_mlp": 0.01027941, "balance_loss_clip": 1.0477947, "balance_loss_mlp": 1.02105987, "epoch": 0.863223711898034, "flos": 24498547044480.0, "grad_norm": 1.8958070459854979, "language_loss": 0.7211566, "learning_rate": 1.9292791587406598e-07, "loss": 0.74294925, "num_input_tokens_seen": 155327945, "step": 7179, "time_per_iteration": 2.6050171852111816 }, { "auxiliary_loss_clip": 0.01148888, "auxiliary_loss_mlp": 0.00711501, "balance_loss_clip": 1.0452683, "balance_loss_mlp": 1.0006026, "epoch": 0.8633439547886731, "flos": 17675376261120.0, "grad_norm": 2.8027351003845036, "language_loss": 0.86849105, "learning_rate": 1.9259425417100661e-07, "loss": 0.88709491, "num_input_tokens_seen": 155344060, "step": 7180, "time_per_iteration": 3.492093086242676 }, { "auxiliary_loss_clip": 0.01084633, "auxiliary_loss_mlp": 0.01026854, "balance_loss_clip": 1.03613567, "balance_loss_mlp": 1.02001143, "epoch": 0.8634641976793123, "flos": 12895055677440.0, "grad_norm": 2.339464820309153, "language_loss": 0.74931902, "learning_rate": 1.9226086664996234e-07, "loss": 0.7704339, "num_input_tokens_seen": 155362305, "step": 7181, "time_per_iteration": 2.6727728843688965 }, { "auxiliary_loss_clip": 0.01139179, "auxiliary_loss_mlp": 0.01030444, "balance_loss_clip": 1.04764891, "balance_loss_mlp": 1.02335382, "epoch": 0.8635844405699513, "flos": 23878980328320.0, "grad_norm": 2.3552452933056425, "language_loss": 0.74662077, "learning_rate": 1.9192775336150712e-07, "loss": 0.76831698, "num_input_tokens_seen": 155382605, "step": 7182, "time_per_iteration": 2.7629446983337402 }, { "auxiliary_loss_clip": 0.0105643, "auxiliary_loss_mlp": 0.01004943, "balance_loss_clip": 1.01730943, "balance_loss_mlp": 1.00381613, "epoch": 0.8637046834605904, "flos": 60453387521280.0, "grad_norm": 0.7628712034254088, "language_loss": 0.56270194, "learning_rate": 1.915949143561739e-07, "loss": 0.58331567, "num_input_tokens_seen": 155437280, "step": 7183, "time_per_iteration": 3.221328020095825 }, { "auxiliary_loss_clip": 0.01153275, "auxiliary_loss_mlp": 0.01022131, "balance_loss_clip": 1.04935312, "balance_loss_mlp": 1.01510692, "epoch": 0.8638249263512295, "flos": 20558751690240.0, "grad_norm": 2.0488555228755563, "language_loss": 0.78561175, "learning_rate": 1.9126234968445498e-07, "loss": 0.80736578, "num_input_tokens_seen": 155456970, "step": 7184, "time_per_iteration": 2.737920045852661 }, { "auxiliary_loss_clip": 0.01169817, "auxiliary_loss_mlp": 0.01024344, "balance_loss_clip": 1.04984736, "balance_loss_mlp": 1.01737654, "epoch": 0.8639451692418686, "flos": 26615768353920.0, "grad_norm": 1.459260751741477, "language_loss": 0.67796153, "learning_rate": 1.9093005939679884e-07, "loss": 0.69990313, "num_input_tokens_seen": 155478925, "step": 7185, "time_per_iteration": 2.64042067527771 }, { "auxiliary_loss_clip": 0.01153312, "auxiliary_loss_mlp": 0.0103159, "balance_loss_clip": 1.04986691, "balance_loss_mlp": 1.02432704, "epoch": 0.8640654121325076, "flos": 15122450977920.0, "grad_norm": 2.209408985447502, "language_loss": 0.76591462, "learning_rate": 1.9059804354361452e-07, "loss": 0.78776366, "num_input_tokens_seen": 155496700, "step": 7186, "time_per_iteration": 2.5762104988098145 }, { "auxiliary_loss_clip": 0.01128988, "auxiliary_loss_mlp": 0.01027214, "balance_loss_clip": 1.04221761, "balance_loss_mlp": 1.0200789, "epoch": 0.8641856550231467, "flos": 31869068250240.0, "grad_norm": 1.9124993571104845, "language_loss": 0.70260555, "learning_rate": 1.902663021752684e-07, "loss": 0.72416759, "num_input_tokens_seen": 155518130, "step": 7187, "time_per_iteration": 2.789727210998535 }, { "auxiliary_loss_clip": 0.01172739, "auxiliary_loss_mlp": 0.01023964, "balance_loss_clip": 1.05166101, "balance_loss_mlp": 1.01676416, "epoch": 0.8643058979137859, "flos": 14976545932800.0, "grad_norm": 4.494755172212205, "language_loss": 0.8229003, "learning_rate": 1.8993483534208556e-07, "loss": 0.84486735, "num_input_tokens_seen": 155537040, "step": 7188, "time_per_iteration": 2.533690929412842 }, { "auxiliary_loss_clip": 0.0113166, "auxiliary_loss_mlp": 0.01028816, "balance_loss_clip": 1.04624808, "balance_loss_mlp": 1.0213387, "epoch": 0.8644261408044249, "flos": 13115726881920.0, "grad_norm": 3.471685326583815, "language_loss": 0.74886918, "learning_rate": 1.8960364309434884e-07, "loss": 0.77047396, "num_input_tokens_seen": 155554535, "step": 7189, "time_per_iteration": 2.6422500610351562 }, { "auxiliary_loss_clip": 0.01086776, "auxiliary_loss_mlp": 0.00711763, "balance_loss_clip": 1.04269481, "balance_loss_mlp": 1.00059533, "epoch": 0.864546383695064, "flos": 20850920916480.0, "grad_norm": 1.7227656948614485, "language_loss": 0.7853179, "learning_rate": 1.8927272548229967e-07, "loss": 0.80330336, "num_input_tokens_seen": 155574225, "step": 7190, "time_per_iteration": 2.700664758682251 }, { "auxiliary_loss_clip": 0.01108311, "auxiliary_loss_mlp": 0.01030324, "balance_loss_clip": 1.04620051, "balance_loss_mlp": 1.02330232, "epoch": 0.8646666265857031, "flos": 21324582587520.0, "grad_norm": 1.6843058700548068, "language_loss": 0.83011067, "learning_rate": 1.8894208255613876e-07, "loss": 0.85149699, "num_input_tokens_seen": 155593540, "step": 7191, "time_per_iteration": 2.7350378036499023 }, { "auxiliary_loss_clip": 0.01166802, "auxiliary_loss_mlp": 0.01024404, "balance_loss_clip": 1.04758894, "balance_loss_mlp": 1.01749587, "epoch": 0.8647868694763422, "flos": 19750833031680.0, "grad_norm": 2.105525619449685, "language_loss": 0.77819377, "learning_rate": 1.8861171436602397e-07, "loss": 0.80010581, "num_input_tokens_seen": 155610655, "step": 7192, "time_per_iteration": 2.538694143295288 }, { "auxiliary_loss_clip": 0.01157326, "auxiliary_loss_mlp": 0.01023485, "balance_loss_clip": 1.05146384, "balance_loss_mlp": 1.01677608, "epoch": 0.8649071123669813, "flos": 26176760328960.0, "grad_norm": 2.424814165719609, "language_loss": 0.80888772, "learning_rate": 1.882816209620719e-07, "loss": 0.83069587, "num_input_tokens_seen": 155627365, "step": 7193, "time_per_iteration": 2.6301097869873047 }, { "auxiliary_loss_clip": 0.01142723, "auxiliary_loss_mlp": 0.01027149, "balance_loss_clip": 1.04964566, "balance_loss_mlp": 1.01996064, "epoch": 0.8650273552576204, "flos": 20302888135680.0, "grad_norm": 2.190068955208596, "language_loss": 0.76826745, "learning_rate": 1.8795180239435738e-07, "loss": 0.78996611, "num_input_tokens_seen": 155646220, "step": 7194, "time_per_iteration": 2.607487440109253 }, { "auxiliary_loss_clip": 0.01143809, "auxiliary_loss_mlp": 0.01027412, "balance_loss_clip": 1.04795229, "balance_loss_mlp": 1.02039671, "epoch": 0.8651475981482595, "flos": 23951088881280.0, "grad_norm": 2.768130971515401, "language_loss": 0.76788139, "learning_rate": 1.8762225871291348e-07, "loss": 0.78959358, "num_input_tokens_seen": 155662095, "step": 7195, "time_per_iteration": 2.616459608078003 }, { "auxiliary_loss_clip": 0.01168731, "auxiliary_loss_mlp": 0.0071179, "balance_loss_clip": 1.04918075, "balance_loss_mlp": 1.0005635, "epoch": 0.8652678410388985, "flos": 21684622561920.0, "grad_norm": 1.805195151804699, "language_loss": 0.80884284, "learning_rate": 1.8729298996773201e-07, "loss": 0.82764804, "num_input_tokens_seen": 155680845, "step": 7196, "time_per_iteration": 2.5557708740234375 }, { "auxiliary_loss_clip": 0.01055704, "auxiliary_loss_mlp": 0.0101133, "balance_loss_clip": 1.01709914, "balance_loss_mlp": 1.01026869, "epoch": 0.8653880839295377, "flos": 65224660855680.0, "grad_norm": 0.8345281815979559, "language_loss": 0.60931754, "learning_rate": 1.8696399620876301e-07, "loss": 0.62998784, "num_input_tokens_seen": 155737875, "step": 7197, "time_per_iteration": 3.1377882957458496 }, { "auxiliary_loss_clip": 0.01115972, "auxiliary_loss_mlp": 0.01025321, "balance_loss_clip": 1.04046512, "balance_loss_mlp": 1.01819193, "epoch": 0.8655083268201768, "flos": 17749172753280.0, "grad_norm": 2.138609273417299, "language_loss": 0.79382813, "learning_rate": 1.866352774859141e-07, "loss": 0.8152411, "num_input_tokens_seen": 155753100, "step": 7198, "time_per_iteration": 2.639291286468506 }, { "auxiliary_loss_clip": 0.01124029, "auxiliary_loss_mlp": 0.01021851, "balance_loss_clip": 1.04342675, "balance_loss_mlp": 1.0154376, "epoch": 0.8656285697108158, "flos": 20703974376960.0, "grad_norm": 2.6609830648769073, "language_loss": 0.69324297, "learning_rate": 1.8630683384905188e-07, "loss": 0.71470177, "num_input_tokens_seen": 155772430, "step": 7199, "time_per_iteration": 2.6825716495513916 }, { "auxiliary_loss_clip": 0.01169523, "auxiliary_loss_mlp": 0.00711325, "balance_loss_clip": 1.04981542, "balance_loss_mlp": 1.00056696, "epoch": 0.865748812601455, "flos": 18653833716480.0, "grad_norm": 1.961019280102674, "language_loss": 0.8846339, "learning_rate": 1.8597866534800045e-07, "loss": 0.90344238, "num_input_tokens_seen": 155787545, "step": 7200, "time_per_iteration": 2.5627071857452393 }, { "auxiliary_loss_clip": 0.01154905, "auxiliary_loss_mlp": 0.00711445, "balance_loss_clip": 1.04806006, "balance_loss_mlp": 1.00065541, "epoch": 0.865869055492094, "flos": 70652554807680.0, "grad_norm": 2.7198950620896, "language_loss": 0.7460413, "learning_rate": 1.8565077203254398e-07, "loss": 0.76470482, "num_input_tokens_seen": 155813005, "step": 7201, "time_per_iteration": 2.9793131351470947 }, { "auxiliary_loss_clip": 0.01123223, "auxiliary_loss_mlp": 0.01027611, "balance_loss_clip": 1.04969001, "balance_loss_mlp": 1.02018392, "epoch": 0.8659892983827331, "flos": 17383961220480.0, "grad_norm": 3.2020707963397466, "language_loss": 0.73206925, "learning_rate": 1.8532315395242203e-07, "loss": 0.75357759, "num_input_tokens_seen": 155829455, "step": 7202, "time_per_iteration": 3.52409029006958 }, { "auxiliary_loss_clip": 0.01123302, "auxiliary_loss_mlp": 0.01022344, "balance_loss_clip": 1.04429889, "balance_loss_mlp": 1.01559067, "epoch": 0.8661095412733723, "flos": 17895221452800.0, "grad_norm": 3.4437560831329592, "language_loss": 0.72468537, "learning_rate": 1.849958111573353e-07, "loss": 0.74614185, "num_input_tokens_seen": 155848060, "step": 7203, "time_per_iteration": 3.627519130706787 }, { "auxiliary_loss_clip": 0.01164829, "auxiliary_loss_mlp": 0.01024691, "balance_loss_clip": 1.04759073, "balance_loss_mlp": 1.01766992, "epoch": 0.8662297841640113, "flos": 18224163227520.0, "grad_norm": 1.9319524546530593, "language_loss": 0.64563382, "learning_rate": 1.8466874369694074e-07, "loss": 0.66752899, "num_input_tokens_seen": 155865755, "step": 7204, "time_per_iteration": 3.379852771759033 }, { "auxiliary_loss_clip": 0.01119554, "auxiliary_loss_mlp": 0.01026328, "balance_loss_clip": 1.04050565, "balance_loss_mlp": 1.01986694, "epoch": 0.8663500270546504, "flos": 16362159027840.0, "grad_norm": 2.3137834110109274, "language_loss": 0.69946998, "learning_rate": 1.843419516208542e-07, "loss": 0.72092879, "num_input_tokens_seen": 155882680, "step": 7205, "time_per_iteration": 2.6317992210388184 }, { "auxiliary_loss_clip": 0.01154781, "auxiliary_loss_mlp": 0.01026065, "balance_loss_clip": 1.04978108, "balance_loss_mlp": 1.01848352, "epoch": 0.8664702699452895, "flos": 17894431353600.0, "grad_norm": 2.9402691215333787, "language_loss": 0.79680574, "learning_rate": 1.8401543497865047e-07, "loss": 0.81861413, "num_input_tokens_seen": 155900680, "step": 7206, "time_per_iteration": 3.514643669128418 }, { "auxiliary_loss_clip": 0.01153515, "auxiliary_loss_mlp": 0.00710995, "balance_loss_clip": 1.04645646, "balance_loss_mlp": 1.00055933, "epoch": 0.8665905128359286, "flos": 30736373794560.0, "grad_norm": 2.8123736389983627, "language_loss": 0.63931382, "learning_rate": 1.836891938198608e-07, "loss": 0.65795887, "num_input_tokens_seen": 155921105, "step": 7207, "time_per_iteration": 2.656510591506958 }, { "auxiliary_loss_clip": 0.01136196, "auxiliary_loss_mlp": 0.01027592, "balance_loss_clip": 1.04568172, "balance_loss_mlp": 1.02051687, "epoch": 0.8667107557265676, "flos": 18656419495680.0, "grad_norm": 2.6015326060499135, "language_loss": 0.71064544, "learning_rate": 1.8336322819397677e-07, "loss": 0.73228329, "num_input_tokens_seen": 155938640, "step": 7208, "time_per_iteration": 2.6609506607055664 }, { "auxiliary_loss_clip": 0.01122473, "auxiliary_loss_mlp": 0.01019644, "balance_loss_clip": 1.04139066, "balance_loss_mlp": 1.01237226, "epoch": 0.8668309986172068, "flos": 20083725302400.0, "grad_norm": 2.1249117058720195, "language_loss": 0.62258428, "learning_rate": 1.8303753815044654e-07, "loss": 0.64400548, "num_input_tokens_seen": 155957945, "step": 7209, "time_per_iteration": 2.6604835987091064 }, { "auxiliary_loss_clip": 0.01147077, "auxiliary_loss_mlp": 0.01030245, "balance_loss_clip": 1.04651833, "balance_loss_mlp": 1.02220476, "epoch": 0.8669512415078459, "flos": 21615099788160.0, "grad_norm": 2.4010550602194813, "language_loss": 0.70352566, "learning_rate": 1.827121237386773e-07, "loss": 0.72529888, "num_input_tokens_seen": 155975390, "step": 7210, "time_per_iteration": 2.6626510620117188 }, { "auxiliary_loss_clip": 0.01141086, "auxiliary_loss_mlp": 0.01024705, "balance_loss_clip": 1.04732978, "balance_loss_mlp": 1.0168879, "epoch": 0.8670714843984849, "flos": 17703601372800.0, "grad_norm": 2.703338880731958, "language_loss": 0.75426483, "learning_rate": 1.8238698500803374e-07, "loss": 0.77592278, "num_input_tokens_seen": 155988155, "step": 7211, "time_per_iteration": 2.543519973754883 }, { "auxiliary_loss_clip": 0.01061094, "auxiliary_loss_mlp": 0.01006088, "balance_loss_clip": 1.01831532, "balance_loss_mlp": 1.00501561, "epoch": 0.8671917272891241, "flos": 60705483125760.0, "grad_norm": 0.7207809732194513, "language_loss": 0.56273019, "learning_rate": 1.820621220078391e-07, "loss": 0.58340204, "num_input_tokens_seen": 156052065, "step": 7212, "time_per_iteration": 3.3152823448181152 }, { "auxiliary_loss_clip": 0.01167774, "auxiliary_loss_mlp": 0.01023146, "balance_loss_clip": 1.04803228, "balance_loss_mlp": 1.01588941, "epoch": 0.8673119701797631, "flos": 20451881750400.0, "grad_norm": 1.9233531576282064, "language_loss": 0.68083078, "learning_rate": 1.8173753478737553e-07, "loss": 0.70273995, "num_input_tokens_seen": 156072500, "step": 7213, "time_per_iteration": 2.5942792892456055 }, { "auxiliary_loss_clip": 0.01169647, "auxiliary_loss_mlp": 0.01022536, "balance_loss_clip": 1.04927206, "balance_loss_mlp": 1.01523995, "epoch": 0.8674322130704022, "flos": 19647410797440.0, "grad_norm": 2.8642978822870893, "language_loss": 0.7985217, "learning_rate": 1.8141322339588205e-07, "loss": 0.82044351, "num_input_tokens_seen": 156089840, "step": 7214, "time_per_iteration": 2.5833957195281982 }, { "auxiliary_loss_clip": 0.01166494, "auxiliary_loss_mlp": 0.01022941, "balance_loss_clip": 1.04801238, "balance_loss_mlp": 1.01548505, "epoch": 0.8675524559610414, "flos": 26025001367040.0, "grad_norm": 2.0302133922202583, "language_loss": 0.70007277, "learning_rate": 1.810891878825569e-07, "loss": 0.7219671, "num_input_tokens_seen": 156109815, "step": 7215, "time_per_iteration": 2.597240447998047 }, { "auxiliary_loss_clip": 0.01136415, "auxiliary_loss_mlp": 0.01027081, "balance_loss_clip": 1.04538035, "balance_loss_mlp": 1.01908243, "epoch": 0.8676726988516804, "flos": 15049444584960.0, "grad_norm": 2.089577120845742, "language_loss": 0.71839023, "learning_rate": 1.8076542829655561e-07, "loss": 0.74002516, "num_input_tokens_seen": 156128620, "step": 7216, "time_per_iteration": 2.6597485542297363 }, { "auxiliary_loss_clip": 0.01138877, "auxiliary_loss_mlp": 0.01031268, "balance_loss_clip": 1.04735124, "balance_loss_mlp": 1.023525, "epoch": 0.8677929417423195, "flos": 16288111140480.0, "grad_norm": 2.136290471380156, "language_loss": 0.79635942, "learning_rate": 1.8044194468699203e-07, "loss": 0.81806093, "num_input_tokens_seen": 156145930, "step": 7217, "time_per_iteration": 2.5763957500457764 }, { "auxiliary_loss_clip": 0.01136745, "auxiliary_loss_mlp": 0.01021934, "balance_loss_clip": 1.04912305, "balance_loss_mlp": 1.01458514, "epoch": 0.8679131846329585, "flos": 18844160906880.0, "grad_norm": 4.287880149529247, "language_loss": 0.75489187, "learning_rate": 1.8011873710293912e-07, "loss": 0.77647871, "num_input_tokens_seen": 156164435, "step": 7218, "time_per_iteration": 2.719186782836914 }, { "auxiliary_loss_clip": 0.0115207, "auxiliary_loss_mlp": 0.01025434, "balance_loss_clip": 1.04895711, "balance_loss_mlp": 1.01810241, "epoch": 0.8680334275235977, "flos": 33620718890880.0, "grad_norm": 2.346271921456061, "language_loss": 0.69553578, "learning_rate": 1.7979580559342677e-07, "loss": 0.71731079, "num_input_tokens_seen": 156185165, "step": 7219, "time_per_iteration": 2.7433438301086426 }, { "auxiliary_loss_clip": 0.01134822, "auxiliary_loss_mlp": 0.01024124, "balance_loss_clip": 1.04596031, "balance_loss_mlp": 1.01685834, "epoch": 0.8681536704142367, "flos": 24681152810880.0, "grad_norm": 1.5865332967004147, "language_loss": 0.67099059, "learning_rate": 1.7947315020744358e-07, "loss": 0.69257998, "num_input_tokens_seen": 156206260, "step": 7220, "time_per_iteration": 2.658496856689453 }, { "auxiliary_loss_clip": 0.01136037, "auxiliary_loss_mlp": 0.01028236, "balance_loss_clip": 1.04471469, "balance_loss_mlp": 1.02097905, "epoch": 0.8682739133048758, "flos": 20011042131840.0, "grad_norm": 1.91241786376176, "language_loss": 0.80466747, "learning_rate": 1.7915077099393594e-07, "loss": 0.82631022, "num_input_tokens_seen": 156222860, "step": 7221, "time_per_iteration": 2.675295829772949 }, { "auxiliary_loss_clip": 0.01155733, "auxiliary_loss_mlp": 0.01030737, "balance_loss_clip": 1.04745376, "balance_loss_mlp": 1.02356017, "epoch": 0.868394156195515, "flos": 16654759217280.0, "grad_norm": 2.0516833443601348, "language_loss": 0.73326933, "learning_rate": 1.788286680018083e-07, "loss": 0.75513399, "num_input_tokens_seen": 156241570, "step": 7222, "time_per_iteration": 2.5610225200653076 }, { "auxiliary_loss_clip": 0.01140336, "auxiliary_loss_mlp": 0.01025169, "balance_loss_clip": 1.04628313, "balance_loss_mlp": 1.01772439, "epoch": 0.868514399086154, "flos": 28001381448960.0, "grad_norm": 2.263457835936893, "language_loss": 0.72698605, "learning_rate": 1.7850684127992443e-07, "loss": 0.74864107, "num_input_tokens_seen": 156261315, "step": 7223, "time_per_iteration": 2.714336395263672 }, { "auxiliary_loss_clip": 0.0112511, "auxiliary_loss_mlp": 0.01028634, "balance_loss_clip": 1.04813612, "balance_loss_mlp": 1.02183938, "epoch": 0.8686346419767931, "flos": 20084587228800.0, "grad_norm": 1.6322451374988052, "language_loss": 0.70352364, "learning_rate": 1.7818529087710378e-07, "loss": 0.72506106, "num_input_tokens_seen": 156281670, "step": 7224, "time_per_iteration": 2.63071870803833 }, { "auxiliary_loss_clip": 0.0115358, "auxiliary_loss_mlp": 0.00711599, "balance_loss_clip": 1.04867625, "balance_loss_mlp": 1.00063121, "epoch": 0.8687548848674322, "flos": 18223516782720.0, "grad_norm": 2.320734808027799, "language_loss": 0.84215784, "learning_rate": 1.7786401684212637e-07, "loss": 0.86080962, "num_input_tokens_seen": 156300500, "step": 7225, "time_per_iteration": 2.593721866607666 }, { "auxiliary_loss_clip": 0.01033126, "auxiliary_loss_mlp": 0.01006864, "balance_loss_clip": 1.02031279, "balance_loss_mlp": 1.00583923, "epoch": 0.8688751277580713, "flos": 70457885049600.0, "grad_norm": 0.740521343679481, "language_loss": 0.55909634, "learning_rate": 1.7754301922372883e-07, "loss": 0.57949626, "num_input_tokens_seen": 156350145, "step": 7226, "time_per_iteration": 3.0862855911254883 }, { "auxiliary_loss_clip": 0.01097941, "auxiliary_loss_mlp": 0.01023281, "balance_loss_clip": 1.04269791, "balance_loss_mlp": 1.01607156, "epoch": 0.8689953706487104, "flos": 26906788344960.0, "grad_norm": 2.1644390585593887, "language_loss": 0.81091809, "learning_rate": 1.7722229807060617e-07, "loss": 0.83213031, "num_input_tokens_seen": 156368725, "step": 7227, "time_per_iteration": 2.7898104190826416 }, { "auxiliary_loss_clip": 0.01110225, "auxiliary_loss_mlp": 0.01025457, "balance_loss_clip": 1.04029334, "balance_loss_mlp": 1.01832557, "epoch": 0.8691156135393495, "flos": 34637385438720.0, "grad_norm": 2.1384408154182024, "language_loss": 0.81869632, "learning_rate": 1.7690185343141172e-07, "loss": 0.84005314, "num_input_tokens_seen": 156388640, "step": 7228, "time_per_iteration": 3.918069839477539 }, { "auxiliary_loss_clip": 0.01136674, "auxiliary_loss_mlp": 0.01023028, "balance_loss_clip": 1.04483366, "balance_loss_mlp": 1.01649284, "epoch": 0.8692358564299886, "flos": 18989814556800.0, "grad_norm": 3.6102529078331727, "language_loss": 0.70080745, "learning_rate": 1.7658168535475615e-07, "loss": 0.72240442, "num_input_tokens_seen": 156406425, "step": 7229, "time_per_iteration": 3.608105421066284 }, { "auxiliary_loss_clip": 0.01142943, "auxiliary_loss_mlp": 0.01032623, "balance_loss_clip": 1.04876125, "balance_loss_mlp": 1.02508855, "epoch": 0.8693560993206276, "flos": 30370839039360.0, "grad_norm": 1.8962233041623675, "language_loss": 0.64747697, "learning_rate": 1.7626179388920948e-07, "loss": 0.66923261, "num_input_tokens_seen": 156427705, "step": 7230, "time_per_iteration": 4.148921728134155 }, { "auxiliary_loss_clip": 0.01137617, "auxiliary_loss_mlp": 0.00711323, "balance_loss_clip": 1.04638886, "balance_loss_mlp": 1.00059772, "epoch": 0.8694763422112668, "flos": 27200430028800.0, "grad_norm": 1.9527990333103653, "language_loss": 0.80391973, "learning_rate": 1.7594217908329866e-07, "loss": 0.82240915, "num_input_tokens_seen": 156449890, "step": 7231, "time_per_iteration": 2.8018620014190674 }, { "auxiliary_loss_clip": 0.01127532, "auxiliary_loss_mlp": 0.01020436, "balance_loss_clip": 1.04405808, "balance_loss_mlp": 1.01401091, "epoch": 0.8695965851019059, "flos": 26139161767680.0, "grad_norm": 1.881650971551203, "language_loss": 0.74196631, "learning_rate": 1.7562284098550895e-07, "loss": 0.76344603, "num_input_tokens_seen": 156469600, "step": 7232, "time_per_iteration": 3.5820493698120117 }, { "auxiliary_loss_clip": 0.01046269, "auxiliary_loss_mlp": 0.01003031, "balance_loss_clip": 1.02234936, "balance_loss_mlp": 1.00187421, "epoch": 0.8697168279925449, "flos": 67332616456320.0, "grad_norm": 0.8388456920899294, "language_loss": 0.62257618, "learning_rate": 1.753037796442838e-07, "loss": 0.64306921, "num_input_tokens_seen": 156529040, "step": 7233, "time_per_iteration": 3.2007410526275635 }, { "auxiliary_loss_clip": 0.01168704, "auxiliary_loss_mlp": 0.01024011, "balance_loss_clip": 1.04926801, "balance_loss_mlp": 1.01675105, "epoch": 0.8698370708831841, "flos": 19718693337600.0, "grad_norm": 2.082856723745905, "language_loss": 0.75382149, "learning_rate": 1.74984995108024e-07, "loss": 0.77574861, "num_input_tokens_seen": 156546970, "step": 7234, "time_per_iteration": 2.5932633876800537 }, { "auxiliary_loss_clip": 0.01157671, "auxiliary_loss_mlp": 0.01026511, "balance_loss_clip": 1.04949284, "balance_loss_mlp": 1.01870847, "epoch": 0.8699573137738231, "flos": 12859971068160.0, "grad_norm": 2.282442348939414, "language_loss": 0.82763219, "learning_rate": 1.7466648742508981e-07, "loss": 0.84947401, "num_input_tokens_seen": 156563155, "step": 7235, "time_per_iteration": 2.533360481262207 }, { "auxiliary_loss_clip": 0.01136004, "auxiliary_loss_mlp": 0.0102816, "balance_loss_clip": 1.04776096, "balance_loss_mlp": 1.02090597, "epoch": 0.8700775566644622, "flos": 17420733768960.0, "grad_norm": 2.1789683236559156, "language_loss": 0.84380817, "learning_rate": 1.7434825664379837e-07, "loss": 0.86544979, "num_input_tokens_seen": 156581660, "step": 7236, "time_per_iteration": 2.6848113536834717 }, { "auxiliary_loss_clip": 0.01154033, "auxiliary_loss_mlp": 0.01029394, "balance_loss_clip": 1.04766393, "balance_loss_mlp": 1.02190757, "epoch": 0.8701977995551013, "flos": 13735221770880.0, "grad_norm": 3.0341681495640906, "language_loss": 0.85968155, "learning_rate": 1.740303028124246e-07, "loss": 0.88151574, "num_input_tokens_seen": 156597720, "step": 7237, "time_per_iteration": 2.547015905380249 }, { "auxiliary_loss_clip": 0.01079349, "auxiliary_loss_mlp": 0.01022841, "balance_loss_clip": 1.03850162, "balance_loss_mlp": 1.01578689, "epoch": 0.8703180424457404, "flos": 30555707362560.0, "grad_norm": 1.8911840547023488, "language_loss": 0.75690949, "learning_rate": 1.7371262597920212e-07, "loss": 0.77793139, "num_input_tokens_seen": 156619780, "step": 7238, "time_per_iteration": 2.811891794204712 }, { "auxiliary_loss_clip": 0.01101552, "auxiliary_loss_mlp": 0.0102969, "balance_loss_clip": 1.04540253, "balance_loss_mlp": 1.02297592, "epoch": 0.8704382853363795, "flos": 19608986223360.0, "grad_norm": 1.819868032161465, "language_loss": 0.76444292, "learning_rate": 1.7339522619232195e-07, "loss": 0.7857554, "num_input_tokens_seen": 156638160, "step": 7239, "time_per_iteration": 2.6671218872070312 }, { "auxiliary_loss_clip": 0.01143716, "auxiliary_loss_mlp": 0.01030492, "balance_loss_clip": 1.04504919, "balance_loss_mlp": 1.02295244, "epoch": 0.8705585282270186, "flos": 26613900846720.0, "grad_norm": 1.7598510843166288, "language_loss": 0.75408864, "learning_rate": 1.730781034999338e-07, "loss": 0.77583075, "num_input_tokens_seen": 156659740, "step": 7240, "time_per_iteration": 2.71732497215271 }, { "auxiliary_loss_clip": 0.01168068, "auxiliary_loss_mlp": 0.0102448, "balance_loss_clip": 1.05252576, "balance_loss_mlp": 1.01748836, "epoch": 0.8706787711176577, "flos": 34090465979520.0, "grad_norm": 5.065840870380606, "language_loss": 0.73374426, "learning_rate": 1.7276125795014497e-07, "loss": 0.75566971, "num_input_tokens_seen": 156678190, "step": 7241, "time_per_iteration": 2.6541051864624023 }, { "auxiliary_loss_clip": 0.01138779, "auxiliary_loss_mlp": 0.0102644, "balance_loss_clip": 1.0439527, "balance_loss_mlp": 1.0186882, "epoch": 0.8707990140082967, "flos": 14611513968000.0, "grad_norm": 2.1542947864417488, "language_loss": 0.6752848, "learning_rate": 1.7244468959102054e-07, "loss": 0.69693702, "num_input_tokens_seen": 156695245, "step": 7242, "time_per_iteration": 2.6701817512512207 }, { "auxiliary_loss_clip": 0.01153117, "auxiliary_loss_mlp": 0.01025388, "balance_loss_clip": 1.04935598, "balance_loss_mlp": 1.01776481, "epoch": 0.8709192568989359, "flos": 20084156265600.0, "grad_norm": 3.375453334891635, "language_loss": 0.85034668, "learning_rate": 1.7212839847058348e-07, "loss": 0.87213176, "num_input_tokens_seen": 156710375, "step": 7243, "time_per_iteration": 2.624861001968384 }, { "auxiliary_loss_clip": 0.0109895, "auxiliary_loss_mlp": 0.01025944, "balance_loss_clip": 1.04171324, "balance_loss_mlp": 1.01902127, "epoch": 0.871039499789575, "flos": 16727083251840.0, "grad_norm": 2.092471991560001, "language_loss": 0.73971319, "learning_rate": 1.718123846368147e-07, "loss": 0.76096219, "num_input_tokens_seen": 156729420, "step": 7244, "time_per_iteration": 2.7504048347473145 }, { "auxiliary_loss_clip": 0.01136075, "auxiliary_loss_mlp": 0.00710999, "balance_loss_clip": 1.04803705, "balance_loss_mlp": 1.00061893, "epoch": 0.871159742680214, "flos": 21068790860160.0, "grad_norm": 2.413581234433034, "language_loss": 0.71833748, "learning_rate": 1.714966481376543e-07, "loss": 0.73680824, "num_input_tokens_seen": 156746100, "step": 7245, "time_per_iteration": 2.63887882232666 }, { "auxiliary_loss_clip": 0.01152475, "auxiliary_loss_mlp": 0.01027424, "balance_loss_clip": 1.04792643, "balance_loss_mlp": 1.02024734, "epoch": 0.8712799855708532, "flos": 28256526731520.0, "grad_norm": 2.107164514842327, "language_loss": 0.82903278, "learning_rate": 1.7118118902099797e-07, "loss": 0.85083175, "num_input_tokens_seen": 156764185, "step": 7246, "time_per_iteration": 2.6261682510375977 }, { "auxiliary_loss_clip": 0.01153013, "auxiliary_loss_mlp": 0.01027161, "balance_loss_clip": 1.04806018, "balance_loss_mlp": 1.02000499, "epoch": 0.8714002284614922, "flos": 22236677665920.0, "grad_norm": 2.12715316490382, "language_loss": 0.80612057, "learning_rate": 1.7086600733470146e-07, "loss": 0.82792228, "num_input_tokens_seen": 156784855, "step": 7247, "time_per_iteration": 2.639948606491089 }, { "auxiliary_loss_clip": 0.0114933, "auxiliary_loss_mlp": 0.01020635, "balance_loss_clip": 1.04775536, "balance_loss_mlp": 1.01414406, "epoch": 0.8715204713521313, "flos": 21431919404160.0, "grad_norm": 1.8013819479717026, "language_loss": 0.77064395, "learning_rate": 1.7055110312657738e-07, "loss": 0.79234356, "num_input_tokens_seen": 156804350, "step": 7248, "time_per_iteration": 2.585934638977051 }, { "auxiliary_loss_clip": 0.01129984, "auxiliary_loss_mlp": 0.01028917, "balance_loss_clip": 1.04498386, "balance_loss_mlp": 1.02155638, "epoch": 0.8716407142427703, "flos": 23440439180160.0, "grad_norm": 2.4758823735859528, "language_loss": 0.7407515, "learning_rate": 1.702364764443962e-07, "loss": 0.76234055, "num_input_tokens_seen": 156823425, "step": 7249, "time_per_iteration": 2.7474522590637207 }, { "auxiliary_loss_clip": 0.01084273, "auxiliary_loss_mlp": 0.01030577, "balance_loss_clip": 1.03906536, "balance_loss_mlp": 1.02368104, "epoch": 0.8717609571334095, "flos": 27958683156480.0, "grad_norm": 2.0798885894858397, "language_loss": 0.72251695, "learning_rate": 1.6992212733588685e-07, "loss": 0.74366546, "num_input_tokens_seen": 156843090, "step": 7250, "time_per_iteration": 2.754838228225708 }, { "auxiliary_loss_clip": 0.01133448, "auxiliary_loss_mlp": 0.01026721, "balance_loss_clip": 1.04651928, "balance_loss_mlp": 1.019822, "epoch": 0.8718812000240486, "flos": 25479482538240.0, "grad_norm": 2.0842320895553526, "language_loss": 0.74901831, "learning_rate": 1.6960805584873538e-07, "loss": 0.77061999, "num_input_tokens_seen": 156861090, "step": 7251, "time_per_iteration": 2.678218126296997 }, { "auxiliary_loss_clip": 0.01106124, "auxiliary_loss_mlp": 0.01025171, "balance_loss_clip": 1.04238188, "balance_loss_mlp": 1.01869178, "epoch": 0.8720014429146876, "flos": 23403056100480.0, "grad_norm": 2.4080001687219013, "language_loss": 0.78384423, "learning_rate": 1.6929426203058684e-07, "loss": 0.80515718, "num_input_tokens_seen": 156881515, "step": 7252, "time_per_iteration": 2.6694297790527344 }, { "auxiliary_loss_clip": 0.01172663, "auxiliary_loss_mlp": 0.00712264, "balance_loss_clip": 1.04875469, "balance_loss_mlp": 1.00062943, "epoch": 0.8721216858053268, "flos": 24352821567360.0, "grad_norm": 2.449384926202109, "language_loss": 0.79934084, "learning_rate": 1.689807459290431e-07, "loss": 0.8181901, "num_input_tokens_seen": 156900170, "step": 7253, "time_per_iteration": 2.605619430541992 }, { "auxiliary_loss_clip": 0.01139057, "auxiliary_loss_mlp": 0.01025945, "balance_loss_clip": 1.04777467, "balance_loss_mlp": 1.01882517, "epoch": 0.8722419286959658, "flos": 33869687034240.0, "grad_norm": 2.436820089915958, "language_loss": 0.7110033, "learning_rate": 1.6866750759166437e-07, "loss": 0.73265338, "num_input_tokens_seen": 156920150, "step": 7254, "time_per_iteration": 4.626864671707153 }, { "auxiliary_loss_clip": 0.01115053, "auxiliary_loss_mlp": 0.01028295, "balance_loss_clip": 1.04056048, "balance_loss_mlp": 1.02096081, "epoch": 0.8723621715866049, "flos": 18369385914240.0, "grad_norm": 2.4313454261116076, "language_loss": 0.77274382, "learning_rate": 1.6835454706596865e-07, "loss": 0.79417735, "num_input_tokens_seen": 156937980, "step": 7255, "time_per_iteration": 2.6185359954833984 }, { "auxiliary_loss_clip": 0.0117169, "auxiliary_loss_mlp": 0.01032122, "balance_loss_clip": 1.05177808, "balance_loss_mlp": 1.02522278, "epoch": 0.8724824144772441, "flos": 22013348855040.0, "grad_norm": 2.7422356316387217, "language_loss": 0.73599094, "learning_rate": 1.680418643994317e-07, "loss": 0.75802904, "num_input_tokens_seen": 156956550, "step": 7256, "time_per_iteration": 3.4939064979553223 }, { "auxiliary_loss_clip": 0.01071239, "auxiliary_loss_mlp": 0.01002942, "balance_loss_clip": 1.01818943, "balance_loss_mlp": 1.00183952, "epoch": 0.8726026573678831, "flos": 66698720213760.0, "grad_norm": 0.8871830041463593, "language_loss": 0.64494759, "learning_rate": 1.6772945963948738e-07, "loss": 0.66568941, "num_input_tokens_seen": 157014715, "step": 7257, "time_per_iteration": 3.1805245876312256 }, { "auxiliary_loss_clip": 0.01136505, "auxiliary_loss_mlp": 0.01022886, "balance_loss_clip": 1.04825759, "balance_loss_mlp": 1.01520252, "epoch": 0.8727229002585222, "flos": 13370908078080.0, "grad_norm": 2.3333656813072388, "language_loss": 0.7720927, "learning_rate": 1.6741733283352733e-07, "loss": 0.79368663, "num_input_tokens_seen": 157032320, "step": 7258, "time_per_iteration": 3.515949010848999 }, { "auxiliary_loss_clip": 0.0110933, "auxiliary_loss_mlp": 0.0102648, "balance_loss_clip": 1.04422867, "balance_loss_mlp": 1.01926136, "epoch": 0.8728431431491613, "flos": 21796987282560.0, "grad_norm": 11.79664590658425, "language_loss": 0.83989704, "learning_rate": 1.6710548402890102e-07, "loss": 0.86125517, "num_input_tokens_seen": 157052845, "step": 7259, "time_per_iteration": 2.722398042678833 }, { "auxiliary_loss_clip": 0.01173485, "auxiliary_loss_mlp": 0.01028112, "balance_loss_clip": 1.05064368, "balance_loss_mlp": 1.02069116, "epoch": 0.8729633860398004, "flos": 36173823742080.0, "grad_norm": 2.0616685822361682, "language_loss": 0.66882706, "learning_rate": 1.6679391327291527e-07, "loss": 0.69084305, "num_input_tokens_seen": 157074050, "step": 7260, "time_per_iteration": 2.681565999984741 }, { "auxiliary_loss_clip": 0.01136312, "auxiliary_loss_mlp": 0.01028059, "balance_loss_clip": 1.04395437, "balance_loss_mlp": 1.02120757, "epoch": 0.8730836289304394, "flos": 16359680989440.0, "grad_norm": 3.0477513647907877, "language_loss": 0.68139213, "learning_rate": 1.6648262061283492e-07, "loss": 0.70303583, "num_input_tokens_seen": 157089350, "step": 7261, "time_per_iteration": 2.631913185119629 }, { "auxiliary_loss_clip": 0.01121806, "auxiliary_loss_mlp": 0.01023388, "balance_loss_clip": 1.04329324, "balance_loss_mlp": 1.01593113, "epoch": 0.8732038718210786, "flos": 21215126868480.0, "grad_norm": 2.5811930274737267, "language_loss": 0.73759449, "learning_rate": 1.6617160609588353e-07, "loss": 0.75904644, "num_input_tokens_seen": 157108525, "step": 7262, "time_per_iteration": 2.6482322216033936 }, { "auxiliary_loss_clip": 0.01139185, "auxiliary_loss_mlp": 0.01032327, "balance_loss_clip": 1.04526508, "balance_loss_mlp": 1.02487087, "epoch": 0.8733241147117177, "flos": 16610696208000.0, "grad_norm": 2.3711405079721666, "language_loss": 0.72291559, "learning_rate": 1.6586086976924163e-07, "loss": 0.74463069, "num_input_tokens_seen": 157124025, "step": 7263, "time_per_iteration": 2.6079938411712646 }, { "auxiliary_loss_clip": 0.01154109, "auxiliary_loss_mlp": 0.0102107, "balance_loss_clip": 1.0475229, "balance_loss_mlp": 1.01443291, "epoch": 0.8734443576023567, "flos": 20193935207040.0, "grad_norm": 2.4155670339805098, "language_loss": 0.78333759, "learning_rate": 1.6555041168004747e-07, "loss": 0.80508935, "num_input_tokens_seen": 157143345, "step": 7264, "time_per_iteration": 2.6168293952941895 }, { "auxiliary_loss_clip": 0.0113388, "auxiliary_loss_mlp": 0.01025594, "balance_loss_clip": 1.04566193, "balance_loss_mlp": 1.01859593, "epoch": 0.8735646004929959, "flos": 18041162411520.0, "grad_norm": 1.9450931298733267, "language_loss": 0.69126898, "learning_rate": 1.6524023187539715e-07, "loss": 0.71286374, "num_input_tokens_seen": 157161630, "step": 7265, "time_per_iteration": 2.6136577129364014 }, { "auxiliary_loss_clip": 0.01134843, "auxiliary_loss_mlp": 0.01034171, "balance_loss_clip": 1.04471326, "balance_loss_mlp": 1.02659547, "epoch": 0.873684843383635, "flos": 20262344659200.0, "grad_norm": 1.7349823170420346, "language_loss": 0.74821651, "learning_rate": 1.649303304023446e-07, "loss": 0.76990664, "num_input_tokens_seen": 157181385, "step": 7266, "time_per_iteration": 2.6014535427093506 }, { "auxiliary_loss_clip": 0.01113767, "auxiliary_loss_mlp": 0.01029868, "balance_loss_clip": 1.04451442, "balance_loss_mlp": 1.02270627, "epoch": 0.873805086274274, "flos": 16947287579520.0, "grad_norm": 2.195141538577346, "language_loss": 0.7889291, "learning_rate": 1.6462070730790246e-07, "loss": 0.81036544, "num_input_tokens_seen": 157200545, "step": 7267, "time_per_iteration": 2.650284767150879 }, { "auxiliary_loss_clip": 0.01128708, "auxiliary_loss_mlp": 0.01023474, "balance_loss_clip": 1.0415175, "balance_loss_mlp": 1.01660419, "epoch": 0.8739253291649132, "flos": 18041270152320.0, "grad_norm": 2.3264714613115594, "language_loss": 0.78590608, "learning_rate": 1.6431136263903912e-07, "loss": 0.80742788, "num_input_tokens_seen": 157219545, "step": 7268, "time_per_iteration": 2.660909652709961 }, { "auxiliary_loss_clip": 0.01154083, "auxiliary_loss_mlp": 0.00711323, "balance_loss_clip": 1.04527855, "balance_loss_mlp": 1.00052893, "epoch": 0.8740455720555522, "flos": 21325085377920.0, "grad_norm": 1.9476722814613885, "language_loss": 0.73523062, "learning_rate": 1.6400229644268282e-07, "loss": 0.75388467, "num_input_tokens_seen": 157237900, "step": 7269, "time_per_iteration": 2.607093095779419 }, { "auxiliary_loss_clip": 0.01118039, "auxiliary_loss_mlp": 0.01033763, "balance_loss_clip": 1.04773748, "balance_loss_mlp": 1.02640784, "epoch": 0.8741658149461913, "flos": 15158684822400.0, "grad_norm": 1.9191448589589646, "language_loss": 0.80927396, "learning_rate": 1.6369350876571852e-07, "loss": 0.83079195, "num_input_tokens_seen": 157256055, "step": 7270, "time_per_iteration": 2.6889569759368896 }, { "auxiliary_loss_clip": 0.0110165, "auxiliary_loss_mlp": 0.01026092, "balance_loss_clip": 1.04211903, "balance_loss_mlp": 1.019521, "epoch": 0.8742860578368304, "flos": 23039855729280.0, "grad_norm": 2.232796120708142, "language_loss": 0.81871045, "learning_rate": 1.6338499965498874e-07, "loss": 0.83998787, "num_input_tokens_seen": 157274785, "step": 7271, "time_per_iteration": 2.7058684825897217 }, { "auxiliary_loss_clip": 0.01116982, "auxiliary_loss_mlp": 0.01027525, "balance_loss_clip": 1.04259467, "balance_loss_mlp": 1.02068257, "epoch": 0.8744063007274695, "flos": 28145347159680.0, "grad_norm": 2.983304693474121, "language_loss": 0.77390635, "learning_rate": 1.630767691572943e-07, "loss": 0.79535139, "num_input_tokens_seen": 157294805, "step": 7272, "time_per_iteration": 2.767603874206543 }, { "auxiliary_loss_clip": 0.01048454, "auxiliary_loss_mlp": 0.01009934, "balance_loss_clip": 1.01782906, "balance_loss_mlp": 1.0088551, "epoch": 0.8745265436181086, "flos": 64034076654720.0, "grad_norm": 0.7534938922877458, "language_loss": 0.53502291, "learning_rate": 1.6276881731939306e-07, "loss": 0.55560678, "num_input_tokens_seen": 157356695, "step": 7273, "time_per_iteration": 3.2873458862304688 }, { "auxiliary_loss_clip": 0.01150842, "auxiliary_loss_mlp": 0.01023871, "balance_loss_clip": 1.04822004, "balance_loss_mlp": 1.01681972, "epoch": 0.8746467865087477, "flos": 28658618553600.0, "grad_norm": 1.8661022169526686, "language_loss": 0.75502527, "learning_rate": 1.6246114418800193e-07, "loss": 0.77677238, "num_input_tokens_seen": 157376975, "step": 7274, "time_per_iteration": 2.6328067779541016 }, { "auxiliary_loss_clip": 0.01148045, "auxiliary_loss_mlp": 0.01031997, "balance_loss_clip": 1.04735422, "balance_loss_mlp": 1.02426028, "epoch": 0.8747670293993868, "flos": 23985850268160.0, "grad_norm": 1.6871220942344194, "language_loss": 0.76847041, "learning_rate": 1.6215374980979423e-07, "loss": 0.79027081, "num_input_tokens_seen": 157397385, "step": 7275, "time_per_iteration": 2.6985130310058594 }, { "auxiliary_loss_clip": 0.01150615, "auxiliary_loss_mlp": 0.0102654, "balance_loss_clip": 1.04979515, "balance_loss_mlp": 1.01963234, "epoch": 0.8748872722900258, "flos": 45221624478720.0, "grad_norm": 27.660511966507237, "language_loss": 0.68858945, "learning_rate": 1.6184663423140133e-07, "loss": 0.71036106, "num_input_tokens_seen": 157417685, "step": 7276, "time_per_iteration": 2.785284996032715 }, { "auxiliary_loss_clip": 0.01108119, "auxiliary_loss_mlp": 0.01028865, "balance_loss_clip": 1.04392409, "balance_loss_mlp": 1.02214503, "epoch": 0.875007515180665, "flos": 19754280737280.0, "grad_norm": 4.651265764921924, "language_loss": 0.64141738, "learning_rate": 1.615397974994126e-07, "loss": 0.6627872, "num_input_tokens_seen": 157435490, "step": 7277, "time_per_iteration": 2.7030789852142334 }, { "auxiliary_loss_clip": 0.01165973, "auxiliary_loss_mlp": 0.01025981, "balance_loss_clip": 1.04885399, "balance_loss_mlp": 1.01872373, "epoch": 0.875127758071304, "flos": 22710734386560.0, "grad_norm": 2.266625577597652, "language_loss": 0.8078264, "learning_rate": 1.6123323966037438e-07, "loss": 0.82974589, "num_input_tokens_seen": 157454010, "step": 7278, "time_per_iteration": 2.618687152862549 }, { "auxiliary_loss_clip": 0.01168262, "auxiliary_loss_mlp": 0.01026352, "balance_loss_clip": 1.04976106, "balance_loss_mlp": 1.01925349, "epoch": 0.8752480009619431, "flos": 23403846199680.0, "grad_norm": 1.9582025732912356, "language_loss": 0.7897799, "learning_rate": 1.6092696076079216e-07, "loss": 0.81172603, "num_input_tokens_seen": 157472385, "step": 7279, "time_per_iteration": 2.580272912979126 }, { "auxiliary_loss_clip": 0.01110319, "auxiliary_loss_mlp": 0.01021075, "balance_loss_clip": 1.04370558, "balance_loss_mlp": 1.01459026, "epoch": 0.8753682438525822, "flos": 26213101914240.0, "grad_norm": 1.6602956875180086, "language_loss": 0.74181414, "learning_rate": 1.6062096084712785e-07, "loss": 0.7631281, "num_input_tokens_seen": 157493735, "step": 7280, "time_per_iteration": 4.40845251083374 }, { "auxiliary_loss_clip": 0.01126046, "auxiliary_loss_mlp": 0.0071182, "balance_loss_clip": 1.04208827, "balance_loss_mlp": 1.00056434, "epoch": 0.8754884867432213, "flos": 23326745656320.0, "grad_norm": 2.0781810646684984, "language_loss": 0.70699227, "learning_rate": 1.6031523996580098e-07, "loss": 0.725371, "num_input_tokens_seen": 157511295, "step": 7281, "time_per_iteration": 2.6318771839141846 }, { "auxiliary_loss_clip": 0.01128503, "auxiliary_loss_mlp": 0.01027002, "balance_loss_clip": 1.04493999, "balance_loss_mlp": 1.01936638, "epoch": 0.8756087296338604, "flos": 12495226412160.0, "grad_norm": 2.3471359027771936, "language_loss": 0.6649071, "learning_rate": 1.6000979816318981e-07, "loss": 0.68646216, "num_input_tokens_seen": 157529760, "step": 7282, "time_per_iteration": 3.57239031791687 }, { "auxiliary_loss_clip": 0.01148844, "auxiliary_loss_mlp": 0.01025161, "balance_loss_clip": 1.04861629, "balance_loss_mlp": 1.01766896, "epoch": 0.8757289725244994, "flos": 18952898353920.0, "grad_norm": 2.8066477366370406, "language_loss": 0.74977887, "learning_rate": 1.5970463548562886e-07, "loss": 0.77151895, "num_input_tokens_seen": 157548915, "step": 7283, "time_per_iteration": 2.558089256286621 }, { "auxiliary_loss_clip": 0.01135717, "auxiliary_loss_mlp": 0.01027992, "balance_loss_clip": 1.04701185, "balance_loss_mlp": 1.02084219, "epoch": 0.8758492154151386, "flos": 25265958140160.0, "grad_norm": 1.8505562980279937, "language_loss": 0.7123639, "learning_rate": 1.5939975197941192e-07, "loss": 0.73400104, "num_input_tokens_seen": 157570570, "step": 7284, "time_per_iteration": 3.6199185848236084 }, { "auxiliary_loss_clip": 0.01045968, "auxiliary_loss_mlp": 0.01004698, "balance_loss_clip": 1.01676106, "balance_loss_mlp": 1.00375068, "epoch": 0.8759694583057777, "flos": 65571664193280.0, "grad_norm": 0.8099924494922676, "language_loss": 0.53314996, "learning_rate": 1.5909514769078892e-07, "loss": 0.55365658, "num_input_tokens_seen": 157635675, "step": 7285, "time_per_iteration": 3.2988295555114746 }, { "auxiliary_loss_clip": 0.01114715, "auxiliary_loss_mlp": 0.0102973, "balance_loss_clip": 1.04820871, "balance_loss_mlp": 1.02270293, "epoch": 0.8760897011964167, "flos": 25446193608960.0, "grad_norm": 1.659910020101776, "language_loss": 0.77559292, "learning_rate": 1.5879082266596867e-07, "loss": 0.79703736, "num_input_tokens_seen": 157657015, "step": 7286, "time_per_iteration": 2.7213709354400635 }, { "auxiliary_loss_clip": 0.0112983, "auxiliary_loss_mlp": 0.01022602, "balance_loss_clip": 1.04270554, "balance_loss_mlp": 1.01609874, "epoch": 0.8762099440870559, "flos": 28984830894720.0, "grad_norm": 2.108938634950352, "language_loss": 0.72345054, "learning_rate": 1.5848677695111645e-07, "loss": 0.74497491, "num_input_tokens_seen": 157678615, "step": 7287, "time_per_iteration": 2.7265961170196533 }, { "auxiliary_loss_clip": 0.01127886, "auxiliary_loss_mlp": 0.01025751, "balance_loss_clip": 1.04548383, "balance_loss_mlp": 1.01822853, "epoch": 0.8763301869776949, "flos": 21609461352960.0, "grad_norm": 3.3036933650234017, "language_loss": 0.69747484, "learning_rate": 1.5818301059235562e-07, "loss": 0.71901119, "num_input_tokens_seen": 157693790, "step": 7288, "time_per_iteration": 2.621242046356201 }, { "auxiliary_loss_clip": 0.01143046, "auxiliary_loss_mlp": 0.01028458, "balance_loss_clip": 1.05061269, "balance_loss_mlp": 1.02073312, "epoch": 0.876450429868334, "flos": 24644416176000.0, "grad_norm": 1.7776857945380315, "language_loss": 0.81582642, "learning_rate": 1.578795236357684e-07, "loss": 0.83754146, "num_input_tokens_seen": 157715255, "step": 7289, "time_per_iteration": 2.711740732192993 }, { "auxiliary_loss_clip": 0.01138688, "auxiliary_loss_mlp": 0.01032478, "balance_loss_clip": 1.04932642, "balance_loss_mlp": 1.02530789, "epoch": 0.8765706727589732, "flos": 20260046188800.0, "grad_norm": 2.041149620286163, "language_loss": 0.85517913, "learning_rate": 1.5757631612739218e-07, "loss": 0.87689078, "num_input_tokens_seen": 157728800, "step": 7290, "time_per_iteration": 2.602802276611328 }, { "auxiliary_loss_clip": 0.01070216, "auxiliary_loss_mlp": 0.01002906, "balance_loss_clip": 1.01865649, "balance_loss_mlp": 1.00195217, "epoch": 0.8766909156496122, "flos": 71371165276800.0, "grad_norm": 0.779581904794382, "language_loss": 0.61414981, "learning_rate": 1.572733881132242e-07, "loss": 0.63488114, "num_input_tokens_seen": 157789445, "step": 7291, "time_per_iteration": 3.2356793880462646 }, { "auxiliary_loss_clip": 0.01035161, "auxiliary_loss_mlp": 0.01001157, "balance_loss_clip": 1.02138448, "balance_loss_mlp": 0.99997121, "epoch": 0.8768111585402513, "flos": 69523490603520.0, "grad_norm": 0.7792260612753723, "language_loss": 0.58500719, "learning_rate": 1.5697073963921814e-07, "loss": 0.60537034, "num_input_tokens_seen": 157848685, "step": 7292, "time_per_iteration": 3.1843812465667725 }, { "auxiliary_loss_clip": 0.01156137, "auxiliary_loss_mlp": 0.01024267, "balance_loss_clip": 1.05020571, "balance_loss_mlp": 1.01706052, "epoch": 0.8769314014308904, "flos": 18838558385280.0, "grad_norm": 2.370672822146344, "language_loss": 0.8518247, "learning_rate": 1.566683707512857e-07, "loss": 0.87362874, "num_input_tokens_seen": 157866360, "step": 7293, "time_per_iteration": 2.580190420150757 }, { "auxiliary_loss_clip": 0.01133818, "auxiliary_loss_mlp": 0.01025458, "balance_loss_clip": 1.0469799, "balance_loss_mlp": 1.0177722, "epoch": 0.8770516443215295, "flos": 14976402278400.0, "grad_norm": 2.145703288757798, "language_loss": 0.7958777, "learning_rate": 1.5636628149529553e-07, "loss": 0.81747043, "num_input_tokens_seen": 157884150, "step": 7294, "time_per_iteration": 2.6264803409576416 }, { "auxiliary_loss_clip": 0.01133064, "auxiliary_loss_mlp": 0.01028799, "balance_loss_clip": 1.04348576, "balance_loss_mlp": 1.02177715, "epoch": 0.8771718872121685, "flos": 31649654021760.0, "grad_norm": 2.454244264273986, "language_loss": 0.79492611, "learning_rate": 1.560644719170743e-07, "loss": 0.81654465, "num_input_tokens_seen": 157905020, "step": 7295, "time_per_iteration": 2.695261240005493 }, { "auxiliary_loss_clip": 0.01119138, "auxiliary_loss_mlp": 0.01035149, "balance_loss_clip": 1.04235494, "balance_loss_mlp": 1.0280323, "epoch": 0.8772921301028077, "flos": 36095466222720.0, "grad_norm": 2.757709279923602, "language_loss": 0.71863329, "learning_rate": 1.5576294206240692e-07, "loss": 0.74017608, "num_input_tokens_seen": 157924545, "step": 7296, "time_per_iteration": 2.812222480773926 }, { "auxiliary_loss_clip": 0.01133995, "auxiliary_loss_mlp": 0.01025905, "balance_loss_clip": 1.04658294, "balance_loss_mlp": 1.01879108, "epoch": 0.8774123729934468, "flos": 57116961849600.0, "grad_norm": 1.6730914620723163, "language_loss": 0.67786229, "learning_rate": 1.5546169197703507e-07, "loss": 0.69946134, "num_input_tokens_seen": 157950820, "step": 7297, "time_per_iteration": 2.952279567718506 }, { "auxiliary_loss_clip": 0.01139395, "auxiliary_loss_mlp": 0.01025204, "balance_loss_clip": 1.04323566, "balance_loss_mlp": 1.01822138, "epoch": 0.8775326158840858, "flos": 23914495900800.0, "grad_norm": 2.615461395549173, "language_loss": 0.77282667, "learning_rate": 1.5516072170665774e-07, "loss": 0.79447258, "num_input_tokens_seen": 157968790, "step": 7298, "time_per_iteration": 2.681488275527954 }, { "auxiliary_loss_clip": 0.01154669, "auxiliary_loss_mlp": 0.01022192, "balance_loss_clip": 1.04874671, "balance_loss_mlp": 1.01512039, "epoch": 0.877652858774725, "flos": 17123285243520.0, "grad_norm": 2.103035572469119, "language_loss": 0.8718971, "learning_rate": 1.5486003129693214e-07, "loss": 0.89366579, "num_input_tokens_seen": 157986155, "step": 7299, "time_per_iteration": 2.56235408782959 }, { "auxiliary_loss_clip": 0.01156521, "auxiliary_loss_mlp": 0.01026645, "balance_loss_clip": 1.04906642, "balance_loss_mlp": 1.01960301, "epoch": 0.877773101665364, "flos": 16508961912960.0, "grad_norm": 1.916118671664, "language_loss": 0.77930903, "learning_rate": 1.545596207934725e-07, "loss": 0.80114067, "num_input_tokens_seen": 158004640, "step": 7300, "time_per_iteration": 2.6028032302856445 }, { "auxiliary_loss_clip": 0.01131354, "auxiliary_loss_mlp": 0.01025238, "balance_loss_clip": 1.0445627, "balance_loss_mlp": 1.01869965, "epoch": 0.8778933445560031, "flos": 22053209973120.0, "grad_norm": 1.872392447794172, "language_loss": 0.77932286, "learning_rate": 1.5425949024185147e-07, "loss": 0.80088884, "num_input_tokens_seen": 158024665, "step": 7301, "time_per_iteration": 2.647583484649658 }, { "auxiliary_loss_clip": 0.01135214, "auxiliary_loss_mlp": 0.01021726, "balance_loss_clip": 1.04254234, "balance_loss_mlp": 1.01451707, "epoch": 0.8780135874466423, "flos": 22564757514240.0, "grad_norm": 1.8479295446313153, "language_loss": 0.67393041, "learning_rate": 1.5395963968759818e-07, "loss": 0.69549978, "num_input_tokens_seen": 158044940, "step": 7302, "time_per_iteration": 2.611475944519043 }, { "auxiliary_loss_clip": 0.01134183, "auxiliary_loss_mlp": 0.01024077, "balance_loss_clip": 1.04319441, "balance_loss_mlp": 1.01703739, "epoch": 0.8781338303372813, "flos": 61531999073280.0, "grad_norm": 1.5677130096921903, "language_loss": 0.64493513, "learning_rate": 1.536600691761998e-07, "loss": 0.66651773, "num_input_tokens_seen": 158070770, "step": 7303, "time_per_iteration": 2.9709370136260986 }, { "auxiliary_loss_clip": 0.01122858, "auxiliary_loss_mlp": 0.01027306, "balance_loss_clip": 1.04703903, "balance_loss_mlp": 1.02034712, "epoch": 0.8782540732279204, "flos": 22674751937280.0, "grad_norm": 2.126221187440369, "language_loss": 0.71766496, "learning_rate": 1.5336077875310084e-07, "loss": 0.73916662, "num_input_tokens_seen": 158089995, "step": 7304, "time_per_iteration": 2.662194013595581 }, { "auxiliary_loss_clip": 0.01111175, "auxiliary_loss_mlp": 0.01025523, "balance_loss_clip": 1.04465687, "balance_loss_mlp": 1.01870465, "epoch": 0.8783743161185595, "flos": 16070348937600.0, "grad_norm": 1.977957916139928, "language_loss": 0.73882574, "learning_rate": 1.5306176846370321e-07, "loss": 0.76019275, "num_input_tokens_seen": 158108140, "step": 7305, "time_per_iteration": 2.7116734981536865 }, { "auxiliary_loss_clip": 0.01142956, "auxiliary_loss_mlp": 0.01025833, "balance_loss_clip": 1.04557729, "balance_loss_mlp": 1.01870453, "epoch": 0.8784945590091986, "flos": 26067879227520.0, "grad_norm": 2.624177981695129, "language_loss": 0.74020183, "learning_rate": 1.5276303835336712e-07, "loss": 0.7618897, "num_input_tokens_seen": 158128680, "step": 7306, "time_per_iteration": 3.588056802749634 }, { "auxiliary_loss_clip": 0.01060621, "auxiliary_loss_mlp": 0.01007275, "balance_loss_clip": 1.01853371, "balance_loss_mlp": 1.00616634, "epoch": 0.8786148018998376, "flos": 62720643939840.0, "grad_norm": 0.7631272349333956, "language_loss": 0.53522825, "learning_rate": 1.524645884674094e-07, "loss": 0.55590725, "num_input_tokens_seen": 158185610, "step": 7307, "time_per_iteration": 3.1793875694274902 }, { "auxiliary_loss_clip": 0.01171742, "auxiliary_loss_mlp": 0.00711715, "balance_loss_clip": 1.05015993, "balance_loss_mlp": 1.00056458, "epoch": 0.8787350447904768, "flos": 21652734263040.0, "grad_norm": 2.4377680243212523, "language_loss": 0.79458177, "learning_rate": 1.521664188511047e-07, "loss": 0.8134163, "num_input_tokens_seen": 158205635, "step": 7308, "time_per_iteration": 3.5651416778564453 }, { "auxiliary_loss_clip": 0.01138522, "auxiliary_loss_mlp": 0.00711706, "balance_loss_clip": 1.04905427, "balance_loss_mlp": 1.00068331, "epoch": 0.8788552876811159, "flos": 25478476957440.0, "grad_norm": 2.228956402253443, "language_loss": 0.80466807, "learning_rate": 1.518685295496851e-07, "loss": 0.8231703, "num_input_tokens_seen": 158223495, "step": 7309, "time_per_iteration": 3.540052652359009 }, { "auxiliary_loss_clip": 0.01151497, "auxiliary_loss_mlp": 0.01030642, "balance_loss_clip": 1.04603601, "balance_loss_mlp": 1.02366519, "epoch": 0.8789755305717549, "flos": 22310222762880.0, "grad_norm": 1.8625544502545086, "language_loss": 0.85482162, "learning_rate": 1.5157092060833975e-07, "loss": 0.87664306, "num_input_tokens_seen": 158243145, "step": 7310, "time_per_iteration": 2.6647891998291016 }, { "auxiliary_loss_clip": 0.01134788, "auxiliary_loss_mlp": 0.01021182, "balance_loss_clip": 1.04533172, "balance_loss_mlp": 1.01455426, "epoch": 0.879095773462394, "flos": 29310971408640.0, "grad_norm": 1.5918754965258861, "language_loss": 0.66044629, "learning_rate": 1.5127359207221658e-07, "loss": 0.68200594, "num_input_tokens_seen": 158262625, "step": 7311, "time_per_iteration": 2.6485180854797363 }, { "auxiliary_loss_clip": 0.01076134, "auxiliary_loss_mlp": 0.01024531, "balance_loss_clip": 1.03609395, "balance_loss_mlp": 1.01705623, "epoch": 0.8792160163530331, "flos": 16690023394560.0, "grad_norm": 2.172699786389227, "language_loss": 0.73228389, "learning_rate": 1.5097654398641923e-07, "loss": 0.75329053, "num_input_tokens_seen": 158280530, "step": 7312, "time_per_iteration": 2.7807912826538086 }, { "auxiliary_loss_clip": 0.01157508, "auxiliary_loss_mlp": 0.01025575, "balance_loss_clip": 1.04901576, "balance_loss_mlp": 1.01882529, "epoch": 0.8793362592436722, "flos": 24499301230080.0, "grad_norm": 1.3503530319270973, "language_loss": 0.73129719, "learning_rate": 1.5067977639601014e-07, "loss": 0.75312805, "num_input_tokens_seen": 158303290, "step": 7313, "time_per_iteration": 2.638730764389038 }, { "auxiliary_loss_clip": 0.01136694, "auxiliary_loss_mlp": 0.01024791, "balance_loss_clip": 1.04843712, "balance_loss_mlp": 1.01802325, "epoch": 0.8794565021343113, "flos": 14538399834240.0, "grad_norm": 2.2583710904733008, "language_loss": 0.71215951, "learning_rate": 1.5038328934600864e-07, "loss": 0.73377436, "num_input_tokens_seen": 158319925, "step": 7314, "time_per_iteration": 2.6358654499053955 }, { "auxiliary_loss_clip": 0.01138379, "auxiliary_loss_mlp": 0.01026188, "balance_loss_clip": 1.04830599, "balance_loss_mlp": 1.01876414, "epoch": 0.8795767450249504, "flos": 39530286224640.0, "grad_norm": 3.281254082736209, "language_loss": 0.69682872, "learning_rate": 1.5008708288139161e-07, "loss": 0.71847439, "num_input_tokens_seen": 158342285, "step": 7315, "time_per_iteration": 2.827544689178467 }, { "auxiliary_loss_clip": 0.01153092, "auxiliary_loss_mlp": 0.01030194, "balance_loss_clip": 1.04900849, "balance_loss_mlp": 1.02303505, "epoch": 0.8796969879155895, "flos": 22960672197120.0, "grad_norm": 2.6774841307588892, "language_loss": 0.73477507, "learning_rate": 1.497911570470931e-07, "loss": 0.75660795, "num_input_tokens_seen": 158362290, "step": 7316, "time_per_iteration": 2.5882771015167236 }, { "auxiliary_loss_clip": 0.01112215, "auxiliary_loss_mlp": 0.01030823, "balance_loss_clip": 1.04406273, "balance_loss_mlp": 1.02379608, "epoch": 0.8798172308062285, "flos": 28362427004160.0, "grad_norm": 1.7478636923184137, "language_loss": 0.86028326, "learning_rate": 1.494955118880048e-07, "loss": 0.88171363, "num_input_tokens_seen": 158383275, "step": 7317, "time_per_iteration": 2.710239887237549 }, { "auxiliary_loss_clip": 0.01150422, "auxiliary_loss_mlp": 0.01022497, "balance_loss_clip": 1.04564106, "balance_loss_mlp": 1.01599097, "epoch": 0.8799374736968677, "flos": 23988974751360.0, "grad_norm": 1.6786227667013176, "language_loss": 0.73358476, "learning_rate": 1.4920014744897634e-07, "loss": 0.75531399, "num_input_tokens_seen": 158402690, "step": 7318, "time_per_iteration": 2.610226631164551 }, { "auxiliary_loss_clip": 0.01129042, "auxiliary_loss_mlp": 0.01033138, "balance_loss_clip": 1.0462184, "balance_loss_mlp": 1.02658749, "epoch": 0.8800577165875068, "flos": 25630271832960.0, "grad_norm": 1.7506703557995285, "language_loss": 0.85962892, "learning_rate": 1.4890506377481392e-07, "loss": 0.88125068, "num_input_tokens_seen": 158421780, "step": 7319, "time_per_iteration": 2.6640987396240234 }, { "auxiliary_loss_clip": 0.01087061, "auxiliary_loss_mlp": 0.01024188, "balance_loss_clip": 1.04219234, "balance_loss_mlp": 1.01769447, "epoch": 0.8801779594781458, "flos": 23440331439360.0, "grad_norm": 1.4454008436505628, "language_loss": 0.63984883, "learning_rate": 1.486102609102815e-07, "loss": 0.66096133, "num_input_tokens_seen": 158442330, "step": 7320, "time_per_iteration": 2.7245428562164307 }, { "auxiliary_loss_clip": 0.01131218, "auxiliary_loss_mlp": 0.01025344, "balance_loss_clip": 1.04562831, "balance_loss_mlp": 1.01850104, "epoch": 0.880298202368785, "flos": 11508580656000.0, "grad_norm": 3.084520962432204, "language_loss": 0.85879743, "learning_rate": 1.483157389001004e-07, "loss": 0.88036299, "num_input_tokens_seen": 158459890, "step": 7321, "time_per_iteration": 2.708832025527954 }, { "auxiliary_loss_clip": 0.01134163, "auxiliary_loss_mlp": 0.01026741, "balance_loss_clip": 1.04388273, "balance_loss_mlp": 1.01886773, "epoch": 0.880418445259424, "flos": 22671447886080.0, "grad_norm": 2.724914032088588, "language_loss": 0.7904641, "learning_rate": 1.4802149778894933e-07, "loss": 0.81207317, "num_input_tokens_seen": 158478680, "step": 7322, "time_per_iteration": 2.6211485862731934 }, { "auxiliary_loss_clip": 0.01142575, "auxiliary_loss_mlp": 0.01026524, "balance_loss_clip": 1.04326153, "balance_loss_mlp": 1.0196991, "epoch": 0.8805386881500631, "flos": 20522158709760.0, "grad_norm": 1.9876412999934248, "language_loss": 0.87779427, "learning_rate": 1.4772753762146484e-07, "loss": 0.89948529, "num_input_tokens_seen": 158497935, "step": 7323, "time_per_iteration": 2.5980467796325684 }, { "auxiliary_loss_clip": 0.01146277, "auxiliary_loss_mlp": 0.01024254, "balance_loss_clip": 1.04480422, "balance_loss_mlp": 1.01680684, "epoch": 0.8806589310407023, "flos": 36538891620480.0, "grad_norm": 1.607663210277943, "language_loss": 0.70437479, "learning_rate": 1.474338584422401e-07, "loss": 0.72608006, "num_input_tokens_seen": 158523145, "step": 7324, "time_per_iteration": 2.693432092666626 }, { "auxiliary_loss_clip": 0.01150789, "auxiliary_loss_mlp": 0.01023572, "balance_loss_clip": 1.04831254, "balance_loss_mlp": 1.0170157, "epoch": 0.8807791739313413, "flos": 23440187784960.0, "grad_norm": 1.9972798509030012, "language_loss": 0.76039052, "learning_rate": 1.4714046029582595e-07, "loss": 0.78213418, "num_input_tokens_seen": 158542210, "step": 7325, "time_per_iteration": 2.6366682052612305 }, { "auxiliary_loss_clip": 0.01123584, "auxiliary_loss_mlp": 0.01025896, "balance_loss_clip": 1.04438388, "balance_loss_mlp": 1.01842475, "epoch": 0.8808994168219804, "flos": 25956843310080.0, "grad_norm": 1.968849894322375, "language_loss": 0.75885427, "learning_rate": 1.46847343226731e-07, "loss": 0.78034902, "num_input_tokens_seen": 158563250, "step": 7326, "time_per_iteration": 2.7077600955963135 }, { "auxiliary_loss_clip": 0.01156956, "auxiliary_loss_mlp": 0.01022402, "balance_loss_clip": 1.0493933, "balance_loss_mlp": 1.01573193, "epoch": 0.8810196597126195, "flos": 17092079303040.0, "grad_norm": 2.3860289063218096, "language_loss": 0.69535995, "learning_rate": 1.465545072794203e-07, "loss": 0.71715343, "num_input_tokens_seen": 158581125, "step": 7327, "time_per_iteration": 2.620776653289795 }, { "auxiliary_loss_clip": 0.01102662, "auxiliary_loss_mlp": 0.01027703, "balance_loss_clip": 1.04520845, "balance_loss_mlp": 1.02094376, "epoch": 0.8811399026032586, "flos": 23002831785600.0, "grad_norm": 1.7253617019234082, "language_loss": 0.75890934, "learning_rate": 1.4626195249831774e-07, "loss": 0.780213, "num_input_tokens_seen": 158602025, "step": 7328, "time_per_iteration": 2.80126953125 }, { "auxiliary_loss_clip": 0.01148709, "auxiliary_loss_mlp": 0.01025286, "balance_loss_clip": 1.04516113, "balance_loss_mlp": 1.01855087, "epoch": 0.8812601454938976, "flos": 14463813242880.0, "grad_norm": 3.9922263401075533, "language_loss": 0.72126245, "learning_rate": 1.4596967892780244e-07, "loss": 0.74300241, "num_input_tokens_seen": 158618355, "step": 7329, "time_per_iteration": 2.569120168685913 }, { "auxiliary_loss_clip": 0.01167164, "auxiliary_loss_mlp": 0.01025878, "balance_loss_clip": 1.05027878, "balance_loss_mlp": 1.01912498, "epoch": 0.8813803883845368, "flos": 22493223578880.0, "grad_norm": 1.8901409478196083, "language_loss": 0.74620491, "learning_rate": 1.4567768661221314e-07, "loss": 0.76813537, "num_input_tokens_seen": 158638925, "step": 7330, "time_per_iteration": 2.5867760181427 }, { "auxiliary_loss_clip": 0.0115424, "auxiliary_loss_mlp": 0.00711482, "balance_loss_clip": 1.04761434, "balance_loss_mlp": 1.00059736, "epoch": 0.8815006312751759, "flos": 21506901045120.0, "grad_norm": 1.9825792403607811, "language_loss": 0.74700034, "learning_rate": 1.4538597559584442e-07, "loss": 0.76565754, "num_input_tokens_seen": 158656715, "step": 7331, "time_per_iteration": 2.6004459857940674 }, { "auxiliary_loss_clip": 0.01131264, "auxiliary_loss_mlp": 0.01028989, "balance_loss_clip": 1.04477942, "balance_loss_mlp": 1.02161586, "epoch": 0.8816208741658149, "flos": 22784566792320.0, "grad_norm": 2.892583334545451, "language_loss": 0.78869808, "learning_rate": 1.4509454592294823e-07, "loss": 0.81030065, "num_input_tokens_seen": 158677200, "step": 7332, "time_per_iteration": 3.608151912689209 }, { "auxiliary_loss_clip": 0.01122134, "auxiliary_loss_mlp": 0.0071125, "balance_loss_clip": 1.04557443, "balance_loss_mlp": 1.00068557, "epoch": 0.8817411170564541, "flos": 17779409026560.0, "grad_norm": 2.9239132541308646, "language_loss": 0.78741729, "learning_rate": 1.448033976377354e-07, "loss": 0.8057512, "num_input_tokens_seen": 158692185, "step": 7333, "time_per_iteration": 2.6231746673583984 }, { "auxiliary_loss_clip": 0.01153949, "auxiliary_loss_mlp": 0.01026874, "balance_loss_clip": 1.0456593, "balance_loss_mlp": 1.02022815, "epoch": 0.8818613599470931, "flos": 18551812112640.0, "grad_norm": 1.9717115305050332, "language_loss": 0.74336684, "learning_rate": 1.445125307843713e-07, "loss": 0.7651751, "num_input_tokens_seen": 158710410, "step": 7334, "time_per_iteration": 2.6137495040893555 }, { "auxiliary_loss_clip": 0.0115116, "auxiliary_loss_mlp": 0.01027988, "balance_loss_clip": 1.04942155, "balance_loss_mlp": 1.02116358, "epoch": 0.8819816028377322, "flos": 27599792417280.0, "grad_norm": 1.6703099516015372, "language_loss": 0.75787151, "learning_rate": 1.442219454069813e-07, "loss": 0.77966303, "num_input_tokens_seen": 158731435, "step": 7335, "time_per_iteration": 4.41896915435791 }, { "auxiliary_loss_clip": 0.01108447, "auxiliary_loss_mlp": 0.0102226, "balance_loss_clip": 1.0437609, "balance_loss_mlp": 1.01628208, "epoch": 0.8821018457283714, "flos": 23404600385280.0, "grad_norm": 2.106450098985086, "language_loss": 0.66820371, "learning_rate": 1.4393164154964676e-07, "loss": 0.68951082, "num_input_tokens_seen": 158750965, "step": 7336, "time_per_iteration": 2.682506799697876 }, { "auxiliary_loss_clip": 0.01150976, "auxiliary_loss_mlp": 0.01025909, "balance_loss_clip": 1.0476594, "balance_loss_mlp": 1.01934695, "epoch": 0.8822220886190104, "flos": 29132459792640.0, "grad_norm": 2.120912522440296, "language_loss": 0.94088978, "learning_rate": 1.4364161925640649e-07, "loss": 0.96265864, "num_input_tokens_seen": 158772365, "step": 7337, "time_per_iteration": 2.6785664558410645 }, { "auxiliary_loss_clip": 0.01168053, "auxiliary_loss_mlp": 0.01028053, "balance_loss_clip": 1.05015159, "balance_loss_mlp": 1.02135587, "epoch": 0.8823423315096495, "flos": 20485422074880.0, "grad_norm": 2.2313831606245422, "language_loss": 0.85346842, "learning_rate": 1.4335187857125663e-07, "loss": 0.87542951, "num_input_tokens_seen": 158791065, "step": 7338, "time_per_iteration": 2.533735513687134 }, { "auxiliary_loss_clip": 0.01153238, "auxiliary_loss_mlp": 0.01025787, "balance_loss_clip": 1.04717994, "balance_loss_mlp": 1.01881886, "epoch": 0.8824625744002886, "flos": 24206377818240.0, "grad_norm": 1.6542484196240452, "language_loss": 0.75722992, "learning_rate": 1.4306241953815023e-07, "loss": 0.77902019, "num_input_tokens_seen": 158812125, "step": 7339, "time_per_iteration": 2.6716766357421875 }, { "auxiliary_loss_clip": 0.01156162, "auxiliary_loss_mlp": 0.01022896, "balance_loss_clip": 1.04867649, "balance_loss_mlp": 1.01635396, "epoch": 0.8825828172909277, "flos": 24679500785280.0, "grad_norm": 2.3229547965632076, "language_loss": 0.70913875, "learning_rate": 1.4277324220099862e-07, "loss": 0.73092937, "num_input_tokens_seen": 158834035, "step": 7340, "time_per_iteration": 2.6422970294952393 }, { "auxiliary_loss_clip": 0.01116121, "auxiliary_loss_mlp": 0.01022287, "balance_loss_clip": 1.04304361, "balance_loss_mlp": 1.01542616, "epoch": 0.8827030601815667, "flos": 22456163721600.0, "grad_norm": 1.9452447213200854, "language_loss": 0.74374712, "learning_rate": 1.4248434660366938e-07, "loss": 0.76513124, "num_input_tokens_seen": 158853510, "step": 7341, "time_per_iteration": 2.6916208267211914 }, { "auxiliary_loss_clip": 0.0113864, "auxiliary_loss_mlp": 0.0102982, "balance_loss_clip": 1.04843533, "balance_loss_mlp": 1.02318883, "epoch": 0.8828233030722058, "flos": 19865639877120.0, "grad_norm": 1.7607489941388852, "language_loss": 0.70444274, "learning_rate": 1.4219573278998808e-07, "loss": 0.72612733, "num_input_tokens_seen": 158871970, "step": 7342, "time_per_iteration": 2.6322779655456543 }, { "auxiliary_loss_clip": 0.01133182, "auxiliary_loss_mlp": 0.01026831, "balance_loss_clip": 1.04398024, "balance_loss_mlp": 1.01943099, "epoch": 0.882943545962845, "flos": 39347213581440.0, "grad_norm": 3.0729323503794057, "language_loss": 0.65208644, "learning_rate": 1.4190740080373685e-07, "loss": 0.67368662, "num_input_tokens_seen": 158892250, "step": 7343, "time_per_iteration": 2.8127663135528564 }, { "auxiliary_loss_clip": 0.01105913, "auxiliary_loss_mlp": 0.01023109, "balance_loss_clip": 1.04643238, "balance_loss_mlp": 1.01629043, "epoch": 0.883063788853484, "flos": 19054524908160.0, "grad_norm": 1.9863405377667, "language_loss": 0.84252298, "learning_rate": 1.4161935068865538e-07, "loss": 0.86381322, "num_input_tokens_seen": 158907395, "step": 7344, "time_per_iteration": 2.6809349060058594 }, { "auxiliary_loss_clip": 0.01169758, "auxiliary_loss_mlp": 0.01025339, "balance_loss_clip": 1.05012202, "balance_loss_mlp": 1.01768613, "epoch": 0.8831840317441231, "flos": 18733196816640.0, "grad_norm": 1.9139815260189181, "language_loss": 0.75413281, "learning_rate": 1.4133158248844113e-07, "loss": 0.77608383, "num_input_tokens_seen": 158926300, "step": 7345, "time_per_iteration": 2.5314395427703857 }, { "auxiliary_loss_clip": 0.01121918, "auxiliary_loss_mlp": 0.01029335, "balance_loss_clip": 1.04298472, "balance_loss_mlp": 1.02231991, "epoch": 0.8833042746347622, "flos": 26827712553600.0, "grad_norm": 1.9814019620830075, "language_loss": 0.73438478, "learning_rate": 1.4104409624674785e-07, "loss": 0.75589728, "num_input_tokens_seen": 158946085, "step": 7346, "time_per_iteration": 2.712254524230957 }, { "auxiliary_loss_clip": 0.0115647, "auxiliary_loss_mlp": 0.01025607, "balance_loss_clip": 1.05212939, "balance_loss_mlp": 1.01840711, "epoch": 0.8834245175254013, "flos": 26104077158400.0, "grad_norm": 1.8900982595273028, "language_loss": 0.78510761, "learning_rate": 1.407568920071873e-07, "loss": 0.8069284, "num_input_tokens_seen": 158964950, "step": 7347, "time_per_iteration": 2.621270179748535 }, { "auxiliary_loss_clip": 0.01174217, "auxiliary_loss_mlp": 0.01031991, "balance_loss_clip": 1.05161226, "balance_loss_mlp": 1.02441573, "epoch": 0.8835447604160404, "flos": 30629036977920.0, "grad_norm": 4.490670669222491, "language_loss": 0.67947966, "learning_rate": 1.4046996981332782e-07, "loss": 0.70154178, "num_input_tokens_seen": 158984835, "step": 7348, "time_per_iteration": 2.6382181644439697 }, { "auxiliary_loss_clip": 0.01121563, "auxiliary_loss_mlp": 0.01025226, "balance_loss_clip": 1.0432812, "balance_loss_mlp": 1.01738238, "epoch": 0.8836650033066795, "flos": 24718356322560.0, "grad_norm": 3.3638197075906113, "language_loss": 0.78387737, "learning_rate": 1.4018332970869516e-07, "loss": 0.80534524, "num_input_tokens_seen": 159002775, "step": 7349, "time_per_iteration": 2.6628973484039307 }, { "auxiliary_loss_clip": 0.0113175, "auxiliary_loss_mlp": 0.0102845, "balance_loss_clip": 1.04730129, "balance_loss_mlp": 1.02128553, "epoch": 0.8837852461973186, "flos": 25413371556480.0, "grad_norm": 1.9592578262559959, "language_loss": 0.85252428, "learning_rate": 1.398969717367733e-07, "loss": 0.87412632, "num_input_tokens_seen": 159024100, "step": 7350, "time_per_iteration": 2.6787054538726807 }, { "auxiliary_loss_clip": 0.01103511, "auxiliary_loss_mlp": 0.01023646, "balance_loss_clip": 1.04421079, "balance_loss_mlp": 1.01695871, "epoch": 0.8839054890879576, "flos": 17822574195840.0, "grad_norm": 1.6712928990363478, "language_loss": 0.76214182, "learning_rate": 1.396108959410014e-07, "loss": 0.78341335, "num_input_tokens_seen": 159043315, "step": 7351, "time_per_iteration": 2.642765522003174 }, { "auxiliary_loss_clip": 0.01153434, "auxiliary_loss_mlp": 0.00711512, "balance_loss_clip": 1.05052221, "balance_loss_mlp": 1.00056601, "epoch": 0.8840257319785968, "flos": 23769021818880.0, "grad_norm": 1.7121753368260082, "language_loss": 0.81374669, "learning_rate": 1.3932510236477745e-07, "loss": 0.83239615, "num_input_tokens_seen": 159063985, "step": 7352, "time_per_iteration": 2.6642990112304688 }, { "auxiliary_loss_clip": 0.01150629, "auxiliary_loss_mlp": 0.01028301, "balance_loss_clip": 1.04546189, "balance_loss_mlp": 1.02111912, "epoch": 0.8841459748692359, "flos": 29059776622080.0, "grad_norm": 1.762149329410209, "language_loss": 0.5633868, "learning_rate": 1.3903959105145636e-07, "loss": 0.58517611, "num_input_tokens_seen": 159084475, "step": 7353, "time_per_iteration": 2.6122279167175293 }, { "auxiliary_loss_clip": 0.01168843, "auxiliary_loss_mlp": 0.01025981, "balance_loss_clip": 1.04983366, "balance_loss_mlp": 1.01871824, "epoch": 0.8842662177598749, "flos": 24311523905280.0, "grad_norm": 1.9471998473741414, "language_loss": 0.83210641, "learning_rate": 1.387543620443492e-07, "loss": 0.85405463, "num_input_tokens_seen": 159101320, "step": 7354, "time_per_iteration": 2.615427017211914 }, { "auxiliary_loss_clip": 0.01168646, "auxiliary_loss_mlp": 0.01025841, "balance_loss_clip": 1.05100489, "balance_loss_mlp": 1.01886475, "epoch": 0.8843864606505141, "flos": 25007867942400.0, "grad_norm": 2.409308048222812, "language_loss": 0.8432011, "learning_rate": 1.3846941538672606e-07, "loss": 0.86514598, "num_input_tokens_seen": 159120025, "step": 7355, "time_per_iteration": 2.635319232940674 }, { "auxiliary_loss_clip": 0.01110808, "auxiliary_loss_mlp": 0.01030926, "balance_loss_clip": 1.04590476, "balance_loss_mlp": 1.02356815, "epoch": 0.8845067035411531, "flos": 28183915388160.0, "grad_norm": 2.4869225521708542, "language_loss": 0.81334114, "learning_rate": 1.3818475112181193e-07, "loss": 0.83475852, "num_input_tokens_seen": 159138820, "step": 7356, "time_per_iteration": 2.7162320613861084 }, { "auxiliary_loss_clip": 0.01136036, "auxiliary_loss_mlp": 0.01025147, "balance_loss_clip": 1.04603088, "balance_loss_mlp": 1.01811111, "epoch": 0.8846269464317922, "flos": 12853219311360.0, "grad_norm": 3.858011664494301, "language_loss": 0.79379094, "learning_rate": 1.3790036929279091e-07, "loss": 0.81540275, "num_input_tokens_seen": 159155975, "step": 7357, "time_per_iteration": 2.6063413619995117 }, { "auxiliary_loss_clip": 0.01156109, "auxiliary_loss_mlp": 0.00710997, "balance_loss_clip": 1.05005181, "balance_loss_mlp": 1.00062537, "epoch": 0.8847471893224313, "flos": 18624351628800.0, "grad_norm": 2.772119419256685, "language_loss": 0.59249115, "learning_rate": 1.3761626994280363e-07, "loss": 0.61116219, "num_input_tokens_seen": 159173445, "step": 7358, "time_per_iteration": 4.388470411300659 }, { "auxiliary_loss_clip": 0.01125977, "auxiliary_loss_mlp": 0.01021092, "balance_loss_clip": 1.04551995, "balance_loss_mlp": 1.01444888, "epoch": 0.8848674322130704, "flos": 35769433449600.0, "grad_norm": 1.762981062614717, "language_loss": 0.73569888, "learning_rate": 1.3733245311494735e-07, "loss": 0.7571696, "num_input_tokens_seen": 159196100, "step": 7359, "time_per_iteration": 2.7654712200164795 }, { "auxiliary_loss_clip": 0.0115401, "auxiliary_loss_mlp": 0.01023235, "balance_loss_clip": 1.048491, "balance_loss_mlp": 1.01600766, "epoch": 0.8849876751037095, "flos": 24243760897920.0, "grad_norm": 1.9269473823826289, "language_loss": 0.70811838, "learning_rate": 1.3704891885227676e-07, "loss": 0.72989082, "num_input_tokens_seen": 159216145, "step": 7360, "time_per_iteration": 3.5739943981170654 }, { "auxiliary_loss_clip": 0.01119114, "auxiliary_loss_mlp": 0.01027951, "balance_loss_clip": 1.04160893, "balance_loss_mlp": 1.0204854, "epoch": 0.8851079179943486, "flos": 21500580251520.0, "grad_norm": 2.4962304681481386, "language_loss": 0.77994269, "learning_rate": 1.367656671978037e-07, "loss": 0.80141342, "num_input_tokens_seen": 159233610, "step": 7361, "time_per_iteration": 3.524130344390869 }, { "auxiliary_loss_clip": 0.0114039, "auxiliary_loss_mlp": 0.01023697, "balance_loss_clip": 1.0441432, "balance_loss_mlp": 1.01683331, "epoch": 0.8852281608849877, "flos": 15300711198720.0, "grad_norm": 2.017753274754818, "language_loss": 0.73314381, "learning_rate": 1.36482698194498e-07, "loss": 0.7547847, "num_input_tokens_seen": 159250155, "step": 7362, "time_per_iteration": 2.6390810012817383 }, { "auxiliary_loss_clip": 0.01134526, "auxiliary_loss_mlp": 0.01028614, "balance_loss_clip": 1.04493332, "balance_loss_mlp": 1.0206629, "epoch": 0.8853484037756267, "flos": 23295719283840.0, "grad_norm": 1.948254035862051, "language_loss": 0.72094405, "learning_rate": 1.3620001188528506e-07, "loss": 0.74257547, "num_input_tokens_seen": 159270875, "step": 7363, "time_per_iteration": 2.622931480407715 }, { "auxiliary_loss_clip": 0.01155423, "auxiliary_loss_mlp": 0.01026972, "balance_loss_clip": 1.04705644, "balance_loss_mlp": 1.0193187, "epoch": 0.8854686466662659, "flos": 25114773795840.0, "grad_norm": 2.719256254932358, "language_loss": 0.74285775, "learning_rate": 1.3591760831304865e-07, "loss": 0.7646817, "num_input_tokens_seen": 159288565, "step": 7364, "time_per_iteration": 2.659722089767456 }, { "auxiliary_loss_clip": 0.0116828, "auxiliary_loss_mlp": 0.01025704, "balance_loss_clip": 1.0500257, "balance_loss_mlp": 1.01888847, "epoch": 0.885588889556905, "flos": 21390873137280.0, "grad_norm": 1.9623678211110773, "language_loss": 0.79797328, "learning_rate": 1.356354875206287e-07, "loss": 0.81991309, "num_input_tokens_seen": 159306400, "step": 7365, "time_per_iteration": 2.5655932426452637 }, { "auxiliary_loss_clip": 0.01123141, "auxiliary_loss_mlp": 0.01031696, "balance_loss_clip": 1.04903793, "balance_loss_mlp": 1.02464819, "epoch": 0.885709132447544, "flos": 26906752431360.0, "grad_norm": 1.9320873702008419, "language_loss": 0.69944608, "learning_rate": 1.3535364955082296e-07, "loss": 0.72099447, "num_input_tokens_seen": 159326250, "step": 7366, "time_per_iteration": 2.8840060234069824 }, { "auxiliary_loss_clip": 0.01168205, "auxiliary_loss_mlp": 0.01024558, "balance_loss_clip": 1.0509361, "balance_loss_mlp": 1.01795673, "epoch": 0.8858293753381832, "flos": 26103394800000.0, "grad_norm": 1.9585634811874122, "language_loss": 0.64709026, "learning_rate": 1.3507209444638613e-07, "loss": 0.66901785, "num_input_tokens_seen": 159348250, "step": 7367, "time_per_iteration": 2.600674629211426 }, { "auxiliary_loss_clip": 0.01153729, "auxiliary_loss_mlp": 0.01029108, "balance_loss_clip": 1.04846418, "balance_loss_mlp": 1.02147818, "epoch": 0.8859496182288222, "flos": 23292810282240.0, "grad_norm": 1.8588471088429326, "language_loss": 0.74335921, "learning_rate": 1.347908222500298e-07, "loss": 0.76518756, "num_input_tokens_seen": 159368325, "step": 7368, "time_per_iteration": 2.6898367404937744 }, { "auxiliary_loss_clip": 0.01109756, "auxiliary_loss_mlp": 0.01025771, "balance_loss_clip": 1.04496479, "balance_loss_mlp": 1.01929533, "epoch": 0.8860698611194613, "flos": 16872916469760.0, "grad_norm": 2.134492753739863, "language_loss": 0.69909626, "learning_rate": 1.3450983300442276e-07, "loss": 0.72045153, "num_input_tokens_seen": 159387555, "step": 7369, "time_per_iteration": 2.6501290798187256 }, { "auxiliary_loss_clip": 0.01156396, "auxiliary_loss_mlp": 0.01021922, "balance_loss_clip": 1.04956591, "balance_loss_mlp": 1.01521075, "epoch": 0.8861901040101005, "flos": 24681404206080.0, "grad_norm": 3.987381447718642, "language_loss": 0.73591423, "learning_rate": 1.3422912675219068e-07, "loss": 0.75769734, "num_input_tokens_seen": 159407310, "step": 7370, "time_per_iteration": 2.6410605907440186 }, { "auxiliary_loss_clip": 0.01165405, "auxiliary_loss_mlp": 0.01026397, "balance_loss_clip": 1.05000973, "balance_loss_mlp": 1.01975417, "epoch": 0.8863103469007395, "flos": 24423026699520.0, "grad_norm": 1.8502277483280094, "language_loss": 0.79188967, "learning_rate": 1.339487035359166e-07, "loss": 0.81380779, "num_input_tokens_seen": 159427680, "step": 7371, "time_per_iteration": 2.6153621673583984 }, { "auxiliary_loss_clip": 0.01140667, "auxiliary_loss_mlp": 0.00710417, "balance_loss_clip": 1.04996729, "balance_loss_mlp": 1.000615, "epoch": 0.8864305897913786, "flos": 22053964158720.0, "grad_norm": 2.2114995085173663, "language_loss": 0.84697449, "learning_rate": 1.336685633981409e-07, "loss": 0.86548537, "num_input_tokens_seen": 159448765, "step": 7372, "time_per_iteration": 2.6362497806549072 }, { "auxiliary_loss_clip": 0.01153598, "auxiliary_loss_mlp": 0.01029671, "balance_loss_clip": 1.0465548, "balance_loss_mlp": 1.02230966, "epoch": 0.8865508326820177, "flos": 19099449843840.0, "grad_norm": 1.9660234849915168, "language_loss": 0.75103295, "learning_rate": 1.333887063813597e-07, "loss": 0.77286565, "num_input_tokens_seen": 159466870, "step": 7373, "time_per_iteration": 2.581587791442871 }, { "auxiliary_loss_clip": 0.01137063, "auxiliary_loss_mlp": 0.01021728, "balance_loss_clip": 1.04453969, "balance_loss_mlp": 1.01477861, "epoch": 0.8866710755726568, "flos": 15414189240960.0, "grad_norm": 2.1256151994246624, "language_loss": 0.6642108, "learning_rate": 1.331091325280278e-07, "loss": 0.6857987, "num_input_tokens_seen": 159485840, "step": 7374, "time_per_iteration": 2.6075732707977295 }, { "auxiliary_loss_clip": 0.01097594, "auxiliary_loss_mlp": 0.01023037, "balance_loss_clip": 1.04104221, "balance_loss_mlp": 1.01550007, "epoch": 0.8867913184632958, "flos": 20083689388800.0, "grad_norm": 2.4072341464085354, "language_loss": 0.78483021, "learning_rate": 1.3282984188055625e-07, "loss": 0.80603647, "num_input_tokens_seen": 159505630, "step": 7375, "time_per_iteration": 2.689286231994629 }, { "auxiliary_loss_clip": 0.01168043, "auxiliary_loss_mlp": 0.01024791, "balance_loss_clip": 1.04930806, "balance_loss_mlp": 1.01810908, "epoch": 0.8869115613539349, "flos": 23365852588800.0, "grad_norm": 1.7693179476289451, "language_loss": 0.79616678, "learning_rate": 1.3255083448131288e-07, "loss": 0.81809515, "num_input_tokens_seen": 159524675, "step": 7376, "time_per_iteration": 2.530109405517578 }, { "auxiliary_loss_clip": 0.01154245, "auxiliary_loss_mlp": 0.01022627, "balance_loss_clip": 1.04560828, "balance_loss_mlp": 1.01553988, "epoch": 0.8870318042445741, "flos": 21286840371840.0, "grad_norm": 2.2125253644021368, "language_loss": 0.7885735, "learning_rate": 1.3227211037262365e-07, "loss": 0.81034231, "num_input_tokens_seen": 159541915, "step": 7377, "time_per_iteration": 2.6480326652526855 }, { "auxiliary_loss_clip": 0.01106392, "auxiliary_loss_mlp": 0.01034795, "balance_loss_clip": 1.04226518, "balance_loss_mlp": 1.02770519, "epoch": 0.8871520471352131, "flos": 20010862563840.0, "grad_norm": 3.406498371983035, "language_loss": 0.8533268, "learning_rate": 1.319936695967696e-07, "loss": 0.87473869, "num_input_tokens_seen": 159559740, "step": 7378, "time_per_iteration": 2.6584959030151367 }, { "auxiliary_loss_clip": 0.01175613, "auxiliary_loss_mlp": 0.01027082, "balance_loss_clip": 1.05060768, "balance_loss_mlp": 1.0193783, "epoch": 0.8872722900258522, "flos": 22601422321920.0, "grad_norm": 2.396304025525335, "language_loss": 0.82087445, "learning_rate": 1.3171551219599097e-07, "loss": 0.84290147, "num_input_tokens_seen": 159578265, "step": 7379, "time_per_iteration": 2.583444595336914 }, { "auxiliary_loss_clip": 0.01170855, "auxiliary_loss_mlp": 0.01025807, "balance_loss_clip": 1.05237842, "balance_loss_mlp": 1.01816583, "epoch": 0.8873925329164913, "flos": 22163276223360.0, "grad_norm": 2.3254014191727563, "language_loss": 0.78056061, "learning_rate": 1.3143763821248377e-07, "loss": 0.80252725, "num_input_tokens_seen": 159595350, "step": 7380, "time_per_iteration": 2.5591559410095215 }, { "auxiliary_loss_clip": 0.01165792, "auxiliary_loss_mlp": 0.01028183, "balance_loss_clip": 1.04887319, "balance_loss_mlp": 1.02141809, "epoch": 0.8875127758071304, "flos": 19208223204480.0, "grad_norm": 2.602510328990243, "language_loss": 0.72211301, "learning_rate": 1.3116004768840118e-07, "loss": 0.74405277, "num_input_tokens_seen": 159613725, "step": 7381, "time_per_iteration": 2.556636095046997 }, { "auxiliary_loss_clip": 0.01168981, "auxiliary_loss_mlp": 0.01025076, "balance_loss_clip": 1.04909182, "balance_loss_mlp": 1.01772094, "epoch": 0.8876330186977694, "flos": 18110900666880.0, "grad_norm": 1.6698424154692666, "language_loss": 0.74254298, "learning_rate": 1.3088274066585348e-07, "loss": 0.76448357, "num_input_tokens_seen": 159631335, "step": 7382, "time_per_iteration": 2.546954393386841 }, { "auxiliary_loss_clip": 0.01125196, "auxiliary_loss_mlp": 0.01026723, "balance_loss_clip": 1.04355752, "balance_loss_mlp": 1.019189, "epoch": 0.8877532615884086, "flos": 22009434272640.0, "grad_norm": 3.096297065665467, "language_loss": 0.90714979, "learning_rate": 1.3060571718690749e-07, "loss": 0.92866898, "num_input_tokens_seen": 159648830, "step": 7383, "time_per_iteration": 2.674384593963623 }, { "auxiliary_loss_clip": 0.01035462, "auxiliary_loss_mlp": 0.00701448, "balance_loss_clip": 1.01781738, "balance_loss_mlp": 1.00000942, "epoch": 0.8878735044790477, "flos": 72136924346880.0, "grad_norm": 0.745499578672403, "language_loss": 0.56909889, "learning_rate": 1.3032897729358805e-07, "loss": 0.58646798, "num_input_tokens_seen": 159709785, "step": 7384, "time_per_iteration": 4.093285799026489 }, { "auxiliary_loss_clip": 0.01076717, "auxiliary_loss_mlp": 0.00711747, "balance_loss_clip": 1.03779316, "balance_loss_mlp": 1.00062954, "epoch": 0.8879937473696867, "flos": 27526355061120.0, "grad_norm": 2.944151091975474, "language_loss": 0.80417848, "learning_rate": 1.3005252102787645e-07, "loss": 0.82206309, "num_input_tokens_seen": 159728725, "step": 7385, "time_per_iteration": 2.7890818119049072 }, { "auxiliary_loss_clip": 0.01157076, "auxiliary_loss_mlp": 0.0103046, "balance_loss_clip": 1.04961395, "balance_loss_mlp": 1.02414203, "epoch": 0.8881139902603259, "flos": 22234091886720.0, "grad_norm": 1.6199161971647082, "language_loss": 0.73902667, "learning_rate": 1.297763484317105e-07, "loss": 0.76090205, "num_input_tokens_seen": 159747020, "step": 7386, "time_per_iteration": 3.49564528465271 }, { "auxiliary_loss_clip": 0.01101949, "auxiliary_loss_mlp": 0.00711905, "balance_loss_clip": 1.04099762, "balance_loss_mlp": 1.00068283, "epoch": 0.888234233150965, "flos": 20299548170880.0, "grad_norm": 2.411852451963064, "language_loss": 0.70431495, "learning_rate": 1.2950045954698551e-07, "loss": 0.72245347, "num_input_tokens_seen": 159764855, "step": 7387, "time_per_iteration": 3.6143853664398193 }, { "auxiliary_loss_clip": 0.01115716, "auxiliary_loss_mlp": 0.01026185, "balance_loss_clip": 1.04692411, "balance_loss_mlp": 1.01938105, "epoch": 0.888354476041604, "flos": 18147996437760.0, "grad_norm": 1.5966476037146378, "language_loss": 0.7537356, "learning_rate": 1.2922485441555343e-07, "loss": 0.77515465, "num_input_tokens_seen": 159783935, "step": 7388, "time_per_iteration": 2.674180746078491 }, { "auxiliary_loss_clip": 0.01166543, "auxiliary_loss_mlp": 0.01024927, "balance_loss_clip": 1.0473628, "balance_loss_mlp": 1.01826286, "epoch": 0.8884747189322432, "flos": 22014282608640.0, "grad_norm": 1.927266256066142, "language_loss": 0.81550527, "learning_rate": 1.2894953307922363e-07, "loss": 0.83741999, "num_input_tokens_seen": 159802895, "step": 7389, "time_per_iteration": 2.572307825088501 }, { "auxiliary_loss_clip": 0.01115418, "auxiliary_loss_mlp": 0.01025974, "balance_loss_clip": 1.04518652, "balance_loss_mlp": 1.01891112, "epoch": 0.8885949618228822, "flos": 19786779567360.0, "grad_norm": 1.8898689682346093, "language_loss": 0.83946288, "learning_rate": 1.2867449557976208e-07, "loss": 0.8608768, "num_input_tokens_seen": 159820995, "step": 7390, "time_per_iteration": 2.6314752101898193 }, { "auxiliary_loss_clip": 0.01151451, "auxiliary_loss_mlp": 0.01026397, "balance_loss_clip": 1.04854369, "balance_loss_mlp": 1.01939917, "epoch": 0.8887152047135213, "flos": 20047599198720.0, "grad_norm": 1.957845928238975, "language_loss": 0.75985807, "learning_rate": 1.283997419588916e-07, "loss": 0.7816366, "num_input_tokens_seen": 159840465, "step": 7391, "time_per_iteration": 2.5899553298950195 }, { "auxiliary_loss_clip": 0.01156634, "auxiliary_loss_mlp": 0.01024702, "balance_loss_clip": 1.04819417, "balance_loss_mlp": 1.01724529, "epoch": 0.8888354476041604, "flos": 18588117784320.0, "grad_norm": 2.3910837417272583, "language_loss": 0.6214329, "learning_rate": 1.2812527225829216e-07, "loss": 0.64324629, "num_input_tokens_seen": 159858690, "step": 7392, "time_per_iteration": 2.53962779045105 }, { "auxiliary_loss_clip": 0.01159959, "auxiliary_loss_mlp": 0.01024915, "balance_loss_clip": 1.05078959, "balance_loss_mlp": 1.01782775, "epoch": 0.8889556904947995, "flos": 21689794120320.0, "grad_norm": 2.137106582886329, "language_loss": 0.7684232, "learning_rate": 1.2785108651960052e-07, "loss": 0.790272, "num_input_tokens_seen": 159880325, "step": 7393, "time_per_iteration": 2.716491222381592 }, { "auxiliary_loss_clip": 0.0115479, "auxiliary_loss_mlp": 0.01031308, "balance_loss_clip": 1.0470165, "balance_loss_mlp": 1.02453685, "epoch": 0.8890759333854386, "flos": 27381204201600.0, "grad_norm": 2.1317190609669177, "language_loss": 0.80600619, "learning_rate": 1.2757718478441094e-07, "loss": 0.82786715, "num_input_tokens_seen": 159901070, "step": 7394, "time_per_iteration": 2.626981496810913 }, { "auxiliary_loss_clip": 0.01135657, "auxiliary_loss_mlp": 0.01021778, "balance_loss_clip": 1.04489565, "balance_loss_mlp": 1.01481283, "epoch": 0.8891961762760777, "flos": 24498834353280.0, "grad_norm": 2.1510283535074133, "language_loss": 0.77288985, "learning_rate": 1.2730356709427302e-07, "loss": 0.79446417, "num_input_tokens_seen": 159919750, "step": 7395, "time_per_iteration": 2.7321364879608154 }, { "auxiliary_loss_clip": 0.01153717, "auxiliary_loss_mlp": 0.01031735, "balance_loss_clip": 1.05090117, "balance_loss_mlp": 1.02429605, "epoch": 0.8893164191667168, "flos": 41499770895360.0, "grad_norm": 1.7743714751129123, "language_loss": 0.60040748, "learning_rate": 1.2703023349069542e-07, "loss": 0.622262, "num_input_tokens_seen": 159944600, "step": 7396, "time_per_iteration": 2.7538654804229736 }, { "auxiliary_loss_clip": 0.01147395, "auxiliary_loss_mlp": 0.01028617, "balance_loss_clip": 1.04716945, "balance_loss_mlp": 1.02158642, "epoch": 0.8894366620573558, "flos": 33583623120000.0, "grad_norm": 1.8172618007288879, "language_loss": 0.62002897, "learning_rate": 1.2675718401514223e-07, "loss": 0.64178908, "num_input_tokens_seen": 159968780, "step": 7397, "time_per_iteration": 2.6710212230682373 }, { "auxiliary_loss_clip": 0.01135508, "auxiliary_loss_mlp": 0.01026387, "balance_loss_clip": 1.04621851, "balance_loss_mlp": 1.01888561, "epoch": 0.889556904947995, "flos": 16909832672640.0, "grad_norm": 3.7198028913846293, "language_loss": 0.74675411, "learning_rate": 1.264844187090346e-07, "loss": 0.76837301, "num_input_tokens_seen": 159985905, "step": 7398, "time_per_iteration": 2.599320888519287 }, { "auxiliary_loss_clip": 0.0113027, "auxiliary_loss_mlp": 0.01028811, "balance_loss_clip": 1.04336309, "balance_loss_mlp": 1.02196848, "epoch": 0.889677147838634, "flos": 26030855283840.0, "grad_norm": 1.6939290163147194, "language_loss": 0.75457454, "learning_rate": 1.262119376137516e-07, "loss": 0.77616537, "num_input_tokens_seen": 160006965, "step": 7399, "time_per_iteration": 2.6713743209838867 }, { "auxiliary_loss_clip": 0.01141554, "auxiliary_loss_mlp": 0.01025874, "balance_loss_clip": 1.0437088, "balance_loss_mlp": 1.01867962, "epoch": 0.8897973907292731, "flos": 26468283110400.0, "grad_norm": 1.9357711687172572, "language_loss": 0.85030711, "learning_rate": 1.2593974077062707e-07, "loss": 0.87198138, "num_input_tokens_seen": 160028585, "step": 7400, "time_per_iteration": 2.6623685359954834 }, { "auxiliary_loss_clip": 0.01111339, "auxiliary_loss_mlp": 0.01024577, "balance_loss_clip": 1.0427469, "balance_loss_mlp": 1.01712048, "epoch": 0.8899176336199123, "flos": 26249694894720.0, "grad_norm": 1.9741908782790518, "language_loss": 0.63643098, "learning_rate": 1.2566782822095423e-07, "loss": 0.65779018, "num_input_tokens_seen": 160048840, "step": 7401, "time_per_iteration": 2.726121187210083 }, { "auxiliary_loss_clip": 0.01127816, "auxiliary_loss_mlp": 0.01023383, "balance_loss_clip": 1.04738283, "balance_loss_mlp": 1.01664519, "epoch": 0.8900378765105513, "flos": 20811742156800.0, "grad_norm": 2.2757685718811422, "language_loss": 0.71550053, "learning_rate": 1.2539620000598162e-07, "loss": 0.73701251, "num_input_tokens_seen": 160068175, "step": 7402, "time_per_iteration": 2.70389461517334 }, { "auxiliary_loss_clip": 0.01167307, "auxiliary_loss_mlp": 0.01024685, "balance_loss_clip": 1.04837322, "balance_loss_mlp": 1.01801264, "epoch": 0.8901581194011904, "flos": 16472333018880.0, "grad_norm": 1.8417545212795459, "language_loss": 0.79862976, "learning_rate": 1.2512485616691492e-07, "loss": 0.82054973, "num_input_tokens_seen": 160085230, "step": 7403, "time_per_iteration": 2.533431053161621 }, { "auxiliary_loss_clip": 0.01123901, "auxiliary_loss_mlp": 0.01034286, "balance_loss_clip": 1.04595852, "balance_loss_mlp": 1.02671671, "epoch": 0.8902783622918296, "flos": 35155253773440.0, "grad_norm": 1.6796666729787015, "language_loss": 0.80847132, "learning_rate": 1.2485379674491681e-07, "loss": 0.83005321, "num_input_tokens_seen": 160111425, "step": 7404, "time_per_iteration": 2.8308708667755127 }, { "auxiliary_loss_clip": 0.01140464, "auxiliary_loss_mlp": 0.01025255, "balance_loss_clip": 1.050493, "balance_loss_mlp": 1.01790309, "epoch": 0.8903986051824686, "flos": 17201068145280.0, "grad_norm": 2.2097278732768784, "language_loss": 0.79405737, "learning_rate": 1.2458302178110657e-07, "loss": 0.81571448, "num_input_tokens_seen": 160129790, "step": 7405, "time_per_iteration": 2.6234076023101807 }, { "auxiliary_loss_clip": 0.01112147, "auxiliary_loss_mlp": 0.01022765, "balance_loss_clip": 1.04288721, "balance_loss_mlp": 1.01606846, "epoch": 0.8905188480731077, "flos": 25483863997440.0, "grad_norm": 2.2063579407452876, "language_loss": 0.82591832, "learning_rate": 1.2431253131656118e-07, "loss": 0.84726745, "num_input_tokens_seen": 160149265, "step": 7406, "time_per_iteration": 2.6784605979919434 }, { "auxiliary_loss_clip": 0.011301, "auxiliary_loss_mlp": 0.01024769, "balance_loss_clip": 1.04595804, "balance_loss_mlp": 1.01751161, "epoch": 0.8906390909637467, "flos": 23365888502400.0, "grad_norm": 2.554054057600765, "language_loss": 0.76501542, "learning_rate": 1.240423253923133e-07, "loss": 0.78656411, "num_input_tokens_seen": 160168870, "step": 7407, "time_per_iteration": 2.6067757606506348 }, { "auxiliary_loss_clip": 0.0115331, "auxiliary_loss_mlp": 0.01025134, "balance_loss_clip": 1.04677916, "balance_loss_mlp": 1.01813924, "epoch": 0.8907593338543859, "flos": 21068790860160.0, "grad_norm": 1.996069151280833, "language_loss": 0.693937, "learning_rate": 1.237724040493533e-07, "loss": 0.71572143, "num_input_tokens_seen": 160187495, "step": 7408, "time_per_iteration": 2.6023917198181152 }, { "auxiliary_loss_clip": 0.01172687, "auxiliary_loss_mlp": 0.0102575, "balance_loss_clip": 1.05065346, "balance_loss_mlp": 1.01791501, "epoch": 0.8908795767450249, "flos": 21869562712320.0, "grad_norm": 10.343322569357584, "language_loss": 0.7300998, "learning_rate": 1.2350276732862773e-07, "loss": 0.7520842, "num_input_tokens_seen": 160208520, "step": 7409, "time_per_iteration": 2.5758464336395264 }, { "auxiliary_loss_clip": 0.01059452, "auxiliary_loss_mlp": 0.01003785, "balance_loss_clip": 1.01838756, "balance_loss_mlp": 1.0028317, "epoch": 0.890999819635664, "flos": 66307869348480.0, "grad_norm": 0.8428084167238253, "language_loss": 0.56670427, "learning_rate": 1.2323341527103993e-07, "loss": 0.58733666, "num_input_tokens_seen": 160263720, "step": 7410, "time_per_iteration": 4.080952882766724 }, { "auxiliary_loss_clip": 0.01167705, "auxiliary_loss_mlp": 0.01020592, "balance_loss_clip": 1.04963589, "balance_loss_mlp": 1.01389492, "epoch": 0.8911200625263032, "flos": 26869908055680.0, "grad_norm": 2.038127918402154, "language_loss": 0.84971356, "learning_rate": 1.2296434791745135e-07, "loss": 0.87159657, "num_input_tokens_seen": 160282170, "step": 7411, "time_per_iteration": 2.6411449909210205 }, { "auxiliary_loss_clip": 0.01153683, "auxiliary_loss_mlp": 0.01025469, "balance_loss_clip": 1.04824781, "balance_loss_mlp": 1.01785707, "epoch": 0.8912403054169422, "flos": 20885825957760.0, "grad_norm": 1.6664477371960058, "language_loss": 0.76643306, "learning_rate": 1.2269556530867875e-07, "loss": 0.78822458, "num_input_tokens_seen": 160300725, "step": 7412, "time_per_iteration": 3.484606981277466 }, { "auxiliary_loss_clip": 0.01176354, "auxiliary_loss_mlp": 0.01031443, "balance_loss_clip": 1.05276024, "balance_loss_mlp": 1.02321732, "epoch": 0.8913605483075813, "flos": 27016567286400.0, "grad_norm": 2.0332240129444474, "language_loss": 0.81717211, "learning_rate": 1.2242706748549614e-07, "loss": 0.83925009, "num_input_tokens_seen": 160318720, "step": 7413, "time_per_iteration": 3.5158700942993164 }, { "auxiliary_loss_clip": 0.01133521, "auxiliary_loss_mlp": 0.01021644, "balance_loss_clip": 1.04103231, "balance_loss_mlp": 1.01464605, "epoch": 0.8914807911982204, "flos": 23621500661760.0, "grad_norm": 1.929337034250815, "language_loss": 0.8217693, "learning_rate": 1.2215885448863473e-07, "loss": 0.84332097, "num_input_tokens_seen": 160339595, "step": 7414, "time_per_iteration": 2.7824902534484863 }, { "auxiliary_loss_clip": 0.01136592, "auxiliary_loss_mlp": 0.01026229, "balance_loss_clip": 1.04707932, "balance_loss_mlp": 1.01920712, "epoch": 0.8916010340888595, "flos": 24462277286400.0, "grad_norm": 2.8526285130762252, "language_loss": 0.8012948, "learning_rate": 1.2189092635878152e-07, "loss": 0.82292306, "num_input_tokens_seen": 160361045, "step": 7415, "time_per_iteration": 2.7260947227478027 }, { "auxiliary_loss_clip": 0.011113, "auxiliary_loss_mlp": 0.01024343, "balance_loss_clip": 1.04407358, "balance_loss_mlp": 1.01663566, "epoch": 0.8917212769794985, "flos": 21215773313280.0, "grad_norm": 2.792852671148715, "language_loss": 0.7746138, "learning_rate": 1.216232831365822e-07, "loss": 0.79597026, "num_input_tokens_seen": 160379990, "step": 7416, "time_per_iteration": 2.7020158767700195 }, { "auxiliary_loss_clip": 0.01144814, "auxiliary_loss_mlp": 0.01023779, "balance_loss_clip": 1.04901683, "balance_loss_mlp": 1.01642919, "epoch": 0.8918415198701377, "flos": 25513992529920.0, "grad_norm": 1.9903034166851488, "language_loss": 0.80874801, "learning_rate": 1.2135592486263678e-07, "loss": 0.83043396, "num_input_tokens_seen": 160399240, "step": 7417, "time_per_iteration": 2.6749396324157715 }, { "auxiliary_loss_clip": 0.0113616, "auxiliary_loss_mlp": 0.01022598, "balance_loss_clip": 1.04449093, "balance_loss_mlp": 1.0156126, "epoch": 0.8919617627607768, "flos": 37853006693760.0, "grad_norm": 1.6563375803367297, "language_loss": 0.61293191, "learning_rate": 1.2108885157750415e-07, "loss": 0.63451952, "num_input_tokens_seen": 160421600, "step": 7418, "time_per_iteration": 2.7866733074188232 }, { "auxiliary_loss_clip": 0.01118024, "auxiliary_loss_mlp": 0.00711904, "balance_loss_clip": 1.04571092, "balance_loss_mlp": 1.00059414, "epoch": 0.8920820056514158, "flos": 26213676531840.0, "grad_norm": 1.7428194966335802, "language_loss": 0.80311728, "learning_rate": 1.2082206332169897e-07, "loss": 0.82141662, "num_input_tokens_seen": 160441695, "step": 7419, "time_per_iteration": 2.6692519187927246 }, { "auxiliary_loss_clip": 0.01133872, "auxiliary_loss_mlp": 0.01026341, "balance_loss_clip": 1.04775739, "balance_loss_mlp": 1.01949823, "epoch": 0.892202248542055, "flos": 17383135207680.0, "grad_norm": 2.4781864868759547, "language_loss": 0.73535204, "learning_rate": 1.2055556013569225e-07, "loss": 0.75695419, "num_input_tokens_seen": 160457205, "step": 7420, "time_per_iteration": 2.6102983951568604 }, { "auxiliary_loss_clip": 0.01138667, "auxiliary_loss_mlp": 0.01027552, "balance_loss_clip": 1.04656887, "balance_loss_mlp": 1.02014959, "epoch": 0.892322491432694, "flos": 21324223451520.0, "grad_norm": 1.6265511721482728, "language_loss": 0.82234138, "learning_rate": 1.2028934205991315e-07, "loss": 0.84400356, "num_input_tokens_seen": 160476525, "step": 7421, "time_per_iteration": 2.6237549781799316 }, { "auxiliary_loss_clip": 0.01150919, "auxiliary_loss_mlp": 0.01027479, "balance_loss_clip": 1.04575253, "balance_loss_mlp": 1.02018297, "epoch": 0.8924427343233331, "flos": 24029374573440.0, "grad_norm": 1.4858401769781195, "language_loss": 0.76698101, "learning_rate": 1.2002340913474607e-07, "loss": 0.78876495, "num_input_tokens_seen": 160500160, "step": 7422, "time_per_iteration": 2.7025742530822754 }, { "auxiliary_loss_clip": 0.01171215, "auxiliary_loss_mlp": 0.01029474, "balance_loss_clip": 1.05025733, "balance_loss_mlp": 1.0212549, "epoch": 0.8925629772139723, "flos": 30008069631360.0, "grad_norm": 3.4194203361366493, "language_loss": 0.7424019, "learning_rate": 1.1975776140053317e-07, "loss": 0.76440883, "num_input_tokens_seen": 160520130, "step": 7423, "time_per_iteration": 2.6011672019958496 }, { "auxiliary_loss_clip": 0.01105447, "auxiliary_loss_mlp": 0.01033602, "balance_loss_clip": 1.04355526, "balance_loss_mlp": 1.02565694, "epoch": 0.8926832201046113, "flos": 22601709630720.0, "grad_norm": 2.419565807278921, "language_loss": 0.73316234, "learning_rate": 1.194923988975729e-07, "loss": 0.75455284, "num_input_tokens_seen": 160539730, "step": 7424, "time_per_iteration": 2.688936471939087 }, { "auxiliary_loss_clip": 0.01118324, "auxiliary_loss_mlp": 0.01023403, "balance_loss_clip": 1.04505861, "balance_loss_mlp": 1.01618791, "epoch": 0.8928034629952504, "flos": 13297722117120.0, "grad_norm": 2.3721776915756543, "language_loss": 0.73188889, "learning_rate": 1.192273216661206e-07, "loss": 0.75330615, "num_input_tokens_seen": 160557820, "step": 7425, "time_per_iteration": 2.6253976821899414 }, { "auxiliary_loss_clip": 0.01008492, "auxiliary_loss_mlp": 0.01000961, "balance_loss_clip": 1.01579809, "balance_loss_mlp": 1.00003135, "epoch": 0.8929237058858895, "flos": 54854556744960.0, "grad_norm": 0.7652608099762235, "language_loss": 0.57423782, "learning_rate": 1.189625297463881e-07, "loss": 0.59433234, "num_input_tokens_seen": 160619510, "step": 7426, "time_per_iteration": 3.303170680999756 }, { "auxiliary_loss_clip": 0.01085004, "auxiliary_loss_mlp": 0.01026116, "balance_loss_clip": 1.03921127, "balance_loss_mlp": 1.01935697, "epoch": 0.8930439487765286, "flos": 28883850785280.0, "grad_norm": 1.6535455161096433, "language_loss": 0.80004007, "learning_rate": 1.1869802317854394e-07, "loss": 0.82115132, "num_input_tokens_seen": 160643295, "step": 7427, "time_per_iteration": 3.0402255058288574 }, { "auxiliary_loss_clip": 0.01111309, "auxiliary_loss_mlp": 0.01028218, "balance_loss_clip": 1.04511845, "balance_loss_mlp": 1.02082705, "epoch": 0.8931641916671677, "flos": 22419283432320.0, "grad_norm": 2.0197811961780796, "language_loss": 0.72137666, "learning_rate": 1.1843380200271425e-07, "loss": 0.74277198, "num_input_tokens_seen": 160662495, "step": 7428, "time_per_iteration": 2.710148811340332 }, { "auxiliary_loss_clip": 0.01116582, "auxiliary_loss_mlp": 0.01034132, "balance_loss_clip": 1.04454255, "balance_loss_mlp": 1.02724791, "epoch": 0.8932844345578068, "flos": 25843149786240.0, "grad_norm": 1.9886209353633968, "language_loss": 0.80953515, "learning_rate": 1.181698662589805e-07, "loss": 0.83104229, "num_input_tokens_seen": 160682080, "step": 7429, "time_per_iteration": 2.714693307876587 }, { "auxiliary_loss_clip": 0.01150912, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 1.04582334, "balance_loss_mlp": 1.02163124, "epoch": 0.8934046774484459, "flos": 22925803069440.0, "grad_norm": 1.939670034449611, "language_loss": 0.76056355, "learning_rate": 1.1790621598738249e-07, "loss": 0.78236181, "num_input_tokens_seen": 160700395, "step": 7430, "time_per_iteration": 2.589750051498413 }, { "auxiliary_loss_clip": 0.01166683, "auxiliary_loss_mlp": 0.01023552, "balance_loss_clip": 1.05084133, "balance_loss_mlp": 1.01687884, "epoch": 0.8935249203390849, "flos": 24462097718400.0, "grad_norm": 1.9815283028142505, "language_loss": 0.75082958, "learning_rate": 1.1764285122791461e-07, "loss": 0.7727319, "num_input_tokens_seen": 160721115, "step": 7431, "time_per_iteration": 2.645242691040039 }, { "auxiliary_loss_clip": 0.01151774, "auxiliary_loss_mlp": 0.0102601, "balance_loss_clip": 1.04535329, "balance_loss_mlp": 1.01908135, "epoch": 0.8936451632297241, "flos": 15742735966080.0, "grad_norm": 5.373969977150464, "language_loss": 0.7725296, "learning_rate": 1.173797720205294e-07, "loss": 0.79430747, "num_input_tokens_seen": 160739150, "step": 7432, "time_per_iteration": 2.55655574798584 }, { "auxiliary_loss_clip": 0.01155604, "auxiliary_loss_mlp": 0.01030978, "balance_loss_clip": 1.05021667, "balance_loss_mlp": 1.02307439, "epoch": 0.8937654061203631, "flos": 35115500396160.0, "grad_norm": 2.4772714013154538, "language_loss": 0.71736932, "learning_rate": 1.1711697840513602e-07, "loss": 0.73923516, "num_input_tokens_seen": 160758585, "step": 7433, "time_per_iteration": 2.6910195350646973 }, { "auxiliary_loss_clip": 0.01144465, "auxiliary_loss_mlp": 0.01021845, "balance_loss_clip": 1.04562902, "balance_loss_mlp": 1.01525235, "epoch": 0.8938856490110022, "flos": 16107444708480.0, "grad_norm": 2.4165339814694695, "language_loss": 0.71163613, "learning_rate": 1.1685447042160012e-07, "loss": 0.73329926, "num_input_tokens_seen": 160776620, "step": 7434, "time_per_iteration": 2.56652569770813 }, { "auxiliary_loss_clip": 0.01168762, "auxiliary_loss_mlp": 0.01024684, "balance_loss_clip": 1.04755282, "balance_loss_mlp": 1.01731133, "epoch": 0.8940058919016414, "flos": 20704189858560.0, "grad_norm": 1.5648370697162184, "language_loss": 0.71687257, "learning_rate": 1.1659224810974367e-07, "loss": 0.73880708, "num_input_tokens_seen": 160796580, "step": 7435, "time_per_iteration": 2.583512783050537 }, { "auxiliary_loss_clip": 0.01133758, "auxiliary_loss_mlp": 0.01026888, "balance_loss_clip": 1.047364, "balance_loss_mlp": 1.01946449, "epoch": 0.8941261347922804, "flos": 25229041937280.0, "grad_norm": 1.5967578399471702, "language_loss": 0.68508488, "learning_rate": 1.1633031150934591e-07, "loss": 0.70669138, "num_input_tokens_seen": 160819610, "step": 7436, "time_per_iteration": 3.555567979812622 }, { "auxiliary_loss_clip": 0.01155271, "auxiliary_loss_mlp": 0.01025395, "balance_loss_clip": 1.04913676, "balance_loss_mlp": 1.01747942, "epoch": 0.8942463776829195, "flos": 19537236806400.0, "grad_norm": 1.9512053581081292, "language_loss": 0.79949337, "learning_rate": 1.1606866066014176e-07, "loss": 0.82130003, "num_input_tokens_seen": 160838660, "step": 7437, "time_per_iteration": 3.6774942874908447 }, { "auxiliary_loss_clip": 0.01119255, "auxiliary_loss_mlp": 0.01026408, "balance_loss_clip": 1.04518652, "balance_loss_mlp": 1.01883554, "epoch": 0.8943666205735585, "flos": 22301567585280.0, "grad_norm": 2.1222414952651842, "language_loss": 0.75765669, "learning_rate": 1.1580729560182434e-07, "loss": 0.77911329, "num_input_tokens_seen": 160854515, "step": 7438, "time_per_iteration": 3.5830078125 }, { "auxiliary_loss_clip": 0.0116881, "auxiliary_loss_mlp": 0.00711261, "balance_loss_clip": 1.04989529, "balance_loss_mlp": 1.00061798, "epoch": 0.8944868634641977, "flos": 18912893581440.0, "grad_norm": 2.0038452787194316, "language_loss": 0.70738482, "learning_rate": 1.1554621637404171e-07, "loss": 0.72618556, "num_input_tokens_seen": 160872605, "step": 7439, "time_per_iteration": 3.457120895385742 }, { "auxiliary_loss_clip": 0.01154965, "auxiliary_loss_mlp": 0.01025334, "balance_loss_clip": 1.04855871, "balance_loss_mlp": 1.01831591, "epoch": 0.8946071063548368, "flos": 14460904241280.0, "grad_norm": 2.49227998404097, "language_loss": 0.61314416, "learning_rate": 1.1528542301639999e-07, "loss": 0.63494718, "num_input_tokens_seen": 160889395, "step": 7440, "time_per_iteration": 2.524965763092041 }, { "auxiliary_loss_clip": 0.01120573, "auxiliary_loss_mlp": 0.01024815, "balance_loss_clip": 1.04156208, "balance_loss_mlp": 1.01767969, "epoch": 0.8947273492454758, "flos": 20084084438400.0, "grad_norm": 2.4200020567972302, "language_loss": 0.82424629, "learning_rate": 1.1502491556846105e-07, "loss": 0.8457002, "num_input_tokens_seen": 160907890, "step": 7441, "time_per_iteration": 2.6466870307922363 }, { "auxiliary_loss_clip": 0.01134257, "auxiliary_loss_mlp": 0.0103153, "balance_loss_clip": 1.0457648, "balance_loss_mlp": 1.02463722, "epoch": 0.894847592136115, "flos": 18550555136640.0, "grad_norm": 2.4913658513501256, "language_loss": 0.81435955, "learning_rate": 1.1476469406974331e-07, "loss": 0.83601749, "num_input_tokens_seen": 160923490, "step": 7442, "time_per_iteration": 2.6579625606536865 }, { "auxiliary_loss_clip": 0.01167862, "auxiliary_loss_mlp": 0.01030632, "balance_loss_clip": 1.05046654, "balance_loss_mlp": 1.02367914, "epoch": 0.894967835026754, "flos": 23478468704640.0, "grad_norm": 2.11065615603951, "language_loss": 0.77075434, "learning_rate": 1.1450475855972341e-07, "loss": 0.79273921, "num_input_tokens_seen": 160944280, "step": 7443, "time_per_iteration": 2.6043365001678467 }, { "auxiliary_loss_clip": 0.01136156, "auxiliary_loss_mlp": 0.00711836, "balance_loss_clip": 1.04612339, "balance_loss_mlp": 1.00067234, "epoch": 0.8950880779173931, "flos": 15188310564480.0, "grad_norm": 2.7917454205827723, "language_loss": 0.70784485, "learning_rate": 1.1424510907783158e-07, "loss": 0.72632474, "num_input_tokens_seen": 160961560, "step": 7444, "time_per_iteration": 2.609342575073242 }, { "auxiliary_loss_clip": 0.01139506, "auxiliary_loss_mlp": 0.01023484, "balance_loss_clip": 1.0438807, "balance_loss_mlp": 1.01664412, "epoch": 0.8952083208080323, "flos": 22091957769600.0, "grad_norm": 1.8484782990562558, "language_loss": 0.82819211, "learning_rate": 1.1398574566345787e-07, "loss": 0.84982204, "num_input_tokens_seen": 160982195, "step": 7445, "time_per_iteration": 2.662954092025757 }, { "auxiliary_loss_clip": 0.01141169, "auxiliary_loss_mlp": 0.01028225, "balance_loss_clip": 1.04479086, "balance_loss_mlp": 1.02044678, "epoch": 0.8953285636986713, "flos": 23254026572160.0, "grad_norm": 2.807460796468658, "language_loss": 0.82423246, "learning_rate": 1.1372666835594702e-07, "loss": 0.84592646, "num_input_tokens_seen": 161000520, "step": 7446, "time_per_iteration": 2.616976022720337 }, { "auxiliary_loss_clip": 0.01136497, "auxiliary_loss_mlp": 0.01025103, "balance_loss_clip": 1.04775739, "balance_loss_mlp": 1.01806688, "epoch": 0.8954488065893104, "flos": 16362661818240.0, "grad_norm": 2.0936285315933807, "language_loss": 0.71865666, "learning_rate": 1.1346787719460071e-07, "loss": 0.74027264, "num_input_tokens_seen": 161019405, "step": 7447, "time_per_iteration": 2.631706953048706 }, { "auxiliary_loss_clip": 0.01134345, "auxiliary_loss_mlp": 0.01026877, "balance_loss_clip": 1.04709423, "balance_loss_mlp": 1.01938498, "epoch": 0.8955690494799495, "flos": 18257883120000.0, "grad_norm": 1.9931929216679056, "language_loss": 0.72369981, "learning_rate": 1.1320937221867732e-07, "loss": 0.74531209, "num_input_tokens_seen": 161036985, "step": 7448, "time_per_iteration": 2.604461669921875 }, { "auxiliary_loss_clip": 0.01134044, "auxiliary_loss_mlp": 0.0102121, "balance_loss_clip": 1.04379535, "balance_loss_mlp": 1.01504695, "epoch": 0.8956892923705886, "flos": 25447486498560.0, "grad_norm": 1.990242785270515, "language_loss": 0.79349613, "learning_rate": 1.1295115346739192e-07, "loss": 0.81504864, "num_input_tokens_seen": 161056985, "step": 7449, "time_per_iteration": 2.6617023944854736 }, { "auxiliary_loss_clip": 0.0113904, "auxiliary_loss_mlp": 0.01027708, "balance_loss_clip": 1.0465529, "balance_loss_mlp": 1.02004313, "epoch": 0.8958095352612276, "flos": 52661883939840.0, "grad_norm": 4.4358778160185794, "language_loss": 0.73325908, "learning_rate": 1.1269322097991629e-07, "loss": 0.7549265, "num_input_tokens_seen": 161080270, "step": 7450, "time_per_iteration": 2.953826665878296 }, { "auxiliary_loss_clip": 0.01159135, "auxiliary_loss_mlp": 0.01028071, "balance_loss_clip": 1.05103302, "balance_loss_mlp": 1.02063787, "epoch": 0.8959297781518668, "flos": 23186335392000.0, "grad_norm": 3.8225418902302737, "language_loss": 0.67748237, "learning_rate": 1.1243557479537846e-07, "loss": 0.69935441, "num_input_tokens_seen": 161100160, "step": 7451, "time_per_iteration": 2.609741687774658 }, { "auxiliary_loss_clip": 0.01169385, "auxiliary_loss_mlp": 0.01023049, "balance_loss_clip": 1.04941297, "balance_loss_mlp": 1.01537776, "epoch": 0.8960500210425059, "flos": 20334309557760.0, "grad_norm": 2.546345926844086, "language_loss": 0.68992031, "learning_rate": 1.121782149528634e-07, "loss": 0.71184468, "num_input_tokens_seen": 161117260, "step": 7452, "time_per_iteration": 2.5719265937805176 }, { "auxiliary_loss_clip": 0.01143063, "auxiliary_loss_mlp": 0.01029735, "balance_loss_clip": 1.04944777, "balance_loss_mlp": 1.02259457, "epoch": 0.8961702639331449, "flos": 19901694153600.0, "grad_norm": 3.7216217841591956, "language_loss": 0.79076779, "learning_rate": 1.1192114149141208e-07, "loss": 0.81249583, "num_input_tokens_seen": 161136895, "step": 7453, "time_per_iteration": 2.605034112930298 }, { "auxiliary_loss_clip": 0.01140475, "auxiliary_loss_mlp": 0.01026929, "balance_loss_clip": 1.04518843, "balance_loss_mlp": 1.01901996, "epoch": 0.8962905068237841, "flos": 12896348567040.0, "grad_norm": 2.2506788236008135, "language_loss": 0.6566819, "learning_rate": 1.1166435445002197e-07, "loss": 0.67835593, "num_input_tokens_seen": 161154565, "step": 7454, "time_per_iteration": 2.6319525241851807 }, { "auxiliary_loss_clip": 0.0115394, "auxiliary_loss_mlp": 0.01027216, "balance_loss_clip": 1.04707778, "balance_loss_mlp": 1.02043891, "epoch": 0.8964107497144231, "flos": 23440331439360.0, "grad_norm": 2.9661856446834873, "language_loss": 0.68757617, "learning_rate": 1.1140785386764818e-07, "loss": 0.70938766, "num_input_tokens_seen": 161173265, "step": 7455, "time_per_iteration": 2.5946226119995117 }, { "auxiliary_loss_clip": 0.01146638, "auxiliary_loss_mlp": 0.01026633, "balance_loss_clip": 1.04724979, "balance_loss_mlp": 1.0191946, "epoch": 0.8965309926050622, "flos": 19500176949120.0, "grad_norm": 1.902074014404272, "language_loss": 0.69275403, "learning_rate": 1.1115163978320153e-07, "loss": 0.71448672, "num_input_tokens_seen": 161191995, "step": 7456, "time_per_iteration": 2.6084225177764893 }, { "auxiliary_loss_clip": 0.01157857, "auxiliary_loss_mlp": 0.00711911, "balance_loss_clip": 1.04858518, "balance_loss_mlp": 1.00051439, "epoch": 0.8966512354957014, "flos": 28658008022400.0, "grad_norm": 2.151447559921144, "language_loss": 0.82525063, "learning_rate": 1.1089571223554917e-07, "loss": 0.8439483, "num_input_tokens_seen": 161212880, "step": 7457, "time_per_iteration": 2.673137664794922 }, { "auxiliary_loss_clip": 0.01153276, "auxiliary_loss_mlp": 0.01023249, "balance_loss_clip": 1.04500806, "balance_loss_mlp": 1.01656735, "epoch": 0.8967714783863404, "flos": 23370916406400.0, "grad_norm": 1.6957352709046412, "language_loss": 0.85214031, "learning_rate": 1.1064007126351537e-07, "loss": 0.8739056, "num_input_tokens_seen": 161233595, "step": 7458, "time_per_iteration": 2.645094394683838 }, { "auxiliary_loss_clip": 0.0113405, "auxiliary_loss_mlp": 0.01030112, "balance_loss_clip": 1.04682374, "balance_loss_mlp": 1.02273285, "epoch": 0.8968917212769795, "flos": 24535175938560.0, "grad_norm": 2.1527738480958662, "language_loss": 0.76316321, "learning_rate": 1.1038471690588003e-07, "loss": 0.78480482, "num_input_tokens_seen": 161252740, "step": 7459, "time_per_iteration": 2.627915382385254 }, { "auxiliary_loss_clip": 0.01106029, "auxiliary_loss_mlp": 0.01028913, "balance_loss_clip": 1.04405499, "balance_loss_mlp": 1.02206779, "epoch": 0.8970119641676186, "flos": 23475416048640.0, "grad_norm": 6.775584687191706, "language_loss": 0.80138117, "learning_rate": 1.1012964920138145e-07, "loss": 0.82273066, "num_input_tokens_seen": 161272325, "step": 7460, "time_per_iteration": 2.7404263019561768 }, { "auxiliary_loss_clip": 0.0112931, "auxiliary_loss_mlp": 0.01021703, "balance_loss_clip": 1.04190135, "balance_loss_mlp": 1.01449656, "epoch": 0.8971322070582577, "flos": 24538192680960.0, "grad_norm": 1.71964218205998, "language_loss": 0.7589705, "learning_rate": 1.0987486818871205e-07, "loss": 0.78048056, "num_input_tokens_seen": 161295915, "step": 7461, "time_per_iteration": 2.6820976734161377 }, { "auxiliary_loss_clip": 0.01154467, "auxiliary_loss_mlp": 0.00711616, "balance_loss_clip": 1.04930615, "balance_loss_mlp": 1.00066662, "epoch": 0.8972524499488967, "flos": 21797454159360.0, "grad_norm": 2.5118302189528503, "language_loss": 0.73510468, "learning_rate": 1.0962037390652245e-07, "loss": 0.75376546, "num_input_tokens_seen": 161314935, "step": 7462, "time_per_iteration": 4.421561241149902 }, { "auxiliary_loss_clip": 0.01137916, "auxiliary_loss_mlp": 0.01024721, "balance_loss_clip": 1.04729736, "balance_loss_mlp": 1.01672792, "epoch": 0.8973726928395359, "flos": 21726243446400.0, "grad_norm": 1.9559638341564862, "language_loss": 0.72111166, "learning_rate": 1.0936616639341911e-07, "loss": 0.74273801, "num_input_tokens_seen": 161335225, "step": 7463, "time_per_iteration": 2.662325620651245 }, { "auxiliary_loss_clip": 0.01051903, "auxiliary_loss_mlp": 0.0100222, "balance_loss_clip": 1.01839852, "balance_loss_mlp": 1.00136745, "epoch": 0.897492935730175, "flos": 53837100097920.0, "grad_norm": 0.7385553512402745, "language_loss": 0.54709637, "learning_rate": 1.0911224568796473e-07, "loss": 0.56763756, "num_input_tokens_seen": 161393420, "step": 7464, "time_per_iteration": 4.1167402267456055 }, { "auxiliary_loss_clip": 0.01150879, "auxiliary_loss_mlp": 0.01027772, "balance_loss_clip": 1.04814708, "balance_loss_mlp": 1.02052128, "epoch": 0.897613178620814, "flos": 18290346036480.0, "grad_norm": 2.1057752501482527, "language_loss": 0.71270525, "learning_rate": 1.0885861182867984e-07, "loss": 0.73449177, "num_input_tokens_seen": 161411525, "step": 7465, "time_per_iteration": 3.5084245204925537 }, { "auxiliary_loss_clip": 0.01139546, "auxiliary_loss_mlp": 0.01026761, "balance_loss_clip": 1.04737878, "balance_loss_mlp": 1.01979291, "epoch": 0.8977334215114532, "flos": 32993718059520.0, "grad_norm": 1.869571835626312, "language_loss": 0.70903277, "learning_rate": 1.0860526485403942e-07, "loss": 0.73069584, "num_input_tokens_seen": 161432800, "step": 7466, "time_per_iteration": 2.71919322013855 }, { "auxiliary_loss_clip": 0.01169505, "auxiliary_loss_mlp": 0.01027388, "balance_loss_clip": 1.05072737, "balance_loss_mlp": 1.01992893, "epoch": 0.8978536644020922, "flos": 15195636938880.0, "grad_norm": 1.5929698239205714, "language_loss": 0.77127171, "learning_rate": 1.0835220480247675e-07, "loss": 0.79324061, "num_input_tokens_seen": 161451295, "step": 7467, "time_per_iteration": 2.5573527812957764 }, { "auxiliary_loss_clip": 0.01136128, "auxiliary_loss_mlp": 0.01027608, "balance_loss_clip": 1.0466311, "balance_loss_mlp": 1.0205183, "epoch": 0.8979739072927313, "flos": 18004389863040.0, "grad_norm": 2.3503336549912492, "language_loss": 0.83920789, "learning_rate": 1.0809943171238067e-07, "loss": 0.86084527, "num_input_tokens_seen": 161469220, "step": 7468, "time_per_iteration": 2.6292002201080322 }, { "auxiliary_loss_clip": 0.01148822, "auxiliary_loss_mlp": 0.01024947, "balance_loss_clip": 1.05012417, "balance_loss_mlp": 1.01692414, "epoch": 0.8980941501833704, "flos": 22271546793600.0, "grad_norm": 2.5961130685312805, "language_loss": 0.62936008, "learning_rate": 1.078469456220965e-07, "loss": 0.65109771, "num_input_tokens_seen": 161489375, "step": 7469, "time_per_iteration": 2.641465425491333 }, { "auxiliary_loss_clip": 0.01153758, "auxiliary_loss_mlp": 0.01021631, "balance_loss_clip": 1.04714489, "balance_loss_mlp": 1.01399589, "epoch": 0.8982143930740095, "flos": 37560729726720.0, "grad_norm": 2.0059245676655033, "language_loss": 0.7001577, "learning_rate": 1.0759474656992606e-07, "loss": 0.72191161, "num_input_tokens_seen": 161512145, "step": 7470, "time_per_iteration": 2.7203314304351807 }, { "auxiliary_loss_clip": 0.01140409, "auxiliary_loss_mlp": 0.01021298, "balance_loss_clip": 1.04510307, "balance_loss_mlp": 1.01409221, "epoch": 0.8983346359646486, "flos": 18076893465600.0, "grad_norm": 2.4358700333931487, "language_loss": 0.78120762, "learning_rate": 1.0734283459412785e-07, "loss": 0.80282474, "num_input_tokens_seen": 161528995, "step": 7471, "time_per_iteration": 2.6152267456054688 }, { "auxiliary_loss_clip": 0.01109083, "auxiliary_loss_mlp": 0.01023935, "balance_loss_clip": 1.04266763, "balance_loss_mlp": 1.01667261, "epoch": 0.8984548788552876, "flos": 20558895344640.0, "grad_norm": 1.875096795475544, "language_loss": 0.80566823, "learning_rate": 1.0709120973291707e-07, "loss": 0.82699847, "num_input_tokens_seen": 161548775, "step": 7472, "time_per_iteration": 2.674539089202881 }, { "auxiliary_loss_clip": 0.01170016, "auxiliary_loss_mlp": 0.01025427, "balance_loss_clip": 1.05036759, "balance_loss_mlp": 1.01813149, "epoch": 0.8985751217459268, "flos": 17785442511360.0, "grad_norm": 2.625660396172695, "language_loss": 0.7788443, "learning_rate": 1.0683987202446475e-07, "loss": 0.80079877, "num_input_tokens_seen": 161566960, "step": 7473, "time_per_iteration": 2.5786163806915283 }, { "auxiliary_loss_clip": 0.01157392, "auxiliary_loss_mlp": 0.01027799, "balance_loss_clip": 1.04868281, "balance_loss_mlp": 1.02064013, "epoch": 0.8986953646365659, "flos": 21617003208960.0, "grad_norm": 2.4129266949146033, "language_loss": 0.69965696, "learning_rate": 1.0658882150689862e-07, "loss": 0.72150886, "num_input_tokens_seen": 161585820, "step": 7474, "time_per_iteration": 2.6056394577026367 }, { "auxiliary_loss_clip": 0.01123532, "auxiliary_loss_mlp": 0.01025914, "balance_loss_clip": 1.04453254, "balance_loss_mlp": 1.018767, "epoch": 0.8988156075272049, "flos": 14027355083520.0, "grad_norm": 2.587557102862872, "language_loss": 0.78016651, "learning_rate": 1.0633805821830288e-07, "loss": 0.80166101, "num_input_tokens_seen": 161602505, "step": 7475, "time_per_iteration": 2.6414289474487305 }, { "auxiliary_loss_clip": 0.01138772, "auxiliary_loss_mlp": 0.01026106, "balance_loss_clip": 1.04768825, "balance_loss_mlp": 1.01855421, "epoch": 0.8989358504178441, "flos": 29059202004480.0, "grad_norm": 2.251033075411816, "language_loss": 0.82942557, "learning_rate": 1.0608758219671753e-07, "loss": 0.85107434, "num_input_tokens_seen": 161621545, "step": 7476, "time_per_iteration": 2.6758482456207275 }, { "auxiliary_loss_clip": 0.01142172, "auxiliary_loss_mlp": 0.01026451, "balance_loss_clip": 1.04736376, "balance_loss_mlp": 1.01953399, "epoch": 0.8990560933084831, "flos": 20230420446720.0, "grad_norm": 1.5836273826331102, "language_loss": 0.70878679, "learning_rate": 1.0583739348014065e-07, "loss": 0.73047304, "num_input_tokens_seen": 161642630, "step": 7477, "time_per_iteration": 2.68388295173645 }, { "auxiliary_loss_clip": 0.0117123, "auxiliary_loss_mlp": 0.01024932, "balance_loss_clip": 1.05188274, "balance_loss_mlp": 1.01830959, "epoch": 0.8991763361991222, "flos": 25520672459520.0, "grad_norm": 1.8314233828706972, "language_loss": 0.8473351, "learning_rate": 1.0558749210652518e-07, "loss": 0.86929679, "num_input_tokens_seen": 161662560, "step": 7478, "time_per_iteration": 2.5783755779266357 }, { "auxiliary_loss_clip": 0.01125046, "auxiliary_loss_mlp": 0.01025409, "balance_loss_clip": 1.04548979, "balance_loss_mlp": 1.01823282, "epoch": 0.8992965790897613, "flos": 25119191168640.0, "grad_norm": 1.8386140903991695, "language_loss": 0.85586768, "learning_rate": 1.053378781137808e-07, "loss": 0.87737226, "num_input_tokens_seen": 161683480, "step": 7479, "time_per_iteration": 2.708681583404541 }, { "auxiliary_loss_clip": 0.01142005, "auxiliary_loss_mlp": 0.01028026, "balance_loss_clip": 1.04817295, "balance_loss_mlp": 1.02055109, "epoch": 0.8994168219804004, "flos": 16070815814400.0, "grad_norm": 2.060820821407257, "language_loss": 0.77856523, "learning_rate": 1.0508855153977392e-07, "loss": 0.80026555, "num_input_tokens_seen": 161699945, "step": 7480, "time_per_iteration": 2.593778371810913 }, { "auxiliary_loss_clip": 0.0115232, "auxiliary_loss_mlp": 0.01021165, "balance_loss_clip": 1.04599929, "balance_loss_mlp": 1.01390529, "epoch": 0.8995370648710395, "flos": 24825764966400.0, "grad_norm": 2.7403032993656424, "language_loss": 0.67381454, "learning_rate": 1.0483951242232669e-07, "loss": 0.69554943, "num_input_tokens_seen": 161720420, "step": 7481, "time_per_iteration": 2.630824089050293 }, { "auxiliary_loss_clip": 0.01070879, "auxiliary_loss_mlp": 0.01001796, "balance_loss_clip": 1.019081, "balance_loss_mlp": 1.00084198, "epoch": 0.8996573077616786, "flos": 63116238378240.0, "grad_norm": 0.9712611083571902, "language_loss": 0.57617867, "learning_rate": 1.0459076079921936e-07, "loss": 0.59690535, "num_input_tokens_seen": 161773080, "step": 7482, "time_per_iteration": 3.175361156463623 }, { "auxiliary_loss_clip": 0.01131786, "auxiliary_loss_mlp": 0.01027665, "balance_loss_clip": 1.04661465, "balance_loss_mlp": 1.02080131, "epoch": 0.8997775506523177, "flos": 18219674027520.0, "grad_norm": 2.5849389029798764, "language_loss": 0.84979147, "learning_rate": 1.0434229670818618e-07, "loss": 0.87138593, "num_input_tokens_seen": 161789755, "step": 7483, "time_per_iteration": 2.6495795249938965 }, { "auxiliary_loss_clip": 0.01129723, "auxiliary_loss_mlp": 0.01024896, "balance_loss_clip": 1.04532957, "balance_loss_mlp": 1.01813984, "epoch": 0.8998977935429567, "flos": 24166768095360.0, "grad_norm": 1.5812550579701765, "language_loss": 0.80058444, "learning_rate": 1.0409412018691944e-07, "loss": 0.82213068, "num_input_tokens_seen": 161810220, "step": 7484, "time_per_iteration": 2.6404614448547363 }, { "auxiliary_loss_clip": 0.01138009, "auxiliary_loss_mlp": 0.01033207, "balance_loss_clip": 1.04933143, "balance_loss_mlp": 1.02576816, "epoch": 0.9000180364335959, "flos": 20773030273920.0, "grad_norm": 2.2908827372760654, "language_loss": 0.75036931, "learning_rate": 1.0384623127306724e-07, "loss": 0.77208149, "num_input_tokens_seen": 161827565, "step": 7485, "time_per_iteration": 2.645352602005005 }, { "auxiliary_loss_clip": 0.01118266, "auxiliary_loss_mlp": 0.01027285, "balance_loss_clip": 1.04389203, "balance_loss_mlp": 1.0204457, "epoch": 0.900138279324235, "flos": 19205745166080.0, "grad_norm": 2.5991839281498232, "language_loss": 0.79472178, "learning_rate": 1.0359863000423397e-07, "loss": 0.81617725, "num_input_tokens_seen": 161845700, "step": 7486, "time_per_iteration": 2.676548719406128 }, { "auxiliary_loss_clip": 0.01169394, "auxiliary_loss_mlp": 0.01024915, "balance_loss_clip": 1.04920065, "balance_loss_mlp": 1.01741111, "epoch": 0.900258522214874, "flos": 28731158069760.0, "grad_norm": 1.629698838968122, "language_loss": 0.719396, "learning_rate": 1.0335131641798112e-07, "loss": 0.74133909, "num_input_tokens_seen": 161867660, "step": 7487, "time_per_iteration": 3.5508980751037598 }, { "auxiliary_loss_clip": 0.01043635, "auxiliary_loss_mlp": 0.0100262, "balance_loss_clip": 1.01745749, "balance_loss_mlp": 1.00178003, "epoch": 0.9003787651055132, "flos": 58280685655680.0, "grad_norm": 0.8118542820047032, "language_loss": 0.55615926, "learning_rate": 1.0310429055182512e-07, "loss": 0.57662189, "num_input_tokens_seen": 161921980, "step": 7488, "time_per_iteration": 3.946664571762085 }, { "auxiliary_loss_clip": 0.01122303, "auxiliary_loss_mlp": 0.0102734, "balance_loss_clip": 1.04465556, "balance_loss_mlp": 1.01951957, "epoch": 0.9004990079961522, "flos": 25556475340800.0, "grad_norm": 1.750100007865822, "language_loss": 0.74266231, "learning_rate": 1.0285755244324024e-07, "loss": 0.76415879, "num_input_tokens_seen": 161942725, "step": 7489, "time_per_iteration": 2.7289721965789795 }, { "auxiliary_loss_clip": 0.01139132, "auxiliary_loss_mlp": 0.00711258, "balance_loss_clip": 1.04466856, "balance_loss_mlp": 1.00057006, "epoch": 0.9006192508867913, "flos": 23335185352320.0, "grad_norm": 2.5359716921315614, "language_loss": 0.68731529, "learning_rate": 1.0261110212965629e-07, "loss": 0.70581919, "num_input_tokens_seen": 161964520, "step": 7490, "time_per_iteration": 2.6897284984588623 }, { "auxiliary_loss_clip": 0.01138185, "auxiliary_loss_mlp": 0.010286, "balance_loss_clip": 1.04688728, "balance_loss_mlp": 1.02186179, "epoch": 0.9007394937774305, "flos": 18040300485120.0, "grad_norm": 2.787789081883033, "language_loss": 0.79535335, "learning_rate": 1.023649396484596e-07, "loss": 0.81702125, "num_input_tokens_seen": 161983575, "step": 7491, "time_per_iteration": 4.369969367980957 }, { "auxiliary_loss_clip": 0.01168158, "auxiliary_loss_mlp": 0.01025938, "balance_loss_clip": 1.04850507, "balance_loss_mlp": 1.01857114, "epoch": 0.9008597366680695, "flos": 43068456633600.0, "grad_norm": 5.399117046381112, "language_loss": 0.67767954, "learning_rate": 1.0211906503699275e-07, "loss": 0.69962049, "num_input_tokens_seen": 162006550, "step": 7492, "time_per_iteration": 2.76045298576355 }, { "auxiliary_loss_clip": 0.01158435, "auxiliary_loss_mlp": 0.01032498, "balance_loss_clip": 1.05221701, "balance_loss_mlp": 1.02485371, "epoch": 0.9009799795587086, "flos": 14939055112320.0, "grad_norm": 2.7282088818114962, "language_loss": 0.82188213, "learning_rate": 1.0187347833255455e-07, "loss": 0.84379148, "num_input_tokens_seen": 162022455, "step": 7493, "time_per_iteration": 2.5529282093048096 }, { "auxiliary_loss_clip": 0.01168013, "auxiliary_loss_mlp": 0.01026591, "balance_loss_clip": 1.04998744, "balance_loss_mlp": 1.01946199, "epoch": 0.9011002224493477, "flos": 21579584215680.0, "grad_norm": 2.156426402159807, "language_loss": 0.79480791, "learning_rate": 1.0162817957240056e-07, "loss": 0.81675398, "num_input_tokens_seen": 162042350, "step": 7494, "time_per_iteration": 2.625929832458496 }, { "auxiliary_loss_clip": 0.0106112, "auxiliary_loss_mlp": 0.01003743, "balance_loss_clip": 1.01971447, "balance_loss_mlp": 1.00262213, "epoch": 0.9012204653399868, "flos": 71166367883520.0, "grad_norm": 0.8903542689496473, "language_loss": 0.62991607, "learning_rate": 1.0138316879374253e-07, "loss": 0.65056467, "num_input_tokens_seen": 162111640, "step": 7495, "time_per_iteration": 3.3181800842285156 }, { "auxiliary_loss_clip": 0.01142577, "auxiliary_loss_mlp": 0.01024843, "balance_loss_clip": 1.05169082, "balance_loss_mlp": 1.01771998, "epoch": 0.9013407082306258, "flos": 15594963413760.0, "grad_norm": 2.276967105036687, "language_loss": 0.74139321, "learning_rate": 1.0113844603374833e-07, "loss": 0.76306748, "num_input_tokens_seen": 162128165, "step": 7496, "time_per_iteration": 2.663908004760742 }, { "auxiliary_loss_clip": 0.01135129, "auxiliary_loss_mlp": 0.01029594, "balance_loss_clip": 1.04441011, "balance_loss_mlp": 1.02150536, "epoch": 0.901460951121265, "flos": 15049157276160.0, "grad_norm": 3.3192871636799075, "language_loss": 0.72314304, "learning_rate": 1.0089401132954178e-07, "loss": 0.74479026, "num_input_tokens_seen": 162146145, "step": 7497, "time_per_iteration": 2.591539144515991 }, { "auxiliary_loss_clip": 0.01138096, "auxiliary_loss_mlp": 0.01028477, "balance_loss_clip": 1.04796147, "balance_loss_mlp": 1.02150333, "epoch": 0.9015811940119041, "flos": 22236857233920.0, "grad_norm": 1.721345949556541, "language_loss": 0.72612381, "learning_rate": 1.006498647182037e-07, "loss": 0.74778956, "num_input_tokens_seen": 162164800, "step": 7498, "time_per_iteration": 2.7085120677948 }, { "auxiliary_loss_clip": 0.01087003, "auxiliary_loss_mlp": 0.01030643, "balance_loss_clip": 1.04078507, "balance_loss_mlp": 1.02310324, "epoch": 0.9017014369025431, "flos": 24973824827520.0, "grad_norm": 2.484562954101366, "language_loss": 0.71599871, "learning_rate": 1.004060062367713e-07, "loss": 0.73717523, "num_input_tokens_seen": 162185895, "step": 7499, "time_per_iteration": 2.7604146003723145 }, { "auxiliary_loss_clip": 0.01154556, "auxiliary_loss_mlp": 0.01028383, "balance_loss_clip": 1.04797339, "balance_loss_mlp": 1.02129304, "epoch": 0.9018216797931822, "flos": 18114168804480.0, "grad_norm": 3.038318917874432, "language_loss": 0.69703048, "learning_rate": 1.0016243592223728e-07, "loss": 0.71885991, "num_input_tokens_seen": 162206295, "step": 7500, "time_per_iteration": 2.628399610519409 }, { "auxiliary_loss_clip": 0.01086318, "auxiliary_loss_mlp": 0.01026768, "balance_loss_clip": 1.04309857, "balance_loss_mlp": 1.01987457, "epoch": 0.9019419226838213, "flos": 37268452759680.0, "grad_norm": 2.1099976686019835, "language_loss": 0.65015793, "learning_rate": 9.991915381155114e-08, "loss": 0.67128879, "num_input_tokens_seen": 162229275, "step": 7501, "time_per_iteration": 2.8969345092773438 }, { "auxiliary_loss_clip": 0.01156168, "auxiliary_loss_mlp": 0.01031735, "balance_loss_clip": 1.04803157, "balance_loss_mlp": 1.02450752, "epoch": 0.9020621655744604, "flos": 23441121538560.0, "grad_norm": 2.1171112302075703, "language_loss": 0.75273454, "learning_rate": 9.967615994161871e-08, "loss": 0.7746135, "num_input_tokens_seen": 162248935, "step": 7502, "time_per_iteration": 2.6376047134399414 }, { "auxiliary_loss_clip": 0.01167149, "auxiliary_loss_mlp": 0.01026185, "balance_loss_clip": 1.0486865, "balance_loss_mlp": 1.01909816, "epoch": 0.9021824084650995, "flos": 22857465444480.0, "grad_norm": 1.9777669427300177, "language_loss": 0.78221947, "learning_rate": 9.943345434930161e-08, "loss": 0.80415279, "num_input_tokens_seen": 162269185, "step": 7503, "time_per_iteration": 2.614436626434326 }, { "auxiliary_loss_clip": 0.01120909, "auxiliary_loss_mlp": 0.01032781, "balance_loss_clip": 1.04603767, "balance_loss_mlp": 1.02566123, "epoch": 0.9023026513557386, "flos": 22127581082880.0, "grad_norm": 2.4059526113108, "language_loss": 0.69303197, "learning_rate": 9.919103707141885e-08, "loss": 0.71456885, "num_input_tokens_seen": 162288065, "step": 7504, "time_per_iteration": 2.648963212966919 }, { "auxiliary_loss_clip": 0.01153895, "auxiliary_loss_mlp": 0.0102914, "balance_loss_clip": 1.04879653, "balance_loss_mlp": 1.02156734, "epoch": 0.9024228942463777, "flos": 24199087357440.0, "grad_norm": 2.2695967560670924, "language_loss": 0.76926845, "learning_rate": 9.89489081447441e-08, "loss": 0.79109883, "num_input_tokens_seen": 162305265, "step": 7505, "time_per_iteration": 2.6280195713043213 }, { "auxiliary_loss_clip": 0.01135676, "auxiliary_loss_mlp": 0.01027382, "balance_loss_clip": 1.04466009, "balance_loss_mlp": 1.01964545, "epoch": 0.9025431371370167, "flos": 25008262992000.0, "grad_norm": 1.9100972453251819, "language_loss": 0.83617651, "learning_rate": 9.870706760600844e-08, "loss": 0.85780716, "num_input_tokens_seen": 162325215, "step": 7506, "time_per_iteration": 2.6359262466430664 }, { "auxiliary_loss_clip": 0.01114464, "auxiliary_loss_mlp": 0.01029275, "balance_loss_clip": 1.0498457, "balance_loss_mlp": 1.02200603, "epoch": 0.9026633800276559, "flos": 18952862440320.0, "grad_norm": 1.9893566257133355, "language_loss": 0.72745585, "learning_rate": 9.846551549189918e-08, "loss": 0.74889326, "num_input_tokens_seen": 162344820, "step": 7507, "time_per_iteration": 2.706725597381592 }, { "auxiliary_loss_clip": 0.01136808, "auxiliary_loss_mlp": 0.01027705, "balance_loss_clip": 1.04697955, "balance_loss_mlp": 1.02018261, "epoch": 0.902783622918295, "flos": 32416059536640.0, "grad_norm": 2.152570695701611, "language_loss": 0.68745619, "learning_rate": 9.822425183905902e-08, "loss": 0.70910132, "num_input_tokens_seen": 162365345, "step": 7508, "time_per_iteration": 2.7689154148101807 }, { "auxiliary_loss_clip": 0.01035578, "auxiliary_loss_mlp": 0.01005432, "balance_loss_clip": 1.01980042, "balance_loss_mlp": 1.00431108, "epoch": 0.902903865808934, "flos": 63717453244800.0, "grad_norm": 0.914022278172819, "language_loss": 0.75141346, "learning_rate": 9.798327668408823e-08, "loss": 0.77182353, "num_input_tokens_seen": 162426980, "step": 7509, "time_per_iteration": 3.3677711486816406 }, { "auxiliary_loss_clip": 0.01174192, "auxiliary_loss_mlp": 0.0102595, "balance_loss_clip": 1.05000615, "balance_loss_mlp": 1.01818383, "epoch": 0.9030241086995732, "flos": 23804034600960.0, "grad_norm": 2.0383195540310286, "language_loss": 0.68342191, "learning_rate": 9.774259006354158e-08, "loss": 0.70542336, "num_input_tokens_seen": 162447050, "step": 7510, "time_per_iteration": 2.559098482131958 }, { "auxiliary_loss_clip": 0.01142255, "auxiliary_loss_mlp": 0.01026723, "balance_loss_clip": 1.04659581, "balance_loss_mlp": 1.0196296, "epoch": 0.9031443515902122, "flos": 26395887248640.0, "grad_norm": 1.9768640169521903, "language_loss": 0.76276636, "learning_rate": 9.750219201393184e-08, "loss": 0.78445613, "num_input_tokens_seen": 162467015, "step": 7511, "time_per_iteration": 2.649986743927002 }, { "auxiliary_loss_clip": 0.01154396, "auxiliary_loss_mlp": 0.01028721, "balance_loss_clip": 1.04905283, "balance_loss_mlp": 1.0211575, "epoch": 0.9032645944808513, "flos": 24939350749440.0, "grad_norm": 6.006267089595283, "language_loss": 0.77864802, "learning_rate": 9.726208257172697e-08, "loss": 0.80047917, "num_input_tokens_seen": 162488710, "step": 7512, "time_per_iteration": 2.6494545936584473 }, { "auxiliary_loss_clip": 0.01169405, "auxiliary_loss_mlp": 0.01024851, "balance_loss_clip": 1.05064201, "balance_loss_mlp": 1.01759696, "epoch": 0.9033848373714904, "flos": 21178821196800.0, "grad_norm": 5.353662109473524, "language_loss": 0.74969894, "learning_rate": 9.702226177335115e-08, "loss": 0.77164149, "num_input_tokens_seen": 162507205, "step": 7513, "time_per_iteration": 2.707078695297241 }, { "auxiliary_loss_clip": 0.01138845, "auxiliary_loss_mlp": 0.01030176, "balance_loss_clip": 1.047979, "balance_loss_mlp": 1.02271342, "epoch": 0.9035050802621295, "flos": 26286359702400.0, "grad_norm": 1.5777625735398186, "language_loss": 0.72845829, "learning_rate": 9.67827296551853e-08, "loss": 0.75014853, "num_input_tokens_seen": 162528490, "step": 7514, "time_per_iteration": 4.451396942138672 }, { "auxiliary_loss_clip": 0.0112715, "auxiliary_loss_mlp": 0.00710777, "balance_loss_clip": 1.04242229, "balance_loss_mlp": 1.00063872, "epoch": 0.9036253231527686, "flos": 24204546224640.0, "grad_norm": 2.301657970389232, "language_loss": 0.68678606, "learning_rate": 9.65434862535659e-08, "loss": 0.70516533, "num_input_tokens_seen": 162547860, "step": 7515, "time_per_iteration": 2.634431838989258 }, { "auxiliary_loss_clip": 0.01144143, "auxiliary_loss_mlp": 0.01024563, "balance_loss_clip": 1.04841292, "balance_loss_mlp": 1.01757121, "epoch": 0.9037455660434077, "flos": 18072655660800.0, "grad_norm": 2.514431286001043, "language_loss": 0.6483109, "learning_rate": 9.630453160478635e-08, "loss": 0.66999805, "num_input_tokens_seen": 162563215, "step": 7516, "time_per_iteration": 3.5313544273376465 }, { "auxiliary_loss_clip": 0.01103801, "auxiliary_loss_mlp": 0.01028251, "balance_loss_clip": 1.04173636, "balance_loss_mlp": 1.0211463, "epoch": 0.9038658089340468, "flos": 24060795995520.0, "grad_norm": 1.7496114525677269, "language_loss": 0.8230902, "learning_rate": 9.60658657450959e-08, "loss": 0.84441072, "num_input_tokens_seen": 162583515, "step": 7517, "time_per_iteration": 3.592578649520874 }, { "auxiliary_loss_clip": 0.0112473, "auxiliary_loss_mlp": 0.01021408, "balance_loss_clip": 1.04246211, "balance_loss_mlp": 1.0144943, "epoch": 0.9039860518246858, "flos": 21834298535040.0, "grad_norm": 1.8328773187097775, "language_loss": 0.79550529, "learning_rate": 9.582748871069979e-08, "loss": 0.81696665, "num_input_tokens_seen": 162602955, "step": 7518, "time_per_iteration": 2.6150314807891846 }, { "auxiliary_loss_clip": 0.01140802, "auxiliary_loss_mlp": 0.00711648, "balance_loss_clip": 1.04600716, "balance_loss_mlp": 1.00062263, "epoch": 0.904106294715325, "flos": 26614870513920.0, "grad_norm": 2.489099535292727, "language_loss": 0.83132458, "learning_rate": 9.558940053775954e-08, "loss": 0.84984899, "num_input_tokens_seen": 162621595, "step": 7519, "time_per_iteration": 2.6805331707000732 }, { "auxiliary_loss_clip": 0.01152364, "auxiliary_loss_mlp": 0.01029088, "balance_loss_clip": 1.04815125, "balance_loss_mlp": 1.02182245, "epoch": 0.904226537605964, "flos": 17785693906560.0, "grad_norm": 2.020111648860106, "language_loss": 0.68045682, "learning_rate": 9.535160126239294e-08, "loss": 0.70227134, "num_input_tokens_seen": 162638220, "step": 7520, "time_per_iteration": 2.5583486557006836 }, { "auxiliary_loss_clip": 0.01150822, "auxiliary_loss_mlp": 0.01025111, "balance_loss_clip": 1.04782784, "balance_loss_mlp": 1.0179379, "epoch": 0.9043467804966031, "flos": 24790428961920.0, "grad_norm": 1.584221845665566, "language_loss": 0.70884299, "learning_rate": 9.511409092067424e-08, "loss": 0.73060226, "num_input_tokens_seen": 162658575, "step": 7521, "time_per_iteration": 2.6410765647888184 }, { "auxiliary_loss_clip": 0.01137553, "auxiliary_loss_mlp": 0.01030611, "balance_loss_clip": 1.04803252, "balance_loss_mlp": 1.0237776, "epoch": 0.9044670233872423, "flos": 22632125472000.0, "grad_norm": 1.976949972168618, "language_loss": 0.67565155, "learning_rate": 9.487686954863327e-08, "loss": 0.69733316, "num_input_tokens_seen": 162678295, "step": 7522, "time_per_iteration": 2.617753505706787 }, { "auxiliary_loss_clip": 0.01150572, "auxiliary_loss_mlp": 0.01026851, "balance_loss_clip": 1.0476892, "balance_loss_mlp": 1.01962423, "epoch": 0.9045872662778813, "flos": 23771320289280.0, "grad_norm": 5.4212154283833245, "language_loss": 0.77427566, "learning_rate": 9.46399371822566e-08, "loss": 0.79604995, "num_input_tokens_seen": 162698070, "step": 7523, "time_per_iteration": 2.6658077239990234 }, { "auxiliary_loss_clip": 0.01170832, "auxiliary_loss_mlp": 0.01029484, "balance_loss_clip": 1.05130053, "balance_loss_mlp": 1.02228332, "epoch": 0.9047075091685204, "flos": 15191039998080.0, "grad_norm": 2.3567103649718804, "language_loss": 0.72776949, "learning_rate": 9.440329385748657e-08, "loss": 0.74977267, "num_input_tokens_seen": 162715140, "step": 7524, "time_per_iteration": 2.54486346244812 }, { "auxiliary_loss_clip": 0.01122371, "auxiliary_loss_mlp": 0.01027596, "balance_loss_clip": 1.04615068, "balance_loss_mlp": 1.02131414, "epoch": 0.9048277520591596, "flos": 18003707504640.0, "grad_norm": 1.9853800606252794, "language_loss": 0.7033788, "learning_rate": 9.416693961022137e-08, "loss": 0.72487855, "num_input_tokens_seen": 162733390, "step": 7525, "time_per_iteration": 2.58447527885437 }, { "auxiliary_loss_clip": 0.0107537, "auxiliary_loss_mlp": 0.01023239, "balance_loss_clip": 1.03971982, "balance_loss_mlp": 1.01602662, "epoch": 0.9049479949497986, "flos": 21872471713920.0, "grad_norm": 2.5182163106221966, "language_loss": 0.77206182, "learning_rate": 9.393087447631654e-08, "loss": 0.79304796, "num_input_tokens_seen": 162751670, "step": 7526, "time_per_iteration": 2.747274875640869 }, { "auxiliary_loss_clip": 0.01140369, "auxiliary_loss_mlp": 0.0102776, "balance_loss_clip": 1.04628921, "balance_loss_mlp": 1.02107191, "epoch": 0.9050682378404377, "flos": 20773928113920.0, "grad_norm": 2.0905406911491378, "language_loss": 0.73314583, "learning_rate": 9.36950984915823e-08, "loss": 0.75482714, "num_input_tokens_seen": 162770025, "step": 7527, "time_per_iteration": 2.6131961345672607 }, { "auxiliary_loss_clip": 0.01170951, "auxiliary_loss_mlp": 0.010261, "balance_loss_clip": 1.05042148, "balance_loss_mlp": 1.01832771, "epoch": 0.9051884807310768, "flos": 21580015178880.0, "grad_norm": 4.394188084708793, "language_loss": 0.69645357, "learning_rate": 9.345961169178607e-08, "loss": 0.71842408, "num_input_tokens_seen": 162789710, "step": 7528, "time_per_iteration": 2.61482834815979 }, { "auxiliary_loss_clip": 0.01109796, "auxiliary_loss_mlp": 0.01026571, "balance_loss_clip": 1.04512119, "balance_loss_mlp": 1.01932621, "epoch": 0.9053087236217159, "flos": 21908059113600.0, "grad_norm": 1.3911537412672828, "language_loss": 0.72992557, "learning_rate": 9.322441411265081e-08, "loss": 0.75128925, "num_input_tokens_seen": 162810695, "step": 7529, "time_per_iteration": 2.707634925842285 }, { "auxiliary_loss_clip": 0.01132191, "auxiliary_loss_mlp": 0.01030464, "balance_loss_clip": 1.04641807, "balance_loss_mlp": 1.02328205, "epoch": 0.9054289665123549, "flos": 17055809544960.0, "grad_norm": 2.1573385187708505, "language_loss": 0.73772711, "learning_rate": 9.298950578985554e-08, "loss": 0.75935364, "num_input_tokens_seen": 162827770, "step": 7530, "time_per_iteration": 2.602419137954712 }, { "auxiliary_loss_clip": 0.0115093, "auxiliary_loss_mlp": 0.00712349, "balance_loss_clip": 1.0486691, "balance_loss_mlp": 1.00063705, "epoch": 0.905549209402994, "flos": 20777268078720.0, "grad_norm": 1.8820297408855746, "language_loss": 0.71064943, "learning_rate": 9.275488675903665e-08, "loss": 0.72928226, "num_input_tokens_seen": 162846715, "step": 7531, "time_per_iteration": 2.5911004543304443 }, { "auxiliary_loss_clip": 0.01102871, "auxiliary_loss_mlp": 0.01022349, "balance_loss_clip": 1.04350436, "balance_loss_mlp": 1.01494646, "epoch": 0.9056694522936332, "flos": 21686813291520.0, "grad_norm": 2.4358353760176152, "language_loss": 0.73616618, "learning_rate": 9.252055705578454e-08, "loss": 0.75741833, "num_input_tokens_seen": 162866215, "step": 7532, "time_per_iteration": 2.7834115028381348 }, { "auxiliary_loss_clip": 0.01152755, "auxiliary_loss_mlp": 0.01026922, "balance_loss_clip": 1.0467695, "balance_loss_mlp": 1.02003157, "epoch": 0.9057896951842722, "flos": 29569133433600.0, "grad_norm": 1.7790974366472407, "language_loss": 0.72058463, "learning_rate": 9.228651671564747e-08, "loss": 0.74238133, "num_input_tokens_seen": 162888245, "step": 7533, "time_per_iteration": 2.6815602779388428 }, { "auxiliary_loss_clip": 0.01105368, "auxiliary_loss_mlp": 0.01024938, "balance_loss_clip": 1.04647017, "balance_loss_mlp": 1.0184356, "epoch": 0.9059099380749113, "flos": 27892248952320.0, "grad_norm": 1.506069753063794, "language_loss": 0.77860796, "learning_rate": 9.205276577412901e-08, "loss": 0.79991108, "num_input_tokens_seen": 162911025, "step": 7534, "time_per_iteration": 2.8422560691833496 }, { "auxiliary_loss_clip": 0.01144384, "auxiliary_loss_mlp": 0.00711539, "balance_loss_clip": 1.04686904, "balance_loss_mlp": 1.00058079, "epoch": 0.9060301809655504, "flos": 17748993185280.0, "grad_norm": 2.887902383627695, "language_loss": 0.77506781, "learning_rate": 9.181930426668905e-08, "loss": 0.79362702, "num_input_tokens_seen": 162927820, "step": 7535, "time_per_iteration": 2.7633213996887207 }, { "auxiliary_loss_clip": 0.01101284, "auxiliary_loss_mlp": 0.01023767, "balance_loss_clip": 1.04300046, "balance_loss_mlp": 1.01671612, "epoch": 0.9061504238561895, "flos": 31759432963200.0, "grad_norm": 1.6164779176923096, "language_loss": 0.67940986, "learning_rate": 9.158613222874346e-08, "loss": 0.70066041, "num_input_tokens_seen": 162949445, "step": 7536, "time_per_iteration": 2.8361682891845703 }, { "auxiliary_loss_clip": 0.01136462, "auxiliary_loss_mlp": 0.01023539, "balance_loss_clip": 1.04451478, "balance_loss_mlp": 1.01675558, "epoch": 0.9062706667468285, "flos": 20048066075520.0, "grad_norm": 1.7827056026506813, "language_loss": 0.8205958, "learning_rate": 9.135324969566394e-08, "loss": 0.84219587, "num_input_tokens_seen": 162968945, "step": 7537, "time_per_iteration": 2.804973840713501 }, { "auxiliary_loss_clip": 0.01157778, "auxiliary_loss_mlp": 0.01025885, "balance_loss_clip": 1.04949951, "balance_loss_mlp": 1.01870251, "epoch": 0.9063909096374677, "flos": 18437292576000.0, "grad_norm": 1.9004010920892043, "language_loss": 0.75297421, "learning_rate": 9.112065670277913e-08, "loss": 0.77481085, "num_input_tokens_seen": 162985310, "step": 7538, "time_per_iteration": 2.6549460887908936 }, { "auxiliary_loss_clip": 0.01132197, "auxiliary_loss_mlp": 0.01025714, "balance_loss_clip": 1.04313064, "balance_loss_mlp": 1.01936018, "epoch": 0.9065111525281068, "flos": 33547353361920.0, "grad_norm": 1.8182405428467678, "language_loss": 0.7341876, "learning_rate": 9.088835328537303e-08, "loss": 0.75576669, "num_input_tokens_seen": 163006900, "step": 7539, "time_per_iteration": 3.6702113151550293 }, { "auxiliary_loss_clip": 0.01140349, "auxiliary_loss_mlp": 0.01027524, "balance_loss_clip": 1.04697394, "balance_loss_mlp": 1.0204612, "epoch": 0.9066313954187458, "flos": 23367863750400.0, "grad_norm": 2.5334505778607976, "language_loss": 0.71696079, "learning_rate": 9.065633947868568e-08, "loss": 0.73863947, "num_input_tokens_seen": 163026505, "step": 7540, "time_per_iteration": 2.608259916305542 }, { "auxiliary_loss_clip": 0.0112031, "auxiliary_loss_mlp": 0.00711374, "balance_loss_clip": 1.04734683, "balance_loss_mlp": 1.0005424, "epoch": 0.906751638309385, "flos": 26249623067520.0, "grad_norm": 2.356812748646348, "language_loss": 0.80120611, "learning_rate": 9.042461531791379e-08, "loss": 0.81952298, "num_input_tokens_seen": 163044925, "step": 7541, "time_per_iteration": 3.6764540672302246 }, { "auxiliary_loss_clip": 0.01163879, "auxiliary_loss_mlp": 0.01023546, "balance_loss_clip": 1.04683852, "balance_loss_mlp": 1.01668286, "epoch": 0.906871881200024, "flos": 16544477485440.0, "grad_norm": 2.0496114018945137, "language_loss": 0.78407449, "learning_rate": 9.019318083820903e-08, "loss": 0.80594873, "num_input_tokens_seen": 163063505, "step": 7542, "time_per_iteration": 3.4474313259124756 }, { "auxiliary_loss_clip": 0.0115208, "auxiliary_loss_mlp": 0.01028282, "balance_loss_clip": 1.04752803, "balance_loss_mlp": 1.02092695, "epoch": 0.9069921240906631, "flos": 24605129675520.0, "grad_norm": 1.699207777204609, "language_loss": 0.8509391, "learning_rate": 8.996203607468045e-08, "loss": 0.87274271, "num_input_tokens_seen": 163082505, "step": 7543, "time_per_iteration": 3.504572629928589 }, { "auxiliary_loss_clip": 0.01145812, "auxiliary_loss_mlp": 0.01023417, "balance_loss_clip": 1.04387546, "balance_loss_mlp": 1.01632428, "epoch": 0.9071123669813023, "flos": 25374731500800.0, "grad_norm": 1.5059510614098557, "language_loss": 0.75401008, "learning_rate": 8.973118106239241e-08, "loss": 0.77570236, "num_input_tokens_seen": 163105110, "step": 7544, "time_per_iteration": 2.628758430480957 }, { "auxiliary_loss_clip": 0.01090368, "auxiliary_loss_mlp": 0.01028228, "balance_loss_clip": 1.03995013, "balance_loss_mlp": 1.02104008, "epoch": 0.9072326098719413, "flos": 26725798690560.0, "grad_norm": 2.158633792688565, "language_loss": 0.94684923, "learning_rate": 8.95006158363656e-08, "loss": 0.96803522, "num_input_tokens_seen": 163125295, "step": 7545, "time_per_iteration": 2.832205295562744 }, { "auxiliary_loss_clip": 0.01155317, "auxiliary_loss_mlp": 0.01026271, "balance_loss_clip": 1.05050755, "balance_loss_mlp": 1.01875174, "epoch": 0.9073528527625804, "flos": 23878800760320.0, "grad_norm": 1.9099288691652458, "language_loss": 0.77412021, "learning_rate": 8.9270340431576e-08, "loss": 0.79593611, "num_input_tokens_seen": 163144385, "step": 7546, "time_per_iteration": 2.6066415309906006 }, { "auxiliary_loss_clip": 0.01153113, "auxiliary_loss_mlp": 0.01023684, "balance_loss_clip": 1.04570723, "balance_loss_mlp": 1.0169338, "epoch": 0.9074730956532195, "flos": 37852144767360.0, "grad_norm": 2.1527132301081013, "language_loss": 0.73299968, "learning_rate": 8.904035488295658e-08, "loss": 0.75476766, "num_input_tokens_seen": 163163885, "step": 7547, "time_per_iteration": 2.7207236289978027 }, { "auxiliary_loss_clip": 0.01060637, "auxiliary_loss_mlp": 0.00701881, "balance_loss_clip": 1.01911998, "balance_loss_mlp": 0.99992698, "epoch": 0.9075933385438586, "flos": 65173307385600.0, "grad_norm": 0.6621232491744828, "language_loss": 0.53261656, "learning_rate": 8.881065922539632e-08, "loss": 0.55024171, "num_input_tokens_seen": 163224325, "step": 7548, "time_per_iteration": 3.1073007583618164 }, { "auxiliary_loss_clip": 0.01115387, "auxiliary_loss_mlp": 0.01026967, "balance_loss_clip": 1.04464209, "balance_loss_mlp": 1.01982665, "epoch": 0.9077135814344977, "flos": 19931571290880.0, "grad_norm": 2.8949824472933, "language_loss": 0.73488814, "learning_rate": 8.85812534937389e-08, "loss": 0.75631171, "num_input_tokens_seen": 163242425, "step": 7549, "time_per_iteration": 2.6882266998291016 }, { "auxiliary_loss_clip": 0.0116013, "auxiliary_loss_mlp": 0.01026847, "balance_loss_clip": 1.04975104, "balance_loss_mlp": 1.01912808, "epoch": 0.9078338243251368, "flos": 17529650784000.0, "grad_norm": 3.0833553361269965, "language_loss": 0.672378, "learning_rate": 8.835213772278583e-08, "loss": 0.69424778, "num_input_tokens_seen": 163259280, "step": 7550, "time_per_iteration": 2.553407669067383 }, { "auxiliary_loss_clip": 0.01113516, "auxiliary_loss_mlp": 0.01025421, "balance_loss_clip": 1.04517198, "balance_loss_mlp": 1.01851308, "epoch": 0.9079540672157759, "flos": 28803410277120.0, "grad_norm": 2.1345225255482263, "language_loss": 0.7905879, "learning_rate": 8.812331194729373e-08, "loss": 0.81197727, "num_input_tokens_seen": 163278925, "step": 7551, "time_per_iteration": 2.7858681678771973 }, { "auxiliary_loss_clip": 0.01175326, "auxiliary_loss_mlp": 0.01034193, "balance_loss_clip": 1.05429912, "balance_loss_mlp": 1.02617025, "epoch": 0.9080743101064149, "flos": 23513840622720.0, "grad_norm": 2.3595624119790366, "language_loss": 0.72099543, "learning_rate": 8.789477620197461e-08, "loss": 0.74309063, "num_input_tokens_seen": 163298450, "step": 7552, "time_per_iteration": 2.5723469257354736 }, { "auxiliary_loss_clip": 0.01136004, "auxiliary_loss_mlp": 0.0102706, "balance_loss_clip": 1.045259, "balance_loss_mlp": 1.01947212, "epoch": 0.9081945529970541, "flos": 22778102344320.0, "grad_norm": 2.192528510048214, "language_loss": 0.79111034, "learning_rate": 8.766653052149831e-08, "loss": 0.81274104, "num_input_tokens_seen": 163313635, "step": 7553, "time_per_iteration": 2.6205058097839355 }, { "auxiliary_loss_clip": 0.01139209, "auxiliary_loss_mlp": 0.01024983, "balance_loss_clip": 1.04950142, "balance_loss_mlp": 1.01790214, "epoch": 0.9083147958876931, "flos": 18873714821760.0, "grad_norm": 2.159355530399871, "language_loss": 0.74625832, "learning_rate": 8.743857494048823e-08, "loss": 0.76790023, "num_input_tokens_seen": 163330450, "step": 7554, "time_per_iteration": 2.718095064163208 }, { "auxiliary_loss_clip": 0.01121862, "auxiliary_loss_mlp": 0.010246, "balance_loss_clip": 1.04673195, "balance_loss_mlp": 1.01693213, "epoch": 0.9084350387783322, "flos": 18909374048640.0, "grad_norm": 2.2002069896945673, "language_loss": 0.63034791, "learning_rate": 8.721090949352605e-08, "loss": 0.65181249, "num_input_tokens_seen": 163346690, "step": 7555, "time_per_iteration": 2.750746488571167 }, { "auxiliary_loss_clip": 0.01165171, "auxiliary_loss_mlp": 0.01026365, "balance_loss_clip": 1.05251908, "balance_loss_mlp": 1.01947212, "epoch": 0.9085552816689714, "flos": 20595488325120.0, "grad_norm": 5.743869883529497, "language_loss": 0.72949517, "learning_rate": 8.698353421514793e-08, "loss": 0.75141054, "num_input_tokens_seen": 163365065, "step": 7556, "time_per_iteration": 2.622650623321533 }, { "auxiliary_loss_clip": 0.01150572, "auxiliary_loss_mlp": 0.01026195, "balance_loss_clip": 1.04804456, "balance_loss_mlp": 1.01916802, "epoch": 0.9086755245596104, "flos": 18113163223680.0, "grad_norm": 2.1864371875254545, "language_loss": 0.8051393, "learning_rate": 8.67564491398467e-08, "loss": 0.82690704, "num_input_tokens_seen": 163382070, "step": 7557, "time_per_iteration": 2.6248056888580322 }, { "auxiliary_loss_clip": 0.01156038, "auxiliary_loss_mlp": 0.01024782, "balance_loss_clip": 1.04800928, "balance_loss_mlp": 1.01740563, "epoch": 0.9087957674502495, "flos": 19129793857920.0, "grad_norm": 3.122804502169533, "language_loss": 0.73549712, "learning_rate": 8.652965430207104e-08, "loss": 0.75730526, "num_input_tokens_seen": 163399975, "step": 7558, "time_per_iteration": 2.657003164291382 }, { "auxiliary_loss_clip": 0.01156939, "auxiliary_loss_mlp": 0.01028014, "balance_loss_clip": 1.0482223, "balance_loss_mlp": 1.02125192, "epoch": 0.9089160103408886, "flos": 18109930999680.0, "grad_norm": 3.042980356433486, "language_loss": 0.65651172, "learning_rate": 8.630314973622521e-08, "loss": 0.6783613, "num_input_tokens_seen": 163417520, "step": 7559, "time_per_iteration": 2.585538387298584 }, { "auxiliary_loss_clip": 0.01151438, "auxiliary_loss_mlp": 0.01024709, "balance_loss_clip": 1.04927278, "balance_loss_mlp": 1.0173533, "epoch": 0.9090362532315277, "flos": 33364855336320.0, "grad_norm": 2.3254873570086976, "language_loss": 0.71018481, "learning_rate": 8.607693547666995e-08, "loss": 0.73194629, "num_input_tokens_seen": 163440060, "step": 7560, "time_per_iteration": 2.718611001968384 }, { "auxiliary_loss_clip": 0.01036657, "auxiliary_loss_mlp": 0.01006673, "balance_loss_clip": 1.0188396, "balance_loss_mlp": 1.00558209, "epoch": 0.9091564961221668, "flos": 71480585082240.0, "grad_norm": 0.8797430508440067, "language_loss": 0.57954752, "learning_rate": 8.585101155772201e-08, "loss": 0.59998083, "num_input_tokens_seen": 163502180, "step": 7561, "time_per_iteration": 3.334077835083008 }, { "auxiliary_loss_clip": 0.01129008, "auxiliary_loss_mlp": 0.0102461, "balance_loss_clip": 1.04159343, "balance_loss_mlp": 1.01699853, "epoch": 0.9092767390128058, "flos": 24712574232960.0, "grad_norm": 1.9695734286073223, "language_loss": 0.68566179, "learning_rate": 8.562537801365377e-08, "loss": 0.70719796, "num_input_tokens_seen": 163521915, "step": 7562, "time_per_iteration": 2.705937147140503 }, { "auxiliary_loss_clip": 0.01169332, "auxiliary_loss_mlp": 0.01033849, "balance_loss_clip": 1.04914951, "balance_loss_mlp": 1.0256716, "epoch": 0.909396981903445, "flos": 23586487879680.0, "grad_norm": 4.253090522706542, "language_loss": 0.70170271, "learning_rate": 8.540003487869362e-08, "loss": 0.7237345, "num_input_tokens_seen": 163543585, "step": 7563, "time_per_iteration": 2.593487501144409 }, { "auxiliary_loss_clip": 0.01108164, "auxiliary_loss_mlp": 0.01026252, "balance_loss_clip": 1.04238629, "balance_loss_mlp": 1.01954353, "epoch": 0.909517224794084, "flos": 23404169422080.0, "grad_norm": 3.0070898184255026, "language_loss": 0.79515386, "learning_rate": 8.517498218702557e-08, "loss": 0.81649804, "num_input_tokens_seen": 163561515, "step": 7564, "time_per_iteration": 2.6663260459899902 }, { "auxiliary_loss_clip": 0.01116188, "auxiliary_loss_mlp": 0.01030482, "balance_loss_clip": 1.04298115, "balance_loss_mlp": 1.02332616, "epoch": 0.9096374676847231, "flos": 19208618254080.0, "grad_norm": 1.7581486074709975, "language_loss": 0.69630092, "learning_rate": 8.49502199727905e-08, "loss": 0.7177676, "num_input_tokens_seen": 163579540, "step": 7565, "time_per_iteration": 3.6004703044891357 }, { "auxiliary_loss_clip": 0.01148923, "auxiliary_loss_mlp": 0.01022778, "balance_loss_clip": 1.04500282, "balance_loss_mlp": 1.01564324, "epoch": 0.9097577105753623, "flos": 33292495388160.0, "grad_norm": 2.5612649273845114, "language_loss": 0.66407841, "learning_rate": 8.472574827008428e-08, "loss": 0.68579543, "num_input_tokens_seen": 163600425, "step": 7566, "time_per_iteration": 2.6659111976623535 }, { "auxiliary_loss_clip": 0.01150998, "auxiliary_loss_mlp": 0.01027264, "balance_loss_clip": 1.0460788, "balance_loss_mlp": 1.01986122, "epoch": 0.9098779534660013, "flos": 21906443001600.0, "grad_norm": 4.7544779104267345, "language_loss": 0.8410694, "learning_rate": 8.450156711295942e-08, "loss": 0.86285198, "num_input_tokens_seen": 163620595, "step": 7567, "time_per_iteration": 3.6097874641418457 }, { "auxiliary_loss_clip": 0.01135886, "auxiliary_loss_mlp": 0.01026452, "balance_loss_clip": 1.04965639, "balance_loss_mlp": 1.01956165, "epoch": 0.9099981963566404, "flos": 25730354102400.0, "grad_norm": 3.0728866555376353, "language_loss": 0.86674547, "learning_rate": 8.427767653542383e-08, "loss": 0.88836884, "num_input_tokens_seen": 163635765, "step": 7568, "time_per_iteration": 4.4694249629974365 }, { "auxiliary_loss_clip": 0.01102794, "auxiliary_loss_mlp": 0.01022759, "balance_loss_clip": 1.04273415, "balance_loss_mlp": 1.01568079, "epoch": 0.9101184392472795, "flos": 21069437304960.0, "grad_norm": 2.0068584329733024, "language_loss": 0.70425314, "learning_rate": 8.405407657144125e-08, "loss": 0.72550869, "num_input_tokens_seen": 163654925, "step": 7569, "time_per_iteration": 2.715698480606079 }, { "auxiliary_loss_clip": 0.01130734, "auxiliary_loss_mlp": 0.01025721, "balance_loss_clip": 1.04463887, "balance_loss_mlp": 1.01891112, "epoch": 0.9102386821379186, "flos": 24752614919040.0, "grad_norm": 2.197958540327566, "language_loss": 0.72804904, "learning_rate": 8.383076725493232e-08, "loss": 0.74961364, "num_input_tokens_seen": 163672245, "step": 7570, "time_per_iteration": 2.71081805229187 }, { "auxiliary_loss_clip": 0.01153538, "auxiliary_loss_mlp": 0.01024205, "balance_loss_clip": 1.0479784, "balance_loss_mlp": 1.01727891, "epoch": 0.9103589250285576, "flos": 22562818179840.0, "grad_norm": 1.8865574085738526, "language_loss": 0.67916679, "learning_rate": 8.360774861977216e-08, "loss": 0.7009443, "num_input_tokens_seen": 163691365, "step": 7571, "time_per_iteration": 2.6808815002441406 }, { "auxiliary_loss_clip": 0.01136963, "auxiliary_loss_mlp": 0.01026342, "balance_loss_clip": 1.0445292, "balance_loss_mlp": 1.01967871, "epoch": 0.9104791679191968, "flos": 25373474524800.0, "grad_norm": 2.0714530540179625, "language_loss": 0.74652755, "learning_rate": 8.338502069979281e-08, "loss": 0.76816058, "num_input_tokens_seen": 163711675, "step": 7572, "time_per_iteration": 2.6828906536102295 }, { "auxiliary_loss_clip": 0.01153449, "auxiliary_loss_mlp": 0.01023994, "balance_loss_clip": 1.04390168, "balance_loss_mlp": 1.01680863, "epoch": 0.9105994108098359, "flos": 14426681558400.0, "grad_norm": 7.268584543954202, "language_loss": 0.79631031, "learning_rate": 8.316258352878214e-08, "loss": 0.81808472, "num_input_tokens_seen": 163728095, "step": 7573, "time_per_iteration": 2.537558078765869 }, { "auxiliary_loss_clip": 0.01156933, "auxiliary_loss_mlp": 0.01029995, "balance_loss_clip": 1.04678583, "balance_loss_mlp": 1.02246737, "epoch": 0.9107196537004749, "flos": 26718292748160.0, "grad_norm": 2.7572611764079946, "language_loss": 0.7098819, "learning_rate": 8.294043714048338e-08, "loss": 0.73175114, "num_input_tokens_seen": 163747175, "step": 7574, "time_per_iteration": 2.7309792041778564 }, { "auxiliary_loss_clip": 0.01047552, "auxiliary_loss_mlp": 0.01000218, "balance_loss_clip": 1.01924574, "balance_loss_mlp": 0.9992528, "epoch": 0.9108398965911141, "flos": 66532634703360.0, "grad_norm": 0.7550077296411429, "language_loss": 0.60425556, "learning_rate": 8.271858156859624e-08, "loss": 0.62473327, "num_input_tokens_seen": 163812545, "step": 7575, "time_per_iteration": 3.30203914642334 }, { "auxiliary_loss_clip": 0.01170706, "auxiliary_loss_mlp": 0.01029768, "balance_loss_clip": 1.05159354, "balance_loss_mlp": 1.02232337, "epoch": 0.9109601394817531, "flos": 25411073086080.0, "grad_norm": 1.6998968217373287, "language_loss": 0.73808414, "learning_rate": 8.249701684677557e-08, "loss": 0.76008886, "num_input_tokens_seen": 163833870, "step": 7576, "time_per_iteration": 2.5888524055480957 }, { "auxiliary_loss_clip": 0.01152469, "auxiliary_loss_mlp": 0.01028945, "balance_loss_clip": 1.04902434, "balance_loss_mlp": 1.02174222, "epoch": 0.9110803823723922, "flos": 22747794243840.0, "grad_norm": 2.1750978778676937, "language_loss": 0.8113513, "learning_rate": 8.227574300863294e-08, "loss": 0.83316547, "num_input_tokens_seen": 163854040, "step": 7577, "time_per_iteration": 2.638376235961914 }, { "auxiliary_loss_clip": 0.01138492, "auxiliary_loss_mlp": 0.01021319, "balance_loss_clip": 1.04613698, "balance_loss_mlp": 1.01354408, "epoch": 0.9112006252630314, "flos": 48469924131840.0, "grad_norm": 1.8357727723602861, "language_loss": 0.6970458, "learning_rate": 8.205476008773548e-08, "loss": 0.7186439, "num_input_tokens_seen": 163878040, "step": 7578, "time_per_iteration": 2.802794933319092 }, { "auxiliary_loss_clip": 0.01118663, "auxiliary_loss_mlp": 0.0102823, "balance_loss_clip": 1.04901302, "balance_loss_mlp": 1.02136016, "epoch": 0.9113208681536704, "flos": 30009649829760.0, "grad_norm": 5.517441785951352, "language_loss": 0.82589149, "learning_rate": 8.183406811760596e-08, "loss": 0.84736049, "num_input_tokens_seen": 163897770, "step": 7579, "time_per_iteration": 2.7497386932373047 }, { "auxiliary_loss_clip": 0.01105886, "auxiliary_loss_mlp": 0.01028409, "balance_loss_clip": 1.04155922, "balance_loss_mlp": 1.02163529, "epoch": 0.9114411110443095, "flos": 25594971742080.0, "grad_norm": 1.6235574033862439, "language_loss": 0.74049306, "learning_rate": 8.161366713172313e-08, "loss": 0.76183605, "num_input_tokens_seen": 163920160, "step": 7580, "time_per_iteration": 2.713073492050171 }, { "auxiliary_loss_clip": 0.01127203, "auxiliary_loss_mlp": 0.01028606, "balance_loss_clip": 1.04479313, "balance_loss_mlp": 1.02110147, "epoch": 0.9115613539349486, "flos": 18399729928320.0, "grad_norm": 3.0624945525286718, "language_loss": 0.84822118, "learning_rate": 8.139355716352137e-08, "loss": 0.86977929, "num_input_tokens_seen": 163935000, "step": 7581, "time_per_iteration": 2.6581573486328125 }, { "auxiliary_loss_clip": 0.01139104, "auxiliary_loss_mlp": 0.01023666, "balance_loss_clip": 1.04543138, "balance_loss_mlp": 1.01622725, "epoch": 0.9116815968255877, "flos": 21726171619200.0, "grad_norm": 1.5825278902838595, "language_loss": 0.70061189, "learning_rate": 8.117373824639196e-08, "loss": 0.72223961, "num_input_tokens_seen": 163955265, "step": 7582, "time_per_iteration": 2.6729307174682617 }, { "auxiliary_loss_clip": 0.01070016, "auxiliary_loss_mlp": 0.01002161, "balance_loss_clip": 1.01809192, "balance_loss_mlp": 1.00118911, "epoch": 0.9118018397162267, "flos": 65363526835200.0, "grad_norm": 0.7331365103210968, "language_loss": 0.59188139, "learning_rate": 8.095421041368067e-08, "loss": 0.61260319, "num_input_tokens_seen": 164014680, "step": 7583, "time_per_iteration": 3.092115879058838 }, { "auxiliary_loss_clip": 0.01137008, "auxiliary_loss_mlp": 0.00711612, "balance_loss_clip": 1.04808211, "balance_loss_mlp": 1.0006572, "epoch": 0.9119220826068659, "flos": 20922885815040.0, "grad_norm": 5.111796720333266, "language_loss": 0.70370674, "learning_rate": 8.073497369868999e-08, "loss": 0.72219294, "num_input_tokens_seen": 164033140, "step": 7584, "time_per_iteration": 2.6416308879852295 }, { "auxiliary_loss_clip": 0.01144469, "auxiliary_loss_mlp": 0.01028248, "balance_loss_clip": 1.04712749, "balance_loss_mlp": 1.0206542, "epoch": 0.912042325497505, "flos": 28366449327360.0, "grad_norm": 1.99898042730032, "language_loss": 0.75580323, "learning_rate": 8.051602813467772e-08, "loss": 0.77753043, "num_input_tokens_seen": 164054995, "step": 7585, "time_per_iteration": 2.689222574234009 }, { "auxiliary_loss_clip": 0.01156749, "auxiliary_loss_mlp": 0.01023886, "balance_loss_clip": 1.0485698, "balance_loss_mlp": 1.01693583, "epoch": 0.912162568388144, "flos": 17566782468480.0, "grad_norm": 1.733209986729379, "language_loss": 0.7116856, "learning_rate": 8.029737375485756e-08, "loss": 0.7334919, "num_input_tokens_seen": 164074225, "step": 7586, "time_per_iteration": 2.627943992614746 }, { "auxiliary_loss_clip": 0.01170505, "auxiliary_loss_mlp": 0.0102737, "balance_loss_clip": 1.05151939, "balance_loss_mlp": 1.02096844, "epoch": 0.9122828112787832, "flos": 19827897661440.0, "grad_norm": 1.8889980968824973, "language_loss": 0.72812247, "learning_rate": 8.007901059239986e-08, "loss": 0.75010121, "num_input_tokens_seen": 164093505, "step": 7587, "time_per_iteration": 2.5463953018188477 }, { "auxiliary_loss_clip": 0.01136799, "auxiliary_loss_mlp": 0.01022208, "balance_loss_clip": 1.04451632, "balance_loss_mlp": 1.01500487, "epoch": 0.9124030541694222, "flos": 20813789232000.0, "grad_norm": 1.67123180010291, "language_loss": 0.80217999, "learning_rate": 7.986093868042964e-08, "loss": 0.82377005, "num_input_tokens_seen": 164113750, "step": 7588, "time_per_iteration": 2.665104389190674 }, { "auxiliary_loss_clip": 0.01150335, "auxiliary_loss_mlp": 0.01027023, "balance_loss_clip": 1.04673147, "balance_loss_mlp": 1.02076769, "epoch": 0.9125232970600613, "flos": 25192305302400.0, "grad_norm": 1.8757857811221446, "language_loss": 0.6813435, "learning_rate": 7.964315805202826e-08, "loss": 0.70311707, "num_input_tokens_seen": 164134330, "step": 7589, "time_per_iteration": 2.608074188232422 }, { "auxiliary_loss_clip": 0.01138935, "auxiliary_loss_mlp": 0.01025827, "balance_loss_clip": 1.0480988, "balance_loss_mlp": 1.01890743, "epoch": 0.9126435399507005, "flos": 19719591177600.0, "grad_norm": 3.2957357615863474, "language_loss": 0.73407853, "learning_rate": 7.942566874023304e-08, "loss": 0.7557261, "num_input_tokens_seen": 164153515, "step": 7590, "time_per_iteration": 2.649718761444092 }, { "auxiliary_loss_clip": 0.01134651, "auxiliary_loss_mlp": 0.01024801, "balance_loss_clip": 1.04438233, "balance_loss_mlp": 1.01716542, "epoch": 0.9127637828413395, "flos": 19573614305280.0, "grad_norm": 2.2217245151698237, "language_loss": 0.69878972, "learning_rate": 7.920847077803649e-08, "loss": 0.72038424, "num_input_tokens_seen": 164171305, "step": 7591, "time_per_iteration": 3.572633743286133 }, { "auxiliary_loss_clip": 0.01092536, "auxiliary_loss_mlp": 0.01029664, "balance_loss_clip": 1.03851819, "balance_loss_mlp": 1.02254093, "epoch": 0.9128840257319786, "flos": 20230635928320.0, "grad_norm": 2.9766928940592754, "language_loss": 0.8216629, "learning_rate": 7.899156419838826e-08, "loss": 0.8428849, "num_input_tokens_seen": 164190275, "step": 7592, "time_per_iteration": 2.7201054096221924 }, { "auxiliary_loss_clip": 0.01117836, "auxiliary_loss_mlp": 0.01025942, "balance_loss_clip": 1.04406238, "balance_loss_mlp": 1.01889634, "epoch": 0.9130042686226177, "flos": 24858658846080.0, "grad_norm": 1.965290538887155, "language_loss": 0.65645021, "learning_rate": 7.87749490341918e-08, "loss": 0.67788798, "num_input_tokens_seen": 164210550, "step": 7593, "time_per_iteration": 3.641904354095459 }, { "auxiliary_loss_clip": 0.01173774, "auxiliary_loss_mlp": 0.01023762, "balance_loss_clip": 1.05278397, "balance_loss_mlp": 1.01622832, "epoch": 0.9131245115132568, "flos": 23581747284480.0, "grad_norm": 3.555891022562392, "language_loss": 0.83398348, "learning_rate": 7.855862531830836e-08, "loss": 0.85595882, "num_input_tokens_seen": 164226660, "step": 7594, "time_per_iteration": 3.4981184005737305 }, { "auxiliary_loss_clip": 0.01151163, "auxiliary_loss_mlp": 0.01028519, "balance_loss_clip": 1.04644251, "balance_loss_mlp": 1.02161372, "epoch": 0.9132447544038959, "flos": 19931607204480.0, "grad_norm": 1.6609463206568413, "language_loss": 0.72878242, "learning_rate": 7.834259308355373e-08, "loss": 0.7505793, "num_input_tokens_seen": 164245425, "step": 7595, "time_per_iteration": 3.5054028034210205 }, { "auxiliary_loss_clip": 0.0107164, "auxiliary_loss_mlp": 0.01024813, "balance_loss_clip": 1.03685784, "balance_loss_mlp": 1.01741612, "epoch": 0.9133649972945349, "flos": 21981747864960.0, "grad_norm": 2.7682951201494594, "language_loss": 0.75275081, "learning_rate": 7.812685236269989e-08, "loss": 0.77371538, "num_input_tokens_seen": 164264085, "step": 7596, "time_per_iteration": 2.7892160415649414 }, { "auxiliary_loss_clip": 0.01031536, "auxiliary_loss_mlp": 0.01004913, "balance_loss_clip": 1.0206697, "balance_loss_mlp": 1.00386381, "epoch": 0.9134852401851741, "flos": 71240523511680.0, "grad_norm": 0.792448801839935, "language_loss": 0.58610153, "learning_rate": 7.791140318847445e-08, "loss": 0.60646594, "num_input_tokens_seen": 164322220, "step": 7597, "time_per_iteration": 3.2513725757598877 }, { "auxiliary_loss_clip": 0.01132007, "auxiliary_loss_mlp": 0.01029031, "balance_loss_clip": 1.04671764, "balance_loss_mlp": 1.0220809, "epoch": 0.9136054830758131, "flos": 23626923615360.0, "grad_norm": 1.8445885979839092, "language_loss": 0.80396736, "learning_rate": 7.769624559356081e-08, "loss": 0.82557774, "num_input_tokens_seen": 164345615, "step": 7598, "time_per_iteration": 2.6790735721588135 }, { "auxiliary_loss_clip": 0.01155816, "auxiliary_loss_mlp": 0.01023886, "balance_loss_clip": 1.04974389, "balance_loss_mlp": 1.0164113, "epoch": 0.9137257259664522, "flos": 23438858981760.0, "grad_norm": 4.8904176123908325, "language_loss": 0.75210935, "learning_rate": 7.748137961059842e-08, "loss": 0.77390635, "num_input_tokens_seen": 164359595, "step": 7599, "time_per_iteration": 2.5754196643829346 }, { "auxiliary_loss_clip": 0.0116591, "auxiliary_loss_mlp": 0.01023251, "balance_loss_clip": 1.04919529, "balance_loss_mlp": 1.0163641, "epoch": 0.9138459688570914, "flos": 19127854523520.0, "grad_norm": 2.946798577820436, "language_loss": 0.65574712, "learning_rate": 7.726680527218211e-08, "loss": 0.67763871, "num_input_tokens_seen": 164376635, "step": 7600, "time_per_iteration": 2.6502246856689453 }, { "auxiliary_loss_clip": 0.01171308, "auxiliary_loss_mlp": 0.01024683, "balance_loss_clip": 1.05031896, "balance_loss_mlp": 1.01717913, "epoch": 0.9139662117477304, "flos": 46281240714240.0, "grad_norm": 1.6137806679190025, "language_loss": 0.75366879, "learning_rate": 7.70525226108627e-08, "loss": 0.77562875, "num_input_tokens_seen": 164400305, "step": 7601, "time_per_iteration": 2.8000195026397705 }, { "auxiliary_loss_clip": 0.01158603, "auxiliary_loss_mlp": 0.01022894, "balance_loss_clip": 1.05225372, "balance_loss_mlp": 1.01589, "epoch": 0.9140864546383695, "flos": 22273198819200.0, "grad_norm": 1.9451744568059042, "language_loss": 0.80013597, "learning_rate": 7.683853165914666e-08, "loss": 0.82195097, "num_input_tokens_seen": 164418075, "step": 7602, "time_per_iteration": 2.620720148086548 }, { "auxiliary_loss_clip": 0.01105005, "auxiliary_loss_mlp": 0.01030498, "balance_loss_clip": 1.04367304, "balance_loss_mlp": 1.02328634, "epoch": 0.9142066975290086, "flos": 17530009920000.0, "grad_norm": 2.489006271013575, "language_loss": 0.77094561, "learning_rate": 7.662483244949602e-08, "loss": 0.79230064, "num_input_tokens_seen": 164435335, "step": 7603, "time_per_iteration": 2.711683750152588 }, { "auxiliary_loss_clip": 0.01114724, "auxiliary_loss_mlp": 0.01026863, "balance_loss_clip": 1.04461884, "balance_loss_mlp": 1.01997864, "epoch": 0.9143269404196477, "flos": 17712148809600.0, "grad_norm": 2.2093990333768487, "language_loss": 0.80299997, "learning_rate": 7.641142501432951e-08, "loss": 0.82441592, "num_input_tokens_seen": 164451530, "step": 7604, "time_per_iteration": 2.6101598739624023 }, { "auxiliary_loss_clip": 0.01131735, "auxiliary_loss_mlp": 0.01025621, "balance_loss_clip": 1.04557014, "balance_loss_mlp": 1.01855755, "epoch": 0.9144471833102867, "flos": 33323414019840.0, "grad_norm": 1.6985230836071736, "language_loss": 0.73807734, "learning_rate": 7.619830938602013e-08, "loss": 0.75965089, "num_input_tokens_seen": 164472755, "step": 7605, "time_per_iteration": 2.731496572494507 }, { "auxiliary_loss_clip": 0.01152777, "auxiliary_loss_mlp": 0.01031025, "balance_loss_clip": 1.05046105, "balance_loss_mlp": 1.02331865, "epoch": 0.9145674262009259, "flos": 21068970428160.0, "grad_norm": 2.46168933506865, "language_loss": 0.8269729, "learning_rate": 7.598548559689777e-08, "loss": 0.84881091, "num_input_tokens_seen": 164491155, "step": 7606, "time_per_iteration": 2.6045854091644287 }, { "auxiliary_loss_clip": 0.01114671, "auxiliary_loss_mlp": 0.010284, "balance_loss_clip": 1.0431298, "balance_loss_mlp": 1.02117622, "epoch": 0.914687669091565, "flos": 16800269212800.0, "grad_norm": 2.7213582924341315, "language_loss": 0.81688213, "learning_rate": 7.577295367924751e-08, "loss": 0.83831286, "num_input_tokens_seen": 164507555, "step": 7607, "time_per_iteration": 2.659175157546997 }, { "auxiliary_loss_clip": 0.01143774, "auxiliary_loss_mlp": 0.01026027, "balance_loss_clip": 1.04934025, "balance_loss_mlp": 1.01877952, "epoch": 0.914807911982204, "flos": 25773627012480.0, "grad_norm": 2.909770574691157, "language_loss": 0.82308221, "learning_rate": 7.556071366531002e-08, "loss": 0.84478021, "num_input_tokens_seen": 164528525, "step": 7608, "time_per_iteration": 2.667623996734619 }, { "auxiliary_loss_clip": 0.01156754, "auxiliary_loss_mlp": 0.01028585, "balance_loss_clip": 1.05115974, "balance_loss_mlp": 1.02119946, "epoch": 0.9149281548728432, "flos": 19208043636480.0, "grad_norm": 1.9943940864211487, "language_loss": 0.79228818, "learning_rate": 7.53487655872822e-08, "loss": 0.81414163, "num_input_tokens_seen": 164547695, "step": 7609, "time_per_iteration": 2.6398074626922607 }, { "auxiliary_loss_clip": 0.01109421, "auxiliary_loss_mlp": 0.01023069, "balance_loss_clip": 1.04278135, "balance_loss_mlp": 1.01638699, "epoch": 0.9150483977634822, "flos": 26870554500480.0, "grad_norm": 2.157455266163216, "language_loss": 0.74223292, "learning_rate": 7.513710947731656e-08, "loss": 0.76355779, "num_input_tokens_seen": 164568905, "step": 7610, "time_per_iteration": 2.7494161128997803 }, { "auxiliary_loss_clip": 0.01129732, "auxiliary_loss_mlp": 0.0102574, "balance_loss_clip": 1.0459826, "balance_loss_mlp": 1.01899588, "epoch": 0.9151686406541213, "flos": 21908956953600.0, "grad_norm": 1.8215555716263083, "language_loss": 0.85556209, "learning_rate": 7.492574536752095e-08, "loss": 0.8771168, "num_input_tokens_seen": 164588895, "step": 7611, "time_per_iteration": 2.683882236480713 }, { "auxiliary_loss_clip": 0.0114864, "auxiliary_loss_mlp": 0.01031079, "balance_loss_clip": 1.04931796, "balance_loss_mlp": 1.02434635, "epoch": 0.9152888835447605, "flos": 27308556944640.0, "grad_norm": 1.9252257513124582, "language_loss": 0.78352916, "learning_rate": 7.471467328995907e-08, "loss": 0.80532634, "num_input_tokens_seen": 164607705, "step": 7612, "time_per_iteration": 2.6397745609283447 }, { "auxiliary_loss_clip": 0.01065709, "auxiliary_loss_mlp": 0.01025738, "balance_loss_clip": 1.03903008, "balance_loss_mlp": 1.01900911, "epoch": 0.9154091264353995, "flos": 13370728510080.0, "grad_norm": 2.411137649460267, "language_loss": 0.6127497, "learning_rate": 7.450389327665018e-08, "loss": 0.63366413, "num_input_tokens_seen": 164625540, "step": 7613, "time_per_iteration": 2.869558811187744 }, { "auxiliary_loss_clip": 0.01125346, "auxiliary_loss_mlp": 0.01031254, "balance_loss_clip": 1.05047035, "balance_loss_mlp": 1.02380335, "epoch": 0.9155293693260386, "flos": 20193037367040.0, "grad_norm": 10.336019361413118, "language_loss": 0.67565912, "learning_rate": 7.429340535957029e-08, "loss": 0.69722509, "num_input_tokens_seen": 164640735, "step": 7614, "time_per_iteration": 2.7787222862243652 }, { "auxiliary_loss_clip": 0.01139619, "auxiliary_loss_mlp": 0.01026912, "balance_loss_clip": 1.04636717, "balance_loss_mlp": 1.0192709, "epoch": 0.9156496122166777, "flos": 19354990176000.0, "grad_norm": 3.0413798475422165, "language_loss": 0.7064392, "learning_rate": 7.40832095706494e-08, "loss": 0.72810453, "num_input_tokens_seen": 164657430, "step": 7615, "time_per_iteration": 2.6625354290008545 }, { "auxiliary_loss_clip": 0.0112521, "auxiliary_loss_mlp": 0.01028045, "balance_loss_clip": 1.04557216, "balance_loss_mlp": 1.02082694, "epoch": 0.9157698551073168, "flos": 21107287261440.0, "grad_norm": 1.779385779414283, "language_loss": 0.80556488, "learning_rate": 7.387330594177443e-08, "loss": 0.82709742, "num_input_tokens_seen": 164679505, "step": 7616, "time_per_iteration": 3.6896088123321533 }, { "auxiliary_loss_clip": 0.01114866, "auxiliary_loss_mlp": 0.01021943, "balance_loss_clip": 1.04423046, "balance_loss_mlp": 1.01455164, "epoch": 0.9158900979979558, "flos": 25193167228800.0, "grad_norm": 1.9091250841617506, "language_loss": 0.79393089, "learning_rate": 7.366369450478749e-08, "loss": 0.81529897, "num_input_tokens_seen": 164700615, "step": 7617, "time_per_iteration": 2.714012622833252 }, { "auxiliary_loss_clip": 0.01115219, "auxiliary_loss_mlp": 0.01024444, "balance_loss_clip": 1.04214931, "balance_loss_mlp": 1.01786113, "epoch": 0.916010340888595, "flos": 30146648302080.0, "grad_norm": 1.5982579201196017, "language_loss": 0.66274536, "learning_rate": 7.345437529148646e-08, "loss": 0.68414199, "num_input_tokens_seen": 164719625, "step": 7618, "time_per_iteration": 3.6972904205322266 }, { "auxiliary_loss_clip": 0.01124217, "auxiliary_loss_mlp": 0.01032319, "balance_loss_clip": 1.04607141, "balance_loss_mlp": 1.02533376, "epoch": 0.9161305837792341, "flos": 17091827907840.0, "grad_norm": 2.299849308623764, "language_loss": 0.72505563, "learning_rate": 7.324534833362483e-08, "loss": 0.74662101, "num_input_tokens_seen": 164737200, "step": 7619, "time_per_iteration": 2.6763153076171875 }, { "auxiliary_loss_clip": 0.01137497, "auxiliary_loss_mlp": 0.01028336, "balance_loss_clip": 1.04760981, "balance_loss_mlp": 1.0215143, "epoch": 0.9162508266698731, "flos": 22893699288960.0, "grad_norm": 2.4077241762666355, "language_loss": 0.6867258, "learning_rate": 7.303661366291192e-08, "loss": 0.70838416, "num_input_tokens_seen": 164757870, "step": 7620, "time_per_iteration": 4.097229957580566 }, { "auxiliary_loss_clip": 0.01101084, "auxiliary_loss_mlp": 0.01024593, "balance_loss_clip": 1.04087341, "balance_loss_mlp": 1.01867747, "epoch": 0.9163710695605123, "flos": 19974808287360.0, "grad_norm": 1.9362681777030488, "language_loss": 0.81909019, "learning_rate": 7.28281713110126e-08, "loss": 0.84034693, "num_input_tokens_seen": 164775945, "step": 7621, "time_per_iteration": 3.634269952774048 }, { "auxiliary_loss_clip": 0.01131989, "auxiliary_loss_mlp": 0.01025034, "balance_loss_clip": 1.045259, "balance_loss_mlp": 1.01831675, "epoch": 0.9164913124511513, "flos": 22783812606720.0, "grad_norm": 1.9555847583755417, "language_loss": 0.77575576, "learning_rate": 7.262002130954759e-08, "loss": 0.79732597, "num_input_tokens_seen": 164794400, "step": 7622, "time_per_iteration": 2.715977907180786 }, { "auxiliary_loss_clip": 0.01108014, "auxiliary_loss_mlp": 0.01024039, "balance_loss_clip": 1.04374588, "balance_loss_mlp": 1.01702619, "epoch": 0.9166115553417904, "flos": 24900854348160.0, "grad_norm": 1.905993835256568, "language_loss": 0.78819102, "learning_rate": 7.241216369009296e-08, "loss": 0.80951154, "num_input_tokens_seen": 164814585, "step": 7623, "time_per_iteration": 2.7262885570526123 }, { "auxiliary_loss_clip": 0.0116841, "auxiliary_loss_mlp": 0.01028349, "balance_loss_clip": 1.04798388, "balance_loss_mlp": 1.02063608, "epoch": 0.9167317982324296, "flos": 25702919089920.0, "grad_norm": 3.0030323210927907, "language_loss": 0.66600966, "learning_rate": 7.220459848418037e-08, "loss": 0.68797719, "num_input_tokens_seen": 164834660, "step": 7624, "time_per_iteration": 2.6377673149108887 }, { "auxiliary_loss_clip": 0.01167253, "auxiliary_loss_mlp": 0.01027578, "balance_loss_clip": 1.05020881, "balance_loss_mlp": 1.02067339, "epoch": 0.9168520411230686, "flos": 15632813370240.0, "grad_norm": 1.8350504684629854, "language_loss": 0.79644614, "learning_rate": 7.199732572329708e-08, "loss": 0.81839448, "num_input_tokens_seen": 164852560, "step": 7625, "time_per_iteration": 2.5660336017608643 }, { "auxiliary_loss_clip": 0.0112537, "auxiliary_loss_mlp": 0.0103093, "balance_loss_clip": 1.0463953, "balance_loss_mlp": 1.02369404, "epoch": 0.9169722840137077, "flos": 30258151096320.0, "grad_norm": 2.1177707608402225, "language_loss": 0.75794756, "learning_rate": 7.179034543888684e-08, "loss": 0.77951062, "num_input_tokens_seen": 164872065, "step": 7626, "time_per_iteration": 2.737976551055908 }, { "auxiliary_loss_clip": 0.01153314, "auxiliary_loss_mlp": 0.01020871, "balance_loss_clip": 1.04580331, "balance_loss_mlp": 1.01409125, "epoch": 0.9170925269043467, "flos": 22491643380480.0, "grad_norm": 2.1457823478778075, "language_loss": 0.77260655, "learning_rate": 7.158365766234808e-08, "loss": 0.79434836, "num_input_tokens_seen": 164890915, "step": 7627, "time_per_iteration": 2.5456883907318115 }, { "auxiliary_loss_clip": 0.01113986, "auxiliary_loss_mlp": 0.01023883, "balance_loss_clip": 1.04205012, "balance_loss_mlp": 1.01647758, "epoch": 0.9172127697949859, "flos": 22893914770560.0, "grad_norm": 2.9044224382325794, "language_loss": 0.72302574, "learning_rate": 7.137726242503527e-08, "loss": 0.74440444, "num_input_tokens_seen": 164909835, "step": 7628, "time_per_iteration": 2.668687343597412 }, { "auxiliary_loss_clip": 0.0115374, "auxiliary_loss_mlp": 0.00711399, "balance_loss_clip": 1.04923522, "balance_loss_mlp": 1.00065351, "epoch": 0.917333012685625, "flos": 17451867882240.0, "grad_norm": 3.6261197547521324, "language_loss": 0.77798665, "learning_rate": 7.11711597582585e-08, "loss": 0.79663807, "num_input_tokens_seen": 164927195, "step": 7629, "time_per_iteration": 2.490283489227295 }, { "auxiliary_loss_clip": 0.01120701, "auxiliary_loss_mlp": 0.01021772, "balance_loss_clip": 1.04109073, "balance_loss_mlp": 1.01530766, "epoch": 0.917453255576264, "flos": 14318949692160.0, "grad_norm": 1.6950552115055435, "language_loss": 0.80110222, "learning_rate": 7.096534969328271e-08, "loss": 0.82252693, "num_input_tokens_seen": 164944640, "step": 7630, "time_per_iteration": 2.5934810638427734 }, { "auxiliary_loss_clip": 0.01140677, "auxiliary_loss_mlp": 0.01021924, "balance_loss_clip": 1.04530311, "balance_loss_mlp": 1.0152483, "epoch": 0.9175734984669032, "flos": 20741177888640.0, "grad_norm": 2.1334675235284437, "language_loss": 0.84177208, "learning_rate": 7.075983226132987e-08, "loss": 0.86339808, "num_input_tokens_seen": 164963570, "step": 7631, "time_per_iteration": 2.5233709812164307 }, { "auxiliary_loss_clip": 0.01143498, "auxiliary_loss_mlp": 0.00712208, "balance_loss_clip": 1.04663289, "balance_loss_mlp": 1.00063503, "epoch": 0.9176937413575422, "flos": 14830497233280.0, "grad_norm": 3.351521741951971, "language_loss": 0.79098481, "learning_rate": 7.055460749357656e-08, "loss": 0.80954182, "num_input_tokens_seen": 164979850, "step": 7632, "time_per_iteration": 2.7001702785491943 }, { "auxiliary_loss_clip": 0.01135807, "auxiliary_loss_mlp": 0.01027081, "balance_loss_clip": 1.04797709, "balance_loss_mlp": 1.01942158, "epoch": 0.9178139842481813, "flos": 18474603828480.0, "grad_norm": 1.9896837213817649, "language_loss": 0.70487988, "learning_rate": 7.034967542115521e-08, "loss": 0.7265088, "num_input_tokens_seen": 164998115, "step": 7633, "time_per_iteration": 2.7066595554351807 }, { "auxiliary_loss_clip": 0.01144586, "auxiliary_loss_mlp": 0.00711273, "balance_loss_clip": 1.04658127, "balance_loss_mlp": 1.00060093, "epoch": 0.9179342271388204, "flos": 20047455544320.0, "grad_norm": 2.151352265869356, "language_loss": 0.75386477, "learning_rate": 7.014503607515388e-08, "loss": 0.77242333, "num_input_tokens_seen": 165017420, "step": 7634, "time_per_iteration": 2.597033977508545 }, { "auxiliary_loss_clip": 0.01139776, "auxiliary_loss_mlp": 0.01030368, "balance_loss_clip": 1.05093253, "balance_loss_mlp": 1.02311671, "epoch": 0.9180544700294595, "flos": 24676232647680.0, "grad_norm": 1.9948852785916913, "language_loss": 0.68515468, "learning_rate": 6.994068948661592e-08, "loss": 0.70685601, "num_input_tokens_seen": 165035575, "step": 7635, "time_per_iteration": 2.6413497924804688 }, { "auxiliary_loss_clip": 0.01152854, "auxiliary_loss_mlp": 0.01029874, "balance_loss_clip": 1.0477612, "balance_loss_mlp": 1.02247131, "epoch": 0.9181747129200986, "flos": 16727478301440.0, "grad_norm": 2.3645788327325956, "language_loss": 0.76399064, "learning_rate": 6.973663568654142e-08, "loss": 0.78581798, "num_input_tokens_seen": 165053280, "step": 7636, "time_per_iteration": 2.5901050567626953 }, { "auxiliary_loss_clip": 0.0116823, "auxiliary_loss_mlp": 0.01026384, "balance_loss_clip": 1.05063057, "balance_loss_mlp": 1.01944876, "epoch": 0.9182949558107377, "flos": 24271626873600.0, "grad_norm": 2.297539832839133, "language_loss": 0.65255725, "learning_rate": 6.953287470588386e-08, "loss": 0.67450339, "num_input_tokens_seen": 165071235, "step": 7637, "time_per_iteration": 2.5502326488494873 }, { "auxiliary_loss_clip": 0.01154764, "auxiliary_loss_mlp": 0.01027637, "balance_loss_clip": 1.04568028, "balance_loss_mlp": 1.0209558, "epoch": 0.9184151987013768, "flos": 22082117443200.0, "grad_norm": 2.537354049620026, "language_loss": 0.86112177, "learning_rate": 6.932940657555452e-08, "loss": 0.88294578, "num_input_tokens_seen": 165087365, "step": 7638, "time_per_iteration": 2.790004253387451 }, { "auxiliary_loss_clip": 0.01165026, "auxiliary_loss_mlp": 0.01025093, "balance_loss_clip": 1.04938626, "balance_loss_mlp": 1.01852489, "epoch": 0.9185354415920158, "flos": 32166732257280.0, "grad_norm": 1.453707897556747, "language_loss": 0.76674902, "learning_rate": 6.912623132641938e-08, "loss": 0.78865027, "num_input_tokens_seen": 165112455, "step": 7639, "time_per_iteration": 2.978767156600952 }, { "auxiliary_loss_clip": 0.0114034, "auxiliary_loss_mlp": 0.01026098, "balance_loss_clip": 1.04704833, "balance_loss_mlp": 1.01863539, "epoch": 0.918655684482655, "flos": 20997831542400.0, "grad_norm": 2.00946717462993, "language_loss": 0.76634872, "learning_rate": 6.892334898929952e-08, "loss": 0.78801304, "num_input_tokens_seen": 165132700, "step": 7640, "time_per_iteration": 2.64050555229187 }, { "auxiliary_loss_clip": 0.011488, "auxiliary_loss_mlp": 0.01023182, "balance_loss_clip": 1.04841113, "balance_loss_mlp": 1.01633334, "epoch": 0.918775927373294, "flos": 15560704817280.0, "grad_norm": 4.653908097581439, "language_loss": 0.84866351, "learning_rate": 6.872075959497236e-08, "loss": 0.87038332, "num_input_tokens_seen": 165151475, "step": 7641, "time_per_iteration": 2.60194993019104 }, { "auxiliary_loss_clip": 0.01155825, "auxiliary_loss_mlp": 0.01023758, "balance_loss_clip": 1.04665565, "balance_loss_mlp": 1.01680541, "epoch": 0.9188961702639331, "flos": 29934057657600.0, "grad_norm": 1.9076925418143251, "language_loss": 0.83221006, "learning_rate": 6.85184631741702e-08, "loss": 0.85400587, "num_input_tokens_seen": 165172040, "step": 7642, "time_per_iteration": 3.5437350273132324 }, { "auxiliary_loss_clip": 0.01148855, "auxiliary_loss_mlp": 0.01022641, "balance_loss_clip": 1.04515171, "balance_loss_mlp": 1.01551807, "epoch": 0.9190164131545723, "flos": 20701244943360.0, "grad_norm": 4.64128206427095, "language_loss": 0.77698576, "learning_rate": 6.831645975758161e-08, "loss": 0.79870069, "num_input_tokens_seen": 165189980, "step": 7643, "time_per_iteration": 2.5706441402435303 }, { "auxiliary_loss_clip": 0.01130429, "auxiliary_loss_mlp": 0.01025049, "balance_loss_clip": 1.04662967, "balance_loss_mlp": 1.01782513, "epoch": 0.9191366560452113, "flos": 25629912696960.0, "grad_norm": 2.118434891264721, "language_loss": 0.67377293, "learning_rate": 6.811474937585026e-08, "loss": 0.69532776, "num_input_tokens_seen": 165209770, "step": 7644, "time_per_iteration": 2.6282565593719482 }, { "auxiliary_loss_clip": 0.01117726, "auxiliary_loss_mlp": 0.01020854, "balance_loss_clip": 1.04502439, "balance_loss_mlp": 1.0139817, "epoch": 0.9192568989358504, "flos": 21434325615360.0, "grad_norm": 1.7270632171304867, "language_loss": 0.79263383, "learning_rate": 6.79133320595755e-08, "loss": 0.81401962, "num_input_tokens_seen": 165229690, "step": 7645, "time_per_iteration": 3.6114134788513184 }, { "auxiliary_loss_clip": 0.01139685, "auxiliary_loss_mlp": 0.01026339, "balance_loss_clip": 1.04818809, "balance_loss_mlp": 1.01915097, "epoch": 0.9193771418264896, "flos": 23185078416000.0, "grad_norm": 1.7732770821709176, "language_loss": 0.75436604, "learning_rate": 6.771220783931198e-08, "loss": 0.77602625, "num_input_tokens_seen": 165249850, "step": 7646, "time_per_iteration": 3.6492836475372314 }, { "auxiliary_loss_clip": 0.00992652, "auxiliary_loss_mlp": 0.00701336, "balance_loss_clip": 1.02839172, "balance_loss_mlp": 1.00002122, "epoch": 0.9194973847171286, "flos": 70582963184640.0, "grad_norm": 0.8771364739139791, "language_loss": 0.64597821, "learning_rate": 6.751137674556994e-08, "loss": 0.66291809, "num_input_tokens_seen": 165310235, "step": 7647, "time_per_iteration": 4.577433109283447 }, { "auxiliary_loss_clip": 0.01153876, "auxiliary_loss_mlp": 0.01022286, "balance_loss_clip": 1.04537272, "balance_loss_mlp": 1.015306, "epoch": 0.9196176276077677, "flos": 14720682378240.0, "grad_norm": 2.104183687908573, "language_loss": 0.77206045, "learning_rate": 6.731083880881572e-08, "loss": 0.79382205, "num_input_tokens_seen": 165326455, "step": 7648, "time_per_iteration": 2.7178473472595215 }, { "auxiliary_loss_clip": 0.01135945, "auxiliary_loss_mlp": 0.01023887, "balance_loss_clip": 1.04600501, "balance_loss_mlp": 1.01752472, "epoch": 0.9197378704984068, "flos": 23294893271040.0, "grad_norm": 2.2439763473687333, "language_loss": 0.80978858, "learning_rate": 6.711059405947072e-08, "loss": 0.83138692, "num_input_tokens_seen": 165344645, "step": 7649, "time_per_iteration": 2.6914587020874023 }, { "auxiliary_loss_clip": 0.01119141, "auxiliary_loss_mlp": 0.01022008, "balance_loss_clip": 1.04723942, "balance_loss_mlp": 1.01500773, "epoch": 0.9198581133890459, "flos": 20302564913280.0, "grad_norm": 1.9591199560652999, "language_loss": 0.77237868, "learning_rate": 6.691064252791156e-08, "loss": 0.79379022, "num_input_tokens_seen": 165364120, "step": 7650, "time_per_iteration": 2.6621317863464355 }, { "auxiliary_loss_clip": 0.01100879, "auxiliary_loss_mlp": 0.0102939, "balance_loss_clip": 1.04530621, "balance_loss_mlp": 1.02199352, "epoch": 0.9199783562796849, "flos": 17675663569920.0, "grad_norm": 1.700201515510312, "language_loss": 0.78207111, "learning_rate": 6.67109842444713e-08, "loss": 0.80337381, "num_input_tokens_seen": 165383050, "step": 7651, "time_per_iteration": 2.763949394226074 }, { "auxiliary_loss_clip": 0.01153733, "auxiliary_loss_mlp": 0.00711102, "balance_loss_clip": 1.05087876, "balance_loss_mlp": 1.00064778, "epoch": 0.9200985991703241, "flos": 17676022705920.0, "grad_norm": 2.0087699823533995, "language_loss": 0.76391995, "learning_rate": 6.651161923943704e-08, "loss": 0.78256834, "num_input_tokens_seen": 165400955, "step": 7652, "time_per_iteration": 2.587259531021118 }, { "auxiliary_loss_clip": 0.01147304, "auxiliary_loss_mlp": 0.01026762, "balance_loss_clip": 1.04539216, "balance_loss_mlp": 1.01937747, "epoch": 0.9202188420609632, "flos": 20996574566400.0, "grad_norm": 2.8611313155223588, "language_loss": 0.7696166, "learning_rate": 6.631254754305326e-08, "loss": 0.79135728, "num_input_tokens_seen": 165420415, "step": 7653, "time_per_iteration": 2.5996439456939697 }, { "auxiliary_loss_clip": 0.011702, "auxiliary_loss_mlp": 0.01029351, "balance_loss_clip": 1.04977179, "balance_loss_mlp": 1.02202845, "epoch": 0.9203390849516022, "flos": 13918222586880.0, "grad_norm": 2.3878833600640905, "language_loss": 0.78242052, "learning_rate": 6.611376918551848e-08, "loss": 0.80441594, "num_input_tokens_seen": 165439200, "step": 7654, "time_per_iteration": 2.5493903160095215 }, { "auxiliary_loss_clip": 0.01115978, "auxiliary_loss_mlp": 0.00711851, "balance_loss_clip": 1.04246724, "balance_loss_mlp": 1.00062788, "epoch": 0.9204593278422414, "flos": 21175912195200.0, "grad_norm": 1.9709807190901394, "language_loss": 0.79823422, "learning_rate": 6.591528419698744e-08, "loss": 0.81651253, "num_input_tokens_seen": 165458985, "step": 7655, "time_per_iteration": 2.6604557037353516 }, { "auxiliary_loss_clip": 0.01137612, "auxiliary_loss_mlp": 0.01024667, "balance_loss_clip": 1.04351926, "balance_loss_mlp": 1.01770556, "epoch": 0.9205795707328804, "flos": 14501375890560.0, "grad_norm": 2.6456072648585245, "language_loss": 0.83243364, "learning_rate": 6.571709260756986e-08, "loss": 0.85405648, "num_input_tokens_seen": 165475630, "step": 7656, "time_per_iteration": 2.622006893157959 }, { "auxiliary_loss_clip": 0.01157647, "auxiliary_loss_mlp": 0.01027131, "balance_loss_clip": 1.05332685, "balance_loss_mlp": 1.02033567, "epoch": 0.9206998136235195, "flos": 22417559579520.0, "grad_norm": 2.588617230278531, "language_loss": 0.76501536, "learning_rate": 6.551919444733122e-08, "loss": 0.78686309, "num_input_tokens_seen": 165493445, "step": 7657, "time_per_iteration": 2.577502489089966 }, { "auxiliary_loss_clip": 0.01138711, "auxiliary_loss_mlp": 0.01033255, "balance_loss_clip": 1.04885209, "balance_loss_mlp": 1.02633834, "epoch": 0.9208200565141585, "flos": 53358407544960.0, "grad_norm": 1.789015188588997, "language_loss": 0.65895498, "learning_rate": 6.53215897462931e-08, "loss": 0.68067467, "num_input_tokens_seen": 165517200, "step": 7658, "time_per_iteration": 3.0441877841949463 }, { "auxiliary_loss_clip": 0.01147811, "auxiliary_loss_mlp": 0.01025424, "balance_loss_clip": 1.04468179, "balance_loss_mlp": 1.01787496, "epoch": 0.9209402994047977, "flos": 30589139946240.0, "grad_norm": 2.322203427540928, "language_loss": 0.74963534, "learning_rate": 6.512427853443103e-08, "loss": 0.77136767, "num_input_tokens_seen": 165539280, "step": 7659, "time_per_iteration": 2.6661674976348877 }, { "auxiliary_loss_clip": 0.01157181, "auxiliary_loss_mlp": 0.01024552, "balance_loss_clip": 1.0486989, "balance_loss_mlp": 1.01744461, "epoch": 0.9210605422954368, "flos": 29132711187840.0, "grad_norm": 1.8110496072554565, "language_loss": 0.75950873, "learning_rate": 6.492726084167799e-08, "loss": 0.78132606, "num_input_tokens_seen": 165561395, "step": 7660, "time_per_iteration": 2.6777923107147217 }, { "auxiliary_loss_clip": 0.01072022, "auxiliary_loss_mlp": 0.01006455, "balance_loss_clip": 1.01898766, "balance_loss_mlp": 1.00536466, "epoch": 0.9211807851860758, "flos": 54853838472960.0, "grad_norm": 0.7777840077174142, "language_loss": 0.57455742, "learning_rate": 6.473053669792072e-08, "loss": 0.59534216, "num_input_tokens_seen": 165616085, "step": 7661, "time_per_iteration": 3.0178096294403076 }, { "auxiliary_loss_clip": 0.01151833, "auxiliary_loss_mlp": 0.01027194, "balance_loss_clip": 1.04715657, "balance_loss_mlp": 1.02043533, "epoch": 0.921301028076715, "flos": 19201974238080.0, "grad_norm": 2.5135770877393555, "language_loss": 0.73546147, "learning_rate": 6.453410613300248e-08, "loss": 0.75725174, "num_input_tokens_seen": 165634015, "step": 7662, "time_per_iteration": 2.6150062084198 }, { "auxiliary_loss_clip": 0.01087642, "auxiliary_loss_mlp": 0.01028577, "balance_loss_clip": 1.04166389, "balance_loss_mlp": 1.02166319, "epoch": 0.921421270967354, "flos": 27526893765120.0, "grad_norm": 1.7570088955387664, "language_loss": 0.58373952, "learning_rate": 6.43379691767214e-08, "loss": 0.60490173, "num_input_tokens_seen": 165653220, "step": 7663, "time_per_iteration": 2.7868940830230713 }, { "auxiliary_loss_clip": 0.01022721, "auxiliary_loss_mlp": 0.01002283, "balance_loss_clip": 1.01663876, "balance_loss_mlp": 1.00141239, "epoch": 0.9215415138579931, "flos": 70209311955840.0, "grad_norm": 0.7216829613038882, "language_loss": 0.5513854, "learning_rate": 6.414212585883105e-08, "loss": 0.57163537, "num_input_tokens_seen": 165715850, "step": 7664, "time_per_iteration": 3.3047690391540527 }, { "auxiliary_loss_clip": 0.01139142, "auxiliary_loss_mlp": 0.01024117, "balance_loss_clip": 1.04691863, "balance_loss_mlp": 1.01742053, "epoch": 0.9216617567486323, "flos": 35553107790720.0, "grad_norm": 1.7639484610865832, "language_loss": 0.69644815, "learning_rate": 6.394657620904143e-08, "loss": 0.71808076, "num_input_tokens_seen": 165738960, "step": 7665, "time_per_iteration": 2.7757568359375 }, { "auxiliary_loss_clip": 0.0117308, "auxiliary_loss_mlp": 0.01024623, "balance_loss_clip": 1.05015063, "balance_loss_mlp": 1.01736069, "epoch": 0.9217819996392713, "flos": 29533330552320.0, "grad_norm": 2.1205908137478633, "language_loss": 0.72073102, "learning_rate": 6.375132025701657e-08, "loss": 0.74270809, "num_input_tokens_seen": 165761260, "step": 7666, "time_per_iteration": 2.613811492919922 }, { "auxiliary_loss_clip": 0.01172399, "auxiliary_loss_mlp": 0.01027112, "balance_loss_clip": 1.05104971, "balance_loss_mlp": 1.01920807, "epoch": 0.9219022425299104, "flos": 14574669592320.0, "grad_norm": 2.469504641466877, "language_loss": 0.6945191, "learning_rate": 6.355635803237724e-08, "loss": 0.71651417, "num_input_tokens_seen": 165776960, "step": 7667, "time_per_iteration": 2.622114896774292 }, { "auxiliary_loss_clip": 0.01151649, "auxiliary_loss_mlp": 0.01025602, "balance_loss_clip": 1.04736304, "balance_loss_mlp": 1.01794887, "epoch": 0.9220224854205495, "flos": 18077503996800.0, "grad_norm": 4.699614386522139, "language_loss": 0.79857475, "learning_rate": 6.336168956469867e-08, "loss": 0.82034725, "num_input_tokens_seen": 165795435, "step": 7668, "time_per_iteration": 2.5803489685058594 }, { "auxiliary_loss_clip": 0.011304, "auxiliary_loss_mlp": 0.01020295, "balance_loss_clip": 1.04769242, "balance_loss_mlp": 1.01398337, "epoch": 0.9221427283111886, "flos": 24790464875520.0, "grad_norm": 3.9624109410062496, "language_loss": 0.72015369, "learning_rate": 6.316731488351168e-08, "loss": 0.74166065, "num_input_tokens_seen": 165816625, "step": 7669, "time_per_iteration": 3.864877223968506 }, { "auxiliary_loss_clip": 0.01156328, "auxiliary_loss_mlp": 0.01025836, "balance_loss_clip": 1.05070782, "balance_loss_mlp": 1.01875782, "epoch": 0.9222629712018277, "flos": 13845036625920.0, "grad_norm": 1.8785264636574028, "language_loss": 0.63540387, "learning_rate": 6.297323401830334e-08, "loss": 0.65722549, "num_input_tokens_seen": 165835410, "step": 7670, "time_per_iteration": 2.6012725830078125 }, { "auxiliary_loss_clip": 0.01154055, "auxiliary_loss_mlp": 0.01027414, "balance_loss_clip": 1.04680681, "balance_loss_mlp": 1.02020776, "epoch": 0.9223832140924668, "flos": 21616177196160.0, "grad_norm": 2.410452367892483, "language_loss": 0.69190466, "learning_rate": 6.277944699851523e-08, "loss": 0.71371937, "num_input_tokens_seen": 165854930, "step": 7671, "time_per_iteration": 3.558483362197876 }, { "auxiliary_loss_clip": 0.01167904, "auxiliary_loss_mlp": 0.01024911, "balance_loss_clip": 1.04862058, "balance_loss_mlp": 1.01762128, "epoch": 0.9225034569831059, "flos": 21142084561920.0, "grad_norm": 2.1932572299744097, "language_loss": 0.7339294, "learning_rate": 6.25859538535447e-08, "loss": 0.75585759, "num_input_tokens_seen": 165875725, "step": 7672, "time_per_iteration": 3.803478240966797 }, { "auxiliary_loss_clip": 0.01137528, "auxiliary_loss_mlp": 0.01023115, "balance_loss_clip": 1.04817343, "balance_loss_mlp": 1.0159986, "epoch": 0.9226236998737449, "flos": 12495046844160.0, "grad_norm": 2.7695383392125414, "language_loss": 0.78005159, "learning_rate": 6.239275461274474e-08, "loss": 0.80165803, "num_input_tokens_seen": 165892100, "step": 7673, "time_per_iteration": 3.5323853492736816 }, { "auxiliary_loss_clip": 0.01153717, "auxiliary_loss_mlp": 0.01026108, "balance_loss_clip": 1.0482831, "balance_loss_mlp": 1.01906598, "epoch": 0.9227439427643841, "flos": 26214071581440.0, "grad_norm": 1.9315048560909456, "language_loss": 0.85893261, "learning_rate": 6.219984930542299e-08, "loss": 0.88073087, "num_input_tokens_seen": 165912840, "step": 7674, "time_per_iteration": 2.5953190326690674 }, { "auxiliary_loss_clip": 0.01156451, "auxiliary_loss_mlp": 0.01023399, "balance_loss_clip": 1.04898524, "balance_loss_mlp": 1.01616907, "epoch": 0.9228641856550232, "flos": 17967581400960.0, "grad_norm": 2.95036013403, "language_loss": 0.75720835, "learning_rate": 6.200723796084383e-08, "loss": 0.77900684, "num_input_tokens_seen": 165930935, "step": 7675, "time_per_iteration": 2.614471912384033 }, { "auxiliary_loss_clip": 0.01039429, "auxiliary_loss_mlp": 0.0100282, "balance_loss_clip": 1.01961136, "balance_loss_mlp": 1.00172949, "epoch": 0.9229844285456622, "flos": 70420609710720.0, "grad_norm": 0.7618121636106038, "language_loss": 0.62976986, "learning_rate": 6.181492060822546e-08, "loss": 0.65019232, "num_input_tokens_seen": 165991110, "step": 7676, "time_per_iteration": 3.182985544204712 }, { "auxiliary_loss_clip": 0.01101471, "auxiliary_loss_mlp": 0.01023144, "balance_loss_clip": 1.04109776, "balance_loss_mlp": 1.01585472, "epoch": 0.9231046714363014, "flos": 17967832796160.0, "grad_norm": 2.6028392084290055, "language_loss": 0.81780529, "learning_rate": 6.162289727674274e-08, "loss": 0.83905143, "num_input_tokens_seen": 166008790, "step": 7677, "time_per_iteration": 2.737959623336792 }, { "auxiliary_loss_clip": 0.01120656, "auxiliary_loss_mlp": 0.01024611, "balance_loss_clip": 1.04305983, "balance_loss_mlp": 1.01826048, "epoch": 0.9232249143269404, "flos": 17858233422720.0, "grad_norm": 2.177281650072994, "language_loss": 0.87843931, "learning_rate": 6.143116799552527e-08, "loss": 0.89989203, "num_input_tokens_seen": 166025035, "step": 7678, "time_per_iteration": 2.6509554386138916 }, { "auxiliary_loss_clip": 0.01155397, "auxiliary_loss_mlp": 0.01034971, "balance_loss_clip": 1.04814959, "balance_loss_mlp": 1.0275023, "epoch": 0.9233451572175795, "flos": 23404384903680.0, "grad_norm": 2.2844162195931124, "language_loss": 0.55872309, "learning_rate": 6.123973279365802e-08, "loss": 0.58062679, "num_input_tokens_seen": 166044010, "step": 7679, "time_per_iteration": 2.6502673625946045 }, { "auxiliary_loss_clip": 0.01159587, "auxiliary_loss_mlp": 0.01027075, "balance_loss_clip": 1.05231261, "balance_loss_mlp": 1.02026272, "epoch": 0.9234654001082186, "flos": 17999326045440.0, "grad_norm": 2.315685007607418, "language_loss": 0.78055418, "learning_rate": 6.10485917001824e-08, "loss": 0.80242074, "num_input_tokens_seen": 166061865, "step": 7680, "time_per_iteration": 2.5975840091705322 }, { "auxiliary_loss_clip": 0.0114167, "auxiliary_loss_mlp": 0.0102437, "balance_loss_clip": 1.04677153, "balance_loss_mlp": 1.01766753, "epoch": 0.9235856429988577, "flos": 24750747411840.0, "grad_norm": 1.554625754771526, "language_loss": 0.80989724, "learning_rate": 6.085774474409322e-08, "loss": 0.83155763, "num_input_tokens_seen": 166082425, "step": 7681, "time_per_iteration": 2.6867144107818604 }, { "auxiliary_loss_clip": 0.01137633, "auxiliary_loss_mlp": 0.0102424, "balance_loss_clip": 1.04833543, "balance_loss_mlp": 1.01703691, "epoch": 0.9237058858894968, "flos": 14099894599680.0, "grad_norm": 2.0390015124285785, "language_loss": 0.70171303, "learning_rate": 6.066719195434267e-08, "loss": 0.72333181, "num_input_tokens_seen": 166100225, "step": 7682, "time_per_iteration": 2.626964807510376 }, { "auxiliary_loss_clip": 0.01158267, "auxiliary_loss_mlp": 0.01029129, "balance_loss_clip": 1.04992867, "balance_loss_mlp": 1.02127957, "epoch": 0.9238261287801359, "flos": 28694529175680.0, "grad_norm": 2.857031749144452, "language_loss": 0.66783804, "learning_rate": 6.047693335983717e-08, "loss": 0.68971199, "num_input_tokens_seen": 166122570, "step": 7683, "time_per_iteration": 2.6333072185516357 }, { "auxiliary_loss_clip": 0.01155182, "auxiliary_loss_mlp": 0.0102479, "balance_loss_clip": 1.04707861, "balance_loss_mlp": 1.01737487, "epoch": 0.923946371670775, "flos": 23111856541440.0, "grad_norm": 3.188680560813544, "language_loss": 0.82668185, "learning_rate": 6.028696898943853e-08, "loss": 0.84848154, "num_input_tokens_seen": 166141630, "step": 7684, "time_per_iteration": 2.744126081466675 }, { "auxiliary_loss_clip": 0.01135229, "auxiliary_loss_mlp": 0.00712138, "balance_loss_clip": 1.04351974, "balance_loss_mlp": 1.00059056, "epoch": 0.924066614561414, "flos": 21867120587520.0, "grad_norm": 2.2101243137130626, "language_loss": 0.70763552, "learning_rate": 6.00972988719648e-08, "loss": 0.72610915, "num_input_tokens_seen": 166159865, "step": 7685, "time_per_iteration": 2.6355652809143066 }, { "auxiliary_loss_clip": 0.01122395, "auxiliary_loss_mlp": 0.0071187, "balance_loss_clip": 1.04460359, "balance_loss_mlp": 1.00057793, "epoch": 0.9241868574520532, "flos": 28511887495680.0, "grad_norm": 3.1544956525200982, "language_loss": 0.69991273, "learning_rate": 5.990792303618807e-08, "loss": 0.7182554, "num_input_tokens_seen": 166179445, "step": 7686, "time_per_iteration": 2.7499427795410156 }, { "auxiliary_loss_clip": 0.01122126, "auxiliary_loss_mlp": 0.01022877, "balance_loss_clip": 1.04616129, "balance_loss_mlp": 1.01578379, "epoch": 0.9243071003426923, "flos": 30518324282880.0, "grad_norm": 1.6632031994045688, "language_loss": 0.69558072, "learning_rate": 5.971884151083695e-08, "loss": 0.71703076, "num_input_tokens_seen": 166201855, "step": 7687, "time_per_iteration": 2.741403579711914 }, { "auxiliary_loss_clip": 0.01136997, "auxiliary_loss_mlp": 0.01027171, "balance_loss_clip": 1.04543543, "balance_loss_mlp": 1.02044487, "epoch": 0.9244273432333313, "flos": 28658331244800.0, "grad_norm": 3.4827452582862675, "language_loss": 0.74281514, "learning_rate": 5.9530054324595124e-08, "loss": 0.76445681, "num_input_tokens_seen": 166221970, "step": 7688, "time_per_iteration": 2.740567445755005 }, { "auxiliary_loss_clip": 0.01055468, "auxiliary_loss_mlp": 0.00701817, "balance_loss_clip": 1.01739895, "balance_loss_mlp": 1.00004888, "epoch": 0.9245475861239704, "flos": 66230589237120.0, "grad_norm": 0.7183215923222527, "language_loss": 0.57500887, "learning_rate": 5.934156150610103e-08, "loss": 0.59258163, "num_input_tokens_seen": 166279335, "step": 7689, "time_per_iteration": 3.2369744777679443 }, { "auxiliary_loss_clip": 0.01137332, "auxiliary_loss_mlp": 0.01021943, "balance_loss_clip": 1.04741859, "balance_loss_mlp": 1.01494551, "epoch": 0.9246678290146095, "flos": 24239918142720.0, "grad_norm": 3.26697457827145, "language_loss": 0.78785586, "learning_rate": 5.915336308394914e-08, "loss": 0.8094486, "num_input_tokens_seen": 166298170, "step": 7690, "time_per_iteration": 2.68209171295166 }, { "auxiliary_loss_clip": 0.01148116, "auxiliary_loss_mlp": 0.01024558, "balance_loss_clip": 1.04635882, "balance_loss_mlp": 1.01814485, "epoch": 0.9247880719052486, "flos": 18988808976000.0, "grad_norm": 1.6861493340233817, "language_loss": 0.76868308, "learning_rate": 5.89654590866886e-08, "loss": 0.7904098, "num_input_tokens_seen": 166317670, "step": 7691, "time_per_iteration": 2.6109139919281006 }, { "auxiliary_loss_clip": 0.01095794, "auxiliary_loss_mlp": 0.01028957, "balance_loss_clip": 1.04603744, "balance_loss_mlp": 1.02162874, "epoch": 0.9249083147958876, "flos": 24024095274240.0, "grad_norm": 2.5435858357005556, "language_loss": 0.88324428, "learning_rate": 5.877784954282483e-08, "loss": 0.90449172, "num_input_tokens_seen": 166337010, "step": 7692, "time_per_iteration": 2.7713449001312256 }, { "auxiliary_loss_clip": 0.01156852, "auxiliary_loss_mlp": 0.01025441, "balance_loss_clip": 1.04817557, "balance_loss_mlp": 1.018098, "epoch": 0.9250285576865268, "flos": 30773972355840.0, "grad_norm": 2.2877935569782473, "language_loss": 0.7232691, "learning_rate": 5.8590534480817963e-08, "loss": 0.74509203, "num_input_tokens_seen": 166358735, "step": 7693, "time_per_iteration": 2.7518932819366455 }, { "auxiliary_loss_clip": 0.01171462, "auxiliary_loss_mlp": 0.01026565, "balance_loss_clip": 1.0510354, "balance_loss_mlp": 1.01946056, "epoch": 0.9251488005771659, "flos": 10633581348480.0, "grad_norm": 4.724711809763814, "language_loss": 0.72578943, "learning_rate": 5.840351392908349e-08, "loss": 0.74776971, "num_input_tokens_seen": 166374455, "step": 7694, "time_per_iteration": 2.5430099964141846 }, { "auxiliary_loss_clip": 0.0114372, "auxiliary_loss_mlp": 0.00712023, "balance_loss_clip": 1.04580522, "balance_loss_mlp": 1.0005784, "epoch": 0.9252690434678049, "flos": 23586416052480.0, "grad_norm": 2.417539698986265, "language_loss": 0.7101081, "learning_rate": 5.821678791599205e-08, "loss": 0.72866553, "num_input_tokens_seen": 166393900, "step": 7695, "time_per_iteration": 3.6034765243530273 }, { "auxiliary_loss_clip": 0.01137001, "auxiliary_loss_mlp": 0.01028845, "balance_loss_clip": 1.04736972, "balance_loss_mlp": 1.02208865, "epoch": 0.9253892863584441, "flos": 21469158829440.0, "grad_norm": 2.110745736664741, "language_loss": 0.80996156, "learning_rate": 5.803035646986965e-08, "loss": 0.8316201, "num_input_tokens_seen": 166413235, "step": 7696, "time_per_iteration": 2.591522693634033 }, { "auxiliary_loss_clip": 0.01170107, "auxiliary_loss_mlp": 0.01035431, "balance_loss_clip": 1.0487299, "balance_loss_mlp": 1.02755737, "epoch": 0.9255095292490831, "flos": 17456680304640.0, "grad_norm": 2.4668206140849582, "language_loss": 0.67423505, "learning_rate": 5.7844219618998766e-08, "loss": 0.69629049, "num_input_tokens_seen": 166427560, "step": 7697, "time_per_iteration": 3.4934608936309814 }, { "auxiliary_loss_clip": 0.01107797, "auxiliary_loss_mlp": 0.01027573, "balance_loss_clip": 1.04106581, "balance_loss_mlp": 1.02055728, "epoch": 0.9256297721397222, "flos": 24750675584640.0, "grad_norm": 2.1097794163277275, "language_loss": 0.71661967, "learning_rate": 5.765837739161505e-08, "loss": 0.73797333, "num_input_tokens_seen": 166446680, "step": 7698, "time_per_iteration": 3.6482717990875244 }, { "auxiliary_loss_clip": 0.01123047, "auxiliary_loss_mlp": 0.01023265, "balance_loss_clip": 1.04664898, "balance_loss_mlp": 1.01652956, "epoch": 0.9257500150303614, "flos": 23112215677440.0, "grad_norm": 2.211993622283163, "language_loss": 0.74426913, "learning_rate": 5.7472829815911504e-08, "loss": 0.76573223, "num_input_tokens_seen": 166465505, "step": 7699, "time_per_iteration": 3.818960428237915 }, { "auxiliary_loss_clip": 0.01132658, "auxiliary_loss_mlp": 0.01031461, "balance_loss_clip": 1.04611361, "balance_loss_mlp": 1.02384973, "epoch": 0.9258702579210004, "flos": 22564685687040.0, "grad_norm": 1.8886417254081973, "language_loss": 0.81583804, "learning_rate": 5.7287576920035164e-08, "loss": 0.83747923, "num_input_tokens_seen": 166484520, "step": 7700, "time_per_iteration": 2.7466702461242676 }, { "auxiliary_loss_clip": 0.01116437, "auxiliary_loss_mlp": 0.01027388, "balance_loss_clip": 1.04516053, "balance_loss_mlp": 1.02028322, "epoch": 0.9259905008116395, "flos": 30004298703360.0, "grad_norm": 2.6539196828537777, "language_loss": 0.76569045, "learning_rate": 5.7102618732088435e-08, "loss": 0.78712869, "num_input_tokens_seen": 166503850, "step": 7701, "time_per_iteration": 2.8190605640411377 }, { "auxiliary_loss_clip": 0.01141354, "auxiliary_loss_mlp": 0.01022906, "balance_loss_clip": 1.04660535, "balance_loss_mlp": 1.01598573, "epoch": 0.9261107437022786, "flos": 24572128055040.0, "grad_norm": 1.6656635366723656, "language_loss": 0.74863058, "learning_rate": 5.6917955280130216e-08, "loss": 0.77027321, "num_input_tokens_seen": 166525330, "step": 7702, "time_per_iteration": 2.791247844696045 }, { "auxiliary_loss_clip": 0.0115223, "auxiliary_loss_mlp": 0.01032837, "balance_loss_clip": 1.04980266, "balance_loss_mlp": 1.02561927, "epoch": 0.9262309865929177, "flos": 22018448586240.0, "grad_norm": 2.656174840546648, "language_loss": 0.71914399, "learning_rate": 5.6733586592172755e-08, "loss": 0.74099469, "num_input_tokens_seen": 166544825, "step": 7703, "time_per_iteration": 2.739973783493042 }, { "auxiliary_loss_clip": 0.01132192, "auxiliary_loss_mlp": 0.00711202, "balance_loss_clip": 1.0440017, "balance_loss_mlp": 1.00061238, "epoch": 0.9263512294835567, "flos": 20339481116160.0, "grad_norm": 3.1442690324565796, "language_loss": 0.80001879, "learning_rate": 5.6549512696185244e-08, "loss": 0.81845272, "num_input_tokens_seen": 166563325, "step": 7704, "time_per_iteration": 2.745438814163208 }, { "auxiliary_loss_clip": 0.01167506, "auxiliary_loss_mlp": 0.0103177, "balance_loss_clip": 1.04987264, "balance_loss_mlp": 1.02450418, "epoch": 0.9264714723741959, "flos": 21215378263680.0, "grad_norm": 2.1679583898888612, "language_loss": 0.68156898, "learning_rate": 5.636573362009156e-08, "loss": 0.70356178, "num_input_tokens_seen": 166583385, "step": 7705, "time_per_iteration": 2.6849095821380615 }, { "auxiliary_loss_clip": 0.01172428, "auxiliary_loss_mlp": 0.01035895, "balance_loss_clip": 1.05014086, "balance_loss_mlp": 1.02821159, "epoch": 0.926591715264835, "flos": 18004964480640.0, "grad_norm": 4.162579255772698, "language_loss": 0.77084935, "learning_rate": 5.618224939177074e-08, "loss": 0.79293263, "num_input_tokens_seen": 166601290, "step": 7706, "time_per_iteration": 2.657904863357544 }, { "auxiliary_loss_clip": 0.01129616, "auxiliary_loss_mlp": 0.01031952, "balance_loss_clip": 1.04606771, "balance_loss_mlp": 1.0249362, "epoch": 0.926711958155474, "flos": 36167969825280.0, "grad_norm": 1.7250557281639822, "language_loss": 0.70225596, "learning_rate": 5.599906003905719e-08, "loss": 0.72387159, "num_input_tokens_seen": 166623835, "step": 7707, "time_per_iteration": 2.8697469234466553 }, { "auxiliary_loss_clip": 0.01150332, "auxiliary_loss_mlp": 0.01022188, "balance_loss_clip": 1.04993594, "balance_loss_mlp": 1.01454091, "epoch": 0.9268322010461132, "flos": 21032736583680.0, "grad_norm": 2.6809348490533265, "language_loss": 0.81916952, "learning_rate": 5.581616558974023e-08, "loss": 0.8408947, "num_input_tokens_seen": 166642400, "step": 7708, "time_per_iteration": 2.7366254329681396 }, { "auxiliary_loss_clip": 0.01158678, "auxiliary_loss_mlp": 0.00711491, "balance_loss_clip": 1.04847813, "balance_loss_mlp": 1.00052154, "epoch": 0.9269524439367522, "flos": 22964838174720.0, "grad_norm": 2.0587832765587324, "language_loss": 0.79066169, "learning_rate": 5.5633566071565444e-08, "loss": 0.80936342, "num_input_tokens_seen": 166661640, "step": 7709, "time_per_iteration": 2.7116153240203857 }, { "auxiliary_loss_clip": 0.01096398, "auxiliary_loss_mlp": 0.01026038, "balance_loss_clip": 1.04263568, "balance_loss_mlp": 1.01915646, "epoch": 0.9270726868273913, "flos": 41975551468800.0, "grad_norm": 3.023843265136223, "language_loss": 0.70902371, "learning_rate": 5.5451261512232896e-08, "loss": 0.73024809, "num_input_tokens_seen": 166684320, "step": 7710, "time_per_iteration": 3.074090003967285 }, { "auxiliary_loss_clip": 0.01156502, "auxiliary_loss_mlp": 0.01030954, "balance_loss_clip": 1.04590619, "balance_loss_mlp": 1.02340186, "epoch": 0.9271929297180305, "flos": 19791771557760.0, "grad_norm": 1.926397241174188, "language_loss": 0.6263628, "learning_rate": 5.5269251939397576e-08, "loss": 0.64823735, "num_input_tokens_seen": 166703835, "step": 7711, "time_per_iteration": 2.6315245628356934 }, { "auxiliary_loss_clip": 0.01121415, "auxiliary_loss_mlp": 0.01026853, "balance_loss_clip": 1.04227221, "balance_loss_mlp": 1.01922965, "epoch": 0.9273131726086695, "flos": 19968343839360.0, "grad_norm": 2.04385658181745, "language_loss": 0.76867545, "learning_rate": 5.508753738067073e-08, "loss": 0.79015815, "num_input_tokens_seen": 166723375, "step": 7712, "time_per_iteration": 2.7996602058410645 }, { "auxiliary_loss_clip": 0.01153738, "auxiliary_loss_mlp": 0.01027331, "balance_loss_clip": 1.04568863, "balance_loss_mlp": 1.02029181, "epoch": 0.9274334154993086, "flos": 23258587599360.0, "grad_norm": 2.1532829903774897, "language_loss": 0.79196036, "learning_rate": 5.4906117863617875e-08, "loss": 0.81377101, "num_input_tokens_seen": 166742760, "step": 7713, "time_per_iteration": 2.6528890132904053 }, { "auxiliary_loss_clip": 0.01115992, "auxiliary_loss_mlp": 0.01022974, "balance_loss_clip": 1.04213786, "balance_loss_mlp": 1.01638484, "epoch": 0.9275536583899477, "flos": 31795343585280.0, "grad_norm": 2.3360046481667625, "language_loss": 0.78459328, "learning_rate": 5.4724993415760533e-08, "loss": 0.80598295, "num_input_tokens_seen": 166761115, "step": 7714, "time_per_iteration": 2.779355049133301 }, { "auxiliary_loss_clip": 0.0113106, "auxiliary_loss_mlp": 0.00711569, "balance_loss_clip": 1.04560578, "balance_loss_mlp": 1.00062776, "epoch": 0.9276739012805868, "flos": 18696998885760.0, "grad_norm": 2.1012881243388275, "language_loss": 0.74618727, "learning_rate": 5.454416406457496e-08, "loss": 0.76461351, "num_input_tokens_seen": 166780210, "step": 7715, "time_per_iteration": 2.678740978240967 }, { "auxiliary_loss_clip": 0.01151634, "auxiliary_loss_mlp": 0.01021974, "balance_loss_clip": 1.0471406, "balance_loss_mlp": 1.01513147, "epoch": 0.9277941441712259, "flos": 13879079740800.0, "grad_norm": 4.117262826333909, "language_loss": 0.74312031, "learning_rate": 5.436362983749299e-08, "loss": 0.76485634, "num_input_tokens_seen": 166795380, "step": 7716, "time_per_iteration": 2.668144702911377 }, { "auxiliary_loss_clip": 0.01115957, "auxiliary_loss_mlp": 0.01031515, "balance_loss_clip": 1.04636729, "balance_loss_mlp": 1.02455926, "epoch": 0.927914387061865, "flos": 23258659426560.0, "grad_norm": 2.0795664434193055, "language_loss": 0.64329898, "learning_rate": 5.418339076190137e-08, "loss": 0.6647737, "num_input_tokens_seen": 166814890, "step": 7717, "time_per_iteration": 2.72301983833313 }, { "auxiliary_loss_clip": 0.01132887, "auxiliary_loss_mlp": 0.01027904, "balance_loss_clip": 1.04701853, "balance_loss_mlp": 1.02038169, "epoch": 0.9280346299525041, "flos": 18073733068800.0, "grad_norm": 2.3054239132352388, "language_loss": 0.88747412, "learning_rate": 5.400344686514202e-08, "loss": 0.90908206, "num_input_tokens_seen": 166832475, "step": 7718, "time_per_iteration": 2.7230417728424072 }, { "auxiliary_loss_clip": 0.01150674, "auxiliary_loss_mlp": 0.01032629, "balance_loss_clip": 1.04792094, "balance_loss_mlp": 1.02583718, "epoch": 0.9281548728431431, "flos": 22342901160960.0, "grad_norm": 2.3785829789868234, "language_loss": 0.66490561, "learning_rate": 5.38237981745131e-08, "loss": 0.68673861, "num_input_tokens_seen": 166850590, "step": 7719, "time_per_iteration": 2.676440954208374 }, { "auxiliary_loss_clip": 0.01154394, "auxiliary_loss_mlp": 0.00711407, "balance_loss_clip": 1.04796815, "balance_loss_mlp": 1.00054383, "epoch": 0.9282751157337822, "flos": 18843765857280.0, "grad_norm": 1.9358710737246305, "language_loss": 0.81264746, "learning_rate": 5.364444471726592e-08, "loss": 0.83130538, "num_input_tokens_seen": 166869795, "step": 7720, "time_per_iteration": 2.669510841369629 }, { "auxiliary_loss_clip": 0.01150554, "auxiliary_loss_mlp": 0.01026882, "balance_loss_clip": 1.04635429, "balance_loss_mlp": 1.01936305, "epoch": 0.9283953586244214, "flos": 25556834476800.0, "grad_norm": 2.352626889773669, "language_loss": 0.80501044, "learning_rate": 5.346538652060939e-08, "loss": 0.82678479, "num_input_tokens_seen": 166891150, "step": 7721, "time_per_iteration": 3.6256165504455566 }, { "auxiliary_loss_clip": 0.01132345, "auxiliary_loss_mlp": 0.01026486, "balance_loss_clip": 1.04595423, "balance_loss_mlp": 1.0198487, "epoch": 0.9285156015150604, "flos": 18223480869120.0, "grad_norm": 2.008358411624084, "language_loss": 0.7044304, "learning_rate": 5.3286623611705994e-08, "loss": 0.72601867, "num_input_tokens_seen": 166909195, "step": 7722, "time_per_iteration": 3.6270194053649902 }, { "auxiliary_loss_clip": 0.01070958, "auxiliary_loss_mlp": 0.01004558, "balance_loss_clip": 1.01859498, "balance_loss_mlp": 1.00354433, "epoch": 0.9286358444056995, "flos": 66400017690240.0, "grad_norm": 0.8130780580815511, "language_loss": 0.60519373, "learning_rate": 5.3108156017673824e-08, "loss": 0.62594891, "num_input_tokens_seen": 166970955, "step": 7723, "time_per_iteration": 3.288818836212158 }, { "auxiliary_loss_clip": 0.01143073, "auxiliary_loss_mlp": 0.01026239, "balance_loss_clip": 1.04623652, "balance_loss_mlp": 1.01858914, "epoch": 0.9287560872963386, "flos": 22345630594560.0, "grad_norm": 1.8203577766244832, "language_loss": 0.71812797, "learning_rate": 5.2929983765586775e-08, "loss": 0.73982108, "num_input_tokens_seen": 166989735, "step": 7724, "time_per_iteration": 3.5168111324310303 }, { "auxiliary_loss_clip": 0.01166988, "auxiliary_loss_mlp": 0.01024217, "balance_loss_clip": 1.04886532, "balance_loss_mlp": 1.01706743, "epoch": 0.9288763301869777, "flos": 25700225569920.0, "grad_norm": 1.810948283673697, "language_loss": 0.621898, "learning_rate": 5.275210688247278e-08, "loss": 0.64381003, "num_input_tokens_seen": 167010060, "step": 7725, "time_per_iteration": 2.632140636444092 }, { "auxiliary_loss_clip": 0.01106967, "auxiliary_loss_mlp": 0.01027474, "balance_loss_clip": 1.04467797, "balance_loss_mlp": 1.02054799, "epoch": 0.9289965730776167, "flos": 12312046028160.0, "grad_norm": 2.5086104131503086, "language_loss": 0.8521136, "learning_rate": 5.257452539531604e-08, "loss": 0.87345803, "num_input_tokens_seen": 167027130, "step": 7726, "time_per_iteration": 3.5840699672698975 }, { "auxiliary_loss_clip": 0.01152402, "auxiliary_loss_mlp": 0.0102589, "balance_loss_clip": 1.04560769, "balance_loss_mlp": 1.01875544, "epoch": 0.9291168159682559, "flos": 26685973486080.0, "grad_norm": 1.6505465378559863, "language_loss": 0.68725491, "learning_rate": 5.2397239331055445e-08, "loss": 0.70903784, "num_input_tokens_seen": 167049130, "step": 7727, "time_per_iteration": 2.6531832218170166 }, { "auxiliary_loss_clip": 0.01135265, "auxiliary_loss_mlp": 0.01030895, "balance_loss_clip": 1.04823089, "balance_loss_mlp": 1.02401352, "epoch": 0.929237058858895, "flos": 14538256179840.0, "grad_norm": 2.703442667222356, "language_loss": 0.80919045, "learning_rate": 5.2220248716585036e-08, "loss": 0.83085203, "num_input_tokens_seen": 167066810, "step": 7728, "time_per_iteration": 2.5842061042785645 }, { "auxiliary_loss_clip": 0.01143845, "auxiliary_loss_mlp": 0.01027833, "balance_loss_clip": 1.04458296, "balance_loss_mlp": 1.02018023, "epoch": 0.929357301749534, "flos": 23835456023040.0, "grad_norm": 2.2539995859049253, "language_loss": 0.74990094, "learning_rate": 5.204355357875445e-08, "loss": 0.77161771, "num_input_tokens_seen": 167085155, "step": 7729, "time_per_iteration": 2.6307997703552246 }, { "auxiliary_loss_clip": 0.01134358, "auxiliary_loss_mlp": 0.01030306, "balance_loss_clip": 1.04537535, "balance_loss_mlp": 1.02290344, "epoch": 0.9294775446401732, "flos": 12969319046400.0, "grad_norm": 2.9120829535605672, "language_loss": 0.70201194, "learning_rate": 5.1867153944367584e-08, "loss": 0.72365868, "num_input_tokens_seen": 167101545, "step": 7730, "time_per_iteration": 2.671161413192749 }, { "auxiliary_loss_clip": 0.01127888, "auxiliary_loss_mlp": 0.01030369, "balance_loss_clip": 1.04655695, "balance_loss_mlp": 1.02334452, "epoch": 0.9295977875308122, "flos": 26211809024640.0, "grad_norm": 1.672329143608207, "language_loss": 0.73566115, "learning_rate": 5.16910498401848e-08, "loss": 0.75724375, "num_input_tokens_seen": 167120995, "step": 7731, "time_per_iteration": 2.8585636615753174 }, { "auxiliary_loss_clip": 0.01168413, "auxiliary_loss_mlp": 0.01028219, "balance_loss_clip": 1.05042326, "balance_loss_mlp": 1.02145672, "epoch": 0.9297180304214513, "flos": 16472297105280.0, "grad_norm": 2.928133528470368, "language_loss": 0.83574283, "learning_rate": 5.151524129292073e-08, "loss": 0.85770917, "num_input_tokens_seen": 167138890, "step": 7732, "time_per_iteration": 2.5485682487487793 }, { "auxiliary_loss_clip": 0.01151634, "auxiliary_loss_mlp": 0.01024276, "balance_loss_clip": 1.04836011, "balance_loss_mlp": 1.01734662, "epoch": 0.9298382733120905, "flos": 24060436859520.0, "grad_norm": 2.766834012164424, "language_loss": 0.6639033, "learning_rate": 5.1339728329245155e-08, "loss": 0.68566239, "num_input_tokens_seen": 167159455, "step": 7733, "time_per_iteration": 2.7228081226348877 }, { "auxiliary_loss_clip": 0.0117297, "auxiliary_loss_mlp": 0.01029025, "balance_loss_clip": 1.04979062, "balance_loss_mlp": 1.02134824, "epoch": 0.9299585162027295, "flos": 22127652910080.0, "grad_norm": 3.084592147691191, "language_loss": 0.7947427, "learning_rate": 5.116451097578367e-08, "loss": 0.81676269, "num_input_tokens_seen": 167178495, "step": 7734, "time_per_iteration": 2.5758438110351562 }, { "auxiliary_loss_clip": 0.01120777, "auxiliary_loss_mlp": 0.01025117, "balance_loss_clip": 1.04548228, "balance_loss_mlp": 1.01882553, "epoch": 0.9300787590933686, "flos": 21471780522240.0, "grad_norm": 1.8303291954466476, "language_loss": 0.74401122, "learning_rate": 5.0989589259115895e-08, "loss": 0.76547015, "num_input_tokens_seen": 167199380, "step": 7735, "time_per_iteration": 2.7045061588287354 }, { "auxiliary_loss_clip": 0.01149116, "auxiliary_loss_mlp": 0.01030125, "balance_loss_clip": 1.04550934, "balance_loss_mlp": 1.02149987, "epoch": 0.9301990019840077, "flos": 17779588594560.0, "grad_norm": 1.9488524005961951, "language_loss": 0.71966517, "learning_rate": 5.081496320577816e-08, "loss": 0.74145758, "num_input_tokens_seen": 167216500, "step": 7736, "time_per_iteration": 2.533921480178833 }, { "auxiliary_loss_clip": 0.0105421, "auxiliary_loss_mlp": 0.01001655, "balance_loss_clip": 1.03062558, "balance_loss_mlp": 1.00079107, "epoch": 0.9303192448746468, "flos": 58896122307840.0, "grad_norm": 1.0310100087185015, "language_loss": 0.61229879, "learning_rate": 5.0640632842260835e-08, "loss": 0.63285744, "num_input_tokens_seen": 167276760, "step": 7737, "time_per_iteration": 3.3374695777893066 }, { "auxiliary_loss_clip": 0.0112165, "auxiliary_loss_mlp": 0.0071124, "balance_loss_clip": 1.04701662, "balance_loss_mlp": 1.00052118, "epoch": 0.9304394877652858, "flos": 57663522172800.0, "grad_norm": 2.173480449624514, "language_loss": 0.72657114, "learning_rate": 5.0466598195009426e-08, "loss": 0.74490005, "num_input_tokens_seen": 167303630, "step": 7738, "time_per_iteration": 3.0067124366760254 }, { "auxiliary_loss_clip": 0.01125121, "auxiliary_loss_mlp": 0.01024159, "balance_loss_clip": 1.04640841, "balance_loss_mlp": 1.01695013, "epoch": 0.930559730655925, "flos": 20996143603200.0, "grad_norm": 1.9309661227799466, "language_loss": 0.70218766, "learning_rate": 5.0292859290425036e-08, "loss": 0.7236805, "num_input_tokens_seen": 167321500, "step": 7739, "time_per_iteration": 2.6676793098449707 }, { "auxiliary_loss_clip": 0.01167349, "auxiliary_loss_mlp": 0.01026824, "balance_loss_clip": 1.04850185, "balance_loss_mlp": 1.01971042, "epoch": 0.9306799735465641, "flos": 23258264376960.0, "grad_norm": 2.0776086068399997, "language_loss": 0.779369, "learning_rate": 5.011941615486348e-08, "loss": 0.80131078, "num_input_tokens_seen": 167340615, "step": 7740, "time_per_iteration": 2.5887413024902344 }, { "auxiliary_loss_clip": 0.01166947, "auxiliary_loss_mlp": 0.01028161, "balance_loss_clip": 1.04824376, "balance_loss_mlp": 1.0214051, "epoch": 0.9308002164372031, "flos": 15231547560960.0, "grad_norm": 2.0108543619405768, "language_loss": 0.84564936, "learning_rate": 4.994626881463659e-08, "loss": 0.8676005, "num_input_tokens_seen": 167356870, "step": 7741, "time_per_iteration": 2.514402151107788 }, { "auxiliary_loss_clip": 0.01089553, "auxiliary_loss_mlp": 0.01023778, "balance_loss_clip": 1.03969574, "balance_loss_mlp": 1.01642275, "epoch": 0.9309204593278423, "flos": 30847481539200.0, "grad_norm": 2.2955572183001167, "language_loss": 0.71376932, "learning_rate": 4.9773417296009814e-08, "loss": 0.73490268, "num_input_tokens_seen": 167378390, "step": 7742, "time_per_iteration": 2.7906131744384766 }, { "auxiliary_loss_clip": 0.01157091, "auxiliary_loss_mlp": 0.01029694, "balance_loss_clip": 1.04937673, "balance_loss_mlp": 1.02235031, "epoch": 0.9310407022184813, "flos": 23037269950080.0, "grad_norm": 1.8727829179679487, "language_loss": 0.65809166, "learning_rate": 4.960086162520527e-08, "loss": 0.67995954, "num_input_tokens_seen": 167398480, "step": 7743, "time_per_iteration": 2.618701696395874 }, { "auxiliary_loss_clip": 0.0111262, "auxiliary_loss_mlp": 0.01027258, "balance_loss_clip": 1.04496682, "balance_loss_mlp": 1.01996267, "epoch": 0.9311609451091204, "flos": 22127976132480.0, "grad_norm": 1.9941524207572012, "language_loss": 0.82708019, "learning_rate": 4.942860182839936e-08, "loss": 0.84847897, "num_input_tokens_seen": 167416825, "step": 7744, "time_per_iteration": 2.730135202407837 }, { "auxiliary_loss_clip": 0.01133603, "auxiliary_loss_mlp": 0.01028193, "balance_loss_clip": 1.04495704, "balance_loss_mlp": 1.02081954, "epoch": 0.9312811879997596, "flos": 21099206701440.0, "grad_norm": 2.1363508418825115, "language_loss": 0.79607511, "learning_rate": 4.925663793172341e-08, "loss": 0.81769305, "num_input_tokens_seen": 167434785, "step": 7745, "time_per_iteration": 2.710636615753174 }, { "auxiliary_loss_clip": 0.01044298, "auxiliary_loss_mlp": 0.00701775, "balance_loss_clip": 1.01848841, "balance_loss_mlp": 0.99996549, "epoch": 0.9314014308903986, "flos": 67148179096320.0, "grad_norm": 0.7811149557257482, "language_loss": 0.5648607, "learning_rate": 4.908496996126477e-08, "loss": 0.58232141, "num_input_tokens_seen": 167498245, "step": 7746, "time_per_iteration": 3.240987539291382 }, { "auxiliary_loss_clip": 0.01155846, "auxiliary_loss_mlp": 0.0102779, "balance_loss_clip": 1.05334449, "balance_loss_mlp": 1.02103424, "epoch": 0.9315216737810377, "flos": 22565583527040.0, "grad_norm": 1.6986852289040941, "language_loss": 0.76386565, "learning_rate": 4.89135979430646e-08, "loss": 0.78570199, "num_input_tokens_seen": 167518290, "step": 7747, "time_per_iteration": 3.6031248569488525 }, { "auxiliary_loss_clip": 0.011695, "auxiliary_loss_mlp": 0.01025096, "balance_loss_clip": 1.05153561, "balance_loss_mlp": 1.01846194, "epoch": 0.9316419166716768, "flos": 23984054588160.0, "grad_norm": 1.8465878425647213, "language_loss": 0.85558784, "learning_rate": 4.874252190312078e-08, "loss": 0.87753379, "num_input_tokens_seen": 167538675, "step": 7748, "time_per_iteration": 3.5460076332092285 }, { "auxiliary_loss_clip": 0.01155673, "auxiliary_loss_mlp": 0.01024107, "balance_loss_clip": 1.04765248, "balance_loss_mlp": 1.0164243, "epoch": 0.9317621595623159, "flos": 30230464688640.0, "grad_norm": 1.5648918013838062, "language_loss": 0.64977646, "learning_rate": 4.857174186738477e-08, "loss": 0.67157429, "num_input_tokens_seen": 167562025, "step": 7749, "time_per_iteration": 2.6904654502868652 }, { "auxiliary_loss_clip": 0.011724, "auxiliary_loss_mlp": 0.0102906, "balance_loss_clip": 1.05210149, "balance_loss_mlp": 1.02193737, "epoch": 0.931882402452955, "flos": 15742735966080.0, "grad_norm": 5.337848815379001, "language_loss": 0.7335965, "learning_rate": 4.840125786176408e-08, "loss": 0.75561106, "num_input_tokens_seen": 167578230, "step": 7750, "time_per_iteration": 3.432421922683716 }, { "auxiliary_loss_clip": 0.01133006, "auxiliary_loss_mlp": 0.01032674, "balance_loss_clip": 1.0460732, "balance_loss_mlp": 1.02590024, "epoch": 0.932002645343594, "flos": 28366521154560.0, "grad_norm": 1.85491871062935, "language_loss": 0.77508533, "learning_rate": 4.823106991212067e-08, "loss": 0.7967422, "num_input_tokens_seen": 167597470, "step": 7751, "time_per_iteration": 2.6466867923736572 }, { "auxiliary_loss_clip": 0.011544, "auxiliary_loss_mlp": 0.01026872, "balance_loss_clip": 1.04684567, "balance_loss_mlp": 1.02000856, "epoch": 0.9321228882342332, "flos": 15341146934400.0, "grad_norm": 2.264773344649879, "language_loss": 0.83271956, "learning_rate": 4.806117804427212e-08, "loss": 0.85453224, "num_input_tokens_seen": 167615405, "step": 7752, "time_per_iteration": 3.502424955368042 }, { "auxiliary_loss_clip": 0.01148395, "auxiliary_loss_mlp": 0.01029869, "balance_loss_clip": 1.04713094, "balance_loss_mlp": 1.02251399, "epoch": 0.9322431311248722, "flos": 17895365107200.0, "grad_norm": 2.6785170195286927, "language_loss": 0.64269853, "learning_rate": 4.7891582283990926e-08, "loss": 0.66448116, "num_input_tokens_seen": 167634130, "step": 7753, "time_per_iteration": 2.5624537467956543 }, { "auxiliary_loss_clip": 0.01121544, "auxiliary_loss_mlp": 0.01028124, "balance_loss_clip": 1.04503, "balance_loss_mlp": 1.02112627, "epoch": 0.9323633740155113, "flos": 24169713010560.0, "grad_norm": 1.8177339331155553, "language_loss": 0.726928, "learning_rate": 4.772228265700473e-08, "loss": 0.74842465, "num_input_tokens_seen": 167654990, "step": 7754, "time_per_iteration": 2.753822088241577 }, { "auxiliary_loss_clip": 0.01155341, "auxiliary_loss_mlp": 0.0102052, "balance_loss_clip": 1.04820323, "balance_loss_mlp": 1.0129205, "epoch": 0.9324836169061504, "flos": 15043482927360.0, "grad_norm": 2.1109414872754613, "language_loss": 0.76124597, "learning_rate": 4.75532791889961e-08, "loss": 0.78300452, "num_input_tokens_seen": 167671690, "step": 7755, "time_per_iteration": 2.5913138389587402 }, { "auxiliary_loss_clip": 0.01148853, "auxiliary_loss_mlp": 0.01022537, "balance_loss_clip": 1.04488993, "balance_loss_mlp": 1.01502681, "epoch": 0.9326038597967895, "flos": 18624890332800.0, "grad_norm": 2.1024682467167386, "language_loss": 0.6588763, "learning_rate": 4.738457190560252e-08, "loss": 0.68059021, "num_input_tokens_seen": 167690800, "step": 7756, "time_per_iteration": 2.572420120239258 }, { "auxiliary_loss_clip": 0.01105715, "auxiliary_loss_mlp": 0.01025566, "balance_loss_clip": 1.04608893, "balance_loss_mlp": 1.01781154, "epoch": 0.9327241026874286, "flos": 18952646958720.0, "grad_norm": 9.061552764626597, "language_loss": 0.79239917, "learning_rate": 4.721616083241664e-08, "loss": 0.813712, "num_input_tokens_seen": 167709055, "step": 7757, "time_per_iteration": 2.676943302154541 }, { "auxiliary_loss_clip": 0.01147804, "auxiliary_loss_mlp": 0.01026459, "balance_loss_clip": 1.04654324, "balance_loss_mlp": 1.01936579, "epoch": 0.9328443455780677, "flos": 29570282668800.0, "grad_norm": 1.8120971427286907, "language_loss": 0.77431548, "learning_rate": 4.7048045994986684e-08, "loss": 0.79605812, "num_input_tokens_seen": 167729915, "step": 7758, "time_per_iteration": 2.667538642883301 }, { "auxiliary_loss_clip": 0.01159416, "auxiliary_loss_mlp": 0.01022589, "balance_loss_clip": 1.05006194, "balance_loss_mlp": 1.01528168, "epoch": 0.9329645884687068, "flos": 30081722469120.0, "grad_norm": 2.3775966155875716, "language_loss": 0.91027462, "learning_rate": 4.688022741881559e-08, "loss": 0.93209463, "num_input_tokens_seen": 167750440, "step": 7759, "time_per_iteration": 2.7079782485961914 }, { "auxiliary_loss_clip": 0.01148292, "auxiliary_loss_mlp": 0.01023004, "balance_loss_clip": 1.04670131, "balance_loss_mlp": 1.01616716, "epoch": 0.9330848313593458, "flos": 21867982513920.0, "grad_norm": 1.639501784427694, "language_loss": 0.7522198, "learning_rate": 4.671270512936076e-08, "loss": 0.7739327, "num_input_tokens_seen": 167769600, "step": 7760, "time_per_iteration": 2.5902957916259766 }, { "auxiliary_loss_clip": 0.01114926, "auxiliary_loss_mlp": 0.0102441, "balance_loss_clip": 1.04340577, "balance_loss_mlp": 1.01757956, "epoch": 0.933205074249985, "flos": 22127221946880.0, "grad_norm": 1.7041232049223891, "language_loss": 0.82749283, "learning_rate": 4.6545479152035884e-08, "loss": 0.84888613, "num_input_tokens_seen": 167788770, "step": 7761, "time_per_iteration": 2.721604108810425 }, { "auxiliary_loss_clip": 0.01156038, "auxiliary_loss_mlp": 0.01025954, "balance_loss_clip": 1.04966784, "balance_loss_mlp": 1.01865864, "epoch": 0.9333253171406241, "flos": 15341254675200.0, "grad_norm": 2.1797990726367464, "language_loss": 0.76288557, "learning_rate": 4.637854951220821e-08, "loss": 0.78470552, "num_input_tokens_seen": 167805555, "step": 7762, "time_per_iteration": 2.569124937057495 }, { "auxiliary_loss_clip": 0.01116202, "auxiliary_loss_mlp": 0.01028248, "balance_loss_clip": 1.04424119, "balance_loss_mlp": 1.0209558, "epoch": 0.9334455600312631, "flos": 15706142985600.0, "grad_norm": 1.8047619861258415, "language_loss": 0.75013196, "learning_rate": 4.621191623520171e-08, "loss": 0.77157646, "num_input_tokens_seen": 167823985, "step": 7763, "time_per_iteration": 2.6748411655426025 }, { "auxiliary_loss_clip": 0.01104052, "auxiliary_loss_mlp": 0.0103031, "balance_loss_clip": 1.04434335, "balance_loss_mlp": 1.02351475, "epoch": 0.9335658029219023, "flos": 22163563532160.0, "grad_norm": 2.645566150730998, "language_loss": 0.84530103, "learning_rate": 4.604557934629372e-08, "loss": 0.86664462, "num_input_tokens_seen": 167843060, "step": 7764, "time_per_iteration": 2.728818655014038 }, { "auxiliary_loss_clip": 0.01132972, "auxiliary_loss_mlp": 0.01027676, "balance_loss_clip": 1.04650927, "balance_loss_mlp": 1.02064872, "epoch": 0.9336860458125413, "flos": 20266833859200.0, "grad_norm": 1.7304057463410185, "language_loss": 0.80429989, "learning_rate": 4.587953887071805e-08, "loss": 0.8259064, "num_input_tokens_seen": 167862880, "step": 7765, "time_per_iteration": 2.6437265872955322 }, { "auxiliary_loss_clip": 0.01129378, "auxiliary_loss_mlp": 0.01024801, "balance_loss_clip": 1.04265034, "balance_loss_mlp": 1.0181613, "epoch": 0.9338062887031804, "flos": 20919689504640.0, "grad_norm": 1.8986876772053252, "language_loss": 0.8596887, "learning_rate": 4.5713794833662554e-08, "loss": 0.88123047, "num_input_tokens_seen": 167882095, "step": 7766, "time_per_iteration": 2.6376569271087646 }, { "auxiliary_loss_clip": 0.01170507, "auxiliary_loss_mlp": 0.01028143, "balance_loss_clip": 1.05053735, "balance_loss_mlp": 1.02042377, "epoch": 0.9339265315938196, "flos": 23221635482880.0, "grad_norm": 1.8839048440778472, "language_loss": 0.63520104, "learning_rate": 4.5548347260270236e-08, "loss": 0.65718758, "num_input_tokens_seen": 167901385, "step": 7767, "time_per_iteration": 2.601874828338623 }, { "auxiliary_loss_clip": 0.01117331, "auxiliary_loss_mlp": 0.01027691, "balance_loss_clip": 1.04542065, "balance_loss_mlp": 1.02105427, "epoch": 0.9340467744844586, "flos": 22820261932800.0, "grad_norm": 1.9659469161279315, "language_loss": 0.69485939, "learning_rate": 4.538319617564012e-08, "loss": 0.71630955, "num_input_tokens_seen": 167920405, "step": 7768, "time_per_iteration": 2.6455202102661133 }, { "auxiliary_loss_clip": 0.0113519, "auxiliary_loss_mlp": 0.01030525, "balance_loss_clip": 1.04435277, "balance_loss_mlp": 1.02286315, "epoch": 0.9341670173750977, "flos": 23660428026240.0, "grad_norm": 2.6670969227344266, "language_loss": 0.74604928, "learning_rate": 4.521834160482485e-08, "loss": 0.76770645, "num_input_tokens_seen": 167939145, "step": 7769, "time_per_iteration": 2.679436206817627 }, { "auxiliary_loss_clip": 0.01157379, "auxiliary_loss_mlp": 0.01023479, "balance_loss_clip": 1.05004442, "balance_loss_mlp": 1.01653564, "epoch": 0.9342872602657368, "flos": 24824256595200.0, "grad_norm": 1.6568508400111406, "language_loss": 0.82378936, "learning_rate": 4.5053783572832846e-08, "loss": 0.84559798, "num_input_tokens_seen": 167959325, "step": 7770, "time_per_iteration": 2.6270604133605957 }, { "auxiliary_loss_clip": 0.01153189, "auxiliary_loss_mlp": 0.0102212, "balance_loss_clip": 1.04874396, "balance_loss_mlp": 1.01490748, "epoch": 0.9344075031563759, "flos": 25771831332480.0, "grad_norm": 3.005163662610499, "language_loss": 0.76389831, "learning_rate": 4.488952210462771e-08, "loss": 0.78565139, "num_input_tokens_seen": 167979530, "step": 7771, "time_per_iteration": 2.626161813735962 }, { "auxiliary_loss_clip": 0.01167727, "auxiliary_loss_mlp": 0.01027256, "balance_loss_clip": 1.04925203, "balance_loss_mlp": 1.02005863, "epoch": 0.9345277460470149, "flos": 25551303782400.0, "grad_norm": 1.9018146937938356, "language_loss": 0.86187351, "learning_rate": 4.4725557225127495e-08, "loss": 0.88382328, "num_input_tokens_seen": 167997870, "step": 7772, "time_per_iteration": 2.6166141033172607 }, { "auxiliary_loss_clip": 0.01154978, "auxiliary_loss_mlp": 0.01025058, "balance_loss_clip": 1.04937935, "balance_loss_mlp": 1.018296, "epoch": 0.9346479889376541, "flos": 34313112432000.0, "grad_norm": 2.2038664763857545, "language_loss": 0.79668325, "learning_rate": 4.456188895920565e-08, "loss": 0.81848359, "num_input_tokens_seen": 168019625, "step": 7773, "time_per_iteration": 3.6076228618621826 }, { "auxiliary_loss_clip": 0.01168681, "auxiliary_loss_mlp": 0.01026074, "balance_loss_clip": 1.04992795, "balance_loss_mlp": 1.01849794, "epoch": 0.9347682318282932, "flos": 19093739581440.0, "grad_norm": 12.134421798615913, "language_loss": 0.8550415, "learning_rate": 4.439851733169031e-08, "loss": 0.87698901, "num_input_tokens_seen": 168037415, "step": 7774, "time_per_iteration": 3.499295949935913 }, { "auxiliary_loss_clip": 0.01121392, "auxiliary_loss_mlp": 0.01030768, "balance_loss_clip": 1.04428494, "balance_loss_mlp": 1.02389312, "epoch": 0.9348884747189322, "flos": 26249587153920.0, "grad_norm": 2.259701023387754, "language_loss": 0.6922769, "learning_rate": 4.4235442367365204e-08, "loss": 0.71379846, "num_input_tokens_seen": 168057725, "step": 7775, "time_per_iteration": 2.6602470874786377 }, { "auxiliary_loss_clip": 0.01132373, "auxiliary_loss_mlp": 0.01027859, "balance_loss_clip": 1.04308224, "balance_loss_mlp": 1.0208075, "epoch": 0.9350087176095714, "flos": 18333080242560.0, "grad_norm": 2.2736465002399577, "language_loss": 0.79576546, "learning_rate": 4.4072664090968545e-08, "loss": 0.81736779, "num_input_tokens_seen": 168076110, "step": 7776, "time_per_iteration": 3.5272631645202637 }, { "auxiliary_loss_clip": 0.01135598, "auxiliary_loss_mlp": 0.01029488, "balance_loss_clip": 1.04373717, "balance_loss_mlp": 1.02215981, "epoch": 0.9351289605002104, "flos": 19318253541120.0, "grad_norm": 1.814591039646085, "language_loss": 0.84703249, "learning_rate": 4.391018252719347e-08, "loss": 0.86868334, "num_input_tokens_seen": 168095905, "step": 7777, "time_per_iteration": 2.685455083847046 }, { "auxiliary_loss_clip": 0.01137709, "auxiliary_loss_mlp": 0.01027834, "balance_loss_clip": 1.04478788, "balance_loss_mlp": 1.02019835, "epoch": 0.9352492033908495, "flos": 18799990156800.0, "grad_norm": 2.7542718130387067, "language_loss": 0.69142699, "learning_rate": 4.374799770068849e-08, "loss": 0.71308243, "num_input_tokens_seen": 168112580, "step": 7778, "time_per_iteration": 3.511343002319336 }, { "auxiliary_loss_clip": 0.01151996, "auxiliary_loss_mlp": 0.01027692, "balance_loss_clip": 1.05006409, "balance_loss_mlp": 1.02045, "epoch": 0.9353694462814887, "flos": 29530134241920.0, "grad_norm": 2.2977186857905325, "language_loss": 0.74737018, "learning_rate": 4.358610963605658e-08, "loss": 0.76916713, "num_input_tokens_seen": 168133030, "step": 7779, "time_per_iteration": 2.667353630065918 }, { "auxiliary_loss_clip": 0.0117182, "auxiliary_loss_mlp": 0.01027717, "balance_loss_clip": 1.05040479, "balance_loss_mlp": 1.020046, "epoch": 0.9354896891721277, "flos": 30665450390400.0, "grad_norm": 2.59584814164065, "language_loss": 0.68499768, "learning_rate": 4.342451835785677e-08, "loss": 0.7069931, "num_input_tokens_seen": 168153940, "step": 7780, "time_per_iteration": 2.6142959594726562 }, { "auxiliary_loss_clip": 0.01134295, "auxiliary_loss_mlp": 0.01025452, "balance_loss_clip": 1.04664111, "balance_loss_mlp": 1.0187881, "epoch": 0.9356099320627668, "flos": 19463907191040.0, "grad_norm": 1.6231811190552556, "language_loss": 0.74989581, "learning_rate": 4.3263223890601665e-08, "loss": 0.77149332, "num_input_tokens_seen": 168172650, "step": 7781, "time_per_iteration": 2.6793971061706543 }, { "auxiliary_loss_clip": 0.01151726, "auxiliary_loss_mlp": 0.00711413, "balance_loss_clip": 1.05068398, "balance_loss_mlp": 1.0005821, "epoch": 0.9357301749534058, "flos": 19098156954240.0, "grad_norm": 1.635147767400534, "language_loss": 0.7946583, "learning_rate": 4.31022262587597e-08, "loss": 0.81328964, "num_input_tokens_seen": 168191325, "step": 7782, "time_per_iteration": 2.5832247734069824 }, { "auxiliary_loss_clip": 0.01154567, "auxiliary_loss_mlp": 0.01027288, "balance_loss_clip": 1.04918444, "balance_loss_mlp": 1.01990271, "epoch": 0.935850417844045, "flos": 23550361776000.0, "grad_norm": 1.6388456906096325, "language_loss": 0.66117895, "learning_rate": 4.2941525486754225e-08, "loss": 0.68299752, "num_input_tokens_seen": 168211645, "step": 7783, "time_per_iteration": 2.551030158996582 }, { "auxiliary_loss_clip": 0.01115375, "auxiliary_loss_mlp": 0.01022566, "balance_loss_clip": 1.0456121, "balance_loss_mlp": 1.01582503, "epoch": 0.935970660734684, "flos": 18588333265920.0, "grad_norm": 1.9795929548223332, "language_loss": 0.79485321, "learning_rate": 4.278112159896286e-08, "loss": 0.81623262, "num_input_tokens_seen": 168229485, "step": 7784, "time_per_iteration": 2.711587429046631 }, { "auxiliary_loss_clip": 0.01126893, "auxiliary_loss_mlp": 0.01020486, "balance_loss_clip": 1.04108822, "balance_loss_mlp": 1.01358128, "epoch": 0.9360909036253231, "flos": 20631255292800.0, "grad_norm": 1.9497165973240824, "language_loss": 0.67931151, "learning_rate": 4.2621014619719896e-08, "loss": 0.70078534, "num_input_tokens_seen": 168247250, "step": 7785, "time_per_iteration": 2.6445670127868652 }, { "auxiliary_loss_clip": 0.01046132, "auxiliary_loss_mlp": 0.01002436, "balance_loss_clip": 1.01738441, "balance_loss_mlp": 1.001315, "epoch": 0.9362111465159623, "flos": 61791421052160.0, "grad_norm": 0.71916586307148, "language_loss": 0.58609515, "learning_rate": 4.246120457331215e-08, "loss": 0.60658085, "num_input_tokens_seen": 168309425, "step": 7786, "time_per_iteration": 3.2130825519561768 }, { "auxiliary_loss_clip": 0.011332, "auxiliary_loss_mlp": 0.01027707, "balance_loss_clip": 1.04863787, "balance_loss_mlp": 1.02031624, "epoch": 0.9363313894066013, "flos": 24170395368960.0, "grad_norm": 2.0389478683118276, "language_loss": 0.71863127, "learning_rate": 4.2301691483983325e-08, "loss": 0.74024034, "num_input_tokens_seen": 168329545, "step": 7787, "time_per_iteration": 2.6454269886016846 }, { "auxiliary_loss_clip": 0.01156588, "auxiliary_loss_mlp": 0.01028734, "balance_loss_clip": 1.04820991, "balance_loss_mlp": 1.02133965, "epoch": 0.9364516322972404, "flos": 20120354196480.0, "grad_norm": 1.8717016617746827, "language_loss": 0.76117265, "learning_rate": 4.214247537593163e-08, "loss": 0.78302586, "num_input_tokens_seen": 168348795, "step": 7788, "time_per_iteration": 2.664264678955078 }, { "auxiliary_loss_clip": 0.01137461, "auxiliary_loss_mlp": 0.01022775, "balance_loss_clip": 1.04550385, "balance_loss_mlp": 1.015679, "epoch": 0.9365718751878795, "flos": 20703758895360.0, "grad_norm": 3.1449801423290578, "language_loss": 0.81116199, "learning_rate": 4.1983556273309293e-08, "loss": 0.83276433, "num_input_tokens_seen": 168367545, "step": 7789, "time_per_iteration": 2.603484869003296 }, { "auxiliary_loss_clip": 0.01168915, "auxiliary_loss_mlp": 0.01025535, "balance_loss_clip": 1.04903865, "balance_loss_mlp": 1.01789331, "epoch": 0.9366921180785186, "flos": 18655270260480.0, "grad_norm": 2.315261531272689, "language_loss": 0.69625223, "learning_rate": 4.182493420022526e-08, "loss": 0.71819675, "num_input_tokens_seen": 168383215, "step": 7790, "time_per_iteration": 2.5705230236053467 }, { "auxiliary_loss_clip": 0.01121207, "auxiliary_loss_mlp": 0.01019777, "balance_loss_clip": 1.04309154, "balance_loss_mlp": 1.01335716, "epoch": 0.9368123609691577, "flos": 25774955815680.0, "grad_norm": 2.678753005077074, "language_loss": 0.7849555, "learning_rate": 4.166660918074139e-08, "loss": 0.80636525, "num_input_tokens_seen": 168403120, "step": 7791, "time_per_iteration": 2.708404064178467 }, { "auxiliary_loss_clip": 0.01117912, "auxiliary_loss_mlp": 0.01024533, "balance_loss_clip": 1.04447055, "balance_loss_mlp": 1.01784182, "epoch": 0.9369326038597968, "flos": 25553386771200.0, "grad_norm": 1.4595025926657046, "language_loss": 0.73562819, "learning_rate": 4.15085812388758e-08, "loss": 0.75705266, "num_input_tokens_seen": 168425340, "step": 7792, "time_per_iteration": 2.72578763961792 }, { "auxiliary_loss_clip": 0.01136393, "auxiliary_loss_mlp": 0.01032555, "balance_loss_clip": 1.0467217, "balance_loss_mlp": 1.02574778, "epoch": 0.9370528467504359, "flos": 23220019370880.0, "grad_norm": 2.2790639665418904, "language_loss": 0.78603631, "learning_rate": 4.135085039860153e-08, "loss": 0.80772573, "num_input_tokens_seen": 168444740, "step": 7793, "time_per_iteration": 2.6327826976776123 }, { "auxiliary_loss_clip": 0.01134114, "auxiliary_loss_mlp": 0.01026244, "balance_loss_clip": 1.04805231, "balance_loss_mlp": 1.01880503, "epoch": 0.9371730896410749, "flos": 24967468120320.0, "grad_norm": 2.0820091202647695, "language_loss": 0.78547168, "learning_rate": 4.1193416683845906e-08, "loss": 0.8070752, "num_input_tokens_seen": 168463670, "step": 7794, "time_per_iteration": 2.6291720867156982 }, { "auxiliary_loss_clip": 0.011238, "auxiliary_loss_mlp": 0.01019558, "balance_loss_clip": 1.04661834, "balance_loss_mlp": 1.01312065, "epoch": 0.9372933325317141, "flos": 15553091134080.0, "grad_norm": 2.446719775646089, "language_loss": 0.83830613, "learning_rate": 4.103628011849136e-08, "loss": 0.85973972, "num_input_tokens_seen": 168479030, "step": 7795, "time_per_iteration": 2.627822160720825 }, { "auxiliary_loss_clip": 0.01138517, "auxiliary_loss_mlp": 0.01025338, "balance_loss_clip": 1.04676008, "balance_loss_mlp": 1.01838803, "epoch": 0.9374135754223532, "flos": 21871861182720.0, "grad_norm": 2.092112893825189, "language_loss": 0.76120889, "learning_rate": 4.0879440726375506e-08, "loss": 0.7828474, "num_input_tokens_seen": 168496815, "step": 7796, "time_per_iteration": 2.623469352722168 }, { "auxiliary_loss_clip": 0.01134089, "auxiliary_loss_mlp": 0.01025731, "balance_loss_clip": 1.04432988, "balance_loss_mlp": 1.01851249, "epoch": 0.9375338183129922, "flos": 22631048064000.0, "grad_norm": 2.6123698755051192, "language_loss": 0.5586518, "learning_rate": 4.0722898531291074e-08, "loss": 0.58024997, "num_input_tokens_seen": 168514055, "step": 7797, "time_per_iteration": 2.642932415008545 }, { "auxiliary_loss_clip": 0.01142616, "auxiliary_loss_mlp": 0.01026128, "balance_loss_clip": 1.04703045, "balance_loss_mlp": 1.01899314, "epoch": 0.9376540612036314, "flos": 26104292640000.0, "grad_norm": 1.899525003814417, "language_loss": 0.76812065, "learning_rate": 4.0566653556985295e-08, "loss": 0.78980809, "num_input_tokens_seen": 168534600, "step": 7798, "time_per_iteration": 2.648106336593628 }, { "auxiliary_loss_clip": 0.01078044, "auxiliary_loss_mlp": 0.01031196, "balance_loss_clip": 1.04150116, "balance_loss_mlp": 1.02332568, "epoch": 0.9377743040942704, "flos": 19717580016000.0, "grad_norm": 2.2662505816505534, "language_loss": 0.81372726, "learning_rate": 4.0410705827159886e-08, "loss": 0.83481961, "num_input_tokens_seen": 168551895, "step": 7799, "time_per_iteration": 3.728672981262207 }, { "auxiliary_loss_clip": 0.01131786, "auxiliary_loss_mlp": 0.01024864, "balance_loss_clip": 1.04282522, "balance_loss_mlp": 1.01771104, "epoch": 0.9378945469849095, "flos": 15267530010240.0, "grad_norm": 2.276094402106386, "language_loss": 0.71303868, "learning_rate": 4.0255055365472356e-08, "loss": 0.73460519, "num_input_tokens_seen": 168569990, "step": 7800, "time_per_iteration": 3.863447904586792 }, { "auxiliary_loss_clip": 0.01090731, "auxiliary_loss_mlp": 0.01028486, "balance_loss_clip": 1.04156041, "balance_loss_mlp": 1.02104747, "epoch": 0.9380147898755486, "flos": 20591394174720.0, "grad_norm": 2.3359171502664484, "language_loss": 0.7507174, "learning_rate": 4.009970219553471e-08, "loss": 0.77190953, "num_input_tokens_seen": 168586940, "step": 7801, "time_per_iteration": 3.6546125411987305 }, { "auxiliary_loss_clip": 0.011596, "auxiliary_loss_mlp": 0.01025005, "balance_loss_clip": 1.04955077, "balance_loss_mlp": 1.01731598, "epoch": 0.9381350327661877, "flos": 26281116316800.0, "grad_norm": 2.301947683943587, "language_loss": 0.76668274, "learning_rate": 3.99446463409141e-08, "loss": 0.78852874, "num_input_tokens_seen": 168604795, "step": 7802, "time_per_iteration": 2.6385421752929688 }, { "auxiliary_loss_clip": 0.01158882, "auxiliary_loss_mlp": 0.01030301, "balance_loss_clip": 1.04729295, "balance_loss_mlp": 1.02254641, "epoch": 0.9382552756568268, "flos": 23586344225280.0, "grad_norm": 2.7207794609012335, "language_loss": 0.69037271, "learning_rate": 3.978988782513215e-08, "loss": 0.71226454, "num_input_tokens_seen": 168622290, "step": 7803, "time_per_iteration": 2.5850160121917725 }, { "auxiliary_loss_clip": 0.01157803, "auxiliary_loss_mlp": 0.01024741, "balance_loss_clip": 1.04825497, "balance_loss_mlp": 1.01714408, "epoch": 0.9383755185474659, "flos": 28438809275520.0, "grad_norm": 1.8343532831400553, "language_loss": 0.76370525, "learning_rate": 3.963542667166586e-08, "loss": 0.78553069, "num_input_tokens_seen": 168642395, "step": 7804, "time_per_iteration": 3.812718152999878 }, { "auxiliary_loss_clip": 0.01124167, "auxiliary_loss_mlp": 0.01031173, "balance_loss_clip": 1.04768074, "balance_loss_mlp": 1.02371061, "epoch": 0.938495761438105, "flos": 20449583280000.0, "grad_norm": 1.83468188275489, "language_loss": 0.68349719, "learning_rate": 3.9481262903946486e-08, "loss": 0.70505065, "num_input_tokens_seen": 168661840, "step": 7805, "time_per_iteration": 2.694857358932495 }, { "auxiliary_loss_clip": 0.01026104, "auxiliary_loss_mlp": 0.01003988, "balance_loss_clip": 1.01844096, "balance_loss_mlp": 1.00289726, "epoch": 0.938616004328744, "flos": 69302711658240.0, "grad_norm": 0.7741088482222581, "language_loss": 0.545017, "learning_rate": 3.932739654536066e-08, "loss": 0.56531787, "num_input_tokens_seen": 168724540, "step": 7806, "time_per_iteration": 3.2518324851989746 }, { "auxiliary_loss_clip": 0.01151083, "auxiliary_loss_mlp": 0.01028257, "balance_loss_clip": 1.0477283, "balance_loss_mlp": 1.02119112, "epoch": 0.9387362472193832, "flos": 18911636605440.0, "grad_norm": 2.54354796826402, "language_loss": 0.74193645, "learning_rate": 3.917382761925014e-08, "loss": 0.76372981, "num_input_tokens_seen": 168740375, "step": 7807, "time_per_iteration": 2.633033037185669 }, { "auxiliary_loss_clip": 0.0114853, "auxiliary_loss_mlp": 0.01023015, "balance_loss_clip": 1.047804, "balance_loss_mlp": 1.01599634, "epoch": 0.9388564901100223, "flos": 26501967089280.0, "grad_norm": 2.2316752998354112, "language_loss": 0.79144323, "learning_rate": 3.9020556148910754e-08, "loss": 0.81315869, "num_input_tokens_seen": 168759730, "step": 7808, "time_per_iteration": 2.5948941707611084 }, { "auxiliary_loss_clip": 0.01052255, "auxiliary_loss_mlp": 0.01005773, "balance_loss_clip": 1.0204668, "balance_loss_mlp": 1.00471234, "epoch": 0.9389767330006613, "flos": 58941083157120.0, "grad_norm": 0.7564263533950829, "language_loss": 0.56649488, "learning_rate": 3.8867582157593895e-08, "loss": 0.58707517, "num_input_tokens_seen": 168813935, "step": 7809, "time_per_iteration": 3.0892388820648193 }, { "auxiliary_loss_clip": 0.01152571, "auxiliary_loss_mlp": 0.0102878, "balance_loss_clip": 1.05127907, "balance_loss_mlp": 1.02185977, "epoch": 0.9390969758913005, "flos": 31102554994560.0, "grad_norm": 1.9125800061535942, "language_loss": 0.76867467, "learning_rate": 3.871490566850544e-08, "loss": 0.79048818, "num_input_tokens_seen": 168838145, "step": 7810, "time_per_iteration": 2.6980514526367188 }, { "auxiliary_loss_clip": 0.01132475, "auxiliary_loss_mlp": 0.01025189, "balance_loss_clip": 1.04649067, "balance_loss_mlp": 1.01865327, "epoch": 0.9392172187819395, "flos": 22419391173120.0, "grad_norm": 2.3068737649050988, "language_loss": 0.70884871, "learning_rate": 3.856252670480642e-08, "loss": 0.73042536, "num_input_tokens_seen": 168856805, "step": 7811, "time_per_iteration": 2.6004300117492676 }, { "auxiliary_loss_clip": 0.01132697, "auxiliary_loss_mlp": 0.01026895, "balance_loss_clip": 1.04368746, "balance_loss_mlp": 1.02028537, "epoch": 0.9393374616725786, "flos": 19719483436800.0, "grad_norm": 1.909159184354198, "language_loss": 0.81537771, "learning_rate": 3.841044528961279e-08, "loss": 0.83697367, "num_input_tokens_seen": 168874600, "step": 7812, "time_per_iteration": 2.63850998878479 }, { "auxiliary_loss_clip": 0.01168977, "auxiliary_loss_mlp": 0.01025143, "balance_loss_clip": 1.04781342, "balance_loss_mlp": 1.0181663, "epoch": 0.9394577045632178, "flos": 24170215800960.0, "grad_norm": 1.8924897928675364, "language_loss": 0.79065979, "learning_rate": 3.825866144599477e-08, "loss": 0.81260097, "num_input_tokens_seen": 168893655, "step": 7813, "time_per_iteration": 2.5692996978759766 }, { "auxiliary_loss_clip": 0.01135133, "auxiliary_loss_mlp": 0.01022383, "balance_loss_clip": 1.04321516, "balance_loss_mlp": 1.01508176, "epoch": 0.9395779474538568, "flos": 19023929498880.0, "grad_norm": 2.597051933918788, "language_loss": 0.755337, "learning_rate": 3.8107175196978145e-08, "loss": 0.77691221, "num_input_tokens_seen": 168909960, "step": 7814, "time_per_iteration": 2.6410844326019287 }, { "auxiliary_loss_clip": 0.01118155, "auxiliary_loss_mlp": 0.01023648, "balance_loss_clip": 1.04440165, "balance_loss_mlp": 1.01660836, "epoch": 0.9396981903444959, "flos": 14319129260160.0, "grad_norm": 2.0347631721449644, "language_loss": 0.7687521, "learning_rate": 3.7955986565542996e-08, "loss": 0.79017019, "num_input_tokens_seen": 168928040, "step": 7815, "time_per_iteration": 2.641728162765503 }, { "auxiliary_loss_clip": 0.01118325, "auxiliary_loss_mlp": 0.0102975, "balance_loss_clip": 1.04265058, "balance_loss_mlp": 1.02190638, "epoch": 0.9398184332351349, "flos": 34787564202240.0, "grad_norm": 1.973273470291568, "language_loss": 0.68375874, "learning_rate": 3.780509557462497e-08, "loss": 0.70523947, "num_input_tokens_seen": 168948240, "step": 7816, "time_per_iteration": 2.8036296367645264 }, { "auxiliary_loss_clip": 0.011373, "auxiliary_loss_mlp": 0.01022719, "balance_loss_clip": 1.04668927, "balance_loss_mlp": 1.01522338, "epoch": 0.9399386761257741, "flos": 25372253462400.0, "grad_norm": 1.665733396223614, "language_loss": 0.75498259, "learning_rate": 3.765450224711375e-08, "loss": 0.77658278, "num_input_tokens_seen": 168968745, "step": 7817, "time_per_iteration": 2.6770942211151123 }, { "auxiliary_loss_clip": 0.01131393, "auxiliary_loss_mlp": 0.01021723, "balance_loss_clip": 1.04618704, "balance_loss_mlp": 1.01451075, "epoch": 0.9400589190164131, "flos": 27304965584640.0, "grad_norm": 2.7537807891105137, "language_loss": 0.79848331, "learning_rate": 3.750420660585396e-08, "loss": 0.82001448, "num_input_tokens_seen": 168990685, "step": 7818, "time_per_iteration": 2.7229652404785156 }, { "auxiliary_loss_clip": 0.01167372, "auxiliary_loss_mlp": 0.01027606, "balance_loss_clip": 1.04938436, "balance_loss_mlp": 1.02080214, "epoch": 0.9401791619070522, "flos": 23399859790080.0, "grad_norm": 1.974498791181418, "language_loss": 0.79559177, "learning_rate": 3.735420867364603e-08, "loss": 0.81754154, "num_input_tokens_seen": 169011665, "step": 7819, "time_per_iteration": 2.573296070098877 }, { "auxiliary_loss_clip": 0.01080414, "auxiliary_loss_mlp": 0.01022208, "balance_loss_clip": 1.03859532, "balance_loss_mlp": 1.01558042, "epoch": 0.9402994047976914, "flos": 35881403120640.0, "grad_norm": 1.7047212048872755, "language_loss": 0.61419022, "learning_rate": 3.7204508473244186e-08, "loss": 0.63521647, "num_input_tokens_seen": 169035290, "step": 7820, "time_per_iteration": 2.882020950317383 }, { "auxiliary_loss_clip": 0.01068246, "auxiliary_loss_mlp": 0.01028332, "balance_loss_clip": 1.04062033, "balance_loss_mlp": 1.02165008, "epoch": 0.9404196476883304, "flos": 22236821320320.0, "grad_norm": 1.580961844976667, "language_loss": 0.69155908, "learning_rate": 3.7055106027357395e-08, "loss": 0.71252483, "num_input_tokens_seen": 169055155, "step": 7821, "time_per_iteration": 2.857935905456543 }, { "auxiliary_loss_clip": 0.0114879, "auxiliary_loss_mlp": 0.01027779, "balance_loss_clip": 1.04781783, "balance_loss_mlp": 1.02074838, "epoch": 0.9405398905789695, "flos": 18915802583040.0, "grad_norm": 4.629105456559287, "language_loss": 0.71848321, "learning_rate": 3.690600135865063e-08, "loss": 0.74024892, "num_input_tokens_seen": 169072080, "step": 7822, "time_per_iteration": 3.0942506790161133 }, { "auxiliary_loss_clip": 0.01024103, "auxiliary_loss_mlp": 0.01005637, "balance_loss_clip": 1.01870227, "balance_loss_mlp": 1.00456417, "epoch": 0.9406601334696086, "flos": 70274130048000.0, "grad_norm": 0.7885236082722452, "language_loss": 0.58108974, "learning_rate": 3.675719448974246e-08, "loss": 0.60138714, "num_input_tokens_seen": 169137170, "step": 7823, "time_per_iteration": 3.3763816356658936 }, { "auxiliary_loss_clip": 0.01101487, "auxiliary_loss_mlp": 0.00711846, "balance_loss_clip": 1.04230952, "balance_loss_mlp": 1.00058472, "epoch": 0.9407803763602477, "flos": 22165071903360.0, "grad_norm": 1.7956068942931789, "language_loss": 0.60559589, "learning_rate": 3.6608685443207054e-08, "loss": 0.62372923, "num_input_tokens_seen": 169156320, "step": 7824, "time_per_iteration": 2.7189836502075195 }, { "auxiliary_loss_clip": 0.01120503, "auxiliary_loss_mlp": 0.01027079, "balance_loss_clip": 1.04436564, "balance_loss_mlp": 1.02008772, "epoch": 0.9409006192508867, "flos": 18879496911360.0, "grad_norm": 2.5065051975546755, "language_loss": 0.66740644, "learning_rate": 3.646047424157306e-08, "loss": 0.68888223, "num_input_tokens_seen": 169173295, "step": 7825, "time_per_iteration": 4.10818338394165 }, { "auxiliary_loss_clip": 0.01136867, "auxiliary_loss_mlp": 0.01031637, "balance_loss_clip": 1.04764187, "balance_loss_mlp": 1.02428162, "epoch": 0.9410208621415259, "flos": 23368258800000.0, "grad_norm": 3.665108191670531, "language_loss": 0.68726701, "learning_rate": 3.631256090732382e-08, "loss": 0.70895207, "num_input_tokens_seen": 169193755, "step": 7826, "time_per_iteration": 3.6688477993011475 }, { "auxiliary_loss_clip": 0.01123982, "auxiliary_loss_mlp": 0.0102766, "balance_loss_clip": 1.04736364, "balance_loss_mlp": 1.02041185, "epoch": 0.941141105032165, "flos": 22742227635840.0, "grad_norm": 1.9226828011824761, "language_loss": 0.82779539, "learning_rate": 3.6164945462897833e-08, "loss": 0.84931183, "num_input_tokens_seen": 169213045, "step": 7827, "time_per_iteration": 3.623195171356201 }, { "auxiliary_loss_clip": 0.01150894, "auxiliary_loss_mlp": 0.00711, "balance_loss_clip": 1.04768062, "balance_loss_mlp": 1.0006988, "epoch": 0.941261347922804, "flos": 20704908130560.0, "grad_norm": 1.8177021499394364, "language_loss": 0.75821817, "learning_rate": 3.6017627930687856e-08, "loss": 0.77683711, "num_input_tokens_seen": 169232870, "step": 7828, "time_per_iteration": 2.557610511779785 }, { "auxiliary_loss_clip": 0.01100087, "auxiliary_loss_mlp": 0.01026764, "balance_loss_clip": 1.04122663, "balance_loss_mlp": 1.01951039, "epoch": 0.9413815908134432, "flos": 19421998997760.0, "grad_norm": 2.534132476186155, "language_loss": 0.77329642, "learning_rate": 3.587060833304267e-08, "loss": 0.79456496, "num_input_tokens_seen": 169251060, "step": 7829, "time_per_iteration": 2.6847879886627197 }, { "auxiliary_loss_clip": 0.01156172, "auxiliary_loss_mlp": 0.01023039, "balance_loss_clip": 1.04874539, "balance_loss_mlp": 1.0163306, "epoch": 0.9415018337040822, "flos": 17493452853120.0, "grad_norm": 2.1959327005074325, "language_loss": 0.64065373, "learning_rate": 3.5723886692264225e-08, "loss": 0.66244584, "num_input_tokens_seen": 169268600, "step": 7830, "time_per_iteration": 3.5291175842285156 }, { "auxiliary_loss_clip": 0.01134987, "auxiliary_loss_mlp": 0.0102552, "balance_loss_clip": 1.04463506, "balance_loss_mlp": 1.01873446, "epoch": 0.9416220765947213, "flos": 31831613343360.0, "grad_norm": 2.5411944012950243, "language_loss": 0.6192627, "learning_rate": 3.557746303061071e-08, "loss": 0.64086771, "num_input_tokens_seen": 169290355, "step": 7831, "time_per_iteration": 2.7081809043884277 }, { "auxiliary_loss_clip": 0.01135196, "auxiliary_loss_mlp": 0.01027151, "balance_loss_clip": 1.0453397, "balance_loss_mlp": 1.01964068, "epoch": 0.9417423194853605, "flos": 23511973115520.0, "grad_norm": 2.1822287757927183, "language_loss": 0.72546411, "learning_rate": 3.543133737029391e-08, "loss": 0.7470876, "num_input_tokens_seen": 169310865, "step": 7832, "time_per_iteration": 2.684506416320801 }, { "auxiliary_loss_clip": 0.0115675, "auxiliary_loss_mlp": 0.01032037, "balance_loss_clip": 1.04836249, "balance_loss_mlp": 1.02457726, "epoch": 0.9418625623759995, "flos": 23915106432000.0, "grad_norm": 1.947029923418235, "language_loss": 0.69182575, "learning_rate": 3.5285509733481214e-08, "loss": 0.71371365, "num_input_tokens_seen": 169330590, "step": 7833, "time_per_iteration": 2.6417126655578613 }, { "auxiliary_loss_clip": 0.01147687, "auxiliary_loss_mlp": 0.01027537, "balance_loss_clip": 1.04575682, "balance_loss_mlp": 1.0205574, "epoch": 0.9419828052666386, "flos": 18076965292800.0, "grad_norm": 1.792107643885485, "language_loss": 0.76954716, "learning_rate": 3.513998014229469e-08, "loss": 0.7912994, "num_input_tokens_seen": 169349540, "step": 7834, "time_per_iteration": 2.5712404251098633 }, { "auxiliary_loss_clip": 0.01139259, "auxiliary_loss_mlp": 0.01023824, "balance_loss_clip": 1.04709589, "balance_loss_mlp": 1.01706779, "epoch": 0.9421030481572777, "flos": 17712328377600.0, "grad_norm": 2.7465634216605825, "language_loss": 0.86685693, "learning_rate": 3.499474861881069e-08, "loss": 0.88848776, "num_input_tokens_seen": 169366765, "step": 7835, "time_per_iteration": 2.615954875946045 }, { "auxiliary_loss_clip": 0.01089317, "auxiliary_loss_mlp": 0.01024547, "balance_loss_clip": 1.04280388, "balance_loss_mlp": 1.01712656, "epoch": 0.9422232910479168, "flos": 20194114775040.0, "grad_norm": 2.5095302691065386, "language_loss": 0.67906916, "learning_rate": 3.4849815185061136e-08, "loss": 0.70020783, "num_input_tokens_seen": 169386655, "step": 7836, "time_per_iteration": 2.7670962810516357 }, { "auxiliary_loss_clip": 0.01152195, "auxiliary_loss_mlp": 0.01028552, "balance_loss_clip": 1.04633808, "balance_loss_mlp": 1.02157784, "epoch": 0.9423435339385559, "flos": 18442571875200.0, "grad_norm": 2.064230188161012, "language_loss": 0.76182735, "learning_rate": 3.470517986303223e-08, "loss": 0.7836349, "num_input_tokens_seen": 169405640, "step": 7837, "time_per_iteration": 2.5897910594940186 }, { "auxiliary_loss_clip": 0.01121313, "auxiliary_loss_mlp": 0.01028457, "balance_loss_clip": 1.04778171, "balance_loss_mlp": 1.02124834, "epoch": 0.942463776829195, "flos": 20080636732800.0, "grad_norm": 1.8894162492298396, "language_loss": 0.79389375, "learning_rate": 3.4560842674664856e-08, "loss": 0.81539142, "num_input_tokens_seen": 169424155, "step": 7838, "time_per_iteration": 2.6739532947540283 }, { "auxiliary_loss_clip": 0.01154077, "auxiliary_loss_mlp": 0.01023147, "balance_loss_clip": 1.0457356, "balance_loss_mlp": 1.01599991, "epoch": 0.9425840197198341, "flos": 22636255536000.0, "grad_norm": 2.219807349321506, "language_loss": 0.75586724, "learning_rate": 3.441680364185506e-08, "loss": 0.77763945, "num_input_tokens_seen": 169444025, "step": 7839, "time_per_iteration": 2.6017801761627197 }, { "auxiliary_loss_clip": 0.01141883, "auxiliary_loss_mlp": 0.01027849, "balance_loss_clip": 1.04928637, "balance_loss_mlp": 1.02027893, "epoch": 0.9427042626104731, "flos": 19937892084480.0, "grad_norm": 3.396015833647719, "language_loss": 0.75057149, "learning_rate": 3.427306278645314e-08, "loss": 0.77226883, "num_input_tokens_seen": 169462480, "step": 7840, "time_per_iteration": 2.637570858001709 }, { "auxiliary_loss_clip": 0.01106997, "auxiliary_loss_mlp": 0.01024362, "balance_loss_clip": 1.04626286, "balance_loss_mlp": 1.01748073, "epoch": 0.9428245055011123, "flos": 22856998567680.0, "grad_norm": 1.849736496203068, "language_loss": 0.72955573, "learning_rate": 3.4129620130264767e-08, "loss": 0.75086933, "num_input_tokens_seen": 169480840, "step": 7841, "time_per_iteration": 2.682203531265259 }, { "auxiliary_loss_clip": 0.01139395, "auxiliary_loss_mlp": 0.00711199, "balance_loss_clip": 1.04708183, "balance_loss_mlp": 1.00056958, "epoch": 0.9429447483917514, "flos": 20951757371520.0, "grad_norm": 3.4214828853565136, "language_loss": 0.78457546, "learning_rate": 3.398647569505009e-08, "loss": 0.80308139, "num_input_tokens_seen": 169498265, "step": 7842, "time_per_iteration": 2.658459424972534 }, { "auxiliary_loss_clip": 0.01129738, "auxiliary_loss_mlp": 0.01026666, "balance_loss_clip": 1.04763734, "balance_loss_mlp": 1.01897669, "epoch": 0.9430649912823904, "flos": 18843658116480.0, "grad_norm": 3.572950467936353, "language_loss": 0.7449187, "learning_rate": 3.384362950252373e-08, "loss": 0.76648271, "num_input_tokens_seen": 169515235, "step": 7843, "time_per_iteration": 2.6263318061828613 }, { "auxiliary_loss_clip": 0.01134866, "auxiliary_loss_mlp": 0.01025418, "balance_loss_clip": 1.04488885, "balance_loss_mlp": 1.01817656, "epoch": 0.9431852341730296, "flos": 32556038837760.0, "grad_norm": 2.181346267561948, "language_loss": 0.57057524, "learning_rate": 3.3701081574355473e-08, "loss": 0.59217811, "num_input_tokens_seen": 169537195, "step": 7844, "time_per_iteration": 2.7500181198120117 }, { "auxiliary_loss_clip": 0.01049908, "auxiliary_loss_mlp": 0.01004106, "balance_loss_clip": 1.01928902, "balance_loss_mlp": 1.00309229, "epoch": 0.9433054770636686, "flos": 66904490252160.0, "grad_norm": 0.6437103037042434, "language_loss": 0.51592273, "learning_rate": 3.3558831932169796e-08, "loss": 0.53646284, "num_input_tokens_seen": 169605865, "step": 7845, "time_per_iteration": 3.2646923065185547 }, { "auxiliary_loss_clip": 0.01151178, "auxiliary_loss_mlp": 0.01021373, "balance_loss_clip": 1.04805303, "balance_loss_mlp": 1.01430082, "epoch": 0.9434257199543077, "flos": 26140346916480.0, "grad_norm": 2.554434103762977, "language_loss": 0.88475764, "learning_rate": 3.341688059754588e-08, "loss": 0.90648317, "num_input_tokens_seen": 169621520, "step": 7846, "time_per_iteration": 2.6746644973754883 }, { "auxiliary_loss_clip": 0.011278, "auxiliary_loss_mlp": 0.00710851, "balance_loss_clip": 1.04423714, "balance_loss_mlp": 1.0005908, "epoch": 0.9435459628449467, "flos": 25003486483200.0, "grad_norm": 2.4062927568787544, "language_loss": 0.77816844, "learning_rate": 3.327522759201762e-08, "loss": 0.79655504, "num_input_tokens_seen": 169641390, "step": 7847, "time_per_iteration": 2.682467222213745 }, { "auxiliary_loss_clip": 0.01118419, "auxiliary_loss_mlp": 0.01021714, "balance_loss_clip": 1.04442525, "balance_loss_mlp": 1.01489484, "epoch": 0.9436662057355859, "flos": 22163240309760.0, "grad_norm": 2.320242935925659, "language_loss": 0.66886377, "learning_rate": 3.313387293707359e-08, "loss": 0.69026512, "num_input_tokens_seen": 169660095, "step": 7848, "time_per_iteration": 2.655116319656372 }, { "auxiliary_loss_clip": 0.01116858, "auxiliary_loss_mlp": 0.01024145, "balance_loss_clip": 1.04603815, "balance_loss_mlp": 1.01677752, "epoch": 0.943786448626225, "flos": 20118522602880.0, "grad_norm": 3.4046805031562823, "language_loss": 0.68266028, "learning_rate": 3.29928166541571e-08, "loss": 0.70407033, "num_input_tokens_seen": 169679050, "step": 7849, "time_per_iteration": 2.6469051837921143 }, { "auxiliary_loss_clip": 0.01127261, "auxiliary_loss_mlp": 0.01022744, "balance_loss_clip": 1.04460287, "balance_loss_mlp": 1.0155468, "epoch": 0.943906691516864, "flos": 22090808534400.0, "grad_norm": 2.3260169669130626, "language_loss": 0.80649418, "learning_rate": 3.2852058764666346e-08, "loss": 0.82799423, "num_input_tokens_seen": 169698150, "step": 7850, "time_per_iteration": 2.612039566040039 }, { "auxiliary_loss_clip": 0.01111062, "auxiliary_loss_mlp": 0.01024445, "balance_loss_clip": 1.04624534, "balance_loss_mlp": 1.01776934, "epoch": 0.9440269344075032, "flos": 35298501212160.0, "grad_norm": 1.9607285799300351, "language_loss": 0.68675971, "learning_rate": 3.2711599289954264e-08, "loss": 0.7081148, "num_input_tokens_seen": 169722185, "step": 7851, "time_per_iteration": 3.748291492462158 }, { "auxiliary_loss_clip": 0.01090299, "auxiliary_loss_mlp": 0.01025068, "balance_loss_clip": 1.04252326, "balance_loss_mlp": 1.01820123, "epoch": 0.9441471772981422, "flos": 19238136255360.0, "grad_norm": 2.254696336844132, "language_loss": 0.77569199, "learning_rate": 3.257143825132847e-08, "loss": 0.79684567, "num_input_tokens_seen": 169740355, "step": 7852, "time_per_iteration": 3.7110512256622314 }, { "auxiliary_loss_clip": 0.01134474, "auxiliary_loss_mlp": 0.0102128, "balance_loss_clip": 1.04462409, "balance_loss_mlp": 1.01463962, "epoch": 0.9442674201887813, "flos": 25739799379200.0, "grad_norm": 1.767761665916759, "language_loss": 0.75988227, "learning_rate": 3.243157567005106e-08, "loss": 0.78143978, "num_input_tokens_seen": 169758535, "step": 7853, "time_per_iteration": 3.5978376865386963 }, { "auxiliary_loss_clip": 0.01173505, "auxiliary_loss_mlp": 0.01025934, "balance_loss_clip": 1.05287671, "balance_loss_mlp": 1.01831079, "epoch": 0.9443876630794205, "flos": 15523321737600.0, "grad_norm": 2.201172227967302, "language_loss": 0.64320773, "learning_rate": 3.2292011567339296e-08, "loss": 0.66520214, "num_input_tokens_seen": 169776340, "step": 7854, "time_per_iteration": 2.5847954750061035 }, { "auxiliary_loss_clip": 0.01152827, "auxiliary_loss_mlp": 0.007111, "balance_loss_clip": 1.04621065, "balance_loss_mlp": 1.00052953, "epoch": 0.9445079059700595, "flos": 13400821128960.0, "grad_norm": 2.3694146767372293, "language_loss": 0.56373024, "learning_rate": 3.21527459643649e-08, "loss": 0.58236945, "num_input_tokens_seen": 169793225, "step": 7855, "time_per_iteration": 2.6014575958251953 }, { "auxiliary_loss_clip": 0.01159876, "auxiliary_loss_mlp": 0.01027002, "balance_loss_clip": 1.05152261, "balance_loss_mlp": 1.02004576, "epoch": 0.9446281488606986, "flos": 23659242877440.0, "grad_norm": 3.4692685266537437, "language_loss": 0.74062526, "learning_rate": 3.2013778882254536e-08, "loss": 0.76249403, "num_input_tokens_seen": 169812020, "step": 7856, "time_per_iteration": 3.507535457611084 }, { "auxiliary_loss_clip": 0.01144631, "auxiliary_loss_mlp": 0.01024683, "balance_loss_clip": 1.046386, "balance_loss_mlp": 1.01783407, "epoch": 0.9447483917513377, "flos": 25557337267200.0, "grad_norm": 3.3160088062175483, "language_loss": 0.75591087, "learning_rate": 3.1875110342088676e-08, "loss": 0.77760398, "num_input_tokens_seen": 169833470, "step": 7857, "time_per_iteration": 2.6916344165802 }, { "auxiliary_loss_clip": 0.01132029, "auxiliary_loss_mlp": 0.01025342, "balance_loss_clip": 1.04675961, "balance_loss_mlp": 1.01830888, "epoch": 0.9448686346419768, "flos": 24535463247360.0, "grad_norm": 1.8815249314024285, "language_loss": 0.65476578, "learning_rate": 3.1736740364904035e-08, "loss": 0.67633951, "num_input_tokens_seen": 169854000, "step": 7858, "time_per_iteration": 2.6264636516571045 }, { "auxiliary_loss_clip": 0.01103711, "auxiliary_loss_mlp": 0.00711642, "balance_loss_clip": 1.04400468, "balance_loss_mlp": 1.00052571, "epoch": 0.9449888775326158, "flos": 14721256995840.0, "grad_norm": 2.0071251377350623, "language_loss": 0.77186543, "learning_rate": 3.159866897169094e-08, "loss": 0.79001898, "num_input_tokens_seen": 169872200, "step": 7859, "time_per_iteration": 2.7088534832000732 }, { "auxiliary_loss_clip": 0.01131108, "auxiliary_loss_mlp": 0.01025451, "balance_loss_clip": 1.04821551, "balance_loss_mlp": 1.01799178, "epoch": 0.945109120423255, "flos": 15447873219840.0, "grad_norm": 1.9842360445096403, "language_loss": 0.75741148, "learning_rate": 3.146089618339487e-08, "loss": 0.77897704, "num_input_tokens_seen": 169889055, "step": 7860, "time_per_iteration": 2.6158595085144043 }, { "auxiliary_loss_clip": 0.01121946, "auxiliary_loss_mlp": 0.01028907, "balance_loss_clip": 1.04544044, "balance_loss_mlp": 1.02134907, "epoch": 0.9452293633138941, "flos": 25448097029760.0, "grad_norm": 1.9265854172504928, "language_loss": 0.67978847, "learning_rate": 3.132342202091554e-08, "loss": 0.70129704, "num_input_tokens_seen": 169909280, "step": 7861, "time_per_iteration": 2.744629383087158 }, { "auxiliary_loss_clip": 0.01169449, "auxiliary_loss_mlp": 0.01030552, "balance_loss_clip": 1.04923129, "balance_loss_mlp": 1.02254713, "epoch": 0.9453496062045331, "flos": 21215342350080.0, "grad_norm": 2.2047756790491944, "language_loss": 0.68399417, "learning_rate": 3.1186246505107595e-08, "loss": 0.70599425, "num_input_tokens_seen": 169928420, "step": 7862, "time_per_iteration": 2.530388355255127 }, { "auxiliary_loss_clip": 0.01152091, "auxiliary_loss_mlp": 0.01026552, "balance_loss_clip": 1.04847097, "balance_loss_mlp": 1.0198642, "epoch": 0.9454698490951723, "flos": 20010898477440.0, "grad_norm": 1.8623798228373574, "language_loss": 0.83780354, "learning_rate": 3.104936965678084e-08, "loss": 0.85958993, "num_input_tokens_seen": 169946750, "step": 7863, "time_per_iteration": 2.6142895221710205 }, { "auxiliary_loss_clip": 0.01153847, "auxiliary_loss_mlp": 0.01034247, "balance_loss_clip": 1.04733229, "balance_loss_mlp": 1.02617097, "epoch": 0.9455900919858113, "flos": 21069652786560.0, "grad_norm": 2.738866928164414, "language_loss": 0.82145244, "learning_rate": 3.091279149669956e-08, "loss": 0.84333336, "num_input_tokens_seen": 169965540, "step": 7864, "time_per_iteration": 2.587498188018799 }, { "auxiliary_loss_clip": 0.01154025, "auxiliary_loss_mlp": 0.00711289, "balance_loss_clip": 1.04931736, "balance_loss_mlp": 1.0005331, "epoch": 0.9457103348764504, "flos": 20740854666240.0, "grad_norm": 2.2582829929756403, "language_loss": 0.73782778, "learning_rate": 3.0776512045581624e-08, "loss": 0.75648093, "num_input_tokens_seen": 169984330, "step": 7865, "time_per_iteration": 2.6015055179595947 }, { "auxiliary_loss_clip": 0.01129786, "auxiliary_loss_mlp": 0.01027137, "balance_loss_clip": 1.04664457, "balance_loss_mlp": 1.01967454, "epoch": 0.9458305777670896, "flos": 21428363957760.0, "grad_norm": 1.9007484232947591, "language_loss": 0.77727532, "learning_rate": 3.0640531324101384e-08, "loss": 0.79884458, "num_input_tokens_seen": 170002095, "step": 7866, "time_per_iteration": 2.6185755729675293 }, { "auxiliary_loss_clip": 0.01152992, "auxiliary_loss_mlp": 0.0102637, "balance_loss_clip": 1.05022621, "balance_loss_mlp": 1.01922905, "epoch": 0.9459508206577286, "flos": 20011185786240.0, "grad_norm": 1.9893294357372417, "language_loss": 0.76233989, "learning_rate": 3.0504849352886554e-08, "loss": 0.78413355, "num_input_tokens_seen": 170020240, "step": 7867, "time_per_iteration": 2.620314121246338 }, { "auxiliary_loss_clip": 0.01150462, "auxiliary_loss_mlp": 0.01025518, "balance_loss_clip": 1.04763544, "balance_loss_mlp": 1.01871133, "epoch": 0.9460710635483677, "flos": 12166428291840.0, "grad_norm": 3.1895026933133153, "language_loss": 0.71763086, "learning_rate": 3.036946615252023e-08, "loss": 0.73939073, "num_input_tokens_seen": 170035770, "step": 7868, "time_per_iteration": 2.545126438140869 }, { "auxiliary_loss_clip": 0.01143118, "auxiliary_loss_mlp": 0.01033894, "balance_loss_clip": 1.0478369, "balance_loss_mlp": 1.02690506, "epoch": 0.9461913064390068, "flos": 34276196229120.0, "grad_norm": 2.6164856678270483, "language_loss": 0.66874468, "learning_rate": 3.0234381743539984e-08, "loss": 0.6905148, "num_input_tokens_seen": 170053385, "step": 7869, "time_per_iteration": 2.674633026123047 }, { "auxiliary_loss_clip": 0.01140728, "auxiliary_loss_mlp": 0.01024511, "balance_loss_clip": 1.04609466, "balance_loss_mlp": 1.01774895, "epoch": 0.9463115493296459, "flos": 19463763536640.0, "grad_norm": 2.145839409632728, "language_loss": 0.80257952, "learning_rate": 3.0099596146437863e-08, "loss": 0.82423186, "num_input_tokens_seen": 170070490, "step": 7870, "time_per_iteration": 2.5903475284576416 }, { "auxiliary_loss_clip": 0.01071905, "auxiliary_loss_mlp": 0.01005003, "balance_loss_clip": 1.01879799, "balance_loss_mlp": 1.00392389, "epoch": 0.946431792220285, "flos": 70570824387840.0, "grad_norm": 0.7729225998145298, "language_loss": 0.60024095, "learning_rate": 2.996510938166086e-08, "loss": 0.62100995, "num_input_tokens_seen": 170133465, "step": 7871, "time_per_iteration": 3.2130684852600098 }, { "auxiliary_loss_clip": 0.01152258, "auxiliary_loss_mlp": 0.01025747, "balance_loss_clip": 1.04941916, "balance_loss_mlp": 1.01880622, "epoch": 0.9465520351109241, "flos": 18947906363520.0, "grad_norm": 2.05323182817094, "language_loss": 0.73877239, "learning_rate": 2.983092146960997e-08, "loss": 0.76055241, "num_input_tokens_seen": 170150810, "step": 7872, "time_per_iteration": 2.6176562309265137 }, { "auxiliary_loss_clip": 0.01137478, "auxiliary_loss_mlp": 0.0102602, "balance_loss_clip": 1.04511881, "balance_loss_mlp": 1.01895952, "epoch": 0.9466722780015632, "flos": 19135647774720.0, "grad_norm": 2.1489102124506063, "language_loss": 0.79885119, "learning_rate": 2.9697032430642256e-08, "loss": 0.82048619, "num_input_tokens_seen": 170169025, "step": 7873, "time_per_iteration": 2.64044189453125 }, { "auxiliary_loss_clip": 0.01164226, "auxiliary_loss_mlp": 0.01022822, "balance_loss_clip": 1.04781413, "balance_loss_mlp": 1.01613772, "epoch": 0.9467925208922022, "flos": 17237912520960.0, "grad_norm": 2.388595959418048, "language_loss": 0.73783249, "learning_rate": 2.9563442285067906e-08, "loss": 0.75970292, "num_input_tokens_seen": 170186070, "step": 7874, "time_per_iteration": 2.6008005142211914 }, { "auxiliary_loss_clip": 0.01155012, "auxiliary_loss_mlp": 0.01031446, "balance_loss_clip": 1.04887807, "balance_loss_mlp": 1.02412403, "epoch": 0.9469127637828414, "flos": 29169016859520.0, "grad_norm": 2.9567418107020056, "language_loss": 0.7970767, "learning_rate": 2.943015105315294e-08, "loss": 0.8189413, "num_input_tokens_seen": 170206265, "step": 7875, "time_per_iteration": 2.633765697479248 }, { "auxiliary_loss_clip": 0.01106375, "auxiliary_loss_mlp": 0.01026978, "balance_loss_clip": 1.04141593, "balance_loss_mlp": 1.01913965, "epoch": 0.9470330066734804, "flos": 26030460234240.0, "grad_norm": 3.235377207927021, "language_loss": 0.66619587, "learning_rate": 2.929715875511718e-08, "loss": 0.68752944, "num_input_tokens_seen": 170225300, "step": 7876, "time_per_iteration": 3.6798274517059326 }, { "auxiliary_loss_clip": 0.01152226, "auxiliary_loss_mlp": 0.01024299, "balance_loss_clip": 1.04539967, "balance_loss_mlp": 1.01736701, "epoch": 0.9471532495641195, "flos": 23440906056960.0, "grad_norm": 2.0016213759422508, "language_loss": 0.70098746, "learning_rate": 2.9164465411135375e-08, "loss": 0.72275269, "num_input_tokens_seen": 170245070, "step": 7877, "time_per_iteration": 2.662260055541992 }, { "auxiliary_loss_clip": 0.01152556, "auxiliary_loss_mlp": 0.01027093, "balance_loss_clip": 1.0490129, "balance_loss_mlp": 1.01969051, "epoch": 0.9472734924547586, "flos": 15815850099840.0, "grad_norm": 2.0435673397750693, "language_loss": 0.81155831, "learning_rate": 2.9032071041337426e-08, "loss": 0.83335477, "num_input_tokens_seen": 170263305, "step": 7878, "time_per_iteration": 3.594062089920044 }, { "auxiliary_loss_clip": 0.01128082, "auxiliary_loss_mlp": 0.0102384, "balance_loss_clip": 1.0457859, "balance_loss_mlp": 1.01712537, "epoch": 0.9473937353453977, "flos": 11181793697280.0, "grad_norm": 1.7300781747176153, "language_loss": 0.73065805, "learning_rate": 2.889997566580704e-08, "loss": 0.75217724, "num_input_tokens_seen": 170281460, "step": 7879, "time_per_iteration": 3.5344204902648926 }, { "auxiliary_loss_clip": 0.01170627, "auxiliary_loss_mlp": 0.01028098, "balance_loss_clip": 1.04948592, "balance_loss_mlp": 1.02055204, "epoch": 0.9475139782360368, "flos": 25775530433280.0, "grad_norm": 1.6926553479357478, "language_loss": 0.70210922, "learning_rate": 2.8768179304583086e-08, "loss": 0.72409642, "num_input_tokens_seen": 170303515, "step": 7880, "time_per_iteration": 2.5961945056915283 }, { "auxiliary_loss_clip": 0.01120683, "auxiliary_loss_mlp": 0.01026998, "balance_loss_clip": 1.04705846, "balance_loss_mlp": 1.01985478, "epoch": 0.9476342211266758, "flos": 22820046451200.0, "grad_norm": 1.8407696812398227, "language_loss": 0.73694801, "learning_rate": 2.8636681977659117e-08, "loss": 0.75842488, "num_input_tokens_seen": 170323165, "step": 7881, "time_per_iteration": 3.6188809871673584 }, { "auxiliary_loss_clip": 0.01104127, "auxiliary_loss_mlp": 0.01028457, "balance_loss_clip": 1.04512477, "balance_loss_mlp": 1.02076197, "epoch": 0.947754464017315, "flos": 20193611984640.0, "grad_norm": 3.95320661231282, "language_loss": 0.77972728, "learning_rate": 2.850548370498318e-08, "loss": 0.80105311, "num_input_tokens_seen": 170341005, "step": 7882, "time_per_iteration": 2.716608762741089 }, { "auxiliary_loss_clip": 0.01150398, "auxiliary_loss_mlp": 0.01022033, "balance_loss_clip": 1.04529619, "balance_loss_mlp": 1.01559007, "epoch": 0.9478747069079541, "flos": 24717925359360.0, "grad_norm": 1.5601206721090595, "language_loss": 0.71675885, "learning_rate": 2.8374584506457798e-08, "loss": 0.73848307, "num_input_tokens_seen": 170362280, "step": 7883, "time_per_iteration": 2.622467517852783 }, { "auxiliary_loss_clip": 0.01136268, "auxiliary_loss_mlp": 0.01027593, "balance_loss_clip": 1.04724801, "balance_loss_mlp": 1.02017784, "epoch": 0.9479949497985931, "flos": 21361355136000.0, "grad_norm": 4.020158195570225, "language_loss": 0.67554033, "learning_rate": 2.824398440193998e-08, "loss": 0.69717896, "num_input_tokens_seen": 170381080, "step": 7884, "time_per_iteration": 2.6531076431274414 }, { "auxiliary_loss_clip": 0.01096683, "auxiliary_loss_mlp": 0.01026258, "balance_loss_clip": 1.04025376, "balance_loss_mlp": 1.01883113, "epoch": 0.9481151926892323, "flos": 18148606968960.0, "grad_norm": 3.1501157566410445, "language_loss": 0.71266377, "learning_rate": 2.811368341124232e-08, "loss": 0.7338931, "num_input_tokens_seen": 170400150, "step": 7885, "time_per_iteration": 2.643265962600708 }, { "auxiliary_loss_clip": 0.01136358, "auxiliary_loss_mlp": 0.01022702, "balance_loss_clip": 1.04637766, "balance_loss_mlp": 1.01576972, "epoch": 0.9482354355798713, "flos": 22128012046080.0, "grad_norm": 2.600138672403829, "language_loss": 0.68027782, "learning_rate": 2.7983681554131222e-08, "loss": 0.70186847, "num_input_tokens_seen": 170420410, "step": 7886, "time_per_iteration": 2.7517337799072266 }, { "auxiliary_loss_clip": 0.01135774, "auxiliary_loss_mlp": 0.01026259, "balance_loss_clip": 1.04584765, "balance_loss_mlp": 1.01905251, "epoch": 0.9483556784705104, "flos": 19063072344960.0, "grad_norm": 2.4631546469329613, "language_loss": 0.70298505, "learning_rate": 2.7853978850327365e-08, "loss": 0.72460538, "num_input_tokens_seen": 170439580, "step": 7887, "time_per_iteration": 2.6324362754821777 }, { "auxiliary_loss_clip": 0.01119633, "auxiliary_loss_mlp": 0.01027816, "balance_loss_clip": 1.04694057, "balance_loss_mlp": 1.020854, "epoch": 0.9484759213611496, "flos": 25777110631680.0, "grad_norm": 1.7975342089349844, "language_loss": 0.87108517, "learning_rate": 2.7724575319507225e-08, "loss": 0.89255965, "num_input_tokens_seen": 170459290, "step": 7888, "time_per_iteration": 2.7819855213165283 }, { "auxiliary_loss_clip": 0.01151451, "auxiliary_loss_mlp": 0.01022876, "balance_loss_clip": 1.04569435, "balance_loss_mlp": 1.01613498, "epoch": 0.9485961642517886, "flos": 20667740532480.0, "grad_norm": 2.1846978245025253, "language_loss": 0.77144748, "learning_rate": 2.759547098130044e-08, "loss": 0.79319072, "num_input_tokens_seen": 170478020, "step": 7889, "time_per_iteration": 2.6152520179748535 }, { "auxiliary_loss_clip": 0.01166393, "auxiliary_loss_mlp": 0.01031183, "balance_loss_clip": 1.04841721, "balance_loss_mlp": 1.02467752, "epoch": 0.9487164071424277, "flos": 22674069578880.0, "grad_norm": 2.19204965921444, "language_loss": 0.7649911, "learning_rate": 2.746666585529267e-08, "loss": 0.7869668, "num_input_tokens_seen": 170498295, "step": 7890, "time_per_iteration": 2.583434581756592 }, { "auxiliary_loss_clip": 0.01144418, "auxiliary_loss_mlp": 0.01030156, "balance_loss_clip": 1.0456655, "balance_loss_mlp": 1.02258921, "epoch": 0.9488366500330668, "flos": 38726461716480.0, "grad_norm": 2.2685141300236977, "language_loss": 0.74680501, "learning_rate": 2.73381599610234e-08, "loss": 0.76855075, "num_input_tokens_seen": 170518695, "step": 7891, "time_per_iteration": 2.7341346740722656 }, { "auxiliary_loss_clip": 0.01147626, "auxiliary_loss_mlp": 0.01028089, "balance_loss_clip": 1.04614329, "balance_loss_mlp": 1.02045107, "epoch": 0.9489568929237059, "flos": 27890920149120.0, "grad_norm": 2.2482922033103057, "language_loss": 0.7137931, "learning_rate": 2.7209953317987033e-08, "loss": 0.73555028, "num_input_tokens_seen": 170539735, "step": 7892, "time_per_iteration": 2.6636741161346436 }, { "auxiliary_loss_clip": 0.01154455, "auxiliary_loss_mlp": 0.01027911, "balance_loss_clip": 1.04893804, "balance_loss_mlp": 1.02083611, "epoch": 0.9490771358143449, "flos": 33580642291200.0, "grad_norm": 3.4667112974195966, "language_loss": 0.78595275, "learning_rate": 2.7082045945631793e-08, "loss": 0.80777645, "num_input_tokens_seen": 170561950, "step": 7893, "time_per_iteration": 2.6758370399475098 }, { "auxiliary_loss_clip": 0.01111308, "auxiliary_loss_mlp": 0.01026143, "balance_loss_clip": 1.04369557, "balance_loss_mlp": 1.01905286, "epoch": 0.9491973787049841, "flos": 14793796512000.0, "grad_norm": 1.9894525713347813, "language_loss": 0.69591284, "learning_rate": 2.6954437863361712e-08, "loss": 0.7172873, "num_input_tokens_seen": 170579865, "step": 7894, "time_per_iteration": 2.663892984390259 }, { "auxiliary_loss_clip": 0.01091927, "auxiliary_loss_mlp": 0.01020503, "balance_loss_clip": 1.04191923, "balance_loss_mlp": 1.01403546, "epoch": 0.9493176215956232, "flos": 25332535998720.0, "grad_norm": 2.153948581554912, "language_loss": 0.71033549, "learning_rate": 2.6827129090534862e-08, "loss": 0.73145986, "num_input_tokens_seen": 170600165, "step": 7895, "time_per_iteration": 2.7778940200805664 }, { "auxiliary_loss_clip": 0.01135421, "auxiliary_loss_mlp": 0.01023155, "balance_loss_clip": 1.04699337, "balance_loss_mlp": 1.0155015, "epoch": 0.9494378644862622, "flos": 21029971236480.0, "grad_norm": 2.8356454017551616, "language_loss": 0.77854276, "learning_rate": 2.670011964646335e-08, "loss": 0.80012858, "num_input_tokens_seen": 170618845, "step": 7896, "time_per_iteration": 2.600304365158081 }, { "auxiliary_loss_clip": 0.01077653, "auxiliary_loss_mlp": 0.01027287, "balance_loss_clip": 1.03597641, "balance_loss_mlp": 1.02014089, "epoch": 0.9495581073769014, "flos": 15195134148480.0, "grad_norm": 4.331729512484407, "language_loss": 0.68000424, "learning_rate": 2.657340955041487e-08, "loss": 0.70105362, "num_input_tokens_seen": 170637620, "step": 7897, "time_per_iteration": 2.889162302017212 }, { "auxiliary_loss_clip": 0.01137067, "auxiliary_loss_mlp": 0.0102568, "balance_loss_clip": 1.04835939, "balance_loss_mlp": 1.01828885, "epoch": 0.9496783502675404, "flos": 28616566705920.0, "grad_norm": 5.590812459745357, "language_loss": 0.72013247, "learning_rate": 2.6446998821611167e-08, "loss": 0.74175996, "num_input_tokens_seen": 170657815, "step": 7898, "time_per_iteration": 2.871629238128662 }, { "auxiliary_loss_clip": 0.01105626, "auxiliary_loss_mlp": 0.01020569, "balance_loss_clip": 1.04210281, "balance_loss_mlp": 1.01354718, "epoch": 0.9497985931581795, "flos": 14866874732160.0, "grad_norm": 2.804682218521336, "language_loss": 0.71378058, "learning_rate": 2.6320887479228228e-08, "loss": 0.73504257, "num_input_tokens_seen": 170674415, "step": 7899, "time_per_iteration": 2.659160614013672 }, { "auxiliary_loss_clip": 0.01139795, "auxiliary_loss_mlp": 0.01029725, "balance_loss_clip": 1.04505134, "balance_loss_mlp": 1.02247441, "epoch": 0.9499188360488187, "flos": 27193319136000.0, "grad_norm": 2.8566443723768713, "language_loss": 0.72810465, "learning_rate": 2.619507554239786e-08, "loss": 0.74979985, "num_input_tokens_seen": 170692975, "step": 7900, "time_per_iteration": 2.7087485790252686 }, { "auxiliary_loss_clip": 0.01138281, "auxiliary_loss_mlp": 0.0102626, "balance_loss_clip": 1.04690218, "balance_loss_mlp": 1.01918483, "epoch": 0.9500390789394577, "flos": 24316479982080.0, "grad_norm": 2.413528649596629, "language_loss": 0.70039248, "learning_rate": 2.606956303020502e-08, "loss": 0.72203791, "num_input_tokens_seen": 170713780, "step": 7901, "time_per_iteration": 2.720390796661377 }, { "auxiliary_loss_clip": 0.01152237, "auxiliary_loss_mlp": 0.0103329, "balance_loss_clip": 1.04842961, "balance_loss_mlp": 1.02606845, "epoch": 0.9501593218300968, "flos": 14354752573440.0, "grad_norm": 1.6821931077074441, "language_loss": 0.83984113, "learning_rate": 2.5944349961690036e-08, "loss": 0.86169636, "num_input_tokens_seen": 170730800, "step": 7902, "time_per_iteration": 3.545299530029297 }, { "auxiliary_loss_clip": 0.01123172, "auxiliary_loss_mlp": 0.01024487, "balance_loss_clip": 1.04623675, "balance_loss_mlp": 1.01718223, "epoch": 0.9502795647207359, "flos": 38728113742080.0, "grad_norm": 1.6826427846898209, "language_loss": 0.73263943, "learning_rate": 2.581943635584749e-08, "loss": 0.75411606, "num_input_tokens_seen": 170753630, "step": 7903, "time_per_iteration": 2.81977915763855 }, { "auxiliary_loss_clip": 0.01126683, "auxiliary_loss_mlp": 0.01024524, "balance_loss_clip": 1.04437923, "balance_loss_mlp": 1.01754737, "epoch": 0.950399807611375, "flos": 40808023799040.0, "grad_norm": 3.3974010445084804, "language_loss": 0.6560936, "learning_rate": 2.569482223162689e-08, "loss": 0.67760563, "num_input_tokens_seen": 170777605, "step": 7904, "time_per_iteration": 3.7908058166503906 }, { "auxiliary_loss_clip": 0.01153038, "auxiliary_loss_mlp": 0.01029375, "balance_loss_clip": 1.04685211, "balance_loss_mlp": 1.02200198, "epoch": 0.950520050502014, "flos": 23440403266560.0, "grad_norm": 1.7985252676021146, "language_loss": 0.7246322, "learning_rate": 2.5570507607932e-08, "loss": 0.74645627, "num_input_tokens_seen": 170797520, "step": 7905, "time_per_iteration": 3.6088740825653076 }, { "auxiliary_loss_clip": 0.01158253, "auxiliary_loss_mlp": 0.01023572, "balance_loss_clip": 1.04902577, "balance_loss_mlp": 1.01674688, "epoch": 0.9506402933926532, "flos": 17783718658560.0, "grad_norm": 3.4970735613953674, "language_loss": 0.6359297, "learning_rate": 2.54464925036213e-08, "loss": 0.65774798, "num_input_tokens_seen": 170814810, "step": 7906, "time_per_iteration": 2.550920248031616 }, { "auxiliary_loss_clip": 0.01153618, "auxiliary_loss_mlp": 0.0102407, "balance_loss_clip": 1.04957366, "balance_loss_mlp": 1.0170989, "epoch": 0.9507605362832923, "flos": 32561928668160.0, "grad_norm": 2.2877692466946002, "language_loss": 0.61306107, "learning_rate": 2.532277693750773e-08, "loss": 0.63483799, "num_input_tokens_seen": 170835735, "step": 7907, "time_per_iteration": 3.5904974937438965 }, { "auxiliary_loss_clip": 0.01101327, "auxiliary_loss_mlp": 0.01025987, "balance_loss_clip": 1.04456306, "balance_loss_mlp": 1.0187571, "epoch": 0.9508807791739313, "flos": 19602054898560.0, "grad_norm": 3.330089116265206, "language_loss": 0.76028955, "learning_rate": 2.5199360928358948e-08, "loss": 0.78156263, "num_input_tokens_seen": 170852970, "step": 7908, "time_per_iteration": 2.6877050399780273 }, { "auxiliary_loss_clip": 0.0114053, "auxiliary_loss_mlp": 0.00710386, "balance_loss_clip": 1.04370284, "balance_loss_mlp": 1.00065792, "epoch": 0.9510010220645704, "flos": 21471852349440.0, "grad_norm": 3.088584842163477, "language_loss": 0.87232649, "learning_rate": 2.507624449489665e-08, "loss": 0.89083564, "num_input_tokens_seen": 170871600, "step": 7909, "time_per_iteration": 2.6131324768066406 }, { "auxiliary_loss_clip": 0.01139397, "auxiliary_loss_mlp": 0.01029744, "balance_loss_clip": 1.04902959, "balance_loss_mlp": 1.02264225, "epoch": 0.9511212649552095, "flos": 18879999701760.0, "grad_norm": 1.8923830631557075, "language_loss": 0.64838618, "learning_rate": 2.495342765579811e-08, "loss": 0.67007756, "num_input_tokens_seen": 170890260, "step": 7910, "time_per_iteration": 2.6616764068603516 }, { "auxiliary_loss_clip": 0.01104983, "auxiliary_loss_mlp": 0.01026227, "balance_loss_clip": 1.04514873, "balance_loss_mlp": 1.0193342, "epoch": 0.9512415078458486, "flos": 20810521094400.0, "grad_norm": 1.9081281037036515, "language_loss": 0.71114552, "learning_rate": 2.4830910429693984e-08, "loss": 0.73245764, "num_input_tokens_seen": 170910220, "step": 7911, "time_per_iteration": 2.7348055839538574 }, { "auxiliary_loss_clip": 0.01168323, "auxiliary_loss_mlp": 0.01023128, "balance_loss_clip": 1.0485431, "balance_loss_mlp": 1.01644635, "epoch": 0.9513617507364877, "flos": 18369565482240.0, "grad_norm": 2.423223694032488, "language_loss": 0.79784596, "learning_rate": 2.470869283517052e-08, "loss": 0.81976044, "num_input_tokens_seen": 170928255, "step": 7912, "time_per_iteration": 2.5356884002685547 }, { "auxiliary_loss_clip": 0.01146547, "auxiliary_loss_mlp": 0.01028279, "balance_loss_clip": 1.0455339, "balance_loss_mlp": 1.02147174, "epoch": 0.9514819936271268, "flos": 25010166412800.0, "grad_norm": 1.726967293988934, "language_loss": 0.77283949, "learning_rate": 2.458677489076777e-08, "loss": 0.79458773, "num_input_tokens_seen": 170949265, "step": 7913, "time_per_iteration": 2.594855546951294 }, { "auxiliary_loss_clip": 0.01140766, "auxiliary_loss_mlp": 0.01028171, "balance_loss_clip": 1.04310024, "balance_loss_mlp": 1.02102768, "epoch": 0.9516022365177659, "flos": 18662129758080.0, "grad_norm": 1.9003842675894647, "language_loss": 0.83139741, "learning_rate": 2.446515661498072e-08, "loss": 0.85308677, "num_input_tokens_seen": 170968595, "step": 7914, "time_per_iteration": 2.5739879608154297 }, { "auxiliary_loss_clip": 0.01089639, "auxiliary_loss_mlp": 0.0102352, "balance_loss_clip": 1.04281402, "balance_loss_mlp": 1.01662922, "epoch": 0.9517224794084049, "flos": 25372109808000.0, "grad_norm": 5.439294088678849, "language_loss": 0.74253571, "learning_rate": 2.434383802625861e-08, "loss": 0.76366723, "num_input_tokens_seen": 170987550, "step": 7915, "time_per_iteration": 2.7825279235839844 }, { "auxiliary_loss_clip": 0.01121827, "auxiliary_loss_mlp": 0.01026628, "balance_loss_clip": 1.0432936, "balance_loss_mlp": 1.01962197, "epoch": 0.9518427222990441, "flos": 21470918595840.0, "grad_norm": 1.8753677896561969, "language_loss": 0.73769259, "learning_rate": 2.4222819143005168e-08, "loss": 0.75917715, "num_input_tokens_seen": 171007145, "step": 7916, "time_per_iteration": 2.7247841358184814 }, { "auxiliary_loss_clip": 0.0116896, "auxiliary_loss_mlp": 0.0102607, "balance_loss_clip": 1.05082154, "balance_loss_mlp": 1.01873231, "epoch": 0.9519629651896832, "flos": 21033634423680.0, "grad_norm": 1.9346661607500406, "language_loss": 0.81188822, "learning_rate": 2.4102099983579706e-08, "loss": 0.83383846, "num_input_tokens_seen": 171026295, "step": 7917, "time_per_iteration": 2.574483633041382 }, { "auxiliary_loss_clip": 0.01151843, "auxiliary_loss_mlp": 0.01023657, "balance_loss_clip": 1.04660404, "balance_loss_mlp": 1.01614046, "epoch": 0.9520832080803222, "flos": 21689219502720.0, "grad_norm": 1.8282472403316734, "language_loss": 0.77506387, "learning_rate": 2.3981680566294236e-08, "loss": 0.79681885, "num_input_tokens_seen": 171045895, "step": 7918, "time_per_iteration": 2.6701486110687256 }, { "auxiliary_loss_clip": 0.01166014, "auxiliary_loss_mlp": 0.01028194, "balance_loss_clip": 1.04865003, "balance_loss_mlp": 1.02144134, "epoch": 0.9522034509709614, "flos": 23145289125120.0, "grad_norm": 1.815272627007283, "language_loss": 0.73413026, "learning_rate": 2.3861560909416822e-08, "loss": 0.7560724, "num_input_tokens_seen": 171065445, "step": 7919, "time_per_iteration": 2.577667713165283 }, { "auxiliary_loss_clip": 0.01107637, "auxiliary_loss_mlp": 0.01028922, "balance_loss_clip": 1.04678941, "balance_loss_mlp": 1.02211201, "epoch": 0.9523236938616004, "flos": 24679428958080.0, "grad_norm": 1.6743052243014818, "language_loss": 0.82436901, "learning_rate": 2.3741741031169325e-08, "loss": 0.8457346, "num_input_tokens_seen": 171085015, "step": 7920, "time_per_iteration": 2.702106475830078 }, { "auxiliary_loss_clip": 0.0110046, "auxiliary_loss_mlp": 0.01029939, "balance_loss_clip": 1.04256201, "balance_loss_mlp": 1.02276897, "epoch": 0.9524439367522395, "flos": 22672309812480.0, "grad_norm": 2.0765832621183593, "language_loss": 0.71926928, "learning_rate": 2.3622220949728544e-08, "loss": 0.74057329, "num_input_tokens_seen": 171103900, "step": 7921, "time_per_iteration": 2.63181209564209 }, { "auxiliary_loss_clip": 0.0114414, "auxiliary_loss_mlp": 0.01028492, "balance_loss_clip": 1.04431987, "balance_loss_mlp": 1.02132738, "epoch": 0.9525641796428787, "flos": 34055525024640.0, "grad_norm": 2.934152555880011, "language_loss": 0.61432743, "learning_rate": 2.3503000683225526e-08, "loss": 0.63605368, "num_input_tokens_seen": 171121615, "step": 7922, "time_per_iteration": 2.7385013103485107 }, { "auxiliary_loss_clip": 0.01169835, "auxiliary_loss_mlp": 0.0102956, "balance_loss_clip": 1.04972517, "balance_loss_mlp": 1.02243137, "epoch": 0.9526844225335177, "flos": 16727083251840.0, "grad_norm": 3.23224647172149, "language_loss": 0.84199196, "learning_rate": 2.3384080249745585e-08, "loss": 0.8639859, "num_input_tokens_seen": 171139505, "step": 7923, "time_per_iteration": 2.630613088607788 }, { "auxiliary_loss_clip": 0.01109125, "auxiliary_loss_mlp": 0.01027757, "balance_loss_clip": 1.04386735, "balance_loss_mlp": 1.02071536, "epoch": 0.9528046654241568, "flos": 36939367330560.0, "grad_norm": 2.564593659921123, "language_loss": 0.82858807, "learning_rate": 2.3265459667329178e-08, "loss": 0.84995687, "num_input_tokens_seen": 171158995, "step": 7924, "time_per_iteration": 2.8205485343933105 }, { "auxiliary_loss_clip": 0.01136181, "auxiliary_loss_mlp": 0.01026192, "balance_loss_clip": 1.0466702, "balance_loss_mlp": 1.01899207, "epoch": 0.9529249083147959, "flos": 18255010032000.0, "grad_norm": 2.2603001291233533, "language_loss": 0.8701961, "learning_rate": 2.31471389539708e-08, "loss": 0.89181983, "num_input_tokens_seen": 171176120, "step": 7925, "time_per_iteration": 2.580247402191162 }, { "auxiliary_loss_clip": 0.01151967, "auxiliary_loss_mlp": 0.00710661, "balance_loss_clip": 1.04745042, "balance_loss_mlp": 1.00054836, "epoch": 0.953045151205435, "flos": 28658438985600.0, "grad_norm": 2.646255215158514, "language_loss": 0.72877443, "learning_rate": 2.3029118127619872e-08, "loss": 0.7474007, "num_input_tokens_seen": 171195835, "step": 7926, "time_per_iteration": 2.664374589920044 }, { "auxiliary_loss_clip": 0.01129204, "auxiliary_loss_mlp": 0.01027301, "balance_loss_clip": 1.0448786, "balance_loss_mlp": 1.01968956, "epoch": 0.953165394096074, "flos": 21835232288640.0, "grad_norm": 2.3558935655382265, "language_loss": 0.86620504, "learning_rate": 2.2911397206179628e-08, "loss": 0.88777012, "num_input_tokens_seen": 171212585, "step": 7927, "time_per_iteration": 2.608022451400757 }, { "auxiliary_loss_clip": 0.0116722, "auxiliary_loss_mlp": 0.0102423, "balance_loss_clip": 1.04904878, "balance_loss_mlp": 1.01727402, "epoch": 0.9532856369867132, "flos": 19975059682560.0, "grad_norm": 1.9693556474829594, "language_loss": 0.63134313, "learning_rate": 2.279397620750845e-08, "loss": 0.65325755, "num_input_tokens_seen": 171231630, "step": 7928, "time_per_iteration": 3.413172483444214 }, { "auxiliary_loss_clip": 0.01129583, "auxiliary_loss_mlp": 0.01023118, "balance_loss_clip": 1.04266059, "balance_loss_mlp": 1.01627803, "epoch": 0.9534058798773523, "flos": 15049588239360.0, "grad_norm": 2.092635468443496, "language_loss": 0.78906155, "learning_rate": 2.2676855149419195e-08, "loss": 0.81058854, "num_input_tokens_seen": 171248800, "step": 7929, "time_per_iteration": 2.7446787357330322 }, { "auxiliary_loss_clip": 0.01134831, "auxiliary_loss_mlp": 0.01025455, "balance_loss_clip": 1.04972374, "balance_loss_mlp": 1.01862729, "epoch": 0.9535261227679913, "flos": 17602800831360.0, "grad_norm": 2.4968704219959923, "language_loss": 0.75439185, "learning_rate": 2.2560034049678988e-08, "loss": 0.77599478, "num_input_tokens_seen": 171263150, "step": 7930, "time_per_iteration": 4.244102478027344 }, { "auxiliary_loss_clip": 0.01170851, "auxiliary_loss_mlp": 0.01028536, "balance_loss_clip": 1.05104339, "balance_loss_mlp": 1.02156782, "epoch": 0.9536463656586305, "flos": 23142954741120.0, "grad_norm": 1.8220435096391736, "language_loss": 0.75707495, "learning_rate": 2.2443512926008988e-08, "loss": 0.77906883, "num_input_tokens_seen": 171282480, "step": 7931, "time_per_iteration": 2.7360260486602783 }, { "auxiliary_loss_clip": 0.01125528, "auxiliary_loss_mlp": 0.01029779, "balance_loss_clip": 1.04564118, "balance_loss_mlp": 1.02301717, "epoch": 0.9537666085492695, "flos": 18625033987200.0, "grad_norm": 2.334720655291663, "language_loss": 0.70122939, "learning_rate": 2.2327291796085946e-08, "loss": 0.72278243, "num_input_tokens_seen": 171300840, "step": 7932, "time_per_iteration": 3.7422921657562256 }, { "auxiliary_loss_clip": 0.01170256, "auxiliary_loss_mlp": 0.01027412, "balance_loss_clip": 1.04902148, "balance_loss_mlp": 1.02012181, "epoch": 0.9538868514399086, "flos": 18989347680000.0, "grad_norm": 2.54163411126229, "language_loss": 0.76803744, "learning_rate": 2.2211370677540197e-08, "loss": 0.79001415, "num_input_tokens_seen": 171317365, "step": 7933, "time_per_iteration": 3.6190667152404785 }, { "auxiliary_loss_clip": 0.01168102, "auxiliary_loss_mlp": 0.01024067, "balance_loss_clip": 1.04939425, "balance_loss_mlp": 1.01698005, "epoch": 0.9540070943305478, "flos": 16800556521600.0, "grad_norm": 2.3958136102986316, "language_loss": 0.78427768, "learning_rate": 2.2095749587957012e-08, "loss": 0.80619943, "num_input_tokens_seen": 171335270, "step": 7934, "time_per_iteration": 2.670691728591919 }, { "auxiliary_loss_clip": 0.0113307, "auxiliary_loss_mlp": 0.01027234, "balance_loss_clip": 1.04472589, "balance_loss_mlp": 1.01993823, "epoch": 0.9541273372211868, "flos": 20156911263360.0, "grad_norm": 3.9230866112999605, "language_loss": 0.69483531, "learning_rate": 2.1980428544876138e-08, "loss": 0.71643835, "num_input_tokens_seen": 171353910, "step": 7935, "time_per_iteration": 2.761580467224121 }, { "auxiliary_loss_clip": 0.0109646, "auxiliary_loss_mlp": 0.01025756, "balance_loss_clip": 1.03779793, "balance_loss_mlp": 1.0187223, "epoch": 0.9542475801118259, "flos": 26725511381760.0, "grad_norm": 1.6155494904260634, "language_loss": 0.74506497, "learning_rate": 2.1865407565791584e-08, "loss": 0.76628709, "num_input_tokens_seen": 171375480, "step": 7936, "time_per_iteration": 2.7837917804718018 }, { "auxiliary_loss_clip": 0.01135896, "auxiliary_loss_mlp": 0.01028137, "balance_loss_clip": 1.04416585, "balance_loss_mlp": 1.02101707, "epoch": 0.954367823002465, "flos": 23330911633920.0, "grad_norm": 2.514665985228439, "language_loss": 0.77316129, "learning_rate": 2.175068666815183e-08, "loss": 0.79480165, "num_input_tokens_seen": 171396320, "step": 7937, "time_per_iteration": 2.7300262451171875 }, { "auxiliary_loss_clip": 0.01120748, "auxiliary_loss_mlp": 0.01026269, "balance_loss_clip": 1.04422808, "balance_loss_mlp": 1.01916969, "epoch": 0.9544880658931041, "flos": 14902713527040.0, "grad_norm": 2.0161783457621025, "language_loss": 0.79197347, "learning_rate": 2.163626586935985e-08, "loss": 0.81344366, "num_input_tokens_seen": 171412860, "step": 7938, "time_per_iteration": 2.712568521499634 }, { "auxiliary_loss_clip": 0.01148483, "auxiliary_loss_mlp": 0.01026743, "balance_loss_clip": 1.04635608, "balance_loss_mlp": 1.01946199, "epoch": 0.9546083087837431, "flos": 29095902725760.0, "grad_norm": 1.8119290963607257, "language_loss": 0.63070214, "learning_rate": 2.1522145186773755e-08, "loss": 0.65245438, "num_input_tokens_seen": 171431780, "step": 7939, "time_per_iteration": 2.772693634033203 }, { "auxiliary_loss_clip": 0.01136773, "auxiliary_loss_mlp": 0.01026141, "balance_loss_clip": 1.04665673, "balance_loss_mlp": 1.01935244, "epoch": 0.9547285516743822, "flos": 21142335957120.0, "grad_norm": 1.7653524888315149, "language_loss": 0.85605013, "learning_rate": 2.140832463770481e-08, "loss": 0.87767923, "num_input_tokens_seen": 171450975, "step": 7940, "time_per_iteration": 2.7939817905426025 }, { "auxiliary_loss_clip": 0.01140074, "auxiliary_loss_mlp": 0.01026553, "balance_loss_clip": 1.04536593, "balance_loss_mlp": 1.01941538, "epoch": 0.9548487945650214, "flos": 27490157130240.0, "grad_norm": 2.2037109626315767, "language_loss": 0.75707144, "learning_rate": 2.129480423941987e-08, "loss": 0.77873766, "num_input_tokens_seen": 171467645, "step": 7941, "time_per_iteration": 2.7710089683532715 }, { "auxiliary_loss_clip": 0.01141162, "auxiliary_loss_mlp": 0.01026879, "balance_loss_clip": 1.04776573, "balance_loss_mlp": 1.01955056, "epoch": 0.9549690374556604, "flos": 22273198819200.0, "grad_norm": 1.6957663884655598, "language_loss": 0.80259562, "learning_rate": 2.1181584009140052e-08, "loss": 0.82427603, "num_input_tokens_seen": 171487185, "step": 7942, "time_per_iteration": 2.719820261001587 }, { "auxiliary_loss_clip": 0.01128107, "auxiliary_loss_mlp": 0.01025358, "balance_loss_clip": 1.04563689, "balance_loss_mlp": 1.01883388, "epoch": 0.9550892803462995, "flos": 17595294888960.0, "grad_norm": 2.104413493149033, "language_loss": 0.83881712, "learning_rate": 2.10686639640405e-08, "loss": 0.86035174, "num_input_tokens_seen": 171501275, "step": 7943, "time_per_iteration": 2.8130266666412354 }, { "auxiliary_loss_clip": 0.01154307, "auxiliary_loss_mlp": 0.01022843, "balance_loss_clip": 1.04657674, "balance_loss_mlp": 1.01612258, "epoch": 0.9552095232369386, "flos": 24353144789760.0, "grad_norm": 2.1715933822063014, "language_loss": 0.81186277, "learning_rate": 2.0956044121251294e-08, "loss": 0.83363426, "num_input_tokens_seen": 171520060, "step": 7944, "time_per_iteration": 2.7126047611236572 }, { "auxiliary_loss_clip": 0.01119252, "auxiliary_loss_mlp": 0.01027343, "balance_loss_clip": 1.04636538, "balance_loss_mlp": 1.02011347, "epoch": 0.9553297661275777, "flos": 22746860490240.0, "grad_norm": 1.8075096116013776, "language_loss": 0.80739892, "learning_rate": 2.084372449785654e-08, "loss": 0.82886481, "num_input_tokens_seen": 171539895, "step": 7945, "time_per_iteration": 2.666635751724243 }, { "auxiliary_loss_clip": 0.01131535, "auxiliary_loss_mlp": 0.0102714, "balance_loss_clip": 1.04417515, "balance_loss_mlp": 1.02039242, "epoch": 0.9554500090182168, "flos": 15413866018560.0, "grad_norm": 1.8014444579786975, "language_loss": 0.68583721, "learning_rate": 2.0731705110895282e-08, "loss": 0.70742393, "num_input_tokens_seen": 171557385, "step": 7946, "time_per_iteration": 2.7142679691314697 }, { "auxiliary_loss_clip": 0.01158677, "auxiliary_loss_mlp": 0.01029171, "balance_loss_clip": 1.05194473, "balance_loss_mlp": 1.0211724, "epoch": 0.9555702519088559, "flos": 23513517400320.0, "grad_norm": 2.4014762208283034, "language_loss": 0.86860073, "learning_rate": 2.0619985977360587e-08, "loss": 0.89047921, "num_input_tokens_seen": 171575705, "step": 7947, "time_per_iteration": 2.614013433456421 }, { "auxiliary_loss_clip": 0.01105008, "auxiliary_loss_mlp": 0.01027374, "balance_loss_clip": 1.04058301, "balance_loss_mlp": 1.02118981, "epoch": 0.955690494799495, "flos": 22962072827520.0, "grad_norm": 1.6549683171566076, "language_loss": 0.76905894, "learning_rate": 2.0508567114200237e-08, "loss": 0.79038274, "num_input_tokens_seen": 171595620, "step": 7948, "time_per_iteration": 2.8254342079162598 }, { "auxiliary_loss_clip": 0.01138891, "auxiliary_loss_mlp": 0.01026734, "balance_loss_clip": 1.04563284, "balance_loss_mlp": 1.01944423, "epoch": 0.955810737690134, "flos": 26031250333440.0, "grad_norm": 3.769169593357927, "language_loss": 0.78457731, "learning_rate": 2.0397448538316485e-08, "loss": 0.80623347, "num_input_tokens_seen": 171616660, "step": 7949, "time_per_iteration": 2.707094669342041 }, { "auxiliary_loss_clip": 0.01117271, "auxiliary_loss_mlp": 0.01022756, "balance_loss_clip": 1.04335916, "balance_loss_mlp": 1.01623869, "epoch": 0.9559309805807732, "flos": 20849951249280.0, "grad_norm": 3.2305174416359876, "language_loss": 0.66438508, "learning_rate": 2.028663026656563e-08, "loss": 0.68578541, "num_input_tokens_seen": 171635515, "step": 7950, "time_per_iteration": 2.722599744796753 }, { "auxiliary_loss_clip": 0.01167617, "auxiliary_loss_mlp": 0.00711302, "balance_loss_clip": 1.05017185, "balance_loss_mlp": 1.00060081, "epoch": 0.9560512234714122, "flos": 21578219498880.0, "grad_norm": 2.3697808289714395, "language_loss": 0.71943569, "learning_rate": 2.0176112315758885e-08, "loss": 0.73822498, "num_input_tokens_seen": 171653305, "step": 7951, "time_per_iteration": 2.5492753982543945 }, { "auxiliary_loss_clip": 0.01109901, "auxiliary_loss_mlp": 0.01023845, "balance_loss_clip": 1.04364622, "balance_loss_mlp": 1.01653135, "epoch": 0.9561714663620513, "flos": 17450144029440.0, "grad_norm": 2.3047310538134447, "language_loss": 0.69380271, "learning_rate": 2.0065894702661957e-08, "loss": 0.71514016, "num_input_tokens_seen": 171669980, "step": 7952, "time_per_iteration": 2.6954948902130127 }, { "auxiliary_loss_clip": 0.01113187, "auxiliary_loss_mlp": 0.00711172, "balance_loss_clip": 1.04194856, "balance_loss_mlp": 1.00058246, "epoch": 0.9562917092526905, "flos": 26098510550400.0, "grad_norm": 2.472994653725053, "language_loss": 0.78397083, "learning_rate": 1.9955977443994577e-08, "loss": 0.80221438, "num_input_tokens_seen": 171689970, "step": 7953, "time_per_iteration": 2.711428642272949 }, { "auxiliary_loss_clip": 0.01136723, "auxiliary_loss_mlp": 0.01026105, "balance_loss_clip": 1.04671192, "balance_loss_mlp": 1.01798964, "epoch": 0.9564119521433295, "flos": 24096742531200.0, "grad_norm": 2.807120524212325, "language_loss": 0.62165129, "learning_rate": 1.9846360556430965e-08, "loss": 0.64327961, "num_input_tokens_seen": 171708270, "step": 7954, "time_per_iteration": 3.594453811645508 }, { "auxiliary_loss_clip": 0.0116758, "auxiliary_loss_mlp": 0.01026016, "balance_loss_clip": 1.04944324, "balance_loss_mlp": 1.01926875, "epoch": 0.9565321950339686, "flos": 32008903896960.0, "grad_norm": 2.3811672448516803, "language_loss": 0.61432803, "learning_rate": 1.973704405660004e-08, "loss": 0.63626403, "num_input_tokens_seen": 171729385, "step": 7955, "time_per_iteration": 2.681676149368286 }, { "auxiliary_loss_clip": 0.01085851, "auxiliary_loss_mlp": 0.01019903, "balance_loss_clip": 1.04122162, "balance_loss_mlp": 1.01290202, "epoch": 0.9566524379246077, "flos": 23588642695680.0, "grad_norm": 1.6740036208725673, "language_loss": 0.77850324, "learning_rate": 1.9628027961085203e-08, "loss": 0.79956079, "num_input_tokens_seen": 171752615, "step": 7956, "time_per_iteration": 3.788668394088745 }, { "auxiliary_loss_clip": 0.01108526, "auxiliary_loss_mlp": 0.01025162, "balance_loss_clip": 1.04125738, "balance_loss_mlp": 1.01810467, "epoch": 0.9567726808152468, "flos": 38067716240640.0, "grad_norm": 3.9981780092799286, "language_loss": 0.83886969, "learning_rate": 1.9519312286423894e-08, "loss": 0.86020654, "num_input_tokens_seen": 171775810, "step": 7957, "time_per_iteration": 2.7705485820770264 }, { "auxiliary_loss_clip": 0.01150396, "auxiliary_loss_mlp": 0.01031611, "balance_loss_clip": 1.0474925, "balance_loss_mlp": 1.0248431, "epoch": 0.9568929237058859, "flos": 22744059229440.0, "grad_norm": 1.7098002854189698, "language_loss": 0.78156561, "learning_rate": 1.9410897049108255e-08, "loss": 0.80338567, "num_input_tokens_seen": 171795090, "step": 7958, "time_per_iteration": 3.5032875537872314 }, { "auxiliary_loss_clip": 0.01174815, "auxiliary_loss_mlp": 0.01026784, "balance_loss_clip": 1.05282104, "balance_loss_mlp": 1.01937795, "epoch": 0.957013166596525, "flos": 23841633162240.0, "grad_norm": 3.05648758455291, "language_loss": 0.91043508, "learning_rate": 1.9302782265584905e-08, "loss": 0.93245113, "num_input_tokens_seen": 171815755, "step": 7959, "time_per_iteration": 3.4776360988616943 }, { "auxiliary_loss_clip": 0.0109295, "auxiliary_loss_mlp": 0.01023255, "balance_loss_clip": 1.04236078, "balance_loss_mlp": 1.01626015, "epoch": 0.9571334094871641, "flos": 17639286071040.0, "grad_norm": 2.2452777979139764, "language_loss": 0.87106895, "learning_rate": 1.9194967952254282e-08, "loss": 0.89223105, "num_input_tokens_seen": 171834330, "step": 7960, "time_per_iteration": 2.6866559982299805 }, { "auxiliary_loss_clip": 0.01153706, "auxiliary_loss_mlp": 0.01029207, "balance_loss_clip": 1.05168772, "balance_loss_mlp": 1.02158964, "epoch": 0.9572536523778031, "flos": 15369623441280.0, "grad_norm": 2.5180352697855612, "language_loss": 0.8133719, "learning_rate": 1.9087454125472635e-08, "loss": 0.83520108, "num_input_tokens_seen": 171848805, "step": 7961, "time_per_iteration": 2.549542188644409 }, { "auxiliary_loss_clip": 0.01169876, "auxiliary_loss_mlp": 0.01030188, "balance_loss_clip": 1.05066347, "balance_loss_mlp": 1.02284455, "epoch": 0.9573738952684423, "flos": 24969838417920.0, "grad_norm": 1.9029071828181372, "language_loss": 0.78017223, "learning_rate": 1.8980240801548696e-08, "loss": 0.8021729, "num_input_tokens_seen": 171867995, "step": 7962, "time_per_iteration": 2.639983892440796 }, { "auxiliary_loss_clip": 0.01137611, "auxiliary_loss_mlp": 0.01035816, "balance_loss_clip": 1.04974449, "balance_loss_mlp": 1.02849042, "epoch": 0.9574941381590814, "flos": 25769461034880.0, "grad_norm": 2.8644551648922234, "language_loss": 0.74346274, "learning_rate": 1.8873327996747458e-08, "loss": 0.76519704, "num_input_tokens_seen": 171886495, "step": 7963, "time_per_iteration": 2.715846061706543 }, { "auxiliary_loss_clip": 0.01152475, "auxiliary_loss_mlp": 0.01024658, "balance_loss_clip": 1.0453186, "balance_loss_mlp": 1.01757073, "epoch": 0.9576143810497204, "flos": 32307178435200.0, "grad_norm": 1.881146259901525, "language_loss": 0.66022915, "learning_rate": 1.8766715727287053e-08, "loss": 0.68200046, "num_input_tokens_seen": 171908200, "step": 7964, "time_per_iteration": 2.6829123497009277 }, { "auxiliary_loss_clip": 0.01158355, "auxiliary_loss_mlp": 0.00711482, "balance_loss_clip": 1.04860544, "balance_loss_mlp": 1.00070012, "epoch": 0.9577346239403596, "flos": 27745733376000.0, "grad_norm": 2.7606790450158196, "language_loss": 0.79602957, "learning_rate": 1.8660404009340546e-08, "loss": 0.81472796, "num_input_tokens_seen": 171928650, "step": 7965, "time_per_iteration": 2.696406364440918 }, { "auxiliary_loss_clip": 0.01057625, "auxiliary_loss_mlp": 0.0100352, "balance_loss_clip": 1.01674867, "balance_loss_mlp": 1.00258374, "epoch": 0.9578548668309986, "flos": 57468313710720.0, "grad_norm": 0.8743208274113474, "language_loss": 0.59531736, "learning_rate": 1.8554392859035485e-08, "loss": 0.61592877, "num_input_tokens_seen": 171986400, "step": 7966, "time_per_iteration": 3.202077627182007 }, { "auxiliary_loss_clip": 0.01077602, "auxiliary_loss_mlp": 0.01034663, "balance_loss_clip": 1.03974581, "balance_loss_mlp": 1.02779067, "epoch": 0.9579751097216377, "flos": 19756040503680.0, "grad_norm": 2.0716138202431793, "language_loss": 0.78980064, "learning_rate": 1.8448682292453444e-08, "loss": 0.81092334, "num_input_tokens_seen": 172005475, "step": 7967, "time_per_iteration": 2.7501254081726074 }, { "auxiliary_loss_clip": 0.01169458, "auxiliary_loss_mlp": 0.01022245, "balance_loss_clip": 1.05057883, "balance_loss_mlp": 1.01497281, "epoch": 0.9580953526122769, "flos": 18041270152320.0, "grad_norm": 1.6782816302363737, "language_loss": 0.66142344, "learning_rate": 1.8343272325631154e-08, "loss": 0.68334049, "num_input_tokens_seen": 172024420, "step": 7968, "time_per_iteration": 2.5533082485198975 }, { "auxiliary_loss_clip": 0.01078373, "auxiliary_loss_mlp": 0.00711411, "balance_loss_clip": 1.04143167, "balance_loss_mlp": 1.0005424, "epoch": 0.9582155955029159, "flos": 24270154416000.0, "grad_norm": 2.368882267042034, "language_loss": 0.78257388, "learning_rate": 1.8238162974558492e-08, "loss": 0.80047178, "num_input_tokens_seen": 172038350, "step": 7969, "time_per_iteration": 2.871692180633545 }, { "auxiliary_loss_clip": 0.0113496, "auxiliary_loss_mlp": 0.01023572, "balance_loss_clip": 1.04738712, "balance_loss_mlp": 1.01614535, "epoch": 0.958335838393555, "flos": 22783309816320.0, "grad_norm": 1.9335653594010673, "language_loss": 0.74677008, "learning_rate": 1.8133354255181144e-08, "loss": 0.76835543, "num_input_tokens_seen": 172058665, "step": 7970, "time_per_iteration": 2.618584394454956 }, { "auxiliary_loss_clip": 0.01146447, "auxiliary_loss_mlp": 0.01021946, "balance_loss_clip": 1.04533303, "balance_loss_mlp": 1.01537454, "epoch": 0.958456081284194, "flos": 16911484698240.0, "grad_norm": 2.336982485295046, "language_loss": 0.74460268, "learning_rate": 1.802884618339795e-08, "loss": 0.76628661, "num_input_tokens_seen": 172077470, "step": 7971, "time_per_iteration": 2.648813247680664 }, { "auxiliary_loss_clip": 0.01152927, "auxiliary_loss_mlp": 0.01026083, "balance_loss_clip": 1.04755688, "balance_loss_mlp": 1.01856685, "epoch": 0.9585763241748332, "flos": 19974951941760.0, "grad_norm": 1.8472505951125595, "language_loss": 0.80755496, "learning_rate": 1.7924638775062894e-08, "loss": 0.82934511, "num_input_tokens_seen": 172096590, "step": 7972, "time_per_iteration": 2.595834255218506 }, { "auxiliary_loss_clip": 0.01114031, "auxiliary_loss_mlp": 0.0102063, "balance_loss_clip": 1.0455277, "balance_loss_mlp": 1.01387644, "epoch": 0.9586965670654722, "flos": 21395649646080.0, "grad_norm": 1.9907448592455992, "language_loss": 0.81817269, "learning_rate": 1.7820732045984444e-08, "loss": 0.83951926, "num_input_tokens_seen": 172116735, "step": 7973, "time_per_iteration": 2.669008731842041 }, { "auxiliary_loss_clip": 0.0114923, "auxiliary_loss_mlp": 0.01030429, "balance_loss_clip": 1.04748678, "balance_loss_mlp": 1.02291918, "epoch": 0.9588168099561113, "flos": 21435115714560.0, "grad_norm": 2.3078786715666375, "language_loss": 0.74141419, "learning_rate": 1.7717126011924655e-08, "loss": 0.76321077, "num_input_tokens_seen": 172138320, "step": 7974, "time_per_iteration": 2.7076876163482666 }, { "auxiliary_loss_clip": 0.0109721, "auxiliary_loss_mlp": 0.0102659, "balance_loss_clip": 1.0397414, "balance_loss_mlp": 1.01941347, "epoch": 0.9589370528467505, "flos": 11763761852160.0, "grad_norm": 2.4228498676001493, "language_loss": 0.77177781, "learning_rate": 1.7613820688600957e-08, "loss": 0.79301584, "num_input_tokens_seen": 172154225, "step": 7975, "time_per_iteration": 2.6593353748321533 }, { "auxiliary_loss_clip": 0.01142241, "auxiliary_loss_mlp": 0.01028855, "balance_loss_clip": 1.04506707, "balance_loss_mlp": 1.02194643, "epoch": 0.9590572957373895, "flos": 23441516588160.0, "grad_norm": 5.544409713935905, "language_loss": 0.78490394, "learning_rate": 1.7510816091684588e-08, "loss": 0.80661488, "num_input_tokens_seen": 172174150, "step": 7976, "time_per_iteration": 2.6943466663360596 }, { "auxiliary_loss_clip": 0.01137509, "auxiliary_loss_mlp": 0.01029536, "balance_loss_clip": 1.04789948, "balance_loss_mlp": 1.02158475, "epoch": 0.9591775386280286, "flos": 22528272274560.0, "grad_norm": 2.3077172411555207, "language_loss": 0.78925723, "learning_rate": 1.740811223680083e-08, "loss": 0.81092763, "num_input_tokens_seen": 172191005, "step": 7977, "time_per_iteration": 2.619823455810547 }, { "auxiliary_loss_clip": 0.01168556, "auxiliary_loss_mlp": 0.01028185, "balance_loss_clip": 1.04943347, "balance_loss_mlp": 1.02136612, "epoch": 0.9592977815186677, "flos": 18186959715840.0, "grad_norm": 11.760314099154748, "language_loss": 0.73776066, "learning_rate": 1.7305709139530334e-08, "loss": 0.75972807, "num_input_tokens_seen": 172209785, "step": 7978, "time_per_iteration": 2.619001626968384 }, { "auxiliary_loss_clip": 0.0114426, "auxiliary_loss_mlp": 0.01028035, "balance_loss_clip": 1.04372633, "balance_loss_mlp": 1.02068043, "epoch": 0.9594180244093068, "flos": 16537797555840.0, "grad_norm": 2.5625011066844747, "language_loss": 0.74911332, "learning_rate": 1.7203606815407334e-08, "loss": 0.77083623, "num_input_tokens_seen": 172224380, "step": 7979, "time_per_iteration": 2.69500470161438 }, { "auxiliary_loss_clip": 0.01142117, "auxiliary_loss_mlp": 0.01024418, "balance_loss_clip": 1.04879618, "balance_loss_mlp": 1.01733351, "epoch": 0.9595382672999458, "flos": 20554334317440.0, "grad_norm": 1.7632093334438295, "language_loss": 0.79388767, "learning_rate": 1.7101805279920557e-08, "loss": 0.81555301, "num_input_tokens_seen": 172242540, "step": 7980, "time_per_iteration": 3.5900635719299316 }, { "auxiliary_loss_clip": 0.01170851, "auxiliary_loss_mlp": 0.0102604, "balance_loss_clip": 1.05126762, "balance_loss_mlp": 1.01908743, "epoch": 0.959658510190585, "flos": 22638266697600.0, "grad_norm": 2.0475325240460647, "language_loss": 0.80972242, "learning_rate": 1.7000304548513643e-08, "loss": 0.83169127, "num_input_tokens_seen": 172262645, "step": 7981, "time_per_iteration": 2.697672128677368 }, { "auxiliary_loss_clip": 0.01118632, "auxiliary_loss_mlp": 0.01025448, "balance_loss_clip": 1.04485238, "balance_loss_mlp": 1.01827717, "epoch": 0.9597787530812241, "flos": 19135252725120.0, "grad_norm": 2.241830521988905, "language_loss": 0.82804048, "learning_rate": 1.6899104636583394e-08, "loss": 0.84948123, "num_input_tokens_seen": 172280695, "step": 7982, "time_per_iteration": 3.7023682594299316 }, { "auxiliary_loss_clip": 0.01059166, "auxiliary_loss_mlp": 0.01002153, "balance_loss_clip": 1.0180043, "balance_loss_mlp": 1.00124669, "epoch": 0.9598989959718631, "flos": 60098124055680.0, "grad_norm": 0.7313470356147372, "language_loss": 0.61911517, "learning_rate": 1.6798205559482638e-08, "loss": 0.63972837, "num_input_tokens_seen": 172343075, "step": 7983, "time_per_iteration": 3.371561288833618 }, { "auxiliary_loss_clip": 0.01122471, "auxiliary_loss_mlp": 0.01025002, "balance_loss_clip": 1.04491305, "balance_loss_mlp": 1.01789141, "epoch": 0.9600192388625023, "flos": 20886795624960.0, "grad_norm": 20.50858192438435, "language_loss": 0.76752508, "learning_rate": 1.669760733251713e-08, "loss": 0.7889998, "num_input_tokens_seen": 172361950, "step": 7984, "time_per_iteration": 3.683342695236206 }, { "auxiliary_loss_clip": 0.0109246, "auxiliary_loss_mlp": 0.01024186, "balance_loss_clip": 1.04023635, "balance_loss_mlp": 1.01762342, "epoch": 0.9601394817531413, "flos": 20445740524800.0, "grad_norm": 1.6201120221659588, "language_loss": 0.82478225, "learning_rate": 1.659730997094755e-08, "loss": 0.8459487, "num_input_tokens_seen": 172380440, "step": 7985, "time_per_iteration": 3.8223729133605957 }, { "auxiliary_loss_clip": 0.01146124, "auxiliary_loss_mlp": 0.0102663, "balance_loss_clip": 1.04701114, "balance_loss_mlp": 1.02002883, "epoch": 0.9602597246437804, "flos": 21507152440320.0, "grad_norm": 1.8134000417948124, "language_loss": 0.62090981, "learning_rate": 1.6497313489989283e-08, "loss": 0.64263737, "num_input_tokens_seen": 172400265, "step": 7986, "time_per_iteration": 2.7332675457000732 }, { "auxiliary_loss_clip": 0.01100857, "auxiliary_loss_mlp": 0.01024343, "balance_loss_clip": 1.03773868, "balance_loss_mlp": 1.01733053, "epoch": 0.9603799675344196, "flos": 29935099152000.0, "grad_norm": 3.28390085743871, "language_loss": 0.69786406, "learning_rate": 1.639761790481131e-08, "loss": 0.71911603, "num_input_tokens_seen": 172421145, "step": 7987, "time_per_iteration": 2.8506522178649902 }, { "auxiliary_loss_clip": 0.01154274, "auxiliary_loss_mlp": 0.01029716, "balance_loss_clip": 1.04813039, "balance_loss_mlp": 1.02276921, "epoch": 0.9605002104250586, "flos": 28001525103360.0, "grad_norm": 1.975357899409671, "language_loss": 0.79321492, "learning_rate": 1.6298223230537754e-08, "loss": 0.81505477, "num_input_tokens_seen": 172438945, "step": 7988, "time_per_iteration": 2.7952890396118164 }, { "auxiliary_loss_clip": 0.01134881, "auxiliary_loss_mlp": 0.00711874, "balance_loss_clip": 1.04559851, "balance_loss_mlp": 1.00061488, "epoch": 0.9606204533156977, "flos": 35590490870400.0, "grad_norm": 2.218828429307126, "language_loss": 0.70009983, "learning_rate": 1.619912948224611e-08, "loss": 0.71856737, "num_input_tokens_seen": 172460150, "step": 7989, "time_per_iteration": 2.860358953475952 }, { "auxiliary_loss_clip": 0.01116393, "auxiliary_loss_mlp": 0.01031982, "balance_loss_clip": 1.04541874, "balance_loss_mlp": 1.024472, "epoch": 0.9607406962063368, "flos": 26574614346240.0, "grad_norm": 2.241478647184205, "language_loss": 0.61309171, "learning_rate": 1.6100336674969682e-08, "loss": 0.63457549, "num_input_tokens_seen": 172478990, "step": 7990, "time_per_iteration": 2.7880051136016846 }, { "auxiliary_loss_clip": 0.01109041, "auxiliary_loss_mlp": 0.0102427, "balance_loss_clip": 1.04270041, "balance_loss_mlp": 1.01681352, "epoch": 0.9608609390969759, "flos": 25331781813120.0, "grad_norm": 1.762555870926184, "language_loss": 0.76404202, "learning_rate": 1.600184482369449e-08, "loss": 0.78537512, "num_input_tokens_seen": 172498905, "step": 7991, "time_per_iteration": 2.8360159397125244 }, { "auxiliary_loss_clip": 0.0112238, "auxiliary_loss_mlp": 0.01024864, "balance_loss_clip": 1.04235053, "balance_loss_mlp": 1.01736569, "epoch": 0.960981181987615, "flos": 21069114082560.0, "grad_norm": 3.034242200220366, "language_loss": 0.89596951, "learning_rate": 1.5903653943362126e-08, "loss": 0.91744196, "num_input_tokens_seen": 172517900, "step": 7992, "time_per_iteration": 2.7257378101348877 }, { "auxiliary_loss_clip": 0.01139443, "auxiliary_loss_mlp": 0.01026744, "balance_loss_clip": 1.04810345, "balance_loss_mlp": 1.0196569, "epoch": 0.9611014248782541, "flos": 17823256554240.0, "grad_norm": 1.8129371169793356, "language_loss": 0.77049744, "learning_rate": 1.580576404886802e-08, "loss": 0.79215932, "num_input_tokens_seen": 172536430, "step": 7993, "time_per_iteration": 2.715646505355835 }, { "auxiliary_loss_clip": 0.01153359, "auxiliary_loss_mlp": 0.01021647, "balance_loss_clip": 1.04825139, "balance_loss_mlp": 1.01508427, "epoch": 0.9612216677688932, "flos": 19354631040000.0, "grad_norm": 2.3407426176651907, "language_loss": 0.79786074, "learning_rate": 1.570817515506162e-08, "loss": 0.81961077, "num_input_tokens_seen": 172555120, "step": 7994, "time_per_iteration": 2.68605375289917 }, { "auxiliary_loss_clip": 0.01167872, "auxiliary_loss_mlp": 0.01021833, "balance_loss_clip": 1.05028129, "balance_loss_mlp": 1.01488566, "epoch": 0.9613419106595322, "flos": 15808739207040.0, "grad_norm": 2.1660107199347562, "language_loss": 0.81122839, "learning_rate": 1.561088727674753e-08, "loss": 0.83312547, "num_input_tokens_seen": 172569330, "step": 7995, "time_per_iteration": 2.6340320110321045 }, { "auxiliary_loss_clip": 0.01120983, "auxiliary_loss_mlp": 0.01023853, "balance_loss_clip": 1.04642618, "balance_loss_mlp": 1.01602387, "epoch": 0.9614621535501714, "flos": 25702488126720.0, "grad_norm": 4.839485108864191, "language_loss": 0.71202481, "learning_rate": 1.551390042868417e-08, "loss": 0.73347318, "num_input_tokens_seen": 172591100, "step": 7996, "time_per_iteration": 2.76991868019104 }, { "auxiliary_loss_clip": 0.01153945, "auxiliary_loss_mlp": 0.01024716, "balance_loss_clip": 1.04874086, "balance_loss_mlp": 1.01803088, "epoch": 0.9615823964408104, "flos": 17819054663040.0, "grad_norm": 2.0747916082745532, "language_loss": 0.70518994, "learning_rate": 1.5417214625584207e-08, "loss": 0.72697657, "num_input_tokens_seen": 172608755, "step": 7997, "time_per_iteration": 2.7208526134490967 }, { "auxiliary_loss_clip": 0.01146185, "auxiliary_loss_mlp": 0.01029471, "balance_loss_clip": 1.04574466, "balance_loss_mlp": 1.02250934, "epoch": 0.9617026393314495, "flos": 20190020624640.0, "grad_norm": 1.7661702710095981, "language_loss": 0.85570812, "learning_rate": 1.5320829882114806e-08, "loss": 0.87746465, "num_input_tokens_seen": 172626830, "step": 7998, "time_per_iteration": 2.7239367961883545 }, { "auxiliary_loss_clip": 0.01170055, "auxiliary_loss_mlp": 0.01024349, "balance_loss_clip": 1.04918504, "balance_loss_mlp": 1.01737523, "epoch": 0.9618228822220887, "flos": 20267013427200.0, "grad_norm": 2.8181405330778535, "language_loss": 0.78910208, "learning_rate": 1.5224746212897378e-08, "loss": 0.81104612, "num_input_tokens_seen": 172646125, "step": 7999, "time_per_iteration": 2.667255163192749 }, { "auxiliary_loss_clip": 0.01164269, "auxiliary_loss_mlp": 0.0102099, "balance_loss_clip": 1.0488205, "balance_loss_mlp": 1.01425457, "epoch": 0.9619431251127277, "flos": 21031300039680.0, "grad_norm": 3.1507307415322123, "language_loss": 0.77544743, "learning_rate": 1.512896363250804e-08, "loss": 0.79729998, "num_input_tokens_seen": 172666235, "step": 8000, "time_per_iteration": 2.726018190383911 }, { "auxiliary_loss_clip": 0.01154201, "auxiliary_loss_mlp": 0.01026873, "balance_loss_clip": 1.04640806, "balance_loss_mlp": 1.01973295, "epoch": 0.9620633680033668, "flos": 22382654538240.0, "grad_norm": 2.1497366052021203, "language_loss": 0.75638855, "learning_rate": 1.503348215547673e-08, "loss": 0.77819932, "num_input_tokens_seen": 172687325, "step": 8001, "time_per_iteration": 2.665170669555664 }, { "auxiliary_loss_clip": 0.0113435, "auxiliary_loss_mlp": 0.01030741, "balance_loss_clip": 1.04597461, "balance_loss_mlp": 1.02347767, "epoch": 0.962183610894006, "flos": 18471730740480.0, "grad_norm": 1.9316437598133311, "language_loss": 0.80842704, "learning_rate": 1.4938301796288078e-08, "loss": 0.83007789, "num_input_tokens_seen": 172703895, "step": 8002, "time_per_iteration": 2.676135301589966 }, { "auxiliary_loss_clip": 0.0116994, "auxiliary_loss_mlp": 0.01020777, "balance_loss_clip": 1.05071521, "balance_loss_mlp": 1.01333284, "epoch": 0.962303853784645, "flos": 18435245500800.0, "grad_norm": 12.008645383266545, "language_loss": 0.81895441, "learning_rate": 1.4843422569380537e-08, "loss": 0.84086162, "num_input_tokens_seen": 172720650, "step": 8003, "time_per_iteration": 2.6935348510742188 }, { "auxiliary_loss_clip": 0.01103732, "auxiliary_loss_mlp": 0.01026771, "balance_loss_clip": 1.04171479, "balance_loss_mlp": 1.01980317, "epoch": 0.9624240966752841, "flos": 26391074826240.0, "grad_norm": 2.020347338141256, "language_loss": 0.82874566, "learning_rate": 1.4748844489147483e-08, "loss": 0.85005069, "num_input_tokens_seen": 172737640, "step": 8004, "time_per_iteration": 2.795074224472046 }, { "auxiliary_loss_clip": 0.01132137, "auxiliary_loss_mlp": 0.01022282, "balance_loss_clip": 1.04214656, "balance_loss_mlp": 1.01560652, "epoch": 0.9625443395659231, "flos": 14647675985280.0, "grad_norm": 1.974962703386171, "language_loss": 0.71196842, "learning_rate": 1.4654567569936326e-08, "loss": 0.73351264, "num_input_tokens_seen": 172755215, "step": 8005, "time_per_iteration": 2.7067174911499023 }, { "auxiliary_loss_clip": 0.01098755, "auxiliary_loss_mlp": 0.01023312, "balance_loss_clip": 1.04234493, "balance_loss_mlp": 1.01654351, "epoch": 0.9626645824565623, "flos": 18367626147840.0, "grad_norm": 2.201138339401755, "language_loss": 0.82967919, "learning_rate": 1.456059182604874e-08, "loss": 0.85089988, "num_input_tokens_seen": 172774020, "step": 8006, "time_per_iteration": 3.7097926139831543 }, { "auxiliary_loss_clip": 0.0116773, "auxiliary_loss_mlp": 0.01026, "balance_loss_clip": 1.04884648, "balance_loss_mlp": 1.01903558, "epoch": 0.9627848253472013, "flos": 16580424021120.0, "grad_norm": 1.7486355119614714, "language_loss": 0.76450628, "learning_rate": 1.4466917271740653e-08, "loss": 0.78644359, "num_input_tokens_seen": 172792220, "step": 8007, "time_per_iteration": 2.6809728145599365 }, { "auxiliary_loss_clip": 0.01133059, "auxiliary_loss_mlp": 0.01027394, "balance_loss_clip": 1.0465467, "balance_loss_mlp": 1.02016091, "epoch": 0.9629050682378404, "flos": 20886867452160.0, "grad_norm": 2.23907366479767, "language_loss": 0.67865312, "learning_rate": 1.4373543921222697e-08, "loss": 0.70025766, "num_input_tokens_seen": 172811805, "step": 8008, "time_per_iteration": 3.640333414077759 }, { "auxiliary_loss_clip": 0.01136261, "auxiliary_loss_mlp": 0.01027361, "balance_loss_clip": 1.0469712, "balance_loss_mlp": 1.01992512, "epoch": 0.9630253111284796, "flos": 17019252478080.0, "grad_norm": 2.180652340264768, "language_loss": 0.78228712, "learning_rate": 1.428047178865932e-08, "loss": 0.80392337, "num_input_tokens_seen": 172828595, "step": 8009, "time_per_iteration": 3.5937907695770264 }, { "auxiliary_loss_clip": 0.01133621, "auxiliary_loss_mlp": 0.01029423, "balance_loss_clip": 1.04386425, "balance_loss_mlp": 1.02212465, "epoch": 0.9631455540191186, "flos": 20338942412160.0, "grad_norm": 1.6418760871383606, "language_loss": 0.74686801, "learning_rate": 1.4187700888169451e-08, "loss": 0.76849842, "num_input_tokens_seen": 172847770, "step": 8010, "time_per_iteration": 2.739924669265747 }, { "auxiliary_loss_clip": 0.01057003, "auxiliary_loss_mlp": 0.01000873, "balance_loss_clip": 1.01801312, "balance_loss_mlp": 0.9999792, "epoch": 0.9632657969097577, "flos": 65956700033280.0, "grad_norm": 0.7495216479019587, "language_loss": 0.56994534, "learning_rate": 1.40952312338265e-08, "loss": 0.59052414, "num_input_tokens_seen": 172912415, "step": 8011, "time_per_iteration": 4.1847004890441895 }, { "auxiliary_loss_clip": 0.01123935, "auxiliary_loss_mlp": 0.01026744, "balance_loss_clip": 1.0447309, "balance_loss_mlp": 1.01961553, "epoch": 0.9633860398003968, "flos": 44419523823360.0, "grad_norm": 2.901624574187183, "language_loss": 0.68675029, "learning_rate": 1.4003062839657909e-08, "loss": 0.70825708, "num_input_tokens_seen": 172934895, "step": 8012, "time_per_iteration": 2.930405378341675 }, { "auxiliary_loss_clip": 0.01126948, "auxiliary_loss_mlp": 0.01022022, "balance_loss_clip": 1.0463593, "balance_loss_mlp": 1.01470852, "epoch": 0.9635062826910359, "flos": 24827704300800.0, "grad_norm": 2.4076390547923276, "language_loss": 0.79921842, "learning_rate": 1.391119571964583e-08, "loss": 0.8207081, "num_input_tokens_seen": 172955835, "step": 8013, "time_per_iteration": 2.800447463989258 }, { "auxiliary_loss_clip": 0.01152094, "auxiliary_loss_mlp": 0.01026016, "balance_loss_clip": 1.04880905, "balance_loss_mlp": 1.01908708, "epoch": 0.9636265255816749, "flos": 15961360095360.0, "grad_norm": 8.309214575836533, "language_loss": 0.72922069, "learning_rate": 1.3819629887726225e-08, "loss": 0.75100183, "num_input_tokens_seen": 172973925, "step": 8014, "time_per_iteration": 2.6582860946655273 }, { "auxiliary_loss_clip": 0.01145163, "auxiliary_loss_mlp": 0.01027386, "balance_loss_clip": 1.04982972, "balance_loss_mlp": 1.02066851, "epoch": 0.9637467684723141, "flos": 22601781457920.0, "grad_norm": 2.0297304478794005, "language_loss": 0.76649529, "learning_rate": 1.3728365357789317e-08, "loss": 0.78822076, "num_input_tokens_seen": 172993290, "step": 8015, "time_per_iteration": 2.7211618423461914 }, { "auxiliary_loss_clip": 0.01076687, "auxiliary_loss_mlp": 0.01022433, "balance_loss_clip": 1.03832269, "balance_loss_mlp": 1.01533377, "epoch": 0.9638670113629532, "flos": 17565812801280.0, "grad_norm": 2.3766718745205764, "language_loss": 0.76475203, "learning_rate": 1.3637402143680254e-08, "loss": 0.78574324, "num_input_tokens_seen": 173008190, "step": 8016, "time_per_iteration": 2.7422988414764404 }, { "auxiliary_loss_clip": 0.010295, "auxiliary_loss_mlp": 0.01000733, "balance_loss_clip": 1.01847744, "balance_loss_mlp": 0.99985045, "epoch": 0.9639872542535922, "flos": 55072139379840.0, "grad_norm": 0.7243199608813794, "language_loss": 0.55057251, "learning_rate": 1.3546740259197998e-08, "loss": 0.57087481, "num_input_tokens_seen": 173061000, "step": 8017, "time_per_iteration": 3.269850492477417 }, { "auxiliary_loss_clip": 0.01138024, "auxiliary_loss_mlp": 0.01026978, "balance_loss_clip": 1.04657149, "balance_loss_mlp": 1.01984, "epoch": 0.9641074971442314, "flos": 24134484746880.0, "grad_norm": 2.4455903336875666, "language_loss": 0.70296264, "learning_rate": 1.3456379718095989e-08, "loss": 0.72461259, "num_input_tokens_seen": 173081415, "step": 8018, "time_per_iteration": 2.7725298404693604 }, { "auxiliary_loss_clip": 0.01044772, "auxiliary_loss_mlp": 0.01001005, "balance_loss_clip": 1.01751661, "balance_loss_mlp": 1.00002182, "epoch": 0.9642277400348704, "flos": 66747416077440.0, "grad_norm": 0.8422206771556116, "language_loss": 0.62016267, "learning_rate": 1.3366320534081487e-08, "loss": 0.64062047, "num_input_tokens_seen": 173144095, "step": 8019, "time_per_iteration": 3.2940032482147217 }, { "auxiliary_loss_clip": 0.01151264, "auxiliary_loss_mlp": 0.01023212, "balance_loss_clip": 1.0471971, "balance_loss_mlp": 1.01563668, "epoch": 0.9643479829255095, "flos": 30920272450560.0, "grad_norm": 2.4507102789421773, "language_loss": 0.76000196, "learning_rate": 1.3276562720816675e-08, "loss": 0.78174669, "num_input_tokens_seen": 173165605, "step": 8020, "time_per_iteration": 2.7464325428009033 }, { "auxiliary_loss_clip": 0.01167984, "auxiliary_loss_mlp": 0.01025456, "balance_loss_clip": 1.04818344, "balance_loss_mlp": 1.01807141, "epoch": 0.9644682258161487, "flos": 20048245643520.0, "grad_norm": 2.1139008475365455, "language_loss": 0.82706249, "learning_rate": 1.3187106291917549e-08, "loss": 0.84899688, "num_input_tokens_seen": 173182595, "step": 8021, "time_per_iteration": 2.6405892372131348 }, { "auxiliary_loss_clip": 0.01147646, "auxiliary_loss_mlp": 0.01020454, "balance_loss_clip": 1.04772878, "balance_loss_mlp": 1.01353645, "epoch": 0.9645884687067877, "flos": 21178713456000.0, "grad_norm": 1.8622906432161306, "language_loss": 0.70714808, "learning_rate": 1.309795126095503e-08, "loss": 0.72882903, "num_input_tokens_seen": 173200895, "step": 8022, "time_per_iteration": 2.689060926437378 }, { "auxiliary_loss_clip": 0.01069627, "auxiliary_loss_mlp": 0.01024534, "balance_loss_clip": 1.03879535, "balance_loss_mlp": 1.01752758, "epoch": 0.9647087115974268, "flos": 18945967029120.0, "grad_norm": 2.3938559202945595, "language_loss": 0.81110084, "learning_rate": 1.3009097641453192e-08, "loss": 0.8320424, "num_input_tokens_seen": 173218745, "step": 8023, "time_per_iteration": 2.799929618835449 }, { "auxiliary_loss_clip": 0.01139532, "auxiliary_loss_mlp": 0.01023398, "balance_loss_clip": 1.0495553, "balance_loss_mlp": 1.01630545, "epoch": 0.9648289544880659, "flos": 16545088016640.0, "grad_norm": 1.9060503197144991, "language_loss": 0.76122439, "learning_rate": 1.2920545446891474e-08, "loss": 0.78285372, "num_input_tokens_seen": 173235465, "step": 8024, "time_per_iteration": 2.7224488258361816 }, { "auxiliary_loss_clip": 0.01143097, "auxiliary_loss_mlp": 0.01031058, "balance_loss_clip": 1.04977643, "balance_loss_mlp": 1.02390862, "epoch": 0.964949197378705, "flos": 24057527857920.0, "grad_norm": 1.971905477685287, "language_loss": 0.70640624, "learning_rate": 1.2832294690703127e-08, "loss": 0.72814775, "num_input_tokens_seen": 173254440, "step": 8025, "time_per_iteration": 2.677696466445923 }, { "auxiliary_loss_clip": 0.01152893, "auxiliary_loss_mlp": 0.01027807, "balance_loss_clip": 1.0496397, "balance_loss_mlp": 1.01998997, "epoch": 0.965069440269344, "flos": 23365565280000.0, "grad_norm": 1.9958286681305106, "language_loss": 0.77603185, "learning_rate": 1.2744345386275668e-08, "loss": 0.79783887, "num_input_tokens_seen": 173273980, "step": 8026, "time_per_iteration": 2.7776904106140137 }, { "auxiliary_loss_clip": 0.01144187, "auxiliary_loss_mlp": 0.01029309, "balance_loss_clip": 1.05077982, "balance_loss_mlp": 1.02207875, "epoch": 0.9651896831599832, "flos": 25374875155200.0, "grad_norm": 1.6559436516075061, "language_loss": 0.78508985, "learning_rate": 1.265669754695109e-08, "loss": 0.80682486, "num_input_tokens_seen": 173293550, "step": 8027, "time_per_iteration": 2.7148401737213135 }, { "auxiliary_loss_clip": 0.01092135, "auxiliary_loss_mlp": 0.01028747, "balance_loss_clip": 1.0397383, "balance_loss_mlp": 1.02149904, "epoch": 0.9653099260506223, "flos": 22272875596800.0, "grad_norm": 2.3302062580735545, "language_loss": 0.82276136, "learning_rate": 1.2569351186025201e-08, "loss": 0.84397018, "num_input_tokens_seen": 173312005, "step": 8028, "time_per_iteration": 2.777538537979126 }, { "auxiliary_loss_clip": 0.01113582, "auxiliary_loss_mlp": 0.01024151, "balance_loss_clip": 1.04463744, "balance_loss_mlp": 1.01743042, "epoch": 0.9654301689412613, "flos": 26760847386240.0, "grad_norm": 1.563781613595874, "language_loss": 0.75517279, "learning_rate": 1.2482306316748737e-08, "loss": 0.77655017, "num_input_tokens_seen": 173332450, "step": 8029, "time_per_iteration": 2.7940590381622314 }, { "auxiliary_loss_clip": 0.01158135, "auxiliary_loss_mlp": 0.01023131, "balance_loss_clip": 1.04755068, "balance_loss_mlp": 1.01601672, "epoch": 0.9655504118319005, "flos": 17412689122560.0, "grad_norm": 2.4486273781469547, "language_loss": 0.78511858, "learning_rate": 1.2395562952326021e-08, "loss": 0.80693126, "num_input_tokens_seen": 173349610, "step": 8030, "time_per_iteration": 2.630967140197754 }, { "auxiliary_loss_clip": 0.01147459, "auxiliary_loss_mlp": 0.0102753, "balance_loss_clip": 1.04935145, "balance_loss_mlp": 1.02010965, "epoch": 0.9656706547225395, "flos": 22126970551680.0, "grad_norm": 2.2990186026937613, "language_loss": 0.81124759, "learning_rate": 1.2309121105916309e-08, "loss": 0.83299756, "num_input_tokens_seen": 173367900, "step": 8031, "time_per_iteration": 2.7483701705932617 }, { "auxiliary_loss_clip": 0.01156042, "auxiliary_loss_mlp": 0.01025221, "balance_loss_clip": 1.04828608, "balance_loss_mlp": 1.01772881, "epoch": 0.9657908976131786, "flos": 37049289926400.0, "grad_norm": 3.150214999426561, "language_loss": 0.69321024, "learning_rate": 1.222298079063222e-08, "loss": 0.71502292, "num_input_tokens_seen": 173389040, "step": 8032, "time_per_iteration": 2.7879576683044434 }, { "auxiliary_loss_clip": 0.01153666, "auxiliary_loss_mlp": 0.01026394, "balance_loss_clip": 1.04852712, "balance_loss_mlp": 1.01968884, "epoch": 0.9659111405038178, "flos": 24389809597440.0, "grad_norm": 2.2822949712869676, "language_loss": 0.72727275, "learning_rate": 1.2137142019541524e-08, "loss": 0.74907333, "num_input_tokens_seen": 173407595, "step": 8033, "time_per_iteration": 3.59464168548584 }, { "auxiliary_loss_clip": 0.01143052, "auxiliary_loss_mlp": 0.01025017, "balance_loss_clip": 1.04654741, "balance_loss_mlp": 1.01773369, "epoch": 0.9660313833944568, "flos": 25009412227200.0, "grad_norm": 2.1638782562983585, "language_loss": 0.73454005, "learning_rate": 1.2051604805666027e-08, "loss": 0.75622076, "num_input_tokens_seen": 173424720, "step": 8034, "time_per_iteration": 3.702583074569702 }, { "auxiliary_loss_clip": 0.01167834, "auxiliary_loss_mlp": 0.00711291, "balance_loss_clip": 1.04932582, "balance_loss_mlp": 1.00065804, "epoch": 0.9661516262850959, "flos": 11801575895040.0, "grad_norm": 7.888057911224753, "language_loss": 0.78547013, "learning_rate": 1.196636916198135e-08, "loss": 0.80426145, "num_input_tokens_seen": 173442260, "step": 8035, "time_per_iteration": 3.5424580574035645 }, { "auxiliary_loss_clip": 0.01171213, "auxiliary_loss_mlp": 0.01024717, "balance_loss_clip": 1.04951334, "balance_loss_mlp": 1.01782691, "epoch": 0.9662718691757349, "flos": 20047778766720.0, "grad_norm": 11.507590410016535, "language_loss": 0.77322417, "learning_rate": 1.1881435101418036e-08, "loss": 0.79518348, "num_input_tokens_seen": 173461675, "step": 8036, "time_per_iteration": 2.7016048431396484 }, { "auxiliary_loss_clip": 0.01047528, "auxiliary_loss_mlp": 0.01004329, "balance_loss_clip": 1.01927543, "balance_loss_mlp": 1.0032444, "epoch": 0.9663921120663741, "flos": 68027703517440.0, "grad_norm": 0.7239850001992091, "language_loss": 0.65473932, "learning_rate": 1.1796802636860003e-08, "loss": 0.67525792, "num_input_tokens_seen": 173530205, "step": 8037, "time_per_iteration": 4.228469133377075 }, { "auxiliary_loss_clip": 0.01169595, "auxiliary_loss_mlp": 0.01026074, "balance_loss_clip": 1.0486877, "balance_loss_mlp": 1.01868916, "epoch": 0.9665123549570132, "flos": 26322916769280.0, "grad_norm": 1.9018897467382367, "language_loss": 0.73359847, "learning_rate": 1.1712471781146316e-08, "loss": 0.75555515, "num_input_tokens_seen": 173549540, "step": 8038, "time_per_iteration": 2.631963014602661 }, { "auxiliary_loss_clip": 0.01165079, "auxiliary_loss_mlp": 0.01026839, "balance_loss_clip": 1.0465045, "balance_loss_mlp": 1.01975548, "epoch": 0.9666325978476522, "flos": 43941121557120.0, "grad_norm": 1.8522430993917414, "language_loss": 0.6719088, "learning_rate": 1.1628442547069628e-08, "loss": 0.69382799, "num_input_tokens_seen": 173571740, "step": 8039, "time_per_iteration": 2.77868914604187 }, { "auxiliary_loss_clip": 0.01156277, "auxiliary_loss_mlp": 0.00711694, "balance_loss_clip": 1.04792643, "balance_loss_mlp": 1.00063062, "epoch": 0.9667528407382914, "flos": 21543422198400.0, "grad_norm": 2.3404960373844457, "language_loss": 0.77295947, "learning_rate": 1.1544714947377521e-08, "loss": 0.79163915, "num_input_tokens_seen": 173589425, "step": 8040, "time_per_iteration": 2.6454248428344727 }, { "auxiliary_loss_clip": 0.01171367, "auxiliary_loss_mlp": 0.01027892, "balance_loss_clip": 1.0509181, "balance_loss_mlp": 1.02025104, "epoch": 0.9668730836289304, "flos": 23878585278720.0, "grad_norm": 2.3336448548232056, "language_loss": 0.70452774, "learning_rate": 1.1461288994770945e-08, "loss": 0.7265203, "num_input_tokens_seen": 173608500, "step": 8041, "time_per_iteration": 2.6170825958251953 }, { "auxiliary_loss_clip": 0.01170693, "auxiliary_loss_mlp": 0.01025085, "balance_loss_clip": 1.04873085, "balance_loss_mlp": 1.01777196, "epoch": 0.9669933265195695, "flos": 28293011971200.0, "grad_norm": 1.982767727366814, "language_loss": 0.77680999, "learning_rate": 1.1378164701906002e-08, "loss": 0.79876781, "num_input_tokens_seen": 173630265, "step": 8042, "time_per_iteration": 2.6174752712249756 }, { "auxiliary_loss_clip": 0.01171603, "auxiliary_loss_mlp": 0.01022095, "balance_loss_clip": 1.05061316, "balance_loss_mlp": 1.01449215, "epoch": 0.9671135694102087, "flos": 22454763091200.0, "grad_norm": 2.0828922724099113, "language_loss": 0.66940355, "learning_rate": 1.1295342081392156e-08, "loss": 0.69134057, "num_input_tokens_seen": 173649625, "step": 8043, "time_per_iteration": 2.5890891551971436 }, { "auxiliary_loss_clip": 0.01138462, "auxiliary_loss_mlp": 0.01023159, "balance_loss_clip": 1.04643905, "balance_loss_mlp": 1.01633739, "epoch": 0.9672338123008477, "flos": 20155941596160.0, "grad_norm": 1.6727474840959007, "language_loss": 0.69223058, "learning_rate": 1.1212821145793804e-08, "loss": 0.7138468, "num_input_tokens_seen": 173669240, "step": 8044, "time_per_iteration": 2.641514778137207 }, { "auxiliary_loss_clip": 0.01134588, "auxiliary_loss_mlp": 0.01022848, "balance_loss_clip": 1.04462051, "balance_loss_mlp": 1.01596069, "epoch": 0.9673540551914868, "flos": 16977487939200.0, "grad_norm": 2.079097249917001, "language_loss": 0.79310286, "learning_rate": 1.1130601907629156e-08, "loss": 0.81467724, "num_input_tokens_seen": 173686970, "step": 8045, "time_per_iteration": 2.6793015003204346 }, { "auxiliary_loss_clip": 0.01059255, "auxiliary_loss_mlp": 0.01001791, "balance_loss_clip": 1.01786232, "balance_loss_mlp": 1.00084877, "epoch": 0.9674742980821259, "flos": 61892903952000.0, "grad_norm": 0.8117042496281692, "language_loss": 0.64754784, "learning_rate": 1.1048684379370899e-08, "loss": 0.66815829, "num_input_tokens_seen": 173747655, "step": 8046, "time_per_iteration": 3.206752061843872 }, { "auxiliary_loss_clip": 0.01129167, "auxiliary_loss_mlp": 0.01018267, "balance_loss_clip": 1.04674125, "balance_loss_mlp": 1.01168919, "epoch": 0.967594540972765, "flos": 18697824898560.0, "grad_norm": 2.320636866633352, "language_loss": 0.74525368, "learning_rate": 1.0967068573445759e-08, "loss": 0.76672798, "num_input_tokens_seen": 173765140, "step": 8047, "time_per_iteration": 2.6241061687469482 }, { "auxiliary_loss_clip": 0.01132221, "auxiliary_loss_mlp": 0.0102503, "balance_loss_clip": 1.04258657, "balance_loss_mlp": 1.01726937, "epoch": 0.967714783863404, "flos": 20777411733120.0, "grad_norm": 2.6533981465764285, "language_loss": 0.64765179, "learning_rate": 1.0885754502234945e-08, "loss": 0.66922426, "num_input_tokens_seen": 173784800, "step": 8048, "time_per_iteration": 2.645118474960327 }, { "auxiliary_loss_clip": 0.01117598, "auxiliary_loss_mlp": 0.01027069, "balance_loss_clip": 1.04405046, "balance_loss_mlp": 1.02025318, "epoch": 0.9678350267540432, "flos": 23185473465600.0, "grad_norm": 1.9791558208841789, "language_loss": 0.77859139, "learning_rate": 1.08047421780737e-08, "loss": 0.80003798, "num_input_tokens_seen": 173803990, "step": 8049, "time_per_iteration": 2.661529779434204 }, { "auxiliary_loss_clip": 0.01143496, "auxiliary_loss_mlp": 0.00711336, "balance_loss_clip": 1.04571474, "balance_loss_mlp": 1.00055695, "epoch": 0.9679552696446823, "flos": 21726063878400.0, "grad_norm": 2.5193636042669847, "language_loss": 0.742248, "learning_rate": 1.0724031613251305e-08, "loss": 0.76079631, "num_input_tokens_seen": 173821890, "step": 8050, "time_per_iteration": 2.6497316360473633 }, { "auxiliary_loss_clip": 0.01157081, "auxiliary_loss_mlp": 0.01029224, "balance_loss_clip": 1.04757953, "balance_loss_mlp": 1.02209544, "epoch": 0.9680755125353213, "flos": 26869046129280.0, "grad_norm": 2.362199958455066, "language_loss": 0.66781497, "learning_rate": 1.0643622820011744e-08, "loss": 0.68967795, "num_input_tokens_seen": 173842945, "step": 8051, "time_per_iteration": 2.6020708084106445 }, { "auxiliary_loss_clip": 0.01172398, "auxiliary_loss_mlp": 0.01024763, "balance_loss_clip": 1.04938412, "balance_loss_mlp": 1.01702654, "epoch": 0.9681957554259605, "flos": 28325008010880.0, "grad_norm": 2.5207408273642704, "language_loss": 0.68114203, "learning_rate": 1.0563515810552814e-08, "loss": 0.70311362, "num_input_tokens_seen": 173859915, "step": 8052, "time_per_iteration": 2.680706739425659 }, { "auxiliary_loss_clip": 0.01171952, "auxiliary_loss_mlp": 0.01031753, "balance_loss_clip": 1.05214679, "balance_loss_mlp": 1.02449024, "epoch": 0.9683159983165995, "flos": 20557674282240.0, "grad_norm": 1.7549110044537122, "language_loss": 0.73276734, "learning_rate": 1.0483710597026795e-08, "loss": 0.75480443, "num_input_tokens_seen": 173879775, "step": 8053, "time_per_iteration": 2.57613205909729 }, { "auxiliary_loss_clip": 0.01121011, "auxiliary_loss_mlp": 0.01021924, "balance_loss_clip": 1.04470026, "balance_loss_mlp": 1.01472414, "epoch": 0.9684362412072386, "flos": 24207958016640.0, "grad_norm": 1.9981939599136038, "language_loss": 0.74213004, "learning_rate": 1.0404207191540227e-08, "loss": 0.7635594, "num_input_tokens_seen": 173900230, "step": 8054, "time_per_iteration": 2.752224922180176 }, { "auxiliary_loss_clip": 0.01167196, "auxiliary_loss_mlp": 0.01023615, "balance_loss_clip": 1.04841471, "balance_loss_mlp": 1.01659966, "epoch": 0.9685564840978778, "flos": 22346241125760.0, "grad_norm": 2.0843678957377834, "language_loss": 0.74829799, "learning_rate": 1.0325005606153236e-08, "loss": 0.77020609, "num_input_tokens_seen": 173919690, "step": 8055, "time_per_iteration": 2.572883129119873 }, { "auxiliary_loss_clip": 0.01109249, "auxiliary_loss_mlp": 0.01025567, "balance_loss_clip": 1.0431869, "balance_loss_mlp": 1.01834321, "epoch": 0.9686767269885168, "flos": 14386389477120.0, "grad_norm": 2.638868940044498, "language_loss": 0.79794359, "learning_rate": 1.0246105852881104e-08, "loss": 0.81929183, "num_input_tokens_seen": 173934790, "step": 8056, "time_per_iteration": 2.695922613143921 }, { "auxiliary_loss_clip": 0.01170237, "auxiliary_loss_mlp": 0.01024735, "balance_loss_clip": 1.04889286, "balance_loss_mlp": 1.01754069, "epoch": 0.9687969698791559, "flos": 21287630471040.0, "grad_norm": 2.2766720827267233, "language_loss": 0.78479427, "learning_rate": 1.0167507943692476e-08, "loss": 0.80674398, "num_input_tokens_seen": 173953875, "step": 8057, "time_per_iteration": 2.5695738792419434 }, { "auxiliary_loss_clip": 0.01151713, "auxiliary_loss_mlp": 0.01029996, "balance_loss_clip": 1.0492413, "balance_loss_mlp": 1.02252769, "epoch": 0.968917212769795, "flos": 19828328624640.0, "grad_norm": 2.271334032324452, "language_loss": 0.71980041, "learning_rate": 1.008921189051093e-08, "loss": 0.7416175, "num_input_tokens_seen": 173971220, "step": 8058, "time_per_iteration": 3.5391955375671387 }, { "auxiliary_loss_clip": 0.01171745, "auxiliary_loss_mlp": 0.01026333, "balance_loss_clip": 1.05103195, "balance_loss_mlp": 1.018888, "epoch": 0.9690374556604341, "flos": 21681749473920.0, "grad_norm": 2.931163126047132, "language_loss": 0.77738893, "learning_rate": 1.0011217705213848e-08, "loss": 0.79936963, "num_input_tokens_seen": 173989095, "step": 8059, "time_per_iteration": 2.5869340896606445 }, { "auxiliary_loss_clip": 0.01151393, "auxiliary_loss_mlp": 0.01021973, "balance_loss_clip": 1.04923439, "balance_loss_mlp": 1.01577389, "epoch": 0.9691576985510731, "flos": 32635437851520.0, "grad_norm": 1.741927528789687, "language_loss": 0.7475723, "learning_rate": 9.933525399632658e-09, "loss": 0.76930594, "num_input_tokens_seen": 174007330, "step": 8060, "time_per_iteration": 3.6574723720550537 }, { "auxiliary_loss_clip": 0.01135067, "auxiliary_loss_mlp": 0.01032226, "balance_loss_clip": 1.0460124, "balance_loss_mlp": 1.02488518, "epoch": 0.9692779414417123, "flos": 35663174040960.0, "grad_norm": 1.9574106337927693, "language_loss": 0.65310645, "learning_rate": 9.856134985553488e-09, "loss": 0.67477936, "num_input_tokens_seen": 174027055, "step": 8061, "time_per_iteration": 2.723677635192871 }, { "auxiliary_loss_clip": 0.01167293, "auxiliary_loss_mlp": 0.01031885, "balance_loss_clip": 1.04892397, "balance_loss_mlp": 1.02475619, "epoch": 0.9693981843323514, "flos": 28366952117760.0, "grad_norm": 1.5630563961703527, "language_loss": 0.73421633, "learning_rate": 9.77904647471628e-09, "loss": 0.75620806, "num_input_tokens_seen": 174050235, "step": 8062, "time_per_iteration": 3.5863983631134033 }, { "auxiliary_loss_clip": 0.01095111, "auxiliary_loss_mlp": 0.01025879, "balance_loss_clip": 1.04168415, "balance_loss_mlp": 1.01937366, "epoch": 0.9695184272229904, "flos": 23622865378560.0, "grad_norm": 1.5093069113027675, "language_loss": 0.74193764, "learning_rate": 9.702259878815454e-09, "loss": 0.76314759, "num_input_tokens_seen": 174070560, "step": 8063, "time_per_iteration": 2.713393211364746 }, { "auxiliary_loss_clip": 0.01158882, "auxiliary_loss_mlp": 0.010284, "balance_loss_clip": 1.05092657, "balance_loss_mlp": 1.02009141, "epoch": 0.9696386701136296, "flos": 23294677789440.0, "grad_norm": 2.4103723324320603, "language_loss": 0.74808526, "learning_rate": 9.625775209499254e-09, "loss": 0.76995808, "num_input_tokens_seen": 174090565, "step": 8064, "time_per_iteration": 3.5363495349884033 }, { "auxiliary_loss_clip": 0.01111492, "auxiliary_loss_mlp": 0.01030753, "balance_loss_clip": 1.04143262, "balance_loss_mlp": 1.02406502, "epoch": 0.9697589130042686, "flos": 15121876360320.0, "grad_norm": 2.1282370708605054, "language_loss": 0.74075764, "learning_rate": 9.549592478370172e-09, "loss": 0.76218009, "num_input_tokens_seen": 174108745, "step": 8065, "time_per_iteration": 2.6068427562713623 }, { "auxiliary_loss_clip": 0.01155885, "auxiliary_loss_mlp": 0.01026136, "balance_loss_clip": 1.04817379, "balance_loss_mlp": 1.01917434, "epoch": 0.9698791558949077, "flos": 18879532824960.0, "grad_norm": 2.1699024882418887, "language_loss": 0.793908, "learning_rate": 9.473711696985632e-09, "loss": 0.81572819, "num_input_tokens_seen": 174128075, "step": 8066, "time_per_iteration": 2.645494222640991 }, { "auxiliary_loss_clip": 0.01136916, "auxiliary_loss_mlp": 0.01025818, "balance_loss_clip": 1.04567564, "balance_loss_mlp": 1.01841772, "epoch": 0.9699993987855468, "flos": 17931455297280.0, "grad_norm": 2.595493046891028, "language_loss": 0.7640273, "learning_rate": 9.398132876856201e-09, "loss": 0.7856546, "num_input_tokens_seen": 174147040, "step": 8067, "time_per_iteration": 2.5831727981567383 }, { "auxiliary_loss_clip": 0.01023682, "auxiliary_loss_mlp": 0.01001344, "balance_loss_clip": 1.02087212, "balance_loss_mlp": 1.00051522, "epoch": 0.9701196416761859, "flos": 67182186297600.0, "grad_norm": 0.7760782214646008, "language_loss": 0.60775054, "learning_rate": 9.322856029447379e-09, "loss": 0.6280008, "num_input_tokens_seen": 174208225, "step": 8068, "time_per_iteration": 3.255455493927002 }, { "auxiliary_loss_clip": 0.01167226, "auxiliary_loss_mlp": 0.01027423, "balance_loss_clip": 1.04977667, "balance_loss_mlp": 1.0206399, "epoch": 0.970239884566825, "flos": 24277804012800.0, "grad_norm": 2.212913897507115, "language_loss": 0.8043645, "learning_rate": 9.247881166178695e-09, "loss": 0.82631099, "num_input_tokens_seen": 174226935, "step": 8069, "time_per_iteration": 2.6217358112335205 }, { "auxiliary_loss_clip": 0.01128947, "auxiliary_loss_mlp": 0.01024814, "balance_loss_clip": 1.04562962, "balance_loss_mlp": 1.01771474, "epoch": 0.970360127457464, "flos": 25301689194240.0, "grad_norm": 2.565148234001431, "language_loss": 0.76664317, "learning_rate": 9.173208298423274e-09, "loss": 0.78818077, "num_input_tokens_seen": 174248140, "step": 8070, "time_per_iteration": 2.702160120010376 }, { "auxiliary_loss_clip": 0.01106, "auxiliary_loss_mlp": 0.00711541, "balance_loss_clip": 1.04287517, "balance_loss_mlp": 1.00055218, "epoch": 0.9704803703481032, "flos": 29572473398400.0, "grad_norm": 1.7928083029852433, "language_loss": 0.76043785, "learning_rate": 9.09883743750961e-09, "loss": 0.77861321, "num_input_tokens_seen": 174271030, "step": 8071, "time_per_iteration": 2.807448625564575 }, { "auxiliary_loss_clip": 0.01134006, "auxiliary_loss_mlp": 0.01022936, "balance_loss_clip": 1.04629886, "balance_loss_mlp": 1.01578927, "epoch": 0.9706006132387422, "flos": 17380046638080.0, "grad_norm": 2.234625414835112, "language_loss": 0.83910453, "learning_rate": 9.024768594719124e-09, "loss": 0.86067396, "num_input_tokens_seen": 174289410, "step": 8072, "time_per_iteration": 2.573255777359009 }, { "auxiliary_loss_clip": 0.01121074, "auxiliary_loss_mlp": 0.01027286, "balance_loss_clip": 1.04542661, "balance_loss_mlp": 1.02027678, "epoch": 0.9707208561293813, "flos": 18186421011840.0, "grad_norm": 2.3113824754990198, "language_loss": 0.72748357, "learning_rate": 8.95100178128816e-09, "loss": 0.74896717, "num_input_tokens_seen": 174308550, "step": 8073, "time_per_iteration": 2.700023651123047 }, { "auxiliary_loss_clip": 0.01139546, "auxiliary_loss_mlp": 0.01029291, "balance_loss_clip": 1.04716027, "balance_loss_mlp": 1.02138174, "epoch": 0.9708410990200205, "flos": 31248388212480.0, "grad_norm": 1.778938413734703, "language_loss": 0.70277286, "learning_rate": 8.877537008407321e-09, "loss": 0.72446126, "num_input_tokens_seen": 174328600, "step": 8074, "time_per_iteration": 2.7015247344970703 }, { "auxiliary_loss_clip": 0.01139829, "auxiliary_loss_mlp": 0.01029433, "balance_loss_clip": 1.0466888, "balance_loss_mlp": 1.02221537, "epoch": 0.9709613419106595, "flos": 30554450386560.0, "grad_norm": 2.622889197863162, "language_loss": 0.68708283, "learning_rate": 8.804374287221028e-09, "loss": 0.70877546, "num_input_tokens_seen": 174349835, "step": 8075, "time_per_iteration": 2.687101125717163 }, { "auxiliary_loss_clip": 0.0111362, "auxiliary_loss_mlp": 0.01020231, "balance_loss_clip": 1.04062545, "balance_loss_mlp": 1.01337957, "epoch": 0.9710815848012986, "flos": 23730166281600.0, "grad_norm": 1.6414244978952928, "language_loss": 0.84942079, "learning_rate": 8.731513628827958e-09, "loss": 0.87075931, "num_input_tokens_seen": 174369200, "step": 8076, "time_per_iteration": 2.6955931186676025 }, { "auxiliary_loss_clip": 0.01156602, "auxiliary_loss_mlp": 0.01023479, "balance_loss_clip": 1.04999435, "balance_loss_mlp": 1.01645505, "epoch": 0.9712018276919377, "flos": 23761875012480.0, "grad_norm": 2.3692228496305923, "language_loss": 0.82556331, "learning_rate": 8.658955044280825e-09, "loss": 0.84736413, "num_input_tokens_seen": 174388125, "step": 8077, "time_per_iteration": 2.630894184112549 }, { "auxiliary_loss_clip": 0.01150645, "auxiliary_loss_mlp": 0.01025762, "balance_loss_clip": 1.04749107, "balance_loss_mlp": 1.01868105, "epoch": 0.9713220705825768, "flos": 23330983461120.0, "grad_norm": 1.5550138891334395, "language_loss": 0.77282143, "learning_rate": 8.586698544587268e-09, "loss": 0.79458559, "num_input_tokens_seen": 174409735, "step": 8078, "time_per_iteration": 2.597133159637451 }, { "auxiliary_loss_clip": 0.01128125, "auxiliary_loss_mlp": 0.01025965, "balance_loss_clip": 1.04472291, "balance_loss_mlp": 1.01899683, "epoch": 0.9714423134732159, "flos": 22200946611840.0, "grad_norm": 1.9326254925242743, "language_loss": 0.741202, "learning_rate": 8.514744140707853e-09, "loss": 0.76274288, "num_input_tokens_seen": 174428875, "step": 8079, "time_per_iteration": 2.62408185005188 }, { "auxiliary_loss_clip": 0.0116627, "auxiliary_loss_mlp": 0.01026691, "balance_loss_clip": 1.04839456, "balance_loss_mlp": 1.01987529, "epoch": 0.971562556363855, "flos": 20229917656320.0, "grad_norm": 1.6856279738914621, "language_loss": 0.76140451, "learning_rate": 8.443091843558515e-09, "loss": 0.78333408, "num_input_tokens_seen": 174447960, "step": 8080, "time_per_iteration": 2.643556594848633 }, { "auxiliary_loss_clip": 0.01130659, "auxiliary_loss_mlp": 0.01030056, "balance_loss_clip": 1.04560101, "balance_loss_mlp": 1.02252817, "epoch": 0.9716827992544941, "flos": 24970197553920.0, "grad_norm": 2.6726349776037908, "language_loss": 0.64814377, "learning_rate": 8.37174166400878e-09, "loss": 0.66975087, "num_input_tokens_seen": 174463535, "step": 8081, "time_per_iteration": 2.662536382675171 }, { "auxiliary_loss_clip": 0.01172093, "auxiliary_loss_mlp": 0.01028296, "balance_loss_clip": 1.05270576, "balance_loss_mlp": 1.02107179, "epoch": 0.9718030421451331, "flos": 24681476033280.0, "grad_norm": 2.1716733222536333, "language_loss": 0.85510761, "learning_rate": 8.300693612881992e-09, "loss": 0.87711155, "num_input_tokens_seen": 174483600, "step": 8082, "time_per_iteration": 2.6159732341766357 }, { "auxiliary_loss_clip": 0.01153107, "auxiliary_loss_mlp": 0.00711098, "balance_loss_clip": 1.04923534, "balance_loss_mlp": 1.00056875, "epoch": 0.9719232850357723, "flos": 22090700793600.0, "grad_norm": 4.932137416730669, "language_loss": 0.81664896, "learning_rate": 8.22994770095664e-09, "loss": 0.83529097, "num_input_tokens_seen": 174502175, "step": 8083, "time_per_iteration": 2.6382200717926025 }, { "auxiliary_loss_clip": 0.01138892, "auxiliary_loss_mlp": 0.01030471, "balance_loss_clip": 1.05073071, "balance_loss_mlp": 1.02256179, "epoch": 0.9720435279264114, "flos": 23656908493440.0, "grad_norm": 2.389508377453609, "language_loss": 0.75413889, "learning_rate": 8.159503938964585e-09, "loss": 0.77583253, "num_input_tokens_seen": 174519495, "step": 8084, "time_per_iteration": 3.502946376800537 }, { "auxiliary_loss_clip": 0.01112744, "auxiliary_loss_mlp": 0.01024628, "balance_loss_clip": 1.04369521, "balance_loss_mlp": 1.01779413, "epoch": 0.9721637708170504, "flos": 28365910623360.0, "grad_norm": 1.7940669370902442, "language_loss": 0.70224744, "learning_rate": 8.089362337592164e-09, "loss": 0.72362113, "num_input_tokens_seen": 174543120, "step": 8085, "time_per_iteration": 2.700655937194824 }, { "auxiliary_loss_clip": 0.01133094, "auxiliary_loss_mlp": 0.01029055, "balance_loss_clip": 1.04576135, "balance_loss_mlp": 1.02236176, "epoch": 0.9722840137076896, "flos": 29130807767040.0, "grad_norm": 1.6657877961748917, "language_loss": 0.72147369, "learning_rate": 8.019522907479536e-09, "loss": 0.74309516, "num_input_tokens_seen": 174563480, "step": 8086, "time_per_iteration": 3.636726140975952 }, { "auxiliary_loss_clip": 0.01155428, "auxiliary_loss_mlp": 0.01026061, "balance_loss_clip": 1.04811144, "balance_loss_mlp": 1.01933479, "epoch": 0.9724042565983286, "flos": 19243954258560.0, "grad_norm": 2.304617174522352, "language_loss": 0.77397096, "learning_rate": 7.949985659221558e-09, "loss": 0.79578584, "num_input_tokens_seen": 174580745, "step": 8087, "time_per_iteration": 2.5942530632019043 }, { "auxiliary_loss_clip": 0.011384, "auxiliary_loss_mlp": 0.0102114, "balance_loss_clip": 1.04534853, "balance_loss_mlp": 1.01429737, "epoch": 0.9725244994889677, "flos": 23039676161280.0, "grad_norm": 2.219947139939789, "language_loss": 0.79286253, "learning_rate": 7.880750603366904e-09, "loss": 0.81445789, "num_input_tokens_seen": 174599615, "step": 8088, "time_per_iteration": 3.5411787033081055 }, { "auxiliary_loss_clip": 0.01130176, "auxiliary_loss_mlp": 0.01029688, "balance_loss_clip": 1.04490042, "balance_loss_mlp": 1.02145112, "epoch": 0.9726447423796069, "flos": 23367468700800.0, "grad_norm": 2.4762237219979633, "language_loss": 0.79603112, "learning_rate": 7.811817750418282e-09, "loss": 0.81762975, "num_input_tokens_seen": 174618375, "step": 8089, "time_per_iteration": 2.717677354812622 }, { "auxiliary_loss_clip": 0.01118304, "auxiliary_loss_mlp": 0.01027013, "balance_loss_clip": 1.04569209, "balance_loss_mlp": 1.02000356, "epoch": 0.9727649852702459, "flos": 26541648639360.0, "grad_norm": 1.6497727656754702, "language_loss": 0.79786539, "learning_rate": 7.743187110833105e-09, "loss": 0.81931859, "num_input_tokens_seen": 174641135, "step": 8090, "time_per_iteration": 3.6128690242767334 }, { "auxiliary_loss_clip": 0.01139176, "auxiliary_loss_mlp": 0.01022833, "balance_loss_clip": 1.04444683, "balance_loss_mlp": 1.01599312, "epoch": 0.972885228160885, "flos": 20522338277760.0, "grad_norm": 2.0957474447887106, "language_loss": 0.80835783, "learning_rate": 7.674858695022602e-09, "loss": 0.82997799, "num_input_tokens_seen": 174659490, "step": 8091, "time_per_iteration": 2.6431119441986084 }, { "auxiliary_loss_clip": 0.0117104, "auxiliary_loss_mlp": 0.01029864, "balance_loss_clip": 1.05035543, "balance_loss_mlp": 1.02184153, "epoch": 0.9730054710515241, "flos": 17566064196480.0, "grad_norm": 2.8984369889080743, "language_loss": 0.75935578, "learning_rate": 7.606832513351591e-09, "loss": 0.7813648, "num_input_tokens_seen": 174677440, "step": 8092, "time_per_iteration": 2.582587242126465 }, { "auxiliary_loss_clip": 0.01069384, "auxiliary_loss_mlp": 0.00701338, "balance_loss_clip": 1.01746023, "balance_loss_mlp": 0.99993229, "epoch": 0.9731257139421632, "flos": 68972010117120.0, "grad_norm": 0.8220068750671669, "language_loss": 0.6386708, "learning_rate": 7.539108576140264e-09, "loss": 0.65637803, "num_input_tokens_seen": 174741550, "step": 8093, "time_per_iteration": 3.2116262912750244 }, { "auxiliary_loss_clip": 0.01104233, "auxiliary_loss_mlp": 0.01024792, "balance_loss_clip": 1.04352999, "balance_loss_mlp": 1.01809812, "epoch": 0.9732459568328022, "flos": 18478841633280.0, "grad_norm": 2.123331915206663, "language_loss": 0.70383966, "learning_rate": 7.471686893661732e-09, "loss": 0.7251299, "num_input_tokens_seen": 174759845, "step": 8094, "time_per_iteration": 2.691610336303711 }, { "auxiliary_loss_clip": 0.01136412, "auxiliary_loss_mlp": 0.01025764, "balance_loss_clip": 1.04817653, "balance_loss_mlp": 1.01839995, "epoch": 0.9733661997234414, "flos": 20883886623360.0, "grad_norm": 2.5541600851346904, "language_loss": 0.64225119, "learning_rate": 7.4045674761442636e-09, "loss": 0.66387296, "num_input_tokens_seen": 174777175, "step": 8095, "time_per_iteration": 2.612959384918213 }, { "auxiliary_loss_clip": 0.01168124, "auxiliary_loss_mlp": 0.00711346, "balance_loss_clip": 1.04968667, "balance_loss_mlp": 1.00053692, "epoch": 0.9734864426140805, "flos": 23766795175680.0, "grad_norm": 1.8323532119171748, "language_loss": 0.74589801, "learning_rate": 7.337750333769488e-09, "loss": 0.76469266, "num_input_tokens_seen": 174796980, "step": 8096, "time_per_iteration": 2.6257505416870117 }, { "auxiliary_loss_clip": 0.01138236, "auxiliary_loss_mlp": 0.01024897, "balance_loss_clip": 1.04171944, "balance_loss_mlp": 1.01742554, "epoch": 0.9736066855047195, "flos": 35042422176000.0, "grad_norm": 1.9203955305807854, "language_loss": 0.73261178, "learning_rate": 7.2712354766737425e-09, "loss": 0.75424308, "num_input_tokens_seen": 174817310, "step": 8097, "time_per_iteration": 2.7210235595703125 }, { "auxiliary_loss_clip": 0.01112991, "auxiliary_loss_mlp": 0.01033388, "balance_loss_clip": 1.04551709, "balance_loss_mlp": 1.02651536, "epoch": 0.9737269283953586, "flos": 20410620001920.0, "grad_norm": 1.6352733637264507, "language_loss": 0.80972672, "learning_rate": 7.2050229149469565e-09, "loss": 0.83119047, "num_input_tokens_seen": 174837320, "step": 8098, "time_per_iteration": 2.6852340698242188 }, { "auxiliary_loss_clip": 0.01123594, "auxiliary_loss_mlp": 0.0102514, "balance_loss_clip": 1.04212475, "balance_loss_mlp": 1.01799059, "epoch": 0.9738471712859977, "flos": 28911680847360.0, "grad_norm": 2.0429622748632315, "language_loss": 0.63701463, "learning_rate": 7.139112658633984e-09, "loss": 0.65850198, "num_input_tokens_seen": 174857470, "step": 8099, "time_per_iteration": 2.6937923431396484 }, { "auxiliary_loss_clip": 0.01117847, "auxiliary_loss_mlp": 0.01026753, "balance_loss_clip": 1.04580164, "balance_loss_mlp": 1.01941848, "epoch": 0.9739674141766368, "flos": 27782326356480.0, "grad_norm": 2.8203003571116882, "language_loss": 0.70501953, "learning_rate": 7.073504717733048e-09, "loss": 0.72646558, "num_input_tokens_seen": 174877035, "step": 8100, "time_per_iteration": 2.7354888916015625 }, { "auxiliary_loss_clip": 0.01020483, "auxiliary_loss_mlp": 0.0100173, "balance_loss_clip": 1.01831722, "balance_loss_mlp": 1.00078857, "epoch": 0.9740876570672758, "flos": 68863057188480.0, "grad_norm": 0.7791014087855986, "language_loss": 0.57213104, "learning_rate": 7.008199102196855e-09, "loss": 0.59235311, "num_input_tokens_seen": 174938460, "step": 8101, "time_per_iteration": 3.2242588996887207 }, { "auxiliary_loss_clip": 0.0104225, "auxiliary_loss_mlp": 0.0100194, "balance_loss_clip": 1.0178206, "balance_loss_mlp": 1.00081336, "epoch": 0.974207899957915, "flos": 58236622646400.0, "grad_norm": 0.8273509846247218, "language_loss": 0.58930004, "learning_rate": 6.9431958219321464e-09, "loss": 0.60974193, "num_input_tokens_seen": 174994625, "step": 8102, "time_per_iteration": 3.189070224761963 }, { "auxiliary_loss_clip": 0.01135515, "auxiliary_loss_mlp": 0.0102568, "balance_loss_clip": 1.04538012, "balance_loss_mlp": 1.01805365, "epoch": 0.9743281428485541, "flos": 22600057605120.0, "grad_norm": 1.6513247895897045, "language_loss": 0.77798218, "learning_rate": 6.878494886800146e-09, "loss": 0.79959416, "num_input_tokens_seen": 175015400, "step": 8103, "time_per_iteration": 2.672858715057373 }, { "auxiliary_loss_clip": 0.01138787, "auxiliary_loss_mlp": 0.01024285, "balance_loss_clip": 1.04758692, "balance_loss_mlp": 1.01701283, "epoch": 0.9744483857391931, "flos": 20008815488640.0, "grad_norm": 1.8262322101881359, "language_loss": 0.76279449, "learning_rate": 6.814096306615669e-09, "loss": 0.78442526, "num_input_tokens_seen": 175033540, "step": 8104, "time_per_iteration": 2.5989465713500977 }, { "auxiliary_loss_clip": 0.01143646, "auxiliary_loss_mlp": 0.01028027, "balance_loss_clip": 1.0457691, "balance_loss_mlp": 1.02049303, "epoch": 0.9745686286298323, "flos": 17675268520320.0, "grad_norm": 2.4542897071131646, "language_loss": 0.65819114, "learning_rate": 6.750000091148011e-09, "loss": 0.67990786, "num_input_tokens_seen": 175050835, "step": 8105, "time_per_iteration": 2.6695590019226074 }, { "auxiliary_loss_clip": 0.01170632, "auxiliary_loss_mlp": 0.01025099, "balance_loss_clip": 1.05036318, "balance_loss_mlp": 1.01758933, "epoch": 0.9746888715204713, "flos": 29460252332160.0, "grad_norm": 1.946768380276803, "language_loss": 0.72662103, "learning_rate": 6.686206250120729e-09, "loss": 0.74857831, "num_input_tokens_seen": 175072330, "step": 8106, "time_per_iteration": 2.651167392730713 }, { "auxiliary_loss_clip": 0.01126074, "auxiliary_loss_mlp": 0.01029031, "balance_loss_clip": 1.04510009, "balance_loss_mlp": 1.02174091, "epoch": 0.9748091144111104, "flos": 18479308510080.0, "grad_norm": 2.5375774739229264, "language_loss": 0.74807096, "learning_rate": 6.622714793210749e-09, "loss": 0.76962197, "num_input_tokens_seen": 175091250, "step": 8107, "time_per_iteration": 2.658304452896118 }, { "auxiliary_loss_clip": 0.01169783, "auxiliary_loss_mlp": 0.01025068, "balance_loss_clip": 1.05054617, "balance_loss_mlp": 1.01770115, "epoch": 0.9749293573017496, "flos": 20665154753280.0, "grad_norm": 2.1353664984064564, "language_loss": 0.78680992, "learning_rate": 6.559525730050364e-09, "loss": 0.80875838, "num_input_tokens_seen": 175111350, "step": 8108, "time_per_iteration": 2.5264484882354736 }, { "auxiliary_loss_clip": 0.01124244, "auxiliary_loss_mlp": 0.01027532, "balance_loss_clip": 1.04666591, "balance_loss_mlp": 1.02097499, "epoch": 0.9750496001923886, "flos": 18478590238080.0, "grad_norm": 1.8115604807611423, "language_loss": 0.7569232, "learning_rate": 6.496639070224574e-09, "loss": 0.77844095, "num_input_tokens_seen": 175129835, "step": 8109, "time_per_iteration": 2.667012929916382 }, { "auxiliary_loss_clip": 0.01156353, "auxiliary_loss_mlp": 0.01022156, "balance_loss_clip": 1.04844236, "balance_loss_mlp": 1.01516747, "epoch": 0.9751698430830277, "flos": 19572967860480.0, "grad_norm": 2.954117065364899, "language_loss": 0.84099907, "learning_rate": 6.4340548232739714e-09, "loss": 0.86278415, "num_input_tokens_seen": 175146035, "step": 8110, "time_per_iteration": 3.5270040035247803 }, { "auxiliary_loss_clip": 0.01125289, "auxiliary_loss_mlp": 0.01025669, "balance_loss_clip": 1.0444926, "balance_loss_mlp": 1.0187881, "epoch": 0.9752900859736668, "flos": 23550325862400.0, "grad_norm": 1.7744140163693929, "language_loss": 0.79657191, "learning_rate": 6.371772998692071e-09, "loss": 0.8180815, "num_input_tokens_seen": 175165290, "step": 8111, "time_per_iteration": 2.660738945007324 }, { "auxiliary_loss_clip": 0.01123095, "auxiliary_loss_mlp": 0.01025648, "balance_loss_clip": 1.04265082, "balance_loss_mlp": 1.01856709, "epoch": 0.9754103288643059, "flos": 20303211358080.0, "grad_norm": 3.7687480756959704, "language_loss": 0.6532076, "learning_rate": 6.309793605927094e-09, "loss": 0.67469501, "num_input_tokens_seen": 175183610, "step": 8112, "time_per_iteration": 3.6754744052886963 }, { "auxiliary_loss_clip": 0.0114221, "auxiliary_loss_mlp": 0.01024645, "balance_loss_clip": 1.04682875, "balance_loss_mlp": 1.01789498, "epoch": 0.975530571754945, "flos": 19350680544000.0, "grad_norm": 1.688194182255216, "language_loss": 0.80141008, "learning_rate": 6.248116654381297e-09, "loss": 0.82307863, "num_input_tokens_seen": 175202080, "step": 8113, "time_per_iteration": 2.6397461891174316 }, { "auxiliary_loss_clip": 0.0114087, "auxiliary_loss_mlp": 0.01021472, "balance_loss_clip": 1.04517746, "balance_loss_mlp": 1.01503468, "epoch": 0.9756508146455841, "flos": 23583399310080.0, "grad_norm": 1.9919667759953759, "language_loss": 0.72913671, "learning_rate": 6.186742153410751e-09, "loss": 0.75076014, "num_input_tokens_seen": 175221575, "step": 8114, "time_per_iteration": 3.588038682937622 }, { "auxiliary_loss_clip": 0.01136499, "auxiliary_loss_mlp": 0.01022946, "balance_loss_clip": 1.04702795, "balance_loss_mlp": 1.015293, "epoch": 0.9757710575362232, "flos": 22966921163520.0, "grad_norm": 3.2224173102429927, "language_loss": 0.87362409, "learning_rate": 6.125670112326453e-09, "loss": 0.89521855, "num_input_tokens_seen": 175240835, "step": 8115, "time_per_iteration": 2.6924054622650146 }, { "auxiliary_loss_clip": 0.01153685, "auxiliary_loss_mlp": 0.01025699, "balance_loss_clip": 1.0464803, "balance_loss_mlp": 1.01882362, "epoch": 0.9758913004268622, "flos": 27966009530880.0, "grad_norm": 1.8528655785650587, "language_loss": 0.70222354, "learning_rate": 6.064900540392548e-09, "loss": 0.72401738, "num_input_tokens_seen": 175262930, "step": 8116, "time_per_iteration": 3.5130536556243896 }, { "auxiliary_loss_clip": 0.01132569, "auxiliary_loss_mlp": 0.01026974, "balance_loss_clip": 1.04685092, "balance_loss_mlp": 1.02062011, "epoch": 0.9760115433175014, "flos": 22200156512640.0, "grad_norm": 2.0154923636881708, "language_loss": 0.78399241, "learning_rate": 6.0044334468278835e-09, "loss": 0.80558777, "num_input_tokens_seen": 175282275, "step": 8117, "time_per_iteration": 2.6435482501983643 }, { "auxiliary_loss_clip": 0.01105858, "auxiliary_loss_mlp": 0.01028099, "balance_loss_clip": 1.04213274, "balance_loss_mlp": 1.02048707, "epoch": 0.9761317862081405, "flos": 26250736389120.0, "grad_norm": 1.8241868197522317, "language_loss": 0.71764588, "learning_rate": 5.944268840805345e-09, "loss": 0.73898542, "num_input_tokens_seen": 175303020, "step": 8118, "time_per_iteration": 2.712019920349121 }, { "auxiliary_loss_clip": 0.01116303, "auxiliary_loss_mlp": 0.01025234, "balance_loss_clip": 1.04401791, "balance_loss_mlp": 1.01840901, "epoch": 0.9762520290987795, "flos": 26575440359040.0, "grad_norm": 2.357240626695142, "language_loss": 0.64567983, "learning_rate": 5.88440673145163e-09, "loss": 0.66709518, "num_input_tokens_seen": 175324070, "step": 8119, "time_per_iteration": 2.7258191108703613 }, { "auxiliary_loss_clip": 0.01151123, "auxiliary_loss_mlp": 0.01024858, "balance_loss_clip": 1.04920149, "balance_loss_mlp": 1.01763391, "epoch": 0.9763722719894187, "flos": 18005036307840.0, "grad_norm": 4.907417230836538, "language_loss": 0.82545668, "learning_rate": 5.824847127848142e-09, "loss": 0.84721649, "num_input_tokens_seen": 175342595, "step": 8120, "time_per_iteration": 2.5652661323547363 }, { "auxiliary_loss_clip": 0.01110682, "auxiliary_loss_mlp": 0.01024734, "balance_loss_clip": 1.04587579, "balance_loss_mlp": 1.01768279, "epoch": 0.9764925148800577, "flos": 22455660931200.0, "grad_norm": 1.7865871569122047, "language_loss": 0.79103827, "learning_rate": 5.765590039029433e-09, "loss": 0.81239247, "num_input_tokens_seen": 175361915, "step": 8121, "time_per_iteration": 2.677168846130371 }, { "auxiliary_loss_clip": 0.01167881, "auxiliary_loss_mlp": 0.01028021, "balance_loss_clip": 1.0500896, "balance_loss_mlp": 1.02037954, "epoch": 0.9766127577706968, "flos": 36757084786560.0, "grad_norm": 1.6531123958541931, "language_loss": 0.71329069, "learning_rate": 5.706635473985422e-09, "loss": 0.73524976, "num_input_tokens_seen": 175385785, "step": 8122, "time_per_iteration": 2.766228199005127 }, { "auxiliary_loss_clip": 0.01151173, "auxiliary_loss_mlp": 0.01025897, "balance_loss_clip": 1.04681921, "balance_loss_mlp": 1.01872659, "epoch": 0.976733000661336, "flos": 22309971367680.0, "grad_norm": 1.9689090960884184, "language_loss": 0.85364938, "learning_rate": 5.6479834416591764e-09, "loss": 0.87542009, "num_input_tokens_seen": 175405145, "step": 8123, "time_per_iteration": 2.581695556640625 }, { "auxiliary_loss_clip": 0.01152183, "auxiliary_loss_mlp": 0.00712162, "balance_loss_clip": 1.04935539, "balance_loss_mlp": 1.0005908, "epoch": 0.976853243551975, "flos": 25810938264960.0, "grad_norm": 1.8262492085158168, "language_loss": 0.68640447, "learning_rate": 5.589633950947803e-09, "loss": 0.70504791, "num_input_tokens_seen": 175422645, "step": 8124, "time_per_iteration": 2.689817428588867 }, { "auxiliary_loss_clip": 0.01133795, "auxiliary_loss_mlp": 0.01028905, "balance_loss_clip": 1.0461365, "balance_loss_mlp": 1.02133489, "epoch": 0.9769734864426141, "flos": 21397445326080.0, "grad_norm": 2.542401589344161, "language_loss": 0.69510365, "learning_rate": 5.5315870107035535e-09, "loss": 0.71673059, "num_input_tokens_seen": 175440695, "step": 8125, "time_per_iteration": 2.6075897216796875 }, { "auxiliary_loss_clip": 0.01136046, "auxiliary_loss_mlp": 0.01024681, "balance_loss_clip": 1.04905772, "balance_loss_mlp": 1.01785612, "epoch": 0.9770937293332532, "flos": 13990977584640.0, "grad_norm": 1.8053207165118195, "language_loss": 0.78887522, "learning_rate": 5.473842629731607e-09, "loss": 0.8104825, "num_input_tokens_seen": 175459195, "step": 8126, "time_per_iteration": 2.659870147705078 }, { "auxiliary_loss_clip": 0.0114603, "auxiliary_loss_mlp": 0.00711382, "balance_loss_clip": 1.0469687, "balance_loss_mlp": 1.00072908, "epoch": 0.9772139722238923, "flos": 17931994001280.0, "grad_norm": 2.5008995249758703, "language_loss": 0.77992839, "learning_rate": 5.416400816792066e-09, "loss": 0.7985025, "num_input_tokens_seen": 175476710, "step": 8127, "time_per_iteration": 2.63335919380188 }, { "auxiliary_loss_clip": 0.01166968, "auxiliary_loss_mlp": 0.01022747, "balance_loss_clip": 1.04832482, "balance_loss_mlp": 1.01546001, "epoch": 0.9773342151145313, "flos": 20446171488000.0, "grad_norm": 3.099100659288346, "language_loss": 0.78588748, "learning_rate": 5.359261580598407e-09, "loss": 0.80778468, "num_input_tokens_seen": 175492550, "step": 8128, "time_per_iteration": 2.5979435443878174 }, { "auxiliary_loss_clip": 0.0115597, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.05001628, "balance_loss_mlp": 1.02524996, "epoch": 0.9774544580051704, "flos": 11837306949120.0, "grad_norm": 2.6778323305704776, "language_loss": 0.77789509, "learning_rate": 5.302424929819027e-09, "loss": 0.79978871, "num_input_tokens_seen": 175506560, "step": 8129, "time_per_iteration": 2.565556287765503 }, { "auxiliary_loss_clip": 0.01152862, "auxiliary_loss_mlp": 0.01022643, "balance_loss_clip": 1.04399526, "balance_loss_mlp": 1.01598823, "epoch": 0.9775747008958096, "flos": 13479932833920.0, "grad_norm": 2.365703008857373, "language_loss": 0.72709584, "learning_rate": 5.24589087307592e-09, "loss": 0.74885094, "num_input_tokens_seen": 175524180, "step": 8130, "time_per_iteration": 2.639277696609497 }, { "auxiliary_loss_clip": 0.01171011, "auxiliary_loss_mlp": 0.01029325, "balance_loss_clip": 1.04979479, "balance_loss_mlp": 1.02206492, "epoch": 0.9776949437864486, "flos": 59532314042880.0, "grad_norm": 1.547110696598397, "language_loss": 0.64947277, "learning_rate": 5.189659418944891e-09, "loss": 0.67147613, "num_input_tokens_seen": 175554355, "step": 8131, "time_per_iteration": 2.9368033409118652 }, { "auxiliary_loss_clip": 0.01168972, "auxiliary_loss_mlp": 0.01020045, "balance_loss_clip": 1.05024219, "balance_loss_mlp": 1.01318479, "epoch": 0.9778151866770877, "flos": 21178605715200.0, "grad_norm": 1.843476823742849, "language_loss": 0.78422481, "learning_rate": 5.133730575956674e-09, "loss": 0.80611503, "num_input_tokens_seen": 175574025, "step": 8132, "time_per_iteration": 2.5793087482452393 }, { "auxiliary_loss_clip": 0.01137849, "auxiliary_loss_mlp": 0.01027186, "balance_loss_clip": 1.0454092, "balance_loss_mlp": 1.01994681, "epoch": 0.9779354295677268, "flos": 20886795624960.0, "grad_norm": 7.285704752986488, "language_loss": 0.72196943, "learning_rate": 5.0781043525953696e-09, "loss": 0.7436198, "num_input_tokens_seen": 175592090, "step": 8133, "time_per_iteration": 2.6438465118408203 }, { "auxiliary_loss_clip": 0.01134334, "auxiliary_loss_mlp": 0.01023557, "balance_loss_clip": 1.04921174, "balance_loss_mlp": 1.01680374, "epoch": 0.9780556724583659, "flos": 23440618748160.0, "grad_norm": 1.8244947401928078, "language_loss": 0.74335027, "learning_rate": 5.0227807572995605e-09, "loss": 0.76492918, "num_input_tokens_seen": 175614065, "step": 8134, "time_per_iteration": 2.7225534915924072 }, { "auxiliary_loss_clip": 0.01139347, "auxiliary_loss_mlp": 0.01025572, "balance_loss_clip": 1.04544997, "balance_loss_mlp": 1.01854753, "epoch": 0.9781759153490049, "flos": 20923244951040.0, "grad_norm": 2.4602410488058433, "language_loss": 0.67444301, "learning_rate": 4.967759798461646e-09, "loss": 0.69609213, "num_input_tokens_seen": 175632410, "step": 8135, "time_per_iteration": 2.6420984268188477 }, { "auxiliary_loss_clip": 0.01166865, "auxiliary_loss_mlp": 0.01024635, "balance_loss_clip": 1.04948664, "balance_loss_mlp": 1.01809001, "epoch": 0.9782961582396441, "flos": 28293191539200.0, "grad_norm": 2.050227096306145, "language_loss": 0.74858409, "learning_rate": 4.913041484428282e-09, "loss": 0.77049911, "num_input_tokens_seen": 175652885, "step": 8136, "time_per_iteration": 3.51592755317688 }, { "auxiliary_loss_clip": 0.01153592, "auxiliary_loss_mlp": 0.010244, "balance_loss_clip": 1.04742694, "balance_loss_mlp": 1.01791799, "epoch": 0.9784164011302832, "flos": 25552955808000.0, "grad_norm": 3.504530278537109, "language_loss": 0.74329937, "learning_rate": 4.858625823500384e-09, "loss": 0.76507926, "num_input_tokens_seen": 175670585, "step": 8137, "time_per_iteration": 2.6388838291168213 }, { "auxiliary_loss_clip": 0.01155549, "auxiliary_loss_mlp": 0.01028194, "balance_loss_clip": 1.04676473, "balance_loss_mlp": 1.02071357, "epoch": 0.9785366440209222, "flos": 29965945956480.0, "grad_norm": 1.9639934714052973, "language_loss": 0.73673928, "learning_rate": 4.80451282393246e-09, "loss": 0.75857675, "num_input_tokens_seen": 175690570, "step": 8138, "time_per_iteration": 3.5386428833007812 }, { "auxiliary_loss_clip": 0.01139361, "auxiliary_loss_mlp": 0.01028622, "balance_loss_clip": 1.04659986, "balance_loss_mlp": 1.02143633, "epoch": 0.9786568869115614, "flos": 32343591847680.0, "grad_norm": 4.74094252229687, "language_loss": 0.67566288, "learning_rate": 4.750702493933722e-09, "loss": 0.69734269, "num_input_tokens_seen": 175710455, "step": 8139, "time_per_iteration": 2.767878532409668 }, { "auxiliary_loss_clip": 0.01137156, "auxiliary_loss_mlp": 0.00711316, "balance_loss_clip": 1.047683, "balance_loss_mlp": 1.00054634, "epoch": 0.9787771298022004, "flos": 23331414424320.0, "grad_norm": 2.2963344339435294, "language_loss": 0.8491351, "learning_rate": 4.697194841666974e-09, "loss": 0.86761987, "num_input_tokens_seen": 175729380, "step": 8140, "time_per_iteration": 3.5641958713531494 }, { "auxiliary_loss_clip": 0.01154909, "auxiliary_loss_mlp": 0.01028811, "balance_loss_clip": 1.04727435, "balance_loss_mlp": 1.02101433, "epoch": 0.9788973726928395, "flos": 21468548298240.0, "grad_norm": 2.018724707445815, "language_loss": 0.81940401, "learning_rate": 4.6439898752492764e-09, "loss": 0.84124124, "num_input_tokens_seen": 175749520, "step": 8141, "time_per_iteration": 3.5615081787109375 }, { "auxiliary_loss_clip": 0.01060791, "auxiliary_loss_mlp": 0.00701484, "balance_loss_clip": 1.01896226, "balance_loss_mlp": 1.00004709, "epoch": 0.9790176155834787, "flos": 68897459439360.0, "grad_norm": 0.7542015447635857, "language_loss": 0.63608295, "learning_rate": 4.591087602751731e-09, "loss": 0.65370566, "num_input_tokens_seen": 175811380, "step": 8142, "time_per_iteration": 3.269225835800171 }, { "auxiliary_loss_clip": 0.0115207, "auxiliary_loss_mlp": 0.01027567, "balance_loss_clip": 1.04869938, "balance_loss_mlp": 1.02064133, "epoch": 0.9791378584741177, "flos": 21430877909760.0, "grad_norm": 2.0824111025467436, "language_loss": 0.72172904, "learning_rate": 4.538488032199916e-09, "loss": 0.74352539, "num_input_tokens_seen": 175829480, "step": 8143, "time_per_iteration": 2.5891332626342773 }, { "auxiliary_loss_clip": 0.01155127, "auxiliary_loss_mlp": 0.01023329, "balance_loss_clip": 1.04597378, "balance_loss_mlp": 1.01617932, "epoch": 0.9792581013647568, "flos": 20153032594560.0, "grad_norm": 4.24487041635658, "language_loss": 0.68884206, "learning_rate": 4.486191171572784e-09, "loss": 0.7106266, "num_input_tokens_seen": 175846750, "step": 8144, "time_per_iteration": 2.571112871170044 }, { "auxiliary_loss_clip": 0.01158729, "auxiliary_loss_mlp": 0.01027798, "balance_loss_clip": 1.05132556, "balance_loss_mlp": 1.02056825, "epoch": 0.9793783442553959, "flos": 23728191033600.0, "grad_norm": 1.5147539960174725, "language_loss": 0.77525997, "learning_rate": 4.434197028803766e-09, "loss": 0.79712522, "num_input_tokens_seen": 175865975, "step": 8145, "time_per_iteration": 2.7045226097106934 }, { "auxiliary_loss_clip": 0.01127431, "auxiliary_loss_mlp": 0.01031079, "balance_loss_clip": 1.04461288, "balance_loss_mlp": 1.02375627, "epoch": 0.979498587146035, "flos": 23038742407680.0, "grad_norm": 2.0486891239325433, "language_loss": 0.81994748, "learning_rate": 4.3825056117805514e-09, "loss": 0.84153259, "num_input_tokens_seen": 175881860, "step": 8146, "time_per_iteration": 2.7035412788391113 }, { "auxiliary_loss_clip": 0.01167536, "auxiliary_loss_mlp": 0.01024591, "balance_loss_clip": 1.04715776, "balance_loss_mlp": 1.01703358, "epoch": 0.979618830036674, "flos": 14318841951360.0, "grad_norm": 2.289250778834452, "language_loss": 0.79911315, "learning_rate": 4.331116928344425e-09, "loss": 0.82103437, "num_input_tokens_seen": 175898175, "step": 8147, "time_per_iteration": 2.5767064094543457 }, { "auxiliary_loss_clip": 0.01142122, "auxiliary_loss_mlp": 0.00711626, "balance_loss_clip": 1.0454421, "balance_loss_mlp": 1.00053883, "epoch": 0.9797390729273132, "flos": 16727514215040.0, "grad_norm": 2.037304207675365, "language_loss": 0.63014221, "learning_rate": 4.28003098629115e-09, "loss": 0.64867973, "num_input_tokens_seen": 175914310, "step": 8148, "time_per_iteration": 2.624574661254883 }, { "auxiliary_loss_clip": 0.01117679, "auxiliary_loss_mlp": 0.01018922, "balance_loss_clip": 1.04068089, "balance_loss_mlp": 1.01185322, "epoch": 0.9798593158179523, "flos": 24532661986560.0, "grad_norm": 1.8218037693122737, "language_loss": 0.78656888, "learning_rate": 4.229247793370305e-09, "loss": 0.80793488, "num_input_tokens_seen": 175933435, "step": 8149, "time_per_iteration": 2.6984758377075195 }, { "auxiliary_loss_clip": 0.01170995, "auxiliary_loss_mlp": 0.01030785, "balance_loss_clip": 1.05105662, "balance_loss_mlp": 1.02352774, "epoch": 0.9799795587085913, "flos": 27308808339840.0, "grad_norm": 1.667788375924871, "language_loss": 0.70843661, "learning_rate": 4.178767357285951e-09, "loss": 0.73045439, "num_input_tokens_seen": 175955065, "step": 8150, "time_per_iteration": 2.5995349884033203 }, { "auxiliary_loss_clip": 0.0115468, "auxiliary_loss_mlp": 0.00711477, "balance_loss_clip": 1.04938853, "balance_loss_mlp": 1.00057411, "epoch": 0.9800998015992305, "flos": 26286575184000.0, "grad_norm": 2.110991275923695, "language_loss": 0.71451831, "learning_rate": 4.128589685695516e-09, "loss": 0.73317987, "num_input_tokens_seen": 175975490, "step": 8151, "time_per_iteration": 2.6416513919830322 }, { "auxiliary_loss_clip": 0.0117067, "auxiliary_loss_mlp": 0.01027456, "balance_loss_clip": 1.05127501, "balance_loss_mlp": 1.02020812, "epoch": 0.9802200444898695, "flos": 16723635546240.0, "grad_norm": 3.829861438423274, "language_loss": 0.8458389, "learning_rate": 4.078714786211135e-09, "loss": 0.86782014, "num_input_tokens_seen": 175991340, "step": 8152, "time_per_iteration": 2.5769996643066406 }, { "auxiliary_loss_clip": 0.01152471, "auxiliary_loss_mlp": 0.01027561, "balance_loss_clip": 1.04993594, "balance_loss_mlp": 1.0205512, "epoch": 0.9803402873805086, "flos": 24900459298560.0, "grad_norm": 2.3409430571129763, "language_loss": 0.7674824, "learning_rate": 4.029142666398977e-09, "loss": 0.78928274, "num_input_tokens_seen": 176011505, "step": 8153, "time_per_iteration": 2.693159580230713 }, { "auxiliary_loss_clip": 0.01166463, "auxiliary_loss_mlp": 0.01028472, "balance_loss_clip": 1.04966569, "balance_loss_mlp": 1.02125669, "epoch": 0.9804605302711478, "flos": 22564937082240.0, "grad_norm": 2.6277454391933652, "language_loss": 0.80330753, "learning_rate": 3.979873333778805e-09, "loss": 0.82525688, "num_input_tokens_seen": 176029680, "step": 8154, "time_per_iteration": 2.5998518466949463 }, { "auxiliary_loss_clip": 0.01143914, "auxiliary_loss_mlp": 0.01022577, "balance_loss_clip": 1.04927731, "balance_loss_mlp": 1.01558828, "epoch": 0.9805807731617868, "flos": 38905368382080.0, "grad_norm": 1.972863254274908, "language_loss": 0.73873097, "learning_rate": 3.930906795824862e-09, "loss": 0.76039588, "num_input_tokens_seen": 176050355, "step": 8155, "time_per_iteration": 2.780941963195801 }, { "auxiliary_loss_clip": 0.01149551, "auxiliary_loss_mlp": 0.01032557, "balance_loss_clip": 1.04712987, "balance_loss_mlp": 1.0255003, "epoch": 0.9807010160524259, "flos": 17821999578240.0, "grad_norm": 2.1104856727930748, "language_loss": 0.77027887, "learning_rate": 3.882243059965207e-09, "loss": 0.79209995, "num_input_tokens_seen": 176068070, "step": 8156, "time_per_iteration": 2.5737318992614746 }, { "auxiliary_loss_clip": 0.01144288, "auxiliary_loss_mlp": 0.01021483, "balance_loss_clip": 1.04389036, "balance_loss_mlp": 1.0139904, "epoch": 0.980821258943065, "flos": 13552975140480.0, "grad_norm": 3.4054244260473947, "language_loss": 0.65841919, "learning_rate": 3.833882133582156e-09, "loss": 0.68007684, "num_input_tokens_seen": 176083730, "step": 8157, "time_per_iteration": 2.663734197616577 }, { "auxiliary_loss_clip": 0.01156285, "auxiliary_loss_mlp": 0.01029678, "balance_loss_clip": 1.04808903, "balance_loss_mlp": 1.0223918, "epoch": 0.9809415018337041, "flos": 21689794120320.0, "grad_norm": 2.0956420666473394, "language_loss": 0.7829771, "learning_rate": 3.785824024012285e-09, "loss": 0.80483675, "num_input_tokens_seen": 176102730, "step": 8158, "time_per_iteration": 2.6596615314483643 }, { "auxiliary_loss_clip": 0.01134256, "auxiliary_loss_mlp": 0.01025393, "balance_loss_clip": 1.0498004, "balance_loss_mlp": 1.01856816, "epoch": 0.9810617447243432, "flos": 23294857357440.0, "grad_norm": 1.593843242609602, "language_loss": 0.78690088, "learning_rate": 3.738068738545541e-09, "loss": 0.80849731, "num_input_tokens_seen": 176121815, "step": 8159, "time_per_iteration": 2.6177444458007812 }, { "auxiliary_loss_clip": 0.0115696, "auxiliary_loss_mlp": 0.01029922, "balance_loss_clip": 1.04886031, "balance_loss_mlp": 1.0226028, "epoch": 0.9811819876149822, "flos": 18332038748160.0, "grad_norm": 2.6892419356793256, "language_loss": 0.79344487, "learning_rate": 3.6906162844265733e-09, "loss": 0.8153137, "num_input_tokens_seen": 176138900, "step": 8160, "time_per_iteration": 2.6428701877593994 }, { "auxiliary_loss_clip": 0.01132095, "auxiliary_loss_mlp": 0.01026723, "balance_loss_clip": 1.04551601, "balance_loss_mlp": 1.01954103, "epoch": 0.9813022305056214, "flos": 22601961025920.0, "grad_norm": 2.028382305869123, "language_loss": 0.70798159, "learning_rate": 3.643466668853845e-09, "loss": 0.72956967, "num_input_tokens_seen": 176156925, "step": 8161, "time_per_iteration": 2.633017063140869 }, { "auxiliary_loss_clip": 0.01138619, "auxiliary_loss_mlp": 0.01030843, "balance_loss_clip": 1.04635739, "balance_loss_mlp": 1.02393508, "epoch": 0.9814224733962604, "flos": 25413335642880.0, "grad_norm": 1.9748793646191698, "language_loss": 0.7539115, "learning_rate": 3.59661989898008e-09, "loss": 0.7756061, "num_input_tokens_seen": 176177980, "step": 8162, "time_per_iteration": 3.630000352859497 }, { "auxiliary_loss_clip": 0.01117881, "auxiliary_loss_mlp": 0.0102494, "balance_loss_clip": 1.04721069, "balance_loss_mlp": 1.01812732, "epoch": 0.9815427162868995, "flos": 25007185584000.0, "grad_norm": 1.8066452966906545, "language_loss": 0.76893508, "learning_rate": 3.5500759819115934e-09, "loss": 0.79036331, "num_input_tokens_seen": 176198345, "step": 8163, "time_per_iteration": 2.665731191635132 }, { "auxiliary_loss_clip": 0.01171595, "auxiliary_loss_mlp": 0.01026196, "balance_loss_clip": 1.05225587, "balance_loss_mlp": 1.01910007, "epoch": 0.9816629591775387, "flos": 20662604887680.0, "grad_norm": 2.7139601216807447, "language_loss": 0.8092621, "learning_rate": 3.5038349247094034e-09, "loss": 0.83124, "num_input_tokens_seen": 176215605, "step": 8164, "time_per_iteration": 3.553536891937256 }, { "auxiliary_loss_clip": 0.01136212, "auxiliary_loss_mlp": 0.01026017, "balance_loss_clip": 1.04404688, "balance_loss_mlp": 1.01870954, "epoch": 0.9817832020681777, "flos": 17712220636800.0, "grad_norm": 2.4509757843807165, "language_loss": 0.77518308, "learning_rate": 3.4578967343878994e-09, "loss": 0.79680544, "num_input_tokens_seen": 176231810, "step": 8165, "time_per_iteration": 2.659166097640991 }, { "auxiliary_loss_clip": 0.01137268, "auxiliary_loss_mlp": 0.01024978, "balance_loss_clip": 1.04788125, "balance_loss_mlp": 1.0182637, "epoch": 0.9819034449588168, "flos": 22530032040960.0, "grad_norm": 2.395411275621145, "language_loss": 0.81182241, "learning_rate": 3.4122614179161733e-09, "loss": 0.83344489, "num_input_tokens_seen": 176251770, "step": 8166, "time_per_iteration": 3.504674196243286 }, { "auxiliary_loss_clip": 0.01110171, "auxiliary_loss_mlp": 0.0102777, "balance_loss_clip": 1.04208779, "balance_loss_mlp": 1.02102232, "epoch": 0.9820236878494559, "flos": 20011221699840.0, "grad_norm": 1.8708081969252062, "language_loss": 0.78240669, "learning_rate": 3.36692898221691e-09, "loss": 0.80378616, "num_input_tokens_seen": 176270135, "step": 8167, "time_per_iteration": 2.695248603820801 }, { "auxiliary_loss_clip": 0.01153047, "auxiliary_loss_mlp": 0.0102423, "balance_loss_clip": 1.04745984, "balance_loss_mlp": 1.01753616, "epoch": 0.982143930740095, "flos": 18807316531200.0, "grad_norm": 1.755560736594264, "language_loss": 0.73485518, "learning_rate": 3.3218994341668305e-09, "loss": 0.75662792, "num_input_tokens_seen": 176289065, "step": 8168, "time_per_iteration": 3.554163694381714 }, { "auxiliary_loss_clip": 0.01169129, "auxiliary_loss_mlp": 0.01026063, "balance_loss_clip": 1.05252647, "balance_loss_mlp": 1.01894653, "epoch": 0.982264173630734, "flos": 26578026138240.0, "grad_norm": 1.6724322679675503, "language_loss": 0.75523555, "learning_rate": 3.2771727805971373e-09, "loss": 0.77718747, "num_input_tokens_seen": 176310450, "step": 8169, "time_per_iteration": 2.60530161857605 }, { "auxiliary_loss_clip": 0.01100931, "auxiliary_loss_mlp": 0.01020073, "balance_loss_clip": 1.04208803, "balance_loss_mlp": 1.01267052, "epoch": 0.9823844165213732, "flos": 22014462176640.0, "grad_norm": 4.0754484415390655, "language_loss": 0.77176607, "learning_rate": 3.232749028292847e-09, "loss": 0.79297608, "num_input_tokens_seen": 176327415, "step": 8170, "time_per_iteration": 2.7132620811462402 }, { "auxiliary_loss_clip": 0.0116794, "auxiliary_loss_mlp": 0.01024644, "balance_loss_clip": 1.04683208, "balance_loss_mlp": 1.01712239, "epoch": 0.9825046594120123, "flos": 21908166854400.0, "grad_norm": 1.8634188448590525, "language_loss": 0.88579953, "learning_rate": 3.188628183992792e-09, "loss": 0.90772539, "num_input_tokens_seen": 176347680, "step": 8171, "time_per_iteration": 2.5244784355163574 }, { "auxiliary_loss_clip": 0.01059682, "auxiliary_loss_mlp": 0.01004648, "balance_loss_clip": 1.01758695, "balance_loss_mlp": 1.00363445, "epoch": 0.9826249023026513, "flos": 59494610718720.0, "grad_norm": 0.7390977147338293, "language_loss": 0.62508398, "learning_rate": 3.1448102543902844e-09, "loss": 0.64572728, "num_input_tokens_seen": 176411595, "step": 8172, "time_per_iteration": 3.185298442840576 }, { "auxiliary_loss_clip": 0.01131603, "auxiliary_loss_mlp": 0.01030541, "balance_loss_clip": 1.04645908, "balance_loss_mlp": 1.02348423, "epoch": 0.9827451451932905, "flos": 16071031296000.0, "grad_norm": 2.174762893138993, "language_loss": 0.67762411, "learning_rate": 3.1012952461324515e-09, "loss": 0.69924557, "num_input_tokens_seen": 176430570, "step": 8173, "time_per_iteration": 2.581805944442749 }, { "auxiliary_loss_clip": 0.01151633, "auxiliary_loss_mlp": 0.01024139, "balance_loss_clip": 1.04953814, "balance_loss_mlp": 1.01746309, "epoch": 0.9828653880839295, "flos": 20262775622400.0, "grad_norm": 2.841773028708096, "language_loss": 0.73748082, "learning_rate": 3.0580831658204575e-09, "loss": 0.7592386, "num_input_tokens_seen": 176448150, "step": 8174, "time_per_iteration": 2.5928432941436768 }, { "auxiliary_loss_clip": 0.01151253, "auxiliary_loss_mlp": 0.01027577, "balance_loss_clip": 1.04983044, "balance_loss_mlp": 1.02084434, "epoch": 0.9829856309745686, "flos": 21616141282560.0, "grad_norm": 2.3695832850210095, "language_loss": 0.77934176, "learning_rate": 3.015174020009281e-09, "loss": 0.80113006, "num_input_tokens_seen": 176467475, "step": 8175, "time_per_iteration": 2.595202922821045 }, { "auxiliary_loss_clip": 0.01122551, "auxiliary_loss_mlp": 0.01021777, "balance_loss_clip": 1.04274392, "balance_loss_mlp": 1.01514602, "epoch": 0.9831058738652078, "flos": 23764209396480.0, "grad_norm": 2.0714961851409592, "language_loss": 0.75073218, "learning_rate": 2.9725678152086043e-09, "loss": 0.77217549, "num_input_tokens_seen": 176486045, "step": 8176, "time_per_iteration": 2.678199052810669 }, { "auxiliary_loss_clip": 0.01123486, "auxiliary_loss_mlp": 0.0102553, "balance_loss_clip": 1.04224944, "balance_loss_mlp": 1.01827025, "epoch": 0.9832261167558468, "flos": 11320911072000.0, "grad_norm": 2.820080183192825, "language_loss": 0.82070476, "learning_rate": 2.930264557881257e-09, "loss": 0.84219491, "num_input_tokens_seen": 176501230, "step": 8177, "time_per_iteration": 2.570976495742798 }, { "auxiliary_loss_clip": 0.01070214, "auxiliary_loss_mlp": 0.01006452, "balance_loss_clip": 1.01784182, "balance_loss_mlp": 1.00543237, "epoch": 0.9833463596464859, "flos": 60000304343040.0, "grad_norm": 0.830998970164543, "language_loss": 0.58179331, "learning_rate": 2.8882642544452163e-09, "loss": 0.60256004, "num_input_tokens_seen": 176565955, "step": 8178, "time_per_iteration": 3.1832077503204346 }, { "auxiliary_loss_clip": 0.01130375, "auxiliary_loss_mlp": 0.01029387, "balance_loss_clip": 1.04682398, "balance_loss_mlp": 1.02244949, "epoch": 0.983466602537125, "flos": 13626699805440.0, "grad_norm": 25.556038266307006, "language_loss": 0.74776816, "learning_rate": 2.8465669112716083e-09, "loss": 0.76936579, "num_input_tokens_seen": 176583480, "step": 8179, "time_per_iteration": 2.614816665649414 }, { "auxiliary_loss_clip": 0.01155864, "auxiliary_loss_mlp": 0.00711451, "balance_loss_clip": 1.04862976, "balance_loss_mlp": 1.00049686, "epoch": 0.9835868454277641, "flos": 22926844563840.0, "grad_norm": 2.390178263002497, "language_loss": 0.76722938, "learning_rate": 2.8051725346858177e-09, "loss": 0.7859025, "num_input_tokens_seen": 176603740, "step": 8180, "time_per_iteration": 2.6907436847686768 }, { "auxiliary_loss_clip": 0.0116962, "auxiliary_loss_mlp": 0.01024488, "balance_loss_clip": 1.04793024, "balance_loss_mlp": 1.01762748, "epoch": 0.9837070883184031, "flos": 27673409341440.0, "grad_norm": 2.387165089961472, "language_loss": 0.70753479, "learning_rate": 2.7640811309674883e-09, "loss": 0.72947586, "num_input_tokens_seen": 176623240, "step": 8181, "time_per_iteration": 2.6758692264556885 }, { "auxiliary_loss_clip": 0.01113836, "auxiliary_loss_mlp": 0.01023075, "balance_loss_clip": 1.04529381, "balance_loss_mlp": 1.01639342, "epoch": 0.9838273312090423, "flos": 29241951425280.0, "grad_norm": 1.743651139674595, "language_loss": 0.80780536, "learning_rate": 2.7232927063498557e-09, "loss": 0.82917446, "num_input_tokens_seen": 176643615, "step": 8182, "time_per_iteration": 2.7142510414123535 }, { "auxiliary_loss_clip": 0.01154276, "auxiliary_loss_mlp": 0.01026155, "balance_loss_clip": 1.04845786, "balance_loss_mlp": 1.01915145, "epoch": 0.9839475740996814, "flos": 40110207304320.0, "grad_norm": 2.0648928906446478, "language_loss": 0.69077486, "learning_rate": 2.682807267020859e-09, "loss": 0.71257913, "num_input_tokens_seen": 176666375, "step": 8183, "time_per_iteration": 2.752472400665283 }, { "auxiliary_loss_clip": 0.0115226, "auxiliary_loss_mlp": 0.01032878, "balance_loss_clip": 1.04814708, "balance_loss_mlp": 1.02582073, "epoch": 0.9840678169903204, "flos": 24169389788160.0, "grad_norm": 1.9773400750799706, "language_loss": 0.62465191, "learning_rate": 2.642624819121808e-09, "loss": 0.64650333, "num_input_tokens_seen": 176686525, "step": 8184, "time_per_iteration": 2.6300315856933594 }, { "auxiliary_loss_clip": 0.01136942, "auxiliary_loss_mlp": 0.01025094, "balance_loss_clip": 1.0477047, "balance_loss_mlp": 1.01849318, "epoch": 0.9841880598809596, "flos": 14684484447360.0, "grad_norm": 2.112107700416901, "language_loss": 0.6207754, "learning_rate": 2.6027453687487154e-09, "loss": 0.64239579, "num_input_tokens_seen": 176703615, "step": 8185, "time_per_iteration": 2.6247310638427734 }, { "auxiliary_loss_clip": 0.01136935, "auxiliary_loss_mlp": 0.01025234, "balance_loss_clip": 1.04772794, "balance_loss_mlp": 1.01769984, "epoch": 0.9843083027715986, "flos": 22344768668160.0, "grad_norm": 2.4348505633648196, "language_loss": 0.54243195, "learning_rate": 2.5631689219509643e-09, "loss": 0.56405365, "num_input_tokens_seen": 176722295, "step": 8186, "time_per_iteration": 2.599222183227539 }, { "auxiliary_loss_clip": 0.01136323, "auxiliary_loss_mlp": 0.01027381, "balance_loss_clip": 1.04745626, "balance_loss_mlp": 1.02066374, "epoch": 0.9844285456622377, "flos": 21800111765760.0, "grad_norm": 1.7442719677477159, "language_loss": 0.83292335, "learning_rate": 2.523895484732197e-09, "loss": 0.85456043, "num_input_tokens_seen": 176741750, "step": 8187, "time_per_iteration": 2.7020516395568848 }, { "auxiliary_loss_clip": 0.01159088, "auxiliary_loss_mlp": 0.01030692, "balance_loss_clip": 1.04829514, "balance_loss_mlp": 1.02299094, "epoch": 0.9845487885528769, "flos": 18035380321920.0, "grad_norm": 2.2619710178737042, "language_loss": 0.75185287, "learning_rate": 2.4849250630505357e-09, "loss": 0.77375066, "num_input_tokens_seen": 176759995, "step": 8188, "time_per_iteration": 3.6007578372955322 }, { "auxiliary_loss_clip": 0.01061416, "auxiliary_loss_mlp": 0.01030436, "balance_loss_clip": 1.0380373, "balance_loss_mlp": 1.02344775, "epoch": 0.9846690314435159, "flos": 25228610974080.0, "grad_norm": 1.7654685605430605, "language_loss": 0.7363739, "learning_rate": 2.4462576628172528e-09, "loss": 0.75729239, "num_input_tokens_seen": 176778625, "step": 8189, "time_per_iteration": 2.8610963821411133 }, { "auxiliary_loss_clip": 0.01150764, "auxiliary_loss_mlp": 0.01022596, "balance_loss_clip": 1.04919696, "balance_loss_mlp": 1.01567602, "epoch": 0.984789274334155, "flos": 18552171248640.0, "grad_norm": 2.8271027688155077, "language_loss": 0.74056017, "learning_rate": 2.407893289898766e-09, "loss": 0.76229376, "num_input_tokens_seen": 176797655, "step": 8190, "time_per_iteration": 4.1838014125823975 }, { "auxiliary_loss_clip": 0.01112927, "auxiliary_loss_mlp": 0.01022535, "balance_loss_clip": 1.04244626, "balance_loss_mlp": 1.01490533, "epoch": 0.984909517224794, "flos": 27345437233920.0, "grad_norm": 1.9967751376853875, "language_loss": 0.84089673, "learning_rate": 2.3698319501144202e-09, "loss": 0.86225128, "num_input_tokens_seen": 176818640, "step": 8191, "time_per_iteration": 2.8940703868865967 }, { "auxiliary_loss_clip": 0.01156689, "auxiliary_loss_mlp": 0.01025616, "balance_loss_clip": 1.04704702, "balance_loss_mlp": 1.01779032, "epoch": 0.9850297601154332, "flos": 18734058743040.0, "grad_norm": 1.759691091542104, "language_loss": 0.73765564, "learning_rate": 2.3320736492382644e-09, "loss": 0.75947869, "num_input_tokens_seen": 176837475, "step": 8192, "time_per_iteration": 3.63163423538208 }, { "auxiliary_loss_clip": 0.01167365, "auxiliary_loss_mlp": 0.01021656, "balance_loss_clip": 1.05050814, "balance_loss_mlp": 1.01470304, "epoch": 0.9851500030060723, "flos": 22308247514880.0, "grad_norm": 1.5802296405131417, "language_loss": 0.681409, "learning_rate": 2.29461839299816e-09, "loss": 0.70329928, "num_input_tokens_seen": 176857190, "step": 8193, "time_per_iteration": 3.5933077335357666 }, { "auxiliary_loss_clip": 0.01122261, "auxiliary_loss_mlp": 0.01023859, "balance_loss_clip": 1.0440011, "balance_loss_mlp": 1.0161823, "epoch": 0.9852702458967113, "flos": 26353691746560.0, "grad_norm": 1.7089420298492104, "language_loss": 0.79946333, "learning_rate": 2.257466187076229e-09, "loss": 0.82092452, "num_input_tokens_seen": 176876395, "step": 8194, "time_per_iteration": 2.7424590587615967 }, { "auxiliary_loss_clip": 0.01154406, "auxiliary_loss_mlp": 0.00711404, "balance_loss_clip": 1.04519558, "balance_loss_mlp": 1.00052631, "epoch": 0.9853904887873505, "flos": 20883599314560.0, "grad_norm": 3.898685667356876, "language_loss": 0.71469152, "learning_rate": 2.2206170371081854e-09, "loss": 0.73334962, "num_input_tokens_seen": 176894980, "step": 8195, "time_per_iteration": 2.584785223007202 }, { "auxiliary_loss_clip": 0.01139346, "auxiliary_loss_mlp": 0.01024515, "balance_loss_clip": 1.04702652, "balance_loss_mlp": 1.01708508, "epoch": 0.9855107316779895, "flos": 25263444188160.0, "grad_norm": 1.8552482216851007, "language_loss": 0.84936255, "learning_rate": 2.1840709486842247e-09, "loss": 0.87100112, "num_input_tokens_seen": 176914600, "step": 8196, "time_per_iteration": 2.741377115249634 }, { "auxiliary_loss_clip": 0.01130591, "auxiliary_loss_mlp": 0.01026917, "balance_loss_clip": 1.04529285, "balance_loss_mlp": 1.01903081, "epoch": 0.9856309745686286, "flos": 19062102677760.0, "grad_norm": 2.01177342765351, "language_loss": 0.7893303, "learning_rate": 2.1478279273481335e-09, "loss": 0.81090546, "num_input_tokens_seen": 176933085, "step": 8197, "time_per_iteration": 2.6247174739837646 }, { "auxiliary_loss_clip": 0.01152768, "auxiliary_loss_mlp": 0.01020667, "balance_loss_clip": 1.04984593, "balance_loss_mlp": 1.0138278, "epoch": 0.9857512174592677, "flos": 34130758060800.0, "grad_norm": 2.056435627233957, "language_loss": 0.80294877, "learning_rate": 2.1118879785981815e-09, "loss": 0.82468313, "num_input_tokens_seen": 176953225, "step": 8198, "time_per_iteration": 2.8267128467559814 }, { "auxiliary_loss_clip": 0.01134744, "auxiliary_loss_mlp": 0.01024245, "balance_loss_clip": 1.04506719, "balance_loss_mlp": 1.01729846, "epoch": 0.9858714603499068, "flos": 25994693266560.0, "grad_norm": 2.1785104755124154, "language_loss": 0.79683971, "learning_rate": 2.0762511078862288e-09, "loss": 0.81842959, "num_input_tokens_seen": 176973570, "step": 8199, "time_per_iteration": 2.6465187072753906 }, { "auxiliary_loss_clip": 0.01143777, "auxiliary_loss_mlp": 0.01028571, "balance_loss_clip": 1.046175, "balance_loss_mlp": 1.02146268, "epoch": 0.9859917032405459, "flos": 23696230907520.0, "grad_norm": 2.65379820764082, "language_loss": 0.64788103, "learning_rate": 2.0409173206186183e-09, "loss": 0.66960454, "num_input_tokens_seen": 176992810, "step": 8200, "time_per_iteration": 2.6537837982177734 }, { "auxiliary_loss_clip": 0.01120821, "auxiliary_loss_mlp": 0.01020273, "balance_loss_clip": 1.04711199, "balance_loss_mlp": 1.01379657, "epoch": 0.986111946131185, "flos": 19938287134080.0, "grad_norm": 1.884023164004413, "language_loss": 0.87303215, "learning_rate": 2.0058866221550617e-09, "loss": 0.89444309, "num_input_tokens_seen": 177011050, "step": 8201, "time_per_iteration": 2.62298583984375 }, { "auxiliary_loss_clip": 0.01169648, "auxiliary_loss_mlp": 0.01025789, "balance_loss_clip": 1.04960346, "balance_loss_mlp": 1.01836801, "epoch": 0.9862321890218241, "flos": 19828831415040.0, "grad_norm": 2.1558705519346346, "language_loss": 0.75339377, "learning_rate": 1.971159017809976e-09, "loss": 0.77534819, "num_input_tokens_seen": 177029340, "step": 8202, "time_per_iteration": 2.614694595336914 }, { "auxiliary_loss_clip": 0.01153942, "auxiliary_loss_mlp": 0.01023792, "balance_loss_clip": 1.05012977, "balance_loss_mlp": 1.01661575, "epoch": 0.9863524319124631, "flos": 21652051904640.0, "grad_norm": 2.3286605908402187, "language_loss": 0.78212535, "learning_rate": 1.93673451285159e-09, "loss": 0.80390269, "num_input_tokens_seen": 177048390, "step": 8203, "time_per_iteration": 2.5797979831695557 }, { "auxiliary_loss_clip": 0.01049251, "auxiliary_loss_mlp": 0.01006585, "balance_loss_clip": 1.01792228, "balance_loss_mlp": 1.00558376, "epoch": 0.9864726748031023, "flos": 52769977920000.0, "grad_norm": 0.733591873040984, "language_loss": 0.56459463, "learning_rate": 1.9026131125019495e-09, "loss": 0.58515298, "num_input_tokens_seen": 177105760, "step": 8204, "time_per_iteration": 3.173271894454956 }, { "auxiliary_loss_clip": 0.01147238, "auxiliary_loss_mlp": 0.01022235, "balance_loss_clip": 1.04720473, "balance_loss_mlp": 1.01559806, "epoch": 0.9865929176937414, "flos": 23364631526400.0, "grad_norm": 1.8989008840360246, "language_loss": 0.8739478, "learning_rate": 1.8687948219371363e-09, "loss": 0.89564252, "num_input_tokens_seen": 177124985, "step": 8205, "time_per_iteration": 2.570219039916992 }, { "auxiliary_loss_clip": 0.01171657, "auxiliary_loss_mlp": 0.01031148, "balance_loss_clip": 1.04819131, "balance_loss_mlp": 1.0231787, "epoch": 0.9867131605843804, "flos": 21616679986560.0, "grad_norm": 2.0208922293399794, "language_loss": 0.89011633, "learning_rate": 1.835279646287491e-09, "loss": 0.91214436, "num_input_tokens_seen": 177142995, "step": 8206, "time_per_iteration": 2.588348150253296 }, { "auxiliary_loss_clip": 0.01159543, "auxiliary_loss_mlp": 0.01027427, "balance_loss_clip": 1.04876423, "balance_loss_mlp": 1.02003002, "epoch": 0.9868334034750196, "flos": 22271403139200.0, "grad_norm": 1.862236404670989, "language_loss": 0.76541579, "learning_rate": 1.8020675906371685e-09, "loss": 0.78728557, "num_input_tokens_seen": 177162390, "step": 8207, "time_per_iteration": 2.577763080596924 }, { "auxiliary_loss_clip": 0.01099824, "auxiliary_loss_mlp": 0.01024373, "balance_loss_clip": 1.04109597, "balance_loss_mlp": 1.01773024, "epoch": 0.9869536463656586, "flos": 25809573548160.0, "grad_norm": 3.8069499527890223, "language_loss": 0.75182223, "learning_rate": 1.7691586600243612e-09, "loss": 0.7730642, "num_input_tokens_seen": 177181290, "step": 8208, "time_per_iteration": 2.7075040340423584 }, { "auxiliary_loss_clip": 0.01134239, "auxiliary_loss_mlp": 0.01028861, "balance_loss_clip": 1.0476706, "balance_loss_mlp": 1.02161658, "epoch": 0.9870738892562977, "flos": 16398500613120.0, "grad_norm": 4.297878946083043, "language_loss": 0.86775076, "learning_rate": 1.7365528594415202e-09, "loss": 0.88938177, "num_input_tokens_seen": 177195360, "step": 8209, "time_per_iteration": 2.6240270137786865 }, { "auxiliary_loss_clip": 0.01158741, "auxiliary_loss_mlp": 0.00711534, "balance_loss_clip": 1.04858029, "balance_loss_mlp": 1.00054741, "epoch": 0.9871941321469369, "flos": 35481358373760.0, "grad_norm": 3.831528061919721, "language_loss": 0.67762315, "learning_rate": 1.7042501938346888e-09, "loss": 0.6963259, "num_input_tokens_seen": 177218090, "step": 8210, "time_per_iteration": 2.7029850482940674 }, { "auxiliary_loss_clip": 0.01125267, "auxiliary_loss_mlp": 0.01023127, "balance_loss_clip": 1.04165769, "balance_loss_mlp": 1.0162127, "epoch": 0.9873143750375759, "flos": 21434217874560.0, "grad_norm": 2.206657283860863, "language_loss": 0.76336741, "learning_rate": 1.6722506681043913e-09, "loss": 0.78485137, "num_input_tokens_seen": 177237050, "step": 8211, "time_per_iteration": 2.660646438598633 }, { "auxiliary_loss_clip": 0.01141777, "auxiliary_loss_mlp": 0.01028947, "balance_loss_clip": 1.04746461, "balance_loss_mlp": 1.02206564, "epoch": 0.987434617928215, "flos": 16326499800960.0, "grad_norm": 2.6316921784476124, "language_loss": 0.69611025, "learning_rate": 1.640554287104745e-09, "loss": 0.71781749, "num_input_tokens_seen": 177255325, "step": 8212, "time_per_iteration": 2.6337451934814453 }, { "auxiliary_loss_clip": 0.01124088, "auxiliary_loss_mlp": 0.01024216, "balance_loss_clip": 1.04289794, "balance_loss_mlp": 1.01681292, "epoch": 0.9875548608188541, "flos": 17851984456320.0, "grad_norm": 2.021334251994825, "language_loss": 0.80108678, "learning_rate": 1.609161055644348e-09, "loss": 0.82256985, "num_input_tokens_seen": 177271250, "step": 8213, "time_per_iteration": 2.695349931716919 }, { "auxiliary_loss_clip": 0.01157698, "auxiliary_loss_mlp": 0.01026008, "balance_loss_clip": 1.04688036, "balance_loss_mlp": 1.01870632, "epoch": 0.9876751037094932, "flos": 26132876887680.0, "grad_norm": 2.156636986060166, "language_loss": 0.68658113, "learning_rate": 1.5780709784849467e-09, "loss": 0.70841825, "num_input_tokens_seen": 177288270, "step": 8214, "time_per_iteration": 3.5653703212738037 }, { "auxiliary_loss_clip": 0.01095166, "auxiliary_loss_mlp": 0.01025816, "balance_loss_clip": 1.04438972, "balance_loss_mlp": 1.01832366, "epoch": 0.9877953466001322, "flos": 15991344973440.0, "grad_norm": 1.9725189657040563, "language_loss": 0.82225275, "learning_rate": 1.5472840603436565e-09, "loss": 0.84346259, "num_input_tokens_seen": 177305500, "step": 8215, "time_per_iteration": 2.7385358810424805 }, { "auxiliary_loss_clip": 0.01139986, "auxiliary_loss_mlp": 0.01020964, "balance_loss_clip": 1.04725266, "balance_loss_mlp": 1.01410341, "epoch": 0.9879155894907714, "flos": 18806777827200.0, "grad_norm": 2.0827699384753284, "language_loss": 0.7806133, "learning_rate": 1.5168003058900757e-09, "loss": 0.80222273, "num_input_tokens_seen": 177323500, "step": 8216, "time_per_iteration": 2.6151182651519775 }, { "auxiliary_loss_clip": 0.01120818, "auxiliary_loss_mlp": 0.01023676, "balance_loss_clip": 1.04343605, "balance_loss_mlp": 1.01687217, "epoch": 0.9880358323814105, "flos": 22382044007040.0, "grad_norm": 1.9935303073346473, "language_loss": 0.91923094, "learning_rate": 1.4866197197491715e-09, "loss": 0.94067591, "num_input_tokens_seen": 177342860, "step": 8217, "time_per_iteration": 4.482726812362671 }, { "auxiliary_loss_clip": 0.01156884, "auxiliary_loss_mlp": 0.00712525, "balance_loss_clip": 1.0490129, "balance_loss_mlp": 1.00066829, "epoch": 0.9881560752720495, "flos": 15668831733120.0, "grad_norm": 3.366508978481423, "language_loss": 0.79312843, "learning_rate": 1.4567423064988371e-09, "loss": 0.81182253, "num_input_tokens_seen": 177360210, "step": 8218, "time_per_iteration": 2.660010576248169 }, { "auxiliary_loss_clip": 0.01170841, "auxiliary_loss_mlp": 0.0103273, "balance_loss_clip": 1.05107343, "balance_loss_mlp": 1.02553582, "epoch": 0.9882763181626887, "flos": 21500113374720.0, "grad_norm": 3.203748550449417, "language_loss": 0.78173029, "learning_rate": 1.4271680706718913e-09, "loss": 0.80376601, "num_input_tokens_seen": 177377885, "step": 8219, "time_per_iteration": 3.5604655742645264 }, { "auxiliary_loss_clip": 0.01155673, "auxiliary_loss_mlp": 0.01028081, "balance_loss_clip": 1.05095315, "balance_loss_mlp": 1.02054691, "epoch": 0.9883965610533277, "flos": 28034598551040.0, "grad_norm": 1.7797856831808536, "language_loss": 0.82697153, "learning_rate": 1.3978970167543013e-09, "loss": 0.84880906, "num_input_tokens_seen": 177398065, "step": 8220, "time_per_iteration": 2.6388680934906006 }, { "auxiliary_loss_clip": 0.01128053, "auxiliary_loss_mlp": 0.0102756, "balance_loss_clip": 1.04511631, "balance_loss_mlp": 1.02026713, "epoch": 0.9885168039439668, "flos": 14098601710080.0, "grad_norm": 2.0928129375903763, "language_loss": 0.7758733, "learning_rate": 1.3689291491867372e-09, "loss": 0.79742944, "num_input_tokens_seen": 177416380, "step": 8221, "time_per_iteration": 2.6580965518951416 }, { "auxiliary_loss_clip": 0.01169928, "auxiliary_loss_mlp": 0.01023503, "balance_loss_clip": 1.04874957, "balance_loss_mlp": 1.01600742, "epoch": 0.988637046834606, "flos": 26432013352320.0, "grad_norm": 4.804511826006839, "language_loss": 0.73656726, "learning_rate": 1.3402644723636836e-09, "loss": 0.75850159, "num_input_tokens_seen": 177438410, "step": 8222, "time_per_iteration": 2.6032583713531494 }, { "auxiliary_loss_clip": 0.01134322, "auxiliary_loss_mlp": 0.01024832, "balance_loss_clip": 1.04824448, "balance_loss_mlp": 1.01734591, "epoch": 0.988757289725245, "flos": 25229113764480.0, "grad_norm": 2.1355238858481336, "language_loss": 0.84049904, "learning_rate": 1.311902990633218e-09, "loss": 0.86209059, "num_input_tokens_seen": 177457375, "step": 8223, "time_per_iteration": 2.6738152503967285 }, { "auxiliary_loss_clip": 0.011266, "auxiliary_loss_mlp": 0.01022281, "balance_loss_clip": 1.03988886, "balance_loss_mlp": 1.01563573, "epoch": 0.9888775326158841, "flos": 26359042872960.0, "grad_norm": 2.118272609002341, "language_loss": 0.71379679, "learning_rate": 1.2838447082978987e-09, "loss": 0.73528564, "num_input_tokens_seen": 177478530, "step": 8224, "time_per_iteration": 2.726290225982666 }, { "auxiliary_loss_clip": 0.01149608, "auxiliary_loss_mlp": 0.01025144, "balance_loss_clip": 1.04644895, "balance_loss_mlp": 1.0179081, "epoch": 0.9889977755065231, "flos": 24316120846080.0, "grad_norm": 3.1108814175819823, "language_loss": 0.83176839, "learning_rate": 1.2560896296143208e-09, "loss": 0.85351598, "num_input_tokens_seen": 177496995, "step": 8225, "time_per_iteration": 2.6524744033813477 }, { "auxiliary_loss_clip": 0.01168405, "auxiliary_loss_mlp": 0.01024194, "balance_loss_clip": 1.04885685, "balance_loss_mlp": 1.01731539, "epoch": 0.9891180183971623, "flos": 18951066760320.0, "grad_norm": 2.494231096153254, "language_loss": 0.82776445, "learning_rate": 1.2286377587926722e-09, "loss": 0.84969044, "num_input_tokens_seen": 177513785, "step": 8226, "time_per_iteration": 2.5306591987609863 }, { "auxiliary_loss_clip": 0.01164825, "auxiliary_loss_mlp": 0.01024914, "balance_loss_clip": 1.04630804, "balance_loss_mlp": 1.01751065, "epoch": 0.9892382612878013, "flos": 26176580760960.0, "grad_norm": 2.344493678347743, "language_loss": 0.74957895, "learning_rate": 1.2014890999973992e-09, "loss": 0.77147639, "num_input_tokens_seen": 177530705, "step": 8227, "time_per_iteration": 2.712671995162964 }, { "auxiliary_loss_clip": 0.01164937, "auxiliary_loss_mlp": 0.01020138, "balance_loss_clip": 1.04624581, "balance_loss_mlp": 1.01338482, "epoch": 0.9893585041784404, "flos": 25449605400960.0, "grad_norm": 1.6928590212071024, "language_loss": 0.78583533, "learning_rate": 1.1746436573472073e-09, "loss": 0.80768609, "num_input_tokens_seen": 177552440, "step": 8228, "time_per_iteration": 2.607734441757202 }, { "auxiliary_loss_clip": 0.01146612, "auxiliary_loss_mlp": 0.01028602, "balance_loss_clip": 1.04793763, "balance_loss_mlp": 1.02155638, "epoch": 0.9894787470690796, "flos": 20189302352640.0, "grad_norm": 2.3411324309266535, "language_loss": 0.6924156, "learning_rate": 1.1481014349141726e-09, "loss": 0.71416771, "num_input_tokens_seen": 177569660, "step": 8229, "time_per_iteration": 2.628166913986206 }, { "auxiliary_loss_clip": 0.01141977, "auxiliary_loss_mlp": 0.01028374, "balance_loss_clip": 1.04818714, "balance_loss_mlp": 1.02059507, "epoch": 0.9895989899597186, "flos": 24644308435200.0, "grad_norm": 2.3305917617524536, "language_loss": 0.84466541, "learning_rate": 1.121862436724852e-09, "loss": 0.86636889, "num_input_tokens_seen": 177588500, "step": 8230, "time_per_iteration": 2.644080400466919 }, { "auxiliary_loss_clip": 0.01152617, "auxiliary_loss_mlp": 0.0102326, "balance_loss_clip": 1.04899144, "balance_loss_mlp": 1.01672745, "epoch": 0.9897192328503577, "flos": 21799034357760.0, "grad_norm": 1.7738572750832455, "language_loss": 0.70514643, "learning_rate": 1.0959266667598388e-09, "loss": 0.72690523, "num_input_tokens_seen": 177607315, "step": 8231, "time_per_iteration": 2.6134653091430664 }, { "auxiliary_loss_clip": 0.01128339, "auxiliary_loss_mlp": 0.01030289, "balance_loss_clip": 1.04939187, "balance_loss_mlp": 1.02210522, "epoch": 0.9898394757409968, "flos": 21325229032320.0, "grad_norm": 2.0268698909847105, "language_loss": 0.74706388, "learning_rate": 1.0702941289533196e-09, "loss": 0.76865005, "num_input_tokens_seen": 177625990, "step": 8232, "time_per_iteration": 2.6828436851501465 }, { "auxiliary_loss_clip": 0.01120779, "auxiliary_loss_mlp": 0.01021538, "balance_loss_clip": 1.04583561, "balance_loss_mlp": 1.01474023, "epoch": 0.9899597186316359, "flos": 18545024442240.0, "grad_norm": 3.861613559999084, "language_loss": 0.89008892, "learning_rate": 1.0449648271939615e-09, "loss": 0.91151214, "num_input_tokens_seen": 177642335, "step": 8233, "time_per_iteration": 2.6581408977508545 }, { "auxiliary_loss_clip": 0.01110454, "auxiliary_loss_mlp": 0.0071115, "balance_loss_clip": 1.04628539, "balance_loss_mlp": 1.00066125, "epoch": 0.990079961522275, "flos": 23766723348480.0, "grad_norm": 2.336217322398547, "language_loss": 0.73210514, "learning_rate": 1.0199387653240243e-09, "loss": 0.75032115, "num_input_tokens_seen": 177662025, "step": 8234, "time_per_iteration": 2.735008478164673 }, { "auxiliary_loss_clip": 0.01131323, "auxiliary_loss_mlp": 0.01021634, "balance_loss_clip": 1.04664171, "balance_loss_mlp": 1.01502395, "epoch": 0.9902002044129141, "flos": 16399182971520.0, "grad_norm": 1.8426476492501864, "language_loss": 0.70883417, "learning_rate": 9.952159471400267e-10, "loss": 0.73036373, "num_input_tokens_seen": 177679065, "step": 8235, "time_per_iteration": 2.6092026233673096 }, { "auxiliary_loss_clip": 0.01156735, "auxiliary_loss_mlp": 0.00710761, "balance_loss_clip": 1.04949796, "balance_loss_mlp": 1.00062311, "epoch": 0.9903204473035532, "flos": 22559657783040.0, "grad_norm": 2.012979302317981, "language_loss": 0.84230846, "learning_rate": 9.707963763923022e-10, "loss": 0.86098349, "num_input_tokens_seen": 177698115, "step": 8236, "time_per_iteration": 2.6589393615722656 }, { "auxiliary_loss_clip": 0.01135114, "auxiliary_loss_mlp": 0.01022974, "balance_loss_clip": 1.04445875, "balance_loss_mlp": 1.01625037, "epoch": 0.9904406901941922, "flos": 16144001775360.0, "grad_norm": 1.8057950916402756, "language_loss": 0.79229116, "learning_rate": 9.466800567854427e-10, "loss": 0.8138721, "num_input_tokens_seen": 177716715, "step": 8237, "time_per_iteration": 2.638383626937866 }, { "auxiliary_loss_clip": 0.01122973, "auxiliary_loss_mlp": 0.01027786, "balance_loss_clip": 1.04417992, "balance_loss_mlp": 1.02067244, "epoch": 0.9905609330848314, "flos": 26651499408000.0, "grad_norm": 2.5459406865324845, "language_loss": 0.68283749, "learning_rate": 9.228669919778553e-10, "loss": 0.70434511, "num_input_tokens_seen": 177735640, "step": 8238, "time_per_iteration": 2.765155553817749 }, { "auxiliary_loss_clip": 0.01132731, "auxiliary_loss_mlp": 0.01028196, "balance_loss_clip": 1.04434967, "balance_loss_mlp": 1.02126431, "epoch": 0.9906811759754705, "flos": 23111820627840.0, "grad_norm": 3.1024283307000924, "language_loss": 0.7998265, "learning_rate": 8.993571855817617e-10, "loss": 0.82143575, "num_input_tokens_seen": 177754470, "step": 8239, "time_per_iteration": 3.5353777408599854 }, { "auxiliary_loss_clip": 0.01152509, "auxiliary_loss_mlp": 0.01028635, "balance_loss_clip": 1.04678524, "balance_loss_mlp": 1.0214591, "epoch": 0.9908014188661095, "flos": 22090593052800.0, "grad_norm": 10.251454796625382, "language_loss": 0.75168157, "learning_rate": 8.761506411638642e-10, "loss": 0.77349305, "num_input_tokens_seen": 177773935, "step": 8240, "time_per_iteration": 2.6304705142974854 }, { "auxiliary_loss_clip": 0.01136527, "auxiliary_loss_mlp": 0.01031881, "balance_loss_clip": 1.04639959, "balance_loss_mlp": 1.02429056, "epoch": 0.9909216617567487, "flos": 19242948677760.0, "grad_norm": 1.9246238368673363, "language_loss": 0.73558766, "learning_rate": 8.53247362244236e-10, "loss": 0.75727177, "num_input_tokens_seen": 177792745, "step": 8241, "time_per_iteration": 2.619723320007324 }, { "auxiliary_loss_clip": 0.01138404, "auxiliary_loss_mlp": 0.01020986, "balance_loss_clip": 1.04763937, "balance_loss_mlp": 1.01353526, "epoch": 0.9910419046473877, "flos": 23621213352960.0, "grad_norm": 1.6168511020773184, "language_loss": 0.68461001, "learning_rate": 8.306473522976532e-10, "loss": 0.70620394, "num_input_tokens_seen": 177812150, "step": 8242, "time_per_iteration": 2.6596791744232178 }, { "auxiliary_loss_clip": 0.01168228, "auxiliary_loss_mlp": 0.01026034, "balance_loss_clip": 1.04920506, "balance_loss_mlp": 1.01917338, "epoch": 0.9911621475380268, "flos": 22711380831360.0, "grad_norm": 2.8387959811276833, "language_loss": 0.7167936, "learning_rate": 8.083506147522623e-10, "loss": 0.73873621, "num_input_tokens_seen": 177831545, "step": 8243, "time_per_iteration": 3.539980411529541 }, { "auxiliary_loss_clip": 0.01146971, "auxiliary_loss_mlp": 0.01030728, "balance_loss_clip": 1.04643977, "balance_loss_mlp": 1.02309847, "epoch": 0.991282390428666, "flos": 13516956777600.0, "grad_norm": 2.2968542895362978, "language_loss": 0.85469055, "learning_rate": 7.863571529906909e-10, "loss": 0.87646759, "num_input_tokens_seen": 177847130, "step": 8244, "time_per_iteration": 2.5390684604644775 }, { "auxiliary_loss_clip": 0.01059233, "auxiliary_loss_mlp": 0.01007581, "balance_loss_clip": 1.01752591, "balance_loss_mlp": 1.00652647, "epoch": 0.991402633319305, "flos": 61830492071040.0, "grad_norm": 0.7297234446309252, "language_loss": 0.59639871, "learning_rate": 7.646669703489372e-10, "loss": 0.61706686, "num_input_tokens_seen": 177911440, "step": 8245, "time_per_iteration": 4.233342885971069 }, { "auxiliary_loss_clip": 0.01048115, "auxiliary_loss_mlp": 0.0103057, "balance_loss_clip": 1.03636324, "balance_loss_mlp": 1.02309859, "epoch": 0.9915228762099441, "flos": 18770148933120.0, "grad_norm": 4.425173846713306, "language_loss": 0.57546669, "learning_rate": 7.432800701177023e-10, "loss": 0.59625357, "num_input_tokens_seen": 177929440, "step": 8246, "time_per_iteration": 2.9774606227874756 }, { "auxiliary_loss_clip": 0.01050004, "auxiliary_loss_mlp": 0.01004894, "balance_loss_clip": 1.01977348, "balance_loss_mlp": 1.00377929, "epoch": 0.9916431191005832, "flos": 65936660244480.0, "grad_norm": 0.7908664010106694, "language_loss": 0.57744336, "learning_rate": 7.221964555415017e-10, "loss": 0.5979923, "num_input_tokens_seen": 177989100, "step": 8247, "time_per_iteration": 3.633542776107788 }, { "auxiliary_loss_clip": 0.01134825, "auxiliary_loss_mlp": 0.01022376, "balance_loss_clip": 1.0476048, "balance_loss_mlp": 1.01583695, "epoch": 0.9917633619912223, "flos": 16581573256320.0, "grad_norm": 2.703292582294238, "language_loss": 0.75516963, "learning_rate": 7.01416129818222e-10, "loss": 0.77674168, "num_input_tokens_seen": 178006720, "step": 8248, "time_per_iteration": 2.740468740463257 }, { "auxiliary_loss_clip": 0.01130784, "auxiliary_loss_mlp": 0.01022317, "balance_loss_clip": 1.04643369, "balance_loss_mlp": 1.01546001, "epoch": 0.9918836048818613, "flos": 25411108999680.0, "grad_norm": 9.787901359964103, "language_loss": 0.58352125, "learning_rate": 6.809390961006745e-10, "loss": 0.60505223, "num_input_tokens_seen": 178026850, "step": 8249, "time_per_iteration": 2.771547794342041 }, { "auxiliary_loss_clip": 0.0113818, "auxiliary_loss_mlp": 0.01030454, "balance_loss_clip": 1.04753256, "balance_loss_mlp": 1.02339661, "epoch": 0.9920038477725005, "flos": 25046867134080.0, "grad_norm": 2.924255407683261, "language_loss": 0.68937737, "learning_rate": 6.607653574948191e-10, "loss": 0.71106374, "num_input_tokens_seen": 178047630, "step": 8250, "time_per_iteration": 2.7754065990448 }, { "auxiliary_loss_clip": 0.01143334, "auxiliary_loss_mlp": 0.01025179, "balance_loss_clip": 1.04415917, "balance_loss_mlp": 1.01826787, "epoch": 0.9921240906631396, "flos": 21829773421440.0, "grad_norm": 1.8764198090113453, "language_loss": 0.81571579, "learning_rate": 6.408949170613187e-10, "loss": 0.83740091, "num_input_tokens_seen": 178066895, "step": 8251, "time_per_iteration": 2.7123591899871826 }, { "auxiliary_loss_clip": 0.01136935, "auxiliary_loss_mlp": 0.01027665, "balance_loss_clip": 1.04680908, "balance_loss_mlp": 1.02019978, "epoch": 0.9922443335537786, "flos": 24864225454080.0, "grad_norm": 1.8595041641261216, "language_loss": 0.81945735, "learning_rate": 6.213277778144288e-10, "loss": 0.84110332, "num_input_tokens_seen": 178088540, "step": 8252, "time_per_iteration": 2.6845319271087646 }, { "auxiliary_loss_clip": 0.01091171, "auxiliary_loss_mlp": 0.01028726, "balance_loss_clip": 1.04108953, "balance_loss_mlp": 1.0210135, "epoch": 0.9923645764444178, "flos": 21613088626560.0, "grad_norm": 2.1676396664964384, "language_loss": 0.67025888, "learning_rate": 6.020639427224416e-10, "loss": 0.69145781, "num_input_tokens_seen": 178106185, "step": 8253, "time_per_iteration": 2.7587106227874756 }, { "auxiliary_loss_clip": 0.01138979, "auxiliary_loss_mlp": 0.01024089, "balance_loss_clip": 1.04936171, "balance_loss_mlp": 1.01730347, "epoch": 0.9924848193350568, "flos": 25001798544000.0, "grad_norm": 2.007993352447434, "language_loss": 0.72637266, "learning_rate": 5.831034147076864e-10, "loss": 0.74800336, "num_input_tokens_seen": 178123435, "step": 8254, "time_per_iteration": 2.6581783294677734 }, { "auxiliary_loss_clip": 0.01056318, "auxiliary_loss_mlp": 0.01005971, "balance_loss_clip": 1.01651859, "balance_loss_mlp": 1.00492811, "epoch": 0.9926050622256959, "flos": 68912543151360.0, "grad_norm": 0.7230943291694873, "language_loss": 0.55667031, "learning_rate": 5.644461966463065e-10, "loss": 0.57729322, "num_input_tokens_seen": 178191045, "step": 8255, "time_per_iteration": 3.307783603668213 }, { "auxiliary_loss_clip": 0.011371, "auxiliary_loss_mlp": 0.01024674, "balance_loss_clip": 1.04715669, "balance_loss_mlp": 1.01789355, "epoch": 0.9927253051163349, "flos": 20923675914240.0, "grad_norm": 2.3122322211654165, "language_loss": 0.75856924, "learning_rate": 5.460922913687049e-10, "loss": 0.78018695, "num_input_tokens_seen": 178210135, "step": 8256, "time_per_iteration": 2.608231544494629 }, { "auxiliary_loss_clip": 0.01102343, "auxiliary_loss_mlp": 0.00711749, "balance_loss_clip": 1.03977692, "balance_loss_mlp": 1.00053954, "epoch": 0.9928455480069741, "flos": 22308211601280.0, "grad_norm": 2.162192753920342, "language_loss": 0.75368237, "learning_rate": 5.280417016593208e-10, "loss": 0.77182329, "num_input_tokens_seen": 178229925, "step": 8257, "time_per_iteration": 2.753262758255005 }, { "auxiliary_loss_clip": 0.0115394, "auxiliary_loss_mlp": 0.00711055, "balance_loss_clip": 1.05197787, "balance_loss_mlp": 1.00058126, "epoch": 0.9929657908976132, "flos": 17383889393280.0, "grad_norm": 1.6394661492773983, "language_loss": 0.74628705, "learning_rate": 5.102944302559642e-10, "loss": 0.76493698, "num_input_tokens_seen": 178247420, "step": 8258, "time_per_iteration": 2.5675153732299805 }, { "auxiliary_loss_clip": 0.01097991, "auxiliary_loss_mlp": 0.01027526, "balance_loss_clip": 1.04231441, "balance_loss_mlp": 1.01971197, "epoch": 0.9930860337882522, "flos": 22674680110080.0, "grad_norm": 2.2063285507074553, "language_loss": 0.79621089, "learning_rate": 4.9285047985137e-10, "loss": 0.81746602, "num_input_tokens_seen": 178266840, "step": 8259, "time_per_iteration": 2.758399724960327 }, { "auxiliary_loss_clip": 0.01156785, "auxiliary_loss_mlp": 0.01030525, "balance_loss_clip": 1.04826427, "balance_loss_mlp": 1.02336371, "epoch": 0.9932062766788914, "flos": 28147789284480.0, "grad_norm": 1.6805743771131054, "language_loss": 0.7452479, "learning_rate": 4.757098530916436e-10, "loss": 0.76712096, "num_input_tokens_seen": 178287285, "step": 8260, "time_per_iteration": 2.6938722133636475 }, { "auxiliary_loss_clip": 0.01154494, "auxiliary_loss_mlp": 0.01029016, "balance_loss_clip": 1.04797041, "balance_loss_mlp": 1.02191138, "epoch": 0.9933265195695304, "flos": 20156659868160.0, "grad_norm": 2.846933326553484, "language_loss": 0.77295601, "learning_rate": 4.5887255257670563e-10, "loss": 0.7947911, "num_input_tokens_seen": 178304325, "step": 8261, "time_per_iteration": 2.575490951538086 }, { "auxiliary_loss_clip": 0.0116794, "auxiliary_loss_mlp": 0.01028431, "balance_loss_clip": 1.04792166, "balance_loss_mlp": 1.02127886, "epoch": 0.9934467624601695, "flos": 21362037494400.0, "grad_norm": 2.3811588699502635, "language_loss": 0.7693398, "learning_rate": 4.4233858086117906e-10, "loss": 0.79130352, "num_input_tokens_seen": 178322850, "step": 8262, "time_per_iteration": 2.6292269229888916 }, { "auxiliary_loss_clip": 0.0110313, "auxiliary_loss_mlp": 0.01027643, "balance_loss_clip": 1.04655647, "balance_loss_mlp": 1.02030253, "epoch": 0.9935670053508087, "flos": 19756040503680.0, "grad_norm": 2.13115046795359, "language_loss": 0.67709863, "learning_rate": 4.261079404528356e-10, "loss": 0.6984064, "num_input_tokens_seen": 178342330, "step": 8263, "time_per_iteration": 2.671175718307495 }, { "auxiliary_loss_clip": 0.01152271, "auxiliary_loss_mlp": 0.01026105, "balance_loss_clip": 1.04885721, "balance_loss_mlp": 1.0183773, "epoch": 0.9936872482414477, "flos": 21978838863360.0, "grad_norm": 2.1826315467073885, "language_loss": 0.68937093, "learning_rate": 4.1018063381437205e-10, "loss": 0.7111547, "num_input_tokens_seen": 178362715, "step": 8264, "time_per_iteration": 2.6680145263671875 }, { "auxiliary_loss_clip": 0.01053795, "auxiliary_loss_mlp": 0.01002001, "balance_loss_clip": 1.01962543, "balance_loss_mlp": 1.00101721, "epoch": 0.9938074911320868, "flos": 69810667839360.0, "grad_norm": 0.8642585327127159, "language_loss": 0.61019397, "learning_rate": 3.9455666336141167e-10, "loss": 0.63075197, "num_input_tokens_seen": 178426495, "step": 8265, "time_per_iteration": 4.374474763870239 }, { "auxiliary_loss_clip": 0.01168414, "auxiliary_loss_mlp": 0.01021023, "balance_loss_clip": 1.04977465, "balance_loss_mlp": 1.01388216, "epoch": 0.9939277340227259, "flos": 15084170058240.0, "grad_norm": 6.311959471558387, "language_loss": 0.83616167, "learning_rate": 3.7923603146450267e-10, "loss": 0.85805601, "num_input_tokens_seen": 178442555, "step": 8266, "time_per_iteration": 2.5763418674468994 }, { "auxiliary_loss_clip": 0.0112276, "auxiliary_loss_mlp": 0.01027983, "balance_loss_clip": 1.04360318, "balance_loss_mlp": 1.02128065, "epoch": 0.994047976913365, "flos": 17712364291200.0, "grad_norm": 2.152434727761101, "language_loss": 0.80618238, "learning_rate": 3.642187404473418e-10, "loss": 0.82768983, "num_input_tokens_seen": 178460715, "step": 8267, "time_per_iteration": 2.639291286468506 }, { "auxiliary_loss_clip": 0.01153702, "auxiliary_loss_mlp": 0.0102146, "balance_loss_clip": 1.04779053, "balance_loss_mlp": 1.01478446, "epoch": 0.994168219804004, "flos": 19171558396800.0, "grad_norm": 2.328263122567285, "language_loss": 0.85916144, "learning_rate": 3.495047925885508e-10, "loss": 0.88091302, "num_input_tokens_seen": 178479050, "step": 8268, "time_per_iteration": 2.6173160076141357 }, { "auxiliary_loss_clip": 0.01132887, "auxiliary_loss_mlp": 0.01028271, "balance_loss_clip": 1.04444337, "balance_loss_mlp": 1.02118397, "epoch": 0.9942884626946432, "flos": 17851589406720.0, "grad_norm": 2.4473841871680544, "language_loss": 0.83030105, "learning_rate": 3.350941901199e-10, "loss": 0.85191262, "num_input_tokens_seen": 178495970, "step": 8269, "time_per_iteration": 4.802061319351196 }, { "auxiliary_loss_clip": 0.01138234, "auxiliary_loss_mlp": 0.01022585, "balance_loss_clip": 1.04505479, "balance_loss_mlp": 1.01528311, "epoch": 0.9944087055852823, "flos": 18796578364800.0, "grad_norm": 2.91418865811128, "language_loss": 0.8346203, "learning_rate": 3.2098693522764066e-10, "loss": 0.85622847, "num_input_tokens_seen": 178509170, "step": 8270, "time_per_iteration": 3.546875476837158 }, { "auxiliary_loss_clip": 0.01141737, "auxiliary_loss_mlp": 0.00711419, "balance_loss_clip": 1.04526842, "balance_loss_mlp": 1.00055027, "epoch": 0.9945289484759213, "flos": 20996969616000.0, "grad_norm": 2.8326081953845175, "language_loss": 0.81378961, "learning_rate": 3.071830300516165e-10, "loss": 0.83232123, "num_input_tokens_seen": 178527000, "step": 8271, "time_per_iteration": 2.629908561706543 }, { "auxiliary_loss_clip": 0.01160003, "auxiliary_loss_mlp": 0.01027528, "balance_loss_clip": 1.04853988, "balance_loss_mlp": 1.0199883, "epoch": 0.9946491913665605, "flos": 14756952136320.0, "grad_norm": 2.4291279755029205, "language_loss": 0.71154785, "learning_rate": 2.9368247668615234e-10, "loss": 0.73342317, "num_input_tokens_seen": 178545590, "step": 8272, "time_per_iteration": 2.584712028503418 }, { "auxiliary_loss_clip": 0.01174074, "auxiliary_loss_mlp": 0.01027121, "balance_loss_clip": 1.0518291, "balance_loss_mlp": 1.01946783, "epoch": 0.9947694342571995, "flos": 12669931186560.0, "grad_norm": 3.804870749967328, "language_loss": 0.61359662, "learning_rate": 2.804852771789434e-10, "loss": 0.63560861, "num_input_tokens_seen": 178558890, "step": 8273, "time_per_iteration": 2.4819512367248535 }, { "auxiliary_loss_clip": 0.01165769, "auxiliary_loss_mlp": 0.01021532, "balance_loss_clip": 1.04730463, "balance_loss_mlp": 1.01479936, "epoch": 0.9948896771478386, "flos": 18843442634880.0, "grad_norm": 1.8084665125359112, "language_loss": 0.55866539, "learning_rate": 2.675914335321661e-10, "loss": 0.58053845, "num_input_tokens_seen": 178577645, "step": 8274, "time_per_iteration": 2.6063194274902344 }, { "auxiliary_loss_clip": 0.01157335, "auxiliary_loss_mlp": 0.01025884, "balance_loss_clip": 1.04786909, "balance_loss_mlp": 1.01817155, "epoch": 0.9950099200384778, "flos": 24900207903360.0, "grad_norm": 2.7158228718993738, "language_loss": 0.79630202, "learning_rate": 2.550009477018111e-10, "loss": 0.81813419, "num_input_tokens_seen": 178596415, "step": 8275, "time_per_iteration": 2.6045498847961426 }, { "auxiliary_loss_clip": 0.01137212, "auxiliary_loss_mlp": 0.00711581, "balance_loss_clip": 1.04776669, "balance_loss_mlp": 1.00057793, "epoch": 0.9951301629291168, "flos": 23733613987200.0, "grad_norm": 2.0811866318114167, "language_loss": 0.63196993, "learning_rate": 2.4271382159790634e-10, "loss": 0.65045786, "num_input_tokens_seen": 178613845, "step": 8276, "time_per_iteration": 2.6740376949310303 }, { "auxiliary_loss_clip": 0.01101441, "auxiliary_loss_mlp": 0.01030272, "balance_loss_clip": 1.04431486, "balance_loss_mlp": 1.02262509, "epoch": 0.9952504058197559, "flos": 22236893147520.0, "grad_norm": 2.0386413554871026, "language_loss": 0.85600251, "learning_rate": 2.3073005708429406e-10, "loss": 0.87731963, "num_input_tokens_seen": 178633490, "step": 8277, "time_per_iteration": 2.761793851852417 }, { "auxiliary_loss_clip": 0.01116574, "auxiliary_loss_mlp": 0.01025276, "balance_loss_clip": 1.04585755, "balance_loss_mlp": 1.01804602, "epoch": 0.995370648710395, "flos": 21211032718080.0, "grad_norm": 2.0428775330398325, "language_loss": 0.724509, "learning_rate": 2.190496559788535e-10, "loss": 0.74592745, "num_input_tokens_seen": 178651775, "step": 8278, "time_per_iteration": 2.6733314990997314 }, { "auxiliary_loss_clip": 0.01133989, "auxiliary_loss_mlp": 0.01024362, "balance_loss_clip": 1.04670107, "balance_loss_mlp": 1.01770997, "epoch": 0.9954908916010341, "flos": 14866731077760.0, "grad_norm": 4.885681268618183, "language_loss": 0.77079946, "learning_rate": 2.0767262005372265e-10, "loss": 0.79238296, "num_input_tokens_seen": 178669290, "step": 8279, "time_per_iteration": 2.623710870742798 }, { "auxiliary_loss_clip": 0.01127564, "auxiliary_loss_mlp": 0.01030598, "balance_loss_clip": 1.04425192, "balance_loss_mlp": 1.02377355, "epoch": 0.9956111344916732, "flos": 19208259118080.0, "grad_norm": 2.0708390343452954, "language_loss": 0.75547493, "learning_rate": 1.965989510346322e-10, "loss": 0.77705657, "num_input_tokens_seen": 178688410, "step": 8280, "time_per_iteration": 2.6805763244628906 }, { "auxiliary_loss_clip": 0.0110115, "auxiliary_loss_mlp": 0.01029387, "balance_loss_clip": 1.04409742, "balance_loss_mlp": 1.02182364, "epoch": 0.9957313773823123, "flos": 20047060494720.0, "grad_norm": 2.13425876528241, "language_loss": 0.71502113, "learning_rate": 1.8582865060134955e-10, "loss": 0.73632658, "num_input_tokens_seen": 178706600, "step": 8281, "time_per_iteration": 2.6837687492370605 }, { "auxiliary_loss_clip": 0.01070579, "auxiliary_loss_mlp": 0.01005949, "balance_loss_clip": 1.01820469, "balance_loss_mlp": 1.00491142, "epoch": 0.9958516202729514, "flos": 57483253768320.0, "grad_norm": 0.7861434553417559, "language_loss": 0.55632442, "learning_rate": 1.7536172038790098e-10, "loss": 0.57708967, "num_input_tokens_seen": 178766910, "step": 8282, "time_per_iteration": 3.219383478164673 }, { "auxiliary_loss_clip": 0.01139829, "auxiliary_loss_mlp": 0.01028991, "balance_loss_clip": 1.04914725, "balance_loss_mlp": 1.02188659, "epoch": 0.9959718631635904, "flos": 27782900974080.0, "grad_norm": 2.147969676739077, "language_loss": 0.69587564, "learning_rate": 1.651981619819054e-10, "loss": 0.71756387, "num_input_tokens_seen": 178784060, "step": 8283, "time_per_iteration": 2.675541639328003 }, { "auxiliary_loss_clip": 0.01111032, "auxiliary_loss_mlp": 0.01022365, "balance_loss_clip": 1.04474151, "balance_loss_mlp": 1.0150218, "epoch": 0.9960921060542296, "flos": 24024095274240.0, "grad_norm": 2.307691388440643, "language_loss": 0.71018243, "learning_rate": 1.5533797692546257e-10, "loss": 0.73151648, "num_input_tokens_seen": 178802795, "step": 8284, "time_per_iteration": 2.6991047859191895 }, { "auxiliary_loss_clip": 0.01148698, "auxiliary_loss_mlp": 0.01029454, "balance_loss_clip": 1.04631186, "balance_loss_mlp": 1.02220607, "epoch": 0.9962123489448687, "flos": 18697393935360.0, "grad_norm": 3.7380250645911315, "language_loss": 0.84039235, "learning_rate": 1.4578116671404296e-10, "loss": 0.86217386, "num_input_tokens_seen": 178821075, "step": 8285, "time_per_iteration": 2.609309673309326 }, { "auxiliary_loss_clip": 0.01150537, "auxiliary_loss_mlp": 0.01025981, "balance_loss_clip": 1.04968166, "balance_loss_mlp": 1.01923931, "epoch": 0.9963325918355077, "flos": 20010754823040.0, "grad_norm": 2.2150368535579408, "language_loss": 0.71118623, "learning_rate": 1.3652773279759777e-10, "loss": 0.7329514, "num_input_tokens_seen": 178837725, "step": 8286, "time_per_iteration": 2.5408124923706055 }, { "auxiliary_loss_clip": 0.0115069, "auxiliary_loss_mlp": 0.01026403, "balance_loss_clip": 1.04721653, "balance_loss_mlp": 1.01899719, "epoch": 0.9964528347261468, "flos": 33108488991360.0, "grad_norm": 1.752705371879642, "language_loss": 0.63059008, "learning_rate": 1.2757767657989305e-10, "loss": 0.65236098, "num_input_tokens_seen": 178861515, "step": 8287, "time_per_iteration": 2.678067445755005 }, { "auxiliary_loss_clip": 0.01147318, "auxiliary_loss_mlp": 0.01025027, "balance_loss_clip": 1.04550385, "balance_loss_mlp": 1.0178833, "epoch": 0.9965730776167859, "flos": 23109342589440.0, "grad_norm": 1.9477539384614777, "language_loss": 0.86903238, "learning_rate": 1.1893099941850948e-10, "loss": 0.89075589, "num_input_tokens_seen": 178880410, "step": 8288, "time_per_iteration": 2.5808804035186768 }, { "auxiliary_loss_clip": 0.01140963, "auxiliary_loss_mlp": 0.01023984, "balance_loss_clip": 1.04499865, "balance_loss_mlp": 1.01635468, "epoch": 0.996693320507425, "flos": 22965843755520.0, "grad_norm": 2.4802288098975485, "language_loss": 0.77210295, "learning_rate": 1.105877026252866e-10, "loss": 0.79375243, "num_input_tokens_seen": 178898740, "step": 8289, "time_per_iteration": 2.674262762069702 }, { "auxiliary_loss_clip": 0.01171405, "auxiliary_loss_mlp": 0.01023893, "balance_loss_clip": 1.04967952, "balance_loss_mlp": 1.0163945, "epoch": 0.996813563398064, "flos": 13222740476160.0, "grad_norm": 2.4373227089585763, "language_loss": 0.72307587, "learning_rate": 1.0254778746565663e-10, "loss": 0.74502885, "num_input_tokens_seen": 178914015, "step": 8290, "time_per_iteration": 2.523542881011963 }, { "auxiliary_loss_clip": 0.01119236, "auxiliary_loss_mlp": 0.01023898, "balance_loss_clip": 1.04569685, "balance_loss_mlp": 1.01725817, "epoch": 0.9969338062887032, "flos": 14647855553280.0, "grad_norm": 5.1425212430751515, "language_loss": 0.7330268, "learning_rate": 9.481125515953259e-11, "loss": 0.75445819, "num_input_tokens_seen": 178932075, "step": 8291, "time_per_iteration": 3.6843903064727783 }, { "auxiliary_loss_clip": 0.01107311, "auxiliary_loss_mlp": 0.01023936, "balance_loss_clip": 1.04177439, "balance_loss_mlp": 1.01651514, "epoch": 0.9970540491793423, "flos": 25735741142400.0, "grad_norm": 1.919756724396564, "language_loss": 0.80160916, "learning_rate": 8.737810688064228e-11, "loss": 0.82292163, "num_input_tokens_seen": 178951910, "step": 8292, "time_per_iteration": 2.848036766052246 }, { "auxiliary_loss_clip": 0.01114039, "auxiliary_loss_mlp": 0.01029264, "balance_loss_clip": 1.04457593, "balance_loss_mlp": 1.02088332, "epoch": 0.9971742920699813, "flos": 21470236237440.0, "grad_norm": 2.2253124467639314, "language_loss": 0.79143393, "learning_rate": 8.024834375608414e-11, "loss": 0.81286693, "num_input_tokens_seen": 178970500, "step": 8293, "time_per_iteration": 2.7272534370422363 }, { "auxiliary_loss_clip": 0.0107094, "auxiliary_loss_mlp": 0.01005248, "balance_loss_clip": 1.01837587, "balance_loss_mlp": 1.00421703, "epoch": 0.9972945349606205, "flos": 72211223629440.0, "grad_norm": 0.8219231835959345, "language_loss": 0.62844396, "learning_rate": 7.342196686788149e-11, "loss": 0.6492058, "num_input_tokens_seen": 179023665, "step": 8294, "time_per_iteration": 3.094611644744873 }, { "auxiliary_loss_clip": 0.01137776, "auxiliary_loss_mlp": 0.0102745, "balance_loss_clip": 1.05077207, "balance_loss_mlp": 1.01985061, "epoch": 0.9974147778512595, "flos": 19678293515520.0, "grad_norm": 2.9538017724835206, "language_loss": 0.69011962, "learning_rate": 6.689897725142834e-11, "loss": 0.71177191, "num_input_tokens_seen": 179043140, "step": 8295, "time_per_iteration": 4.022667646408081 }, { "auxiliary_loss_clip": 0.01137763, "auxiliary_loss_mlp": 0.01024887, "balance_loss_clip": 1.04558814, "balance_loss_mlp": 1.01776445, "epoch": 0.9975350207418986, "flos": 15960821391360.0, "grad_norm": 3.8307814442477595, "language_loss": 0.88610387, "learning_rate": 6.067937589615545e-11, "loss": 0.90773046, "num_input_tokens_seen": 179061215, "step": 8296, "time_per_iteration": 3.3974075317382812 }, { "auxiliary_loss_clip": 0.01050389, "auxiliary_loss_mlp": 0.01004697, "balance_loss_clip": 1.01931024, "balance_loss_mlp": 1.00357044, "epoch": 0.9976552636325378, "flos": 59961879768960.0, "grad_norm": 0.7415682443062426, "language_loss": 0.57657373, "learning_rate": 5.476316374575241e-11, "loss": 0.59712458, "num_input_tokens_seen": 179124700, "step": 8297, "time_per_iteration": 3.159198045730591 }, { "auxiliary_loss_clip": 0.01171108, "auxiliary_loss_mlp": 0.01033953, "balance_loss_clip": 1.04989481, "balance_loss_mlp": 1.02625465, "epoch": 0.9977755065231768, "flos": 22487872452480.0, "grad_norm": 2.213055796286033, "language_loss": 0.72703242, "learning_rate": 4.9150341697723476e-11, "loss": 0.74908304, "num_input_tokens_seen": 179144590, "step": 8298, "time_per_iteration": 2.651066303253174 }, { "auxiliary_loss_clip": 0.01134998, "auxiliary_loss_mlp": 0.0103196, "balance_loss_clip": 1.0474211, "balance_loss_mlp": 1.02428842, "epoch": 0.9978957494138159, "flos": 26030280666240.0, "grad_norm": 1.6015690746839666, "language_loss": 0.66783506, "learning_rate": 4.384091060338768e-11, "loss": 0.68950462, "num_input_tokens_seen": 179165060, "step": 8299, "time_per_iteration": 2.7502059936523438 }, { "auxiliary_loss_clip": 0.0115208, "auxiliary_loss_mlp": 0.01024394, "balance_loss_clip": 1.04808962, "balance_loss_mlp": 1.01720834, "epoch": 0.998015992304455, "flos": 22637835734400.0, "grad_norm": 2.352871251284669, "language_loss": 0.73666418, "learning_rate": 3.883487126810081e-11, "loss": 0.75842887, "num_input_tokens_seen": 179184320, "step": 8300, "time_per_iteration": 2.740628242492676 }, { "auxiliary_loss_clip": 0.01143935, "auxiliary_loss_mlp": 0.01024405, "balance_loss_clip": 1.0447855, "balance_loss_mlp": 1.0172646, "epoch": 0.9981362351950941, "flos": 18223444955520.0, "grad_norm": 1.6569891311420968, "language_loss": 0.7946806, "learning_rate": 3.41322244516995e-11, "loss": 0.81636399, "num_input_tokens_seen": 179202265, "step": 8301, "time_per_iteration": 2.681535005569458 }, { "auxiliary_loss_clip": 0.01094028, "auxiliary_loss_mlp": 0.01026506, "balance_loss_clip": 1.04275751, "balance_loss_mlp": 1.0197289, "epoch": 0.9982564780857331, "flos": 33474095573760.0, "grad_norm": 1.8734752136298027, "language_loss": 0.63027859, "learning_rate": 2.9732970866946925e-11, "loss": 0.65148389, "num_input_tokens_seen": 179222145, "step": 8302, "time_per_iteration": 2.9096121788024902 }, { "auxiliary_loss_clip": 0.01107104, "auxiliary_loss_mlp": 0.0102398, "balance_loss_clip": 1.04090905, "balance_loss_mlp": 1.01627362, "epoch": 0.9983767209763723, "flos": 15523465392000.0, "grad_norm": 2.3589949134414727, "language_loss": 0.78630704, "learning_rate": 2.563711118175327e-11, "loss": 0.8076179, "num_input_tokens_seen": 179239030, "step": 8303, "time_per_iteration": 2.720613956451416 }, { "auxiliary_loss_clip": 0.01115988, "auxiliary_loss_mlp": 0.01028051, "balance_loss_clip": 1.04425168, "balance_loss_mlp": 1.02144647, "epoch": 0.9984969638670114, "flos": 19974377324160.0, "grad_norm": 2.1029147628019804, "language_loss": 0.83822203, "learning_rate": 2.184464601717728e-11, "loss": 0.85966241, "num_input_tokens_seen": 179257345, "step": 8304, "time_per_iteration": 2.7263238430023193 }, { "auxiliary_loss_clip": 0.01159152, "auxiliary_loss_mlp": 0.01029057, "balance_loss_clip": 1.05202818, "balance_loss_mlp": 1.02206528, "epoch": 0.9986172067576504, "flos": 20375750874240.0, "grad_norm": 2.5943772611755636, "language_loss": 0.77553463, "learning_rate": 1.8355575948758585e-11, "loss": 0.79741675, "num_input_tokens_seen": 179275330, "step": 8305, "time_per_iteration": 2.5999512672424316 }, { "auxiliary_loss_clip": 0.0113579, "auxiliary_loss_mlp": 0.01024556, "balance_loss_clip": 1.04429543, "balance_loss_mlp": 1.01715922, "epoch": 0.9987374496482896, "flos": 23727903724800.0, "grad_norm": 2.4071858834070627, "language_loss": 0.73593152, "learning_rate": 1.5169901505407424e-11, "loss": 0.75753498, "num_input_tokens_seen": 179292395, "step": 8306, "time_per_iteration": 2.717360734939575 }, { "auxiliary_loss_clip": 0.01136796, "auxiliary_loss_mlp": 0.01024215, "balance_loss_clip": 1.0480113, "balance_loss_mlp": 1.017542, "epoch": 0.9988576925389286, "flos": 25044029959680.0, "grad_norm": 1.7000027693722948, "language_loss": 0.74490309, "learning_rate": 1.228762317073695e-11, "loss": 0.76651323, "num_input_tokens_seen": 179311225, "step": 8307, "time_per_iteration": 2.703446388244629 }, { "auxiliary_loss_clip": 0.01137111, "auxiliary_loss_mlp": 0.01025505, "balance_loss_clip": 1.04627681, "balance_loss_mlp": 1.01881087, "epoch": 0.9989779354295677, "flos": 31285627637760.0, "grad_norm": 2.296601656158836, "language_loss": 0.78785849, "learning_rate": 9.70874138195299e-12, "loss": 0.8094846, "num_input_tokens_seen": 179333135, "step": 8308, "time_per_iteration": 2.7829344272613525 }, { "auxiliary_loss_clip": 0.01168744, "auxiliary_loss_mlp": 0.01024153, "balance_loss_clip": 1.04809928, "balance_loss_mlp": 1.01657176, "epoch": 0.9990981783202069, "flos": 19573398823680.0, "grad_norm": 1.8963670494365836, "language_loss": 0.74700278, "learning_rate": 7.433256530076093e-12, "loss": 0.76893175, "num_input_tokens_seen": 179353090, "step": 8309, "time_per_iteration": 2.666844367980957 }, { "auxiliary_loss_clip": 0.01108753, "auxiliary_loss_mlp": 0.01026168, "balance_loss_clip": 1.04171252, "balance_loss_mlp": 1.01955199, "epoch": 0.9992184212108459, "flos": 17199667514880.0, "grad_norm": 2.4236394098756358, "language_loss": 0.75948524, "learning_rate": 5.46116896038562e-12, "loss": 0.7808345, "num_input_tokens_seen": 179367500, "step": 8310, "time_per_iteration": 2.6999924182891846 }, { "auxiliary_loss_clip": 0.01133587, "auxiliary_loss_mlp": 0.01026297, "balance_loss_clip": 1.04556251, "balance_loss_mlp": 1.01907277, "epoch": 0.999338664101485, "flos": 46497853681920.0, "grad_norm": 2.1160458129389097, "language_loss": 0.61847663, "learning_rate": 3.792478972197699e-12, "loss": 0.64007545, "num_input_tokens_seen": 179388085, "step": 8311, "time_per_iteration": 2.9370553493499756 }, { "auxiliary_loss_clip": 0.01166274, "auxiliary_loss_mlp": 0.01020465, "balance_loss_clip": 1.04703009, "balance_loss_mlp": 1.01345003, "epoch": 0.9994589069921241, "flos": 15158253859200.0, "grad_norm": 3.928895812331167, "language_loss": 0.70699728, "learning_rate": 2.4271868181990895e-12, "loss": 0.72886467, "num_input_tokens_seen": 179405250, "step": 8312, "time_per_iteration": 2.5835893154144287 }, { "auxiliary_loss_clip": 0.01156854, "auxiliary_loss_mlp": 0.01023113, "balance_loss_clip": 1.04922009, "balance_loss_mlp": 1.01548922, "epoch": 0.9995791498827632, "flos": 12531460256640.0, "grad_norm": 2.2159221669881823, "language_loss": 0.81062752, "learning_rate": 1.3652927060014973e-12, "loss": 0.8324272, "num_input_tokens_seen": 179420845, "step": 8313, "time_per_iteration": 2.633453845977783 }, { "auxiliary_loss_clip": 0.01124191, "auxiliary_loss_mlp": 0.01024784, "balance_loss_clip": 1.04527712, "balance_loss_mlp": 1.01767921, "epoch": 0.9996993927734023, "flos": 19245175320960.0, "grad_norm": 2.3207123839750348, "language_loss": 0.63962197, "learning_rate": 6.067967965872612e-13, "loss": 0.66111171, "num_input_tokens_seen": 179440455, "step": 8314, "time_per_iteration": 2.6670713424682617 }, { "auxiliary_loss_clip": 0.01120257, "auxiliary_loss_mlp": 0.0102466, "balance_loss_clip": 1.04494834, "balance_loss_mlp": 1.01744747, "epoch": 0.9998196356640414, "flos": 62952804518400.0, "grad_norm": 1.6155241789605208, "language_loss": 0.77119601, "learning_rate": 1.5169920497548615e-13, "loss": 0.79264522, "num_input_tokens_seen": 179465075, "step": 8315, "time_per_iteration": 3.032151222229004 }, { "auxiliary_loss_clip": 0.01108197, "auxiliary_loss_mlp": 0.0101412, "balance_loss_clip": 1.03227258, "balance_loss_mlp": 1.00972605, "epoch": 0.9999398785546805, "flos": 50922375073920.0, "grad_norm": 1.1411842884006298, "language_loss": 0.54971689, "learning_rate": 0.0, "loss": 0.57094008, "num_input_tokens_seen": 179513955, "step": 8316, "time_per_iteration": 3.1767191886901855 }, { "epoch": 0.9999398785546805, "num_input_tokens_seen": 179513955, "step": 8316, "total_flos": 6.996749092776837e+17, "train_loss": 0.7896715052544124, "train_runtime": 24877.4235, "train_samples_per_second": 13.371, "train_steps_per_second": 0.334 } ], "logging_steps": 1.0, "max_steps": 8316, "num_input_tokens_seen": 179513955, "num_train_epochs": 1, "save_steps": 1664, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.996749092776837e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }