{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999398785546805, "eval_steps": 500, "global_step": 8316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.05185335, "auxiliary_loss_mlp": 0.02433534, "balance_loss_clip": 2.59465408, "balance_loss_mlp": 1.87963986, "epoch": 0.00012024289063909097, "flos": 24932483919360.0, "grad_norm": 40.4480724223927, "language_loss": 2.58550739, "learning_rate": 0.0, "loss": 1.9090879, "num_input_tokens_seen": 20375, "step": 1, "time_per_iteration": 15.124518394470215 }, { "auxiliary_loss_clip": 0.0350525, "auxiliary_loss_mlp": 0.01638677, "balance_loss_clip": 1.73121238, "balance_loss_mlp": 1.2682699, "epoch": 0.00024048578127818193, "flos": 30664624377600.0, "grad_norm": 55.24028104111834, "language_loss": 1.8909446, "learning_rate": 5.021476677069823e-07, "loss": 1.942384, "num_input_tokens_seen": 39035, "step": 2, "time_per_iteration": 2.724127769470215 }, { "auxiliary_loss_clip": 0.03468621, "auxiliary_loss_mlp": 0.01588339, "balance_loss_clip": 1.72706831, "balance_loss_mlp": 1.22002971, "epoch": 0.0003607286719172729, "flos": 19026227969280.0, "grad_norm": 41.34773921452883, "language_loss": 1.6165818, "learning_rate": 7.958852231401551e-07, "loss": 1.66715133, "num_input_tokens_seen": 57600, "step": 3, "time_per_iteration": 2.657543897628784 }, { "auxiliary_loss_clip": 0.03496814, "auxiliary_loss_mlp": 0.01637831, "balance_loss_clip": 1.73247063, "balance_loss_mlp": 1.26723325, "epoch": 0.00048097156255636386, "flos": 19316314206720.0, "grad_norm": 37.020231505560474, "language_loss": 1.64451838, "learning_rate": 1.0042953354139647e-06, "loss": 1.6958648, "num_input_tokens_seen": 76465, "step": 4, "time_per_iteration": 2.6132500171661377 }, { "auxiliary_loss_clip": 0.03469621, "auxiliary_loss_mlp": 0.01594352, "balance_loss_clip": 1.72772098, "balance_loss_mlp": 1.233482, "epoch": 0.0006012144531954548, "flos": 13991264893440.0, "grad_norm": 54.98524326539907, "language_loss": 1.93146873, "learning_rate": 1.1659507774310057e-06, "loss": 1.98210859, "num_input_tokens_seen": 94350, "step": 5, "time_per_iteration": 2.7354953289031982 }, { "auxiliary_loss_clip": 0.03499077, "auxiliary_loss_mlp": 0.01605233, "balance_loss_clip": 1.73888242, "balance_loss_mlp": 1.24131131, "epoch": 0.0007214573438345458, "flos": 23148988225920.0, "grad_norm": 44.151142915898504, "language_loss": 1.6085887, "learning_rate": 1.2980328908471373e-06, "loss": 1.65963173, "num_input_tokens_seen": 114595, "step": 6, "time_per_iteration": 3.0918002128601074 }, { "auxiliary_loss_clip": 0.03247097, "auxiliary_loss_mlp": 0.01529816, "balance_loss_clip": 1.77931428, "balance_loss_mlp": 1.16055298, "epoch": 0.0008417002344736367, "flos": 67663246170240.0, "grad_norm": 6.240704648572775, "language_loss": 0.81465656, "learning_rate": 1.4097067265369432e-06, "loss": 0.86242568, "num_input_tokens_seen": 179590, "step": 7, "time_per_iteration": 3.2736217975616455 }, { "auxiliary_loss_clip": 0.03500811, "auxiliary_loss_mlp": 0.01674233, "balance_loss_clip": 1.72733915, "balance_loss_mlp": 1.29505169, "epoch": 0.0009619431251127277, "flos": 21281381504640.0, "grad_norm": 41.08012040211905, "language_loss": 1.58913291, "learning_rate": 1.506443003120947e-06, "loss": 1.64088345, "num_input_tokens_seen": 195090, "step": 8, "time_per_iteration": 2.8999123573303223 }, { "auxiliary_loss_clip": 0.03451107, "auxiliary_loss_mlp": 0.01623617, "balance_loss_clip": 1.73023951, "balance_loss_mlp": 1.257406, "epoch": 0.0010821860157518186, "flos": 23331342597120.0, "grad_norm": 17.49076040670353, "language_loss": 1.47781503, "learning_rate": 1.5917704462803102e-06, "loss": 1.52856231, "num_input_tokens_seen": 211635, "step": 9, "time_per_iteration": 2.888622283935547 }, { "auxiliary_loss_clip": 0.03459074, "auxiliary_loss_mlp": 0.01637403, "balance_loss_clip": 1.73184085, "balance_loss_mlp": 1.27100158, "epoch": 0.0012024289063909096, "flos": 17010166337280.0, "grad_norm": 13.379162811774234, "language_loss": 1.5282774, "learning_rate": 1.6680984451379884e-06, "loss": 1.57924223, "num_input_tokens_seen": 224705, "step": 10, "time_per_iteration": 2.756546974182129 }, { "auxiliary_loss_clip": 0.03479782, "auxiliary_loss_mlp": 0.01619238, "balance_loss_clip": 1.73374796, "balance_loss_mlp": 1.24692321, "epoch": 0.0013226717970300007, "flos": 21288133261440.0, "grad_norm": 14.39391284296686, "language_loss": 1.3245579, "learning_rate": 1.7371455188905097e-06, "loss": 1.37554812, "num_input_tokens_seen": 244635, "step": 11, "time_per_iteration": 2.78688383102417 }, { "auxiliary_loss_clip": 0.03472403, "auxiliary_loss_mlp": 0.01589809, "balance_loss_clip": 1.72860444, "balance_loss_mlp": 1.22207201, "epoch": 0.0014429146876690916, "flos": 27237884935680.0, "grad_norm": 10.467330461408412, "language_loss": 1.25333619, "learning_rate": 1.8001805585541196e-06, "loss": 1.3039583, "num_input_tokens_seen": 265765, "step": 12, "time_per_iteration": 2.968977212905884 }, { "auxiliary_loss_clip": 0.0343064, "auxiliary_loss_mlp": 0.01621842, "balance_loss_clip": 1.7212956, "balance_loss_mlp": 1.25200772, "epoch": 0.0015631575783081825, "flos": 19062174504960.0, "grad_norm": 6.541967633443749, "language_loss": 1.28970003, "learning_rate": 1.8581671739548328e-06, "loss": 1.34022498, "num_input_tokens_seen": 283500, "step": 13, "time_per_iteration": 2.778419256210327 }, { "auxiliary_loss_clip": 0.03440524, "auxiliary_loss_mlp": 0.01579669, "balance_loss_clip": 1.72803009, "balance_loss_mlp": 1.21574664, "epoch": 0.0016834004689472734, "flos": 48139473985920.0, "grad_norm": 6.231639441799505, "language_loss": 1.1346221, "learning_rate": 1.9118543942439254e-06, "loss": 1.18482399, "num_input_tokens_seen": 305685, "step": 14, "time_per_iteration": 4.086289405822754 }, { "auxiliary_loss_clip": 0.03415908, "auxiliary_loss_mlp": 0.01638095, "balance_loss_clip": 1.71943903, "balance_loss_mlp": 1.27131164, "epoch": 0.0018036433595863645, "flos": 34970026314240.0, "grad_norm": 5.366564514175447, "language_loss": 1.12644029, "learning_rate": 1.961836000571161e-06, "loss": 1.17698026, "num_input_tokens_seen": 327340, "step": 15, "time_per_iteration": 3.9294369220733643 }, { "auxiliary_loss_clip": 0.03122865, "auxiliary_loss_mlp": 0.01478761, "balance_loss_clip": 1.75881386, "balance_loss_mlp": 1.12170529, "epoch": 0.0019238862502254555, "flos": 59768284440960.0, "grad_norm": 3.844140140725537, "language_loss": 0.64833224, "learning_rate": 2.0085906708279293e-06, "loss": 0.69434845, "num_input_tokens_seen": 382710, "step": 16, "time_per_iteration": 3.288327693939209 }, { "auxiliary_loss_clip": 0.03384965, "auxiliary_loss_mlp": 0.01584047, "balance_loss_clip": 1.71822166, "balance_loss_mlp": 1.22088802, "epoch": 0.0020441291408645466, "flos": 20814543417600.0, "grad_norm": 4.289096016947757, "language_loss": 1.1584332, "learning_rate": 2.0525099325728135e-06, "loss": 1.20812321, "num_input_tokens_seen": 400890, "step": 17, "time_per_iteration": 3.0331172943115234 }, { "auxiliary_loss_clip": 0.03071732, "auxiliary_loss_mlp": 0.01458972, "balance_loss_clip": 1.74686003, "balance_loss_mlp": 1.10649359, "epoch": 0.0021643720315036373, "flos": 63857001582720.0, "grad_norm": 3.604440557412118, "language_loss": 0.72346109, "learning_rate": 2.0939181139872922e-06, "loss": 0.76876813, "num_input_tokens_seen": 462605, "step": 18, "time_per_iteration": 3.3093671798706055 }, { "auxiliary_loss_clip": 0.0339064, "auxiliary_loss_mlp": 0.01601546, "balance_loss_clip": 1.71849382, "balance_loss_mlp": 1.24925852, "epoch": 0.0022846149221427284, "flos": 31284981192960.0, "grad_norm": 4.8730622052850485, "language_loss": 1.01332808, "learning_rate": 2.1330868934640175e-06, "loss": 1.06324995, "num_input_tokens_seen": 483280, "step": 19, "time_per_iteration": 3.05334210395813 }, { "auxiliary_loss_clip": 0.03005865, "auxiliary_loss_mlp": 0.01438341, "balance_loss_clip": 1.73370576, "balance_loss_mlp": 1.09196603, "epoch": 0.002404857812781819, "flos": 51083648161920.0, "grad_norm": 3.5596986444807834, "language_loss": 0.76430011, "learning_rate": 2.170246112844971e-06, "loss": 0.80874217, "num_input_tokens_seen": 537620, "step": 20, "time_per_iteration": 3.080176591873169 }, { "auxiliary_loss_clip": 0.0329226, "auxiliary_loss_mlp": 0.01540556, "balance_loss_clip": 1.7006886, "balance_loss_mlp": 1.18845916, "epoch": 0.0025251007034209102, "flos": 15815347309440.0, "grad_norm": 6.399826225561999, "language_loss": 1.01332736, "learning_rate": 2.2055919496770983e-06, "loss": 1.06165552, "num_input_tokens_seen": 555760, "step": 21, "time_per_iteration": 2.818455696105957 }, { "auxiliary_loss_clip": 0.03260464, "auxiliary_loss_mlp": 0.01506764, "balance_loss_clip": 1.69960642, "balance_loss_mlp": 1.16553926, "epoch": 0.0026453435940600014, "flos": 37851857458560.0, "grad_norm": 5.118361462888217, "language_loss": 0.89574873, "learning_rate": 2.2392931865974923e-06, "loss": 0.94342101, "num_input_tokens_seen": 578450, "step": 22, "time_per_iteration": 3.0259480476379395 }, { "auxiliary_loss_clip": 0.03204627, "auxiliary_loss_mlp": 0.0151953, "balance_loss_clip": 1.68441713, "balance_loss_mlp": 1.17048562, "epoch": 0.002765586484699092, "flos": 21141976821120.0, "grad_norm": 4.92583740794861, "language_loss": 1.01950443, "learning_rate": 2.271496085962064e-06, "loss": 1.066746, "num_input_tokens_seen": 596145, "step": 23, "time_per_iteration": 2.886434316635132 }, { "auxiliary_loss_clip": 0.03203667, "auxiliary_loss_mlp": 0.01551569, "balance_loss_clip": 1.68738294, "balance_loss_mlp": 1.21148896, "epoch": 0.002885829375338183, "flos": 20667381396480.0, "grad_norm": 3.4512426182530604, "language_loss": 1.02640259, "learning_rate": 2.3023282262611022e-06, "loss": 1.07395506, "num_input_tokens_seen": 614920, "step": 24, "time_per_iteration": 2.821049451828003 }, { "auxiliary_loss_clip": 0.03191176, "auxiliary_loss_mlp": 0.01506546, "balance_loss_clip": 1.68249726, "balance_loss_mlp": 1.17047155, "epoch": 0.003006072265977274, "flos": 34823869873920.0, "grad_norm": 3.4080988107624446, "language_loss": 0.9259091, "learning_rate": 2.3319015548620114e-06, "loss": 0.97288644, "num_input_tokens_seen": 636060, "step": 25, "time_per_iteration": 3.018259286880493 }, { "auxiliary_loss_clip": 0.03175901, "auxiliary_loss_mlp": 0.01514845, "balance_loss_clip": 1.67973578, "balance_loss_mlp": 1.17209435, "epoch": 0.003126315156616365, "flos": 24422021118720.0, "grad_norm": 2.551703275296398, "language_loss": 0.9291147, "learning_rate": 2.3603148416618152e-06, "loss": 0.97602218, "num_input_tokens_seen": 655575, "step": 26, "time_per_iteration": 2.8973653316497803 }, { "auxiliary_loss_clip": 0.03138396, "auxiliary_loss_mlp": 0.0149702, "balance_loss_clip": 1.66958749, "balance_loss_mlp": 1.16285288, "epoch": 0.003246558047255456, "flos": 23622326674560.0, "grad_norm": 2.456157128322032, "language_loss": 1.01025105, "learning_rate": 2.3876556694204647e-06, "loss": 1.05660522, "num_input_tokens_seen": 675730, "step": 27, "time_per_iteration": 2.9213926792144775 }, { "auxiliary_loss_clip": 0.0308387, "auxiliary_loss_mlp": 0.01488497, "balance_loss_clip": 1.67152953, "balance_loss_mlp": 1.15375781, "epoch": 0.003366800937894547, "flos": 17820275725440.0, "grad_norm": 2.4096232349357534, "language_loss": 0.90869963, "learning_rate": 2.414002061950908e-06, "loss": 0.95442331, "num_input_tokens_seen": 694605, "step": 28, "time_per_iteration": 2.9950122833251953 }, { "auxiliary_loss_clip": 0.03086801, "auxiliary_loss_mlp": 0.01489512, "balance_loss_clip": 1.66141748, "balance_loss_mlp": 1.16297388, "epoch": 0.003487043828533638, "flos": 24426115269120.0, "grad_norm": 2.6213439079831264, "language_loss": 1.00017726, "learning_rate": 2.4394238264681557e-06, "loss": 1.0459404, "num_input_tokens_seen": 714340, "step": 29, "time_per_iteration": 3.02445387840271 }, { "auxiliary_loss_clip": 0.03056916, "auxiliary_loss_mlp": 0.01423809, "balance_loss_clip": 1.65792704, "balance_loss_mlp": 1.11214781, "epoch": 0.003607286719172729, "flos": 26140311002880.0, "grad_norm": 2.0568852810562825, "language_loss": 0.99631858, "learning_rate": 2.4639836682781433e-06, "loss": 1.04112577, "num_input_tokens_seen": 734470, "step": 30, "time_per_iteration": 2.9185898303985596 }, { "auxiliary_loss_clip": 0.03020404, "auxiliary_loss_mlp": 0.01438776, "balance_loss_clip": 1.66035163, "balance_loss_mlp": 1.12330055, "epoch": 0.00372752960981182, "flos": 20593082113920.0, "grad_norm": 2.250147371903613, "language_loss": 1.00315619, "learning_rate": 2.487738122623307e-06, "loss": 1.04774785, "num_input_tokens_seen": 753380, "step": 31, "time_per_iteration": 2.8238682746887207 }, { "auxiliary_loss_clip": 0.02909972, "auxiliary_loss_mlp": 0.01405161, "balance_loss_clip": 1.62567604, "balance_loss_mlp": 1.11238337, "epoch": 0.003847772500450911, "flos": 22674608282880.0, "grad_norm": 3.456212399523975, "language_loss": 0.98862624, "learning_rate": 2.510738338534912e-06, "loss": 1.03177762, "num_input_tokens_seen": 772105, "step": 32, "time_per_iteration": 2.8929226398468018 }, { "auxiliary_loss_clip": 0.02848385, "auxiliary_loss_mlp": 0.01436734, "balance_loss_clip": 1.60597122, "balance_loss_mlp": 1.12450135, "epoch": 0.003968015391090002, "flos": 17967796882560.0, "grad_norm": 2.344068393941192, "language_loss": 1.0257318, "learning_rate": 2.5330307420306648e-06, "loss": 1.06858301, "num_input_tokens_seen": 788955, "step": 33, "time_per_iteration": 2.7943201065063477 }, { "auxiliary_loss_clip": 0.02792377, "auxiliary_loss_mlp": 0.01416982, "balance_loss_clip": 1.59691072, "balance_loss_mlp": 1.10951734, "epoch": 0.004088258281729093, "flos": 27304103658240.0, "grad_norm": 2.560357003134473, "language_loss": 0.88235676, "learning_rate": 2.554657600279796e-06, "loss": 0.92445034, "num_input_tokens_seen": 810230, "step": 34, "time_per_iteration": 2.8532426357269287 }, { "auxiliary_loss_clip": 0.02733954, "auxiliary_loss_mlp": 0.01385347, "balance_loss_clip": 1.58420229, "balance_loss_mlp": 1.08722806, "epoch": 0.004208501172368184, "flos": 23258587599360.0, "grad_norm": 2.093911325986375, "language_loss": 1.0344528, "learning_rate": 2.5756575039679493e-06, "loss": 1.0756458, "num_input_tokens_seen": 829780, "step": 35, "time_per_iteration": 2.988123893737793 }, { "auxiliary_loss_clip": 0.02735044, "auxiliary_loss_mlp": 0.01408631, "balance_loss_clip": 1.5803895, "balance_loss_mlp": 1.11489916, "epoch": 0.0043287440630072746, "flos": 17312104062720.0, "grad_norm": 2.061206029369176, "language_loss": 0.95139968, "learning_rate": 2.5960657816942747e-06, "loss": 0.99283636, "num_input_tokens_seen": 848695, "step": 36, "time_per_iteration": 2.8112614154815674 }, { "auxiliary_loss_clip": 0.02238624, "auxiliary_loss_mlp": 0.01335062, "balance_loss_clip": 1.53190207, "balance_loss_mlp": 1.06192935, "epoch": 0.004448986953646365, "flos": 53092491160320.0, "grad_norm": 1.4277416072482807, "language_loss": 0.61032999, "learning_rate": 2.6159148575788668e-06, "loss": 0.64606678, "num_input_tokens_seen": 906730, "step": 37, "time_per_iteration": 3.436404228210449 }, { "auxiliary_loss_clip": 0.02629954, "auxiliary_loss_mlp": 0.01370801, "balance_loss_clip": 1.55763412, "balance_loss_mlp": 1.09804964, "epoch": 0.004569229844285457, "flos": 13444165866240.0, "grad_norm": 2.4946954402368133, "language_loss": 0.98737931, "learning_rate": 2.635234561171e-06, "loss": 1.02738678, "num_input_tokens_seen": 925125, "step": 38, "time_per_iteration": 2.8793773651123047 }, { "auxiliary_loss_clip": 0.02597857, "auxiliary_loss_mlp": 0.01371571, "balance_loss_clip": 1.54921222, "balance_loss_mlp": 1.09119105, "epoch": 0.0046894727349245475, "flos": 16209609966720.0, "grad_norm": 4.024938973739427, "language_loss": 0.93987781, "learning_rate": 2.6540523970949877e-06, "loss": 0.97957206, "num_input_tokens_seen": 939970, "step": 39, "time_per_iteration": 3.034435272216797 }, { "auxiliary_loss_clip": 0.02548622, "auxiliary_loss_mlp": 0.01362675, "balance_loss_clip": 1.54095733, "balance_loss_mlp": 1.08973384, "epoch": 0.004809715625563638, "flos": 23914244505600.0, "grad_norm": 2.5883133214017384, "language_loss": 0.9252919, "learning_rate": 2.6723937805519533e-06, "loss": 0.96440482, "num_input_tokens_seen": 957470, "step": 40, "time_per_iteration": 3.7699296474456787 }, { "auxiliary_loss_clip": 0.02528984, "auxiliary_loss_mlp": 0.01340672, "balance_loss_clip": 1.52901721, "balance_loss_mlp": 1.08089066, "epoch": 0.00492995851620273, "flos": 20773030273920.0, "grad_norm": 2.29589790401655, "language_loss": 0.92849833, "learning_rate": 2.690282243737839e-06, "loss": 0.96719486, "num_input_tokens_seen": 976405, "step": 41, "time_per_iteration": 4.826427459716797 }, { "auxiliary_loss_clip": 0.02516023, "auxiliary_loss_mlp": 0.0132808, "balance_loss_clip": 1.51832938, "balance_loss_mlp": 1.06562912, "epoch": 0.0050502014068418205, "flos": 20338655103360.0, "grad_norm": 2.678993185822361, "language_loss": 0.99334121, "learning_rate": 2.7077396173840807e-06, "loss": 1.03178227, "num_input_tokens_seen": 994690, "step": 42, "time_per_iteration": 2.8256630897521973 }, { "auxiliary_loss_clip": 0.02490952, "auxiliary_loss_mlp": 0.01341936, "balance_loss_clip": 1.5156579, "balance_loss_mlp": 1.07891214, "epoch": 0.005170444297480911, "flos": 25994872834560.0, "grad_norm": 2.4240305145624, "language_loss": 0.9267841, "learning_rate": 2.7247861909342594e-06, "loss": 0.96511292, "num_input_tokens_seen": 1015615, "step": 43, "time_per_iteration": 3.077012062072754 }, { "auxiliary_loss_clip": 0.02424027, "auxiliary_loss_mlp": 0.01355532, "balance_loss_clip": 1.49722731, "balance_loss_mlp": 1.09765887, "epoch": 0.005290687188120003, "flos": 20954055841920.0, "grad_norm": 2.1686668542059713, "language_loss": 0.8290838, "learning_rate": 2.7414408543044743e-06, "loss": 0.86687934, "num_input_tokens_seen": 1031255, "step": 44, "time_per_iteration": 3.0378284454345703 }, { "auxiliary_loss_clip": 0.02410129, "auxiliary_loss_mlp": 0.0132397, "balance_loss_clip": 1.49380207, "balance_loss_mlp": 1.07239115, "epoch": 0.005410930078759093, "flos": 15851401585920.0, "grad_norm": 7.028687941224897, "language_loss": 0.79304945, "learning_rate": 2.7577212237113157e-06, "loss": 0.83039045, "num_input_tokens_seen": 1048295, "step": 45, "time_per_iteration": 2.830261707305908 }, { "auxiliary_loss_clip": 0.02388573, "auxiliary_loss_mlp": 0.01321429, "balance_loss_clip": 1.4874568, "balance_loss_mlp": 1.06908679, "epoch": 0.005531172969398184, "flos": 21104988791040.0, "grad_norm": 2.264610562022804, "language_loss": 1.0418762, "learning_rate": 2.7736437536690466e-06, "loss": 1.07897627, "num_input_tokens_seen": 1067925, "step": 46, "time_per_iteration": 2.8702497482299805 }, { "auxiliary_loss_clip": 0.02365727, "auxiliary_loss_mlp": 0.01296018, "balance_loss_clip": 1.48145151, "balance_loss_mlp": 1.06141376, "epoch": 0.005651415860037276, "flos": 20844887431680.0, "grad_norm": 1.9491775411636278, "language_loss": 1.07772648, "learning_rate": 2.789223836941131e-06, "loss": 1.11434388, "num_input_tokens_seen": 1088060, "step": 47, "time_per_iteration": 2.9034242630004883 }, { "auxiliary_loss_clip": 0.02329709, "auxiliary_loss_mlp": 0.01287748, "balance_loss_clip": 1.47227085, "balance_loss_mlp": 1.05962896, "epoch": 0.005771658750676366, "flos": 13260195383040.0, "grad_norm": 2.298104600796841, "language_loss": 1.0869168, "learning_rate": 2.8044758939680847e-06, "loss": 1.12309146, "num_input_tokens_seen": 1104130, "step": 48, "time_per_iteration": 2.8259263038635254 }, { "auxiliary_loss_clip": 0.02288665, "auxiliary_loss_mlp": 0.01296505, "balance_loss_clip": 1.46353304, "balance_loss_mlp": 1.06533456, "epoch": 0.005891901641315457, "flos": 24425396997120.0, "grad_norm": 3.3437307150582796, "language_loss": 1.01959813, "learning_rate": 2.8194134530738863e-06, "loss": 1.05544972, "num_input_tokens_seen": 1122900, "step": 49, "time_per_iteration": 2.914057493209839 }, { "auxiliary_loss_clip": 0.02263423, "auxiliary_loss_mlp": 0.01294982, "balance_loss_clip": 1.45742893, "balance_loss_mlp": 1.07258475, "epoch": 0.006012144531954548, "flos": 23076197314560.0, "grad_norm": 2.539005595426923, "language_loss": 0.9024263, "learning_rate": 2.834049222568994e-06, "loss": 0.93801039, "num_input_tokens_seen": 1140250, "step": 50, "time_per_iteration": 2.8540079593658447 }, { "auxiliary_loss_clip": 0.02280104, "auxiliary_loss_mlp": 0.01269549, "balance_loss_clip": 1.46038389, "balance_loss_mlp": 1.06527221, "epoch": 0.006132387422593639, "flos": 22528775064960.0, "grad_norm": 2.121873560301535, "language_loss": 0.9251225, "learning_rate": 2.848395155712969e-06, "loss": 0.96061903, "num_input_tokens_seen": 1160470, "step": 51, "time_per_iteration": 2.8759188652038574 }, { "auxiliary_loss_clip": 0.02246212, "auxiliary_loss_mlp": 0.01270323, "balance_loss_clip": 1.45885146, "balance_loss_mlp": 1.06432915, "epoch": 0.00625263031323273, "flos": 27628340751360.0, "grad_norm": 2.553519185204845, "language_loss": 0.97582018, "learning_rate": 2.8624625093687977e-06, "loss": 1.01098549, "num_input_tokens_seen": 1177605, "step": 52, "time_per_iteration": 2.7934319972991943 }, { "auxiliary_loss_clip": 0.0224054, "auxiliary_loss_mlp": 0.0126221, "balance_loss_clip": 1.45193815, "balance_loss_mlp": 1.05297363, "epoch": 0.006372873203871821, "flos": 23110671392640.0, "grad_norm": 2.2530380833144013, "language_loss": 0.88830531, "learning_rate": 2.876261897070029e-06, "loss": 0.92333281, "num_input_tokens_seen": 1197735, "step": 53, "time_per_iteration": 3.0498597621917725 }, { "auxiliary_loss_clip": 0.0222182, "auxiliary_loss_mlp": 0.01261327, "balance_loss_clip": 1.44919372, "balance_loss_mlp": 1.06105566, "epoch": 0.006493116094510912, "flos": 22856028900480.0, "grad_norm": 2.401777300318519, "language_loss": 0.92243266, "learning_rate": 2.889803337127447e-06, "loss": 0.95726418, "num_input_tokens_seen": 1216335, "step": 54, "time_per_iteration": 2.7963247299194336 }, { "auxiliary_loss_clip": 0.02206187, "auxiliary_loss_mlp": 0.01266015, "balance_loss_clip": 1.44370282, "balance_loss_mlp": 1.05964017, "epoch": 0.006613358985150003, "flos": 23071708114560.0, "grad_norm": 2.3859300876222624, "language_loss": 0.84876549, "learning_rate": 2.903096296321516e-06, "loss": 0.88348752, "num_input_tokens_seen": 1234480, "step": 55, "time_per_iteration": 2.797780752182007 }, { "auxiliary_loss_clip": 0.02172939, "auxiliary_loss_mlp": 0.01254272, "balance_loss_clip": 1.43711352, "balance_loss_mlp": 1.05886436, "epoch": 0.006733601875789094, "flos": 26537662229760.0, "grad_norm": 2.1001634179739392, "language_loss": 0.91604102, "learning_rate": 2.9161497296578907e-06, "loss": 0.95031315, "num_input_tokens_seen": 1253870, "step": 56, "time_per_iteration": 2.9598331451416016 }, { "auxiliary_loss_clip": 0.02164301, "auxiliary_loss_mlp": 0.01258346, "balance_loss_clip": 1.43020535, "balance_loss_mlp": 1.06522655, "epoch": 0.006853844766428185, "flos": 15523178083200.0, "grad_norm": 2.215194976434327, "language_loss": 0.8603946, "learning_rate": 2.928972116604173e-06, "loss": 0.89462101, "num_input_tokens_seen": 1270145, "step": 57, "time_per_iteration": 2.871432065963745 }, { "auxiliary_loss_clip": 0.0214085, "auxiliary_loss_mlp": 0.0125286, "balance_loss_clip": 1.42077827, "balance_loss_mlp": 1.06307864, "epoch": 0.006974087657067276, "flos": 24243760897920.0, "grad_norm": 2.0156335405022316, "language_loss": 1.01923656, "learning_rate": 2.9415714941751377e-06, "loss": 1.05317366, "num_input_tokens_seen": 1291365, "step": 58, "time_per_iteration": 2.8330562114715576 }, { "auxiliary_loss_clip": 0.02137264, "auxiliary_loss_mlp": 0.01234336, "balance_loss_clip": 1.42289662, "balance_loss_mlp": 1.05275667, "epoch": 0.007094330547706367, "flos": 25772513690880.0, "grad_norm": 1.8938591915910568, "language_loss": 0.93560141, "learning_rate": 2.9539554871897396e-06, "loss": 0.96931744, "num_input_tokens_seen": 1311535, "step": 59, "time_per_iteration": 2.9713799953460693 }, { "auxiliary_loss_clip": 0.02098636, "auxiliary_loss_mlp": 0.01247756, "balance_loss_clip": 1.41418004, "balance_loss_mlp": 1.06436443, "epoch": 0.007214573438345458, "flos": 21319015979520.0, "grad_norm": 2.1353651673232483, "language_loss": 0.97507596, "learning_rate": 2.9661313359851253e-06, "loss": 1.00853992, "num_input_tokens_seen": 1329420, "step": 60, "time_per_iteration": 2.9260361194610596 }, { "auxiliary_loss_clip": 0.02085896, "auxiliary_loss_mlp": 0.01243967, "balance_loss_clip": 1.40890956, "balance_loss_mlp": 1.07068479, "epoch": 0.007334816328984549, "flos": 24937088192640.0, "grad_norm": 2.1055018418407947, "language_loss": 0.93890011, "learning_rate": 2.978105921839922e-06, "loss": 0.97219872, "num_input_tokens_seen": 1349965, "step": 61, "time_per_iteration": 3.2079946994781494 }, { "auxiliary_loss_clip": 0.02068824, "auxiliary_loss_mlp": 0.0122212, "balance_loss_clip": 1.40309405, "balance_loss_mlp": 1.0543685, "epoch": 0.00745505921962364, "flos": 18510586277760.0, "grad_norm": 2.362703955112609, "language_loss": 0.72291899, "learning_rate": 2.9898857903302893e-06, "loss": 0.75582838, "num_input_tokens_seen": 1368915, "step": 62, "time_per_iteration": 2.854692220687866 }, { "auxiliary_loss_clip": 0.02053978, "auxiliary_loss_mlp": 0.01230505, "balance_loss_clip": 1.39704847, "balance_loss_mlp": 1.06199086, "epoch": 0.007575302110262731, "flos": 18477656484480.0, "grad_norm": 2.6033688101864447, "language_loss": 0.87621224, "learning_rate": 3.001477172817253e-06, "loss": 0.90905702, "num_input_tokens_seen": 1386805, "step": 63, "time_per_iteration": 2.839292287826538 }, { "auxiliary_loss_clip": 0.02035088, "auxiliary_loss_mlp": 0.01219528, "balance_loss_clip": 1.3924197, "balance_loss_mlp": 1.05826211, "epoch": 0.007695545000901822, "flos": 24973178382720.0, "grad_norm": 2.4802408981364645, "language_loss": 0.96154487, "learning_rate": 3.012886006241894e-06, "loss": 0.99409103, "num_input_tokens_seen": 1406190, "step": 64, "time_per_iteration": 2.898923635482788 }, { "auxiliary_loss_clip": 0.02033275, "auxiliary_loss_mlp": 0.01213702, "balance_loss_clip": 1.39344919, "balance_loss_mlp": 1.05663168, "epoch": 0.007815787891540913, "flos": 21324223451520.0, "grad_norm": 2.2258345459190374, "language_loss": 0.88132226, "learning_rate": 3.0241179513858383e-06, "loss": 0.91379201, "num_input_tokens_seen": 1425500, "step": 65, "time_per_iteration": 2.930673837661743 }, { "auxiliary_loss_clip": 0.02013862, "auxiliary_loss_mlp": 0.01228352, "balance_loss_clip": 1.38246441, "balance_loss_mlp": 1.06336641, "epoch": 0.007936030782180003, "flos": 21575777374080.0, "grad_norm": 2.1350493325315916, "language_loss": 0.87790543, "learning_rate": 3.035178409737647e-06, "loss": 0.91032761, "num_input_tokens_seen": 1442950, "step": 66, "time_per_iteration": 3.8758981227874756 }, { "auxiliary_loss_clip": 0.01997916, "auxiliary_loss_mlp": 0.0119593, "balance_loss_clip": 1.37966132, "balance_loss_mlp": 1.05011344, "epoch": 0.008056273672819095, "flos": 20120785159680.0, "grad_norm": 2.3731468169460297, "language_loss": 0.88661158, "learning_rate": 3.046072539090907e-06, "loss": 0.91855001, "num_input_tokens_seen": 1460915, "step": 67, "time_per_iteration": 4.797903299331665 }, { "auxiliary_loss_clip": 0.01976797, "auxiliary_loss_mlp": 0.01202246, "balance_loss_clip": 1.37254548, "balance_loss_mlp": 1.06157875, "epoch": 0.008176516563458186, "flos": 18333116156160.0, "grad_norm": 3.662888951917216, "language_loss": 1.04483604, "learning_rate": 3.056805267986779e-06, "loss": 1.07662642, "num_input_tokens_seen": 1478385, "step": 68, "time_per_iteration": 3.6564290523529053 }, { "auxiliary_loss_clip": 0.01963282, "auxiliary_loss_mlp": 0.01206784, "balance_loss_clip": 1.36568475, "balance_loss_mlp": 1.06220698, "epoch": 0.008296759454097276, "flos": 21872076664320.0, "grad_norm": 2.0516703415677027, "language_loss": 0.95429122, "learning_rate": 3.0673813091022194e-06, "loss": 0.98599184, "num_input_tokens_seen": 1497605, "step": 69, "time_per_iteration": 2.877664566040039 }, { "auxiliary_loss_clip": 0.01783168, "auxiliary_loss_mlp": 0.01193475, "balance_loss_clip": 1.4093554, "balance_loss_mlp": 1.05690885, "epoch": 0.008417002344736368, "flos": 63408228036480.0, "grad_norm": 1.3112540293831965, "language_loss": 0.62117922, "learning_rate": 3.0778051716749317e-06, "loss": 0.65094566, "num_input_tokens_seen": 1561150, "step": 70, "time_per_iteration": 3.382751941680908 }, { "auxiliary_loss_clip": 0.01938984, "auxiliary_loss_mlp": 0.01209784, "balance_loss_clip": 1.35544503, "balance_loss_mlp": 1.06978416, "epoch": 0.008537245235375458, "flos": 22966454286720.0, "grad_norm": 2.283432386402717, "language_loss": 0.90425324, "learning_rate": 3.0880811730470094e-06, "loss": 0.93574101, "num_input_tokens_seen": 1580605, "step": 71, "time_per_iteration": 2.8299033641815186 }, { "auxiliary_loss_clip": 0.01752322, "auxiliary_loss_mlp": 0.01160332, "balance_loss_clip": 1.39341569, "balance_loss_mlp": 1.0313952, "epoch": 0.008657488126014549, "flos": 61984046712960.0, "grad_norm": 1.171437747189355, "language_loss": 0.58690393, "learning_rate": 3.098213449401257e-06, "loss": 0.61603051, "num_input_tokens_seen": 1647535, "step": 72, "time_per_iteration": 3.305851697921753 }, { "auxiliary_loss_clip": 0.01907767, "auxiliary_loss_mlp": 0.01181609, "balance_loss_clip": 1.34569395, "balance_loss_mlp": 1.05286348, "epoch": 0.00877773101665364, "flos": 30296791152000.0, "grad_norm": 2.33621893584454, "language_loss": 0.98638523, "learning_rate": 3.1082059657570015e-06, "loss": 1.01727891, "num_input_tokens_seen": 1666770, "step": 73, "time_per_iteration": 2.9069511890411377 }, { "auxiliary_loss_clip": 0.01881045, "auxiliary_loss_mlp": 0.01189698, "balance_loss_clip": 1.33891714, "balance_loss_mlp": 1.06705499, "epoch": 0.00889797390729273, "flos": 23514056104320.0, "grad_norm": 4.091807571584644, "language_loss": 0.96691203, "learning_rate": 3.1180625252858496e-06, "loss": 0.99761951, "num_input_tokens_seen": 1685200, "step": 74, "time_per_iteration": 2.854128122329712 }, { "auxiliary_loss_clip": 0.01866239, "auxiliary_loss_mlp": 0.0117165, "balance_loss_clip": 1.33517671, "balance_loss_mlp": 1.04900789, "epoch": 0.009018216797931822, "flos": 23075838178560.0, "grad_norm": 3.0707094594819253, "language_loss": 0.80149853, "learning_rate": 3.1277867780021663e-06, "loss": 0.83187747, "num_input_tokens_seen": 1701835, "step": 75, "time_per_iteration": 2.7745285034179688 }, { "auxiliary_loss_clip": 0.01848283, "auxiliary_loss_mlp": 0.01177624, "balance_loss_clip": 1.3272078, "balance_loss_mlp": 1.05717504, "epoch": 0.009138459688570914, "flos": 15918877284480.0, "grad_norm": 1.947709545683033, "language_loss": 0.95480663, "learning_rate": 3.1373822288779824e-06, "loss": 0.98506576, "num_input_tokens_seen": 1718415, "step": 76, "time_per_iteration": 2.8145854473114014 }, { "auxiliary_loss_clip": 0.01854701, "auxiliary_loss_mlp": 0.01177368, "balance_loss_clip": 1.33191991, "balance_loss_mlp": 1.06521583, "epoch": 0.009258702579210003, "flos": 27016531372800.0, "grad_norm": 1.9834128787942806, "language_loss": 0.79539251, "learning_rate": 3.1468522454274533e-06, "loss": 0.82571316, "num_input_tokens_seen": 1738770, "step": 77, "time_per_iteration": 2.856318235397339 }, { "auxiliary_loss_clip": 0.01843248, "auxiliary_loss_mlp": 0.01169944, "balance_loss_clip": 1.32464731, "balance_loss_mlp": 1.05187941, "epoch": 0.009378945469849095, "flos": 26903196984960.0, "grad_norm": 2.869358897674171, "language_loss": 0.91812158, "learning_rate": 3.15620006480197e-06, "loss": 0.94825351, "num_input_tokens_seen": 1758040, "step": 78, "time_per_iteration": 2.7537238597869873 }, { "auxiliary_loss_clip": 0.01844362, "auxiliary_loss_mlp": 0.01166852, "balance_loss_clip": 1.32906866, "balance_loss_mlp": 1.0563693, "epoch": 0.009499188360488187, "flos": 35694236327040.0, "grad_norm": 15.050915885869903, "language_loss": 0.74670494, "learning_rate": 3.1654288004333087e-06, "loss": 0.77681714, "num_input_tokens_seen": 1776705, "step": 79, "time_per_iteration": 2.8749284744262695 }, { "auxiliary_loss_clip": 0.01805544, "auxiliary_loss_mlp": 0.01165912, "balance_loss_clip": 1.31124902, "balance_loss_mlp": 1.05681217, "epoch": 0.009619431251127276, "flos": 21503201944320.0, "grad_norm": 2.3154383860474352, "language_loss": 0.75886965, "learning_rate": 3.1745414482589353e-06, "loss": 0.78858423, "num_input_tokens_seen": 1795915, "step": 80, "time_per_iteration": 2.6829047203063965 }, { "auxiliary_loss_clip": 0.01807058, "auxiliary_loss_mlp": 0.01162513, "balance_loss_clip": 1.3155309, "balance_loss_mlp": 1.05770373, "epoch": 0.009739674141766368, "flos": 17421056991360.0, "grad_norm": 2.457638570543815, "language_loss": 0.86934531, "learning_rate": 3.1835408925606204e-06, "loss": 0.89904106, "num_input_tokens_seen": 1814055, "step": 81, "time_per_iteration": 2.681938648223877 }, { "auxiliary_loss_clip": 0.01789336, "auxiliary_loss_mlp": 0.01155422, "balance_loss_clip": 1.30708194, "balance_loss_mlp": 1.05314064, "epoch": 0.00985991703240546, "flos": 27527109246720.0, "grad_norm": 2.28596350467388, "language_loss": 0.8905437, "learning_rate": 3.1924299114448214e-06, "loss": 0.91999137, "num_input_tokens_seen": 1834535, "step": 82, "time_per_iteration": 2.969802141189575 }, { "auxiliary_loss_clip": 0.01781989, "auxiliary_loss_mlp": 0.01151799, "balance_loss_clip": 1.30663323, "balance_loss_mlp": 1.05061412, "epoch": 0.00998015992304455, "flos": 13808084509440.0, "grad_norm": 2.1136109592028736, "language_loss": 0.83584112, "learning_rate": 3.2012111819909055e-06, "loss": 0.86517894, "num_input_tokens_seen": 1851865, "step": 83, "time_per_iteration": 2.7572309970855713 }, { "auxiliary_loss_clip": 0.01776569, "auxiliary_loss_mlp": 0.01158084, "balance_loss_clip": 1.30134261, "balance_loss_mlp": 1.06128633, "epoch": 0.010100402813683641, "flos": 20191385341440.0, "grad_norm": 2.2702170252346567, "language_loss": 0.9495101, "learning_rate": 3.2098872850910627e-06, "loss": 0.97885668, "num_input_tokens_seen": 1868540, "step": 84, "time_per_iteration": 2.8883330821990967 }, { "auxiliary_loss_clip": 0.01785461, "auxiliary_loss_mlp": 0.01163535, "balance_loss_clip": 1.30605602, "balance_loss_mlp": 1.06778574, "epoch": 0.010220645704322733, "flos": 17201642762880.0, "grad_norm": 2.0581303258922254, "language_loss": 0.89431781, "learning_rate": 3.2184607100038194e-06, "loss": 0.92380774, "num_input_tokens_seen": 1887180, "step": 85, "time_per_iteration": 2.8665943145751953 }, { "auxiliary_loss_clip": 0.01765957, "auxiliary_loss_mlp": 0.01150984, "balance_loss_clip": 1.2992996, "balance_loss_mlp": 1.06396151, "epoch": 0.010340888594961822, "flos": 21470415805440.0, "grad_norm": 2.0774094340109954, "language_loss": 0.9318859, "learning_rate": 3.2269338586412414e-06, "loss": 0.96105534, "num_input_tokens_seen": 1904765, "step": 86, "time_per_iteration": 2.816697835922241 }, { "auxiliary_loss_clip": 0.01743224, "auxiliary_loss_mlp": 0.0114959, "balance_loss_clip": 1.28645968, "balance_loss_mlp": 1.05679739, "epoch": 0.010461131485600914, "flos": 23002831785600.0, "grad_norm": 2.572931391505403, "language_loss": 0.96277022, "learning_rate": 3.2353090496083106e-06, "loss": 0.99169838, "num_input_tokens_seen": 1922600, "step": 87, "time_per_iteration": 2.7933943271636963 }, { "auxiliary_loss_clip": 0.0173763, "auxiliary_loss_mlp": 0.0114468, "balance_loss_clip": 1.28482556, "balance_loss_mlp": 1.05713248, "epoch": 0.010581374376240005, "flos": 33546850571520.0, "grad_norm": 2.0404347768577282, "language_loss": 0.81148064, "learning_rate": 3.2435885220114572e-06, "loss": 0.84030378, "num_input_tokens_seen": 1943950, "step": 88, "time_per_iteration": 2.949205160140991 }, { "auxiliary_loss_clip": 0.01725969, "auxiliary_loss_mlp": 0.01138088, "balance_loss_clip": 1.27929282, "balance_loss_mlp": 1.05626285, "epoch": 0.010701617266879095, "flos": 21763087822080.0, "grad_norm": 2.0617378156859574, "language_loss": 0.93719208, "learning_rate": 3.2517744390519113e-06, "loss": 0.96583265, "num_input_tokens_seen": 1962815, "step": 89, "time_per_iteration": 2.825634241104126 }, { "auxiliary_loss_clip": 0.01720225, "auxiliary_loss_mlp": 0.01133944, "balance_loss_clip": 1.27932894, "balance_loss_mlp": 1.05154669, "epoch": 0.010821860157518187, "flos": 19060199256960.0, "grad_norm": 2.0125441191829574, "language_loss": 0.74850976, "learning_rate": 3.259868891418298e-06, "loss": 0.77705145, "num_input_tokens_seen": 1980580, "step": 90, "time_per_iteration": 2.8090717792510986 }, { "auxiliary_loss_clip": 0.01719674, "auxiliary_loss_mlp": 0.01140068, "balance_loss_clip": 1.27796435, "balance_loss_mlp": 1.0583384, "epoch": 0.010942103048157278, "flos": 25447378757760.0, "grad_norm": 1.982475465063501, "language_loss": 0.84916496, "learning_rate": 3.2678739004917757e-06, "loss": 0.87776244, "num_input_tokens_seen": 2000315, "step": 91, "time_per_iteration": 2.842331647872925 }, { "auxiliary_loss_clip": 0.01709546, "auxiliary_loss_mlp": 0.01129463, "balance_loss_clip": 1.27631843, "balance_loss_mlp": 1.05417073, "epoch": 0.011062345938796368, "flos": 27493928058240.0, "grad_norm": 1.7759373949911144, "language_loss": 0.92175782, "learning_rate": 3.275791421376029e-06, "loss": 0.95014787, "num_input_tokens_seen": 2023760, "step": 92, "time_per_iteration": 3.946227550506592 }, { "auxiliary_loss_clip": 0.01698145, "auxiliary_loss_mlp": 0.01136776, "balance_loss_clip": 1.26935482, "balance_loss_mlp": 1.06443989, "epoch": 0.01118258882943546, "flos": 16071210864000.0, "grad_norm": 2.4656475965603013, "language_loss": 0.95927751, "learning_rate": 3.2836233457634622e-06, "loss": 0.98762679, "num_input_tokens_seen": 2041895, "step": 93, "time_per_iteration": 2.8196961879730225 }, { "auxiliary_loss_clip": 0.01700361, "auxiliary_loss_mlp": 0.0113281, "balance_loss_clip": 1.26682377, "balance_loss_mlp": 1.05937696, "epoch": 0.011302831720074551, "flos": 20668602458880.0, "grad_norm": 2.2017472758015124, "language_loss": 0.8547945, "learning_rate": 3.2913715046481135e-06, "loss": 0.88312614, "num_input_tokens_seen": 2061640, "step": 94, "time_per_iteration": 4.754944324493408 }, { "auxiliary_loss_clip": 0.01681891, "auxiliary_loss_mlp": 0.01115872, "balance_loss_clip": 1.26045489, "balance_loss_mlp": 1.04506171, "epoch": 0.011423074610713641, "flos": 13072238490240.0, "grad_norm": 2.3088488900392665, "language_loss": 0.88925433, "learning_rate": 3.299037670895023e-06, "loss": 0.91723192, "num_input_tokens_seen": 2078255, "step": 95, "time_per_iteration": 2.8999264240264893 }, { "auxiliary_loss_clip": 0.01679649, "auxiliary_loss_mlp": 0.01116165, "balance_loss_clip": 1.25485086, "balance_loss_mlp": 1.04945564, "epoch": 0.011543317501352733, "flos": 30335646689280.0, "grad_norm": 2.0562111507023673, "language_loss": 0.80218911, "learning_rate": 3.3066235616750667e-06, "loss": 0.83014727, "num_input_tokens_seen": 2099490, "step": 96, "time_per_iteration": 2.9678542613983154 }, { "auxiliary_loss_clip": 0.01657413, "auxiliary_loss_mlp": 0.01113427, "balance_loss_clip": 1.25130641, "balance_loss_mlp": 1.04857683, "epoch": 0.011663560391991824, "flos": 15522962601600.0, "grad_norm": 2.352959931248121, "language_loss": 0.92383903, "learning_rate": 3.3141308407736276e-06, "loss": 0.95154744, "num_input_tokens_seen": 2116125, "step": 97, "time_per_iteration": 2.9431073665618896 }, { "auxiliary_loss_clip": 0.0165958, "auxiliary_loss_mlp": 0.01128718, "balance_loss_clip": 1.24800241, "balance_loss_mlp": 1.06410646, "epoch": 0.011783803282630914, "flos": 19902125116800.0, "grad_norm": 2.014775478477277, "language_loss": 0.86724949, "learning_rate": 3.321561120780869e-06, "loss": 0.89513242, "num_input_tokens_seen": 2134835, "step": 98, "time_per_iteration": 2.887434959411621 }, { "auxiliary_loss_clip": 0.01650592, "auxiliary_loss_mlp": 0.01119005, "balance_loss_clip": 1.24637866, "balance_loss_mlp": 1.05787432, "epoch": 0.011904046173270006, "flos": 22340674517760.0, "grad_norm": 2.2020641853907517, "language_loss": 1.01383877, "learning_rate": 3.3289159651708192e-06, "loss": 1.04153466, "num_input_tokens_seen": 2152410, "step": 99, "time_per_iteration": 2.9292523860931396 }, { "auxiliary_loss_clip": 0.01649363, "auxiliary_loss_mlp": 0.0111427, "balance_loss_clip": 1.24714684, "balance_loss_mlp": 1.05361629, "epoch": 0.012024289063909096, "flos": 19100060375040.0, "grad_norm": 2.179403670663143, "language_loss": 0.97495639, "learning_rate": 3.3361968902759768e-06, "loss": 1.00259268, "num_input_tokens_seen": 2172090, "step": 100, "time_per_iteration": 2.917595863342285 }, { "auxiliary_loss_clip": 0.01628827, "auxiliary_loss_mlp": 0.01114397, "balance_loss_clip": 1.23518527, "balance_loss_mlp": 1.05488825, "epoch": 0.012144531954548187, "flos": 15012205159680.0, "grad_norm": 2.3609805264819257, "language_loss": 0.93646479, "learning_rate": 3.343405367163663e-06, "loss": 0.96389699, "num_input_tokens_seen": 2189020, "step": 101, "time_per_iteration": 3.0340323448181152 }, { "auxiliary_loss_clip": 0.01618673, "auxiliary_loss_mlp": 0.01107611, "balance_loss_clip": 1.23096919, "balance_loss_mlp": 1.05379951, "epoch": 0.012264774845187279, "flos": 15122020014720.0, "grad_norm": 3.844212241035829, "language_loss": 0.80961734, "learning_rate": 3.350542823419951e-06, "loss": 0.83688015, "num_input_tokens_seen": 2205620, "step": 102, "time_per_iteration": 2.863964080810547 }, { "auxiliary_loss_clip": 0.0161837, "auxiliary_loss_mlp": 0.01118917, "balance_loss_clip": 1.23053753, "balance_loss_mlp": 1.06331778, "epoch": 0.012385017735826368, "flos": 13949248959360.0, "grad_norm": 3.3718106306784406, "language_loss": 0.87442303, "learning_rate": 3.3576106448465615e-06, "loss": 0.90179592, "num_input_tokens_seen": 2219000, "step": 103, "time_per_iteration": 3.0117757320404053 }, { "auxiliary_loss_clip": 0.0161988, "auxiliary_loss_mlp": 0.01118772, "balance_loss_clip": 1.2264924, "balance_loss_mlp": 1.06312466, "epoch": 0.01250526062646546, "flos": 23623260428160.0, "grad_norm": 2.1873757980323822, "language_loss": 0.8826037, "learning_rate": 3.3646101770757797e-06, "loss": 0.90999019, "num_input_tokens_seen": 2237790, "step": 104, "time_per_iteration": 3.094442367553711 }, { "auxiliary_loss_clip": 0.01607943, "auxiliary_loss_mlp": 0.01109079, "balance_loss_clip": 1.22631788, "balance_loss_mlp": 1.05631709, "epoch": 0.012625503517104552, "flos": 34640078958720.0, "grad_norm": 1.9907310622882193, "language_loss": 0.85675466, "learning_rate": 3.371542727108104e-06, "loss": 0.88392484, "num_input_tokens_seen": 2259965, "step": 105, "time_per_iteration": 3.135592222213745 }, { "auxiliary_loss_clip": 0.01609238, "auxiliary_loss_mlp": 0.01112593, "balance_loss_clip": 1.22191572, "balance_loss_mlp": 1.05992675, "epoch": 0.012745746407743641, "flos": 17821891837440.0, "grad_norm": 8.5359428410745, "language_loss": 0.89892548, "learning_rate": 3.3784095647770114e-06, "loss": 0.92614377, "num_input_tokens_seen": 2278610, "step": 106, "time_per_iteration": 2.9435110092163086 }, { "auxiliary_loss_clip": 0.01597149, "auxiliary_loss_mlp": 0.01111246, "balance_loss_clip": 1.21515703, "balance_loss_mlp": 1.05619562, "epoch": 0.012865989298382733, "flos": 20595057361920.0, "grad_norm": 1.9533495183186578, "language_loss": 0.88561028, "learning_rate": 3.3852119241449547e-06, "loss": 0.91269422, "num_input_tokens_seen": 2297730, "step": 107, "time_per_iteration": 2.9650747776031494 }, { "auxiliary_loss_clip": 0.01591744, "auxiliary_loss_mlp": 0.01107736, "balance_loss_clip": 1.21159911, "balance_loss_mlp": 1.06110132, "epoch": 0.012986232189021825, "flos": 23948969978880.0, "grad_norm": 3.471173636601977, "language_loss": 0.96278369, "learning_rate": 3.3919510048344295e-06, "loss": 0.98977852, "num_input_tokens_seen": 2315740, "step": 108, "time_per_iteration": 2.9390201568603516 }, { "auxiliary_loss_clip": 0.01576125, "auxiliary_loss_mlp": 0.01103272, "balance_loss_clip": 1.20501542, "balance_loss_mlp": 1.05315685, "epoch": 0.013106475079660914, "flos": 23725425686400.0, "grad_norm": 3.083095720731453, "language_loss": 0.86733073, "learning_rate": 3.3986279732976907e-06, "loss": 0.89412475, "num_input_tokens_seen": 2334215, "step": 109, "time_per_iteration": 2.8686563968658447 }, { "auxiliary_loss_clip": 0.0157439, "auxiliary_loss_mlp": 0.01078175, "balance_loss_clip": 1.20360136, "balance_loss_mlp": 1.03320932, "epoch": 0.013226717970300006, "flos": 21102438925440.0, "grad_norm": 2.0767055122512605, "language_loss": 0.95307249, "learning_rate": 3.4052439640284983e-06, "loss": 0.97959816, "num_input_tokens_seen": 2353130, "step": 110, "time_per_iteration": 2.7682225704193115 }, { "auxiliary_loss_clip": 0.01566163, "auxiliary_loss_mlp": 0.01090116, "balance_loss_clip": 1.20040119, "balance_loss_mlp": 1.04743886, "epoch": 0.013346960860939098, "flos": 24863902231680.0, "grad_norm": 1.8833394320244652, "language_loss": 0.81225985, "learning_rate": 3.4118000807190217e-06, "loss": 0.8388226, "num_input_tokens_seen": 2374010, "step": 111, "time_per_iteration": 2.8246755599975586 }, { "auxiliary_loss_clip": 0.01569366, "auxiliary_loss_mlp": 0.01106493, "balance_loss_clip": 1.19975197, "balance_loss_mlp": 1.05921423, "epoch": 0.013467203751578187, "flos": 28181940140160.0, "grad_norm": 1.6462364361892563, "language_loss": 0.76067609, "learning_rate": 3.4182973973648723e-06, "loss": 0.78743464, "num_input_tokens_seen": 2395220, "step": 112, "time_per_iteration": 2.7980284690856934 }, { "auxiliary_loss_clip": 0.01558801, "auxiliary_loss_mlp": 0.01104865, "balance_loss_clip": 1.19457853, "balance_loss_mlp": 1.05863571, "epoch": 0.013587446642217279, "flos": 18916233546240.0, "grad_norm": 3.394787604953982, "language_loss": 0.94863075, "learning_rate": 3.424736959321014e-06, "loss": 0.97526741, "num_input_tokens_seen": 2413025, "step": 113, "time_per_iteration": 2.8256382942199707 }, { "auxiliary_loss_clip": 0.01559872, "auxiliary_loss_mlp": 0.01098674, "balance_loss_clip": 1.19667745, "balance_loss_mlp": 1.05499601, "epoch": 0.01370768953285637, "flos": 23988615615360.0, "grad_norm": 1.901973391649418, "language_loss": 0.88773286, "learning_rate": 3.431119784311155e-06, "loss": 0.91431832, "num_input_tokens_seen": 2432700, "step": 114, "time_per_iteration": 2.8701696395874023 }, { "auxiliary_loss_clip": 0.01545032, "auxiliary_loss_mlp": 0.01079572, "balance_loss_clip": 1.18943, "balance_loss_mlp": 1.04044747, "epoch": 0.01382793242349546, "flos": 39202565512320.0, "grad_norm": 2.1527019786214994, "language_loss": 0.77658367, "learning_rate": 3.43744686339307e-06, "loss": 0.80282974, "num_input_tokens_seen": 2455020, "step": 115, "time_per_iteration": 2.93011212348938 }, { "auxiliary_loss_clip": 0.01547355, "auxiliary_loss_mlp": 0.01082829, "balance_loss_clip": 1.18514788, "balance_loss_mlp": 1.0420835, "epoch": 0.013948175314134552, "flos": 41353506714240.0, "grad_norm": 2.155507109342344, "language_loss": 0.90846664, "learning_rate": 3.44371916188212e-06, "loss": 0.9347685, "num_input_tokens_seen": 2475775, "step": 116, "time_per_iteration": 3.031212329864502 }, { "auxiliary_loss_clip": 0.01540892, "auxiliary_loss_mlp": 0.01091971, "balance_loss_clip": 1.18246925, "balance_loss_mlp": 1.05020022, "epoch": 0.014068418204773643, "flos": 22453542028800.0, "grad_norm": 2.515884576721947, "language_loss": 0.86120546, "learning_rate": 3.449937620235143e-06, "loss": 0.88753408, "num_input_tokens_seen": 2496370, "step": 117, "time_per_iteration": 2.75055193901062 }, { "auxiliary_loss_clip": 0.01532086, "auxiliary_loss_mlp": 0.01097726, "balance_loss_clip": 1.18155766, "balance_loss_mlp": 1.05829155, "epoch": 0.014188661095412733, "flos": 23805147922560.0, "grad_norm": 1.6152434685571957, "language_loss": 0.89283013, "learning_rate": 3.456103154896722e-06, "loss": 0.91912818, "num_input_tokens_seen": 2517645, "step": 118, "time_per_iteration": 3.817178726196289 }, { "auxiliary_loss_clip": 0.01523611, "auxiliary_loss_mlp": 0.01097407, "balance_loss_clip": 1.17816901, "balance_loss_mlp": 1.05787742, "epoch": 0.014308903986051825, "flos": 23660248458240.0, "grad_norm": 2.0804984762725893, "language_loss": 0.92290831, "learning_rate": 3.462216659109757e-06, "loss": 0.94911849, "num_input_tokens_seen": 2537825, "step": 119, "time_per_iteration": 3.8003664016723633 }, { "auxiliary_loss_clip": 0.01525992, "auxiliary_loss_mlp": 0.01088293, "balance_loss_clip": 1.17881882, "balance_loss_mlp": 1.05195808, "epoch": 0.014429146876690916, "flos": 20667991927680.0, "grad_norm": 2.7768114635749925, "language_loss": 0.85283077, "learning_rate": 3.4682790036921077e-06, "loss": 0.8789736, "num_input_tokens_seen": 2556485, "step": 120, "time_per_iteration": 3.7602312564849854 }, { "auxiliary_loss_clip": 0.01509532, "auxiliary_loss_mlp": 0.01087468, "balance_loss_clip": 1.16937089, "balance_loss_mlp": 1.05232525, "epoch": 0.014549389767330006, "flos": 20229199384320.0, "grad_norm": 2.126694838565762, "language_loss": 0.83170217, "learning_rate": 3.4742910377810193e-06, "loss": 0.85767221, "num_input_tokens_seen": 2573945, "step": 121, "time_per_iteration": 2.8198482990264893 }, { "auxiliary_loss_clip": 0.01513981, "auxiliary_loss_mlp": 0.01075418, "balance_loss_clip": 1.17171133, "balance_loss_mlp": 1.03891611, "epoch": 0.014669632657969098, "flos": 18004174381440.0, "grad_norm": 2.345661585606919, "language_loss": 0.88761133, "learning_rate": 3.4802535895469042e-06, "loss": 0.91350532, "num_input_tokens_seen": 2592695, "step": 122, "time_per_iteration": 2.8230888843536377 }, { "auxiliary_loss_clip": 0.01510159, "auxiliary_loss_mlp": 0.01084426, "balance_loss_clip": 1.16765797, "balance_loss_mlp": 1.04744697, "epoch": 0.01478987554860819, "flos": 22741796672640.0, "grad_norm": 2.5498199937514014, "language_loss": 0.89572775, "learning_rate": 3.4861674668779934e-06, "loss": 0.9216736, "num_input_tokens_seen": 2610925, "step": 123, "time_per_iteration": 2.7821788787841797 }, { "auxiliary_loss_clip": 0.01500901, "auxiliary_loss_mlp": 0.01069206, "balance_loss_clip": 1.16391468, "balance_loss_mlp": 1.03506422, "epoch": 0.01491011843924728, "flos": 17198590106880.0, "grad_norm": 1.9648674555996732, "language_loss": 0.84062564, "learning_rate": 3.492033458037272e-06, "loss": 0.86632669, "num_input_tokens_seen": 2629495, "step": 124, "time_per_iteration": 3.1374223232269287 }, { "auxiliary_loss_clip": 0.0150294, "auxiliary_loss_mlp": 0.01071043, "balance_loss_clip": 1.16474628, "balance_loss_mlp": 1.03673458, "epoch": 0.01503036132988637, "flos": 17673867889920.0, "grad_norm": 3.3861179220679785, "language_loss": 0.8692565, "learning_rate": 3.497852332293018e-06, "loss": 0.89499629, "num_input_tokens_seen": 2645070, "step": 125, "time_per_iteration": 2.884185552597046 }, { "auxiliary_loss_clip": 0.01501221, "auxiliary_loss_mlp": 0.0107896, "balance_loss_clip": 1.16382384, "balance_loss_mlp": 1.04803681, "epoch": 0.015150604220525462, "flos": 18878239935360.0, "grad_norm": 1.9272317732018271, "language_loss": 0.96497774, "learning_rate": 3.5036248405242356e-06, "loss": 0.99077952, "num_input_tokens_seen": 2663825, "step": 126, "time_per_iteration": 2.827526092529297 }, { "auxiliary_loss_clip": 0.01501179, "auxiliary_loss_mlp": 0.01081524, "balance_loss_clip": 1.16387129, "balance_loss_mlp": 1.04778731, "epoch": 0.015270847111164552, "flos": 39420184060800.0, "grad_norm": 1.9059687342879121, "language_loss": 0.82785332, "learning_rate": 3.509351715802146e-06, "loss": 0.85368031, "num_input_tokens_seen": 2684710, "step": 127, "time_per_iteration": 2.9764604568481445 }, { "auxiliary_loss_clip": 0.01488506, "auxiliary_loss_mlp": 0.01086115, "balance_loss_clip": 1.1569519, "balance_loss_mlp": 1.0524025, "epoch": 0.015391090001803644, "flos": 43762466286720.0, "grad_norm": 2.1243832330950214, "language_loss": 0.78334767, "learning_rate": 3.5150336739488763e-06, "loss": 0.80909389, "num_input_tokens_seen": 2706995, "step": 128, "time_per_iteration": 3.010080099105835 }, { "auxiliary_loss_clip": 0.01489312, "auxiliary_loss_mlp": 0.01069716, "balance_loss_clip": 1.16090679, "balance_loss_mlp": 1.041273, "epoch": 0.015511332892442733, "flos": 18916341287040.0, "grad_norm": 2.281662691310887, "language_loss": 0.83838475, "learning_rate": 3.5206714140744143e-06, "loss": 0.86397499, "num_input_tokens_seen": 2727050, "step": 129, "time_per_iteration": 2.8404948711395264 }, { "auxiliary_loss_clip": 0.01489672, "auxiliary_loss_mlp": 0.01081931, "balance_loss_clip": 1.16307259, "balance_loss_mlp": 1.0519855, "epoch": 0.015631575783081827, "flos": 24535283679360.0, "grad_norm": 4.512470047939525, "language_loss": 0.87657356, "learning_rate": 3.5262656190928208e-06, "loss": 0.90228963, "num_input_tokens_seen": 2745350, "step": 130, "time_per_iteration": 2.8411309719085693 }, { "auxiliary_loss_clip": 0.01405221, "auxiliary_loss_mlp": 0.01096591, "balance_loss_clip": 1.17936254, "balance_loss_mlp": 1.06588256, "epoch": 0.015751818673720917, "flos": 62328536098560.0, "grad_norm": 1.0470112473331754, "language_loss": 0.71513855, "learning_rate": 3.5318169562186737e-06, "loss": 0.74015665, "num_input_tokens_seen": 2814195, "step": 131, "time_per_iteration": 3.3645803928375244 }, { "auxiliary_loss_clip": 0.01480737, "auxiliary_loss_mlp": 0.01077277, "balance_loss_clip": 1.15788054, "balance_loss_mlp": 1.05138469, "epoch": 0.015872061564360006, "flos": 23878549365120.0, "grad_norm": 7.354892763364748, "language_loss": 0.81998622, "learning_rate": 3.5373260774446292e-06, "loss": 0.84556633, "num_input_tokens_seen": 2834645, "step": 132, "time_per_iteration": 2.8588204383850098 }, { "auxiliary_loss_clip": 0.01472199, "auxiliary_loss_mlp": 0.01067898, "balance_loss_clip": 1.15078843, "balance_loss_mlp": 1.04170752, "epoch": 0.0159923044549991, "flos": 23367899664000.0, "grad_norm": 1.9327874635236526, "language_loss": 0.90098774, "learning_rate": 3.542793620000961e-06, "loss": 0.92638874, "num_input_tokens_seen": 2854120, "step": 133, "time_per_iteration": 2.8278417587280273 }, { "auxiliary_loss_clip": 0.01470067, "auxiliary_loss_mlp": 0.01085602, "balance_loss_clip": 1.15295649, "balance_loss_mlp": 1.05700421, "epoch": 0.01611254734563819, "flos": 17858305249920.0, "grad_norm": 3.230339817463091, "language_loss": 0.86880374, "learning_rate": 3.5482202067978894e-06, "loss": 0.89436042, "num_input_tokens_seen": 2871330, "step": 134, "time_per_iteration": 2.8220551013946533 }, { "auxiliary_loss_clip": 0.01470042, "auxiliary_loss_mlp": 0.01076111, "balance_loss_clip": 1.15019131, "balance_loss_mlp": 1.05086291, "epoch": 0.01623279023627728, "flos": 20954774113920.0, "grad_norm": 1.941718955997415, "language_loss": 0.76157939, "learning_rate": 3.553606446851471e-06, "loss": 0.78704095, "num_input_tokens_seen": 2888070, "step": 135, "time_per_iteration": 2.7709388732910156 }, { "auxiliary_loss_clip": 0.01474535, "auxiliary_loss_mlp": 0.01073724, "balance_loss_clip": 1.15253222, "balance_loss_mlp": 1.04929805, "epoch": 0.016353033126916373, "flos": 15742412743680.0, "grad_norm": 2.1694246647825075, "language_loss": 0.83607149, "learning_rate": 3.5589529356937613e-06, "loss": 0.86155409, "num_input_tokens_seen": 2906465, "step": 136, "time_per_iteration": 2.808544397354126 }, { "auxiliary_loss_clip": 0.01465771, "auxiliary_loss_mlp": 0.01083064, "balance_loss_clip": 1.14579844, "balance_loss_mlp": 1.05627775, "epoch": 0.016473276017555463, "flos": 18807280617600.0, "grad_norm": 2.0041320258243296, "language_loss": 0.77109492, "learning_rate": 3.5642602557679627e-06, "loss": 0.79658329, "num_input_tokens_seen": 2924915, "step": 137, "time_per_iteration": 2.855867862701416 }, { "auxiliary_loss_clip": 0.0146943, "auxiliary_loss_mlp": 0.01081235, "balance_loss_clip": 1.15673423, "balance_loss_mlp": 1.05791807, "epoch": 0.016593518908194552, "flos": 24352641999360.0, "grad_norm": 2.2122183361332857, "language_loss": 0.84191215, "learning_rate": 3.569528976809202e-06, "loss": 0.86741883, "num_input_tokens_seen": 2942130, "step": 138, "time_per_iteration": 2.8571856021881104 }, { "auxiliary_loss_clip": 0.01461878, "auxiliary_loss_mlp": 0.01069839, "balance_loss_clip": 1.14551651, "balance_loss_mlp": 1.04577065, "epoch": 0.016713761798833646, "flos": 22346133384960.0, "grad_norm": 1.6381033341548359, "language_loss": 0.89973754, "learning_rate": 3.5747596562115522e-06, "loss": 0.92505473, "num_input_tokens_seen": 2962745, "step": 139, "time_per_iteration": 2.8138585090637207 }, { "auxiliary_loss_clip": 0.01465311, "auxiliary_loss_mlp": 0.01072614, "balance_loss_clip": 1.14912438, "balance_loss_mlp": 1.04620934, "epoch": 0.016834004689472735, "flos": 17821820010240.0, "grad_norm": 2.7676590601189366, "language_loss": 0.90954685, "learning_rate": 3.5799528393819138e-06, "loss": 0.93492603, "num_input_tokens_seen": 2981825, "step": 140, "time_per_iteration": 2.9601049423217773 }, { "auxiliary_loss_clip": 0.01458671, "auxiliary_loss_mlp": 0.0105828, "balance_loss_clip": 1.14633071, "balance_loss_mlp": 1.03489137, "epoch": 0.016954247580111825, "flos": 20519501103360.0, "grad_norm": 1.910920404567093, "language_loss": 0.8799212, "learning_rate": 3.585109060081286e-06, "loss": 0.90509075, "num_input_tokens_seen": 3001625, "step": 141, "time_per_iteration": 2.7530276775360107 }, { "auxiliary_loss_clip": 0.01452697, "auxiliary_loss_mlp": 0.01071222, "balance_loss_clip": 1.140885, "balance_loss_mlp": 1.0459137, "epoch": 0.017074490470750915, "flos": 22088869200000.0, "grad_norm": 1.924673396026974, "language_loss": 0.78502619, "learning_rate": 3.590228840753992e-06, "loss": 0.81026542, "num_input_tokens_seen": 3022055, "step": 142, "time_per_iteration": 2.771705389022827 }, { "auxiliary_loss_clip": 0.01450587, "auxiliary_loss_mlp": 0.01068869, "balance_loss_clip": 1.14321744, "balance_loss_mlp": 1.04685104, "epoch": 0.01719473336139001, "flos": 15997270717440.0, "grad_norm": 2.4113029719914483, "language_loss": 0.87520206, "learning_rate": 3.5953126928453423e-06, "loss": 0.90039665, "num_input_tokens_seen": 3039605, "step": 143, "time_per_iteration": 2.724475860595703 }, { "auxiliary_loss_clip": 0.01450361, "auxiliary_loss_mlp": 0.01078619, "balance_loss_clip": 1.14039803, "balance_loss_mlp": 1.05749464, "epoch": 0.017314976252029098, "flos": 22492038430080.0, "grad_norm": 2.1893314532138235, "language_loss": 0.80578887, "learning_rate": 3.600361117108239e-06, "loss": 0.83107865, "num_input_tokens_seen": 3059405, "step": 144, "time_per_iteration": 3.7564456462860107 }, { "auxiliary_loss_clip": 0.01446321, "auxiliary_loss_mlp": 0.01072348, "balance_loss_clip": 1.13807535, "balance_loss_mlp": 1.05041409, "epoch": 0.017435219142668188, "flos": 22018053536640.0, "grad_norm": 2.8018449633652587, "language_loss": 0.9730289, "learning_rate": 3.6053746038991616e-06, "loss": 0.99821568, "num_input_tokens_seen": 3078490, "step": 145, "time_per_iteration": 2.800995111465454 }, { "auxiliary_loss_clip": 0.01354881, "auxiliary_loss_mlp": 0.01027184, "balance_loss_clip": 1.14543271, "balance_loss_mlp": 1.00725198, "epoch": 0.01755546203330728, "flos": 72240526149120.0, "grad_norm": 1.0524961718906605, "language_loss": 0.58465552, "learning_rate": 3.6103536334639843e-06, "loss": 0.60847616, "num_input_tokens_seen": 3131755, "step": 146, "time_per_iteration": 4.970015287399292 }, { "auxiliary_loss_clip": 0.01437244, "auxiliary_loss_mlp": 0.01066177, "balance_loss_clip": 1.13468313, "balance_loss_mlp": 1.04539835, "epoch": 0.01767570492394637, "flos": 25337061112320.0, "grad_norm": 2.1058019384327302, "language_loss": 0.85418963, "learning_rate": 3.615298676214041e-06, "loss": 0.87922388, "num_input_tokens_seen": 3152035, "step": 147, "time_per_iteration": 3.699542284011841 }, { "auxiliary_loss_clip": 0.01440453, "auxiliary_loss_mlp": 0.01070402, "balance_loss_clip": 1.13720393, "balance_loss_mlp": 1.04976666, "epoch": 0.01779594781458546, "flos": 20449188230400.0, "grad_norm": 2.046749943377627, "language_loss": 0.89099574, "learning_rate": 3.6202101929928317e-06, "loss": 0.91610432, "num_input_tokens_seen": 3170625, "step": 148, "time_per_iteration": 2.800687313079834 }, { "auxiliary_loss_clip": 0.01441264, "auxiliary_loss_mlp": 0.01064125, "balance_loss_clip": 1.13609171, "balance_loss_mlp": 1.04478979, "epoch": 0.017916190705224554, "flos": 16253601148800.0, "grad_norm": 2.1072054355984555, "language_loss": 0.88344449, "learning_rate": 3.6250886353337413e-06, "loss": 0.90849835, "num_input_tokens_seen": 3188155, "step": 149, "time_per_iteration": 2.749575614929199 }, { "auxiliary_loss_clip": 0.01437429, "auxiliary_loss_mlp": 0.0105909, "balance_loss_clip": 1.135849, "balance_loss_mlp": 1.03925323, "epoch": 0.018036433595863644, "flos": 23330588411520.0, "grad_norm": 1.9737595196182178, "language_loss": 0.86308873, "learning_rate": 3.6299344457091488e-06, "loss": 0.88805389, "num_input_tokens_seen": 3209015, "step": 150, "time_per_iteration": 2.790248394012451 }, { "auxiliary_loss_clip": 0.01439488, "auxiliary_loss_mlp": 0.01078192, "balance_loss_clip": 1.13671196, "balance_loss_mlp": 1.05781925, "epoch": 0.018156676486502734, "flos": 18588010043520.0, "grad_norm": 2.238573141532112, "language_loss": 0.94059765, "learning_rate": 3.634748057771256e-06, "loss": 0.96577442, "num_input_tokens_seen": 3224955, "step": 151, "time_per_iteration": 2.7550957202911377 }, { "auxiliary_loss_clip": 0.01427599, "auxiliary_loss_mlp": 0.0105804, "balance_loss_clip": 1.1266582, "balance_loss_mlp": 1.03970575, "epoch": 0.018276919377141827, "flos": 25448707560960.0, "grad_norm": 1.5915521445320684, "language_loss": 0.85643578, "learning_rate": 3.639529896584965e-06, "loss": 0.88129216, "num_input_tokens_seen": 3246330, "step": 152, "time_per_iteration": 2.820107936859131 }, { "auxiliary_loss_clip": 0.01430669, "auxiliary_loss_mlp": 0.01059875, "balance_loss_clip": 1.13322699, "balance_loss_mlp": 1.0415405, "epoch": 0.018397162267780917, "flos": 20047311889920.0, "grad_norm": 2.769260137492558, "language_loss": 0.88744026, "learning_rate": 3.6442803788531233e-06, "loss": 0.91234577, "num_input_tokens_seen": 3264290, "step": 153, "time_per_iteration": 2.7013626098632812 }, { "auxiliary_loss_clip": 0.01431323, "auxiliary_loss_mlp": 0.01061977, "balance_loss_clip": 1.1296134, "balance_loss_mlp": 1.04326081, "epoch": 0.018517405158420007, "flos": 27565282425600.0, "grad_norm": 3.258863247413859, "language_loss": 0.95886743, "learning_rate": 3.6489999131344357e-06, "loss": 0.98380041, "num_input_tokens_seen": 3287065, "step": 154, "time_per_iteration": 2.830514907836914 }, { "auxiliary_loss_clip": 0.0142769, "auxiliary_loss_mlp": 0.01065261, "balance_loss_clip": 1.134233, "balance_loss_mlp": 1.04895306, "epoch": 0.0186376480490591, "flos": 19354056422400.0, "grad_norm": 1.9841251633011139, "language_loss": 0.907013, "learning_rate": 3.653688900054313e-06, "loss": 0.93194246, "num_input_tokens_seen": 3305595, "step": 155, "time_per_iteration": 2.7411725521087646 }, { "auxiliary_loss_clip": 0.01423214, "auxiliary_loss_mlp": 0.01060383, "balance_loss_clip": 1.12524366, "balance_loss_mlp": 1.04120255, "epoch": 0.01875789093969819, "flos": 26687840993280.0, "grad_norm": 2.690586605004093, "language_loss": 0.75845289, "learning_rate": 3.6583477325089526e-06, "loss": 0.78328884, "num_input_tokens_seen": 3326135, "step": 156, "time_per_iteration": 2.8148012161254883 }, { "auxiliary_loss_clip": 0.01421321, "auxiliary_loss_mlp": 0.01059233, "balance_loss_clip": 1.12792981, "balance_loss_mlp": 1.04157853, "epoch": 0.01887813383033728, "flos": 24353001135360.0, "grad_norm": 2.352809953029022, "language_loss": 1.04187012, "learning_rate": 3.6629767958628916e-06, "loss": 1.06667566, "num_input_tokens_seen": 3343510, "step": 157, "time_per_iteration": 2.8264880180358887 }, { "auxiliary_loss_clip": 0.01421375, "auxiliary_loss_mlp": 0.01059967, "balance_loss_clip": 1.1273632, "balance_loss_mlp": 1.04277694, "epoch": 0.018998376720976373, "flos": 14647532330880.0, "grad_norm": 2.1713064446657246, "language_loss": 0.854738, "learning_rate": 3.667576468140291e-06, "loss": 0.87955135, "num_input_tokens_seen": 3361325, "step": 158, "time_per_iteration": 2.7602081298828125 }, { "auxiliary_loss_clip": 0.01414862, "auxiliary_loss_mlp": 0.01051776, "balance_loss_clip": 1.12403047, "balance_loss_mlp": 1.03544462, "epoch": 0.019118619611615463, "flos": 29305261146240.0, "grad_norm": 2.6060873271372436, "language_loss": 0.88996333, "learning_rate": 3.672147120210184e-06, "loss": 0.9146297, "num_input_tokens_seen": 3377925, "step": 159, "time_per_iteration": 2.8499960899353027 }, { "auxiliary_loss_clip": 0.01412016, "auxiliary_loss_mlp": 0.01060039, "balance_loss_clip": 1.12487054, "balance_loss_mlp": 1.04268193, "epoch": 0.019238862502254553, "flos": 20886723797760.0, "grad_norm": 1.9559202045059678, "language_loss": 0.86516941, "learning_rate": 3.6766891159659177e-06, "loss": 0.88988996, "num_input_tokens_seen": 3396335, "step": 160, "time_per_iteration": 2.767106533050537 }, { "auxiliary_loss_clip": 0.01411421, "auxiliary_loss_mlp": 0.01061497, "balance_loss_clip": 1.12399995, "balance_loss_mlp": 1.04452109, "epoch": 0.019359105392893646, "flos": 21360672777600.0, "grad_norm": 2.81894709315836, "language_loss": 0.87793481, "learning_rate": 3.6812028124990075e-06, "loss": 0.90266395, "num_input_tokens_seen": 3413605, "step": 161, "time_per_iteration": 2.7439303398132324 }, { "auxiliary_loss_clip": 0.01411097, "auxiliary_loss_mlp": 0.01052106, "balance_loss_clip": 1.1228044, "balance_loss_mlp": 1.03483212, "epoch": 0.019479348283532736, "flos": 16283729681280.0, "grad_norm": 2.8074200011211827, "language_loss": 0.81778455, "learning_rate": 3.6856885602676016e-06, "loss": 0.84241652, "num_input_tokens_seen": 3429640, "step": 162, "time_per_iteration": 2.6884756088256836 }, { "auxiliary_loss_clip": 0.01413507, "auxiliary_loss_mlp": 0.01050099, "balance_loss_clip": 1.12342739, "balance_loss_mlp": 1.03429198, "epoch": 0.019599591174171826, "flos": 22091239497600.0, "grad_norm": 2.2076073146283215, "language_loss": 0.94065458, "learning_rate": 3.6901467032597733e-06, "loss": 0.96529067, "num_input_tokens_seen": 3448125, "step": 163, "time_per_iteration": 2.842823028564453 }, { "auxiliary_loss_clip": 0.01410813, "auxiliary_loss_mlp": 0.01052435, "balance_loss_clip": 1.12379646, "balance_loss_mlp": 1.03503036, "epoch": 0.01971983406481092, "flos": 19609668581760.0, "grad_norm": 2.1869274119184716, "language_loss": 0.87030613, "learning_rate": 3.694577579151804e-06, "loss": 0.89493859, "num_input_tokens_seen": 3466535, "step": 164, "time_per_iteration": 2.736877202987671 }, { "auxiliary_loss_clip": 0.01406862, "auxiliary_loss_mlp": 0.0104722, "balance_loss_clip": 1.12166619, "balance_loss_mlp": 1.03223562, "epoch": 0.01984007695545001, "flos": 19099342103040.0, "grad_norm": 2.4404651327452744, "language_loss": 0.73719752, "learning_rate": 3.6989815194616703e-06, "loss": 0.76173836, "num_input_tokens_seen": 3483730, "step": 165, "time_per_iteration": 2.7721731662750244 }, { "auxiliary_loss_clip": 0.01408646, "auxiliary_loss_mlp": 0.01055466, "balance_loss_clip": 1.11953568, "balance_loss_mlp": 1.03801417, "epoch": 0.0199603198460891, "flos": 20848406964480.0, "grad_norm": 4.857659726631675, "language_loss": 0.8002311, "learning_rate": 3.703358849697888e-06, "loss": 0.82487226, "num_input_tokens_seen": 3503640, "step": 166, "time_per_iteration": 2.7634975910186768 }, { "auxiliary_loss_clip": 0.01406838, "auxiliary_loss_mlp": 0.01052504, "balance_loss_clip": 1.12337053, "balance_loss_mlp": 1.03816307, "epoch": 0.020080562736728192, "flos": 21870747861120.0, "grad_norm": 2.1875892594009785, "language_loss": 0.82706326, "learning_rate": 3.7077098895038803e-06, "loss": 0.85165668, "num_input_tokens_seen": 3523010, "step": 167, "time_per_iteration": 2.749976873397827 }, { "auxiliary_loss_clip": 0.01406452, "auxiliary_loss_mlp": 0.01048675, "balance_loss_clip": 1.12160575, "balance_loss_mlp": 1.03306484, "epoch": 0.020200805627367282, "flos": 21688788539520.0, "grad_norm": 2.4741749559642185, "language_loss": 0.96855152, "learning_rate": 3.712034952798045e-06, "loss": 0.99310279, "num_input_tokens_seen": 3541125, "step": 168, "time_per_iteration": 2.7482552528381348 }, { "auxiliary_loss_clip": 0.01406778, "auxiliary_loss_mlp": 0.01057899, "balance_loss_clip": 1.12008226, "balance_loss_mlp": 1.03841996, "epoch": 0.02032104851800637, "flos": 33543043729920.0, "grad_norm": 3.279659660358365, "language_loss": 0.84822339, "learning_rate": 3.7163343479096656e-06, "loss": 0.87287021, "num_input_tokens_seen": 3562700, "step": 169, "time_per_iteration": 2.939336061477661 }, { "auxiliary_loss_clip": 0.01401406, "auxiliary_loss_mlp": 0.01058633, "balance_loss_clip": 1.12103868, "balance_loss_mlp": 1.0429213, "epoch": 0.020441291408645465, "flos": 31686965274240.0, "grad_norm": 2.339435141336895, "language_loss": 0.83053195, "learning_rate": 3.720608377710802e-06, "loss": 0.85513234, "num_input_tokens_seen": 3582790, "step": 170, "time_per_iteration": 2.875774383544922 }, { "auxiliary_loss_clip": 0.0139467, "auxiliary_loss_mlp": 0.01046045, "balance_loss_clip": 1.11312723, "balance_loss_mlp": 1.02862847, "epoch": 0.020561534299284555, "flos": 20886687884160.0, "grad_norm": 2.4586957232903157, "language_loss": 0.86451703, "learning_rate": 3.7248573397443277e-06, "loss": 0.88892424, "num_input_tokens_seen": 3601715, "step": 171, "time_per_iteration": 3.7218985557556152 }, { "auxiliary_loss_clip": 0.01399487, "auxiliary_loss_mlp": 0.01068393, "balance_loss_clip": 1.11956608, "balance_loss_mlp": 1.05225229, "epoch": 0.020681777189923645, "flos": 20996610480000.0, "grad_norm": 2.3911632665998077, "language_loss": 0.97546595, "learning_rate": 3.729081526348224e-06, "loss": 1.00014472, "num_input_tokens_seen": 3620245, "step": 172, "time_per_iteration": 4.51881217956543 }, { "auxiliary_loss_clip": 0.01397606, "auxiliary_loss_mlp": 0.01056997, "balance_loss_clip": 1.11667407, "balance_loss_mlp": 1.03977156, "epoch": 0.020802020080562738, "flos": 28257532312320.0, "grad_norm": 2.3935862268171713, "language_loss": 0.84945768, "learning_rate": 3.7332812247762777e-06, "loss": 0.87400365, "num_input_tokens_seen": 3641545, "step": 173, "time_per_iteration": 3.668985605239868 }, { "auxiliary_loss_clip": 0.01398223, "auxiliary_loss_mlp": 0.01062681, "balance_loss_clip": 1.11892092, "balance_loss_mlp": 1.04707646, "epoch": 0.020922262971201828, "flos": 19681274344320.0, "grad_norm": 2.677966740313008, "language_loss": 0.9557116, "learning_rate": 3.737456717315293e-06, "loss": 0.98032069, "num_input_tokens_seen": 3660510, "step": 174, "time_per_iteration": 2.7798876762390137 }, { "auxiliary_loss_clip": 0.01389692, "auxiliary_loss_mlp": 0.01061838, "balance_loss_clip": 1.11586213, "balance_loss_mlp": 1.04701412, "epoch": 0.021042505861840918, "flos": 15666353694720.0, "grad_norm": 2.584713260868514, "language_loss": 0.90972525, "learning_rate": 3.7416082813989552e-06, "loss": 0.93424058, "num_input_tokens_seen": 3677505, "step": 175, "time_per_iteration": 2.7605347633361816 }, { "auxiliary_loss_clip": 0.01396952, "auxiliary_loss_mlp": 0.0105995, "balance_loss_clip": 1.11871052, "balance_loss_mlp": 1.04347563, "epoch": 0.02116274875248001, "flos": 21142012734720.0, "grad_norm": 2.2019067971469384, "language_loss": 0.89728659, "learning_rate": 3.745736189718439e-06, "loss": 0.92185569, "num_input_tokens_seen": 3696760, "step": 176, "time_per_iteration": 2.8059604167938232 }, { "auxiliary_loss_clip": 0.01386524, "auxiliary_loss_mlp": 0.01057998, "balance_loss_clip": 1.11310744, "balance_loss_mlp": 1.04185688, "epoch": 0.0212829916431191, "flos": 24715770543360.0, "grad_norm": 5.083658055104306, "language_loss": 0.72672617, "learning_rate": 3.749840710329894e-06, "loss": 0.75117135, "num_input_tokens_seen": 3717465, "step": 177, "time_per_iteration": 2.905709743499756 }, { "auxiliary_loss_clip": 0.01400805, "auxiliary_loss_mlp": 0.01062535, "balance_loss_clip": 1.1187886, "balance_loss_mlp": 1.04449844, "epoch": 0.02140323453375819, "flos": 16645493508480.0, "grad_norm": 3.2270729314001505, "language_loss": 0.98196453, "learning_rate": 3.7539221067588938e-06, "loss": 1.006598, "num_input_tokens_seen": 3731440, "step": 178, "time_per_iteration": 2.9785890579223633 }, { "auxiliary_loss_clip": 0.01393212, "auxiliary_loss_mlp": 0.0106083, "balance_loss_clip": 1.11594725, "balance_loss_mlp": 1.0431633, "epoch": 0.021523477424397284, "flos": 20299332689280.0, "grad_norm": 3.0130927826453724, "language_loss": 0.9399097, "learning_rate": 3.757980638101964e-06, "loss": 0.96445012, "num_input_tokens_seen": 3744935, "step": 179, "time_per_iteration": 2.7836740016937256 }, { "auxiliary_loss_clip": 0.01396677, "auxiliary_loss_mlp": 0.01055619, "balance_loss_clip": 1.11702013, "balance_loss_mlp": 1.03856063, "epoch": 0.021643720315036374, "flos": 26104005331200.0, "grad_norm": 2.3515788374465996, "language_loss": 0.89665902, "learning_rate": 3.7620165591252806e-06, "loss": 0.92118204, "num_input_tokens_seen": 3763035, "step": 180, "time_per_iteration": 2.808241605758667 }, { "auxiliary_loss_clip": 0.01390258, "auxiliary_loss_mlp": 0.01049499, "balance_loss_clip": 1.11372483, "balance_loss_mlp": 1.03439546, "epoch": 0.021763963205675464, "flos": 24787663614720.0, "grad_norm": 2.061116092992959, "language_loss": 0.94357014, "learning_rate": 3.766030120360636e-06, "loss": 0.96796769, "num_input_tokens_seen": 3782665, "step": 181, "time_per_iteration": 2.8364601135253906 }, { "auxiliary_loss_clip": 0.01383954, "auxiliary_loss_mlp": 0.01059256, "balance_loss_clip": 1.11025405, "balance_loss_mlp": 1.04348516, "epoch": 0.021884206096314557, "flos": 25813559957760.0, "grad_norm": 2.1774312005120016, "language_loss": 0.90386599, "learning_rate": 3.7700215681987578e-06, "loss": 0.92829812, "num_input_tokens_seen": 3802435, "step": 182, "time_per_iteration": 2.8307816982269287 }, { "auxiliary_loss_clip": 0.01388904, "auxiliary_loss_mlp": 0.01062942, "balance_loss_clip": 1.11349559, "balance_loss_mlp": 1.04580021, "epoch": 0.022004448986953647, "flos": 20082719721600.0, "grad_norm": 2.106698207036957, "language_loss": 0.82442737, "learning_rate": 3.7739911449800767e-06, "loss": 0.84894586, "num_input_tokens_seen": 3822490, "step": 183, "time_per_iteration": 2.771064519882202 }, { "auxiliary_loss_clip": 0.01385641, "auxiliary_loss_mlp": 0.01063684, "balance_loss_clip": 1.11253738, "balance_loss_mlp": 1.04848528, "epoch": 0.022124691877592736, "flos": 20480609652480.0, "grad_norm": 10.01806930537304, "language_loss": 0.80700278, "learning_rate": 3.7779390890830114e-06, "loss": 0.831496, "num_input_tokens_seen": 3841140, "step": 184, "time_per_iteration": 2.750239610671997 }, { "auxiliary_loss_clip": 0.01387648, "auxiliary_loss_mlp": 0.01059842, "balance_loss_clip": 1.11061227, "balance_loss_mlp": 1.04320073, "epoch": 0.02224493476823183, "flos": 23586847015680.0, "grad_norm": 3.0381593078692735, "language_loss": 0.85912728, "learning_rate": 3.7818656350098723e-06, "loss": 0.88360214, "num_input_tokens_seen": 3862090, "step": 185, "time_per_iteration": 2.874722957611084 }, { "auxiliary_loss_clip": 0.01391617, "auxiliary_loss_mlp": 0.01054138, "balance_loss_clip": 1.11451161, "balance_loss_mlp": 1.03827143, "epoch": 0.02236517765887092, "flos": 16909940413440.0, "grad_norm": 2.6183044213235505, "language_loss": 0.77457958, "learning_rate": 3.7857710134704447e-06, "loss": 0.7990371, "num_input_tokens_seen": 3881025, "step": 186, "time_per_iteration": 2.7487235069274902 }, { "auxiliary_loss_clip": 0.01381016, "auxiliary_loss_mlp": 0.01063469, "balance_loss_clip": 1.11160409, "balance_loss_mlp": 1.04805529, "epoch": 0.02248542054951001, "flos": 43508182930560.0, "grad_norm": 2.3155219040959443, "language_loss": 0.79675031, "learning_rate": 3.7896554514633234e-06, "loss": 0.82119519, "num_input_tokens_seen": 3905310, "step": 187, "time_per_iteration": 2.924100637435913 }, { "auxiliary_loss_clip": 0.01384638, "auxiliary_loss_mlp": 0.01053662, "balance_loss_clip": 1.11265886, "balance_loss_mlp": 1.03777134, "epoch": 0.022605663440149103, "flos": 23367648268800.0, "grad_norm": 2.030597847857301, "language_loss": 0.84090066, "learning_rate": 3.7935191723550955e-06, "loss": 0.86528367, "num_input_tokens_seen": 3924265, "step": 188, "time_per_iteration": 2.7614407539367676 }, { "auxiliary_loss_clip": 0.01380448, "auxiliary_loss_mlp": 0.01061818, "balance_loss_clip": 1.1108582, "balance_loss_mlp": 1.04704189, "epoch": 0.022725906330788193, "flos": 29019915504000.0, "grad_norm": 2.4174044874421847, "language_loss": 0.88843203, "learning_rate": 3.797362395957408e-06, "loss": 0.91285467, "num_input_tokens_seen": 3944830, "step": 189, "time_per_iteration": 2.827993154525757 }, { "auxiliary_loss_clip": 0.01385462, "auxiliary_loss_mlp": 0.01066293, "balance_loss_clip": 1.11246538, "balance_loss_mlp": 1.04990768, "epoch": 0.022846149221427282, "flos": 24496176746880.0, "grad_norm": 2.820205565799056, "language_loss": 0.7828964, "learning_rate": 3.8011853386020055e-06, "loss": 0.80741394, "num_input_tokens_seen": 3965735, "step": 190, "time_per_iteration": 2.816617488861084 }, { "auxiliary_loss_clip": 0.01377878, "auxiliary_loss_mlp": 0.01055378, "balance_loss_clip": 1.10769558, "balance_loss_mlp": 1.0389626, "epoch": 0.022966392112066376, "flos": 15523537219200.0, "grad_norm": 4.205736982046081, "language_loss": 0.89230353, "learning_rate": 3.804988213213804e-06, "loss": 0.91663611, "num_input_tokens_seen": 3983975, "step": 191, "time_per_iteration": 2.7268142700195312 }, { "auxiliary_loss_clip": 0.01332765, "auxiliary_loss_mlp": 0.010204, "balance_loss_clip": 1.13905621, "balance_loss_mlp": 1.009004, "epoch": 0.023086635002705466, "flos": 55650408433920.0, "grad_norm": 1.0193065220079842, "language_loss": 0.63183057, "learning_rate": 3.808771229382049e-06, "loss": 0.65536225, "num_input_tokens_seen": 4043440, "step": 192, "time_per_iteration": 3.234945058822632 }, { "auxiliary_loss_clip": 0.01376066, "auxiliary_loss_mlp": 0.01059947, "balance_loss_clip": 1.10488367, "balance_loss_mlp": 1.04489684, "epoch": 0.023206877893344555, "flos": 19313441118720.0, "grad_norm": 2.2923417103502497, "language_loss": 0.84590209, "learning_rate": 3.8125345934296324e-06, "loss": 0.87026227, "num_input_tokens_seen": 4061750, "step": 193, "time_per_iteration": 2.8023629188537598 }, { "auxiliary_loss_clip": 0.01384401, "auxiliary_loss_mlp": 0.01062732, "balance_loss_clip": 1.11157298, "balance_loss_mlp": 1.04557776, "epoch": 0.02332712078398365, "flos": 23072965090560.0, "grad_norm": 2.6432770401468173, "language_loss": 0.87767673, "learning_rate": 3.81627850848061e-06, "loss": 0.90214807, "num_input_tokens_seen": 4082345, "step": 194, "time_per_iteration": 2.8351337909698486 }, { "auxiliary_loss_clip": 0.01375835, "auxiliary_loss_mlp": 0.01057839, "balance_loss_clip": 1.10816979, "balance_loss_mlp": 1.0424614, "epoch": 0.02344736367462274, "flos": 24425971614720.0, "grad_norm": 2.4317739756573546, "language_loss": 0.86197197, "learning_rate": 3.820003174525994e-06, "loss": 0.88630867, "num_input_tokens_seen": 4101770, "step": 195, "time_per_iteration": 2.7413034439086914 }, { "auxiliary_loss_clip": 0.01376602, "auxiliary_loss_mlp": 0.01067832, "balance_loss_clip": 1.10962224, "balance_loss_mlp": 1.05214441, "epoch": 0.02356760656526183, "flos": 21579799697280.0, "grad_norm": 2.2088585876215014, "language_loss": 0.82884306, "learning_rate": 3.823708788487851e-06, "loss": 0.85328746, "num_input_tokens_seen": 4118770, "step": 196, "time_per_iteration": 2.7688775062561035 }, { "auxiliary_loss_clip": 0.01374153, "auxiliary_loss_mlp": 0.0105394, "balance_loss_clip": 1.10866654, "balance_loss_mlp": 1.03936028, "epoch": 0.02368784945590092, "flos": 25193598192000.0, "grad_norm": 2.2775931692604146, "language_loss": 0.84494966, "learning_rate": 3.827395544281781e-06, "loss": 0.86923057, "num_input_tokens_seen": 4141110, "step": 197, "time_per_iteration": 3.7828962802886963 }, { "auxiliary_loss_clip": 0.013715, "auxiliary_loss_mlp": 0.01062923, "balance_loss_clip": 1.10606265, "balance_loss_mlp": 1.04691291, "epoch": 0.02380809234654001, "flos": 27562481164800.0, "grad_norm": 2.1097556387909506, "language_loss": 0.79050046, "learning_rate": 3.831063632877802e-06, "loss": 0.81484467, "num_input_tokens_seen": 4161430, "step": 198, "time_per_iteration": 3.693183660507202 }, { "auxiliary_loss_clip": 0.01372649, "auxiliary_loss_mlp": 0.01057332, "balance_loss_clip": 1.1128931, "balance_loss_mlp": 1.04405248, "epoch": 0.0239283352371791, "flos": 18259786540800.0, "grad_norm": 2.7748710098979927, "language_loss": 0.75861305, "learning_rate": 3.834713242359712e-06, "loss": 0.78291291, "num_input_tokens_seen": 4179260, "step": 199, "time_per_iteration": 4.531643390655518 }, { "auxiliary_loss_clip": 0.01374863, "auxiliary_loss_mlp": 0.01052781, "balance_loss_clip": 1.1081984, "balance_loss_mlp": 1.03691423, "epoch": 0.02404857812781819, "flos": 21395110942080.0, "grad_norm": 2.1194168769978687, "language_loss": 0.87429619, "learning_rate": 3.838344557982959e-06, "loss": 0.89857262, "num_input_tokens_seen": 4200640, "step": 200, "time_per_iteration": 2.777866840362549 }, { "auxiliary_loss_clip": 0.01367756, "auxiliary_loss_mlp": 0.01052818, "balance_loss_clip": 1.10328841, "balance_loss_mlp": 1.03816676, "epoch": 0.024168821018457284, "flos": 16654256426880.0, "grad_norm": 3.0054558149711457, "language_loss": 0.84460413, "learning_rate": 3.841957762231063e-06, "loss": 0.86880988, "num_input_tokens_seen": 4218170, "step": 201, "time_per_iteration": 2.683396339416504 }, { "auxiliary_loss_clip": 0.01368141, "auxiliary_loss_mlp": 0.01059586, "balance_loss_clip": 1.10404384, "balance_loss_mlp": 1.04553103, "epoch": 0.024289063909096374, "flos": 22820872464000.0, "grad_norm": 2.3682594129790293, "language_loss": 0.87701333, "learning_rate": 3.8455530348706454e-06, "loss": 0.9012906, "num_input_tokens_seen": 4237770, "step": 202, "time_per_iteration": 2.7738115787506104 }, { "auxiliary_loss_clip": 0.01371453, "auxiliary_loss_mlp": 0.01053554, "balance_loss_clip": 1.10883713, "balance_loss_mlp": 1.03942215, "epoch": 0.024409306799735464, "flos": 17748598135680.0, "grad_norm": 2.221813409638455, "language_loss": 0.77522755, "learning_rate": 3.849130553005099e-06, "loss": 0.79947764, "num_input_tokens_seen": 4255985, "step": 203, "time_per_iteration": 2.752946615219116 }, { "auxiliary_loss_clip": 0.01365549, "auxiliary_loss_mlp": 0.01049086, "balance_loss_clip": 1.10420012, "balance_loss_mlp": 1.03451836, "epoch": 0.024529549690374557, "flos": 21616213109760.0, "grad_norm": 1.870271482861934, "language_loss": 0.83964068, "learning_rate": 3.852690491126933e-06, "loss": 0.86378706, "num_input_tokens_seen": 4276035, "step": 204, "time_per_iteration": 2.697665214538574 }, { "auxiliary_loss_clip": 0.01370377, "auxiliary_loss_mlp": 0.01060337, "balance_loss_clip": 1.10610855, "balance_loss_mlp": 1.04486394, "epoch": 0.024649792581013647, "flos": 25551662918400.0, "grad_norm": 4.4223544878559276, "language_loss": 0.91339481, "learning_rate": 3.856233021168845e-06, "loss": 0.93770194, "num_input_tokens_seen": 4295730, "step": 205, "time_per_iteration": 2.8299307823181152 }, { "auxiliary_loss_clip": 0.01360187, "auxiliary_loss_mlp": 0.01061447, "balance_loss_clip": 1.10306025, "balance_loss_mlp": 1.04707015, "epoch": 0.024770035471652737, "flos": 34495574544000.0, "grad_norm": 2.3332945254978066, "language_loss": 0.91160154, "learning_rate": 3.859758312553544e-06, "loss": 0.9358179, "num_input_tokens_seen": 4317950, "step": 206, "time_per_iteration": 2.800576686859131 }, { "auxiliary_loss_clip": 0.01368615, "auxiliary_loss_mlp": 0.01053305, "balance_loss_clip": 1.10833526, "balance_loss_mlp": 1.03942871, "epoch": 0.02489027836229183, "flos": 21505428587520.0, "grad_norm": 2.0325659393015827, "language_loss": 0.91972327, "learning_rate": 3.8632665322423735e-06, "loss": 0.94394255, "num_input_tokens_seen": 4337605, "step": 207, "time_per_iteration": 2.7743022441864014 }, { "auxiliary_loss_clip": 0.01360453, "auxiliary_loss_mlp": 0.01053911, "balance_loss_clip": 1.103719, "balance_loss_mlp": 1.04061317, "epoch": 0.02501052125293092, "flos": 23219013790080.0, "grad_norm": 2.257265616818119, "language_loss": 0.85757107, "learning_rate": 3.866757844782762e-06, "loss": 0.8817147, "num_input_tokens_seen": 4358110, "step": 208, "time_per_iteration": 2.7585670948028564 }, { "auxiliary_loss_clip": 0.01364872, "auxiliary_loss_mlp": 0.01052165, "balance_loss_clip": 1.10511982, "balance_loss_mlp": 1.03812778, "epoch": 0.02513076414357001, "flos": 26388920010240.0, "grad_norm": 3.00746894247213, "language_loss": 0.91636527, "learning_rate": 3.870232412354527e-06, "loss": 0.94053555, "num_input_tokens_seen": 4374955, "step": 209, "time_per_iteration": 2.7799816131591797 }, { "auxiliary_loss_clip": 0.01362747, "auxiliary_loss_mlp": 0.01050501, "balance_loss_clip": 1.10177195, "balance_loss_mlp": 1.03611851, "epoch": 0.025251007034209103, "flos": 13590430047360.0, "grad_norm": 2.361205924004267, "language_loss": 0.92321646, "learning_rate": 3.873690394815086e-06, "loss": 0.94734895, "num_input_tokens_seen": 4391535, "step": 210, "time_per_iteration": 2.6885297298431396 }, { "auxiliary_loss_clip": 0.01362161, "auxiliary_loss_mlp": 0.01056806, "balance_loss_clip": 1.10361063, "balance_loss_mlp": 1.04296613, "epoch": 0.025371249924848193, "flos": 15049229103360.0, "grad_norm": 2.7122818308156673, "language_loss": 0.91295362, "learning_rate": 3.877131949743587e-06, "loss": 0.93714333, "num_input_tokens_seen": 4408400, "step": 211, "time_per_iteration": 2.734980344772339 }, { "auxiliary_loss_clip": 0.0136257, "auxiliary_loss_mlp": 0.01058233, "balance_loss_clip": 1.10355306, "balance_loss_mlp": 1.04359412, "epoch": 0.025491492815487283, "flos": 25553853648000.0, "grad_norm": 1.9471941622752686, "language_loss": 0.77953184, "learning_rate": 3.880557232483993e-06, "loss": 0.80373985, "num_input_tokens_seen": 4427840, "step": 212, "time_per_iteration": 2.8809571266174316 }, { "auxiliary_loss_clip": 0.01358599, "auxiliary_loss_mlp": 0.01059711, "balance_loss_clip": 1.09979248, "balance_loss_mlp": 1.0455606, "epoch": 0.025611735706126376, "flos": 20630752502400.0, "grad_norm": 2.058100426221792, "language_loss": 0.86874056, "learning_rate": 3.883966396187164e-06, "loss": 0.89292371, "num_input_tokens_seen": 4447110, "step": 213, "time_per_iteration": 2.7649528980255127 }, { "auxiliary_loss_clip": 0.01361408, "auxiliary_loss_mlp": 0.01056279, "balance_loss_clip": 1.10471392, "balance_loss_mlp": 1.04271269, "epoch": 0.025731978596765466, "flos": 19062282245760.0, "grad_norm": 2.06182492731019, "language_loss": 0.9002409, "learning_rate": 3.887359591851937e-06, "loss": 0.92441773, "num_input_tokens_seen": 4464715, "step": 214, "time_per_iteration": 2.7615795135498047 }, { "auxiliary_loss_clip": 0.01359557, "auxiliary_loss_mlp": 0.01059355, "balance_loss_clip": 1.10376525, "balance_loss_mlp": 1.04664099, "epoch": 0.025852221487404556, "flos": 22163814927360.0, "grad_norm": 2.2948977226564065, "language_loss": 0.92339492, "learning_rate": 3.890736968365265e-06, "loss": 0.94758397, "num_input_tokens_seen": 4485030, "step": 215, "time_per_iteration": 2.831876754760742 }, { "auxiliary_loss_clip": 0.01356212, "auxiliary_loss_mlp": 0.01051982, "balance_loss_clip": 1.10108805, "balance_loss_mlp": 1.03895843, "epoch": 0.02597246437804365, "flos": 26541971861760.0, "grad_norm": 1.9002254306811344, "language_loss": 0.84894013, "learning_rate": 3.894098672541412e-06, "loss": 0.87302208, "num_input_tokens_seen": 4505935, "step": 216, "time_per_iteration": 2.769833564758301 }, { "auxiliary_loss_clip": 0.01360296, "auxiliary_loss_mlp": 0.01043609, "balance_loss_clip": 1.10322666, "balance_loss_mlp": 1.02880335, "epoch": 0.02609270726868274, "flos": 32671671696000.0, "grad_norm": 2.983336770809252, "language_loss": 0.7539891, "learning_rate": 3.89744484916025e-06, "loss": 0.77802813, "num_input_tokens_seen": 4527045, "step": 217, "time_per_iteration": 2.8319971561431885 }, { "auxiliary_loss_clip": 0.01357317, "auxiliary_loss_mlp": 0.0105667, "balance_loss_clip": 1.1006856, "balance_loss_mlp": 1.04200745, "epoch": 0.02621295015932183, "flos": 26243553669120.0, "grad_norm": 2.0334399927727214, "language_loss": 0.87341565, "learning_rate": 3.900775641004673e-06, "loss": 0.89755559, "num_input_tokens_seen": 4546360, "step": 218, "time_per_iteration": 2.747948408126831 }, { "auxiliary_loss_clip": 0.01364569, "auxiliary_loss_mlp": 0.01058721, "balance_loss_clip": 1.10413694, "balance_loss_mlp": 1.04421937, "epoch": 0.026333193049960922, "flos": 42921402353280.0, "grad_norm": 2.819072398591503, "language_loss": 0.73976684, "learning_rate": 3.904091188897156e-06, "loss": 0.7639997, "num_input_tokens_seen": 4565495, "step": 219, "time_per_iteration": 2.894177198410034 }, { "auxiliary_loss_clip": 0.01360033, "auxiliary_loss_mlp": 0.01051442, "balance_loss_clip": 1.10461283, "balance_loss_mlp": 1.03708887, "epoch": 0.026453435940600012, "flos": 17963846386560.0, "grad_norm": 2.0996588794038673, "language_loss": 0.8175354, "learning_rate": 3.90739163173548e-06, "loss": 0.84165013, "num_input_tokens_seen": 4583330, "step": 220, "time_per_iteration": 2.7752740383148193 }, { "auxiliary_loss_clip": 0.01357105, "auxiliary_loss_mlp": 0.01045676, "balance_loss_clip": 1.10279036, "balance_loss_mlp": 1.03192544, "epoch": 0.026573678831239102, "flos": 18984319776000.0, "grad_norm": 2.3464482690171358, "language_loss": 0.88298255, "learning_rate": 3.910677106527646e-06, "loss": 0.90701038, "num_input_tokens_seen": 4600520, "step": 221, "time_per_iteration": 2.787811040878296 }, { "auxiliary_loss_clip": 0.01351518, "auxiliary_loss_mlp": 0.01053075, "balance_loss_clip": 1.09929323, "balance_loss_mlp": 1.03961086, "epoch": 0.026693921721878195, "flos": 29241448634880.0, "grad_norm": 3.253100233402835, "language_loss": 0.8435421, "learning_rate": 3.913947748426004e-06, "loss": 0.86758804, "num_input_tokens_seen": 4617340, "step": 222, "time_per_iteration": 2.812692165374756 }, { "auxiliary_loss_clip": 0.0135301, "auxiliary_loss_mlp": 0.01053136, "balance_loss_clip": 1.10340643, "balance_loss_mlp": 1.03986788, "epoch": 0.026814164612517285, "flos": 14128083797760.0, "grad_norm": 2.9640195959748294, "language_loss": 0.76391083, "learning_rate": 3.9172036907606136e-06, "loss": 0.78797233, "num_input_tokens_seen": 4630820, "step": 223, "time_per_iteration": 3.6649186611175537 }, { "auxiliary_loss_clip": 0.01351957, "auxiliary_loss_mlp": 0.01050038, "balance_loss_clip": 1.0982511, "balance_loss_mlp": 1.03568554, "epoch": 0.026934407503156375, "flos": 23511973115520.0, "grad_norm": 1.751343768477567, "language_loss": 0.94717634, "learning_rate": 3.920445065071855e-06, "loss": 0.97119629, "num_input_tokens_seen": 4651985, "step": 224, "time_per_iteration": 2.7335052490234375 }, { "auxiliary_loss_clip": 0.01360616, "auxiliary_loss_mlp": 0.01050029, "balance_loss_clip": 1.10528016, "balance_loss_mlp": 1.0361886, "epoch": 0.027054650393795468, "flos": 28950356816640.0, "grad_norm": 2.30077936299829, "language_loss": 0.79769337, "learning_rate": 3.923672001142322e-06, "loss": 0.82179981, "num_input_tokens_seen": 4672295, "step": 225, "time_per_iteration": 4.62237024307251 }, { "auxiliary_loss_clip": 0.01352292, "auxiliary_loss_mlp": 0.0105772, "balance_loss_clip": 1.10008276, "balance_loss_mlp": 1.04387426, "epoch": 0.027174893284434558, "flos": 31431568596480.0, "grad_norm": 1.9024021131716646, "language_loss": 0.84345257, "learning_rate": 3.926884627027996e-06, "loss": 0.86755276, "num_input_tokens_seen": 4696065, "step": 226, "time_per_iteration": 2.83705997467041 }, { "auxiliary_loss_clip": 0.01352244, "auxiliary_loss_mlp": 0.01053726, "balance_loss_clip": 1.09980464, "balance_loss_mlp": 1.04023778, "epoch": 0.027295136175073648, "flos": 22054466949120.0, "grad_norm": 2.9542252370744757, "language_loss": 0.77466595, "learning_rate": 3.930083069088744e-06, "loss": 0.79872561, "num_input_tokens_seen": 4716065, "step": 227, "time_per_iteration": 3.6264679431915283 }, { "auxiliary_loss_clip": 0.01306465, "auxiliary_loss_mlp": 0.01013097, "balance_loss_clip": 1.1239388, "balance_loss_mlp": 1.00470495, "epoch": 0.02741537906571274, "flos": 60800752972800.0, "grad_norm": 0.9902577423682047, "language_loss": 0.5929786, "learning_rate": 3.933267452018137e-06, "loss": 0.61617422, "num_input_tokens_seen": 4775860, "step": 228, "time_per_iteration": 3.2040202617645264 }, { "auxiliary_loss_clip": 0.01350931, "auxiliary_loss_mlp": 0.01051655, "balance_loss_clip": 1.10382247, "balance_loss_mlp": 1.03811252, "epoch": 0.02753562195635183, "flos": 24606278910720.0, "grad_norm": 2.2743680103587933, "language_loss": 0.84586841, "learning_rate": 3.936437898872622e-06, "loss": 0.86989427, "num_input_tokens_seen": 4795835, "step": 229, "time_per_iteration": 2.7399940490722656 }, { "auxiliary_loss_clip": 0.0134869, "auxiliary_loss_mlp": 0.01045317, "balance_loss_clip": 1.09896278, "balance_loss_mlp": 1.03119087, "epoch": 0.02765586484699092, "flos": 34094236907520.0, "grad_norm": 2.3971266980341017, "language_loss": 0.79809111, "learning_rate": 3.9395945311000525e-06, "loss": 0.8220312, "num_input_tokens_seen": 4817460, "step": 230, "time_per_iteration": 2.787911891937256 }, { "auxiliary_loss_clip": 0.01354364, "auxiliary_loss_mlp": 0.01061729, "balance_loss_clip": 1.10214245, "balance_loss_mlp": 1.04802585, "epoch": 0.027776107737630014, "flos": 14829922615680.0, "grad_norm": 2.4284518178037535, "language_loss": 0.91098785, "learning_rate": 3.942737468567608e-06, "loss": 0.93514878, "num_input_tokens_seen": 4835475, "step": 231, "time_per_iteration": 2.765449285507202 }, { "auxiliary_loss_clip": 0.01348988, "auxiliary_loss_mlp": 0.01047904, "balance_loss_clip": 1.09852827, "balance_loss_mlp": 1.0341239, "epoch": 0.027896350628269104, "flos": 47920347066240.0, "grad_norm": 5.3465684806328975, "language_loss": 0.86013812, "learning_rate": 3.9458668295891026e-06, "loss": 0.88410699, "num_input_tokens_seen": 4857760, "step": 232, "time_per_iteration": 2.9893720149993896 }, { "auxiliary_loss_clip": 0.01348164, "auxiliary_loss_mlp": 0.01047751, "balance_loss_clip": 1.09742785, "balance_loss_mlp": 1.03361297, "epoch": 0.028016593518908194, "flos": 21684550734720.0, "grad_norm": 2.3889118383210137, "language_loss": 0.86683476, "learning_rate": 3.948982730951712e-06, "loss": 0.89079392, "num_input_tokens_seen": 4875855, "step": 233, "time_per_iteration": 2.7209746837615967 }, { "auxiliary_loss_clip": 0.01355515, "auxiliary_loss_mlp": 0.01047735, "balance_loss_clip": 1.10196698, "balance_loss_mlp": 1.03400862, "epoch": 0.028136836409547287, "flos": 18439483305600.0, "grad_norm": 2.3153398062930006, "language_loss": 0.82083392, "learning_rate": 3.9520852879421254e-06, "loss": 0.84486639, "num_input_tokens_seen": 4893200, "step": 234, "time_per_iteration": 2.7447385787963867 }, { "auxiliary_loss_clip": 0.0134852, "auxiliary_loss_mlp": 0.01049668, "balance_loss_clip": 1.10179472, "balance_loss_mlp": 1.03695416, "epoch": 0.028257079300186377, "flos": 31576934937600.0, "grad_norm": 2.6035341777533425, "language_loss": 0.81793094, "learning_rate": 3.955174614372137e-06, "loss": 0.84191281, "num_input_tokens_seen": 4912965, "step": 235, "time_per_iteration": 2.9367239475250244 }, { "auxiliary_loss_clip": 0.01351472, "auxiliary_loss_mlp": 0.01051407, "balance_loss_clip": 1.10115337, "balance_loss_mlp": 1.03737617, "epoch": 0.028377322190825467, "flos": 23513337832320.0, "grad_norm": 3.0205303310042786, "language_loss": 0.84392649, "learning_rate": 3.9582508226037045e-06, "loss": 0.86795521, "num_input_tokens_seen": 4933105, "step": 236, "time_per_iteration": 2.8009884357452393 }, { "auxiliary_loss_clip": 0.01355882, "auxiliary_loss_mlp": 0.01046938, "balance_loss_clip": 1.10224128, "balance_loss_mlp": 1.0320245, "epoch": 0.02849756508146456, "flos": 20479604071680.0, "grad_norm": 3.126810420876177, "language_loss": 0.94031394, "learning_rate": 3.9613140235734636e-06, "loss": 0.96434212, "num_input_tokens_seen": 4950085, "step": 237, "time_per_iteration": 2.818848133087158 }, { "auxiliary_loss_clip": 0.01345716, "auxiliary_loss_mlp": 0.01051909, "balance_loss_clip": 1.09649253, "balance_loss_mlp": 1.03852165, "epoch": 0.02861780797210365, "flos": 14283362292480.0, "grad_norm": 1.9492820369422654, "language_loss": 0.81114453, "learning_rate": 3.96436432681674e-06, "loss": 0.83512074, "num_input_tokens_seen": 4968075, "step": 238, "time_per_iteration": 2.7809040546417236 }, { "auxiliary_loss_clip": 0.01351098, "auxiliary_loss_mlp": 0.01059997, "balance_loss_clip": 1.09921718, "balance_loss_mlp": 1.0455842, "epoch": 0.02873805086274274, "flos": 25808532053760.0, "grad_norm": 2.896015817043678, "language_loss": 0.88993847, "learning_rate": 3.967401840491044e-06, "loss": 0.91404939, "num_input_tokens_seen": 4987355, "step": 239, "time_per_iteration": 2.804818868637085 }, { "auxiliary_loss_clip": 0.01342768, "auxiliary_loss_mlp": 0.01055283, "balance_loss_clip": 1.09838605, "balance_loss_mlp": 1.04143643, "epoch": 0.028858293753381833, "flos": 17304238984320.0, "grad_norm": 2.4441988796828573, "language_loss": 0.8776691, "learning_rate": 3.97042667139909e-06, "loss": 0.90164959, "num_input_tokens_seen": 5004680, "step": 240, "time_per_iteration": 2.754904270172119 }, { "auxiliary_loss_clip": 0.01345238, "auxiliary_loss_mlp": 0.0105009, "balance_loss_clip": 1.09929109, "balance_loss_mlp": 1.03543878, "epoch": 0.028978536644020923, "flos": 23038347358080.0, "grad_norm": 6.068383854857364, "language_loss": 0.87527883, "learning_rate": 3.973438925011327e-06, "loss": 0.89923203, "num_input_tokens_seen": 5022965, "step": 241, "time_per_iteration": 2.7754368782043457 }, { "auxiliary_loss_clip": 0.01345657, "auxiliary_loss_mlp": 0.01051036, "balance_loss_clip": 1.09687865, "balance_loss_mlp": 1.03799999, "epoch": 0.029098779534660012, "flos": 28329712692480.0, "grad_norm": 2.669159319658, "language_loss": 0.91498315, "learning_rate": 3.976438705488002e-06, "loss": 0.93895006, "num_input_tokens_seen": 5042625, "step": 242, "time_per_iteration": 2.8563334941864014 }, { "auxiliary_loss_clip": 0.01345993, "auxiliary_loss_mlp": 0.01052138, "balance_loss_clip": 1.10155296, "balance_loss_mlp": 1.03893578, "epoch": 0.029219022425299106, "flos": 13881665520000.0, "grad_norm": 2.503728396137688, "language_loss": 0.93030715, "learning_rate": 3.9794261157007744e-06, "loss": 0.95428848, "num_input_tokens_seen": 5060380, "step": 243, "time_per_iteration": 2.77337384223938 }, { "auxiliary_loss_clip": 0.01350233, "auxiliary_loss_mlp": 0.01060041, "balance_loss_clip": 1.09979486, "balance_loss_mlp": 1.04515171, "epoch": 0.029339265315938196, "flos": 19422501788160.0, "grad_norm": 2.5069334635064298, "language_loss": 0.84689534, "learning_rate": 3.982401257253887e-06, "loss": 0.87099808, "num_input_tokens_seen": 5078720, "step": 244, "time_per_iteration": 2.765263795852661 }, { "auxiliary_loss_clip": 0.01340168, "auxiliary_loss_mlp": 0.01048887, "balance_loss_clip": 1.09528947, "balance_loss_mlp": 1.03563106, "epoch": 0.029459508206577285, "flos": 15669550005120.0, "grad_norm": 2.685864990034809, "language_loss": 0.9006784, "learning_rate": 3.985364230504893e-06, "loss": 0.92456895, "num_input_tokens_seen": 5096605, "step": 245, "time_per_iteration": 2.804231882095337 }, { "auxiliary_loss_clip": 0.01344364, "auxiliary_loss_mlp": 0.01057639, "balance_loss_clip": 1.09699297, "balance_loss_mlp": 1.04313135, "epoch": 0.02957975109721638, "flos": 28220975245440.0, "grad_norm": 1.8932610035630282, "language_loss": 0.84514695, "learning_rate": 3.988315134584976e-06, "loss": 0.86916697, "num_input_tokens_seen": 5116285, "step": 246, "time_per_iteration": 2.8952322006225586 }, { "auxiliary_loss_clip": 0.01343434, "auxiliary_loss_mlp": 0.01051519, "balance_loss_clip": 1.0947144, "balance_loss_mlp": 1.03574777, "epoch": 0.02969999398785547, "flos": 24315869450880.0, "grad_norm": 3.821309773843259, "language_loss": 0.80428189, "learning_rate": 3.991254067418851e-06, "loss": 0.82823145, "num_input_tokens_seen": 5136825, "step": 247, "time_per_iteration": 3.032799482345581 }, { "auxiliary_loss_clip": 0.01339841, "auxiliary_loss_mlp": 0.01051495, "balance_loss_clip": 1.09616446, "balance_loss_mlp": 1.0384717, "epoch": 0.02982023687849456, "flos": 35078584193280.0, "grad_norm": 2.1582477880021402, "language_loss": 0.83072686, "learning_rate": 3.994181125744254e-06, "loss": 0.85464025, "num_input_tokens_seen": 5158630, "step": 248, "time_per_iteration": 2.971174955368042 }, { "auxiliary_loss_clip": 0.01340289, "auxiliary_loss_mlp": 0.01049199, "balance_loss_clip": 1.09623718, "balance_loss_mlp": 1.03635406, "epoch": 0.02994047976913365, "flos": 26177155378560.0, "grad_norm": 2.088632066253113, "language_loss": 0.73913443, "learning_rate": 3.99709640513106e-06, "loss": 0.76302928, "num_input_tokens_seen": 5179510, "step": 249, "time_per_iteration": 2.8281075954437256 }, { "auxiliary_loss_clip": 0.01348462, "auxiliary_loss_mlp": 0.01051117, "balance_loss_clip": 1.09823823, "balance_loss_mlp": 1.03733075, "epoch": 0.03006072265977274, "flos": 25625028447360.0, "grad_norm": 6.816670829178058, "language_loss": 0.8588227, "learning_rate": 4e-06, "loss": 0.88281846, "num_input_tokens_seen": 5199345, "step": 250, "time_per_iteration": 3.8108372688293457 }, { "auxiliary_loss_clip": 0.01341821, "auxiliary_loss_mlp": 0.010688, "balance_loss_clip": 1.09857023, "balance_loss_mlp": 1.05453646, "epoch": 0.03018096555041183, "flos": 22127078292480.0, "grad_norm": 3.125625763282027, "language_loss": 0.88394189, "learning_rate": 3.999999848300794e-06, "loss": 0.90804815, "num_input_tokens_seen": 5218330, "step": 251, "time_per_iteration": 3.801835060119629 }, { "auxiliary_loss_clip": 0.01335552, "auxiliary_loss_mlp": 0.01049475, "balance_loss_clip": 1.09202397, "balance_loss_mlp": 1.03714848, "epoch": 0.030301208441050925, "flos": 30188197359360.0, "grad_norm": 2.86659690833024, "language_loss": 0.89084959, "learning_rate": 3.999999393203203e-06, "loss": 0.91469985, "num_input_tokens_seen": 5240740, "step": 252, "time_per_iteration": 3.948835611343384 }, { "auxiliary_loss_clip": 0.01335395, "auxiliary_loss_mlp": 0.01054395, "balance_loss_clip": 1.09271145, "balance_loss_mlp": 1.0413233, "epoch": 0.030421451331690014, "flos": 23621392920960.0, "grad_norm": 1.9779316473184654, "language_loss": 0.84849322, "learning_rate": 3.999998634707293e-06, "loss": 0.8723911, "num_input_tokens_seen": 5260290, "step": 253, "time_per_iteration": 3.857210636138916 }, { "auxiliary_loss_clip": 0.01347754, "auxiliary_loss_mlp": 0.01050184, "balance_loss_clip": 1.10188508, "balance_loss_mlp": 1.03646266, "epoch": 0.030541694222329104, "flos": 27928446883200.0, "grad_norm": 4.733068461318254, "language_loss": 0.96485597, "learning_rate": 3.999997572813182e-06, "loss": 0.98883545, "num_input_tokens_seen": 5278100, "step": 254, "time_per_iteration": 2.930072069168091 }, { "auxiliary_loss_clip": 0.01332245, "auxiliary_loss_mlp": 0.01052874, "balance_loss_clip": 1.09131348, "balance_loss_mlp": 1.04056013, "epoch": 0.030661937112968194, "flos": 18588441006720.0, "grad_norm": 2.074594538834533, "language_loss": 0.8743943, "learning_rate": 3.999996207521028e-06, "loss": 0.89824545, "num_input_tokens_seen": 5296810, "step": 255, "time_per_iteration": 2.823927402496338 }, { "auxiliary_loss_clip": 0.01342606, "auxiliary_loss_mlp": 0.01058079, "balance_loss_clip": 1.09483993, "balance_loss_mlp": 1.04451942, "epoch": 0.030782180003607287, "flos": 12969139478400.0, "grad_norm": 2.6900103329889355, "language_loss": 0.82006526, "learning_rate": 3.999994538831039e-06, "loss": 0.8440721, "num_input_tokens_seen": 5313395, "step": 256, "time_per_iteration": 2.78749942779541 }, { "auxiliary_loss_clip": 0.013415, "auxiliary_loss_mlp": 0.01044121, "balance_loss_clip": 1.09565234, "balance_loss_mlp": 1.03155029, "epoch": 0.030902422894246377, "flos": 23335364920320.0, "grad_norm": 3.1255889585630223, "language_loss": 0.85788071, "learning_rate": 3.99999256674347e-06, "loss": 0.88173693, "num_input_tokens_seen": 5333545, "step": 257, "time_per_iteration": 2.9167113304138184 }, { "auxiliary_loss_clip": 0.01287033, "auxiliary_loss_mlp": 0.01009573, "balance_loss_clip": 1.11401916, "balance_loss_mlp": 1.00194359, "epoch": 0.031022665784885467, "flos": 55094151438720.0, "grad_norm": 1.0133523503063164, "language_loss": 0.53469801, "learning_rate": 3.999990291258618e-06, "loss": 0.55766404, "num_input_tokens_seen": 5392235, "step": 258, "time_per_iteration": 3.3357157707214355 }, { "auxiliary_loss_clip": 0.01340913, "auxiliary_loss_mlp": 0.01047525, "balance_loss_clip": 1.09750795, "balance_loss_mlp": 1.0347513, "epoch": 0.03114290867552456, "flos": 19317786664320.0, "grad_norm": 2.5352565504313773, "language_loss": 0.86768931, "learning_rate": 3.999987712376829e-06, "loss": 0.89157367, "num_input_tokens_seen": 5410555, "step": 259, "time_per_iteration": 2.8712048530578613 }, { "auxiliary_loss_clip": 0.01339515, "auxiliary_loss_mlp": 0.01045031, "balance_loss_clip": 1.09789824, "balance_loss_mlp": 1.03243625, "epoch": 0.031263151566163654, "flos": 20959442881920.0, "grad_norm": 2.149696148376434, "language_loss": 0.81988156, "learning_rate": 3.999984830098494e-06, "loss": 0.84372699, "num_input_tokens_seen": 5430135, "step": 260, "time_per_iteration": 2.8163013458251953 }, { "auxiliary_loss_clip": 0.01336853, "auxiliary_loss_mlp": 0.01053589, "balance_loss_clip": 1.09521079, "balance_loss_mlp": 1.04053593, "epoch": 0.03138339445680274, "flos": 14793006412800.0, "grad_norm": 10.197211473278525, "language_loss": 0.97925705, "learning_rate": 3.999981644424051e-06, "loss": 1.00316143, "num_input_tokens_seen": 5444935, "step": 261, "time_per_iteration": 2.8005497455596924 }, { "auxiliary_loss_clip": 0.01337548, "auxiliary_loss_mlp": 0.01055254, "balance_loss_clip": 1.09510875, "balance_loss_mlp": 1.0418545, "epoch": 0.03150363734744183, "flos": 11655599022720.0, "grad_norm": 2.5538454581829955, "language_loss": 0.86118734, "learning_rate": 3.999978155353982e-06, "loss": 0.88511544, "num_input_tokens_seen": 5462080, "step": 262, "time_per_iteration": 2.778538942337036 }, { "auxiliary_loss_clip": 0.01338932, "auxiliary_loss_mlp": 0.01052875, "balance_loss_clip": 1.09458613, "balance_loss_mlp": 1.03946364, "epoch": 0.03162388023808092, "flos": 33727732485120.0, "grad_norm": 4.031965289812195, "language_loss": 0.8030324, "learning_rate": 3.9999743628888186e-06, "loss": 0.82695049, "num_input_tokens_seen": 5483870, "step": 263, "time_per_iteration": 2.9718143939971924 }, { "auxiliary_loss_clip": 0.01336032, "auxiliary_loss_mlp": 0.01041891, "balance_loss_clip": 1.09265518, "balance_loss_mlp": 1.02889729, "epoch": 0.03174412312872001, "flos": 20810952057600.0, "grad_norm": 3.17582198277961, "language_loss": 0.89755172, "learning_rate": 3.999970267029133e-06, "loss": 0.92133093, "num_input_tokens_seen": 5502830, "step": 264, "time_per_iteration": 2.877045154571533 }, { "auxiliary_loss_clip": 0.01335376, "auxiliary_loss_mlp": 0.01042721, "balance_loss_clip": 1.09457791, "balance_loss_mlp": 1.03043103, "epoch": 0.0318643660193591, "flos": 23727939638400.0, "grad_norm": 3.4862208557705006, "language_loss": 0.80065787, "learning_rate": 3.999965867775548e-06, "loss": 0.82443881, "num_input_tokens_seen": 5523225, "step": 265, "time_per_iteration": 2.843622922897339 }, { "auxiliary_loss_clip": 0.01329444, "auxiliary_loss_mlp": 0.01051522, "balance_loss_clip": 1.09027636, "balance_loss_mlp": 1.03839076, "epoch": 0.0319846089099982, "flos": 13917863450880.0, "grad_norm": 3.8454419625070932, "language_loss": 0.86982346, "learning_rate": 3.9999611651287315e-06, "loss": 0.89363313, "num_input_tokens_seen": 5541380, "step": 266, "time_per_iteration": 2.7257847785949707 }, { "auxiliary_loss_clip": 0.013381, "auxiliary_loss_mlp": 0.01049526, "balance_loss_clip": 1.09426999, "balance_loss_mlp": 1.03616261, "epoch": 0.03210485180063729, "flos": 14753253035520.0, "grad_norm": 2.5900079616428258, "language_loss": 0.78845614, "learning_rate": 3.999956159089396e-06, "loss": 0.81233239, "num_input_tokens_seen": 5558830, "step": 267, "time_per_iteration": 2.782325267791748 }, { "auxiliary_loss_clip": 0.01332773, "auxiliary_loss_mlp": 0.01052993, "balance_loss_clip": 1.09255803, "balance_loss_mlp": 1.03933704, "epoch": 0.03222509469127638, "flos": 28913153304960.0, "grad_norm": 2.546387430530761, "language_loss": 0.79494566, "learning_rate": 3.999950849658302e-06, "loss": 0.81880331, "num_input_tokens_seen": 5577750, "step": 268, "time_per_iteration": 2.7528021335601807 }, { "auxiliary_loss_clip": 0.01338098, "auxiliary_loss_mlp": 0.01053685, "balance_loss_clip": 1.09447348, "balance_loss_mlp": 1.04033995, "epoch": 0.03234533758191547, "flos": 16946389739520.0, "grad_norm": 2.4056250199516427, "language_loss": 0.84279263, "learning_rate": 3.999945236836254e-06, "loss": 0.86671042, "num_input_tokens_seen": 5596715, "step": 269, "time_per_iteration": 2.662045478820801 }, { "auxiliary_loss_clip": 0.0134117, "auxiliary_loss_mlp": 0.01047767, "balance_loss_clip": 1.09677863, "balance_loss_mlp": 1.03439212, "epoch": 0.03246558047255456, "flos": 18989096284800.0, "grad_norm": 7.269938419316769, "language_loss": 0.94754583, "learning_rate": 3.999939320624103e-06, "loss": 0.97143525, "num_input_tokens_seen": 5611865, "step": 270, "time_per_iteration": 2.6919608116149902 }, { "auxiliary_loss_clip": 0.01338612, "auxiliary_loss_mlp": 0.01053971, "balance_loss_clip": 1.0957396, "balance_loss_mlp": 1.04126334, "epoch": 0.03258582336319365, "flos": 23728334688000.0, "grad_norm": 1.9794311798870219, "language_loss": 0.89741373, "learning_rate": 3.999933101022749e-06, "loss": 0.92133951, "num_input_tokens_seen": 5632270, "step": 271, "time_per_iteration": 2.7683424949645996 }, { "auxiliary_loss_clip": 0.0133512, "auxiliary_loss_mlp": 0.01051519, "balance_loss_clip": 1.09472823, "balance_loss_mlp": 1.03762519, "epoch": 0.032706066253832745, "flos": 27670823562240.0, "grad_norm": 5.93741361288679, "language_loss": 0.86896449, "learning_rate": 3.999926578033132e-06, "loss": 0.89283085, "num_input_tokens_seen": 5652085, "step": 272, "time_per_iteration": 2.735405445098877 }, { "auxiliary_loss_clip": 0.01335904, "auxiliary_loss_mlp": 0.01052581, "balance_loss_clip": 1.09321272, "balance_loss_mlp": 1.03800726, "epoch": 0.032826309144471835, "flos": 45624685968000.0, "grad_norm": 2.163945435116917, "language_loss": 0.63048041, "learning_rate": 3.999919751656244e-06, "loss": 0.6543653, "num_input_tokens_seen": 5678985, "step": 273, "time_per_iteration": 2.951040506362915 }, { "auxiliary_loss_clip": 0.01337855, "auxiliary_loss_mlp": 0.01051075, "balance_loss_clip": 1.09649622, "balance_loss_mlp": 1.03689516, "epoch": 0.032946552035110925, "flos": 25812374808960.0, "grad_norm": 2.6761635615992465, "language_loss": 0.76008534, "learning_rate": 3.9999126218931195e-06, "loss": 0.78397465, "num_input_tokens_seen": 5697020, "step": 274, "time_per_iteration": 2.782667398452759 }, { "auxiliary_loss_clip": 0.01336813, "auxiliary_loss_mlp": 0.01051471, "balance_loss_clip": 1.09756875, "balance_loss_mlp": 1.03808951, "epoch": 0.033066794925750015, "flos": 15121984101120.0, "grad_norm": 2.762134296438335, "language_loss": 0.89619279, "learning_rate": 3.99990518874484e-06, "loss": 0.92007565, "num_input_tokens_seen": 5713460, "step": 275, "time_per_iteration": 2.68269419670105 }, { "auxiliary_loss_clip": 0.01337846, "auxiliary_loss_mlp": 0.01064409, "balance_loss_clip": 1.09644914, "balance_loss_mlp": 1.05105162, "epoch": 0.033187037816389105, "flos": 22776593973120.0, "grad_norm": 2.6973177973036084, "language_loss": 0.92412651, "learning_rate": 3.999897452212534e-06, "loss": 0.94814909, "num_input_tokens_seen": 5730790, "step": 276, "time_per_iteration": 3.7675364017486572 }, { "auxiliary_loss_clip": 0.01335095, "auxiliary_loss_mlp": 0.01054433, "balance_loss_clip": 1.09305429, "balance_loss_mlp": 1.03920376, "epoch": 0.033307280707028195, "flos": 23331414424320.0, "grad_norm": 5.853511870832606, "language_loss": 1.0015769, "learning_rate": 3.999889412297374e-06, "loss": 1.02547228, "num_input_tokens_seen": 5750215, "step": 277, "time_per_iteration": 2.779381275177002 }, { "auxiliary_loss_clip": 0.01332744, "auxiliary_loss_mlp": 0.01059679, "balance_loss_clip": 1.09134758, "balance_loss_mlp": 1.04568958, "epoch": 0.03342752359766729, "flos": 28840290566400.0, "grad_norm": 3.052034395652839, "language_loss": 0.78805721, "learning_rate": 3.999881069000581e-06, "loss": 0.81198144, "num_input_tokens_seen": 5769945, "step": 278, "time_per_iteration": 4.577516555786133 }, { "auxiliary_loss_clip": 0.01337845, "auxiliary_loss_mlp": 0.01049394, "balance_loss_clip": 1.09674704, "balance_loss_mlp": 1.03582764, "epoch": 0.03354776648830638, "flos": 19384544090880.0, "grad_norm": 2.835161479941073, "language_loss": 0.86861134, "learning_rate": 3.99987242232342e-06, "loss": 0.89248371, "num_input_tokens_seen": 5784950, "step": 279, "time_per_iteration": 3.576709032058716 }, { "auxiliary_loss_clip": 0.01336151, "auxiliary_loss_mlp": 0.01049955, "balance_loss_clip": 1.09769392, "balance_loss_mlp": 1.03590035, "epoch": 0.03366800937894547, "flos": 17858628472320.0, "grad_norm": 3.0008860052515454, "language_loss": 0.79732925, "learning_rate": 3.9998634722672026e-06, "loss": 0.82119036, "num_input_tokens_seen": 5805005, "step": 280, "time_per_iteration": 2.694018602371216 }, { "auxiliary_loss_clip": 0.01337212, "auxiliary_loss_mlp": 0.01050959, "balance_loss_clip": 1.09715867, "balance_loss_mlp": 1.03674936, "epoch": 0.03378825226958456, "flos": 35951033635200.0, "grad_norm": 1.9833914608646728, "language_loss": 0.77976882, "learning_rate": 3.999854218833286e-06, "loss": 0.80365056, "num_input_tokens_seen": 5825825, "step": 281, "time_per_iteration": 2.8835127353668213 }, { "auxiliary_loss_clip": 0.01336006, "auxiliary_loss_mlp": 0.01048855, "balance_loss_clip": 1.09717059, "balance_loss_mlp": 1.03543186, "epoch": 0.03390849516022365, "flos": 25702488126720.0, "grad_norm": 7.416062628628208, "language_loss": 0.8189227, "learning_rate": 3.999844662023075e-06, "loss": 0.84277135, "num_input_tokens_seen": 5845700, "step": 282, "time_per_iteration": 2.756861925125122 }, { "auxiliary_loss_clip": 0.01328241, "auxiliary_loss_mlp": 0.01047332, "balance_loss_clip": 1.09263444, "balance_loss_mlp": 1.03517294, "epoch": 0.03402873805086274, "flos": 21284505987840.0, "grad_norm": 1.8272934930057139, "language_loss": 0.92315215, "learning_rate": 3.999834801838018e-06, "loss": 0.94690788, "num_input_tokens_seen": 5864680, "step": 283, "time_per_iteration": 2.702091932296753 }, { "auxiliary_loss_clip": 0.01330363, "auxiliary_loss_mlp": 0.01043827, "balance_loss_clip": 1.09400415, "balance_loss_mlp": 1.03143549, "epoch": 0.03414898094150183, "flos": 22710913954560.0, "grad_norm": 2.9201385675533613, "language_loss": 0.74219394, "learning_rate": 3.9998246382796115e-06, "loss": 0.7659359, "num_input_tokens_seen": 5884260, "step": 284, "time_per_iteration": 2.7426586151123047 }, { "auxiliary_loss_clip": 0.01339152, "auxiliary_loss_mlp": 0.01049101, "balance_loss_clip": 1.09695017, "balance_loss_mlp": 1.03556442, "epoch": 0.03426922383214093, "flos": 18879927874560.0, "grad_norm": 2.69208050639597, "language_loss": 0.91137791, "learning_rate": 3.999814171349399e-06, "loss": 0.93526042, "num_input_tokens_seen": 5902120, "step": 285, "time_per_iteration": 2.718337059020996 }, { "auxiliary_loss_clip": 0.01329816, "auxiliary_loss_mlp": 0.01045847, "balance_loss_clip": 1.09148002, "balance_loss_mlp": 1.03306794, "epoch": 0.03438946672278002, "flos": 34752012716160.0, "grad_norm": 1.71229236036837, "language_loss": 0.7350924, "learning_rate": 3.9998034010489655e-06, "loss": 0.75884902, "num_input_tokens_seen": 5925810, "step": 286, "time_per_iteration": 2.8784754276275635 }, { "auxiliary_loss_clip": 0.01334825, "auxiliary_loss_mlp": 0.01047586, "balance_loss_clip": 1.09780109, "balance_loss_mlp": 1.03493786, "epoch": 0.03450970961341911, "flos": 22164102236160.0, "grad_norm": 2.18126020922581, "language_loss": 0.76005703, "learning_rate": 3.999792327379946e-06, "loss": 0.78388119, "num_input_tokens_seen": 5945185, "step": 287, "time_per_iteration": 2.7193641662597656 }, { "auxiliary_loss_clip": 0.01333397, "auxiliary_loss_mlp": 0.01048303, "balance_loss_clip": 1.09866226, "balance_loss_mlp": 1.03551829, "epoch": 0.034629952504058197, "flos": 21725740656000.0, "grad_norm": 2.3163462828698504, "language_loss": 0.96117485, "learning_rate": 3.999780950344021e-06, "loss": 0.98499185, "num_input_tokens_seen": 5963375, "step": 288, "time_per_iteration": 2.7083470821380615 }, { "auxiliary_loss_clip": 0.0133637, "auxiliary_loss_mlp": 0.01055054, "balance_loss_clip": 1.09743774, "balance_loss_mlp": 1.04204226, "epoch": 0.034750195394697286, "flos": 20047994248320.0, "grad_norm": 1.8503928032587733, "language_loss": 0.82561493, "learning_rate": 3.999769269942916e-06, "loss": 0.84952915, "num_input_tokens_seen": 5983415, "step": 289, "time_per_iteration": 2.7275896072387695 }, { "auxiliary_loss_clip": 0.0132747, "auxiliary_loss_mlp": 0.01055209, "balance_loss_clip": 1.09342456, "balance_loss_mlp": 1.04283464, "epoch": 0.034870438285336376, "flos": 27965865876480.0, "grad_norm": 1.9341498231337546, "language_loss": 0.81119931, "learning_rate": 3.999757286178402e-06, "loss": 0.83502614, "num_input_tokens_seen": 6005850, "step": 290, "time_per_iteration": 2.799401044845581 }, { "auxiliary_loss_clip": 0.01327405, "auxiliary_loss_mlp": 0.01047209, "balance_loss_clip": 1.09213984, "balance_loss_mlp": 1.03295732, "epoch": 0.03499068117597547, "flos": 22017514832640.0, "grad_norm": 5.173096817397172, "language_loss": 0.90947974, "learning_rate": 3.999744999052299e-06, "loss": 0.93322587, "num_input_tokens_seen": 6027240, "step": 291, "time_per_iteration": 2.7106142044067383 }, { "auxiliary_loss_clip": 0.01265485, "auxiliary_loss_mlp": 0.01013141, "balance_loss_clip": 1.09887481, "balance_loss_mlp": 1.00584495, "epoch": 0.03511092406661456, "flos": 57242147725440.0, "grad_norm": 0.9580747397684949, "language_loss": 0.61135793, "learning_rate": 3.9997324085664675e-06, "loss": 0.63414419, "num_input_tokens_seen": 6087470, "step": 292, "time_per_iteration": 3.19050669670105 }, { "auxiliary_loss_clip": 0.01326484, "auxiliary_loss_mlp": 0.01048959, "balance_loss_clip": 1.09220505, "balance_loss_mlp": 1.03653145, "epoch": 0.03523116695725365, "flos": 22928065626240.0, "grad_norm": 2.296082579829729, "language_loss": 0.92035115, "learning_rate": 3.999719514722821e-06, "loss": 0.94410563, "num_input_tokens_seen": 6107600, "step": 293, "time_per_iteration": 2.7352406978607178 }, { "auxiliary_loss_clip": 0.01323834, "auxiliary_loss_mlp": 0.01046206, "balance_loss_clip": 1.09316397, "balance_loss_mlp": 1.03332579, "epoch": 0.03535140984789274, "flos": 36903241226880.0, "grad_norm": 2.9613055609848042, "language_loss": 0.74819589, "learning_rate": 3.999706317523314e-06, "loss": 0.77189624, "num_input_tokens_seen": 6126160, "step": 294, "time_per_iteration": 2.849459648132324 }, { "auxiliary_loss_clip": 0.0132674, "auxiliary_loss_mlp": 0.01057503, "balance_loss_clip": 1.09248543, "balance_loss_mlp": 1.04563606, "epoch": 0.03547165273853183, "flos": 20449152316800.0, "grad_norm": 2.5700569200861163, "language_loss": 0.86365366, "learning_rate": 3.999692816969948e-06, "loss": 0.88749611, "num_input_tokens_seen": 6145695, "step": 295, "time_per_iteration": 2.799098491668701 }, { "auxiliary_loss_clip": 0.01260599, "auxiliary_loss_mlp": 0.01009384, "balance_loss_clip": 1.09567714, "balance_loss_mlp": 1.00218379, "epoch": 0.03559189562917092, "flos": 69850564871040.0, "grad_norm": 1.0041374230104665, "language_loss": 0.69402945, "learning_rate": 3.999679013064772e-06, "loss": 0.71672928, "num_input_tokens_seen": 6212440, "step": 296, "time_per_iteration": 3.275622606277466 }, { "auxiliary_loss_clip": 0.01322486, "auxiliary_loss_mlp": 0.01052437, "balance_loss_clip": 1.08904421, "balance_loss_mlp": 1.03928864, "epoch": 0.03571213851981002, "flos": 21651944163840.0, "grad_norm": 9.542947139609506, "language_loss": 0.85500067, "learning_rate": 3.99966490580988e-06, "loss": 0.87874991, "num_input_tokens_seen": 6229800, "step": 297, "time_per_iteration": 2.7616453170776367 }, { "auxiliary_loss_clip": 0.01328753, "auxiliary_loss_mlp": 0.01059982, "balance_loss_clip": 1.09283423, "balance_loss_mlp": 1.04747081, "epoch": 0.03583238141044911, "flos": 43945610757120.0, "grad_norm": 2.7774151684842283, "language_loss": 0.65936404, "learning_rate": 3.999650495207411e-06, "loss": 0.68325138, "num_input_tokens_seen": 6255825, "step": 298, "time_per_iteration": 2.894382953643799 }, { "auxiliary_loss_clip": 0.01325217, "auxiliary_loss_mlp": 0.01050845, "balance_loss_clip": 1.09470499, "balance_loss_mlp": 1.03760076, "epoch": 0.0359526243010882, "flos": 18910810592640.0, "grad_norm": 2.74616115415114, "language_loss": 0.90536159, "learning_rate": 3.999635781259553e-06, "loss": 0.92912221, "num_input_tokens_seen": 6271090, "step": 299, "time_per_iteration": 2.7299745082855225 }, { "auxiliary_loss_clip": 0.01251956, "auxiliary_loss_mlp": 0.01009736, "balance_loss_clip": 1.09032929, "balance_loss_mlp": 1.00291681, "epoch": 0.03607286719172729, "flos": 61668892782720.0, "grad_norm": 0.918459688057845, "language_loss": 0.52247119, "learning_rate": 3.999620763968535e-06, "loss": 0.54508817, "num_input_tokens_seen": 6329965, "step": 300, "time_per_iteration": 3.0763652324676514 }, { "auxiliary_loss_clip": 0.01321418, "auxiliary_loss_mlp": 0.01051347, "balance_loss_clip": 1.09049988, "balance_loss_mlp": 1.03913426, "epoch": 0.03619311008236638, "flos": 27819062991360.0, "grad_norm": 2.0023628024299764, "language_loss": 0.86660218, "learning_rate": 3.999605443336638e-06, "loss": 0.89032984, "num_input_tokens_seen": 6352095, "step": 301, "time_per_iteration": 2.7721498012542725 }, { "auxiliary_loss_clip": 0.01329302, "auxiliary_loss_mlp": 0.01050331, "balance_loss_clip": 1.09402323, "balance_loss_mlp": 1.03703284, "epoch": 0.03631335297300547, "flos": 13621133197440.0, "grad_norm": 3.349462954381713, "language_loss": 0.89777809, "learning_rate": 3.999589819366185e-06, "loss": 0.92157441, "num_input_tokens_seen": 6365885, "step": 302, "time_per_iteration": 2.6580698490142822 }, { "auxiliary_loss_clip": 0.01325041, "auxiliary_loss_mlp": 0.01055517, "balance_loss_clip": 1.09165907, "balance_loss_mlp": 1.04260087, "epoch": 0.036433595863644565, "flos": 27631788456960.0, "grad_norm": 2.591831385443148, "language_loss": 0.84824634, "learning_rate": 3.999573892059547e-06, "loss": 0.87205184, "num_input_tokens_seen": 6385015, "step": 303, "time_per_iteration": 3.7360095977783203 }, { "auxiliary_loss_clip": 0.01327344, "auxiliary_loss_mlp": 0.01052715, "balance_loss_clip": 1.09166026, "balance_loss_mlp": 1.03919077, "epoch": 0.036553838754283655, "flos": 24572020314240.0, "grad_norm": 2.1236231168962902, "language_loss": 0.81070441, "learning_rate": 3.999557661419138e-06, "loss": 0.83450496, "num_input_tokens_seen": 6405165, "step": 304, "time_per_iteration": 3.549142599105835 }, { "auxiliary_loss_clip": 0.01328457, "auxiliary_loss_mlp": 0.01055584, "balance_loss_clip": 1.0944314, "balance_loss_mlp": 1.04292989, "epoch": 0.036674081644922744, "flos": 23404313076480.0, "grad_norm": 2.079245562488903, "language_loss": 0.81450164, "learning_rate": 3.9995411274474225e-06, "loss": 0.83834201, "num_input_tokens_seen": 6424445, "step": 305, "time_per_iteration": 4.4558680057525635 }, { "auxiliary_loss_clip": 0.01322952, "auxiliary_loss_mlp": 0.01054476, "balance_loss_clip": 1.09091389, "balance_loss_mlp": 1.0422039, "epoch": 0.036794324535561834, "flos": 27489690253440.0, "grad_norm": 2.9377998583891745, "language_loss": 0.81516302, "learning_rate": 3.999524290146908e-06, "loss": 0.83893728, "num_input_tokens_seen": 6444650, "step": 306, "time_per_iteration": 2.78656005859375 }, { "auxiliary_loss_clip": 0.01327186, "auxiliary_loss_mlp": 0.01051168, "balance_loss_clip": 1.09433341, "balance_loss_mlp": 1.03826356, "epoch": 0.036914567426200924, "flos": 19463476227840.0, "grad_norm": 3.1902606135717475, "language_loss": 0.92629868, "learning_rate": 3.9995071495201485e-06, "loss": 0.95008218, "num_input_tokens_seen": 6461755, "step": 307, "time_per_iteration": 2.6811363697052 }, { "auxiliary_loss_clip": 0.01323956, "auxiliary_loss_mlp": 0.0104652, "balance_loss_clip": 1.09033835, "balance_loss_mlp": 1.03448009, "epoch": 0.037034810316840014, "flos": 22309324922880.0, "grad_norm": 2.818004209338718, "language_loss": 0.97866166, "learning_rate": 3.999489705569744e-06, "loss": 1.0023663, "num_input_tokens_seen": 6479455, "step": 308, "time_per_iteration": 2.7377045154571533 }, { "auxiliary_loss_clip": 0.01322331, "auxiliary_loss_mlp": 0.01049714, "balance_loss_clip": 1.09023857, "balance_loss_mlp": 1.03597534, "epoch": 0.03715505320747911, "flos": 18588333265920.0, "grad_norm": 2.683957019916267, "language_loss": 0.86502182, "learning_rate": 3.999471958298341e-06, "loss": 0.88874227, "num_input_tokens_seen": 6498365, "step": 309, "time_per_iteration": 2.654801607131958 }, { "auxiliary_loss_clip": 0.01326591, "auxiliary_loss_mlp": 0.01052988, "balance_loss_clip": 1.0928179, "balance_loss_mlp": 1.04085255, "epoch": 0.0372752960981182, "flos": 35955343267200.0, "grad_norm": 1.9380618188348777, "language_loss": 0.76169431, "learning_rate": 3.999453907708631e-06, "loss": 0.7854901, "num_input_tokens_seen": 6520770, "step": 310, "time_per_iteration": 2.8453903198242188 }, { "auxiliary_loss_clip": 0.01319266, "auxiliary_loss_mlp": 0.01048852, "balance_loss_clip": 1.09047198, "balance_loss_mlp": 1.03659689, "epoch": 0.03739553898875729, "flos": 20814040627200.0, "grad_norm": 2.16920827890716, "language_loss": 0.81369871, "learning_rate": 3.999435553803353e-06, "loss": 0.83737987, "num_input_tokens_seen": 6540170, "step": 311, "time_per_iteration": 2.6997928619384766 }, { "auxiliary_loss_clip": 0.01318903, "auxiliary_loss_mlp": 0.0104906, "balance_loss_clip": 1.08667326, "balance_loss_mlp": 1.03653085, "epoch": 0.03751578187939638, "flos": 20264140339200.0, "grad_norm": 2.9911587449391392, "language_loss": 0.83243591, "learning_rate": 3.999416896585292e-06, "loss": 0.85611558, "num_input_tokens_seen": 6557200, "step": 312, "time_per_iteration": 2.731782913208008 }, { "auxiliary_loss_clip": 0.01317331, "auxiliary_loss_mlp": 0.01054271, "balance_loss_clip": 1.0867604, "balance_loss_mlp": 1.04227901, "epoch": 0.03763602477003547, "flos": 20668063754880.0, "grad_norm": 3.86198837380606, "language_loss": 0.86530519, "learning_rate": 3.9993979360572775e-06, "loss": 0.88902116, "num_input_tokens_seen": 6577340, "step": 313, "time_per_iteration": 2.702183246612549 }, { "auxiliary_loss_clip": 0.01326459, "auxiliary_loss_mlp": 0.01052061, "balance_loss_clip": 1.09223163, "balance_loss_mlp": 1.03891826, "epoch": 0.03775626766067456, "flos": 16691352197760.0, "grad_norm": 2.5439029542667635, "language_loss": 0.82894719, "learning_rate": 3.999378672222185e-06, "loss": 0.85273242, "num_input_tokens_seen": 6595125, "step": 314, "time_per_iteration": 2.6662468910217285 }, { "auxiliary_loss_clip": 0.01317341, "auxiliary_loss_mlp": 0.01045863, "balance_loss_clip": 1.08905101, "balance_loss_mlp": 1.03337622, "epoch": 0.03787651055131366, "flos": 21141797253120.0, "grad_norm": 10.491733417348213, "language_loss": 0.82849598, "learning_rate": 3.9993591050829385e-06, "loss": 0.85212803, "num_input_tokens_seen": 6612990, "step": 315, "time_per_iteration": 2.7052464485168457 }, { "auxiliary_loss_clip": 0.01319947, "auxiliary_loss_mlp": 0.0104828, "balance_loss_clip": 1.09030318, "balance_loss_mlp": 1.03651404, "epoch": 0.037996753441952746, "flos": 22018089450240.0, "grad_norm": 2.0942061985009492, "language_loss": 0.7941404, "learning_rate": 3.999339234642506e-06, "loss": 0.81782264, "num_input_tokens_seen": 6632740, "step": 316, "time_per_iteration": 2.722428798675537 }, { "auxiliary_loss_clip": 0.01317439, "auxiliary_loss_mlp": 0.0104026, "balance_loss_clip": 1.08655167, "balance_loss_mlp": 1.0270164, "epoch": 0.038116996332591836, "flos": 27709391790720.0, "grad_norm": 2.0174633497747285, "language_loss": 0.8383162, "learning_rate": 3.9993190609038994e-06, "loss": 0.86189318, "num_input_tokens_seen": 6651505, "step": 317, "time_per_iteration": 2.7539632320404053 }, { "auxiliary_loss_clip": 0.01316851, "auxiliary_loss_mlp": 0.01051625, "balance_loss_clip": 1.08711958, "balance_loss_mlp": 1.03955507, "epoch": 0.038237239223230926, "flos": 21178067011200.0, "grad_norm": 2.093004160096706, "language_loss": 0.8335638, "learning_rate": 3.999298583870182e-06, "loss": 0.85724854, "num_input_tokens_seen": 6671090, "step": 318, "time_per_iteration": 2.667309522628784 }, { "auxiliary_loss_clip": 0.01318958, "auxiliary_loss_mlp": 0.01041538, "balance_loss_clip": 1.08885217, "balance_loss_mlp": 1.02857971, "epoch": 0.038357482113870016, "flos": 25556618995200.0, "grad_norm": 3.399519216696956, "language_loss": 0.77806628, "learning_rate": 3.999277803544458e-06, "loss": 0.80167127, "num_input_tokens_seen": 6691245, "step": 319, "time_per_iteration": 2.7995965480804443 }, { "auxiliary_loss_clip": 0.01239688, "auxiliary_loss_mlp": 0.0101914, "balance_loss_clip": 1.0828054, "balance_loss_mlp": 1.01265514, "epoch": 0.038477725004509106, "flos": 59227578034560.0, "grad_norm": 0.9629175216749136, "language_loss": 0.62326616, "learning_rate": 3.999256719929882e-06, "loss": 0.64585441, "num_input_tokens_seen": 6752520, "step": 320, "time_per_iteration": 3.231663942337036 }, { "auxiliary_loss_clip": 0.01237717, "auxiliary_loss_mlp": 0.01012588, "balance_loss_clip": 1.08157015, "balance_loss_mlp": 1.0061512, "epoch": 0.0385979678951482, "flos": 67317676398720.0, "grad_norm": 1.2155119411555246, "language_loss": 0.67150438, "learning_rate": 3.999235333029651e-06, "loss": 0.6940074, "num_input_tokens_seen": 6806460, "step": 321, "time_per_iteration": 3.197641611099243 }, { "auxiliary_loss_clip": 0.01318547, "auxiliary_loss_mlp": 0.01056175, "balance_loss_clip": 1.09142828, "balance_loss_mlp": 1.0431335, "epoch": 0.03871821078578729, "flos": 22746752749440.0, "grad_norm": 2.516307521713106, "language_loss": 0.82285827, "learning_rate": 3.999213642847009e-06, "loss": 0.84660548, "num_input_tokens_seen": 6827045, "step": 322, "time_per_iteration": 2.681889295578003 }, { "auxiliary_loss_clip": 0.01315167, "auxiliary_loss_mlp": 0.0105171, "balance_loss_clip": 1.08657694, "balance_loss_mlp": 1.03822148, "epoch": 0.03883845367642638, "flos": 26280613526400.0, "grad_norm": 2.03796713796626, "language_loss": 0.91236633, "learning_rate": 3.999191649385247e-06, "loss": 0.9360351, "num_input_tokens_seen": 6848220, "step": 323, "time_per_iteration": 2.835099220275879 }, { "auxiliary_loss_clip": 0.01234182, "auxiliary_loss_mlp": 0.01006151, "balance_loss_clip": 1.08021522, "balance_loss_mlp": 0.99995178, "epoch": 0.03895869656706547, "flos": 56962835568000.0, "grad_norm": 0.9230242679723929, "language_loss": 0.59729338, "learning_rate": 3.999169352647702e-06, "loss": 0.61969668, "num_input_tokens_seen": 6909400, "step": 324, "time_per_iteration": 3.1734840869903564 }, { "auxiliary_loss_clip": 0.01321663, "auxiliary_loss_mlp": 0.01058713, "balance_loss_clip": 1.08928835, "balance_loss_mlp": 1.04471779, "epoch": 0.03907893945770456, "flos": 24863363527680.0, "grad_norm": 2.0091982220977234, "language_loss": 0.83005184, "learning_rate": 3.999146752637755e-06, "loss": 0.85385561, "num_input_tokens_seen": 6930445, "step": 325, "time_per_iteration": 2.7144134044647217 }, { "auxiliary_loss_clip": 0.01319545, "auxiliary_loss_mlp": 0.0104359, "balance_loss_clip": 1.08923697, "balance_loss_mlp": 1.03090048, "epoch": 0.03919918234834365, "flos": 18368595815040.0, "grad_norm": 2.768991023118778, "language_loss": 0.89615154, "learning_rate": 3.999123849358836e-06, "loss": 0.91978294, "num_input_tokens_seen": 6948110, "step": 326, "time_per_iteration": 2.7900824546813965 }, { "auxiliary_loss_clip": 0.01313291, "auxiliary_loss_mlp": 0.01048588, "balance_loss_clip": 1.086285, "balance_loss_mlp": 1.03554666, "epoch": 0.03931942523898275, "flos": 25225414663680.0, "grad_norm": 2.4164799601521034, "language_loss": 0.75037265, "learning_rate": 3.999100642814418e-06, "loss": 0.77399147, "num_input_tokens_seen": 6968550, "step": 327, "time_per_iteration": 2.704773187637329 }, { "auxiliary_loss_clip": 0.013154, "auxiliary_loss_mlp": 0.01044787, "balance_loss_clip": 1.08862591, "balance_loss_mlp": 1.03237081, "epoch": 0.03943966812962184, "flos": 23257905240960.0, "grad_norm": 2.707176651369592, "language_loss": 0.88552624, "learning_rate": 3.999077133008022e-06, "loss": 0.90912813, "num_input_tokens_seen": 6987135, "step": 328, "time_per_iteration": 2.7529351711273193 }, { "auxiliary_loss_clip": 0.01319845, "auxiliary_loss_mlp": 0.01055489, "balance_loss_clip": 1.08997273, "balance_loss_mlp": 1.0429306, "epoch": 0.03955991102026093, "flos": 29168837291520.0, "grad_norm": 2.3616862390359246, "language_loss": 0.90758789, "learning_rate": 3.9990533199432145e-06, "loss": 0.93134123, "num_input_tokens_seen": 7008630, "step": 329, "time_per_iteration": 2.76733136177063 }, { "auxiliary_loss_clip": 0.01316448, "auxiliary_loss_mlp": 0.01057272, "balance_loss_clip": 1.08991146, "balance_loss_mlp": 1.04439723, "epoch": 0.03968015391090002, "flos": 17602441695360.0, "grad_norm": 3.9174584921312485, "language_loss": 0.75717127, "learning_rate": 3.999029203623608e-06, "loss": 0.78090847, "num_input_tokens_seen": 7026350, "step": 330, "time_per_iteration": 4.15687108039856 }, { "auxiliary_loss_clip": 0.01314787, "auxiliary_loss_mlp": 0.01055971, "balance_loss_clip": 1.0871383, "balance_loss_mlp": 1.04332924, "epoch": 0.03980039680153911, "flos": 21799285752960.0, "grad_norm": 2.3284742608390996, "language_loss": 0.87017417, "learning_rate": 3.99900478405286e-06, "loss": 0.8938818, "num_input_tokens_seen": 7045660, "step": 331, "time_per_iteration": 3.5975711345672607 }, { "auxiliary_loss_clip": 0.01318414, "auxiliary_loss_mlp": 0.01054761, "balance_loss_clip": 1.09129167, "balance_loss_mlp": 1.04277444, "epoch": 0.0399206396921782, "flos": 15195134148480.0, "grad_norm": 2.2779199972567574, "language_loss": 0.82399404, "learning_rate": 3.998980061234676e-06, "loss": 0.84772575, "num_input_tokens_seen": 7063575, "step": 332, "time_per_iteration": 2.6634323596954346 }, { "auxiliary_loss_clip": 0.01316697, "auxiliary_loss_mlp": 0.0104273, "balance_loss_clip": 1.08870077, "balance_loss_mlp": 1.03024232, "epoch": 0.040040882582817294, "flos": 14422910630400.0, "grad_norm": 2.4718793366970124, "language_loss": 0.75683868, "learning_rate": 3.9989550351728055e-06, "loss": 0.78043294, "num_input_tokens_seen": 7080505, "step": 333, "time_per_iteration": 3.695343017578125 }, { "auxiliary_loss_clip": 0.01313542, "auxiliary_loss_mlp": 0.01048339, "balance_loss_clip": 1.08797932, "balance_loss_mlp": 1.03585207, "epoch": 0.040161125473456384, "flos": 19280906375040.0, "grad_norm": 2.156655917441207, "language_loss": 0.84232414, "learning_rate": 3.998929705871046e-06, "loss": 0.86594296, "num_input_tokens_seen": 7097860, "step": 334, "time_per_iteration": 2.7350282669067383 }, { "auxiliary_loss_clip": 0.01311791, "auxiliary_loss_mlp": 0.01052154, "balance_loss_clip": 1.08555484, "balance_loss_mlp": 1.03998852, "epoch": 0.040281368364095474, "flos": 17821101738240.0, "grad_norm": 2.4995406128621256, "language_loss": 0.88937795, "learning_rate": 3.99890407333324e-06, "loss": 0.91301739, "num_input_tokens_seen": 7116390, "step": 335, "time_per_iteration": 2.6828529834747314 }, { "auxiliary_loss_clip": 0.01308052, "auxiliary_loss_mlp": 0.01045008, "balance_loss_clip": 1.08182216, "balance_loss_mlp": 1.03277719, "epoch": 0.040401611254734564, "flos": 19573757959680.0, "grad_norm": 1.8875575310225934, "language_loss": 0.87338573, "learning_rate": 3.998878137563275e-06, "loss": 0.89691627, "num_input_tokens_seen": 7135940, "step": 336, "time_per_iteration": 2.6861720085144043 }, { "auxiliary_loss_clip": 0.01311391, "auxiliary_loss_mlp": 0.01053512, "balance_loss_clip": 1.08543491, "balance_loss_mlp": 1.04143643, "epoch": 0.040521854145373654, "flos": 22054466949120.0, "grad_norm": 1.9172279696672405, "language_loss": 0.8518666, "learning_rate": 3.998851898565085e-06, "loss": 0.87551558, "num_input_tokens_seen": 7155745, "step": 337, "time_per_iteration": 2.71268367767334 }, { "auxiliary_loss_clip": 0.01312659, "auxiliary_loss_mlp": 0.01045296, "balance_loss_clip": 1.08573139, "balance_loss_mlp": 1.03326178, "epoch": 0.04064209703601274, "flos": 22674644196480.0, "grad_norm": 1.8772838517159163, "language_loss": 0.83173716, "learning_rate": 3.998825356342653e-06, "loss": 0.8553167, "num_input_tokens_seen": 7175920, "step": 338, "time_per_iteration": 2.7218480110168457 }, { "auxiliary_loss_clip": 0.01313424, "auxiliary_loss_mlp": 0.01046644, "balance_loss_clip": 1.08610535, "balance_loss_mlp": 1.03403807, "epoch": 0.04076233992665183, "flos": 38582172783360.0, "grad_norm": 3.2143832238226655, "language_loss": 0.73212987, "learning_rate": 3.998798510900003e-06, "loss": 0.75573051, "num_input_tokens_seen": 7198720, "step": 339, "time_per_iteration": 2.8556623458862305 }, { "auxiliary_loss_clip": 0.0131282, "auxiliary_loss_mlp": 0.01039789, "balance_loss_clip": 1.08606982, "balance_loss_mlp": 1.02761221, "epoch": 0.04088258281729093, "flos": 25885309374720.0, "grad_norm": 2.672803409064474, "language_loss": 0.8386302, "learning_rate": 3.998771362241207e-06, "loss": 0.86215627, "num_input_tokens_seen": 7219125, "step": 340, "time_per_iteration": 2.7353038787841797 }, { "auxiliary_loss_clip": 0.01309816, "auxiliary_loss_mlp": 0.01044002, "balance_loss_clip": 1.08534312, "balance_loss_mlp": 1.03214657, "epoch": 0.04100282570793002, "flos": 19789832223360.0, "grad_norm": 2.5329771268499077, "language_loss": 0.87972599, "learning_rate": 3.998743910370385e-06, "loss": 0.90326416, "num_input_tokens_seen": 7237985, "step": 341, "time_per_iteration": 2.7374048233032227 }, { "auxiliary_loss_clip": 0.01318598, "auxiliary_loss_mlp": 0.01057412, "balance_loss_clip": 1.09320378, "balance_loss_mlp": 1.04509795, "epoch": 0.04112306859856911, "flos": 22565152563840.0, "grad_norm": 2.925400086986793, "language_loss": 0.73227584, "learning_rate": 3.998716155291702e-06, "loss": 0.75603592, "num_input_tokens_seen": 7255825, "step": 342, "time_per_iteration": 2.730888605117798 }, { "auxiliary_loss_clip": 0.01314743, "auxiliary_loss_mlp": 0.01057503, "balance_loss_clip": 1.08908558, "balance_loss_mlp": 1.04496837, "epoch": 0.0412433114892082, "flos": 25040654081280.0, "grad_norm": 2.1299131893729264, "language_loss": 0.90687835, "learning_rate": 3.998688097009366e-06, "loss": 0.93060082, "num_input_tokens_seen": 7276590, "step": 343, "time_per_iteration": 2.763404130935669 }, { "auxiliary_loss_clip": 0.01316659, "auxiliary_loss_mlp": 0.01036381, "balance_loss_clip": 1.0895226, "balance_loss_mlp": 1.02436447, "epoch": 0.04136355437984729, "flos": 25191371548800.0, "grad_norm": 2.6055207203575557, "language_loss": 0.80188334, "learning_rate": 3.998659735527636e-06, "loss": 0.8254137, "num_input_tokens_seen": 7295680, "step": 344, "time_per_iteration": 2.7035250663757324 }, { "auxiliary_loss_clip": 0.01315583, "auxiliary_loss_mlp": 0.01060957, "balance_loss_clip": 1.08927393, "balance_loss_mlp": 1.04801083, "epoch": 0.04148379727048638, "flos": 22966777509120.0, "grad_norm": 2.652028115190664, "language_loss": 0.77530813, "learning_rate": 3.998631070850813e-06, "loss": 0.79907358, "num_input_tokens_seen": 7316300, "step": 345, "time_per_iteration": 2.774883508682251 }, { "auxiliary_loss_clip": 0.01315483, "auxiliary_loss_mlp": 0.01054911, "balance_loss_clip": 1.09038138, "balance_loss_mlp": 1.04254889, "epoch": 0.041604040161125476, "flos": 14063481187200.0, "grad_norm": 2.2797215700280002, "language_loss": 0.83475471, "learning_rate": 3.9986021029832455e-06, "loss": 0.85845864, "num_input_tokens_seen": 7333615, "step": 346, "time_per_iteration": 2.703885793685913 }, { "auxiliary_loss_clip": 0.0131579, "auxiliary_loss_mlp": 0.01045213, "balance_loss_clip": 1.08770394, "balance_loss_mlp": 1.03117597, "epoch": 0.041724283051764566, "flos": 12091877614080.0, "grad_norm": 3.030796941227288, "language_loss": 0.91599917, "learning_rate": 3.9985728319293285e-06, "loss": 0.93960923, "num_input_tokens_seen": 7347590, "step": 347, "time_per_iteration": 2.679015636444092 }, { "auxiliary_loss_clip": 0.01325816, "auxiliary_loss_mlp": 0.01039915, "balance_loss_clip": 1.09153771, "balance_loss_mlp": 1.02602744, "epoch": 0.041844525942403656, "flos": 12385303816320.0, "grad_norm": 3.8945768000662886, "language_loss": 0.85800624, "learning_rate": 3.998543257693501e-06, "loss": 0.88166356, "num_input_tokens_seen": 7364345, "step": 348, "time_per_iteration": 2.7280335426330566 }, { "auxiliary_loss_clip": 0.01315202, "auxiliary_loss_mlp": 0.01051962, "balance_loss_clip": 1.08983445, "balance_loss_mlp": 1.03850293, "epoch": 0.041964768833042745, "flos": 23769345041280.0, "grad_norm": 1.7167075645657504, "language_loss": 0.87996554, "learning_rate": 3.998513380280251e-06, "loss": 0.90363717, "num_input_tokens_seen": 7384625, "step": 349, "time_per_iteration": 2.739645481109619 }, { "auxiliary_loss_clip": 0.01322566, "auxiliary_loss_mlp": 0.01051439, "balance_loss_clip": 1.09183395, "balance_loss_mlp": 1.03821325, "epoch": 0.042085011723681835, "flos": 11875336473600.0, "grad_norm": 2.4076618909709055, "language_loss": 0.94625205, "learning_rate": 3.99848319969411e-06, "loss": 0.9699921, "num_input_tokens_seen": 7402225, "step": 350, "time_per_iteration": 2.692091703414917 }, { "auxiliary_loss_clip": 0.01324401, "auxiliary_loss_mlp": 0.01051765, "balance_loss_clip": 1.09357107, "balance_loss_mlp": 1.03833628, "epoch": 0.042205254614320925, "flos": 16873957964160.0, "grad_norm": 2.187281855772309, "language_loss": 0.79326916, "learning_rate": 3.9984527159396564e-06, "loss": 0.81703079, "num_input_tokens_seen": 7420865, "step": 351, "time_per_iteration": 2.7053182125091553 }, { "auxiliary_loss_clip": 0.01314505, "auxiliary_loss_mlp": 0.01047069, "balance_loss_clip": 1.08579278, "balance_loss_mlp": 1.03338408, "epoch": 0.04232549750496002, "flos": 25118508810240.0, "grad_norm": 11.98314049695068, "language_loss": 0.84565604, "learning_rate": 3.9984219290215154e-06, "loss": 0.86927176, "num_input_tokens_seen": 7441040, "step": 352, "time_per_iteration": 2.74955677986145 }, { "auxiliary_loss_clip": 0.01321811, "auxiliary_loss_mlp": 0.01045512, "balance_loss_clip": 1.09287691, "balance_loss_mlp": 1.03276253, "epoch": 0.04244574039559911, "flos": 26724541714560.0, "grad_norm": 1.7045985968687503, "language_loss": 0.89157057, "learning_rate": 3.998390838944356e-06, "loss": 0.9152438, "num_input_tokens_seen": 7462545, "step": 353, "time_per_iteration": 2.7461862564086914 }, { "auxiliary_loss_clip": 0.01318884, "auxiliary_loss_mlp": 0.01049363, "balance_loss_clip": 1.09108543, "balance_loss_mlp": 1.03678071, "epoch": 0.0425659832862382, "flos": 20923244951040.0, "grad_norm": 3.105722814466636, "language_loss": 0.90750313, "learning_rate": 3.998359445712895e-06, "loss": 0.9311856, "num_input_tokens_seen": 7481650, "step": 354, "time_per_iteration": 2.7680752277374268 }, { "auxiliary_loss_clip": 0.01317225, "auxiliary_loss_mlp": 0.01049417, "balance_loss_clip": 1.08790219, "balance_loss_mlp": 1.03445673, "epoch": 0.04268622617687729, "flos": 23331127115520.0, "grad_norm": 2.358702756349081, "language_loss": 0.81009972, "learning_rate": 3.9983277493318955e-06, "loss": 0.83376616, "num_input_tokens_seen": 7500945, "step": 355, "time_per_iteration": 2.726205587387085 }, { "auxiliary_loss_clip": 0.01319406, "auxiliary_loss_mlp": 0.01045485, "balance_loss_clip": 1.0886234, "balance_loss_mlp": 1.03158522, "epoch": 0.04280646906751638, "flos": 25994010908160.0, "grad_norm": 1.549749745689828, "language_loss": 0.81196773, "learning_rate": 3.998295749806165e-06, "loss": 0.83561671, "num_input_tokens_seen": 7522170, "step": 356, "time_per_iteration": 2.774038314819336 }, { "auxiliary_loss_clip": 0.0131737, "auxiliary_loss_mlp": 0.01051607, "balance_loss_clip": 1.09024775, "balance_loss_mlp": 1.03875613, "epoch": 0.04292671195815547, "flos": 26906824258560.0, "grad_norm": 1.7611119392803216, "language_loss": 0.83507997, "learning_rate": 3.998263447140558e-06, "loss": 0.85876971, "num_input_tokens_seen": 7542370, "step": 357, "time_per_iteration": 5.104188919067383 }, { "auxiliary_loss_clip": 0.01316185, "auxiliary_loss_mlp": 0.01046862, "balance_loss_clip": 1.08828366, "balance_loss_mlp": 1.03303373, "epoch": 0.04304695484879457, "flos": 39457315745280.0, "grad_norm": 1.811930726221024, "language_loss": 0.82047862, "learning_rate": 3.998230841339976e-06, "loss": 0.84410906, "num_input_tokens_seen": 7564380, "step": 358, "time_per_iteration": 3.7549777030944824 }, { "auxiliary_loss_clip": 0.01314138, "auxiliary_loss_mlp": 0.01049021, "balance_loss_clip": 1.08986652, "balance_loss_mlp": 1.03569937, "epoch": 0.04316719773943366, "flos": 19646297475840.0, "grad_norm": 2.2146817175965277, "language_loss": 0.85158539, "learning_rate": 3.998197932409363e-06, "loss": 0.87521684, "num_input_tokens_seen": 7582390, "step": 359, "time_per_iteration": 3.5658648014068604 }, { "auxiliary_loss_clip": 0.01313915, "auxiliary_loss_mlp": 0.01054772, "balance_loss_clip": 1.08690929, "balance_loss_mlp": 1.04215956, "epoch": 0.04328744063007275, "flos": 22452320966400.0, "grad_norm": 2.4827541839126783, "language_loss": 0.86081922, "learning_rate": 3.9981647203537125e-06, "loss": 0.88450605, "num_input_tokens_seen": 7599890, "step": 360, "time_per_iteration": 2.726011037826538 }, { "auxiliary_loss_clip": 0.01315585, "auxiliary_loss_mlp": 0.01046633, "balance_loss_clip": 1.08849406, "balance_loss_mlp": 1.03254795, "epoch": 0.04340768352071184, "flos": 21283033530240.0, "grad_norm": 2.0798969684212083, "language_loss": 0.96152538, "learning_rate": 3.998131205178063e-06, "loss": 0.98514754, "num_input_tokens_seen": 7618360, "step": 361, "time_per_iteration": 2.6479341983795166 }, { "auxiliary_loss_clip": 0.01320297, "auxiliary_loss_mlp": 0.01050561, "balance_loss_clip": 1.08982396, "balance_loss_mlp": 1.03768086, "epoch": 0.04352792641135093, "flos": 11583705951360.0, "grad_norm": 4.1648593971594945, "language_loss": 0.76298338, "learning_rate": 3.998097386887498e-06, "loss": 0.7866919, "num_input_tokens_seen": 7635435, "step": 362, "time_per_iteration": 2.6718921661376953 }, { "auxiliary_loss_clip": 0.01310542, "auxiliary_loss_mlp": 0.01059419, "balance_loss_clip": 1.08899951, "balance_loss_mlp": 1.04643083, "epoch": 0.04364816930199002, "flos": 23623547736960.0, "grad_norm": 1.843544634833622, "language_loss": 0.84847283, "learning_rate": 3.998063265487148e-06, "loss": 0.87217242, "num_input_tokens_seen": 7656485, "step": 363, "time_per_iteration": 2.730661630630493 }, { "auxiliary_loss_clip": 0.01314098, "auxiliary_loss_mlp": 0.01046706, "balance_loss_clip": 1.08850348, "balance_loss_mlp": 1.03391504, "epoch": 0.043768412192629114, "flos": 14429734214400.0, "grad_norm": 1.81722282330652, "language_loss": 0.8125121, "learning_rate": 3.99802884098219e-06, "loss": 0.83612013, "num_input_tokens_seen": 7674595, "step": 364, "time_per_iteration": 2.7239816188812256 }, { "auxiliary_loss_clip": 0.01308305, "auxiliary_loss_mlp": 0.01046016, "balance_loss_clip": 1.08349156, "balance_loss_mlp": 1.03339767, "epoch": 0.043888655083268203, "flos": 26468893641600.0, "grad_norm": 2.869313748247089, "language_loss": 0.82464725, "learning_rate": 3.997994113377845e-06, "loss": 0.84819049, "num_input_tokens_seen": 7693495, "step": 365, "time_per_iteration": 2.7568349838256836 }, { "auxiliary_loss_clip": 0.01318082, "auxiliary_loss_mlp": 0.01051789, "balance_loss_clip": 1.0910399, "balance_loss_mlp": 1.03711426, "epoch": 0.04400889797390729, "flos": 27235263242880.0, "grad_norm": 2.164494109200736, "language_loss": 0.83438289, "learning_rate": 3.9979590826793815e-06, "loss": 0.85808158, "num_input_tokens_seen": 7714685, "step": 366, "time_per_iteration": 2.746283769607544 }, { "auxiliary_loss_clip": 0.01313205, "auxiliary_loss_mlp": 0.01045387, "balance_loss_clip": 1.08795547, "balance_loss_mlp": 1.0310998, "epoch": 0.04412914086454638, "flos": 20119528183680.0, "grad_norm": 2.0672675736433423, "language_loss": 0.81181115, "learning_rate": 3.997923748892113e-06, "loss": 0.83539712, "num_input_tokens_seen": 7734005, "step": 367, "time_per_iteration": 2.7242374420166016 }, { "auxiliary_loss_clip": 0.01309647, "auxiliary_loss_mlp": 0.01054502, "balance_loss_clip": 1.0877862, "balance_loss_mlp": 1.04180598, "epoch": 0.04424938375518547, "flos": 22604618632320.0, "grad_norm": 1.6696271771260949, "language_loss": 0.88723159, "learning_rate": 3.9978881120214015e-06, "loss": 0.91087306, "num_input_tokens_seen": 7755525, "step": 368, "time_per_iteration": 2.7513535022735596 }, { "auxiliary_loss_clip": 0.01311458, "auxiliary_loss_mlp": 0.01056032, "balance_loss_clip": 1.08541811, "balance_loss_mlp": 1.04297245, "epoch": 0.04436962664582456, "flos": 24132365844480.0, "grad_norm": 2.1180917240140507, "language_loss": 0.79331195, "learning_rate": 3.997852172072652e-06, "loss": 0.81698686, "num_input_tokens_seen": 7776740, "step": 369, "time_per_iteration": 2.745258331298828 }, { "auxiliary_loss_clip": 0.01311061, "auxiliary_loss_mlp": 0.01059367, "balance_loss_clip": 1.08643055, "balance_loss_mlp": 1.04609931, "epoch": 0.04448986953646366, "flos": 18222906251520.0, "grad_norm": 2.5592079533481034, "language_loss": 0.89242375, "learning_rate": 3.9978159290513155e-06, "loss": 0.91612804, "num_input_tokens_seen": 7794820, "step": 370, "time_per_iteration": 2.6687264442443848 }, { "auxiliary_loss_clip": 0.01314967, "auxiliary_loss_mlp": 0.01047292, "balance_loss_clip": 1.08724999, "balance_loss_mlp": 1.03233123, "epoch": 0.04461011242710275, "flos": 30117920400000.0, "grad_norm": 1.780968279053194, "language_loss": 0.80012822, "learning_rate": 3.997779382962892e-06, "loss": 0.82375073, "num_input_tokens_seen": 7817705, "step": 371, "time_per_iteration": 2.8205490112304688 }, { "auxiliary_loss_clip": 0.01311867, "auxiliary_loss_mlp": 0.01056001, "balance_loss_clip": 1.08669138, "balance_loss_mlp": 1.04361486, "epoch": 0.04473035531774184, "flos": 29752529299200.0, "grad_norm": 1.7918301841376456, "language_loss": 0.73952609, "learning_rate": 3.997742533812924e-06, "loss": 0.76320481, "num_input_tokens_seen": 7840970, "step": 372, "time_per_iteration": 2.805116653442383 }, { "auxiliary_loss_clip": 0.01309313, "auxiliary_loss_mlp": 0.01054828, "balance_loss_clip": 1.08537269, "balance_loss_mlp": 1.04313397, "epoch": 0.04485059820838093, "flos": 13151565676800.0, "grad_norm": 3.3976960533425284, "language_loss": 0.92758977, "learning_rate": 3.997705381607001e-06, "loss": 0.95123124, "num_input_tokens_seen": 7857785, "step": 373, "time_per_iteration": 2.683513879776001 }, { "auxiliary_loss_clip": 0.01232293, "auxiliary_loss_mlp": 0.01045035, "balance_loss_clip": 1.07810855, "balance_loss_mlp": 1.03924131, "epoch": 0.04497084109902002, "flos": 68094209548800.0, "grad_norm": 0.9845909494433404, "language_loss": 0.60295713, "learning_rate": 3.997667926350761e-06, "loss": 0.6257304, "num_input_tokens_seen": 7916115, "step": 374, "time_per_iteration": 3.1811447143554688 }, { "auxiliary_loss_clip": 0.01231981, "auxiliary_loss_mlp": 0.01031396, "balance_loss_clip": 1.07860839, "balance_loss_mlp": 1.02603173, "epoch": 0.04509108398965911, "flos": 64342263346560.0, "grad_norm": 0.9005062009531848, "language_loss": 0.57775062, "learning_rate": 3.997630168049886e-06, "loss": 0.60038441, "num_input_tokens_seen": 7974480, "step": 375, "time_per_iteration": 3.276291847229004 }, { "auxiliary_loss_clip": 0.01315283, "auxiliary_loss_mlp": 0.01047712, "balance_loss_clip": 1.08896816, "balance_loss_mlp": 1.03400278, "epoch": 0.045211326880298205, "flos": 22271115830400.0, "grad_norm": 2.2470568470153527, "language_loss": 0.7748946, "learning_rate": 3.997592106710101e-06, "loss": 0.7985245, "num_input_tokens_seen": 7993940, "step": 376, "time_per_iteration": 2.7636470794677734 }, { "auxiliary_loss_clip": 0.01307117, "auxiliary_loss_mlp": 0.01049803, "balance_loss_clip": 1.08362293, "balance_loss_mlp": 1.0372268, "epoch": 0.045331569770937295, "flos": 32159441796480.0, "grad_norm": 2.460747961095386, "language_loss": 0.65821815, "learning_rate": 3.997553742337182e-06, "loss": 0.68178731, "num_input_tokens_seen": 8013365, "step": 377, "time_per_iteration": 2.7774670124053955 }, { "auxiliary_loss_clip": 0.01310685, "auxiliary_loss_mlp": 0.01052955, "balance_loss_clip": 1.08575654, "balance_loss_mlp": 1.03884637, "epoch": 0.045451812661576385, "flos": 22163455791360.0, "grad_norm": 3.1313931567018285, "language_loss": 0.91708398, "learning_rate": 3.997515074936949e-06, "loss": 0.94072044, "num_input_tokens_seen": 8034240, "step": 378, "time_per_iteration": 2.7017672061920166 }, { "auxiliary_loss_clip": 0.01308385, "auxiliary_loss_mlp": 0.01054446, "balance_loss_clip": 1.08645129, "balance_loss_mlp": 1.04257238, "epoch": 0.045572055552215475, "flos": 16581968305920.0, "grad_norm": 2.4086636288769863, "language_loss": 0.86998355, "learning_rate": 3.997476104515268e-06, "loss": 0.89361179, "num_input_tokens_seen": 8052430, "step": 379, "time_per_iteration": 2.669292449951172 }, { "auxiliary_loss_clip": 0.01304145, "auxiliary_loss_mlp": 0.01051311, "balance_loss_clip": 1.08414328, "balance_loss_mlp": 1.03856707, "epoch": 0.045692298442854565, "flos": 17603375448960.0, "grad_norm": 2.112666623422883, "language_loss": 0.77572364, "learning_rate": 3.9974368310780485e-06, "loss": 0.79927814, "num_input_tokens_seen": 8069605, "step": 380, "time_per_iteration": 2.7089169025421143 }, { "auxiliary_loss_clip": 0.0131717, "auxiliary_loss_mlp": 0.01063275, "balance_loss_clip": 1.08883786, "balance_loss_mlp": 1.04881513, "epoch": 0.045812541333493655, "flos": 26761098781440.0, "grad_norm": 3.1615173002974144, "language_loss": 0.74588084, "learning_rate": 3.997397254631251e-06, "loss": 0.76968527, "num_input_tokens_seen": 8090225, "step": 381, "time_per_iteration": 2.8167784214019775 }, { "auxiliary_loss_clip": 0.01221271, "auxiliary_loss_mlp": 0.01019251, "balance_loss_clip": 1.06980622, "balance_loss_mlp": 1.01422048, "epoch": 0.04593278422413275, "flos": 60250349894400.0, "grad_norm": 0.8324032917547493, "language_loss": 0.60016459, "learning_rate": 3.997357375180878e-06, "loss": 0.6225698, "num_input_tokens_seen": 8154505, "step": 382, "time_per_iteration": 4.652473211288452 }, { "auxiliary_loss_clip": 0.01306739, "auxiliary_loss_mlp": 0.01062215, "balance_loss_clip": 1.08298218, "balance_loss_mlp": 1.04863143, "epoch": 0.04605302711477184, "flos": 21799249839360.0, "grad_norm": 1.8856050327097973, "language_loss": 0.75321019, "learning_rate": 3.997317192732979e-06, "loss": 0.7768997, "num_input_tokens_seen": 8173285, "step": 383, "time_per_iteration": 2.7058777809143066 }, { "auxiliary_loss_clip": 0.01306282, "auxiliary_loss_mlp": 0.01053323, "balance_loss_clip": 1.08405638, "balance_loss_mlp": 1.04041219, "epoch": 0.04617327000541093, "flos": 19459705299840.0, "grad_norm": 1.8380701796371148, "language_loss": 0.82709926, "learning_rate": 3.99727670729365e-06, "loss": 0.85069525, "num_input_tokens_seen": 8191845, "step": 384, "time_per_iteration": 3.626425266265869 }, { "auxiliary_loss_clip": 0.01308823, "auxiliary_loss_mlp": 0.01046556, "balance_loss_clip": 1.08738935, "balance_loss_mlp": 1.03350842, "epoch": 0.04629351289605002, "flos": 25411468135680.0, "grad_norm": 1.8606068228668016, "language_loss": 0.78081447, "learning_rate": 3.997235918869033e-06, "loss": 0.80436826, "num_input_tokens_seen": 8212880, "step": 385, "time_per_iteration": 3.6179442405700684 }, { "auxiliary_loss_clip": 0.01306093, "auxiliary_loss_mlp": 0.01048206, "balance_loss_clip": 1.08484399, "balance_loss_mlp": 1.0359807, "epoch": 0.04641375578668911, "flos": 20558284813440.0, "grad_norm": 2.0906936425561584, "language_loss": 0.8266747, "learning_rate": 3.997194827465315e-06, "loss": 0.8502177, "num_input_tokens_seen": 8231475, "step": 386, "time_per_iteration": 2.7154734134674072 }, { "auxiliary_loss_clip": 0.01303595, "auxiliary_loss_mlp": 0.01053789, "balance_loss_clip": 1.08213067, "balance_loss_mlp": 1.04074192, "epoch": 0.0465339986773282, "flos": 13188661447680.0, "grad_norm": 3.124466949676271, "language_loss": 0.91347188, "learning_rate": 3.997153433088728e-06, "loss": 0.93704569, "num_input_tokens_seen": 8248600, "step": 387, "time_per_iteration": 2.8448987007141113 }, { "auxiliary_loss_clip": 0.0130874, "auxiliary_loss_mlp": 0.01043814, "balance_loss_clip": 1.08617735, "balance_loss_mlp": 1.031106, "epoch": 0.0466542415679673, "flos": 25556547168000.0, "grad_norm": 7.405105323086718, "language_loss": 0.81352603, "learning_rate": 3.997111735745554e-06, "loss": 0.83705151, "num_input_tokens_seen": 8271570, "step": 388, "time_per_iteration": 2.878868341445923 }, { "auxiliary_loss_clip": 0.01309227, "auxiliary_loss_mlp": 0.01052235, "balance_loss_clip": 1.08752251, "balance_loss_mlp": 1.03966391, "epoch": 0.04677448445860639, "flos": 22236749493120.0, "grad_norm": 2.162521827201954, "language_loss": 0.82643282, "learning_rate": 3.997069735442118e-06, "loss": 0.85004747, "num_input_tokens_seen": 8291265, "step": 389, "time_per_iteration": 2.826540231704712 }, { "auxiliary_loss_clip": 0.0130494, "auxiliary_loss_mlp": 0.0104528, "balance_loss_clip": 1.08380568, "balance_loss_mlp": 1.03313303, "epoch": 0.04689472734924548, "flos": 28147825198080.0, "grad_norm": 1.633044366981485, "language_loss": 0.80496359, "learning_rate": 3.997027432184792e-06, "loss": 0.82846582, "num_input_tokens_seen": 8315925, "step": 390, "time_per_iteration": 2.751988649368286 }, { "auxiliary_loss_clip": 0.01303805, "auxiliary_loss_mlp": 0.01065247, "balance_loss_clip": 1.08372092, "balance_loss_mlp": 1.05121648, "epoch": 0.04701497023988457, "flos": 23148952312320.0, "grad_norm": 1.8857219300919719, "language_loss": 0.89669561, "learning_rate": 3.99698482597999e-06, "loss": 0.9203862, "num_input_tokens_seen": 8333605, "step": 391, "time_per_iteration": 2.7682433128356934 }, { "auxiliary_loss_clip": 0.01211253, "auxiliary_loss_mlp": 0.01014835, "balance_loss_clip": 1.06170225, "balance_loss_mlp": 1.01020968, "epoch": 0.04713521313052366, "flos": 64827668764800.0, "grad_norm": 0.8914541389134945, "language_loss": 0.63909477, "learning_rate": 3.99694191683418e-06, "loss": 0.66135561, "num_input_tokens_seen": 8394405, "step": 392, "time_per_iteration": 3.2336103916168213 }, { "auxiliary_loss_clip": 0.01310399, "auxiliary_loss_mlp": 0.01053827, "balance_loss_clip": 1.08866251, "balance_loss_mlp": 1.04120898, "epoch": 0.047255456021162746, "flos": 18771585477120.0, "grad_norm": 2.326238302315817, "language_loss": 0.81915522, "learning_rate": 3.996898704753867e-06, "loss": 0.84279752, "num_input_tokens_seen": 8412355, "step": 393, "time_per_iteration": 2.7069883346557617 }, { "auxiliary_loss_clip": 0.01306452, "auxiliary_loss_mlp": 0.01052727, "balance_loss_clip": 1.0845263, "balance_loss_mlp": 1.03974509, "epoch": 0.04737569891180184, "flos": 22053820504320.0, "grad_norm": 2.2502587395445355, "language_loss": 0.87290937, "learning_rate": 3.996855189745609e-06, "loss": 0.89650112, "num_input_tokens_seen": 8431620, "step": 394, "time_per_iteration": 2.705063581466675 }, { "auxiliary_loss_clip": 0.0130144, "auxiliary_loss_mlp": 0.0105094, "balance_loss_clip": 1.08146942, "balance_loss_mlp": 1.03866744, "epoch": 0.04749594180244093, "flos": 29057370410880.0, "grad_norm": 2.0431820819673723, "language_loss": 0.92716944, "learning_rate": 3.996811371816007e-06, "loss": 0.95069331, "num_input_tokens_seen": 8454045, "step": 395, "time_per_iteration": 2.7023375034332275 }, { "auxiliary_loss_clip": 0.01303237, "auxiliary_loss_mlp": 0.01051454, "balance_loss_clip": 1.08438778, "balance_loss_mlp": 1.03881752, "epoch": 0.04761618469308002, "flos": 35112268172160.0, "grad_norm": 1.946860163461794, "language_loss": 0.78279829, "learning_rate": 3.996767250971707e-06, "loss": 0.80634522, "num_input_tokens_seen": 8476785, "step": 396, "time_per_iteration": 2.8344976902008057 }, { "auxiliary_loss_clip": 0.01301604, "auxiliary_loss_mlp": 0.01041184, "balance_loss_clip": 1.08351159, "balance_loss_mlp": 1.02909052, "epoch": 0.04773642758371911, "flos": 25630702796160.0, "grad_norm": 2.0423103192180565, "language_loss": 0.86945927, "learning_rate": 3.996722827219403e-06, "loss": 0.89288712, "num_input_tokens_seen": 8498400, "step": 397, "time_per_iteration": 2.7192254066467285 }, { "auxiliary_loss_clip": 0.01310648, "auxiliary_loss_mlp": 0.01053124, "balance_loss_clip": 1.08771288, "balance_loss_mlp": 1.04044056, "epoch": 0.0478566704743582, "flos": 20631506688000.0, "grad_norm": 4.045947074419687, "language_loss": 0.82541114, "learning_rate": 3.996678100565833e-06, "loss": 0.84904885, "num_input_tokens_seen": 8517455, "step": 398, "time_per_iteration": 2.669356107711792 }, { "auxiliary_loss_clip": 0.01301837, "auxiliary_loss_mlp": 0.01049948, "balance_loss_clip": 1.08380139, "balance_loss_mlp": 1.03848028, "epoch": 0.04797691336499729, "flos": 18835721210880.0, "grad_norm": 2.1652997572709913, "language_loss": 0.88584268, "learning_rate": 3.996633071017783e-06, "loss": 0.90936053, "num_input_tokens_seen": 8534085, "step": 399, "time_per_iteration": 2.671660900115967 }, { "auxiliary_loss_clip": 0.01302829, "auxiliary_loss_mlp": 0.01046403, "balance_loss_clip": 1.08495438, "balance_loss_mlp": 1.03416657, "epoch": 0.04809715625563638, "flos": 21099673578240.0, "grad_norm": 2.4506082369021764, "language_loss": 0.81972653, "learning_rate": 3.996587738582084e-06, "loss": 0.8432188, "num_input_tokens_seen": 8550885, "step": 400, "time_per_iteration": 2.7137138843536377 }, { "auxiliary_loss_clip": 0.01297889, "auxiliary_loss_mlp": 0.01049318, "balance_loss_clip": 1.08087182, "balance_loss_mlp": 1.03826094, "epoch": 0.04821739914627548, "flos": 23805650712960.0, "grad_norm": 2.47300786539538, "language_loss": 0.86476421, "learning_rate": 3.9965421032656115e-06, "loss": 0.88823628, "num_input_tokens_seen": 8570815, "step": 401, "time_per_iteration": 2.6535096168518066 }, { "auxiliary_loss_clip": 0.01302258, "auxiliary_loss_mlp": 0.01050644, "balance_loss_clip": 1.08380568, "balance_loss_mlp": 1.03796649, "epoch": 0.04833764203691457, "flos": 22200587475840.0, "grad_norm": 4.7235229006104005, "language_loss": 0.9414205, "learning_rate": 3.99649616507529e-06, "loss": 0.96494949, "num_input_tokens_seen": 8589910, "step": 402, "time_per_iteration": 2.6241989135742188 }, { "auxiliary_loss_clip": 0.0120153, "auxiliary_loss_mlp": 0.01007226, "balance_loss_clip": 1.05565655, "balance_loss_mlp": 1.00262499, "epoch": 0.04845788492755366, "flos": 65904376896000.0, "grad_norm": 0.8915303817992773, "language_loss": 0.63136744, "learning_rate": 3.996449924018088e-06, "loss": 0.65345502, "num_input_tokens_seen": 8650370, "step": 403, "time_per_iteration": 3.1479663848876953 }, { "auxiliary_loss_clip": 0.01301998, "auxiliary_loss_mlp": 0.01056076, "balance_loss_clip": 1.08253109, "balance_loss_mlp": 1.04414892, "epoch": 0.04857812781819275, "flos": 19281301424640.0, "grad_norm": 1.898238300085869, "language_loss": 0.797598, "learning_rate": 3.99640338010102e-06, "loss": 0.82117873, "num_input_tokens_seen": 8669475, "step": 404, "time_per_iteration": 2.6144745349884033 }, { "auxiliary_loss_clip": 0.01297295, "auxiliary_loss_mlp": 0.01048506, "balance_loss_clip": 1.07949197, "balance_loss_mlp": 1.03670478, "epoch": 0.04869837070883184, "flos": 24062376193920.0, "grad_norm": 3.852340539743921, "language_loss": 0.7872867, "learning_rate": 3.996356533331146e-06, "loss": 0.8107447, "num_input_tokens_seen": 8691345, "step": 405, "time_per_iteration": 2.659836769104004 }, { "auxiliary_loss_clip": 0.01307093, "auxiliary_loss_mlp": 0.01051027, "balance_loss_clip": 1.08239794, "balance_loss_mlp": 1.03844428, "epoch": 0.04881861359947093, "flos": 25187169657600.0, "grad_norm": 2.391331719155586, "language_loss": 0.61526769, "learning_rate": 3.996309383715573e-06, "loss": 0.6388489, "num_input_tokens_seen": 8710125, "step": 406, "time_per_iteration": 2.6600568294525146 }, { "auxiliary_loss_clip": 0.01301223, "auxiliary_loss_mlp": 0.0104623, "balance_loss_clip": 1.08176827, "balance_loss_mlp": 1.0333792, "epoch": 0.048938856490110025, "flos": 16362913213440.0, "grad_norm": 2.0598395705412864, "language_loss": 0.73861945, "learning_rate": 3.996261931261454e-06, "loss": 0.76209402, "num_input_tokens_seen": 8728705, "step": 407, "time_per_iteration": 2.648393392562866 }, { "auxiliary_loss_clip": 0.01305805, "auxiliary_loss_mlp": 0.01039101, "balance_loss_clip": 1.08628082, "balance_loss_mlp": 1.02610755, "epoch": 0.049059099380749115, "flos": 29895094379520.0, "grad_norm": 2.889779043895425, "language_loss": 0.8691566, "learning_rate": 3.996214175975987e-06, "loss": 0.89260566, "num_input_tokens_seen": 8749225, "step": 408, "time_per_iteration": 3.714207649230957 }, { "auxiliary_loss_clip": 0.01305654, "auxiliary_loss_mlp": 0.01053654, "balance_loss_clip": 1.08632755, "balance_loss_mlp": 1.04067779, "epoch": 0.049179342271388204, "flos": 35918858027520.0, "grad_norm": 2.0991015664106967, "language_loss": 0.79124856, "learning_rate": 3.996166117866417e-06, "loss": 0.81484163, "num_input_tokens_seen": 8771160, "step": 409, "time_per_iteration": 2.750786066055298 }, { "auxiliary_loss_clip": 0.01292757, "auxiliary_loss_mlp": 0.01058086, "balance_loss_clip": 1.0776751, "balance_loss_mlp": 1.04617131, "epoch": 0.049299585162027294, "flos": 14611226659200.0, "grad_norm": 1.96144513077882, "language_loss": 0.86918318, "learning_rate": 3.996117756940035e-06, "loss": 0.89269161, "num_input_tokens_seen": 8787845, "step": 410, "time_per_iteration": 2.665847063064575 }, { "auxiliary_loss_clip": 0.01297615, "auxiliary_loss_mlp": 0.01047372, "balance_loss_clip": 1.08052766, "balance_loss_mlp": 1.03495693, "epoch": 0.049419828052666384, "flos": 19567939956480.0, "grad_norm": 2.733931872251258, "language_loss": 0.97773069, "learning_rate": 3.996069093204175e-06, "loss": 1.00118065, "num_input_tokens_seen": 8803805, "step": 411, "time_per_iteration": 4.369104862213135 }, { "auxiliary_loss_clip": 0.01306674, "auxiliary_loss_mlp": 0.01047387, "balance_loss_clip": 1.08559394, "balance_loss_mlp": 1.03552532, "epoch": 0.049540070943305474, "flos": 13659916907520.0, "grad_norm": 2.5774324687069883, "language_loss": 0.88571924, "learning_rate": 3.996020126666221e-06, "loss": 0.9092598, "num_input_tokens_seen": 8820785, "step": 412, "time_per_iteration": 2.689587116241455 }, { "auxiliary_loss_clip": 0.01302033, "auxiliary_loss_mlp": 0.01061536, "balance_loss_clip": 1.08200908, "balance_loss_mlp": 1.04900694, "epoch": 0.04966031383394457, "flos": 21832035978240.0, "grad_norm": 1.9776795277651764, "language_loss": 0.82185179, "learning_rate": 3.995970857333601e-06, "loss": 0.84548748, "num_input_tokens_seen": 8841195, "step": 413, "time_per_iteration": 3.5829367637634277 }, { "auxiliary_loss_clip": 0.01303043, "auxiliary_loss_mlp": 0.01051007, "balance_loss_clip": 1.08392942, "balance_loss_mlp": 1.03863931, "epoch": 0.04978055672458366, "flos": 28618793349120.0, "grad_norm": 2.775821736277416, "language_loss": 0.79643464, "learning_rate": 3.995921285213789e-06, "loss": 0.81997514, "num_input_tokens_seen": 8861455, "step": 414, "time_per_iteration": 2.691967725753784 }, { "auxiliary_loss_clip": 0.01296874, "auxiliary_loss_mlp": 0.01048621, "balance_loss_clip": 1.0816195, "balance_loss_mlp": 1.03675365, "epoch": 0.04990079961522275, "flos": 19828220883840.0, "grad_norm": 2.5643306727287474, "language_loss": 0.8101064, "learning_rate": 3.995871410314305e-06, "loss": 0.83356136, "num_input_tokens_seen": 8880015, "step": 415, "time_per_iteration": 2.6686508655548096 }, { "auxiliary_loss_clip": 0.0124711, "auxiliary_loss_mlp": 0.01003843, "balance_loss_clip": 1.05279446, "balance_loss_mlp": 0.99933642, "epoch": 0.05002104250586184, "flos": 62735045293440.0, "grad_norm": 0.9044784405595301, "language_loss": 0.59629679, "learning_rate": 3.995821232642714e-06, "loss": 0.61880636, "num_input_tokens_seen": 8938420, "step": 416, "time_per_iteration": 3.3196654319763184 }, { "auxiliary_loss_clip": 0.0134773, "auxiliary_loss_mlp": 0.0104705, "balance_loss_clip": 1.08431292, "balance_loss_mlp": 1.03458691, "epoch": 0.05014128539650093, "flos": 27928518710400.0, "grad_norm": 6.105268390417708, "language_loss": 0.82708848, "learning_rate": 3.995770752206629e-06, "loss": 0.85103631, "num_input_tokens_seen": 8959495, "step": 417, "time_per_iteration": 2.7947630882263184 }, { "auxiliary_loss_clip": 0.01298887, "auxiliary_loss_mlp": 0.01047068, "balance_loss_clip": 1.08207691, "balance_loss_mlp": 1.03473592, "epoch": 0.05026152828714002, "flos": 17705576620800.0, "grad_norm": 2.1086486365012584, "language_loss": 0.97189355, "learning_rate": 3.995719969013709e-06, "loss": 0.9953531, "num_input_tokens_seen": 8976675, "step": 418, "time_per_iteration": 2.62555193901062 }, { "auxiliary_loss_clip": 0.0139337, "auxiliary_loss_mlp": 0.01038511, "balance_loss_clip": 1.07808256, "balance_loss_mlp": 1.0254221, "epoch": 0.05038177117777912, "flos": 19133277477120.0, "grad_norm": 3.3586376107421123, "language_loss": 0.86010313, "learning_rate": 3.995668883071655e-06, "loss": 0.88442194, "num_input_tokens_seen": 8992900, "step": 419, "time_per_iteration": 2.7219204902648926 }, { "auxiliary_loss_clip": 0.01302879, "auxiliary_loss_mlp": 0.01046034, "balance_loss_clip": 1.08350658, "balance_loss_mlp": 1.03411317, "epoch": 0.050502014068418206, "flos": 20667704618880.0, "grad_norm": 2.7030881434622533, "language_loss": 0.91208112, "learning_rate": 3.995617494388219e-06, "loss": 0.93557024, "num_input_tokens_seen": 9011020, "step": 420, "time_per_iteration": 2.6995599269866943 }, { "auxiliary_loss_clip": 0.01394773, "auxiliary_loss_mlp": 0.01051334, "balance_loss_clip": 1.07597494, "balance_loss_mlp": 1.03862, "epoch": 0.050622256959057296, "flos": 21361103740800.0, "grad_norm": 1.9127472907637553, "language_loss": 0.80652636, "learning_rate": 3.995565802971196e-06, "loss": 0.83098745, "num_input_tokens_seen": 9030995, "step": 421, "time_per_iteration": 2.7999706268310547 }, { "auxiliary_loss_clip": 0.01396202, "auxiliary_loss_mlp": 0.01042345, "balance_loss_clip": 1.07747638, "balance_loss_mlp": 1.03039432, "epoch": 0.050742499849696386, "flos": 27673588909440.0, "grad_norm": 1.8412309890190819, "language_loss": 0.67615855, "learning_rate": 3.995513808828427e-06, "loss": 0.700544, "num_input_tokens_seen": 9053790, "step": 422, "time_per_iteration": 2.8105592727661133 }, { "auxiliary_loss_clip": 0.01392158, "auxiliary_loss_mlp": 0.01050121, "balance_loss_clip": 1.07526004, "balance_loss_mlp": 1.03870106, "epoch": 0.050862742740335476, "flos": 19865999013120.0, "grad_norm": 1.8630515470147893, "language_loss": 0.76652235, "learning_rate": 3.9954615119678e-06, "loss": 0.79094517, "num_input_tokens_seen": 9072345, "step": 423, "time_per_iteration": 2.7337486743927 }, { "auxiliary_loss_clip": 0.01336708, "auxiliary_loss_mlp": 0.01045544, "balance_loss_clip": 1.07565689, "balance_loss_mlp": 1.03350353, "epoch": 0.050982985630974566, "flos": 22085098272000.0, "grad_norm": 1.9652330731101257, "language_loss": 0.80783916, "learning_rate": 3.995408912397248e-06, "loss": 0.8316617, "num_input_tokens_seen": 9090240, "step": 424, "time_per_iteration": 2.736022472381592 }, { "auxiliary_loss_clip": 0.01399785, "auxiliary_loss_mlp": 0.01049029, "balance_loss_clip": 1.08094311, "balance_loss_mlp": 1.03670907, "epoch": 0.05110322852161366, "flos": 20740962407040.0, "grad_norm": 2.2983619402252344, "language_loss": 0.93526286, "learning_rate": 3.99535601012475e-06, "loss": 0.95975101, "num_input_tokens_seen": 9105570, "step": 425, "time_per_iteration": 2.7143404483795166 }, { "auxiliary_loss_clip": 0.01436953, "auxiliary_loss_mlp": 0.02597362, "balance_loss_clip": 1.07382202, "balance_loss_mlp": 1.0003773, "epoch": 0.05122347141225275, "flos": 28547295327360.0, "grad_norm": 1.6750916803133802, "language_loss": 0.75875306, "learning_rate": 3.995302805158333e-06, "loss": 0.79909623, "num_input_tokens_seen": 9128225, "step": 426, "time_per_iteration": 2.8402016162872314 }, { "auxiliary_loss_clip": 0.01390382, "auxiliary_loss_mlp": 0.01054293, "balance_loss_clip": 1.07460713, "balance_loss_mlp": 1.04048872, "epoch": 0.05134371430289184, "flos": 19722679747200.0, "grad_norm": 1.7337709849598613, "language_loss": 0.83759689, "learning_rate": 3.9952492975060665e-06, "loss": 0.86204368, "num_input_tokens_seen": 9148295, "step": 427, "time_per_iteration": 2.780186176300049 }, { "auxiliary_loss_clip": 0.01341761, "auxiliary_loss_mlp": 0.010447, "balance_loss_clip": 1.07744753, "balance_loss_mlp": 1.0318495, "epoch": 0.05146395719353093, "flos": 34458945649920.0, "grad_norm": 6.024163243568287, "language_loss": 0.85691655, "learning_rate": 3.995195487176067e-06, "loss": 0.88078111, "num_input_tokens_seen": 9168525, "step": 428, "time_per_iteration": 2.801499843597412 }, { "auxiliary_loss_clip": 0.0129923, "auxiliary_loss_mlp": 0.01038534, "balance_loss_clip": 1.08121443, "balance_loss_mlp": 1.02676749, "epoch": 0.05158420008417002, "flos": 21760286561280.0, "grad_norm": 2.1027453238342937, "language_loss": 0.85790116, "learning_rate": 3.995141374176499e-06, "loss": 0.88127881, "num_input_tokens_seen": 9186920, "step": 429, "time_per_iteration": 2.669226884841919 }, { "auxiliary_loss_clip": 0.01342972, "auxiliary_loss_mlp": 0.0255204, "balance_loss_clip": 1.04525113, "balance_loss_mlp": 1.0001421, "epoch": 0.05170444297480911, "flos": 72553956226560.0, "grad_norm": 0.8714722011856999, "language_loss": 0.63073862, "learning_rate": 3.995086958515572e-06, "loss": 0.6696887, "num_input_tokens_seen": 9244940, "step": 430, "time_per_iteration": 3.4014806747436523 }, { "auxiliary_loss_clip": 0.01196332, "auxiliary_loss_mlp": 0.02551689, "balance_loss_clip": 1.0551213, "balance_loss_mlp": 1.00011826, "epoch": 0.05182468586544821, "flos": 62416159326720.0, "grad_norm": 0.8564554793007768, "language_loss": 0.59923142, "learning_rate": 3.995032240201538e-06, "loss": 0.63671154, "num_input_tokens_seen": 9307335, "step": 431, "time_per_iteration": 3.2329769134521484 }, { "auxiliary_loss_clip": 0.01302182, "auxiliary_loss_mlp": 0.01008717, "balance_loss_clip": 1.05245745, "balance_loss_mlp": 1.00435352, "epoch": 0.0519449287560873, "flos": 41225989432320.0, "grad_norm": 0.9414971054946931, "language_loss": 0.63128418, "learning_rate": 3.9949772192427e-06, "loss": 0.6543932, "num_input_tokens_seen": 9353960, "step": 432, "time_per_iteration": 3.008549451828003 }, { "auxiliary_loss_clip": 0.01394536, "auxiliary_loss_mlp": 0.01058504, "balance_loss_clip": 1.0753963, "balance_loss_mlp": 1.046386, "epoch": 0.05206517164672639, "flos": 17494530261120.0, "grad_norm": 1.8747065905286657, "language_loss": 0.79839253, "learning_rate": 3.994921895647405e-06, "loss": 0.82292295, "num_input_tokens_seen": 9372130, "step": 433, "time_per_iteration": 2.7775728702545166 }, { "auxiliary_loss_clip": 0.0119145, "auxiliary_loss_mlp": 0.01008196, "balance_loss_clip": 1.05189729, "balance_loss_mlp": 1.00397635, "epoch": 0.05218541453736548, "flos": 64002762973440.0, "grad_norm": 0.8407922208504447, "language_loss": 0.55314511, "learning_rate": 3.994866269424043e-06, "loss": 0.57514155, "num_input_tokens_seen": 9428500, "step": 434, "time_per_iteration": 4.159822463989258 }, { "auxiliary_loss_clip": 0.01537566, "auxiliary_loss_mlp": 0.01049466, "balance_loss_clip": 1.06101596, "balance_loss_mlp": 1.0377593, "epoch": 0.05230565742800457, "flos": 19317319787520.0, "grad_norm": 2.4313177801304517, "language_loss": 0.78066647, "learning_rate": 3.9948103405810545e-06, "loss": 0.80653679, "num_input_tokens_seen": 9447450, "step": 435, "time_per_iteration": 2.8475990295410156 }, { "auxiliary_loss_clip": 0.01419809, "auxiliary_loss_mlp": 0.01041245, "balance_loss_clip": 1.06692147, "balance_loss_mlp": 1.02863836, "epoch": 0.05242590031864366, "flos": 25298636538240.0, "grad_norm": 2.015964201738096, "language_loss": 0.86227846, "learning_rate": 3.994754109126923e-06, "loss": 0.88688898, "num_input_tokens_seen": 9468945, "step": 436, "time_per_iteration": 2.873560667037964 }, { "auxiliary_loss_clip": 0.01531019, "auxiliary_loss_mlp": 0.01040433, "balance_loss_clip": 1.06792402, "balance_loss_mlp": 1.02915621, "epoch": 0.052546143209282754, "flos": 26211629456640.0, "grad_norm": 1.82253190827884, "language_loss": 0.93660522, "learning_rate": 3.994697575070181e-06, "loss": 0.96231973, "num_input_tokens_seen": 9488405, "step": 437, "time_per_iteration": 4.026853561401367 }, { "auxiliary_loss_clip": 0.01389368, "auxiliary_loss_mlp": 0.01057408, "balance_loss_clip": 1.07791817, "balance_loss_mlp": 1.04564834, "epoch": 0.052666386099921844, "flos": 22158140578560.0, "grad_norm": 2.0829977259153076, "language_loss": 0.91440421, "learning_rate": 3.994640738419402e-06, "loss": 0.93887198, "num_input_tokens_seen": 9507780, "step": 438, "time_per_iteration": 3.8544328212738037 }, { "auxiliary_loss_clip": 0.01340916, "auxiliary_loss_mlp": 0.01046391, "balance_loss_clip": 1.0772779, "balance_loss_mlp": 1.03474414, "epoch": 0.052786628990560934, "flos": 23881817502720.0, "grad_norm": 1.9444326144038642, "language_loss": 0.80792636, "learning_rate": 3.9945835991832075e-06, "loss": 0.83179939, "num_input_tokens_seen": 9529665, "step": 439, "time_per_iteration": 3.6504065990448 }, { "auxiliary_loss_clip": 0.01294665, "auxiliary_loss_mlp": 0.01040761, "balance_loss_clip": 1.08163095, "balance_loss_mlp": 1.02906668, "epoch": 0.052906871881200024, "flos": 24605021934720.0, "grad_norm": 3.983418448540554, "language_loss": 0.92991138, "learning_rate": 3.994526157370268e-06, "loss": 0.95326567, "num_input_tokens_seen": 9548280, "step": 440, "time_per_iteration": 2.757430076599121 }, { "auxiliary_loss_clip": 0.01298935, "auxiliary_loss_mlp": 0.01016342, "balance_loss_clip": 1.04920578, "balance_loss_mlp": 1.01205051, "epoch": 0.053027114771839114, "flos": 56461631143680.0, "grad_norm": 0.8932555456730389, "language_loss": 0.59291697, "learning_rate": 3.994468412989296e-06, "loss": 0.61606967, "num_input_tokens_seen": 9609690, "step": 441, "time_per_iteration": 3.386768102645874 }, { "auxiliary_loss_clip": 0.01366553, "auxiliary_loss_mlp": 0.01042994, "balance_loss_clip": 1.06777358, "balance_loss_mlp": 1.03172231, "epoch": 0.053147357662478203, "flos": 17311098481920.0, "grad_norm": 2.158043807001391, "language_loss": 0.92804426, "learning_rate": 3.994410366049052e-06, "loss": 0.95213974, "num_input_tokens_seen": 9627550, "step": 442, "time_per_iteration": 2.7907450199127197 }, { "auxiliary_loss_clip": 0.0133812, "auxiliary_loss_mlp": 0.01042536, "balance_loss_clip": 1.07762647, "balance_loss_mlp": 1.03061485, "epoch": 0.0532676005531173, "flos": 17164977955200.0, "grad_norm": 2.9533715110317895, "language_loss": 0.82980627, "learning_rate": 3.994352016558341e-06, "loss": 0.85361284, "num_input_tokens_seen": 9644855, "step": 443, "time_per_iteration": 2.822378635406494 }, { "auxiliary_loss_clip": 0.01334747, "auxiliary_loss_mlp": 0.01048665, "balance_loss_clip": 1.07721233, "balance_loss_mlp": 1.03736424, "epoch": 0.05338784344375639, "flos": 27819960831360.0, "grad_norm": 1.8758465701518077, "language_loss": 0.73982, "learning_rate": 3.994293364526014e-06, "loss": 0.76365411, "num_input_tokens_seen": 9665740, "step": 444, "time_per_iteration": 2.7393693923950195 }, { "auxiliary_loss_clip": 0.01384783, "auxiliary_loss_mlp": 0.01046374, "balance_loss_clip": 1.07551718, "balance_loss_mlp": 1.03366649, "epoch": 0.05350808633439548, "flos": 21507691144320.0, "grad_norm": 2.471644391263642, "language_loss": 0.84749568, "learning_rate": 3.99423440996097e-06, "loss": 0.87180722, "num_input_tokens_seen": 9685280, "step": 445, "time_per_iteration": 2.788076162338257 }, { "auxiliary_loss_clip": 0.01398057, "auxiliary_loss_mlp": 0.01049958, "balance_loss_clip": 1.08285117, "balance_loss_mlp": 1.03889513, "epoch": 0.05362832922503457, "flos": 20084299920000.0, "grad_norm": 2.9121968788971593, "language_loss": 0.81373096, "learning_rate": 3.994175152872152e-06, "loss": 0.83821106, "num_input_tokens_seen": 9704365, "step": 446, "time_per_iteration": 2.7593419551849365 }, { "auxiliary_loss_clip": 0.01341608, "auxiliary_loss_mlp": 0.01049559, "balance_loss_clip": 1.07508349, "balance_loss_mlp": 1.03784704, "epoch": 0.05374857211567366, "flos": 26137222433280.0, "grad_norm": 2.39896849108379, "language_loss": 0.78849542, "learning_rate": 3.994115593268548e-06, "loss": 0.81240708, "num_input_tokens_seen": 9724145, "step": 447, "time_per_iteration": 2.788229465484619 }, { "auxiliary_loss_clip": 0.01296644, "auxiliary_loss_mlp": 0.01046651, "balance_loss_clip": 1.08274269, "balance_loss_mlp": 1.03444433, "epoch": 0.05386881500631275, "flos": 27486817165440.0, "grad_norm": 2.026108188713631, "language_loss": 0.82509351, "learning_rate": 3.994055731159195e-06, "loss": 0.84852648, "num_input_tokens_seen": 9741615, "step": 448, "time_per_iteration": 2.7141990661621094 }, { "auxiliary_loss_clip": 0.01342537, "auxiliary_loss_mlp": 0.01056244, "balance_loss_clip": 1.08059764, "balance_loss_mlp": 1.04547346, "epoch": 0.053989057896951846, "flos": 23585087249280.0, "grad_norm": 1.949897700982153, "language_loss": 0.86970377, "learning_rate": 3.993995566553172e-06, "loss": 0.89369166, "num_input_tokens_seen": 9760580, "step": 449, "time_per_iteration": 2.78975248336792 }, { "auxiliary_loss_clip": 0.01377909, "auxiliary_loss_mlp": 0.0104596, "balance_loss_clip": 1.06948781, "balance_loss_mlp": 1.03417647, "epoch": 0.054109300787590936, "flos": 25228862369280.0, "grad_norm": 1.6208445232214201, "language_loss": 0.77037418, "learning_rate": 3.993935099459607e-06, "loss": 0.79461288, "num_input_tokens_seen": 9782195, "step": 450, "time_per_iteration": 2.7688324451446533 }, { "auxiliary_loss_clip": 0.01285195, "auxiliary_loss_mlp": 0.01041679, "balance_loss_clip": 1.07643306, "balance_loss_mlp": 1.03062201, "epoch": 0.054229543678230026, "flos": 23841525421440.0, "grad_norm": 2.006765565759573, "language_loss": 0.74087507, "learning_rate": 3.993874329887673e-06, "loss": 0.76414382, "num_input_tokens_seen": 9800850, "step": 451, "time_per_iteration": 2.743246555328369 }, { "auxiliary_loss_clip": 0.01340068, "auxiliary_loss_mlp": 0.01042565, "balance_loss_clip": 1.07964218, "balance_loss_mlp": 1.03103185, "epoch": 0.054349786568869116, "flos": 16320933192960.0, "grad_norm": 2.4071449574114636, "language_loss": 0.86846983, "learning_rate": 3.993813257846589e-06, "loss": 0.8922962, "num_input_tokens_seen": 9817605, "step": 452, "time_per_iteration": 2.733025074005127 }, { "auxiliary_loss_clip": 0.0134501, "auxiliary_loss_mlp": 0.01042654, "balance_loss_clip": 1.08063376, "balance_loss_mlp": 1.03079247, "epoch": 0.054470029459508205, "flos": 18660729127680.0, "grad_norm": 2.2943209594250926, "language_loss": 0.92818052, "learning_rate": 3.993751883345619e-06, "loss": 0.95205712, "num_input_tokens_seen": 9835965, "step": 453, "time_per_iteration": 2.735626459121704 }, { "auxiliary_loss_clip": 0.01384718, "auxiliary_loss_mlp": 0.01041515, "balance_loss_clip": 1.07490015, "balance_loss_mlp": 1.02978504, "epoch": 0.054590272350147295, "flos": 17785298856960.0, "grad_norm": 2.4483402481110863, "language_loss": 0.87466812, "learning_rate": 3.993690206394073e-06, "loss": 0.89893043, "num_input_tokens_seen": 9852265, "step": 454, "time_per_iteration": 2.780975103378296 }, { "auxiliary_loss_clip": 0.01393323, "auxiliary_loss_mlp": 0.01061858, "balance_loss_clip": 1.07783532, "balance_loss_mlp": 1.05010366, "epoch": 0.054710515240786385, "flos": 17785945301760.0, "grad_norm": 3.149653630663544, "language_loss": 0.87755013, "learning_rate": 3.993628227001307e-06, "loss": 0.90210193, "num_input_tokens_seen": 9870465, "step": 455, "time_per_iteration": 2.7956645488739014 }, { "auxiliary_loss_clip": 0.01389681, "auxiliary_loss_mlp": 0.01046148, "balance_loss_clip": 1.07415664, "balance_loss_mlp": 1.03465593, "epoch": 0.05483075813142548, "flos": 48210900180480.0, "grad_norm": 2.065949763334928, "language_loss": 0.71102798, "learning_rate": 3.993565945176726e-06, "loss": 0.73538625, "num_input_tokens_seen": 9891490, "step": 456, "time_per_iteration": 3.014958381652832 }, { "auxiliary_loss_clip": 0.01390092, "auxiliary_loss_mlp": 0.01043874, "balance_loss_clip": 1.07963181, "balance_loss_mlp": 1.03284085, "epoch": 0.05495100102206457, "flos": 19682244011520.0, "grad_norm": 3.712574980724963, "language_loss": 0.83908689, "learning_rate": 3.993503360929776e-06, "loss": 0.86342657, "num_input_tokens_seen": 9910375, "step": 457, "time_per_iteration": 2.722482204437256 }, { "auxiliary_loss_clip": 0.01579178, "auxiliary_loss_mlp": 0.01046989, "balance_loss_clip": 1.0671066, "balance_loss_mlp": 1.0338757, "epoch": 0.05507124391270366, "flos": 26360048453760.0, "grad_norm": 1.8679573968181775, "language_loss": 0.81036043, "learning_rate": 3.99344047426995e-06, "loss": 0.83662212, "num_input_tokens_seen": 9931635, "step": 458, "time_per_iteration": 3.264531135559082 }, { "auxiliary_loss_clip": 0.01482147, "auxiliary_loss_mlp": 0.01050426, "balance_loss_clip": 1.06989145, "balance_loss_mlp": 1.03733075, "epoch": 0.05519148680334275, "flos": 22601314581120.0, "grad_norm": 2.2804849320614453, "language_loss": 0.93377417, "learning_rate": 3.993377285206789e-06, "loss": 0.95909989, "num_input_tokens_seen": 9951420, "step": 459, "time_per_iteration": 4.1829447746276855 }, { "auxiliary_loss_clip": 0.01410458, "auxiliary_loss_mlp": 0.0104385, "balance_loss_clip": 1.06527376, "balance_loss_mlp": 1.03159487, "epoch": 0.05531172969398184, "flos": 40552519380480.0, "grad_norm": 2.0233703894585577, "language_loss": 0.86709929, "learning_rate": 3.99331379374988e-06, "loss": 0.89164239, "num_input_tokens_seen": 9975025, "step": 460, "time_per_iteration": 3.1145362854003906 }, { "auxiliary_loss_clip": 0.01383109, "auxiliary_loss_mlp": 0.01045914, "balance_loss_clip": 1.0669744, "balance_loss_mlp": 1.03407037, "epoch": 0.05543197258462093, "flos": 23477894087040.0, "grad_norm": 1.9690926836623683, "language_loss": 0.80230081, "learning_rate": 3.993249999908852e-06, "loss": 0.82659101, "num_input_tokens_seen": 9995175, "step": 461, "time_per_iteration": 2.9279301166534424 }, { "auxiliary_loss_clip": 0.01290579, "auxiliary_loss_mlp": 0.0104518, "balance_loss_clip": 1.07807219, "balance_loss_mlp": 1.03398657, "epoch": 0.05555221547526003, "flos": 18624603024000.0, "grad_norm": 6.29333754228556, "language_loss": 0.87430573, "learning_rate": 3.993185903693384e-06, "loss": 0.8976633, "num_input_tokens_seen": 10011975, "step": 462, "time_per_iteration": 2.8366644382476807 }, { "auxiliary_loss_clip": 0.01385726, "auxiliary_loss_mlp": 0.01040281, "balance_loss_clip": 1.07429326, "balance_loss_mlp": 1.02858675, "epoch": 0.05567245836589912, "flos": 23587098410880.0, "grad_norm": 2.73463022450099, "language_loss": 0.82378399, "learning_rate": 3.9931215051131995e-06, "loss": 0.8480441, "num_input_tokens_seen": 10032620, "step": 463, "time_per_iteration": 4.6471028327941895 }, { "auxiliary_loss_clip": 0.01395373, "auxiliary_loss_mlp": 0.0104778, "balance_loss_clip": 1.07382774, "balance_loss_mlp": 1.03647256, "epoch": 0.05579270125653821, "flos": 27746667129600.0, "grad_norm": 2.0088925792448338, "language_loss": 0.80269474, "learning_rate": 3.993056804178068e-06, "loss": 0.82712626, "num_input_tokens_seen": 10054165, "step": 464, "time_per_iteration": 2.782848834991455 }, { "auxiliary_loss_clip": 0.014867, "auxiliary_loss_mlp": 0.01048236, "balance_loss_clip": 1.07388723, "balance_loss_mlp": 1.03719091, "epoch": 0.0559129441471773, "flos": 27014161075200.0, "grad_norm": 2.123317855320974, "language_loss": 0.84404427, "learning_rate": 3.992991800897803e-06, "loss": 0.86939359, "num_input_tokens_seen": 10073970, "step": 465, "time_per_iteration": 3.7424652576446533 }, { "auxiliary_loss_clip": 0.01291301, "auxiliary_loss_mlp": 0.01046513, "balance_loss_clip": 1.07906985, "balance_loss_mlp": 1.03517032, "epoch": 0.05603318703781639, "flos": 15229787794560.0, "grad_norm": 2.5701531458329567, "language_loss": 0.89960182, "learning_rate": 3.9929264952822665e-06, "loss": 0.92297995, "num_input_tokens_seen": 10091505, "step": 466, "time_per_iteration": 2.628903388977051 }, { "auxiliary_loss_clip": 0.01341056, "auxiliary_loss_mlp": 0.01044613, "balance_loss_clip": 1.07647586, "balance_loss_mlp": 1.03302598, "epoch": 0.05615342992845548, "flos": 22266482976000.0, "grad_norm": 3.948731737540374, "language_loss": 0.88446224, "learning_rate": 3.992860887341366e-06, "loss": 0.908319, "num_input_tokens_seen": 10109675, "step": 467, "time_per_iteration": 2.730656147003174 }, { "auxiliary_loss_clip": 0.01421514, "auxiliary_loss_mlp": 0.01041206, "balance_loss_clip": 1.06796205, "balance_loss_mlp": 1.03010714, "epoch": 0.056273672819094574, "flos": 23584979508480.0, "grad_norm": 2.5359429892669207, "language_loss": 0.81432533, "learning_rate": 3.992794977085052e-06, "loss": 0.83895254, "num_input_tokens_seen": 10127675, "step": 468, "time_per_iteration": 2.821216344833374 }, { "auxiliary_loss_clip": 0.01437074, "auxiliary_loss_mlp": 0.01041455, "balance_loss_clip": 1.07284641, "balance_loss_mlp": 1.02937365, "epoch": 0.056393915709733664, "flos": 19858708552320.0, "grad_norm": 2.141113394266978, "language_loss": 0.84916866, "learning_rate": 3.992728764523326e-06, "loss": 0.873954, "num_input_tokens_seen": 10146620, "step": 469, "time_per_iteration": 2.899751663208008 }, { "auxiliary_loss_clip": 0.0138808, "auxiliary_loss_mlp": 0.01043418, "balance_loss_clip": 1.07883132, "balance_loss_mlp": 1.0317595, "epoch": 0.05651415860037275, "flos": 22163779013760.0, "grad_norm": 1.7982611965769124, "language_loss": 0.80770737, "learning_rate": 3.99266224966623e-06, "loss": 0.83202237, "num_input_tokens_seen": 10167535, "step": 470, "time_per_iteration": 2.778958320617676 }, { "auxiliary_loss_clip": 0.01373724, "auxiliary_loss_mlp": 0.01046484, "balance_loss_clip": 1.07433283, "balance_loss_mlp": 1.03508735, "epoch": 0.05663440149101184, "flos": 19463548055040.0, "grad_norm": 2.172366940929174, "language_loss": 0.88026309, "learning_rate": 3.992595432523855e-06, "loss": 0.90446514, "num_input_tokens_seen": 10184825, "step": 471, "time_per_iteration": 2.794980049133301 }, { "auxiliary_loss_clip": 0.01423649, "auxiliary_loss_mlp": 0.01044927, "balance_loss_clip": 1.07352173, "balance_loss_mlp": 1.03448462, "epoch": 0.05675464438165093, "flos": 22670226823680.0, "grad_norm": 2.1498525691929626, "language_loss": 0.85855269, "learning_rate": 3.992528313106338e-06, "loss": 0.88323843, "num_input_tokens_seen": 10203025, "step": 472, "time_per_iteration": 2.8148341178894043 }, { "auxiliary_loss_clip": 0.01289423, "auxiliary_loss_mlp": 0.02589687, "balance_loss_clip": 1.08206439, "balance_loss_mlp": 1.00020552, "epoch": 0.05687488727229002, "flos": 16901177495040.0, "grad_norm": 2.468596090875915, "language_loss": 0.82414341, "learning_rate": 3.9924608914238595e-06, "loss": 0.86293447, "num_input_tokens_seen": 10218020, "step": 473, "time_per_iteration": 2.6749465465545654 }, { "auxiliary_loss_clip": 0.01338408, "auxiliary_loss_mlp": 0.01045723, "balance_loss_clip": 1.07976532, "balance_loss_mlp": 1.03454757, "epoch": 0.05699513016292912, "flos": 29168980945920.0, "grad_norm": 2.181273164539161, "language_loss": 0.83925503, "learning_rate": 3.992393167486648e-06, "loss": 0.86309636, "num_input_tokens_seen": 10237170, "step": 474, "time_per_iteration": 2.8562870025634766 }, { "auxiliary_loss_clip": 0.012906, "auxiliary_loss_mlp": 0.01044744, "balance_loss_clip": 1.08134127, "balance_loss_mlp": 1.03306186, "epoch": 0.05711537305356821, "flos": 18916197632640.0, "grad_norm": 3.0803289129230014, "language_loss": 0.80859125, "learning_rate": 3.992325141304977e-06, "loss": 0.8319447, "num_input_tokens_seen": 10255125, "step": 475, "time_per_iteration": 2.679352045059204 }, { "auxiliary_loss_clip": 0.0141644, "auxiliary_loss_mlp": 0.01039482, "balance_loss_clip": 1.06840444, "balance_loss_mlp": 1.02889669, "epoch": 0.0572356159442073, "flos": 26758979879040.0, "grad_norm": 2.236982242254772, "language_loss": 0.86659515, "learning_rate": 3.992256812889166e-06, "loss": 0.89115435, "num_input_tokens_seen": 10271230, "step": 476, "time_per_iteration": 2.824808120727539 }, { "auxiliary_loss_clip": 0.01291539, "auxiliary_loss_mlp": 0.01053718, "balance_loss_clip": 1.0834769, "balance_loss_mlp": 1.04252386, "epoch": 0.05735585883484639, "flos": 35116146840960.0, "grad_norm": 2.3015394024671534, "language_loss": 0.76655966, "learning_rate": 3.992188182249582e-06, "loss": 0.79001224, "num_input_tokens_seen": 10293125, "step": 477, "time_per_iteration": 2.855139970779419 }, { "auxiliary_loss_clip": 0.01389415, "auxiliary_loss_mlp": 0.01046667, "balance_loss_clip": 1.08053946, "balance_loss_mlp": 1.03440642, "epoch": 0.05747610172548548, "flos": 18734381965440.0, "grad_norm": 2.9884243216618267, "language_loss": 0.90674686, "learning_rate": 3.992119249396633e-06, "loss": 0.93110776, "num_input_tokens_seen": 10311810, "step": 478, "time_per_iteration": 2.724443197250366 }, { "auxiliary_loss_clip": 0.0137713, "auxiliary_loss_mlp": 0.02592626, "balance_loss_clip": 1.07214403, "balance_loss_mlp": 1.00026309, "epoch": 0.05759634461612457, "flos": 27964752554880.0, "grad_norm": 1.9109725623678235, "language_loss": 0.82141632, "learning_rate": 3.992050014340778e-06, "loss": 0.86111391, "num_input_tokens_seen": 10332165, "step": 479, "time_per_iteration": 2.8482606410980225 }, { "auxiliary_loss_clip": 0.01234104, "auxiliary_loss_mlp": 0.01012394, "balance_loss_clip": 1.05112767, "balance_loss_mlp": 1.0084362, "epoch": 0.057716587506763666, "flos": 69292009405440.0, "grad_norm": 0.831231748248587, "language_loss": 0.54982752, "learning_rate": 3.99198047709252e-06, "loss": 0.57229251, "num_input_tokens_seen": 10393685, "step": 480, "time_per_iteration": 3.3505139350891113 }, { "auxiliary_loss_clip": 0.01422743, "auxiliary_loss_mlp": 0.01051564, "balance_loss_clip": 1.06864119, "balance_loss_mlp": 1.03974462, "epoch": 0.057836830397402755, "flos": 25009196745600.0, "grad_norm": 1.785208231763675, "language_loss": 0.78725946, "learning_rate": 3.991910637662408e-06, "loss": 0.81200254, "num_input_tokens_seen": 10413975, "step": 481, "time_per_iteration": 2.808192253112793 }, { "auxiliary_loss_clip": 0.0128449, "auxiliary_loss_mlp": 0.01047578, "balance_loss_clip": 1.0779866, "balance_loss_mlp": 1.03674781, "epoch": 0.057957073288041845, "flos": 25593894334080.0, "grad_norm": 1.821964399784007, "language_loss": 0.80953604, "learning_rate": 3.9918404960610355e-06, "loss": 0.83285677, "num_input_tokens_seen": 10433005, "step": 482, "time_per_iteration": 2.778062105178833 }, { "auxiliary_loss_clip": 0.01340844, "auxiliary_loss_mlp": 0.01039604, "balance_loss_clip": 1.07803798, "balance_loss_mlp": 1.02808785, "epoch": 0.058077316178680935, "flos": 20777411733120.0, "grad_norm": 2.4630790653793975, "language_loss": 0.77438307, "learning_rate": 3.991770052299043e-06, "loss": 0.79818749, "num_input_tokens_seen": 10451235, "step": 483, "time_per_iteration": 2.742734432220459 }, { "auxiliary_loss_clip": 0.01388433, "auxiliary_loss_mlp": 0.01049127, "balance_loss_clip": 1.07291698, "balance_loss_mlp": 1.03792763, "epoch": 0.058197559069320025, "flos": 18916484941440.0, "grad_norm": 2.4682943087630167, "language_loss": 0.87775803, "learning_rate": 3.991699306387118e-06, "loss": 0.9021337, "num_input_tokens_seen": 10469705, "step": 484, "time_per_iteration": 2.773005485534668 }, { "auxiliary_loss_clip": 0.01338138, "auxiliary_loss_mlp": 0.0104326, "balance_loss_clip": 1.07489038, "balance_loss_mlp": 1.03263259, "epoch": 0.058317801959959115, "flos": 24863327614080.0, "grad_norm": 1.927129271754468, "language_loss": 0.78184831, "learning_rate": 3.991628258335991e-06, "loss": 0.80566227, "num_input_tokens_seen": 10491910, "step": 485, "time_per_iteration": 3.9529383182525635 }, { "auxiliary_loss_clip": 0.01421832, "auxiliary_loss_mlp": 0.01033799, "balance_loss_clip": 1.06895614, "balance_loss_mlp": 1.02245069, "epoch": 0.05843804485059821, "flos": 23257977068160.0, "grad_norm": 2.9437093109542296, "language_loss": 0.8764087, "learning_rate": 3.991556908156442e-06, "loss": 0.90096503, "num_input_tokens_seen": 10508435, "step": 486, "time_per_iteration": 2.85764479637146 }, { "auxiliary_loss_clip": 0.01386267, "auxiliary_loss_mlp": 0.01042802, "balance_loss_clip": 1.07248592, "balance_loss_mlp": 1.03169811, "epoch": 0.0585582877412373, "flos": 23150532510720.0, "grad_norm": 4.502778052358142, "language_loss": 0.88107145, "learning_rate": 3.9914852558592914e-06, "loss": 0.90536219, "num_input_tokens_seen": 10529485, "step": 487, "time_per_iteration": 2.782132863998413 }, { "auxiliary_loss_clip": 0.01336898, "auxiliary_loss_mlp": 0.01045366, "balance_loss_clip": 1.07810688, "balance_loss_mlp": 1.03389239, "epoch": 0.05867853063187639, "flos": 23506406507520.0, "grad_norm": 4.525075121576098, "language_loss": 0.8113057, "learning_rate": 3.991413301455413e-06, "loss": 0.83512831, "num_input_tokens_seen": 10545935, "step": 488, "time_per_iteration": 2.7843244075775146 }, { "auxiliary_loss_clip": 0.01372749, "auxiliary_loss_mlp": 0.01040144, "balance_loss_clip": 1.07131362, "balance_loss_mlp": 1.02951002, "epoch": 0.05879877352251548, "flos": 29495803818240.0, "grad_norm": 2.3229675753938794, "language_loss": 0.77898836, "learning_rate": 3.991341044955719e-06, "loss": 0.80311728, "num_input_tokens_seen": 10565690, "step": 489, "time_per_iteration": 4.61077356338501 }, { "auxiliary_loss_clip": 0.01333344, "auxiliary_loss_mlp": 0.02594116, "balance_loss_clip": 1.07560611, "balance_loss_mlp": 1.00033402, "epoch": 0.05891901641315457, "flos": 20157485880960.0, "grad_norm": 1.9999151645334488, "language_loss": 0.81527877, "learning_rate": 3.991268486371172e-06, "loss": 0.85455334, "num_input_tokens_seen": 10584245, "step": 490, "time_per_iteration": 3.6616294384002686 }, { "auxiliary_loss_clip": 0.01377256, "auxiliary_loss_mlp": 0.01043719, "balance_loss_clip": 1.07121789, "balance_loss_mlp": 1.03199482, "epoch": 0.05903925930379366, "flos": 24644200694400.0, "grad_norm": 3.334826920784108, "language_loss": 0.87661719, "learning_rate": 3.991195625712779e-06, "loss": 0.90082693, "num_input_tokens_seen": 10601210, "step": 491, "time_per_iteration": 2.7935218811035156 }, { "auxiliary_loss_clip": 0.0128739, "auxiliary_loss_mlp": 0.01053026, "balance_loss_clip": 1.08054471, "balance_loss_mlp": 1.04252362, "epoch": 0.05915950219443276, "flos": 21250391045760.0, "grad_norm": 2.461401774917469, "language_loss": 0.8168065, "learning_rate": 3.991122462991592e-06, "loss": 0.84021062, "num_input_tokens_seen": 10620730, "step": 492, "time_per_iteration": 2.7021284103393555 }, { "auxiliary_loss_clip": 0.0128876, "auxiliary_loss_mlp": 0.01055389, "balance_loss_clip": 1.08047962, "balance_loss_mlp": 1.04312801, "epoch": 0.05927974508507185, "flos": 9902727319680.0, "grad_norm": 3.19515233785663, "language_loss": 0.81496066, "learning_rate": 3.991048998218712e-06, "loss": 0.83840215, "num_input_tokens_seen": 10634035, "step": 493, "time_per_iteration": 2.613429307937622 }, { "auxiliary_loss_clip": 0.01337661, "auxiliary_loss_mlp": 0.01051205, "balance_loss_clip": 1.07701743, "balance_loss_mlp": 1.03971326, "epoch": 0.05939998797571094, "flos": 18259499232000.0, "grad_norm": 2.1911182378063323, "language_loss": 0.7665711, "learning_rate": 3.990975231405281e-06, "loss": 0.79045975, "num_input_tokens_seen": 10652485, "step": 494, "time_per_iteration": 2.7370147705078125 }, { "auxiliary_loss_clip": 0.01332136, "auxiliary_loss_mlp": 0.01051308, "balance_loss_clip": 1.07468891, "balance_loss_mlp": 1.03963721, "epoch": 0.05952023086635003, "flos": 28256598558720.0, "grad_norm": 2.135056577901825, "language_loss": 0.78709203, "learning_rate": 3.990901162562491e-06, "loss": 0.8109265, "num_input_tokens_seen": 10673175, "step": 495, "time_per_iteration": 2.7803773880004883 }, { "auxiliary_loss_clip": 0.01430342, "auxiliary_loss_mlp": 0.02592104, "balance_loss_clip": 1.06937909, "balance_loss_mlp": 1.00029397, "epoch": 0.05964047375698912, "flos": 14902498045440.0, "grad_norm": 2.4378212281429685, "language_loss": 0.90592283, "learning_rate": 3.9908267917015765e-06, "loss": 0.94614732, "num_input_tokens_seen": 10691235, "step": 496, "time_per_iteration": 2.7880215644836426 }, { "auxiliary_loss_clip": 0.01323754, "auxiliary_loss_mlp": 0.01049831, "balance_loss_clip": 1.07179236, "balance_loss_mlp": 1.037642, "epoch": 0.059760716647628206, "flos": 23185581206400.0, "grad_norm": 2.4235067996303847, "language_loss": 0.9322477, "learning_rate": 3.990752118833821e-06, "loss": 0.95598358, "num_input_tokens_seen": 10708675, "step": 497, "time_per_iteration": 2.7675631046295166 }, { "auxiliary_loss_clip": 0.01285998, "auxiliary_loss_mlp": 0.01052016, "balance_loss_clip": 1.0789845, "balance_loss_mlp": 1.04069102, "epoch": 0.0598809595382673, "flos": 22746968231040.0, "grad_norm": 2.2308288588271448, "language_loss": 0.77986097, "learning_rate": 3.990677143970553e-06, "loss": 0.80324113, "num_input_tokens_seen": 10729485, "step": 498, "time_per_iteration": 2.667393684387207 }, { "auxiliary_loss_clip": 0.01430072, "auxiliary_loss_mlp": 0.01043177, "balance_loss_clip": 1.07685113, "balance_loss_mlp": 1.03130984, "epoch": 0.06000120242890639, "flos": 22127221946880.0, "grad_norm": 2.4246089819123253, "language_loss": 0.81347179, "learning_rate": 3.990601867123144e-06, "loss": 0.83820426, "num_input_tokens_seen": 10749210, "step": 499, "time_per_iteration": 2.8088667392730713 }, { "auxiliary_loss_clip": 0.01485477, "auxiliary_loss_mlp": 0.0104259, "balance_loss_clip": 1.07154405, "balance_loss_mlp": 1.03090715, "epoch": 0.06012144531954548, "flos": 19171773878400.0, "grad_norm": 2.7322123837250634, "language_loss": 0.85156941, "learning_rate": 3.990526288303014e-06, "loss": 0.87685007, "num_input_tokens_seen": 10768000, "step": 500, "time_per_iteration": 2.7381744384765625 }, { "auxiliary_loss_clip": 0.01384167, "auxiliary_loss_mlp": 0.02587236, "balance_loss_clip": 1.07754529, "balance_loss_mlp": 1.00030696, "epoch": 0.06024168821018457, "flos": 22783345729920.0, "grad_norm": 1.7152141506577199, "language_loss": 0.90749979, "learning_rate": 3.9904504075216295e-06, "loss": 0.94721377, "num_input_tokens_seen": 10788760, "step": 501, "time_per_iteration": 2.785727024078369 }, { "auxiliary_loss_clip": 0.01442569, "auxiliary_loss_mlp": 0.01046587, "balance_loss_clip": 1.07199991, "balance_loss_mlp": 1.03532732, "epoch": 0.06036193110082366, "flos": 18770687637120.0, "grad_norm": 2.479196328665803, "language_loss": 0.93807387, "learning_rate": 3.990374224790501e-06, "loss": 0.96296537, "num_input_tokens_seen": 10806965, "step": 502, "time_per_iteration": 2.7510111331939697 }, { "auxiliary_loss_clip": 0.01388776, "auxiliary_loss_mlp": 0.01049471, "balance_loss_clip": 1.07992327, "balance_loss_mlp": 1.03713894, "epoch": 0.06048217399146275, "flos": 17201570935680.0, "grad_norm": 2.1543029842506236, "language_loss": 0.7078101, "learning_rate": 3.990297740121185e-06, "loss": 0.73219258, "num_input_tokens_seen": 10824900, "step": 503, "time_per_iteration": 2.8235971927642822 }, { "auxiliary_loss_clip": 0.01336011, "auxiliary_loss_mlp": 0.02588922, "balance_loss_clip": 1.0780803, "balance_loss_mlp": 1.00034034, "epoch": 0.06060241688210185, "flos": 24024131187840.0, "grad_norm": 1.8713905346154884, "language_loss": 0.78052092, "learning_rate": 3.990220953525284e-06, "loss": 0.81977022, "num_input_tokens_seen": 10842010, "step": 504, "time_per_iteration": 2.714449405670166 }, { "auxiliary_loss_clip": 0.01371065, "auxiliary_loss_mlp": 0.01044412, "balance_loss_clip": 1.07048845, "balance_loss_mlp": 1.03305721, "epoch": 0.06072265977274094, "flos": 14611190745600.0, "grad_norm": 2.436693492620822, "language_loss": 0.74442101, "learning_rate": 3.9901438650144465e-06, "loss": 0.76857579, "num_input_tokens_seen": 10858260, "step": 505, "time_per_iteration": 2.8620800971984863 }, { "auxiliary_loss_clip": 0.01322448, "auxiliary_loss_mlp": 0.01045818, "balance_loss_clip": 1.07269657, "balance_loss_mlp": 1.03424287, "epoch": 0.06084290266338003, "flos": 20558284813440.0, "grad_norm": 3.1939837615412157, "language_loss": 0.91763836, "learning_rate": 3.990066474600367e-06, "loss": 0.94132102, "num_input_tokens_seen": 10876230, "step": 506, "time_per_iteration": 2.7474286556243896 }, { "auxiliary_loss_clip": 0.01320437, "auxiliary_loss_mlp": 0.0103795, "balance_loss_clip": 1.06985033, "balance_loss_mlp": 1.02667284, "epoch": 0.06096314555401912, "flos": 22309217182080.0, "grad_norm": 1.9259156873359928, "language_loss": 0.67850578, "learning_rate": 3.989988782294786e-06, "loss": 0.70208967, "num_input_tokens_seen": 10896320, "step": 507, "time_per_iteration": 2.7614176273345947 }, { "auxiliary_loss_clip": 0.01418214, "auxiliary_loss_mlp": 0.01042148, "balance_loss_clip": 1.06969333, "balance_loss_mlp": 1.03081703, "epoch": 0.06108338844465821, "flos": 19131374056320.0, "grad_norm": 2.0349497899977242, "language_loss": 0.94901383, "learning_rate": 3.989910788109489e-06, "loss": 0.97361743, "num_input_tokens_seen": 10912970, "step": 508, "time_per_iteration": 2.794086456298828 }, { "auxiliary_loss_clip": 0.01428153, "auxiliary_loss_mlp": 0.01048314, "balance_loss_clip": 1.07287478, "balance_loss_mlp": 1.03617287, "epoch": 0.0612036313352973, "flos": 33584018169600.0, "grad_norm": 2.1477411826776485, "language_loss": 0.74773109, "learning_rate": 3.989832492056307e-06, "loss": 0.77249575, "num_input_tokens_seen": 10933995, "step": 509, "time_per_iteration": 2.873129367828369 }, { "auxiliary_loss_clip": 0.0133014, "auxiliary_loss_mlp": 0.01051488, "balance_loss_clip": 1.07802272, "balance_loss_mlp": 1.03936481, "epoch": 0.06132387422593639, "flos": 27490552179840.0, "grad_norm": 2.3494585717691403, "language_loss": 0.80999517, "learning_rate": 3.989753894147119e-06, "loss": 0.83381152, "num_input_tokens_seen": 10954120, "step": 510, "time_per_iteration": 2.802497386932373 }, { "auxiliary_loss_clip": 0.01331325, "auxiliary_loss_mlp": 0.01043877, "balance_loss_clip": 1.08285379, "balance_loss_mlp": 1.03346395, "epoch": 0.061444117116575485, "flos": 25885057979520.0, "grad_norm": 1.8059239012009642, "language_loss": 0.80135679, "learning_rate": 3.989674994393846e-06, "loss": 0.82510877, "num_input_tokens_seen": 10973595, "step": 511, "time_per_iteration": 3.693615436553955 }, { "auxiliary_loss_clip": 0.01329435, "auxiliary_loss_mlp": 0.01049822, "balance_loss_clip": 1.07935524, "balance_loss_mlp": 1.03767443, "epoch": 0.061564360007214575, "flos": 28512031150080.0, "grad_norm": 2.4528722970029935, "language_loss": 0.94220328, "learning_rate": 3.98959579280846e-06, "loss": 0.96599591, "num_input_tokens_seen": 10991995, "step": 512, "time_per_iteration": 2.7505152225494385 }, { "auxiliary_loss_clip": 0.01466094, "auxiliary_loss_mlp": 0.01040726, "balance_loss_clip": 1.07189322, "balance_loss_mlp": 1.02960968, "epoch": 0.061684602897853665, "flos": 12094355652480.0, "grad_norm": 2.001891067819318, "language_loss": 0.82848465, "learning_rate": 3.989516289402973e-06, "loss": 0.85355282, "num_input_tokens_seen": 11007625, "step": 513, "time_per_iteration": 2.8366262912750244 }, { "auxiliary_loss_clip": 0.01504798, "auxiliary_loss_mlp": 0.01042325, "balance_loss_clip": 1.05627012, "balance_loss_mlp": 1.03072596, "epoch": 0.061804845788492754, "flos": 19532639865600.0, "grad_norm": 4.434682594424167, "language_loss": 0.80288541, "learning_rate": 3.989436484189447e-06, "loss": 0.82835662, "num_input_tokens_seen": 11025570, "step": 514, "time_per_iteration": 4.645785570144653 }, { "auxiliary_loss_clip": 0.01332659, "auxiliary_loss_mlp": 0.01043157, "balance_loss_clip": 1.07402205, "balance_loss_mlp": 1.03117657, "epoch": 0.061925088679131844, "flos": 15341111020800.0, "grad_norm": 4.065112550920332, "language_loss": 0.80581224, "learning_rate": 3.9893563771799885e-06, "loss": 0.82957041, "num_input_tokens_seen": 11042045, "step": 515, "time_per_iteration": 2.6870930194854736 }, { "auxiliary_loss_clip": 0.01285072, "auxiliary_loss_mlp": 0.01060984, "balance_loss_clip": 1.07961071, "balance_loss_mlp": 1.04918861, "epoch": 0.062045331569770934, "flos": 25919927107200.0, "grad_norm": 2.146904990666255, "language_loss": 0.86439621, "learning_rate": 3.989275968386749e-06, "loss": 0.88785678, "num_input_tokens_seen": 11059955, "step": 516, "time_per_iteration": 2.8156635761260986 }, { "auxiliary_loss_clip": 0.01375638, "auxiliary_loss_mlp": 0.01037372, "balance_loss_clip": 1.07051945, "balance_loss_mlp": 1.02710235, "epoch": 0.06216557446041003, "flos": 28110621686400.0, "grad_norm": 2.1978842227881494, "language_loss": 0.76982653, "learning_rate": 3.989195257821926e-06, "loss": 0.79395664, "num_input_tokens_seen": 11078440, "step": 517, "time_per_iteration": 3.6766786575317383 }, { "auxiliary_loss_clip": 0.01375151, "auxiliary_loss_mlp": 0.01054189, "balance_loss_clip": 1.07570076, "balance_loss_mlp": 1.04158807, "epoch": 0.06228581735104912, "flos": 23478181395840.0, "grad_norm": 2.1642201517837263, "language_loss": 0.84413886, "learning_rate": 3.989114245497765e-06, "loss": 0.86843228, "num_input_tokens_seen": 11098240, "step": 518, "time_per_iteration": 2.7432920932769775 }, { "auxiliary_loss_clip": 0.01330062, "auxiliary_loss_mlp": 0.01052381, "balance_loss_clip": 1.0717392, "balance_loss_mlp": 1.04171157, "epoch": 0.06240606024168821, "flos": 15195205975680.0, "grad_norm": 2.8716325533816605, "language_loss": 0.94697893, "learning_rate": 3.989032931426554e-06, "loss": 0.97080332, "num_input_tokens_seen": 11115395, "step": 519, "time_per_iteration": 2.6574881076812744 }, { "auxiliary_loss_clip": 0.01378085, "auxiliary_loss_mlp": 0.01042819, "balance_loss_clip": 1.07664013, "balance_loss_mlp": 1.03232884, "epoch": 0.06252630313232731, "flos": 20631829910400.0, "grad_norm": 1.935553595238754, "language_loss": 0.86917228, "learning_rate": 3.9889513156206295e-06, "loss": 0.8933813, "num_input_tokens_seen": 11134835, "step": 520, "time_per_iteration": 2.7724766731262207 }, { "auxiliary_loss_clip": 0.01430354, "auxiliary_loss_mlp": 0.01042185, "balance_loss_clip": 1.0720861, "balance_loss_mlp": 1.03035378, "epoch": 0.06264654602296639, "flos": 20778058177920.0, "grad_norm": 3.7777106651528074, "language_loss": 0.7383883, "learning_rate": 3.988869398092371e-06, "loss": 0.76311362, "num_input_tokens_seen": 11154745, "step": 521, "time_per_iteration": 2.799110174179077 }, { "auxiliary_loss_clip": 0.01377958, "auxiliary_loss_mlp": 0.01051011, "balance_loss_clip": 1.07433844, "balance_loss_mlp": 1.03897083, "epoch": 0.06276678891360549, "flos": 29605798241280.0, "grad_norm": 2.194106022234836, "language_loss": 0.78713441, "learning_rate": 3.988787178854206e-06, "loss": 0.81142402, "num_input_tokens_seen": 11174280, "step": 522, "time_per_iteration": 2.744763135910034 }, { "auxiliary_loss_clip": 0.01284873, "auxiliary_loss_mlp": 0.01039092, "balance_loss_clip": 1.0806365, "balance_loss_mlp": 1.02803564, "epoch": 0.06288703180424457, "flos": 22126288193280.0, "grad_norm": 2.1323494005652774, "language_loss": 0.87759608, "learning_rate": 3.988704657918608e-06, "loss": 0.90083575, "num_input_tokens_seen": 11193340, "step": 523, "time_per_iteration": 2.698155403137207 }, { "auxiliary_loss_clip": 0.01332438, "auxiliary_loss_mlp": 0.01042295, "balance_loss_clip": 1.07992041, "balance_loss_mlp": 1.03207326, "epoch": 0.06300727469488367, "flos": 14976689587200.0, "grad_norm": 2.7193814959760765, "language_loss": 0.79990411, "learning_rate": 3.988621835298094e-06, "loss": 0.82365149, "num_input_tokens_seen": 11210555, "step": 524, "time_per_iteration": 2.6493992805480957 }, { "auxiliary_loss_clip": 0.01278493, "auxiliary_loss_mlp": 0.01044815, "balance_loss_clip": 1.0786624, "balance_loss_mlp": 1.03411043, "epoch": 0.06312751758552275, "flos": 24535391420160.0, "grad_norm": 1.9434929368163127, "language_loss": 0.9168281, "learning_rate": 3.988538711005229e-06, "loss": 0.94006121, "num_input_tokens_seen": 11230010, "step": 525, "time_per_iteration": 2.767162322998047 }, { "auxiliary_loss_clip": 0.01326543, "auxiliary_loss_mlp": 0.01042389, "balance_loss_clip": 1.07694626, "balance_loss_mlp": 1.03083193, "epoch": 0.06324776047616185, "flos": 21507008785920.0, "grad_norm": 2.4724727137926794, "language_loss": 0.88501745, "learning_rate": 3.988455285052622e-06, "loss": 0.90870678, "num_input_tokens_seen": 11246190, "step": 526, "time_per_iteration": 2.7261240482330322 }, { "auxiliary_loss_clip": 0.01327678, "auxiliary_loss_mlp": 0.0104665, "balance_loss_clip": 1.07648516, "balance_loss_mlp": 1.03585017, "epoch": 0.06336800336680094, "flos": 21688034353920.0, "grad_norm": 2.1771662745987452, "language_loss": 0.83844984, "learning_rate": 3.98837155745293e-06, "loss": 0.86219311, "num_input_tokens_seen": 11264230, "step": 527, "time_per_iteration": 2.7480103969573975 }, { "auxiliary_loss_clip": 0.01329406, "auxiliary_loss_mlp": 0.01042957, "balance_loss_clip": 1.07641196, "balance_loss_mlp": 1.03230548, "epoch": 0.06348824625744003, "flos": 19500895221120.0, "grad_norm": 2.441567102108424, "language_loss": 0.76065409, "learning_rate": 3.988287528218854e-06, "loss": 0.78437769, "num_input_tokens_seen": 11283015, "step": 528, "time_per_iteration": 2.6685051918029785 }, { "auxiliary_loss_clip": 0.01327573, "auxiliary_loss_mlp": 0.01034965, "balance_loss_clip": 1.0787344, "balance_loss_mlp": 1.0246594, "epoch": 0.06360848914807912, "flos": 15481233976320.0, "grad_norm": 2.057891123789396, "language_loss": 0.90468502, "learning_rate": 3.98820319736314e-06, "loss": 0.92831039, "num_input_tokens_seen": 11299630, "step": 529, "time_per_iteration": 2.7294294834136963 }, { "auxiliary_loss_clip": 0.0143093, "auxiliary_loss_mlp": 0.010437, "balance_loss_clip": 1.07174361, "balance_loss_mlp": 1.03217864, "epoch": 0.0637287320387182, "flos": 20593369422720.0, "grad_norm": 1.86635856665021, "language_loss": 0.85371739, "learning_rate": 3.988118564898582e-06, "loss": 0.87846375, "num_input_tokens_seen": 11319170, "step": 530, "time_per_iteration": 2.738189220428467 }, { "auxiliary_loss_clip": 0.0142237, "auxiliary_loss_mlp": 0.02593444, "balance_loss_clip": 1.07156849, "balance_loss_mlp": 1.00047576, "epoch": 0.0638489749293573, "flos": 17412222245760.0, "grad_norm": 2.3759302907147792, "language_loss": 0.89304161, "learning_rate": 3.988033630838019e-06, "loss": 0.9331997, "num_input_tokens_seen": 11333210, "step": 531, "time_per_iteration": 2.7597556114196777 }, { "auxiliary_loss_clip": 0.01339918, "auxiliary_loss_mlp": 0.01051675, "balance_loss_clip": 1.07938445, "balance_loss_mlp": 1.0406003, "epoch": 0.0639692178199964, "flos": 23807661874560.0, "grad_norm": 1.7461418145768164, "language_loss": 0.88191944, "learning_rate": 3.987948395194334e-06, "loss": 0.90583539, "num_input_tokens_seen": 11355590, "step": 532, "time_per_iteration": 2.7791481018066406 }, { "auxiliary_loss_clip": 0.01325163, "auxiliary_loss_mlp": 0.01035621, "balance_loss_clip": 1.07355201, "balance_loss_mlp": 1.02403438, "epoch": 0.06408946071063548, "flos": 18477225521280.0, "grad_norm": 2.273154722351999, "language_loss": 0.76750576, "learning_rate": 3.987862857980458e-06, "loss": 0.79111362, "num_input_tokens_seen": 11371535, "step": 533, "time_per_iteration": 2.6336472034454346 }, { "auxiliary_loss_clip": 0.01430264, "auxiliary_loss_mlp": 0.01045312, "balance_loss_clip": 1.07289875, "balance_loss_mlp": 1.03416002, "epoch": 0.06420970360127458, "flos": 27162220936320.0, "grad_norm": 2.4488289136278563, "language_loss": 0.76991928, "learning_rate": 3.987777019209368e-06, "loss": 0.79467499, "num_input_tokens_seen": 11392050, "step": 534, "time_per_iteration": 2.832770586013794 }, { "auxiliary_loss_clip": 0.01280086, "auxiliary_loss_mlp": 0.01045136, "balance_loss_clip": 1.07737648, "balance_loss_mlp": 1.03350079, "epoch": 0.06432994649191366, "flos": 23659673840640.0, "grad_norm": 2.1449124327835545, "language_loss": 0.81161028, "learning_rate": 3.987690878894084e-06, "loss": 0.83486247, "num_input_tokens_seen": 11411765, "step": 535, "time_per_iteration": 2.656219244003296 }, { "auxiliary_loss_clip": 0.01387869, "auxiliary_loss_mlp": 0.01047259, "balance_loss_clip": 1.07502842, "balance_loss_mlp": 1.03486753, "epoch": 0.06445018938255276, "flos": 23403953940480.0, "grad_norm": 3.0061928758697025, "language_loss": 0.85024822, "learning_rate": 3.987604437047673e-06, "loss": 0.87459946, "num_input_tokens_seen": 11431565, "step": 536, "time_per_iteration": 2.7694480419158936 }, { "auxiliary_loss_clip": 0.01330637, "auxiliary_loss_mlp": 0.01054386, "balance_loss_clip": 1.07715964, "balance_loss_mlp": 1.04384208, "epoch": 0.06457043227319184, "flos": 19646692525440.0, "grad_norm": 2.2007826894850866, "language_loss": 0.77364302, "learning_rate": 3.987517693683251e-06, "loss": 0.79749328, "num_input_tokens_seen": 11450140, "step": 537, "time_per_iteration": 3.570066452026367 }, { "auxiliary_loss_clip": 0.01373933, "auxiliary_loss_mlp": 0.01042407, "balance_loss_clip": 1.07787561, "balance_loss_mlp": 1.03118348, "epoch": 0.06469067516383094, "flos": 16978744915200.0, "grad_norm": 2.791769252946017, "language_loss": 0.96036196, "learning_rate": 3.9874306488139745e-06, "loss": 0.98452526, "num_input_tokens_seen": 11465400, "step": 538, "time_per_iteration": 2.701536178588867 }, { "auxiliary_loss_clip": 0.01423505, "auxiliary_loss_mlp": 0.01045905, "balance_loss_clip": 1.07269454, "balance_loss_mlp": 1.03521752, "epoch": 0.06481091805447003, "flos": 23296401642240.0, "grad_norm": 2.050594570437705, "language_loss": 0.88076347, "learning_rate": 3.987343302453049e-06, "loss": 0.90545756, "num_input_tokens_seen": 11486675, "step": 539, "time_per_iteration": 2.840214729309082 }, { "auxiliary_loss_clip": 0.01382282, "auxiliary_loss_mlp": 0.01047498, "balance_loss_clip": 1.07679033, "balance_loss_mlp": 1.03601861, "epoch": 0.06493116094510912, "flos": 29172356824320.0, "grad_norm": 1.6440472646862636, "language_loss": 0.82530093, "learning_rate": 3.987255654613724e-06, "loss": 0.84959877, "num_input_tokens_seen": 11510440, "step": 540, "time_per_iteration": 4.506173133850098 }, { "auxiliary_loss_clip": 0.01422225, "auxiliary_loss_mlp": 0.01043608, "balance_loss_clip": 1.06882322, "balance_loss_mlp": 1.03243184, "epoch": 0.06505140383574821, "flos": 19865065259520.0, "grad_norm": 2.710527587529789, "language_loss": 0.70056903, "learning_rate": 3.987167705309296e-06, "loss": 0.72522736, "num_input_tokens_seen": 11529715, "step": 541, "time_per_iteration": 2.756824016571045 }, { "auxiliary_loss_clip": 0.0133355, "auxiliary_loss_mlp": 0.02586979, "balance_loss_clip": 1.07855594, "balance_loss_mlp": 1.00040293, "epoch": 0.0651716467263873, "flos": 17924703540480.0, "grad_norm": 1.9473578397455387, "language_loss": 0.95363802, "learning_rate": 3.987079454553108e-06, "loss": 0.99284327, "num_input_tokens_seen": 11547665, "step": 542, "time_per_iteration": 2.724221706390381 }, { "auxiliary_loss_clip": 0.01426009, "auxiliary_loss_mlp": 0.01044853, "balance_loss_clip": 1.07419801, "balance_loss_mlp": 1.0328846, "epoch": 0.0652918896170264, "flos": 20842840356480.0, "grad_norm": 2.6866994754450335, "language_loss": 0.91175014, "learning_rate": 3.986990902358546e-06, "loss": 0.93645871, "num_input_tokens_seen": 11564605, "step": 543, "time_per_iteration": 3.602112293243408 }, { "auxiliary_loss_clip": 0.0133351, "auxiliary_loss_mlp": 0.01045581, "balance_loss_clip": 1.07617486, "balance_loss_mlp": 1.03428602, "epoch": 0.06541213250766549, "flos": 21872507627520.0, "grad_norm": 2.088561049797057, "language_loss": 0.93313849, "learning_rate": 3.986902048739045e-06, "loss": 0.95692945, "num_input_tokens_seen": 11584550, "step": 544, "time_per_iteration": 2.8568389415740967 }, { "auxiliary_loss_clip": 0.01385083, "auxiliary_loss_mlp": 0.01053188, "balance_loss_clip": 1.07593298, "balance_loss_mlp": 1.04098141, "epoch": 0.06553237539830457, "flos": 23110743219840.0, "grad_norm": 2.588043030370696, "language_loss": 0.80191618, "learning_rate": 3.986812893708082e-06, "loss": 0.82629895, "num_input_tokens_seen": 11600740, "step": 545, "time_per_iteration": 2.861245632171631 }, { "auxiliary_loss_clip": 0.01383875, "auxiliary_loss_mlp": 0.0103367, "balance_loss_clip": 1.0738008, "balance_loss_mlp": 1.02233934, "epoch": 0.06565261828894367, "flos": 17923769786880.0, "grad_norm": 2.3582632391546245, "language_loss": 0.81240129, "learning_rate": 3.9867234372791826e-06, "loss": 0.8365767, "num_input_tokens_seen": 11618695, "step": 546, "time_per_iteration": 2.842209577560425 }, { "auxiliary_loss_clip": 0.01334307, "auxiliary_loss_mlp": 0.01041158, "balance_loss_clip": 1.07940674, "balance_loss_mlp": 1.03022027, "epoch": 0.06577286117958275, "flos": 22783058421120.0, "grad_norm": 3.6556345993853805, "language_loss": 0.87577903, "learning_rate": 3.986633679465918e-06, "loss": 0.89953369, "num_input_tokens_seen": 11638850, "step": 547, "time_per_iteration": 2.8336970806121826 }, { "auxiliary_loss_clip": 0.0147805, "auxiliary_loss_mlp": 0.01051888, "balance_loss_clip": 1.07172573, "balance_loss_mlp": 1.04076576, "epoch": 0.06589310407022185, "flos": 23696194993920.0, "grad_norm": 2.8779770303274246, "language_loss": 0.81105512, "learning_rate": 3.986543620281904e-06, "loss": 0.83635449, "num_input_tokens_seen": 11658500, "step": 548, "time_per_iteration": 2.8675637245178223 }, { "auxiliary_loss_clip": 0.01363289, "auxiliary_loss_mlp": 0.01041466, "balance_loss_clip": 1.07140517, "balance_loss_mlp": 1.03048134, "epoch": 0.06601334696086093, "flos": 26864772410880.0, "grad_norm": 1.7735646885983758, "language_loss": 0.91261196, "learning_rate": 3.986453259740802e-06, "loss": 0.93665946, "num_input_tokens_seen": 11676670, "step": 549, "time_per_iteration": 2.9319019317626953 }, { "auxiliary_loss_clip": 0.01381551, "auxiliary_loss_mlp": 0.01055833, "balance_loss_clip": 1.07925749, "balance_loss_mlp": 1.04370975, "epoch": 0.06613358985150003, "flos": 12567694101120.0, "grad_norm": 2.6453955041953425, "language_loss": 0.79335439, "learning_rate": 3.986362597856319e-06, "loss": 0.81772828, "num_input_tokens_seen": 11693170, "step": 550, "time_per_iteration": 2.8234612941741943 }, { "auxiliary_loss_clip": 0.01378367, "auxiliary_loss_mlp": 0.0259381, "balance_loss_clip": 1.07358885, "balance_loss_mlp": 1.00035644, "epoch": 0.06625383274213913, "flos": 18332505624960.0, "grad_norm": 2.88873710215487, "language_loss": 0.81916302, "learning_rate": 3.986271634642211e-06, "loss": 0.85888481, "num_input_tokens_seen": 11710150, "step": 551, "time_per_iteration": 2.8693578243255615 }, { "auxiliary_loss_clip": 0.01284504, "auxiliary_loss_mlp": 0.01057406, "balance_loss_clip": 1.0833869, "balance_loss_mlp": 1.04538369, "epoch": 0.06637407563277821, "flos": 15375585098880.0, "grad_norm": 2.749595750168124, "language_loss": 0.81660116, "learning_rate": 3.986180370112274e-06, "loss": 0.8400203, "num_input_tokens_seen": 11726670, "step": 552, "time_per_iteration": 2.7583374977111816 }, { "auxiliary_loss_clip": 0.01337732, "auxiliary_loss_mlp": 0.02593384, "balance_loss_clip": 1.08093166, "balance_loss_mlp": 1.0003171, "epoch": 0.0664943185234173, "flos": 24025244509440.0, "grad_norm": 1.8043063801694297, "language_loss": 0.74654502, "learning_rate": 3.986088804280354e-06, "loss": 0.78585619, "num_input_tokens_seen": 11746400, "step": 553, "time_per_iteration": 2.880911111831665 }, { "auxiliary_loss_clip": 0.01386154, "auxiliary_loss_mlp": 0.01047414, "balance_loss_clip": 1.0786674, "balance_loss_mlp": 1.03630972, "epoch": 0.06661456141405639, "flos": 20957503547520.0, "grad_norm": 2.4995736339455368, "language_loss": 0.94176149, "learning_rate": 3.985996937160342e-06, "loss": 0.96609712, "num_input_tokens_seen": 11765590, "step": 554, "time_per_iteration": 2.8973259925842285 }, { "auxiliary_loss_clip": 0.01330298, "auxiliary_loss_mlp": 0.01056266, "balance_loss_clip": 1.08067977, "balance_loss_mlp": 1.04418373, "epoch": 0.06673480430469549, "flos": 52223953322880.0, "grad_norm": 2.166176094058847, "language_loss": 0.689996, "learning_rate": 3.985904768766173e-06, "loss": 0.71386158, "num_input_tokens_seen": 11788365, "step": 555, "time_per_iteration": 3.039015054702759 }, { "auxiliary_loss_clip": 0.0142996, "auxiliary_loss_mlp": 0.01046628, "balance_loss_clip": 1.07243133, "balance_loss_mlp": 1.03402138, "epoch": 0.06685504719533458, "flos": 16217079995520.0, "grad_norm": 6.81220451517536, "language_loss": 0.75713813, "learning_rate": 3.98581229911183e-06, "loss": 0.78190404, "num_input_tokens_seen": 11807285, "step": 556, "time_per_iteration": 2.874995708465576 }, { "auxiliary_loss_clip": 0.01328016, "auxiliary_loss_mlp": 0.01044436, "balance_loss_clip": 1.07273531, "balance_loss_mlp": 1.03306913, "epoch": 0.06697529008597367, "flos": 22491535639680.0, "grad_norm": 3.113459100639243, "language_loss": 0.92233849, "learning_rate": 3.985719528211341e-06, "loss": 0.94606304, "num_input_tokens_seen": 11826655, "step": 557, "time_per_iteration": 2.6852171421051025 }, { "auxiliary_loss_clip": 0.01312094, "auxiliary_loss_mlp": 0.01015488, "balance_loss_clip": 1.06258845, "balance_loss_mlp": 1.00971842, "epoch": 0.06709553297661276, "flos": 62688216936960.0, "grad_norm": 0.8502188889705806, "language_loss": 0.63034678, "learning_rate": 3.985626456078777e-06, "loss": 0.65362263, "num_input_tokens_seen": 11891310, "step": 558, "time_per_iteration": 3.4033663272857666 }, { "auxiliary_loss_clip": 0.01429979, "auxiliary_loss_mlp": 0.01048635, "balance_loss_clip": 1.07456017, "balance_loss_mlp": 1.03753054, "epoch": 0.06721577586725185, "flos": 11216590997760.0, "grad_norm": 2.3250178521845726, "language_loss": 0.86175084, "learning_rate": 3.985533082728259e-06, "loss": 0.88653702, "num_input_tokens_seen": 11906965, "step": 559, "time_per_iteration": 2.73799204826355 }, { "auxiliary_loss_clip": 0.01282404, "auxiliary_loss_mlp": 0.01043173, "balance_loss_clip": 1.07829249, "balance_loss_mlp": 1.03116882, "epoch": 0.06733601875789094, "flos": 25922189664000.0, "grad_norm": 1.932863032138568, "language_loss": 0.75136709, "learning_rate": 3.985439408173951e-06, "loss": 0.77462286, "num_input_tokens_seen": 11927190, "step": 560, "time_per_iteration": 2.718461036682129 }, { "auxiliary_loss_clip": 0.01283836, "auxiliary_loss_mlp": 0.01053204, "balance_loss_clip": 1.07893491, "balance_loss_mlp": 1.04202199, "epoch": 0.06745626164853002, "flos": 20813645577600.0, "grad_norm": 1.9784239340395544, "language_loss": 0.71076858, "learning_rate": 3.9853454324300634e-06, "loss": 0.73413897, "num_input_tokens_seen": 11946400, "step": 561, "time_per_iteration": 2.691452741622925 }, { "auxiliary_loss_clip": 0.01531704, "auxiliary_loss_mlp": 0.01059851, "balance_loss_clip": 1.06669199, "balance_loss_mlp": 1.04826987, "epoch": 0.06757650453916912, "flos": 19829262378240.0, "grad_norm": 2.7025763554139486, "language_loss": 0.77929604, "learning_rate": 3.985251155510852e-06, "loss": 0.80521154, "num_input_tokens_seen": 11965430, "step": 562, "time_per_iteration": 2.7816076278686523 }, { "auxiliary_loss_clip": 0.01475176, "auxiliary_loss_mlp": 0.01050699, "balance_loss_clip": 1.07255602, "balance_loss_mlp": 1.03933799, "epoch": 0.06769674742980822, "flos": 25739224761600.0, "grad_norm": 1.7970612625131432, "language_loss": 0.80439258, "learning_rate": 3.98515657743062e-06, "loss": 0.82965136, "num_input_tokens_seen": 11984895, "step": 563, "time_per_iteration": 3.810678720474243 }, { "auxiliary_loss_clip": 0.01375468, "auxiliary_loss_mlp": 0.01046475, "balance_loss_clip": 1.07121444, "balance_loss_mlp": 1.03489947, "epoch": 0.0678169903204473, "flos": 13074788355840.0, "grad_norm": 2.175017138614066, "language_loss": 0.77940625, "learning_rate": 3.985061698203711e-06, "loss": 0.8036257, "num_input_tokens_seen": 12002010, "step": 564, "time_per_iteration": 2.651338815689087 }, { "auxiliary_loss_clip": 0.01198954, "auxiliary_loss_mlp": 0.01005594, "balance_loss_clip": 1.06411529, "balance_loss_mlp": 0.9999668, "epoch": 0.0679372332110864, "flos": 70865830788480.0, "grad_norm": 0.8831298396705786, "language_loss": 0.63821709, "learning_rate": 3.984966517844523e-06, "loss": 0.66026258, "num_input_tokens_seen": 12057255, "step": 565, "time_per_iteration": 3.186741590499878 }, { "auxiliary_loss_clip": 0.01282993, "auxiliary_loss_mlp": 0.01043832, "balance_loss_clip": 1.07873702, "balance_loss_mlp": 1.03239346, "epoch": 0.06805747610172548, "flos": 28256418990720.0, "grad_norm": 2.811485283995285, "language_loss": 0.808442, "learning_rate": 3.984871036367492e-06, "loss": 0.83171022, "num_input_tokens_seen": 12077280, "step": 566, "time_per_iteration": 3.574991226196289 }, { "auxiliary_loss_clip": 0.01331603, "auxiliary_loss_mlp": 0.02592403, "balance_loss_clip": 1.07974792, "balance_loss_mlp": 1.00019908, "epoch": 0.06817771899236458, "flos": 20120533764480.0, "grad_norm": 2.132544219830055, "language_loss": 0.83489972, "learning_rate": 3.984775253787102e-06, "loss": 0.87413979, "num_input_tokens_seen": 12095570, "step": 567, "time_per_iteration": 3.6243817806243896 }, { "auxiliary_loss_clip": 0.01334866, "auxiliary_loss_mlp": 0.01048819, "balance_loss_clip": 1.07502401, "balance_loss_mlp": 1.03769064, "epoch": 0.06829796188300366, "flos": 17930629284480.0, "grad_norm": 3.624395823004769, "language_loss": 0.88061869, "learning_rate": 3.984679170117885e-06, "loss": 0.90445548, "num_input_tokens_seen": 12111775, "step": 568, "time_per_iteration": 2.646669387817383 }, { "auxiliary_loss_clip": 0.01323984, "auxiliary_loss_mlp": 0.010413, "balance_loss_clip": 1.07420087, "balance_loss_mlp": 1.02984393, "epoch": 0.06841820477364276, "flos": 14501627285760.0, "grad_norm": 4.905166664749562, "language_loss": 0.78321946, "learning_rate": 3.984582785374415e-06, "loss": 0.80687225, "num_input_tokens_seen": 12129215, "step": 569, "time_per_iteration": 3.559953212738037 }, { "auxiliary_loss_clip": 0.01379948, "auxiliary_loss_mlp": 0.02591523, "balance_loss_clip": 1.07702088, "balance_loss_mlp": 1.00020242, "epoch": 0.06853844766428185, "flos": 21938474954880.0, "grad_norm": 2.2353455054588016, "language_loss": 0.80898416, "learning_rate": 3.9844860995713155e-06, "loss": 0.84869879, "num_input_tokens_seen": 12148755, "step": 570, "time_per_iteration": 2.75514554977417 }, { "auxiliary_loss_clip": 0.01332288, "auxiliary_loss_mlp": 0.01051538, "balance_loss_clip": 1.081002, "balance_loss_mlp": 1.03925359, "epoch": 0.06865869055492094, "flos": 16800628348800.0, "grad_norm": 2.380604135870617, "language_loss": 0.82608557, "learning_rate": 3.9843891127232524e-06, "loss": 0.84992379, "num_input_tokens_seen": 12166290, "step": 571, "time_per_iteration": 2.7105190753936768 }, { "auxiliary_loss_clip": 0.01469889, "auxiliary_loss_mlp": 0.01044517, "balance_loss_clip": 1.06804264, "balance_loss_mlp": 1.03313804, "epoch": 0.06877893344556003, "flos": 19937281553280.0, "grad_norm": 2.323454812733296, "language_loss": 0.67126977, "learning_rate": 3.984291824844938e-06, "loss": 0.69641387, "num_input_tokens_seen": 12181385, "step": 572, "time_per_iteration": 2.7614810466766357 }, { "auxiliary_loss_clip": 0.01278101, "auxiliary_loss_mlp": 0.01046328, "balance_loss_clip": 1.07646096, "balance_loss_mlp": 1.03456211, "epoch": 0.06889917633619912, "flos": 23039388852480.0, "grad_norm": 2.3684043832581123, "language_loss": 0.85084629, "learning_rate": 3.984194235951132e-06, "loss": 0.87409055, "num_input_tokens_seen": 12197530, "step": 573, "time_per_iteration": 2.6283726692199707 }, { "auxiliary_loss_clip": 0.01284537, "auxiliary_loss_mlp": 0.01044973, "balance_loss_clip": 1.08174014, "balance_loss_mlp": 1.03303409, "epoch": 0.06901941922683821, "flos": 20960556203520.0, "grad_norm": 2.5980287288834596, "language_loss": 0.84594715, "learning_rate": 3.9840963460566375e-06, "loss": 0.86924231, "num_input_tokens_seen": 12216310, "step": 574, "time_per_iteration": 2.619202136993408 }, { "auxiliary_loss_clip": 0.01515479, "auxiliary_loss_mlp": 0.0103723, "balance_loss_clip": 1.0642643, "balance_loss_mlp": 1.02582741, "epoch": 0.06913966211747731, "flos": 24821850384000.0, "grad_norm": 1.6440755303063885, "language_loss": 0.89472568, "learning_rate": 3.983998155176305e-06, "loss": 0.92025274, "num_input_tokens_seen": 12236670, "step": 575, "time_per_iteration": 2.8686869144439697 }, { "auxiliary_loss_clip": 0.01193103, "auxiliary_loss_mlp": 0.01011794, "balance_loss_clip": 1.06166148, "balance_loss_mlp": 1.00614381, "epoch": 0.06925990500811639, "flos": 58367446957440.0, "grad_norm": 0.8174076643515461, "language_loss": 0.57029331, "learning_rate": 3.9838996633250305e-06, "loss": 0.59234226, "num_input_tokens_seen": 12297185, "step": 576, "time_per_iteration": 3.1449880599975586 }, { "auxiliary_loss_clip": 0.01327795, "auxiliary_loss_mlp": 0.01045623, "balance_loss_clip": 1.07266545, "balance_loss_mlp": 1.0342505, "epoch": 0.06938014789875549, "flos": 12749940731520.0, "grad_norm": 2.0650688660850354, "language_loss": 0.88425511, "learning_rate": 3.983800870517753e-06, "loss": 0.90798926, "num_input_tokens_seen": 12313975, "step": 577, "time_per_iteration": 2.640011787414551 }, { "auxiliary_loss_clip": 0.01326504, "auxiliary_loss_mlp": 0.01049885, "balance_loss_clip": 1.08058357, "balance_loss_mlp": 1.03818464, "epoch": 0.06950039078939457, "flos": 22820226019200.0, "grad_norm": 2.995720277815689, "language_loss": 0.77964306, "learning_rate": 3.983701776769463e-06, "loss": 0.80340689, "num_input_tokens_seen": 12331385, "step": 578, "time_per_iteration": 2.6770009994506836 }, { "auxiliary_loss_clip": 0.0131564, "auxiliary_loss_mlp": 0.0105205, "balance_loss_clip": 1.07288527, "balance_loss_mlp": 1.04086816, "epoch": 0.06962063368003367, "flos": 21941348042880.0, "grad_norm": 2.091972156753445, "language_loss": 0.85534918, "learning_rate": 3.9836023820951885e-06, "loss": 0.87902611, "num_input_tokens_seen": 12350600, "step": 579, "time_per_iteration": 2.661500930786133 }, { "auxiliary_loss_clip": 0.01414079, "auxiliary_loss_mlp": 0.01046674, "balance_loss_clip": 1.06534457, "balance_loss_mlp": 1.03568256, "epoch": 0.06974087657067275, "flos": 20706021452160.0, "grad_norm": 1.9213090339996477, "language_loss": 0.68743938, "learning_rate": 3.983502686510011e-06, "loss": 0.71204686, "num_input_tokens_seen": 12371430, "step": 580, "time_per_iteration": 2.7717554569244385 }, { "auxiliary_loss_clip": 0.01327992, "auxiliary_loss_mlp": 0.02590111, "balance_loss_clip": 1.07304001, "balance_loss_mlp": 1.00012863, "epoch": 0.06986111946131185, "flos": 22638230784000.0, "grad_norm": 1.8138744500343353, "language_loss": 0.73308027, "learning_rate": 3.9834026900290525e-06, "loss": 0.77226132, "num_input_tokens_seen": 12390825, "step": 581, "time_per_iteration": 2.641983985900879 }, { "auxiliary_loss_clip": 0.01279935, "auxiliary_loss_mlp": 0.01036456, "balance_loss_clip": 1.07861042, "balance_loss_mlp": 1.02573895, "epoch": 0.06998136235195095, "flos": 26943453152640.0, "grad_norm": 3.0064046867762197, "language_loss": 1.00068271, "learning_rate": 3.983302392667482e-06, "loss": 1.02384663, "num_input_tokens_seen": 12411670, "step": 582, "time_per_iteration": 2.7552082538604736 }, { "auxiliary_loss_clip": 0.01325849, "auxiliary_loss_mlp": 0.010386, "balance_loss_clip": 1.07711685, "balance_loss_mlp": 1.02761459, "epoch": 0.07010160524259003, "flos": 22492505306880.0, "grad_norm": 1.7923193074041923, "language_loss": 0.9359417, "learning_rate": 3.983201794440517e-06, "loss": 0.9595862, "num_input_tokens_seen": 12431245, "step": 583, "time_per_iteration": 2.679828643798828 }, { "auxiliary_loss_clip": 0.0136172, "auxiliary_loss_mlp": 0.01044291, "balance_loss_clip": 1.06959522, "balance_loss_mlp": 1.03293014, "epoch": 0.07022184813322913, "flos": 18332541538560.0, "grad_norm": 1.9162713801064692, "language_loss": 0.67672759, "learning_rate": 3.9831008953634165e-06, "loss": 0.70078766, "num_input_tokens_seen": 12450535, "step": 584, "time_per_iteration": 2.740778684616089 }, { "auxiliary_loss_clip": 0.01458923, "auxiliary_loss_mlp": 0.01049088, "balance_loss_clip": 1.06546712, "balance_loss_mlp": 1.03757215, "epoch": 0.07034209102386821, "flos": 24675550289280.0, "grad_norm": 1.9446417163439316, "language_loss": 0.81267118, "learning_rate": 3.9829996954514864e-06, "loss": 0.83775127, "num_input_tokens_seen": 12469675, "step": 585, "time_per_iteration": 2.8337106704711914 }, { "auxiliary_loss_clip": 0.01320857, "auxiliary_loss_mlp": 0.01040231, "balance_loss_clip": 1.07396078, "balance_loss_mlp": 1.02917981, "epoch": 0.0704623339145073, "flos": 25995878415360.0, "grad_norm": 1.9407157925798153, "language_loss": 0.84394038, "learning_rate": 3.982898194720079e-06, "loss": 0.86755127, "num_input_tokens_seen": 12490405, "step": 586, "time_per_iteration": 2.7459158897399902 }, { "auxiliary_loss_clip": 0.01372263, "auxiliary_loss_mlp": 0.02589897, "balance_loss_clip": 1.07277942, "balance_loss_mlp": 1.00015092, "epoch": 0.0705825768051464, "flos": 25338318088320.0, "grad_norm": 2.4170370707422655, "language_loss": 0.82439572, "learning_rate": 3.982796393184592e-06, "loss": 0.86401737, "num_input_tokens_seen": 12509485, "step": 587, "time_per_iteration": 2.792184352874756 }, { "auxiliary_loss_clip": 0.01246312, "auxiliary_loss_mlp": 0.01004903, "balance_loss_clip": 1.06016707, "balance_loss_mlp": 0.99956208, "epoch": 0.07070281969578548, "flos": 66047552507520.0, "grad_norm": 0.7929743237528213, "language_loss": 0.62719899, "learning_rate": 3.98269429086047e-06, "loss": 0.64971119, "num_input_tokens_seen": 12567325, "step": 588, "time_per_iteration": 4.102558851242065 }, { "auxiliary_loss_clip": 0.01364728, "auxiliary_loss_mlp": 0.01041187, "balance_loss_clip": 1.07058382, "balance_loss_mlp": 1.0285871, "epoch": 0.07082306258642458, "flos": 23653568528640.0, "grad_norm": 3.6028106634356956, "language_loss": 0.86754799, "learning_rate": 3.982591887763199e-06, "loss": 0.89160717, "num_input_tokens_seen": 12584785, "step": 589, "time_per_iteration": 2.740933895111084 }, { "auxiliary_loss_clip": 0.01401204, "auxiliary_loss_mlp": 0.01035615, "balance_loss_clip": 1.05884552, "balance_loss_mlp": 1.02393281, "epoch": 0.07094330547706366, "flos": 13880049408000.0, "grad_norm": 2.358007728259653, "language_loss": 0.82031995, "learning_rate": 3.982489183908316e-06, "loss": 0.84468818, "num_input_tokens_seen": 12601205, "step": 590, "time_per_iteration": 2.7460474967956543 }, { "auxiliary_loss_clip": 0.01500306, "auxiliary_loss_mlp": 0.010464, "balance_loss_clip": 1.05585265, "balance_loss_mlp": 1.03578436, "epoch": 0.07106354836770276, "flos": 24645098534400.0, "grad_norm": 2.046104845492566, "language_loss": 0.84522831, "learning_rate": 3.982386179311399e-06, "loss": 0.87069535, "num_input_tokens_seen": 12621725, "step": 591, "time_per_iteration": 2.794407367706299 }, { "auxiliary_loss_clip": 0.01323779, "auxiliary_loss_mlp": 0.01039341, "balance_loss_clip": 1.07319307, "balance_loss_mlp": 1.02775931, "epoch": 0.07118379125834184, "flos": 16217223649920.0, "grad_norm": 2.404099222864785, "language_loss": 0.87385124, "learning_rate": 3.982282873988075e-06, "loss": 0.89748245, "num_input_tokens_seen": 12639600, "step": 592, "time_per_iteration": 3.615161180496216 }, { "auxiliary_loss_clip": 0.01369181, "auxiliary_loss_mlp": 0.01044211, "balance_loss_clip": 1.07268095, "balance_loss_mlp": 1.03310037, "epoch": 0.07130403414898094, "flos": 19719986227200.0, "grad_norm": 1.6258481454373053, "language_loss": 0.86850631, "learning_rate": 3.982179267954016e-06, "loss": 0.89264023, "num_input_tokens_seen": 12660030, "step": 593, "time_per_iteration": 3.680293560028076 }, { "auxiliary_loss_clip": 0.01275702, "auxiliary_loss_mlp": 0.01040114, "balance_loss_clip": 1.07606971, "balance_loss_mlp": 1.02884829, "epoch": 0.07142427703962004, "flos": 21871933009920.0, "grad_norm": 2.6214719704868488, "language_loss": 0.96085095, "learning_rate": 3.982075361224937e-06, "loss": 0.98400915, "num_input_tokens_seen": 12678395, "step": 594, "time_per_iteration": 3.594738245010376 }, { "auxiliary_loss_clip": 0.01318117, "auxiliary_loss_mlp": 0.02588341, "balance_loss_clip": 1.07338858, "balance_loss_mlp": 1.0000689, "epoch": 0.07154451993025912, "flos": 18296595002880.0, "grad_norm": 2.705273344435775, "language_loss": 0.88262516, "learning_rate": 3.981971153816602e-06, "loss": 0.92168975, "num_input_tokens_seen": 12696000, "step": 595, "time_per_iteration": 2.696108818054199 }, { "auxiliary_loss_clip": 0.01271215, "auxiliary_loss_mlp": 0.01042222, "balance_loss_clip": 1.07816267, "balance_loss_mlp": 1.03194594, "epoch": 0.07166476282089822, "flos": 22160690444160.0, "grad_norm": 1.5766541195687978, "language_loss": 0.96275836, "learning_rate": 3.981866645744819e-06, "loss": 0.98589271, "num_input_tokens_seen": 12716715, "step": 596, "time_per_iteration": 2.6414456367492676 }, { "auxiliary_loss_clip": 0.01275667, "auxiliary_loss_mlp": 0.02588094, "balance_loss_clip": 1.07619989, "balance_loss_mlp": 1.00006914, "epoch": 0.0717850057115373, "flos": 14136343925760.0, "grad_norm": 2.634035392804029, "language_loss": 0.81332111, "learning_rate": 3.9817618370254416e-06, "loss": 0.85195875, "num_input_tokens_seen": 12733370, "step": 597, "time_per_iteration": 2.63093638420105 }, { "auxiliary_loss_clip": 0.01274481, "auxiliary_loss_mlp": 0.01041099, "balance_loss_clip": 1.07569158, "balance_loss_mlp": 1.03001833, "epoch": 0.0719052486021764, "flos": 30917794412160.0, "grad_norm": 2.034289880150531, "language_loss": 0.87367582, "learning_rate": 3.9816567276743684e-06, "loss": 0.89683163, "num_input_tokens_seen": 12753235, "step": 598, "time_per_iteration": 2.705021619796753 }, { "auxiliary_loss_clip": 0.01361201, "auxiliary_loss_mlp": 0.01035671, "balance_loss_clip": 1.07030821, "balance_loss_mlp": 1.0254662, "epoch": 0.0720254914928155, "flos": 21287019939840.0, "grad_norm": 2.169846062661895, "language_loss": 0.77494448, "learning_rate": 3.9815513177075466e-06, "loss": 0.79891324, "num_input_tokens_seen": 12772020, "step": 599, "time_per_iteration": 2.747922658920288 }, { "auxiliary_loss_clip": 0.01316175, "auxiliary_loss_mlp": 0.01040987, "balance_loss_clip": 1.07418871, "balance_loss_mlp": 1.0296979, "epoch": 0.07214573438345458, "flos": 27819170732160.0, "grad_norm": 1.8050628545500094, "language_loss": 0.70299006, "learning_rate": 3.9814456071409646e-06, "loss": 0.72656167, "num_input_tokens_seen": 12792555, "step": 600, "time_per_iteration": 2.7633063793182373 }, { "auxiliary_loss_clip": 0.01470169, "auxiliary_loss_mlp": 0.01046465, "balance_loss_clip": 1.06658196, "balance_loss_mlp": 1.03414512, "epoch": 0.07226597727409367, "flos": 25483576688640.0, "grad_norm": 2.752607975583128, "language_loss": 0.85236084, "learning_rate": 3.981339595990659e-06, "loss": 0.87752718, "num_input_tokens_seen": 12811085, "step": 601, "time_per_iteration": 2.8157124519348145 }, { "auxiliary_loss_clip": 0.01317142, "auxiliary_loss_mlp": 0.010377, "balance_loss_clip": 1.07230854, "balance_loss_mlp": 1.0262146, "epoch": 0.07238622016473276, "flos": 23513840622720.0, "grad_norm": 2.271951294932348, "language_loss": 0.80880862, "learning_rate": 3.981233284272713e-06, "loss": 0.83235705, "num_input_tokens_seen": 12830830, "step": 602, "time_per_iteration": 2.720517158508301 }, { "auxiliary_loss_clip": 0.01418958, "auxiliary_loss_mlp": 0.01041325, "balance_loss_clip": 1.06824207, "balance_loss_mlp": 1.03125238, "epoch": 0.07250646305537185, "flos": 25453519983360.0, "grad_norm": 1.866897519682805, "language_loss": 0.90240413, "learning_rate": 3.981126672003253e-06, "loss": 0.92700696, "num_input_tokens_seen": 12853505, "step": 603, "time_per_iteration": 2.8238914012908936 }, { "auxiliary_loss_clip": 0.01374475, "auxiliary_loss_mlp": 0.0103508, "balance_loss_clip": 1.06761181, "balance_loss_mlp": 1.02534616, "epoch": 0.07262670594601094, "flos": 27155038216320.0, "grad_norm": 2.4516474696374697, "language_loss": 0.78286362, "learning_rate": 3.981019759198451e-06, "loss": 0.80695915, "num_input_tokens_seen": 12872455, "step": 604, "time_per_iteration": 2.780959367752075 }, { "auxiliary_loss_clip": 0.01370477, "auxiliary_loss_mlp": 0.01039585, "balance_loss_clip": 1.06986582, "balance_loss_mlp": 1.02950597, "epoch": 0.07274694883665003, "flos": 26651607148800.0, "grad_norm": 2.2060154818369635, "language_loss": 0.84251404, "learning_rate": 3.980912545874528e-06, "loss": 0.8666147, "num_input_tokens_seen": 12892620, "step": 605, "time_per_iteration": 2.716996431350708 }, { "auxiliary_loss_clip": 0.01317729, "auxiliary_loss_mlp": 0.02586734, "balance_loss_clip": 1.0708034, "balance_loss_mlp": 1.00000763, "epoch": 0.07286719172728913, "flos": 29862344154240.0, "grad_norm": 1.991931988063435, "language_loss": 0.8560164, "learning_rate": 3.980805032047746e-06, "loss": 0.89506108, "num_input_tokens_seen": 12914090, "step": 606, "time_per_iteration": 2.7763590812683105 }, { "auxiliary_loss_clip": 0.01370352, "auxiliary_loss_mlp": 0.0103848, "balance_loss_clip": 1.07419145, "balance_loss_mlp": 1.02638626, "epoch": 0.07298743461792821, "flos": 17382057799680.0, "grad_norm": 2.5649290994390754, "language_loss": 0.80951202, "learning_rate": 3.980697217734415e-06, "loss": 0.83360034, "num_input_tokens_seen": 12931830, "step": 607, "time_per_iteration": 2.6567769050598145 }, { "auxiliary_loss_clip": 0.01466115, "auxiliary_loss_mlp": 0.02583623, "balance_loss_clip": 1.06662631, "balance_loss_mlp": 1.0000236, "epoch": 0.07310767750856731, "flos": 19498201701120.0, "grad_norm": 2.1525396954012863, "language_loss": 0.91592002, "learning_rate": 3.980589102950891e-06, "loss": 0.95641744, "num_input_tokens_seen": 12949995, "step": 608, "time_per_iteration": 2.7612292766571045 }, { "auxiliary_loss_clip": 0.01372268, "auxiliary_loss_mlp": 0.01040613, "balance_loss_clip": 1.07645941, "balance_loss_mlp": 1.02950263, "epoch": 0.07322792039920639, "flos": 29168693637120.0, "grad_norm": 2.9314771281598087, "language_loss": 0.76049507, "learning_rate": 3.9804806877135755e-06, "loss": 0.78462386, "num_input_tokens_seen": 12968040, "step": 609, "time_per_iteration": 2.786144971847534 }, { "auxiliary_loss_clip": 0.01319401, "auxiliary_loss_mlp": 0.02588991, "balance_loss_clip": 1.06915426, "balance_loss_mlp": 1.00003171, "epoch": 0.07334816328984549, "flos": 23477822259840.0, "grad_norm": 2.589581944171058, "language_loss": 0.86401415, "learning_rate": 3.980371972038915e-06, "loss": 0.90309811, "num_input_tokens_seen": 12988530, "step": 610, "time_per_iteration": 2.6964573860168457 }, { "auxiliary_loss_clip": 0.01274618, "auxiliary_loss_mlp": 0.01049529, "balance_loss_clip": 1.0769341, "balance_loss_mlp": 1.03879404, "epoch": 0.07346840618048459, "flos": 22962467877120.0, "grad_norm": 2.1131833168189282, "language_loss": 0.84383118, "learning_rate": 3.980262955943399e-06, "loss": 0.86707258, "num_input_tokens_seen": 13008195, "step": 611, "time_per_iteration": 2.6877377033233643 }, { "auxiliary_loss_clip": 0.01366884, "auxiliary_loss_mlp": 0.01046273, "balance_loss_clip": 1.07567906, "balance_loss_mlp": 1.0353651, "epoch": 0.07358864907112367, "flos": 17673903803520.0, "grad_norm": 2.5141329825158545, "language_loss": 0.8668468, "learning_rate": 3.980153639443569e-06, "loss": 0.8909784, "num_input_tokens_seen": 13024180, "step": 612, "time_per_iteration": 2.6435089111328125 }, { "auxiliary_loss_clip": 0.0137721, "auxiliary_loss_mlp": 0.01045731, "balance_loss_clip": 1.0724529, "balance_loss_mlp": 1.03463817, "epoch": 0.07370889196176277, "flos": 24097029840000.0, "grad_norm": 2.713643393749694, "language_loss": 0.80228472, "learning_rate": 3.980044022556005e-06, "loss": 0.82651412, "num_input_tokens_seen": 13043865, "step": 613, "time_per_iteration": 2.7358334064483643 }, { "auxiliary_loss_clip": 0.01322716, "auxiliary_loss_mlp": 0.01045138, "balance_loss_clip": 1.0754385, "balance_loss_mlp": 1.0336287, "epoch": 0.07382913485240185, "flos": 25885919905920.0, "grad_norm": 2.167438549205323, "language_loss": 0.73055214, "learning_rate": 3.9799341052973375e-06, "loss": 0.75423074, "num_input_tokens_seen": 13063700, "step": 614, "time_per_iteration": 3.617845058441162 }, { "auxiliary_loss_clip": 0.01369878, "auxiliary_loss_mlp": 0.01043272, "balance_loss_clip": 1.07542133, "balance_loss_mlp": 1.03163719, "epoch": 0.07394937774304094, "flos": 16873850223360.0, "grad_norm": 2.4416784665132534, "language_loss": 0.75106311, "learning_rate": 3.979823887684241e-06, "loss": 0.77519464, "num_input_tokens_seen": 13082640, "step": 615, "time_per_iteration": 2.750213861465454 }, { "auxiliary_loss_clip": 0.01270006, "auxiliary_loss_mlp": 0.01037998, "balance_loss_clip": 1.07558298, "balance_loss_mlp": 1.02725697, "epoch": 0.07406962063368003, "flos": 20703471586560.0, "grad_norm": 2.418715251030413, "language_loss": 0.84618211, "learning_rate": 3.979713369733434e-06, "loss": 0.8692621, "num_input_tokens_seen": 13100505, "step": 616, "time_per_iteration": 2.6227598190307617 }, { "auxiliary_loss_clip": 0.01315707, "auxiliary_loss_mlp": 0.01054328, "balance_loss_clip": 1.07169461, "balance_loss_mlp": 1.04257393, "epoch": 0.07418986352431912, "flos": 21430985650560.0, "grad_norm": 1.9995778997369211, "language_loss": 0.85182232, "learning_rate": 3.979602551461683e-06, "loss": 0.87552267, "num_input_tokens_seen": 13121285, "step": 617, "time_per_iteration": 2.6946120262145996 }, { "auxiliary_loss_clip": 0.01365554, "auxiliary_loss_mlp": 0.01039776, "balance_loss_clip": 1.0724659, "balance_loss_mlp": 1.02854037, "epoch": 0.07431010641495822, "flos": 12021133777920.0, "grad_norm": 4.100800929031688, "language_loss": 0.9167217, "learning_rate": 3.979491432885799e-06, "loss": 0.94077504, "num_input_tokens_seen": 13137550, "step": 618, "time_per_iteration": 4.443578004837036 }, { "auxiliary_loss_clip": 0.01403637, "auxiliary_loss_mlp": 0.02582791, "balance_loss_clip": 1.06645751, "balance_loss_mlp": 0.99995482, "epoch": 0.0744303493055973, "flos": 20957575374720.0, "grad_norm": 1.9997598613473169, "language_loss": 0.83144248, "learning_rate": 3.97938001402264e-06, "loss": 0.87130678, "num_input_tokens_seen": 13156675, "step": 619, "time_per_iteration": 2.769505262374878 }, { "auxiliary_loss_clip": 0.01417089, "auxiliary_loss_mlp": 0.01049562, "balance_loss_clip": 1.07027745, "balance_loss_mlp": 1.03823113, "epoch": 0.0745505921962364, "flos": 16253134272000.0, "grad_norm": 11.70966078542185, "language_loss": 0.80008483, "learning_rate": 3.979268294889105e-06, "loss": 0.82475138, "num_input_tokens_seen": 13172225, "step": 620, "time_per_iteration": 2.726390838623047 }, { "auxiliary_loss_clip": 0.01271643, "auxiliary_loss_mlp": 0.01044451, "balance_loss_clip": 1.07534194, "balance_loss_mlp": 1.03347182, "epoch": 0.07467083508687548, "flos": 50944635550080.0, "grad_norm": 2.715976302951772, "language_loss": 0.73882186, "learning_rate": 3.979156275502143e-06, "loss": 0.7619828, "num_input_tokens_seen": 13195885, "step": 621, "time_per_iteration": 3.7880072593688965 }, { "auxiliary_loss_clip": 0.0141842, "auxiliary_loss_mlp": 0.01043934, "balance_loss_clip": 1.0701499, "balance_loss_mlp": 1.03241885, "epoch": 0.07479107797751458, "flos": 17529686697600.0, "grad_norm": 2.410617047985421, "language_loss": 0.91473055, "learning_rate": 3.979043955878749e-06, "loss": 0.93935406, "num_input_tokens_seen": 13213730, "step": 622, "time_per_iteration": 2.7290053367614746 }, { "auxiliary_loss_clip": 0.01368003, "auxiliary_loss_mlp": 0.01052275, "balance_loss_clip": 1.07316232, "balance_loss_mlp": 1.04105711, "epoch": 0.07491132086815366, "flos": 23473943591040.0, "grad_norm": 1.9931475024015706, "language_loss": 0.83341336, "learning_rate": 3.978931336035959e-06, "loss": 0.85761613, "num_input_tokens_seen": 13232540, "step": 623, "time_per_iteration": 2.816680431365967 }, { "auxiliary_loss_clip": 0.01320053, "auxiliary_loss_mlp": 0.01038005, "balance_loss_clip": 1.07497036, "balance_loss_mlp": 1.02707386, "epoch": 0.07503156375879276, "flos": 20157557708160.0, "grad_norm": 2.3023782102516406, "language_loss": 0.82541013, "learning_rate": 3.9788184159908595e-06, "loss": 0.84899068, "num_input_tokens_seen": 13249670, "step": 624, "time_per_iteration": 2.680757999420166 }, { "auxiliary_loss_clip": 0.01362111, "auxiliary_loss_mlp": 0.01041706, "balance_loss_clip": 1.0692277, "balance_loss_mlp": 1.03023851, "epoch": 0.07515180664943186, "flos": 15115519653120.0, "grad_norm": 3.796256522796148, "language_loss": 0.8212083, "learning_rate": 3.97870519576058e-06, "loss": 0.84524649, "num_input_tokens_seen": 13266095, "step": 625, "time_per_iteration": 2.6835975646972656 }, { "auxiliary_loss_clip": 0.01410353, "auxiliary_loss_mlp": 0.02584336, "balance_loss_clip": 1.0658592, "balance_loss_mlp": 0.99998689, "epoch": 0.07527204954007094, "flos": 21287702298240.0, "grad_norm": 2.421101239204768, "language_loss": 0.81034607, "learning_rate": 3.978591675362295e-06, "loss": 0.85029292, "num_input_tokens_seen": 13284810, "step": 626, "time_per_iteration": 2.8495218753814697 }, { "auxiliary_loss_clip": 0.01464973, "auxiliary_loss_mlp": 0.01037388, "balance_loss_clip": 1.07168055, "balance_loss_mlp": 1.02637863, "epoch": 0.07539229243071004, "flos": 21324187537920.0, "grad_norm": 2.104075856930006, "language_loss": 0.87813652, "learning_rate": 3.978477854813226e-06, "loss": 0.9031601, "num_input_tokens_seen": 13304150, "step": 627, "time_per_iteration": 2.7442119121551514 }, { "auxiliary_loss_clip": 0.01312933, "auxiliary_loss_mlp": 0.0104323, "balance_loss_clip": 1.06802881, "balance_loss_mlp": 1.03216171, "epoch": 0.07551253532134912, "flos": 13042540920960.0, "grad_norm": 1.921031606221562, "language_loss": 0.82181978, "learning_rate": 3.97836373413064e-06, "loss": 0.84538138, "num_input_tokens_seen": 13322205, "step": 628, "time_per_iteration": 2.661221742630005 }, { "auxiliary_loss_clip": 0.0127106, "auxiliary_loss_mlp": 0.01038532, "balance_loss_clip": 1.07547498, "balance_loss_mlp": 1.02770209, "epoch": 0.07563277821198822, "flos": 19208761908480.0, "grad_norm": 2.0445131805385466, "language_loss": 0.74909699, "learning_rate": 3.978249313331848e-06, "loss": 0.77219296, "num_input_tokens_seen": 13340435, "step": 629, "time_per_iteration": 2.633612632751465 }, { "auxiliary_loss_clip": 0.01325148, "auxiliary_loss_mlp": 0.02586405, "balance_loss_clip": 1.07289243, "balance_loss_mlp": 0.99997771, "epoch": 0.07575302110262731, "flos": 19537200892800.0, "grad_norm": 4.988731211072895, "language_loss": 0.62452149, "learning_rate": 3.978134592434208e-06, "loss": 0.66363704, "num_input_tokens_seen": 13358185, "step": 630, "time_per_iteration": 2.747529983520508 }, { "auxiliary_loss_clip": 0.01378924, "auxiliary_loss_mlp": 0.01006631, "balance_loss_clip": 1.04696894, "balance_loss_mlp": 1.00253069, "epoch": 0.0758732639932664, "flos": 67961808017280.0, "grad_norm": 1.0190153987490642, "language_loss": 0.59394765, "learning_rate": 3.978019571455123e-06, "loss": 0.61780316, "num_input_tokens_seen": 13410130, "step": 631, "time_per_iteration": 3.343780040740967 }, { "auxiliary_loss_clip": 0.01263092, "auxiliary_loss_mlp": 0.01042146, "balance_loss_clip": 1.072896, "balance_loss_mlp": 1.03248382, "epoch": 0.07599350688390549, "flos": 18989204025600.0, "grad_norm": 2.4984586642493722, "language_loss": 0.84336615, "learning_rate": 3.977904250412042e-06, "loss": 0.86641848, "num_input_tokens_seen": 13429085, "step": 632, "time_per_iteration": 2.6478588581085205 }, { "auxiliary_loss_clip": 0.01367995, "auxiliary_loss_mlp": 0.01047633, "balance_loss_clip": 1.06855333, "balance_loss_mlp": 1.03718472, "epoch": 0.07611374977454458, "flos": 21069006341760.0, "grad_norm": 2.2757272039393626, "language_loss": 0.85617548, "learning_rate": 3.97778862932246e-06, "loss": 0.88033175, "num_input_tokens_seen": 13446250, "step": 633, "time_per_iteration": 2.678265333175659 }, { "auxiliary_loss_clip": 0.01587019, "auxiliary_loss_mlp": 0.01037079, "balance_loss_clip": 1.0459404, "balance_loss_mlp": 1.02720833, "epoch": 0.07623399266518367, "flos": 18514536773760.0, "grad_norm": 2.2287751583473328, "language_loss": 0.94204599, "learning_rate": 3.9776727082039144e-06, "loss": 0.96828699, "num_input_tokens_seen": 13463220, "step": 634, "time_per_iteration": 3.0301592350006104 }, { "auxiliary_loss_clip": 0.01189881, "auxiliary_loss_mlp": 0.01008643, "balance_loss_clip": 1.06086266, "balance_loss_mlp": 1.00349355, "epoch": 0.07635423555582276, "flos": 44663036077440.0, "grad_norm": 0.8076855183480597, "language_loss": 0.55457997, "learning_rate": 3.977556487073991e-06, "loss": 0.57656527, "num_input_tokens_seen": 13517775, "step": 635, "time_per_iteration": 3.4673964977264404 }, { "auxiliary_loss_clip": 0.01354606, "auxiliary_loss_mlp": 0.0104114, "balance_loss_clip": 1.06308961, "balance_loss_mlp": 1.03010142, "epoch": 0.07647447844646185, "flos": 21761148487680.0, "grad_norm": 1.7477138630038351, "language_loss": 0.81525916, "learning_rate": 3.97743996595032e-06, "loss": 0.83921659, "num_input_tokens_seen": 13537815, "step": 636, "time_per_iteration": 2.745070695877075 }, { "auxiliary_loss_clip": 0.01266199, "auxiliary_loss_mlp": 0.01044778, "balance_loss_clip": 1.07383657, "balance_loss_mlp": 1.03403091, "epoch": 0.07659472133710095, "flos": 23806799948160.0, "grad_norm": 1.6920031253231589, "language_loss": 0.81616622, "learning_rate": 3.9773231448505804e-06, "loss": 0.83927596, "num_input_tokens_seen": 13559605, "step": 637, "time_per_iteration": 2.736001491546631 }, { "auxiliary_loss_clip": 0.01361731, "auxiliary_loss_mlp": 0.0258318, "balance_loss_clip": 1.07162213, "balance_loss_mlp": 0.99994522, "epoch": 0.07671496422774003, "flos": 21469984842240.0, "grad_norm": 2.1112790470177845, "language_loss": 0.78240907, "learning_rate": 3.977206023792491e-06, "loss": 0.82185823, "num_input_tokens_seen": 13579495, "step": 638, "time_per_iteration": 2.7096707820892334 }, { "auxiliary_loss_clip": 0.0131472, "auxiliary_loss_mlp": 0.01044972, "balance_loss_clip": 1.07485461, "balance_loss_mlp": 1.03427863, "epoch": 0.07683520711837913, "flos": 16980971558400.0, "grad_norm": 2.655831997104829, "language_loss": 0.81327236, "learning_rate": 3.97708860279382e-06, "loss": 0.83686924, "num_input_tokens_seen": 13597605, "step": 639, "time_per_iteration": 2.697643756866455 }, { "auxiliary_loss_clip": 0.01410196, "auxiliary_loss_mlp": 0.01042089, "balance_loss_clip": 1.06550765, "balance_loss_mlp": 1.03143156, "epoch": 0.07695545000901821, "flos": 23476744851840.0, "grad_norm": 1.87958228618036, "language_loss": 0.77932179, "learning_rate": 3.97697088187238e-06, "loss": 0.80384469, "num_input_tokens_seen": 13618120, "step": 640, "time_per_iteration": 3.794719696044922 }, { "auxiliary_loss_clip": 0.01362012, "auxiliary_loss_mlp": 0.01042051, "balance_loss_clip": 1.07248259, "balance_loss_mlp": 1.03158426, "epoch": 0.07707569289965731, "flos": 17634258167040.0, "grad_norm": 2.1444766363261785, "language_loss": 0.91988015, "learning_rate": 3.976852861046029e-06, "loss": 0.94392079, "num_input_tokens_seen": 13634735, "step": 641, "time_per_iteration": 2.6646389961242676 }, { "auxiliary_loss_clip": 0.01450535, "auxiliary_loss_mlp": 0.01040663, "balance_loss_clip": 1.06318104, "balance_loss_mlp": 1.03047633, "epoch": 0.0771959357902964, "flos": 25775674087680.0, "grad_norm": 2.225725985946451, "language_loss": 0.80374354, "learning_rate": 3.97673454033267e-06, "loss": 0.82865554, "num_input_tokens_seen": 13656835, "step": 642, "time_per_iteration": 2.8306381702423096 }, { "auxiliary_loss_clip": 0.0136648, "auxiliary_loss_mlp": 0.01037098, "balance_loss_clip": 1.06988263, "balance_loss_mlp": 1.02686429, "epoch": 0.07731617868093549, "flos": 19828651847040.0, "grad_norm": 2.1134771855324064, "language_loss": 0.82620847, "learning_rate": 3.976615919750254e-06, "loss": 0.85024428, "num_input_tokens_seen": 13674535, "step": 643, "time_per_iteration": 2.7072434425354004 }, { "auxiliary_loss_clip": 0.01311302, "auxiliary_loss_mlp": 0.01040538, "balance_loss_clip": 1.07239223, "balance_loss_mlp": 1.0305661, "epoch": 0.07743642157157458, "flos": 21324654414720.0, "grad_norm": 1.8677005912745428, "language_loss": 0.87147593, "learning_rate": 3.976496999316775e-06, "loss": 0.89499426, "num_input_tokens_seen": 13693290, "step": 644, "time_per_iteration": 4.388904094696045 }, { "auxiliary_loss_clip": 0.01361242, "auxiliary_loss_mlp": 0.01045683, "balance_loss_clip": 1.0734725, "balance_loss_mlp": 1.034621, "epoch": 0.07755666446221367, "flos": 19969133938560.0, "grad_norm": 2.5602259073398894, "language_loss": 0.84191418, "learning_rate": 3.976377779050271e-06, "loss": 0.86598343, "num_input_tokens_seen": 13711420, "step": 645, "time_per_iteration": 2.718125581741333 }, { "auxiliary_loss_clip": 0.01306414, "auxiliary_loss_mlp": 0.0103932, "balance_loss_clip": 1.06855416, "balance_loss_mlp": 1.028198, "epoch": 0.07767690735285276, "flos": 23623224514560.0, "grad_norm": 2.6345616907279337, "language_loss": 0.84358037, "learning_rate": 3.976258258968831e-06, "loss": 0.86703771, "num_input_tokens_seen": 13729965, "step": 646, "time_per_iteration": 3.6911776065826416 }, { "auxiliary_loss_clip": 0.01410764, "auxiliary_loss_mlp": 0.01045527, "balance_loss_clip": 1.07140398, "balance_loss_mlp": 1.03406525, "epoch": 0.07779715024349185, "flos": 22236246702720.0, "grad_norm": 2.396067901460766, "language_loss": 0.74805725, "learning_rate": 3.976138439090583e-06, "loss": 0.77262014, "num_input_tokens_seen": 13748045, "step": 647, "time_per_iteration": 2.830609083175659 }, { "auxiliary_loss_clip": 0.01410726, "auxiliary_loss_mlp": 0.01039081, "balance_loss_clip": 1.06949258, "balance_loss_mlp": 1.02756548, "epoch": 0.07791739313413094, "flos": 20955097336320.0, "grad_norm": 2.84078444117434, "language_loss": 0.8527171, "learning_rate": 3.976018319433706e-06, "loss": 0.87721515, "num_input_tokens_seen": 13765590, "step": 648, "time_per_iteration": 2.787923812866211 }, { "auxiliary_loss_clip": 0.01310687, "auxiliary_loss_mlp": 0.01039637, "balance_loss_clip": 1.07126689, "balance_loss_mlp": 1.02941442, "epoch": 0.07803763602477004, "flos": 19312327797120.0, "grad_norm": 2.2673977423636846, "language_loss": 0.91244781, "learning_rate": 3.9758979000164205e-06, "loss": 0.93595099, "num_input_tokens_seen": 13782410, "step": 649, "time_per_iteration": 2.6814589500427246 }, { "auxiliary_loss_clip": 0.01419086, "auxiliary_loss_mlp": 0.01030094, "balance_loss_clip": 1.06971252, "balance_loss_mlp": 1.01899576, "epoch": 0.07815787891540912, "flos": 22710806213760.0, "grad_norm": 1.8115009914860862, "language_loss": 0.71952432, "learning_rate": 3.975777180856995e-06, "loss": 0.74401617, "num_input_tokens_seen": 13801530, "step": 650, "time_per_iteration": 2.810716390609741 }, { "auxiliary_loss_clip": 0.01263775, "auxiliary_loss_mlp": 0.01042589, "balance_loss_clip": 1.07106519, "balance_loss_mlp": 1.03206921, "epoch": 0.07827812180604822, "flos": 22711129436160.0, "grad_norm": 4.330494463606602, "language_loss": 0.86133635, "learning_rate": 3.975656161973742e-06, "loss": 0.88440001, "num_input_tokens_seen": 13820615, "step": 651, "time_per_iteration": 2.651812791824341 }, { "auxiliary_loss_clip": 0.01265407, "auxiliary_loss_mlp": 0.01046799, "balance_loss_clip": 1.07265532, "balance_loss_mlp": 1.03549242, "epoch": 0.0783983646966873, "flos": 21725597001600.0, "grad_norm": 4.194050394785483, "language_loss": 0.88856524, "learning_rate": 3.9755348433850194e-06, "loss": 0.91168725, "num_input_tokens_seen": 13835955, "step": 652, "time_per_iteration": 2.7462494373321533 }, { "auxiliary_loss_clip": 0.01286369, "auxiliary_loss_mlp": 0.01004568, "balance_loss_clip": 1.05582106, "balance_loss_mlp": 1.00058615, "epoch": 0.0785186075873264, "flos": 60640877537280.0, "grad_norm": 0.9666744035646373, "language_loss": 0.63580668, "learning_rate": 3.975413225109232e-06, "loss": 0.65871596, "num_input_tokens_seen": 13896505, "step": 653, "time_per_iteration": 3.2896504402160645 }, { "auxiliary_loss_clip": 0.01313808, "auxiliary_loss_mlp": 0.01036479, "balance_loss_clip": 1.07289994, "balance_loss_mlp": 1.02604818, "epoch": 0.0786388504779655, "flos": 23877902920320.0, "grad_norm": 4.452199040366271, "language_loss": 0.93651056, "learning_rate": 3.975291307164829e-06, "loss": 0.96001345, "num_input_tokens_seen": 13915150, "step": 654, "time_per_iteration": 2.8129379749298096 }, { "auxiliary_loss_clip": 0.01401579, "auxiliary_loss_mlp": 0.01031796, "balance_loss_clip": 1.06469536, "balance_loss_mlp": 1.02234268, "epoch": 0.07875909336860458, "flos": 15158684822400.0, "grad_norm": 2.475213529453682, "language_loss": 0.85347223, "learning_rate": 3.975169089570306e-06, "loss": 0.87780601, "num_input_tokens_seen": 13933525, "step": 655, "time_per_iteration": 2.8161113262176514 }, { "auxiliary_loss_clip": 0.01301611, "auxiliary_loss_mlp": 0.0103798, "balance_loss_clip": 1.06883311, "balance_loss_mlp": 1.02731133, "epoch": 0.07887933625924368, "flos": 22236857233920.0, "grad_norm": 2.22432295673805, "language_loss": 0.91681844, "learning_rate": 3.975046572344202e-06, "loss": 0.94021434, "num_input_tokens_seen": 13949985, "step": 656, "time_per_iteration": 2.73071026802063 }, { "auxiliary_loss_clip": 0.01414539, "auxiliary_loss_mlp": 0.01035784, "balance_loss_clip": 1.06401336, "balance_loss_mlp": 1.02551389, "epoch": 0.07899957914988276, "flos": 20777734955520.0, "grad_norm": 1.80850665105051, "language_loss": 0.71157044, "learning_rate": 3.974923755505103e-06, "loss": 0.73607361, "num_input_tokens_seen": 13969215, "step": 657, "time_per_iteration": 2.7933425903320312 }, { "auxiliary_loss_clip": 0.01408148, "auxiliary_loss_mlp": 0.01036315, "balance_loss_clip": 1.06573009, "balance_loss_mlp": 1.02629519, "epoch": 0.07911982204052186, "flos": 23003047267200.0, "grad_norm": 2.418848461148237, "language_loss": 0.91312742, "learning_rate": 3.974800639071641e-06, "loss": 0.93757206, "num_input_tokens_seen": 13989935, "step": 658, "time_per_iteration": 2.7976441383361816 }, { "auxiliary_loss_clip": 0.01503583, "auxiliary_loss_mlp": 0.02582603, "balance_loss_clip": 1.06052828, "balance_loss_mlp": 0.99993908, "epoch": 0.07924006493116094, "flos": 23111389664640.0, "grad_norm": 3.827365474743317, "language_loss": 1.00730145, "learning_rate": 3.974677223062492e-06, "loss": 1.04816329, "num_input_tokens_seen": 14007150, "step": 659, "time_per_iteration": 2.994826316833496 }, { "auxiliary_loss_clip": 0.01359994, "auxiliary_loss_mlp": 0.01029487, "balance_loss_clip": 1.07393765, "balance_loss_mlp": 1.0203855, "epoch": 0.07936030782180004, "flos": 16472153450880.0, "grad_norm": 2.0519239915557663, "language_loss": 0.74577761, "learning_rate": 3.974553507496378e-06, "loss": 0.76967239, "num_input_tokens_seen": 14025725, "step": 660, "time_per_iteration": 3.0851714611053467 }, { "auxiliary_loss_clip": 0.0135571, "auxiliary_loss_mlp": 0.01042213, "balance_loss_clip": 1.07052684, "balance_loss_mlp": 1.03188324, "epoch": 0.07948055071243913, "flos": 23733290764800.0, "grad_norm": 2.7590544493417344, "language_loss": 0.89596009, "learning_rate": 3.974429492392068e-06, "loss": 0.91993934, "num_input_tokens_seen": 14045750, "step": 661, "time_per_iteration": 2.8539884090423584 }, { "auxiliary_loss_clip": 0.01263218, "auxiliary_loss_mlp": 0.02581248, "balance_loss_clip": 1.07579803, "balance_loss_mlp": 0.99993646, "epoch": 0.07960079360307822, "flos": 19573326996480.0, "grad_norm": 2.160894253854102, "language_loss": 0.91024911, "learning_rate": 3.974305177768373e-06, "loss": 0.94869375, "num_input_tokens_seen": 14063960, "step": 662, "time_per_iteration": 2.6899778842926025 }, { "auxiliary_loss_clip": 0.014046, "auxiliary_loss_mlp": 0.01038017, "balance_loss_clip": 1.06904674, "balance_loss_mlp": 1.02746129, "epoch": 0.07972103649371731, "flos": 23513409659520.0, "grad_norm": 2.4196737054207365, "language_loss": 0.87168801, "learning_rate": 3.974180563644152e-06, "loss": 0.89611423, "num_input_tokens_seen": 14082525, "step": 663, "time_per_iteration": 2.735672950744629 }, { "auxiliary_loss_clip": 0.01364285, "auxiliary_loss_mlp": 0.0104179, "balance_loss_clip": 1.06969213, "balance_loss_mlp": 1.03160977, "epoch": 0.0798412793843564, "flos": 16726867770240.0, "grad_norm": 2.2317896603682508, "language_loss": 0.88769913, "learning_rate": 3.97405565003831e-06, "loss": 0.91175985, "num_input_tokens_seen": 14098610, "step": 664, "time_per_iteration": 2.7124035358428955 }, { "auxiliary_loss_clip": 0.01411397, "auxiliary_loss_mlp": 0.01038003, "balance_loss_clip": 1.06897187, "balance_loss_mlp": 1.02776933, "epoch": 0.07996152227499549, "flos": 18223337214720.0, "grad_norm": 2.2202219116534176, "language_loss": 0.78043801, "learning_rate": 3.973930436969794e-06, "loss": 0.804932, "num_input_tokens_seen": 14117065, "step": 665, "time_per_iteration": 2.7314093112945557 }, { "auxiliary_loss_clip": 0.01351569, "auxiliary_loss_mlp": 0.01038754, "balance_loss_clip": 1.06511188, "balance_loss_mlp": 1.02831078, "epoch": 0.08008176516563459, "flos": 20594877793920.0, "grad_norm": 2.2298956317520697, "language_loss": 0.85867828, "learning_rate": 3.973804924457602e-06, "loss": 0.88258147, "num_input_tokens_seen": 14135145, "step": 666, "time_per_iteration": 3.6435461044311523 }, { "auxiliary_loss_clip": 0.01356209, "auxiliary_loss_mlp": 0.01046172, "balance_loss_clip": 1.06595874, "balance_loss_mlp": 1.03523493, "epoch": 0.08020200805627367, "flos": 31834306863360.0, "grad_norm": 1.8231781356165335, "language_loss": 0.85696983, "learning_rate": 3.973679112520771e-06, "loss": 0.88099372, "num_input_tokens_seen": 14156860, "step": 667, "time_per_iteration": 2.7839086055755615 }, { "auxiliary_loss_clip": 0.01402569, "auxiliary_loss_mlp": 0.01034475, "balance_loss_clip": 1.06276011, "balance_loss_mlp": 1.02384806, "epoch": 0.08032225094691277, "flos": 17783503176960.0, "grad_norm": 1.9005585747904765, "language_loss": 0.98955309, "learning_rate": 3.973553001178389e-06, "loss": 1.01392365, "num_input_tokens_seen": 14174365, "step": 668, "time_per_iteration": 2.7513139247894287 }, { "auxiliary_loss_clip": 0.01409803, "auxiliary_loss_mlp": 0.01041733, "balance_loss_clip": 1.07002664, "balance_loss_mlp": 1.03115964, "epoch": 0.08044249383755185, "flos": 24061693835520.0, "grad_norm": 2.6202573737832022, "language_loss": 0.75817412, "learning_rate": 3.973426590449585e-06, "loss": 0.78268945, "num_input_tokens_seen": 14192320, "step": 669, "time_per_iteration": 3.7121331691741943 }, { "auxiliary_loss_clip": 0.01456722, "auxiliary_loss_mlp": 0.01043782, "balance_loss_clip": 1.06553507, "balance_loss_mlp": 1.03324413, "epoch": 0.08056273672819095, "flos": 18223624523520.0, "grad_norm": 2.9911098963833727, "language_loss": 0.75127769, "learning_rate": 3.9732998803535364e-06, "loss": 0.77628273, "num_input_tokens_seen": 14210380, "step": 670, "time_per_iteration": 3.675887107849121 }, { "auxiliary_loss_clip": 0.01262034, "auxiliary_loss_mlp": 0.01049825, "balance_loss_clip": 1.07189822, "balance_loss_mlp": 1.0396862, "epoch": 0.08068297961883003, "flos": 19676856971520.0, "grad_norm": 2.7144893555440266, "language_loss": 0.85259402, "learning_rate": 3.973172870909465e-06, "loss": 0.87571263, "num_input_tokens_seen": 14225145, "step": 671, "time_per_iteration": 2.6463539600372314 }, { "auxiliary_loss_clip": 0.01364566, "auxiliary_loss_mlp": 0.01037014, "balance_loss_clip": 1.06731057, "balance_loss_mlp": 1.02623761, "epoch": 0.08080322250946913, "flos": 23148736830720.0, "grad_norm": 3.223350592851593, "language_loss": 0.80817735, "learning_rate": 3.973045562136638e-06, "loss": 0.83219314, "num_input_tokens_seen": 14241960, "step": 672, "time_per_iteration": 2.7328684329986572 }, { "auxiliary_loss_clip": 0.0131357, "auxiliary_loss_mlp": 0.01038761, "balance_loss_clip": 1.06932211, "balance_loss_mlp": 1.0285809, "epoch": 0.08092346540010822, "flos": 21763626526080.0, "grad_norm": 2.197025500857738, "language_loss": 0.91614586, "learning_rate": 3.972917954054368e-06, "loss": 0.93966913, "num_input_tokens_seen": 14260515, "step": 673, "time_per_iteration": 3.5203258991241455 }, { "auxiliary_loss_clip": 0.01358625, "auxiliary_loss_mlp": 0.01042499, "balance_loss_clip": 1.06962013, "balance_loss_mlp": 1.0314604, "epoch": 0.08104370829074731, "flos": 21032485188480.0, "grad_norm": 3.0291848761949223, "language_loss": 0.82208902, "learning_rate": 3.972790046682013e-06, "loss": 0.84610027, "num_input_tokens_seen": 14279190, "step": 674, "time_per_iteration": 2.8011114597320557 }, { "auxiliary_loss_clip": 0.01409444, "auxiliary_loss_mlp": 0.01037354, "balance_loss_clip": 1.06315792, "balance_loss_mlp": 1.02735853, "epoch": 0.0811639511813864, "flos": 20083186598400.0, "grad_norm": 1.9748189433286991, "language_loss": 0.79138386, "learning_rate": 3.972661840038977e-06, "loss": 0.81585187, "num_input_tokens_seen": 14299480, "step": 675, "time_per_iteration": 2.7794582843780518 }, { "auxiliary_loss_clip": 0.01314533, "auxiliary_loss_mlp": 0.01050385, "balance_loss_clip": 1.07207286, "balance_loss_mlp": 1.04024649, "epoch": 0.08128419407202549, "flos": 16836718538880.0, "grad_norm": 2.3678113288527345, "language_loss": 0.83488858, "learning_rate": 3.972533334144707e-06, "loss": 0.85853773, "num_input_tokens_seen": 14316405, "step": 676, "time_per_iteration": 2.7211384773254395 }, { "auxiliary_loss_clip": 0.01314279, "auxiliary_loss_mlp": 0.01040187, "balance_loss_clip": 1.06811666, "balance_loss_mlp": 1.0297507, "epoch": 0.08140443696266458, "flos": 23769273214080.0, "grad_norm": 2.4270072497877297, "language_loss": 0.78467476, "learning_rate": 3.972404529018699e-06, "loss": 0.80821943, "num_input_tokens_seen": 14336265, "step": 677, "time_per_iteration": 2.722174644470215 }, { "auxiliary_loss_clip": 0.01360077, "auxiliary_loss_mlp": 0.0103198, "balance_loss_clip": 1.06197524, "balance_loss_mlp": 1.02218711, "epoch": 0.08152467985330367, "flos": 24390132819840.0, "grad_norm": 2.839141865278411, "language_loss": 0.85579526, "learning_rate": 3.972275424680493e-06, "loss": 0.8797158, "num_input_tokens_seen": 14356375, "step": 678, "time_per_iteration": 2.739910840988159 }, { "auxiliary_loss_clip": 0.01259816, "auxiliary_loss_mlp": 0.01037663, "balance_loss_clip": 1.07028222, "balance_loss_mlp": 1.02645707, "epoch": 0.08164492274394276, "flos": 19317750750720.0, "grad_norm": 2.3543173210471373, "language_loss": 0.91800451, "learning_rate": 3.972146021149673e-06, "loss": 0.94097924, "num_input_tokens_seen": 14374650, "step": 679, "time_per_iteration": 2.6989400386810303 }, { "auxiliary_loss_clip": 0.01404949, "auxiliary_loss_mlp": 0.01043599, "balance_loss_clip": 1.06414425, "balance_loss_mlp": 1.03450322, "epoch": 0.08176516563458186, "flos": 14830461319680.0, "grad_norm": 2.8730445061163237, "language_loss": 0.79016995, "learning_rate": 3.972016318445868e-06, "loss": 0.81465542, "num_input_tokens_seen": 14392650, "step": 680, "time_per_iteration": 2.8389296531677246 }, { "auxiliary_loss_clip": 0.01313591, "auxiliary_loss_mlp": 0.01037415, "balance_loss_clip": 1.07077038, "balance_loss_mlp": 1.02673948, "epoch": 0.08188540852522094, "flos": 22602320161920.0, "grad_norm": 1.9830962458314851, "language_loss": 0.92355907, "learning_rate": 3.971886316588757e-06, "loss": 0.94706911, "num_input_tokens_seen": 14413155, "step": 681, "time_per_iteration": 2.8532533645629883 }, { "auxiliary_loss_clip": 0.01401912, "auxiliary_loss_mlp": 0.01036309, "balance_loss_clip": 1.06584013, "balance_loss_mlp": 1.0257647, "epoch": 0.08200565141586004, "flos": 19463727623040.0, "grad_norm": 2.3779268328919008, "language_loss": 0.73717785, "learning_rate": 3.9717560155980595e-06, "loss": 0.76155996, "num_input_tokens_seen": 14428805, "step": 682, "time_per_iteration": 2.83217716217041 }, { "auxiliary_loss_clip": 0.01308499, "auxiliary_loss_mlp": 0.01043426, "balance_loss_clip": 1.06761336, "balance_loss_mlp": 1.03359127, "epoch": 0.08212589430649912, "flos": 20594662312320.0, "grad_norm": 2.313076678205687, "language_loss": 0.9213745, "learning_rate": 3.971625415493542e-06, "loss": 0.94489378, "num_input_tokens_seen": 14447125, "step": 683, "time_per_iteration": 2.807955265045166 }, { "auxiliary_loss_clip": 0.01402447, "auxiliary_loss_mlp": 0.01034553, "balance_loss_clip": 1.06333137, "balance_loss_mlp": 1.02419972, "epoch": 0.08224613719713822, "flos": 25953611086080.0, "grad_norm": 2.2640871146991635, "language_loss": 0.87610269, "learning_rate": 3.971494516295017e-06, "loss": 0.9004727, "num_input_tokens_seen": 14466575, "step": 684, "time_per_iteration": 2.9822959899902344 }, { "auxiliary_loss_clip": 0.01405906, "auxiliary_loss_mlp": 0.01036605, "balance_loss_clip": 1.06119478, "balance_loss_mlp": 1.02743173, "epoch": 0.08236638008777732, "flos": 23768734510080.0, "grad_norm": 38.265939661304444, "language_loss": 0.85398424, "learning_rate": 3.971363318022341e-06, "loss": 0.87840933, "num_input_tokens_seen": 14487915, "step": 685, "time_per_iteration": 2.8582754135131836 }, { "auxiliary_loss_clip": 0.01362062, "auxiliary_loss_mlp": 0.01029425, "balance_loss_clip": 1.06243217, "balance_loss_mlp": 1.0195843, "epoch": 0.0824866229784164, "flos": 38799144887040.0, "grad_norm": 1.876856375304612, "language_loss": 0.68467045, "learning_rate": 3.971231820695417e-06, "loss": 0.70858538, "num_input_tokens_seen": 14511530, "step": 686, "time_per_iteration": 3.0165061950683594 }, { "auxiliary_loss_clip": 0.01359633, "auxiliary_loss_mlp": 0.01037596, "balance_loss_clip": 1.06579626, "balance_loss_mlp": 1.02769578, "epoch": 0.0826068658690555, "flos": 23107762391040.0, "grad_norm": 1.9152014785182616, "language_loss": 0.80927873, "learning_rate": 3.971100024334193e-06, "loss": 0.833251, "num_input_tokens_seen": 14529050, "step": 687, "time_per_iteration": 2.8164446353912354 }, { "auxiliary_loss_clip": 0.01393606, "auxiliary_loss_mlp": 0.01040164, "balance_loss_clip": 1.05856955, "balance_loss_mlp": 1.03076458, "epoch": 0.08272710875969458, "flos": 21136374299520.0, "grad_norm": 2.776459366740247, "language_loss": 0.86205864, "learning_rate": 3.970967928958663e-06, "loss": 0.88639635, "num_input_tokens_seen": 14546165, "step": 688, "time_per_iteration": 2.8166511058807373 }, { "auxiliary_loss_clip": 0.01458614, "auxiliary_loss_mlp": 0.01039283, "balance_loss_clip": 1.05963683, "balance_loss_mlp": 1.02906644, "epoch": 0.08284735165033368, "flos": 19063000517760.0, "grad_norm": 1.6951955706755704, "language_loss": 0.83538759, "learning_rate": 3.970835534588865e-06, "loss": 0.86036658, "num_input_tokens_seen": 14563660, "step": 689, "time_per_iteration": 2.8753840923309326 }, { "auxiliary_loss_clip": 0.01363196, "auxiliary_loss_mlp": 0.01036394, "balance_loss_clip": 1.07137954, "balance_loss_mlp": 1.02632046, "epoch": 0.08296759454097276, "flos": 16727442387840.0, "grad_norm": 2.2110592799773277, "language_loss": 0.85439324, "learning_rate": 3.970702841244883e-06, "loss": 0.87838912, "num_input_tokens_seen": 14581980, "step": 690, "time_per_iteration": 2.8695499897003174 }, { "auxiliary_loss_clip": 0.01312252, "auxiliary_loss_mlp": 0.01035803, "balance_loss_clip": 1.07136893, "balance_loss_mlp": 1.02616477, "epoch": 0.08308783743161186, "flos": 18004928567040.0, "grad_norm": 1.9279812050122374, "language_loss": 0.82704985, "learning_rate": 3.970569848946847e-06, "loss": 0.85053039, "num_input_tokens_seen": 14601795, "step": 691, "time_per_iteration": 2.754331350326538 }, { "auxiliary_loss_clip": 0.01299622, "auxiliary_loss_mlp": 0.01042106, "balance_loss_clip": 1.06768799, "balance_loss_mlp": 1.03227699, "epoch": 0.08320808032225095, "flos": 15079788599040.0, "grad_norm": 2.109284883161969, "language_loss": 0.83064294, "learning_rate": 3.970436557714932e-06, "loss": 0.85406017, "num_input_tokens_seen": 14618315, "step": 692, "time_per_iteration": 3.618786573410034 }, { "auxiliary_loss_clip": 0.01352168, "auxiliary_loss_mlp": 0.01036316, "balance_loss_clip": 1.06718338, "balance_loss_mlp": 1.02587318, "epoch": 0.08332832321289003, "flos": 22383085501440.0, "grad_norm": 3.1218122666125074, "language_loss": 0.86961806, "learning_rate": 3.970302967569358e-06, "loss": 0.89350289, "num_input_tokens_seen": 14636905, "step": 693, "time_per_iteration": 2.7313051223754883 }, { "auxiliary_loss_clip": 0.0131378, "auxiliary_loss_mlp": 0.01037024, "balance_loss_clip": 1.07343316, "balance_loss_mlp": 1.02636659, "epoch": 0.08344856610352913, "flos": 24717386655360.0, "grad_norm": 2.112607004014389, "language_loss": 0.68074435, "learning_rate": 3.9701690785303896e-06, "loss": 0.70425242, "num_input_tokens_seen": 14656100, "step": 694, "time_per_iteration": 2.7107737064361572 }, { "auxiliary_loss_clip": 0.01310597, "auxiliary_loss_mlp": 0.01039408, "balance_loss_clip": 1.06735635, "balance_loss_mlp": 1.02985334, "epoch": 0.08356880899416821, "flos": 25370206387200.0, "grad_norm": 2.1609038756217744, "language_loss": 0.88638783, "learning_rate": 3.970034890618339e-06, "loss": 0.90988779, "num_input_tokens_seen": 14675790, "step": 695, "time_per_iteration": 3.622317314147949 }, { "auxiliary_loss_clip": 0.01296117, "auxiliary_loss_mlp": 0.01033265, "balance_loss_clip": 1.06408632, "balance_loss_mlp": 1.02334058, "epoch": 0.08368905188480731, "flos": 24353072962560.0, "grad_norm": 2.0763870966544937, "language_loss": 0.87794471, "learning_rate": 3.969900403853562e-06, "loss": 0.90123856, "num_input_tokens_seen": 14694830, "step": 696, "time_per_iteration": 3.5528438091278076 }, { "auxiliary_loss_clip": 0.01260266, "auxiliary_loss_mlp": 0.0103513, "balance_loss_clip": 1.07046449, "balance_loss_mlp": 1.02561069, "epoch": 0.08380929477544641, "flos": 18037319656320.0, "grad_norm": 3.612553247367886, "language_loss": 0.77904582, "learning_rate": 3.96976561825646e-06, "loss": 0.80199981, "num_input_tokens_seen": 14711920, "step": 697, "time_per_iteration": 2.5657222270965576 }, { "auxiliary_loss_clip": 0.0146017, "auxiliary_loss_mlp": 0.01040135, "balance_loss_clip": 1.06425536, "balance_loss_mlp": 1.02991903, "epoch": 0.08392953766608549, "flos": 26286287875200.0, "grad_norm": 2.2245951677955262, "language_loss": 0.86985171, "learning_rate": 3.969630533847479e-06, "loss": 0.89485472, "num_input_tokens_seen": 14730880, "step": 698, "time_per_iteration": 2.834428071975708 }, { "auxiliary_loss_clip": 0.01309261, "auxiliary_loss_mlp": 0.01042104, "balance_loss_clip": 1.06710505, "balance_loss_mlp": 1.03262651, "epoch": 0.08404978055672459, "flos": 22492146170880.0, "grad_norm": 2.4310948367085854, "language_loss": 0.84461898, "learning_rate": 3.969495150647113e-06, "loss": 0.86813265, "num_input_tokens_seen": 14749050, "step": 699, "time_per_iteration": 3.5609936714172363 }, { "auxiliary_loss_clip": 0.01404076, "auxiliary_loss_mlp": 0.01041372, "balance_loss_clip": 1.06693387, "balance_loss_mlp": 1.03165627, "epoch": 0.08417002344736367, "flos": 24826878288000.0, "grad_norm": 1.967039175698942, "language_loss": 0.76337361, "learning_rate": 3.969359468675899e-06, "loss": 0.78782809, "num_input_tokens_seen": 14769180, "step": 700, "time_per_iteration": 2.7711572647094727 }, { "auxiliary_loss_clip": 0.013017, "auxiliary_loss_mlp": 0.0103712, "balance_loss_clip": 1.06699646, "balance_loss_mlp": 1.02748251, "epoch": 0.08429026633800277, "flos": 16945922862720.0, "grad_norm": 2.027036615795237, "language_loss": 0.89415979, "learning_rate": 3.969223487954418e-06, "loss": 0.917548, "num_input_tokens_seen": 14786640, "step": 701, "time_per_iteration": 2.6487808227539062 }, { "auxiliary_loss_clip": 0.01458497, "auxiliary_loss_mlp": 0.01040474, "balance_loss_clip": 1.06358862, "balance_loss_mlp": 1.03096104, "epoch": 0.08441050922864185, "flos": 23841920471040.0, "grad_norm": 2.164338354856163, "language_loss": 0.82637346, "learning_rate": 3.969087208503301e-06, "loss": 0.85136324, "num_input_tokens_seen": 14806720, "step": 702, "time_per_iteration": 2.8083207607269287 }, { "auxiliary_loss_clip": 0.01392182, "auxiliary_loss_mlp": 0.01035702, "balance_loss_clip": 1.06284499, "balance_loss_mlp": 1.02566481, "epoch": 0.08453075211928095, "flos": 25520205582720.0, "grad_norm": 2.588409663549193, "language_loss": 0.84950829, "learning_rate": 3.968950630343219e-06, "loss": 0.87378716, "num_input_tokens_seen": 14823705, "step": 703, "time_per_iteration": 2.782013177871704 }, { "auxiliary_loss_clip": 0.01353019, "auxiliary_loss_mlp": 0.01041297, "balance_loss_clip": 1.06403077, "balance_loss_mlp": 1.0302825, "epoch": 0.08465099500992004, "flos": 19532496211200.0, "grad_norm": 2.1671375730269724, "language_loss": 0.9364624, "learning_rate": 3.968813753494892e-06, "loss": 0.96040553, "num_input_tokens_seen": 14841865, "step": 704, "time_per_iteration": 2.712681770324707 }, { "auxiliary_loss_clip": 0.01400574, "auxiliary_loss_mlp": 0.02582686, "balance_loss_clip": 1.05918312, "balance_loss_mlp": 1.00011683, "epoch": 0.08477123790055913, "flos": 29351299403520.0, "grad_norm": 2.1762123696503926, "language_loss": 0.75796485, "learning_rate": 3.968676577979084e-06, "loss": 0.79779744, "num_input_tokens_seen": 14861415, "step": 705, "time_per_iteration": 2.8413641452789307 }, { "auxiliary_loss_clip": 0.01449133, "auxiliary_loss_mlp": 0.01041073, "balance_loss_clip": 1.06035495, "balance_loss_mlp": 1.03153634, "epoch": 0.08489148079119822, "flos": 18624495283200.0, "grad_norm": 2.5156930537807165, "language_loss": 0.78596365, "learning_rate": 3.968539103816605e-06, "loss": 0.81086576, "num_input_tokens_seen": 14879215, "step": 706, "time_per_iteration": 2.8760921955108643 }, { "auxiliary_loss_clip": 0.01354272, "auxiliary_loss_mlp": 0.02582546, "balance_loss_clip": 1.06618738, "balance_loss_mlp": 1.00014782, "epoch": 0.0850117236818373, "flos": 23471393725440.0, "grad_norm": 2.707649812894315, "language_loss": 0.89538515, "learning_rate": 3.9684013310283085e-06, "loss": 0.9347533, "num_input_tokens_seen": 14897900, "step": 707, "time_per_iteration": 2.7541263103485107 }, { "auxiliary_loss_clip": 0.01355035, "auxiliary_loss_mlp": 0.01039595, "balance_loss_clip": 1.0692575, "balance_loss_mlp": 1.03001094, "epoch": 0.0851319665724764, "flos": 40625058896640.0, "grad_norm": 2.0822320502986997, "language_loss": 0.63641429, "learning_rate": 3.9682632596350956e-06, "loss": 0.66036057, "num_input_tokens_seen": 14919065, "step": 708, "time_per_iteration": 2.9098339080810547 }, { "auxiliary_loss_clip": 0.01302547, "auxiliary_loss_mlp": 0.01033086, "balance_loss_clip": 1.06659746, "balance_loss_mlp": 1.02392459, "epoch": 0.0852522094631155, "flos": 15879554870400.0, "grad_norm": 2.0875243227293505, "language_loss": 0.78525651, "learning_rate": 3.968124889657911e-06, "loss": 0.80861282, "num_input_tokens_seen": 14934165, "step": 709, "time_per_iteration": 2.6050527095794678 }, { "auxiliary_loss_clip": 0.01455483, "auxiliary_loss_mlp": 0.01037263, "balance_loss_clip": 1.06179881, "balance_loss_mlp": 1.02720141, "epoch": 0.08537245235375458, "flos": 14567091822720.0, "grad_norm": 2.5423510809886496, "language_loss": 0.90828383, "learning_rate": 3.967986221117746e-06, "loss": 0.93321133, "num_input_tokens_seen": 14950105, "step": 710, "time_per_iteration": 2.786137580871582 }, { "auxiliary_loss_clip": 0.0155894, "auxiliary_loss_mlp": 0.01038306, "balance_loss_clip": 1.05858707, "balance_loss_mlp": 1.02811933, "epoch": 0.08549269524439368, "flos": 26468929555200.0, "grad_norm": 2.2721326025294863, "language_loss": 0.86787164, "learning_rate": 3.967847254035635e-06, "loss": 0.89384413, "num_input_tokens_seen": 14969490, "step": 711, "time_per_iteration": 2.8332087993621826 }, { "auxiliary_loss_clip": 0.01409181, "auxiliary_loss_mlp": 0.01040297, "balance_loss_clip": 1.06656885, "balance_loss_mlp": 1.03042662, "epoch": 0.08561293813503276, "flos": 13590214565760.0, "grad_norm": 2.9004123929465786, "language_loss": 0.86519164, "learning_rate": 3.967707988432661e-06, "loss": 0.88968641, "num_input_tokens_seen": 14987195, "step": 712, "time_per_iteration": 2.7373852729797363 }, { "auxiliary_loss_clip": 0.01255634, "auxiliary_loss_mlp": 0.01033709, "balance_loss_clip": 1.06697726, "balance_loss_mlp": 1.02327847, "epoch": 0.08573318102567186, "flos": 26943524979840.0, "grad_norm": 2.186666590465645, "language_loss": 0.87872839, "learning_rate": 3.967568424329949e-06, "loss": 0.90162182, "num_input_tokens_seen": 15007620, "step": 713, "time_per_iteration": 2.6712751388549805 }, { "auxiliary_loss_clip": 0.01289318, "auxiliary_loss_mlp": 0.01007289, "balance_loss_clip": 1.04996145, "balance_loss_mlp": 1.00345039, "epoch": 0.08585342391631094, "flos": 67302739319040.0, "grad_norm": 0.8361809849798827, "language_loss": 0.55509484, "learning_rate": 3.967428561748671e-06, "loss": 0.57806098, "num_input_tokens_seen": 15075590, "step": 714, "time_per_iteration": 3.4481446743011475 }, { "auxiliary_loss_clip": 0.01443565, "auxiliary_loss_mlp": 0.01041699, "balance_loss_clip": 1.05588245, "balance_loss_mlp": 1.03068399, "epoch": 0.08597366680695004, "flos": 22456594684800.0, "grad_norm": 3.59231187685513, "language_loss": 0.87734973, "learning_rate": 3.967288400710045e-06, "loss": 0.90220237, "num_input_tokens_seen": 15095055, "step": 715, "time_per_iteration": 2.780897617340088 }, { "auxiliary_loss_clip": 0.01399904, "auxiliary_loss_mlp": 0.01033376, "balance_loss_clip": 1.0677073, "balance_loss_mlp": 1.02401769, "epoch": 0.08609390969758914, "flos": 23550505430400.0, "grad_norm": 2.1290540253941113, "language_loss": 0.88680243, "learning_rate": 3.9671479412353335e-06, "loss": 0.9111352, "num_input_tokens_seen": 15113520, "step": 716, "time_per_iteration": 2.7679481506347656 }, { "auxiliary_loss_clip": 0.01303525, "auxiliary_loss_mlp": 0.01039582, "balance_loss_clip": 1.06732881, "balance_loss_mlp": 1.02913892, "epoch": 0.08621415258822822, "flos": 25885848078720.0, "grad_norm": 2.5282921618239897, "language_loss": 0.7411834, "learning_rate": 3.967007183345843e-06, "loss": 0.76461452, "num_input_tokens_seen": 15133375, "step": 717, "time_per_iteration": 2.7239208221435547 }, { "auxiliary_loss_clip": 0.01301153, "auxiliary_loss_mlp": 0.01042188, "balance_loss_clip": 1.06759024, "balance_loss_mlp": 1.03182912, "epoch": 0.08633439547886732, "flos": 13589568120960.0, "grad_norm": 3.8568717781930784, "language_loss": 0.89714462, "learning_rate": 3.966866127062927e-06, "loss": 0.920578, "num_input_tokens_seen": 15150500, "step": 718, "time_per_iteration": 3.5563740730285645 }, { "auxiliary_loss_clip": 0.0122397, "auxiliary_loss_mlp": 0.01004495, "balance_loss_clip": 1.05337191, "balance_loss_mlp": 1.00070417, "epoch": 0.0864546383695064, "flos": 57767342434560.0, "grad_norm": 0.8653826537885116, "language_loss": 0.62683767, "learning_rate": 3.966724772407982e-06, "loss": 0.6491223, "num_input_tokens_seen": 15208015, "step": 719, "time_per_iteration": 3.0836548805236816 }, { "auxiliary_loss_clip": 0.01394753, "auxiliary_loss_mlp": 0.01037582, "balance_loss_clip": 1.06184387, "balance_loss_mlp": 1.02821207, "epoch": 0.0865748812601455, "flos": 20046952753920.0, "grad_norm": 2.6773046656118495, "language_loss": 0.8850053, "learning_rate": 3.966583119402454e-06, "loss": 0.90932864, "num_input_tokens_seen": 15224780, "step": 720, "time_per_iteration": 2.748619556427002 }, { "auxiliary_loss_clip": 0.01300028, "auxiliary_loss_mlp": 0.02579728, "balance_loss_clip": 1.06684113, "balance_loss_mlp": 1.00008512, "epoch": 0.08669512415078459, "flos": 35262446935680.0, "grad_norm": 1.7196564964298402, "language_loss": 0.82139409, "learning_rate": 3.9664411680678305e-06, "loss": 0.86019164, "num_input_tokens_seen": 15246535, "step": 721, "time_per_iteration": 3.7022294998168945 }, { "auxiliary_loss_clip": 0.01323082, "auxiliary_loss_mlp": 0.01004616, "balance_loss_clip": 1.04743457, "balance_loss_mlp": 1.00077713, "epoch": 0.08681536704142367, "flos": 65654870048640.0, "grad_norm": 0.8435989803055133, "language_loss": 0.61431336, "learning_rate": 3.966298918425644e-06, "loss": 0.63759035, "num_input_tokens_seen": 15304025, "step": 722, "time_per_iteration": 4.195866823196411 }, { "auxiliary_loss_clip": 0.01310107, "auxiliary_loss_mlp": 0.01049261, "balance_loss_clip": 1.06789541, "balance_loss_mlp": 1.03906298, "epoch": 0.08693560993206277, "flos": 34529940881280.0, "grad_norm": 1.8403463291431765, "language_loss": 0.82822466, "learning_rate": 3.966156370497476e-06, "loss": 0.85181832, "num_input_tokens_seen": 15327635, "step": 723, "time_per_iteration": 2.859050989151001 }, { "auxiliary_loss_clip": 0.01310252, "auxiliary_loss_mlp": 0.01033425, "balance_loss_clip": 1.06747699, "balance_loss_mlp": 1.02398324, "epoch": 0.08705585282270185, "flos": 23149419189120.0, "grad_norm": 3.084536665437749, "language_loss": 0.88859707, "learning_rate": 3.96601352430495e-06, "loss": 0.91203386, "num_input_tokens_seen": 15347405, "step": 724, "time_per_iteration": 2.7160415649414062 }, { "auxiliary_loss_clip": 0.01357447, "auxiliary_loss_mlp": 0.01039157, "balance_loss_clip": 1.07080507, "balance_loss_mlp": 1.02987015, "epoch": 0.08717609571334095, "flos": 29497599498240.0, "grad_norm": 1.4745326717067637, "language_loss": 0.82751548, "learning_rate": 3.965870379869735e-06, "loss": 0.85148156, "num_input_tokens_seen": 15369450, "step": 725, "time_per_iteration": 3.714097499847412 }, { "auxiliary_loss_clip": 0.0130583, "auxiliary_loss_mlp": 0.0103607, "balance_loss_clip": 1.06499219, "balance_loss_mlp": 1.02646172, "epoch": 0.08729633860398003, "flos": 20667489137280.0, "grad_norm": 2.2893605625281834, "language_loss": 0.87099385, "learning_rate": 3.965726937213547e-06, "loss": 0.89441288, "num_input_tokens_seen": 15388085, "step": 726, "time_per_iteration": 2.810105562210083 }, { "auxiliary_loss_clip": 0.01302987, "auxiliary_loss_mlp": 0.0103078, "balance_loss_clip": 1.0640558, "balance_loss_mlp": 1.01974094, "epoch": 0.08741658149461913, "flos": 18369493655040.0, "grad_norm": 2.1016513308701024, "language_loss": 0.81654745, "learning_rate": 3.965583196358144e-06, "loss": 0.83988512, "num_input_tokens_seen": 15407120, "step": 727, "time_per_iteration": 2.642669677734375 }, { "auxiliary_loss_clip": 0.01260356, "auxiliary_loss_mlp": 0.01035475, "balance_loss_clip": 1.07025957, "balance_loss_mlp": 1.02559292, "epoch": 0.08753682438525823, "flos": 18729677283840.0, "grad_norm": 2.145429686902692, "language_loss": 0.74335688, "learning_rate": 3.965439157325335e-06, "loss": 0.76631522, "num_input_tokens_seen": 15424485, "step": 728, "time_per_iteration": 2.6900532245635986 }, { "auxiliary_loss_clip": 0.01354779, "auxiliary_loss_mlp": 0.01039442, "balance_loss_clip": 1.06310511, "balance_loss_mlp": 1.02935112, "epoch": 0.08765706727589731, "flos": 27776113303680.0, "grad_norm": 1.8941370294092865, "language_loss": 0.75919729, "learning_rate": 3.965294820136968e-06, "loss": 0.78313947, "num_input_tokens_seen": 15446285, "step": 729, "time_per_iteration": 2.7971136569976807 }, { "auxiliary_loss_clip": 0.01348227, "auxiliary_loss_mlp": 0.0103766, "balance_loss_clip": 1.06530714, "balance_loss_mlp": 1.02787328, "epoch": 0.08777731016653641, "flos": 24389127239040.0, "grad_norm": 3.797939621026205, "language_loss": 0.87065512, "learning_rate": 3.965150184814938e-06, "loss": 0.89451396, "num_input_tokens_seen": 15465770, "step": 730, "time_per_iteration": 2.760451316833496 }, { "auxiliary_loss_clip": 0.01347902, "auxiliary_loss_mlp": 0.01036653, "balance_loss_clip": 1.06369352, "balance_loss_mlp": 1.02640688, "epoch": 0.08789755305717549, "flos": 21981855605760.0, "grad_norm": 2.3761576618973614, "language_loss": 0.76776218, "learning_rate": 3.965005251381189e-06, "loss": 0.79160774, "num_input_tokens_seen": 15483705, "step": 731, "time_per_iteration": 2.7108702659606934 }, { "auxiliary_loss_clip": 0.01219179, "auxiliary_loss_mlp": 0.01008597, "balance_loss_clip": 1.0449636, "balance_loss_mlp": 1.00497341, "epoch": 0.08801779594781459, "flos": 58360120583040.0, "grad_norm": 0.9518711678504015, "language_loss": 0.64535224, "learning_rate": 3.964860019857705e-06, "loss": 0.66763008, "num_input_tokens_seen": 15548620, "step": 732, "time_per_iteration": 3.2629663944244385 }, { "auxiliary_loss_clip": 0.01255384, "auxiliary_loss_mlp": 0.01035387, "balance_loss_clip": 1.07126415, "balance_loss_mlp": 1.02574921, "epoch": 0.08813803883845367, "flos": 23294785530240.0, "grad_norm": 1.857181033378721, "language_loss": 0.84165937, "learning_rate": 3.964714490266518e-06, "loss": 0.86456704, "num_input_tokens_seen": 15569265, "step": 733, "time_per_iteration": 2.6731762886047363 }, { "auxiliary_loss_clip": 0.0121226, "auxiliary_loss_mlp": 0.01003133, "balance_loss_clip": 1.04130924, "balance_loss_mlp": 0.99948531, "epoch": 0.08825828172909277, "flos": 63424924882560.0, "grad_norm": 0.8811400491936832, "language_loss": 0.64573991, "learning_rate": 3.964568662629706e-06, "loss": 0.66789377, "num_input_tokens_seen": 15630570, "step": 734, "time_per_iteration": 3.1388072967529297 }, { "auxiliary_loss_clip": 0.01303171, "auxiliary_loss_mlp": 0.01036064, "balance_loss_clip": 1.06472993, "balance_loss_mlp": 1.02672434, "epoch": 0.08837852461973186, "flos": 26720986268160.0, "grad_norm": 2.2512747602528447, "language_loss": 0.84613991, "learning_rate": 3.9644225369693895e-06, "loss": 0.86953223, "num_input_tokens_seen": 15650870, "step": 735, "time_per_iteration": 2.7511446475982666 }, { "auxiliary_loss_clip": 0.01256909, "auxiliary_loss_mlp": 0.01033447, "balance_loss_clip": 1.07242942, "balance_loss_mlp": 1.02328479, "epoch": 0.08849876751037095, "flos": 27265427688960.0, "grad_norm": 4.414156027546238, "language_loss": 0.87072128, "learning_rate": 3.964276113307735e-06, "loss": 0.89362484, "num_input_tokens_seen": 15670835, "step": 736, "time_per_iteration": 2.6838343143463135 }, { "auxiliary_loss_clip": 0.01396674, "auxiliary_loss_mlp": 0.01037603, "balance_loss_clip": 1.06355166, "balance_loss_mlp": 1.02755404, "epoch": 0.08861901040101004, "flos": 19828759587840.0, "grad_norm": 1.952472192862226, "language_loss": 0.80591631, "learning_rate": 3.9641293916669574e-06, "loss": 0.83025908, "num_input_tokens_seen": 15689795, "step": 737, "time_per_iteration": 2.7900123596191406 }, { "auxiliary_loss_clip": 0.01401693, "auxiliary_loss_mlp": 0.01040277, "balance_loss_clip": 1.06600142, "balance_loss_mlp": 1.02941763, "epoch": 0.08873925329164913, "flos": 23658704173440.0, "grad_norm": 3.1346043280852465, "language_loss": 0.83007139, "learning_rate": 3.9639823720693115e-06, "loss": 0.85449106, "num_input_tokens_seen": 15711650, "step": 738, "time_per_iteration": 2.7639260292053223 }, { "auxiliary_loss_clip": 0.01336932, "auxiliary_loss_mlp": 0.01027234, "balance_loss_clip": 1.06296682, "balance_loss_mlp": 1.02401495, "epoch": 0.08885949618228822, "flos": 71831541893760.0, "grad_norm": 0.8493393548171678, "language_loss": 0.60027277, "learning_rate": 3.963835054537102e-06, "loss": 0.62391436, "num_input_tokens_seen": 15780615, "step": 739, "time_per_iteration": 3.313080310821533 }, { "auxiliary_loss_clip": 0.01353929, "auxiliary_loss_mlp": 0.0104956, "balance_loss_clip": 1.06317592, "balance_loss_mlp": 1.03880191, "epoch": 0.08897973907292732, "flos": 22346169298560.0, "grad_norm": 2.1966853973686424, "language_loss": 0.60749972, "learning_rate": 3.963687439092676e-06, "loss": 0.63153464, "num_input_tokens_seen": 15801300, "step": 740, "time_per_iteration": 2.5952529907226562 }, { "auxiliary_loss_clip": 0.01302784, "auxiliary_loss_mlp": 0.01038374, "balance_loss_clip": 1.06880653, "balance_loss_mlp": 1.0280025, "epoch": 0.0890999819635664, "flos": 21251827589760.0, "grad_norm": 1.9541694641005884, "language_loss": 0.80422556, "learning_rate": 3.963539525758427e-06, "loss": 0.8276372, "num_input_tokens_seen": 15820860, "step": 741, "time_per_iteration": 2.561311960220337 }, { "auxiliary_loss_clip": 0.01357794, "auxiliary_loss_mlp": 0.01048324, "balance_loss_clip": 1.06715453, "balance_loss_mlp": 1.03781009, "epoch": 0.0892202248542055, "flos": 25370888745600.0, "grad_norm": 2.254177087945771, "language_loss": 0.68021184, "learning_rate": 3.9633913145567925e-06, "loss": 0.70427299, "num_input_tokens_seen": 15841350, "step": 742, "time_per_iteration": 2.698265552520752 }, { "auxiliary_loss_clip": 0.01353127, "auxiliary_loss_mlp": 0.01037298, "balance_loss_clip": 1.06548202, "balance_loss_mlp": 1.02773774, "epoch": 0.08934046774484458, "flos": 24457895827200.0, "grad_norm": 2.05393218000211, "language_loss": 0.81722689, "learning_rate": 3.9632428055102575e-06, "loss": 0.84113109, "num_input_tokens_seen": 15861360, "step": 743, "time_per_iteration": 2.7311532497406006 }, { "auxiliary_loss_clip": 0.01311594, "auxiliary_loss_mlp": 0.01044098, "balance_loss_clip": 1.0718348, "balance_loss_mlp": 1.03367901, "epoch": 0.08946071063548368, "flos": 35772773414400.0, "grad_norm": 7.129120560807403, "language_loss": 0.6686846, "learning_rate": 3.9630939986413495e-06, "loss": 0.69224155, "num_input_tokens_seen": 15883160, "step": 744, "time_per_iteration": 3.6818323135375977 }, { "auxiliary_loss_clip": 0.01394289, "auxiliary_loss_mlp": 0.01043466, "balance_loss_clip": 1.0604589, "balance_loss_mlp": 1.0335176, "epoch": 0.08958095352612276, "flos": 14356584167040.0, "grad_norm": 1.7190951814224327, "language_loss": 0.78193617, "learning_rate": 3.962944893972643e-06, "loss": 0.80631369, "num_input_tokens_seen": 15901610, "step": 745, "time_per_iteration": 2.730952739715576 }, { "auxiliary_loss_clip": 0.01350475, "auxiliary_loss_mlp": 0.01042963, "balance_loss_clip": 1.06693494, "balance_loss_mlp": 1.03283024, "epoch": 0.08970119641676186, "flos": 17853277345920.0, "grad_norm": 2.594676716641194, "language_loss": 0.90837628, "learning_rate": 3.962795491526756e-06, "loss": 0.93231058, "num_input_tokens_seen": 15918770, "step": 746, "time_per_iteration": 2.683987855911255 }, { "auxiliary_loss_clip": 0.01260606, "auxiliary_loss_mlp": 0.01047111, "balance_loss_clip": 1.07087564, "balance_loss_mlp": 1.0361079, "epoch": 0.08982143930740095, "flos": 20811670329600.0, "grad_norm": 2.479035372331964, "language_loss": 0.88961422, "learning_rate": 3.962645791326354e-06, "loss": 0.91269147, "num_input_tokens_seen": 15938025, "step": 747, "time_per_iteration": 2.590918779373169 }, { "auxiliary_loss_clip": 0.01302054, "auxiliary_loss_mlp": 0.0105199, "balance_loss_clip": 1.06787634, "balance_loss_mlp": 1.04164267, "epoch": 0.08994168219804004, "flos": 24097712198400.0, "grad_norm": 3.071554934680499, "language_loss": 0.83462262, "learning_rate": 3.962495793394146e-06, "loss": 0.85816312, "num_input_tokens_seen": 15957215, "step": 748, "time_per_iteration": 4.373162031173706 }, { "auxiliary_loss_clip": 0.01155813, "auxiliary_loss_mlp": 0.01018394, "balance_loss_clip": 1.03262281, "balance_loss_mlp": 1.01434124, "epoch": 0.09006192508867913, "flos": 57188893812480.0, "grad_norm": 0.7424081652029518, "language_loss": 0.61192942, "learning_rate": 3.9623454977528864e-06, "loss": 0.63367146, "num_input_tokens_seen": 16015870, "step": 749, "time_per_iteration": 3.0555105209350586 }, { "auxiliary_loss_clip": 0.0141058, "auxiliary_loss_mlp": 0.01041296, "balance_loss_clip": 1.06495988, "balance_loss_mlp": 1.03069258, "epoch": 0.09018216797931822, "flos": 20487505063680.0, "grad_norm": 7.108364220458012, "language_loss": 0.85116076, "learning_rate": 3.962194904425375e-06, "loss": 0.87567955, "num_input_tokens_seen": 16036500, "step": 750, "time_per_iteration": 2.7749440670013428 }, { "auxiliary_loss_clip": 0.01300232, "auxiliary_loss_mlp": 0.01031096, "balance_loss_clip": 1.06624711, "balance_loss_mlp": 1.02108228, "epoch": 0.09030241086995731, "flos": 22638123043200.0, "grad_norm": 1.9502881523619102, "language_loss": 0.67904025, "learning_rate": 3.9620440134344566e-06, "loss": 0.70235354, "num_input_tokens_seen": 16054655, "step": 751, "time_per_iteration": 3.659010171890259 }, { "auxiliary_loss_clip": 0.01403664, "auxiliary_loss_mlp": 0.01047113, "balance_loss_clip": 1.06683469, "balance_loss_mlp": 1.03625298, "epoch": 0.09042265376059641, "flos": 21871502046720.0, "grad_norm": 4.876023168403352, "language_loss": 0.82464576, "learning_rate": 3.9618928248030215e-06, "loss": 0.84915352, "num_input_tokens_seen": 16074165, "step": 752, "time_per_iteration": 2.7422611713409424 }, { "auxiliary_loss_clip": 0.01303294, "auxiliary_loss_mlp": 0.01038701, "balance_loss_clip": 1.06926775, "balance_loss_mlp": 1.0286994, "epoch": 0.0905428966512355, "flos": 24316192673280.0, "grad_norm": 4.975457258030548, "language_loss": 0.82889283, "learning_rate": 3.961741338554005e-06, "loss": 0.85231268, "num_input_tokens_seen": 16092505, "step": 753, "time_per_iteration": 2.6946282386779785 }, { "auxiliary_loss_clip": 0.01360675, "auxiliary_loss_mlp": 0.01035536, "balance_loss_clip": 1.06714964, "balance_loss_mlp": 1.02530801, "epoch": 0.09066313954187459, "flos": 35845061535360.0, "grad_norm": 2.242171888305962, "language_loss": 0.75739336, "learning_rate": 3.9615895547103865e-06, "loss": 0.7813555, "num_input_tokens_seen": 16116150, "step": 754, "time_per_iteration": 2.8749704360961914 }, { "auxiliary_loss_clip": 0.01354448, "auxiliary_loss_mlp": 0.01036715, "balance_loss_clip": 1.06517148, "balance_loss_mlp": 1.02643931, "epoch": 0.09078338243251367, "flos": 29168729550720.0, "grad_norm": 2.526811948931708, "language_loss": 0.77857608, "learning_rate": 3.961437473295193e-06, "loss": 0.80248767, "num_input_tokens_seen": 16136295, "step": 755, "time_per_iteration": 2.7294087409973145 }, { "auxiliary_loss_clip": 0.01445579, "auxiliary_loss_mlp": 0.01036596, "balance_loss_clip": 1.056566, "balance_loss_mlp": 1.02639771, "epoch": 0.09090362532315277, "flos": 21907699977600.0, "grad_norm": 2.39161622791147, "language_loss": 0.72365761, "learning_rate": 3.961285094331495e-06, "loss": 0.74847937, "num_input_tokens_seen": 16154210, "step": 756, "time_per_iteration": 2.837576150894165 }, { "auxiliary_loss_clip": 0.01252016, "auxiliary_loss_mlp": 0.01037057, "balance_loss_clip": 1.06826508, "balance_loss_mlp": 1.02702546, "epoch": 0.09102386821379185, "flos": 27344503480320.0, "grad_norm": 2.149406199634861, "language_loss": 0.85738033, "learning_rate": 3.961132417842406e-06, "loss": 0.88027108, "num_input_tokens_seen": 16173995, "step": 757, "time_per_iteration": 2.699751615524292 }, { "auxiliary_loss_clip": 0.0129628, "auxiliary_loss_mlp": 0.01039954, "balance_loss_clip": 1.06446385, "balance_loss_mlp": 1.0297792, "epoch": 0.09114411110443095, "flos": 20813501923200.0, "grad_norm": 2.5560026975325316, "language_loss": 0.75476611, "learning_rate": 3.960979443851089e-06, "loss": 0.77812839, "num_input_tokens_seen": 16191020, "step": 758, "time_per_iteration": 2.688504219055176 }, { "auxiliary_loss_clip": 0.01354498, "auxiliary_loss_mlp": 0.01038725, "balance_loss_clip": 1.06376648, "balance_loss_mlp": 1.02823424, "epoch": 0.09126435399507005, "flos": 26145949438080.0, "grad_norm": 1.6302794559122626, "language_loss": 0.78947717, "learning_rate": 3.96082617238075e-06, "loss": 0.81340939, "num_input_tokens_seen": 16213645, "step": 759, "time_per_iteration": 2.7945070266723633 }, { "auxiliary_loss_clip": 0.01356489, "auxiliary_loss_mlp": 0.01038865, "balance_loss_clip": 1.06719697, "balance_loss_mlp": 1.02886891, "epoch": 0.09138459688570913, "flos": 24388911757440.0, "grad_norm": 2.5863569032400675, "language_loss": 0.796229, "learning_rate": 3.960672603454639e-06, "loss": 0.82018256, "num_input_tokens_seen": 16233625, "step": 760, "time_per_iteration": 2.842303514480591 }, { "auxiliary_loss_clip": 0.0130027, "auxiliary_loss_mlp": 0.01040685, "balance_loss_clip": 1.06557357, "balance_loss_mlp": 1.03084445, "epoch": 0.09150483977634823, "flos": 21032664756480.0, "grad_norm": 4.103497911934647, "language_loss": 0.77315772, "learning_rate": 3.960518737096054e-06, "loss": 0.79656726, "num_input_tokens_seen": 16253255, "step": 761, "time_per_iteration": 2.6801748275756836 }, { "auxiliary_loss_clip": 0.0130732, "auxiliary_loss_mlp": 0.0103558, "balance_loss_clip": 1.06855977, "balance_loss_mlp": 1.02530396, "epoch": 0.09162508266698731, "flos": 22856998567680.0, "grad_norm": 2.764526456365886, "language_loss": 0.72817451, "learning_rate": 3.960364573328334e-06, "loss": 0.75160348, "num_input_tokens_seen": 16272580, "step": 762, "time_per_iteration": 2.6751911640167236 }, { "auxiliary_loss_clip": 0.01407289, "auxiliary_loss_mlp": 0.01041637, "balance_loss_clip": 1.06383526, "balance_loss_mlp": 1.03180814, "epoch": 0.0917453255576264, "flos": 21724411852800.0, "grad_norm": 3.918601558582504, "language_loss": 0.88640869, "learning_rate": 3.9602101121748675e-06, "loss": 0.91089785, "num_input_tokens_seen": 16293075, "step": 763, "time_per_iteration": 2.7737317085266113 }, { "auxiliary_loss_clip": 0.01353482, "auxiliary_loss_mlp": 0.01033903, "balance_loss_clip": 1.0684706, "balance_loss_mlp": 1.02429521, "epoch": 0.0918655684482655, "flos": 14609215497600.0, "grad_norm": 3.512128758614238, "language_loss": 0.72186911, "learning_rate": 3.960055353659085e-06, "loss": 0.74574298, "num_input_tokens_seen": 16310185, "step": 764, "time_per_iteration": 2.6308038234710693 }, { "auxiliary_loss_clip": 0.01405598, "auxiliary_loss_mlp": 0.01039037, "balance_loss_clip": 1.06620336, "balance_loss_mlp": 1.02917266, "epoch": 0.09198581133890459, "flos": 23435016226560.0, "grad_norm": 1.9330620763254391, "language_loss": 0.83718765, "learning_rate": 3.959900297804465e-06, "loss": 0.86163396, "num_input_tokens_seen": 16330355, "step": 765, "time_per_iteration": 2.7982285022735596 }, { "auxiliary_loss_clip": 0.01347831, "auxiliary_loss_mlp": 0.01037276, "balance_loss_clip": 1.06505632, "balance_loss_mlp": 1.02766144, "epoch": 0.09210605422954368, "flos": 16795887753600.0, "grad_norm": 2.015871726975309, "language_loss": 0.77593559, "learning_rate": 3.9597449446345276e-06, "loss": 0.79978669, "num_input_tokens_seen": 16347600, "step": 766, "time_per_iteration": 2.694493532180786 }, { "auxiliary_loss_clip": 0.01341549, "auxiliary_loss_mlp": 0.01035572, "balance_loss_clip": 1.06068921, "balance_loss_mlp": 1.02641046, "epoch": 0.09222629712018277, "flos": 22674249146880.0, "grad_norm": 2.779871629715447, "language_loss": 0.83478492, "learning_rate": 3.95958929417284e-06, "loss": 0.85855615, "num_input_tokens_seen": 16365755, "step": 767, "time_per_iteration": 2.687399387359619 }, { "auxiliary_loss_clip": 0.01211678, "auxiliary_loss_mlp": 0.01014034, "balance_loss_clip": 1.03795266, "balance_loss_mlp": 1.01033807, "epoch": 0.09234654001082186, "flos": 69976756327680.0, "grad_norm": 0.7659338506635649, "language_loss": 0.5875811, "learning_rate": 3.9594333464430145e-06, "loss": 0.60983825, "num_input_tokens_seen": 16435245, "step": 768, "time_per_iteration": 3.376556873321533 }, { "auxiliary_loss_clip": 0.01537862, "auxiliary_loss_mlp": 0.01037366, "balance_loss_clip": 1.05359113, "balance_loss_mlp": 1.02773428, "epoch": 0.09246678290146094, "flos": 20011437181440.0, "grad_norm": 1.8732742692720303, "language_loss": 0.88046199, "learning_rate": 3.959277101468709e-06, "loss": 0.9062143, "num_input_tokens_seen": 16454795, "step": 769, "time_per_iteration": 2.849626064300537 }, { "auxiliary_loss_clip": 0.01342295, "auxiliary_loss_mlp": 0.01032882, "balance_loss_clip": 1.06192172, "balance_loss_mlp": 1.02283907, "epoch": 0.09258702579210004, "flos": 17747448900480.0, "grad_norm": 3.159480831027179, "language_loss": 0.78738081, "learning_rate": 3.959120559273624e-06, "loss": 0.81113255, "num_input_tokens_seen": 16472580, "step": 770, "time_per_iteration": 3.7400615215301514 }, { "auxiliary_loss_clip": 0.01340639, "auxiliary_loss_mlp": 0.01038448, "balance_loss_clip": 1.06508923, "balance_loss_mlp": 1.02885747, "epoch": 0.09270726868273914, "flos": 20886544229760.0, "grad_norm": 1.9269695621306626, "language_loss": 0.83555305, "learning_rate": 3.958963719881509e-06, "loss": 0.85934401, "num_input_tokens_seen": 16490670, "step": 771, "time_per_iteration": 2.670522451400757 }, { "auxiliary_loss_clip": 0.01303509, "auxiliary_loss_mlp": 0.0103509, "balance_loss_clip": 1.06878364, "balance_loss_mlp": 1.02583325, "epoch": 0.09282751157337822, "flos": 17015697031680.0, "grad_norm": 2.005401622591965, "language_loss": 0.93735003, "learning_rate": 3.958806583316154e-06, "loss": 0.96073592, "num_input_tokens_seen": 16508640, "step": 772, "time_per_iteration": 2.6314430236816406 }, { "auxiliary_loss_clip": 0.01256161, "auxiliary_loss_mlp": 0.01045672, "balance_loss_clip": 1.07044625, "balance_loss_mlp": 1.03589094, "epoch": 0.09294775446401732, "flos": 32523647748480.0, "grad_norm": 3.8376339795513155, "language_loss": 0.78951025, "learning_rate": 3.9586491496013985e-06, "loss": 0.81252855, "num_input_tokens_seen": 16531035, "step": 773, "time_per_iteration": 3.629610776901245 }, { "auxiliary_loss_clip": 0.01306247, "auxiliary_loss_mlp": 0.01041585, "balance_loss_clip": 1.06667233, "balance_loss_mlp": 1.03184509, "epoch": 0.0930679973546564, "flos": 18259750627200.0, "grad_norm": 2.0153414805847074, "language_loss": 0.82961088, "learning_rate": 3.958491418761124e-06, "loss": 0.85308921, "num_input_tokens_seen": 16548605, "step": 774, "time_per_iteration": 3.608821392059326 }, { "auxiliary_loss_clip": 0.01351853, "auxiliary_loss_mlp": 0.01039041, "balance_loss_clip": 1.06184685, "balance_loss_mlp": 1.02861643, "epoch": 0.0931882402452955, "flos": 21099745405440.0, "grad_norm": 2.1691343485274164, "language_loss": 0.72433114, "learning_rate": 3.958333390819258e-06, "loss": 0.74824011, "num_input_tokens_seen": 16565535, "step": 775, "time_per_iteration": 2.753573417663574 }, { "auxiliary_loss_clip": 0.0125264, "auxiliary_loss_mlp": 0.01036662, "balance_loss_clip": 1.06867456, "balance_loss_mlp": 1.0263865, "epoch": 0.0933084831359346, "flos": 24207275658240.0, "grad_norm": 2.062264720239107, "language_loss": 0.80244809, "learning_rate": 3.9581750657997754e-06, "loss": 0.82534111, "num_input_tokens_seen": 16584900, "step": 776, "time_per_iteration": 2.655562162399292 }, { "auxiliary_loss_clip": 0.01344805, "auxiliary_loss_mlp": 0.01029036, "balance_loss_clip": 1.06048727, "balance_loss_mlp": 1.01971364, "epoch": 0.09342872602657368, "flos": 25480272637440.0, "grad_norm": 2.394105679354208, "language_loss": 0.89398283, "learning_rate": 3.95801644372669e-06, "loss": 0.91772127, "num_input_tokens_seen": 16604805, "step": 777, "time_per_iteration": 3.6469974517822266 }, { "auxiliary_loss_clip": 0.01357896, "auxiliary_loss_mlp": 0.01046417, "balance_loss_clip": 1.06303811, "balance_loss_mlp": 1.03660059, "epoch": 0.09354896891721277, "flos": 23149060053120.0, "grad_norm": 3.3083158959730037, "language_loss": 0.84337449, "learning_rate": 3.957857524624068e-06, "loss": 0.86741757, "num_input_tokens_seen": 16623685, "step": 778, "time_per_iteration": 2.7452552318573 }, { "auxiliary_loss_clip": 0.01349341, "auxiliary_loss_mlp": 0.01038679, "balance_loss_clip": 1.06181586, "balance_loss_mlp": 1.0291605, "epoch": 0.09366921180785186, "flos": 24279563779200.0, "grad_norm": 1.7330541563695752, "language_loss": 0.89551222, "learning_rate": 3.957698308516016e-06, "loss": 0.91939247, "num_input_tokens_seen": 16644985, "step": 779, "time_per_iteration": 2.7227189540863037 }, { "auxiliary_loss_clip": 0.01298793, "auxiliary_loss_mlp": 0.02578303, "balance_loss_clip": 1.06932855, "balance_loss_mlp": 1.00015235, "epoch": 0.09378945469849095, "flos": 18730036419840.0, "grad_norm": 2.0063095413566665, "language_loss": 0.82348573, "learning_rate": 3.957538795426688e-06, "loss": 0.86225671, "num_input_tokens_seen": 16662410, "step": 780, "time_per_iteration": 2.6857783794403076 }, { "auxiliary_loss_clip": 0.01354879, "auxiliary_loss_mlp": 0.01035411, "balance_loss_clip": 1.06599164, "balance_loss_mlp": 1.02592802, "epoch": 0.09390969758913004, "flos": 23218834222080.0, "grad_norm": 5.292314124655492, "language_loss": 0.77017176, "learning_rate": 3.9573789853802804e-06, "loss": 0.79407465, "num_input_tokens_seen": 16680885, "step": 781, "time_per_iteration": 2.6873743534088135 }, { "auxiliary_loss_clip": 0.0135271, "auxiliary_loss_mlp": 0.02580956, "balance_loss_clip": 1.06863463, "balance_loss_mlp": 1.00016832, "epoch": 0.09402994047976913, "flos": 19646728439040.0, "grad_norm": 1.9796106928531685, "language_loss": 0.74743629, "learning_rate": 3.957218878401037e-06, "loss": 0.78677291, "num_input_tokens_seen": 16699375, "step": 782, "time_per_iteration": 2.748136520385742 }, { "auxiliary_loss_clip": 0.01258772, "auxiliary_loss_mlp": 0.01039207, "balance_loss_clip": 1.07331872, "balance_loss_mlp": 1.02903223, "epoch": 0.09415018337040823, "flos": 29420463041280.0, "grad_norm": 1.8937080079344857, "language_loss": 0.89350581, "learning_rate": 3.957058474513246e-06, "loss": 0.91648555, "num_input_tokens_seen": 16719230, "step": 783, "time_per_iteration": 2.6662962436676025 }, { "auxiliary_loss_clip": 0.0130446, "auxiliary_loss_mlp": 0.01039295, "balance_loss_clip": 1.07030725, "balance_loss_mlp": 1.02987766, "epoch": 0.09427042626104731, "flos": 24572092141440.0, "grad_norm": 1.9074537137907868, "language_loss": 0.78553396, "learning_rate": 3.956897773741241e-06, "loss": 0.80897152, "num_input_tokens_seen": 16738220, "step": 784, "time_per_iteration": 2.8393797874450684 }, { "auxiliary_loss_clip": 0.01341039, "auxiliary_loss_mlp": 0.01037614, "balance_loss_clip": 1.06089544, "balance_loss_mlp": 1.02786303, "epoch": 0.09439066915168641, "flos": 26359581576960.0, "grad_norm": 1.8169635397708785, "language_loss": 0.71213341, "learning_rate": 3.956736776109398e-06, "loss": 0.73591995, "num_input_tokens_seen": 16759395, "step": 785, "time_per_iteration": 2.759185314178467 }, { "auxiliary_loss_clip": 0.01289307, "auxiliary_loss_mlp": 0.0258298, "balance_loss_clip": 1.0600791, "balance_loss_mlp": 1.0001893, "epoch": 0.09451091204232549, "flos": 19427278296960.0, "grad_norm": 2.7290088573052933, "language_loss": 0.83541107, "learning_rate": 3.956575481642143e-06, "loss": 0.87413394, "num_input_tokens_seen": 16778285, "step": 786, "time_per_iteration": 2.692680597305298 }, { "auxiliary_loss_clip": 0.01442805, "auxiliary_loss_mlp": 0.01037114, "balance_loss_clip": 1.05516315, "balance_loss_mlp": 1.0278337, "epoch": 0.09463115493296459, "flos": 25368051571200.0, "grad_norm": 2.3465075328999365, "language_loss": 0.74849319, "learning_rate": 3.956413890363943e-06, "loss": 0.77329236, "num_input_tokens_seen": 16795265, "step": 787, "time_per_iteration": 2.8058485984802246 }, { "auxiliary_loss_clip": 0.01302729, "auxiliary_loss_mlp": 0.01034894, "balance_loss_clip": 1.06897068, "balance_loss_mlp": 1.02450466, "epoch": 0.09475139782360369, "flos": 10123254869760.0, "grad_norm": 2.559183535337093, "language_loss": 0.81780982, "learning_rate": 3.956252002299312e-06, "loss": 0.84118605, "num_input_tokens_seen": 16811165, "step": 788, "time_per_iteration": 2.635739803314209 }, { "auxiliary_loss_clip": 0.01253591, "auxiliary_loss_mlp": 0.01039898, "balance_loss_clip": 1.07018113, "balance_loss_mlp": 1.03068948, "epoch": 0.09487164071424277, "flos": 17231088936960.0, "grad_norm": 1.8891441524584396, "language_loss": 0.91263819, "learning_rate": 3.956089817472807e-06, "loss": 0.93557298, "num_input_tokens_seen": 16828470, "step": 789, "time_per_iteration": 2.6143836975097656 }, { "auxiliary_loss_clip": 0.01347876, "auxiliary_loss_mlp": 0.01039898, "balance_loss_clip": 1.06557083, "balance_loss_mlp": 1.02942574, "epoch": 0.09499188360488187, "flos": 30849564528000.0, "grad_norm": 3.121485379887564, "language_loss": 0.85626978, "learning_rate": 3.955927335909032e-06, "loss": 0.88014746, "num_input_tokens_seen": 16851680, "step": 790, "time_per_iteration": 2.7574801445007324 }, { "auxiliary_loss_clip": 0.01445303, "auxiliary_loss_mlp": 0.01044821, "balance_loss_clip": 1.0629952, "balance_loss_mlp": 1.03477192, "epoch": 0.09511212649552095, "flos": 29351694453120.0, "grad_norm": 2.142713441826949, "language_loss": 0.75920862, "learning_rate": 3.955764557632634e-06, "loss": 0.78410983, "num_input_tokens_seen": 16871490, "step": 791, "time_per_iteration": 2.8307130336761475 }, { "auxiliary_loss_clip": 0.01343906, "auxiliary_loss_mlp": 0.01034695, "balance_loss_clip": 1.06266606, "balance_loss_mlp": 1.02496219, "epoch": 0.09523236938616005, "flos": 10378687461120.0, "grad_norm": 2.821064854403045, "language_loss": 0.94545352, "learning_rate": 3.955601482668309e-06, "loss": 0.96923959, "num_input_tokens_seen": 16889350, "step": 792, "time_per_iteration": 2.652146816253662 }, { "auxiliary_loss_clip": 0.01447796, "auxiliary_loss_mlp": 0.01032097, "balance_loss_clip": 1.05687332, "balance_loss_mlp": 1.0218749, "epoch": 0.09535261227679913, "flos": 19061815368960.0, "grad_norm": 1.9037487233876924, "language_loss": 0.88375258, "learning_rate": 3.955438111040794e-06, "loss": 0.90855151, "num_input_tokens_seen": 16907625, "step": 793, "time_per_iteration": 2.8065950870513916 }, { "auxiliary_loss_clip": 0.01440078, "auxiliary_loss_mlp": 0.01031836, "balance_loss_clip": 1.05818176, "balance_loss_mlp": 1.02223969, "epoch": 0.09547285516743823, "flos": 20922993555840.0, "grad_norm": 2.1363304538330277, "language_loss": 0.80234361, "learning_rate": 3.955274442774873e-06, "loss": 0.82706273, "num_input_tokens_seen": 16926205, "step": 794, "time_per_iteration": 2.7760157585144043 }, { "auxiliary_loss_clip": 0.01297406, "auxiliary_loss_mlp": 0.01047135, "balance_loss_clip": 1.06263733, "balance_loss_mlp": 1.03722906, "epoch": 0.09559309805807732, "flos": 30154405639680.0, "grad_norm": 2.3626513442612205, "language_loss": 0.70433891, "learning_rate": 3.9551104778953725e-06, "loss": 0.72778428, "num_input_tokens_seen": 16946500, "step": 795, "time_per_iteration": 2.734409809112549 }, { "auxiliary_loss_clip": 0.01395315, "auxiliary_loss_mlp": 0.01034249, "balance_loss_clip": 1.05897343, "balance_loss_mlp": 1.02460504, "epoch": 0.0957133409487164, "flos": 21066743784960.0, "grad_norm": 1.855307641719697, "language_loss": 0.85444975, "learning_rate": 3.954946216427167e-06, "loss": 0.87874532, "num_input_tokens_seen": 16966960, "step": 796, "time_per_iteration": 3.796165943145752 }, { "auxiliary_loss_clip": 0.01307157, "auxiliary_loss_mlp": 0.01007643, "balance_loss_clip": 1.03402925, "balance_loss_mlp": 1.00456762, "epoch": 0.0958335838393555, "flos": 71297979315840.0, "grad_norm": 0.8781008428262876, "language_loss": 0.61572999, "learning_rate": 3.954781658395176e-06, "loss": 0.63887799, "num_input_tokens_seen": 17023215, "step": 797, "time_per_iteration": 3.224285125732422 }, { "auxiliary_loss_clip": 0.01353458, "auxiliary_loss_mlp": 0.0103977, "balance_loss_clip": 1.06078446, "balance_loss_mlp": 1.02917254, "epoch": 0.09595382672999458, "flos": 21872974504320.0, "grad_norm": 2.2251914122058305, "language_loss": 0.92381334, "learning_rate": 3.95461680382436e-06, "loss": 0.94774556, "num_input_tokens_seen": 17042140, "step": 798, "time_per_iteration": 2.744856834411621 }, { "auxiliary_loss_clip": 0.01302539, "auxiliary_loss_mlp": 0.01044018, "balance_loss_clip": 1.07004166, "balance_loss_mlp": 1.03370094, "epoch": 0.09607406962063368, "flos": 18695562341760.0, "grad_norm": 3.4683293536497057, "language_loss": 0.86272025, "learning_rate": 3.9544516527397295e-06, "loss": 0.88618582, "num_input_tokens_seen": 17058490, "step": 799, "time_per_iteration": 3.5638248920440674 }, { "auxiliary_loss_clip": 0.01336385, "auxiliary_loss_mlp": 0.01035011, "balance_loss_clip": 1.06079924, "balance_loss_mlp": 1.02567673, "epoch": 0.09619431251127276, "flos": 22568456615040.0, "grad_norm": 1.9224407063806832, "language_loss": 0.80537695, "learning_rate": 3.954286205166338e-06, "loss": 0.82909095, "num_input_tokens_seen": 17079655, "step": 800, "time_per_iteration": 3.633012533187866 }, { "auxiliary_loss_clip": 0.01305729, "auxiliary_loss_mlp": 0.0103605, "balance_loss_clip": 1.07083166, "balance_loss_mlp": 1.02580392, "epoch": 0.09631455540191186, "flos": 14246230608000.0, "grad_norm": 5.020085940932602, "language_loss": 0.84101719, "learning_rate": 3.954120461129282e-06, "loss": 0.86443496, "num_input_tokens_seen": 17097065, "step": 801, "time_per_iteration": 2.6699647903442383 }, { "auxiliary_loss_clip": 0.01253412, "auxiliary_loss_mlp": 0.01033982, "balance_loss_clip": 1.07040751, "balance_loss_mlp": 1.02451706, "epoch": 0.09643479829255096, "flos": 20740387789440.0, "grad_norm": 2.112756336900727, "language_loss": 0.83810461, "learning_rate": 3.953954420653706e-06, "loss": 0.8609786, "num_input_tokens_seen": 17114090, "step": 802, "time_per_iteration": 2.6397109031677246 }, { "auxiliary_loss_clip": 0.01302618, "auxiliary_loss_mlp": 0.01037963, "balance_loss_clip": 1.06729817, "balance_loss_mlp": 1.02750862, "epoch": 0.09655504118319004, "flos": 24420476833920.0, "grad_norm": 2.2183713137845733, "language_loss": 0.88147485, "learning_rate": 3.953788083764798e-06, "loss": 0.90488064, "num_input_tokens_seen": 17133325, "step": 803, "time_per_iteration": 3.5655534267425537 }, { "auxiliary_loss_clip": 0.01446458, "auxiliary_loss_mlp": 0.01033517, "balance_loss_clip": 1.06038308, "balance_loss_mlp": 1.02262712, "epoch": 0.09667528407382914, "flos": 18441961344000.0, "grad_norm": 2.5626233109635557, "language_loss": 0.92146331, "learning_rate": 3.953621450487792e-06, "loss": 0.94626302, "num_input_tokens_seen": 17151945, "step": 804, "time_per_iteration": 2.769533634185791 }, { "auxiliary_loss_clip": 0.01158109, "auxiliary_loss_mlp": 0.0100909, "balance_loss_clip": 1.04054379, "balance_loss_mlp": 1.0054419, "epoch": 0.09679552696446822, "flos": 70816455544320.0, "grad_norm": 0.8661937028060824, "language_loss": 0.61236465, "learning_rate": 3.953454520847964e-06, "loss": 0.6340366, "num_input_tokens_seen": 17216790, "step": 805, "time_per_iteration": 3.3543875217437744 }, { "auxiliary_loss_clip": 0.01348135, "auxiliary_loss_mlp": 0.01038352, "balance_loss_clip": 1.06193972, "balance_loss_mlp": 1.02745605, "epoch": 0.09691576985510732, "flos": 21945514020480.0, "grad_norm": 2.0630903265838394, "language_loss": 0.73673344, "learning_rate": 3.9532872948706395e-06, "loss": 0.76059824, "num_input_tokens_seen": 17236285, "step": 806, "time_per_iteration": 2.71807599067688 }, { "auxiliary_loss_clip": 0.01352046, "auxiliary_loss_mlp": 0.01042137, "balance_loss_clip": 1.06504726, "balance_loss_mlp": 1.0320282, "epoch": 0.09703601274574641, "flos": 17965211103360.0, "grad_norm": 3.114135835783417, "language_loss": 0.82817769, "learning_rate": 3.9531197725811845e-06, "loss": 0.85211957, "num_input_tokens_seen": 17251670, "step": 807, "time_per_iteration": 2.6773998737335205 }, { "auxiliary_loss_clip": 0.012524, "auxiliary_loss_mlp": 0.01033983, "balance_loss_clip": 1.07073212, "balance_loss_mlp": 1.02441096, "epoch": 0.0971562556363855, "flos": 22162162901760.0, "grad_norm": 2.005967189399882, "language_loss": 0.87951058, "learning_rate": 3.952951954005013e-06, "loss": 0.90237439, "num_input_tokens_seen": 17271355, "step": 808, "time_per_iteration": 2.6130824089050293 }, { "auxiliary_loss_clip": 0.01342179, "auxiliary_loss_mlp": 0.01038281, "balance_loss_clip": 1.05819821, "balance_loss_mlp": 1.02893496, "epoch": 0.0972764985270246, "flos": 25848716394240.0, "grad_norm": 1.7409043381265648, "language_loss": 0.84615773, "learning_rate": 3.952783839167584e-06, "loss": 0.86996233, "num_input_tokens_seen": 17291400, "step": 809, "time_per_iteration": 2.735697031021118 }, { "auxiliary_loss_clip": 0.01298295, "auxiliary_loss_mlp": 0.01041456, "balance_loss_clip": 1.06500268, "balance_loss_mlp": 1.03069711, "epoch": 0.09739674141766368, "flos": 20339373375360.0, "grad_norm": 2.720032169766388, "language_loss": 0.74442279, "learning_rate": 3.952615428094398e-06, "loss": 0.76782024, "num_input_tokens_seen": 17310920, "step": 810, "time_per_iteration": 2.6495962142944336 }, { "auxiliary_loss_clip": 0.01437086, "auxiliary_loss_mlp": 0.0104259, "balance_loss_clip": 1.05481482, "balance_loss_mlp": 1.0331372, "epoch": 0.09751698430830277, "flos": 15743059188480.0, "grad_norm": 1.8250951979139092, "language_loss": 0.7343688, "learning_rate": 3.952446720811004e-06, "loss": 0.75916553, "num_input_tokens_seen": 17329245, "step": 811, "time_per_iteration": 2.8345518112182617 }, { "auxiliary_loss_clip": 0.0130363, "auxiliary_loss_mlp": 0.01005574, "balance_loss_clip": 1.03610396, "balance_loss_mlp": 1.00223577, "epoch": 0.09763722719894186, "flos": 63716806800000.0, "grad_norm": 0.8322198178218507, "language_loss": 0.63597763, "learning_rate": 3.952277717342995e-06, "loss": 0.65906972, "num_input_tokens_seen": 17395680, "step": 812, "time_per_iteration": 3.3960976600646973 }, { "auxiliary_loss_clip": 0.01354859, "auxiliary_loss_mlp": 0.01045796, "balance_loss_clip": 1.06398451, "balance_loss_mlp": 1.0353893, "epoch": 0.09775747008958095, "flos": 22090916275200.0, "grad_norm": 3.300995286679296, "language_loss": 0.8561461, "learning_rate": 3.952108417716009e-06, "loss": 0.88015264, "num_input_tokens_seen": 17415135, "step": 813, "time_per_iteration": 2.694577693939209 }, { "auxiliary_loss_clip": 0.01305712, "auxiliary_loss_mlp": 0.01043499, "balance_loss_clip": 1.06960356, "balance_loss_mlp": 1.03262138, "epoch": 0.09787771298022005, "flos": 21286050272640.0, "grad_norm": 2.67947657467663, "language_loss": 0.85130829, "learning_rate": 3.951938821955727e-06, "loss": 0.87480044, "num_input_tokens_seen": 17434535, "step": 814, "time_per_iteration": 2.697467088699341 }, { "auxiliary_loss_clip": 0.01350048, "auxiliary_loss_mlp": 0.01041953, "balance_loss_clip": 1.06405711, "balance_loss_mlp": 1.03183746, "epoch": 0.09799795587085913, "flos": 22054574689920.0, "grad_norm": 1.6939892030102104, "language_loss": 0.76625752, "learning_rate": 3.9517689300878786e-06, "loss": 0.79017746, "num_input_tokens_seen": 17454270, "step": 815, "time_per_iteration": 2.7347590923309326 }, { "auxiliary_loss_clip": 0.01249764, "auxiliary_loss_mlp": 0.01035997, "balance_loss_clip": 1.06611609, "balance_loss_mlp": 1.02624011, "epoch": 0.09811819876149823, "flos": 22163743100160.0, "grad_norm": 1.756564118561201, "language_loss": 0.78680634, "learning_rate": 3.951598742138236e-06, "loss": 0.80966395, "num_input_tokens_seen": 17472995, "step": 816, "time_per_iteration": 2.607825517654419 }, { "auxiliary_loss_clip": 0.0135286, "auxiliary_loss_mlp": 0.01039513, "balance_loss_clip": 1.06011462, "balance_loss_mlp": 1.02833164, "epoch": 0.09823844165213731, "flos": 22231111057920.0, "grad_norm": 2.3816179409227987, "language_loss": 0.79656738, "learning_rate": 3.951428258132615e-06, "loss": 0.82049108, "num_input_tokens_seen": 17491115, "step": 817, "time_per_iteration": 2.6606271266937256 }, { "auxiliary_loss_clip": 0.01352758, "auxiliary_loss_mlp": 0.01042977, "balance_loss_clip": 1.06827521, "balance_loss_mlp": 1.0333153, "epoch": 0.09835868454277641, "flos": 22487728798080.0, "grad_norm": 1.878565184958034, "language_loss": 0.84609395, "learning_rate": 3.951257478096879e-06, "loss": 0.87005132, "num_input_tokens_seen": 17509480, "step": 818, "time_per_iteration": 2.831615686416626 }, { "auxiliary_loss_clip": 0.01353153, "auxiliary_loss_mlp": 0.02586715, "balance_loss_clip": 1.06674218, "balance_loss_mlp": 1.00010407, "epoch": 0.0984789274334155, "flos": 16362554077440.0, "grad_norm": 4.5417733200795976, "language_loss": 0.68203533, "learning_rate": 3.951086402056936e-06, "loss": 0.721434, "num_input_tokens_seen": 17524080, "step": 819, "time_per_iteration": 2.766151189804077 }, { "auxiliary_loss_clip": 0.01545048, "auxiliary_loss_mlp": 0.02582232, "balance_loss_clip": 1.06100988, "balance_loss_mlp": 1.00014377, "epoch": 0.09859917032405459, "flos": 24243545416320.0, "grad_norm": 1.705610350307804, "language_loss": 0.83839619, "learning_rate": 3.950915030038735e-06, "loss": 0.87966901, "num_input_tokens_seen": 17543875, "step": 820, "time_per_iteration": 3.245574474334717 }, { "auxiliary_loss_clip": 0.01301975, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.06888199, "balance_loss_mlp": 1.0273416, "epoch": 0.09871941321469369, "flos": 17420195064960.0, "grad_norm": 2.0042840614351065, "language_loss": 0.83153558, "learning_rate": 3.9507433620682765e-06, "loss": 0.85491836, "num_input_tokens_seen": 17560810, "step": 821, "time_per_iteration": 2.759399890899658 }, { "auxiliary_loss_clip": 0.01397249, "auxiliary_loss_mlp": 0.01042341, "balance_loss_clip": 1.06214011, "balance_loss_mlp": 1.03096819, "epoch": 0.09883965610533277, "flos": 28477341590400.0, "grad_norm": 2.3302838217099957, "language_loss": 0.88266319, "learning_rate": 3.9505713981716e-06, "loss": 0.90705907, "num_input_tokens_seen": 17583640, "step": 822, "time_per_iteration": 3.7172889709472656 }, { "auxiliary_loss_clip": 0.01341136, "auxiliary_loss_mlp": 0.01036175, "balance_loss_clip": 1.06578994, "balance_loss_mlp": 1.02569079, "epoch": 0.09895989899597187, "flos": 23693932437120.0, "grad_norm": 1.7423176327569767, "language_loss": 0.81422752, "learning_rate": 3.950399138374795e-06, "loss": 0.83800066, "num_input_tokens_seen": 17602720, "step": 823, "time_per_iteration": 2.733043909072876 }, { "auxiliary_loss_clip": 0.01298087, "auxiliary_loss_mlp": 0.01041376, "balance_loss_clip": 1.0634973, "balance_loss_mlp": 1.03150535, "epoch": 0.09908014188661095, "flos": 24679608526080.0, "grad_norm": 1.8032670349109168, "language_loss": 0.74445665, "learning_rate": 3.95022658270399e-06, "loss": 0.76785123, "num_input_tokens_seen": 17623085, "step": 824, "time_per_iteration": 2.7748830318450928 }, { "auxiliary_loss_clip": 0.01348419, "auxiliary_loss_mlp": 0.01038867, "balance_loss_clip": 1.06459451, "balance_loss_mlp": 1.0284121, "epoch": 0.09920038477725004, "flos": 14064307200000.0, "grad_norm": 2.0002958338262187, "language_loss": 0.78307974, "learning_rate": 3.9500537311853635e-06, "loss": 0.8069526, "num_input_tokens_seen": 17641040, "step": 825, "time_per_iteration": 3.572065830230713 }, { "auxiliary_loss_clip": 0.01300943, "auxiliary_loss_mlp": 0.01033861, "balance_loss_clip": 1.06352174, "balance_loss_mlp": 1.02316749, "epoch": 0.09932062766788914, "flos": 13407070095360.0, "grad_norm": 3.9795129805510787, "language_loss": 0.82857579, "learning_rate": 3.949880583845136e-06, "loss": 0.85192382, "num_input_tokens_seen": 17659115, "step": 826, "time_per_iteration": 2.658402681350708 }, { "auxiliary_loss_clip": 0.0135094, "auxiliary_loss_mlp": 0.01042451, "balance_loss_clip": 1.06425178, "balance_loss_mlp": 1.0315969, "epoch": 0.09944087055852822, "flos": 19500751566720.0, "grad_norm": 2.300144202827262, "language_loss": 0.81137884, "learning_rate": 3.949707140709575e-06, "loss": 0.83531272, "num_input_tokens_seen": 17678845, "step": 827, "time_per_iteration": 2.728961706161499 }, { "auxiliary_loss_clip": 0.01301776, "auxiliary_loss_mlp": 0.0103221, "balance_loss_clip": 1.06339192, "balance_loss_mlp": 1.02232754, "epoch": 0.09956111344916732, "flos": 17749100926080.0, "grad_norm": 3.698818506576039, "language_loss": 0.83373415, "learning_rate": 3.949533401804991e-06, "loss": 0.85707396, "num_input_tokens_seen": 17695750, "step": 828, "time_per_iteration": 2.7012412548065186 }, { "auxiliary_loss_clip": 0.01297643, "auxiliary_loss_mlp": 0.02586823, "balance_loss_clip": 1.0649246, "balance_loss_mlp": 1.00017715, "epoch": 0.0996813563398064, "flos": 17967581400960.0, "grad_norm": 2.104135310068123, "language_loss": 0.9090116, "learning_rate": 3.949359367157739e-06, "loss": 0.94785631, "num_input_tokens_seen": 17714445, "step": 829, "time_per_iteration": 3.4236602783203125 }, { "auxiliary_loss_clip": 0.01303851, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 1.06641686, "balance_loss_mlp": 1.02463317, "epoch": 0.0998015992304455, "flos": 17457039440640.0, "grad_norm": 2.1256782320122336, "language_loss": 0.7586543, "learning_rate": 3.949185036794222e-06, "loss": 0.78204823, "num_input_tokens_seen": 17732455, "step": 830, "time_per_iteration": 2.7445285320281982 }, { "auxiliary_loss_clip": 0.01250084, "auxiliary_loss_mlp": 0.01038, "balance_loss_clip": 1.06826544, "balance_loss_mlp": 1.02792645, "epoch": 0.0999218421210846, "flos": 25888757080320.0, "grad_norm": 1.6700352833308703, "language_loss": 0.7871033, "learning_rate": 3.949010410740884e-06, "loss": 0.80998409, "num_input_tokens_seen": 17755280, "step": 831, "time_per_iteration": 2.7063138484954834 }, { "auxiliary_loss_clip": 0.01340093, "auxiliary_loss_mlp": 0.02586968, "balance_loss_clip": 1.06156838, "balance_loss_mlp": 1.00012934, "epoch": 0.10004208501172368, "flos": 21215916967680.0, "grad_norm": 1.6929425756514753, "language_loss": 0.86559629, "learning_rate": 3.948835489024216e-06, "loss": 0.90486687, "num_input_tokens_seen": 17775015, "step": 832, "time_per_iteration": 2.7119762897491455 }, { "auxiliary_loss_clip": 0.01302414, "auxiliary_loss_mlp": 0.01038415, "balance_loss_clip": 1.06465936, "balance_loss_mlp": 1.02864015, "epoch": 0.10016232790236278, "flos": 17348409734400.0, "grad_norm": 2.065389451346853, "language_loss": 0.9021489, "learning_rate": 3.948660271670755e-06, "loss": 0.9255572, "num_input_tokens_seen": 17792165, "step": 833, "time_per_iteration": 2.6568450927734375 }, { "auxiliary_loss_clip": 0.01343759, "auxiliary_loss_mlp": 0.01044636, "balance_loss_clip": 1.06412292, "balance_loss_mlp": 1.03397894, "epoch": 0.10028257079300186, "flos": 25666541591040.0, "grad_norm": 2.513647871963231, "language_loss": 0.84353143, "learning_rate": 3.948484758707079e-06, "loss": 0.86741543, "num_input_tokens_seen": 17811765, "step": 834, "time_per_iteration": 2.7861578464508057 }, { "auxiliary_loss_clip": 0.01390252, "auxiliary_loss_mlp": 0.01042374, "balance_loss_clip": 1.0600909, "balance_loss_mlp": 1.03211594, "epoch": 0.10040281368364096, "flos": 25156035544320.0, "grad_norm": 1.9055475613971458, "language_loss": 0.83512115, "learning_rate": 3.948308950159815e-06, "loss": 0.85944742, "num_input_tokens_seen": 17830445, "step": 835, "time_per_iteration": 2.7812860012054443 }, { "auxiliary_loss_clip": 0.01389568, "auxiliary_loss_mlp": 0.01049471, "balance_loss_clip": 1.05799627, "balance_loss_mlp": 1.03776419, "epoch": 0.10052305657428004, "flos": 17603303621760.0, "grad_norm": 2.773156265467036, "language_loss": 0.75865924, "learning_rate": 3.9481328460556326e-06, "loss": 0.78304964, "num_input_tokens_seen": 17847665, "step": 836, "time_per_iteration": 2.721379041671753 }, { "auxiliary_loss_clip": 0.01342897, "auxiliary_loss_mlp": 0.01036679, "balance_loss_clip": 1.06079221, "balance_loss_mlp": 1.02624822, "epoch": 0.10064329946491914, "flos": 18660154510080.0, "grad_norm": 2.111303597573002, "language_loss": 0.89695066, "learning_rate": 3.9479564464212455e-06, "loss": 0.92074645, "num_input_tokens_seen": 17866825, "step": 837, "time_per_iteration": 2.7622225284576416 }, { "auxiliary_loss_clip": 0.0125548, "auxiliary_loss_mlp": 0.01045309, "balance_loss_clip": 1.06680083, "balance_loss_mlp": 1.03432977, "epoch": 0.10076354235555823, "flos": 17199056983680.0, "grad_norm": 2.66184649373113, "language_loss": 0.76359916, "learning_rate": 3.947779751283414e-06, "loss": 0.78660697, "num_input_tokens_seen": 17883995, "step": 838, "time_per_iteration": 2.601921796798706 }, { "auxiliary_loss_clip": 0.01306564, "auxiliary_loss_mlp": 0.02587888, "balance_loss_clip": 1.0735085, "balance_loss_mlp": 1.00014162, "epoch": 0.10088378524619732, "flos": 22962252395520.0, "grad_norm": 1.9090757641667346, "language_loss": 0.76190615, "learning_rate": 3.947602760668944e-06, "loss": 0.80085063, "num_input_tokens_seen": 17903785, "step": 839, "time_per_iteration": 2.6983680725097656 }, { "auxiliary_loss_clip": 0.01304404, "auxiliary_loss_mlp": 0.01043458, "balance_loss_clip": 1.070297, "balance_loss_mlp": 1.0320015, "epoch": 0.10100402813683641, "flos": 37885828746240.0, "grad_norm": 1.747946456929717, "language_loss": 0.71343321, "learning_rate": 3.947425474604684e-06, "loss": 0.73691177, "num_input_tokens_seen": 17927720, "step": 840, "time_per_iteration": 2.7661752700805664 }, { "auxiliary_loss_clip": 0.01352032, "auxiliary_loss_mlp": 0.01044561, "balance_loss_clip": 1.06433249, "balance_loss_mlp": 1.03419018, "epoch": 0.1011242710274755, "flos": 21543458112000.0, "grad_norm": 3.0637580333462004, "language_loss": 0.92237216, "learning_rate": 3.947247893117528e-06, "loss": 0.94633806, "num_input_tokens_seen": 17946225, "step": 841, "time_per_iteration": 2.7693183422088623 }, { "auxiliary_loss_clip": 0.01296589, "auxiliary_loss_mlp": 0.01036179, "balance_loss_clip": 1.06247568, "balance_loss_mlp": 1.02397168, "epoch": 0.10124451391811459, "flos": 13621456419840.0, "grad_norm": 3.00290162648237, "language_loss": 0.69262087, "learning_rate": 3.947070016234413e-06, "loss": 0.71594852, "num_input_tokens_seen": 17962015, "step": 842, "time_per_iteration": 2.5775561332702637 }, { "auxiliary_loss_clip": 0.01363807, "auxiliary_loss_mlp": 0.01041379, "balance_loss_clip": 1.06748295, "balance_loss_mlp": 1.03045893, "epoch": 0.10136475680875369, "flos": 16649228522880.0, "grad_norm": 2.3370184714401376, "language_loss": 0.74689561, "learning_rate": 3.946891843982326e-06, "loss": 0.77094746, "num_input_tokens_seen": 17979680, "step": 843, "time_per_iteration": 2.7018189430236816 }, { "auxiliary_loss_clip": 0.01306154, "auxiliary_loss_mlp": 0.0104131, "balance_loss_clip": 1.07071114, "balance_loss_mlp": 1.03074265, "epoch": 0.10148499969939277, "flos": 19461034103040.0, "grad_norm": 5.000096303466017, "language_loss": 0.7441259, "learning_rate": 3.9467133763882935e-06, "loss": 0.76760054, "num_input_tokens_seen": 17998145, "step": 844, "time_per_iteration": 2.6081578731536865 }, { "auxiliary_loss_clip": 0.01293608, "auxiliary_loss_mlp": 0.01041343, "balance_loss_clip": 1.06418085, "balance_loss_mlp": 1.03026271, "epoch": 0.10160524259003187, "flos": 21104988791040.0, "grad_norm": 2.160297694559054, "language_loss": 0.86166215, "learning_rate": 3.9465346134793905e-06, "loss": 0.88501167, "num_input_tokens_seen": 18017955, "step": 845, "time_per_iteration": 2.6279892921447754 }, { "auxiliary_loss_clip": 0.01400756, "auxiliary_loss_mlp": 0.01037706, "balance_loss_clip": 1.06657743, "balance_loss_mlp": 1.02648282, "epoch": 0.10172548548067095, "flos": 17712687513600.0, "grad_norm": 1.9810872742552859, "language_loss": 0.79641867, "learning_rate": 3.9463555552827335e-06, "loss": 0.82080328, "num_input_tokens_seen": 18035125, "step": 846, "time_per_iteration": 2.7496979236602783 }, { "auxiliary_loss_clip": 0.01293261, "auxiliary_loss_mlp": 0.01034873, "balance_loss_clip": 1.06492686, "balance_loss_mlp": 1.02458572, "epoch": 0.10184572837131005, "flos": 21104845136640.0, "grad_norm": 2.466958964085333, "language_loss": 0.86365438, "learning_rate": 3.946176201825487e-06, "loss": 0.88693571, "num_input_tokens_seen": 18053160, "step": 847, "time_per_iteration": 2.685763120651245 }, { "auxiliary_loss_clip": 0.01352523, "auxiliary_loss_mlp": 0.01038489, "balance_loss_clip": 1.06782162, "balance_loss_mlp": 1.0277009, "epoch": 0.10196597126194913, "flos": 26067591918720.0, "grad_norm": 2.093672568118895, "language_loss": 0.83569312, "learning_rate": 3.9459965531348575e-06, "loss": 0.85960323, "num_input_tokens_seen": 18072815, "step": 848, "time_per_iteration": 3.571596145629883 }, { "auxiliary_loss_clip": 0.0135507, "auxiliary_loss_mlp": 0.02589704, "balance_loss_clip": 1.06685448, "balance_loss_mlp": 1.00022531, "epoch": 0.10208621415258823, "flos": 29314634595840.0, "grad_norm": 2.3532693893168766, "language_loss": 0.85170591, "learning_rate": 3.945816609238098e-06, "loss": 0.89115369, "num_input_tokens_seen": 18092225, "step": 849, "time_per_iteration": 2.709230899810791 }, { "auxiliary_loss_clip": 0.0144598, "auxiliary_loss_mlp": 0.01039489, "balance_loss_clip": 1.0630784, "balance_loss_mlp": 1.02849197, "epoch": 0.10220645704322733, "flos": 23805794367360.0, "grad_norm": 2.602493067714094, "language_loss": 0.85317552, "learning_rate": 3.945636370162507e-06, "loss": 0.87803018, "num_input_tokens_seen": 18112335, "step": 850, "time_per_iteration": 2.79087233543396 }, { "auxiliary_loss_clip": 0.01299931, "auxiliary_loss_mlp": 0.01033223, "balance_loss_clip": 1.06664443, "balance_loss_mlp": 1.02294147, "epoch": 0.10232669993386641, "flos": 23218546913280.0, "grad_norm": 3.0138658758730537, "language_loss": 0.79482681, "learning_rate": 3.945455835935425e-06, "loss": 0.81815839, "num_input_tokens_seen": 18131520, "step": 851, "time_per_iteration": 3.6043152809143066 }, { "auxiliary_loss_clip": 0.01343116, "auxiliary_loss_mlp": 0.01042786, "balance_loss_clip": 1.064237, "balance_loss_mlp": 1.03140163, "epoch": 0.1024469428245055, "flos": 22922929981440.0, "grad_norm": 2.9926095511414985, "language_loss": 0.75070095, "learning_rate": 3.94527500658424e-06, "loss": 0.77455997, "num_input_tokens_seen": 18149185, "step": 852, "time_per_iteration": 2.7230799198150635 }, { "auxiliary_loss_clip": 0.01438397, "auxiliary_loss_mlp": 0.01035912, "balance_loss_clip": 1.06229687, "balance_loss_mlp": 1.02632117, "epoch": 0.10256718571514459, "flos": 31359495957120.0, "grad_norm": 1.9174218095066955, "language_loss": 0.81454813, "learning_rate": 3.945093882136382e-06, "loss": 0.83929127, "num_input_tokens_seen": 18172960, "step": 853, "time_per_iteration": 2.890918016433716 }, { "auxiliary_loss_clip": 0.01353718, "auxiliary_loss_mlp": 0.02583719, "balance_loss_clip": 1.07037961, "balance_loss_mlp": 1.00022626, "epoch": 0.10268742860578368, "flos": 23474877344640.0, "grad_norm": 2.174854289909286, "language_loss": 0.84821534, "learning_rate": 3.944912462619329e-06, "loss": 0.88758969, "num_input_tokens_seen": 18191925, "step": 854, "time_per_iteration": 2.7103817462921143 }, { "auxiliary_loss_clip": 0.01354252, "auxiliary_loss_mlp": 0.01037453, "balance_loss_clip": 1.06519496, "balance_loss_mlp": 1.02601469, "epoch": 0.10280767149642277, "flos": 25520313323520.0, "grad_norm": 2.915545647260689, "language_loss": 0.80504239, "learning_rate": 3.9447307480606025e-06, "loss": 0.82895947, "num_input_tokens_seen": 18212010, "step": 855, "time_per_iteration": 3.6548516750335693 }, { "auxiliary_loss_clip": 0.01347285, "auxiliary_loss_mlp": 0.0103845, "balance_loss_clip": 1.06437457, "balance_loss_mlp": 1.02812028, "epoch": 0.10292791438706186, "flos": 17347691462400.0, "grad_norm": 4.090083955652681, "language_loss": 0.90432471, "learning_rate": 3.944548738487767e-06, "loss": 0.92818213, "num_input_tokens_seen": 18229525, "step": 856, "time_per_iteration": 2.6541080474853516 }, { "auxiliary_loss_clip": 0.01259482, "auxiliary_loss_mlp": 0.01044471, "balance_loss_clip": 1.07301736, "balance_loss_mlp": 1.03346813, "epoch": 0.10304815727770096, "flos": 27052693390080.0, "grad_norm": 2.9288207290056323, "language_loss": 0.90937489, "learning_rate": 3.944366433928434e-06, "loss": 0.93241447, "num_input_tokens_seen": 18249505, "step": 857, "time_per_iteration": 2.6882293224334717 }, { "auxiliary_loss_clip": 0.01348875, "auxiliary_loss_mlp": 0.01037213, "balance_loss_clip": 1.06459022, "balance_loss_mlp": 1.02760446, "epoch": 0.10316840016834004, "flos": 22782591544320.0, "grad_norm": 1.476033550260478, "language_loss": 0.83588958, "learning_rate": 3.9441838344102594e-06, "loss": 0.85975045, "num_input_tokens_seen": 18269230, "step": 858, "time_per_iteration": 2.7407398223876953 }, { "auxiliary_loss_clip": 0.01348059, "auxiliary_loss_mlp": 0.01049101, "balance_loss_clip": 1.0692687, "balance_loss_mlp": 1.03813958, "epoch": 0.10328864305897914, "flos": 20704584908160.0, "grad_norm": 2.570264125567191, "language_loss": 0.67277664, "learning_rate": 3.944000939960943e-06, "loss": 0.69674826, "num_input_tokens_seen": 18287955, "step": 859, "time_per_iteration": 2.7064414024353027 }, { "auxiliary_loss_clip": 0.01301122, "auxiliary_loss_mlp": 0.01048256, "balance_loss_clip": 1.06491089, "balance_loss_mlp": 1.03786051, "epoch": 0.10340888594961822, "flos": 28478814048000.0, "grad_norm": 2.1099928274487354, "language_loss": 0.80099273, "learning_rate": 3.943817750608229e-06, "loss": 0.82448649, "num_input_tokens_seen": 18310505, "step": 860, "time_per_iteration": 2.7620561122894287 }, { "auxiliary_loss_clip": 0.01305565, "auxiliary_loss_mlp": 0.01045252, "balance_loss_clip": 1.07082915, "balance_loss_mlp": 1.03547716, "epoch": 0.10352912884025732, "flos": 13370333460480.0, "grad_norm": 3.3914448296861703, "language_loss": 0.81796426, "learning_rate": 3.943634266379908e-06, "loss": 0.84147251, "num_input_tokens_seen": 18327400, "step": 861, "time_per_iteration": 2.6324148178100586 }, { "auxiliary_loss_clip": 0.01306078, "auxiliary_loss_mlp": 0.01034072, "balance_loss_clip": 1.06863582, "balance_loss_mlp": 1.02303934, "epoch": 0.10364937173089642, "flos": 25558558329600.0, "grad_norm": 1.8228171277431984, "language_loss": 0.84820378, "learning_rate": 3.943450487303815e-06, "loss": 0.87160528, "num_input_tokens_seen": 18347895, "step": 862, "time_per_iteration": 2.725874900817871 }, { "auxiliary_loss_clip": 0.01303008, "auxiliary_loss_mlp": 0.01040817, "balance_loss_clip": 1.06975269, "balance_loss_mlp": 1.03109539, "epoch": 0.1037696146215355, "flos": 21215486004480.0, "grad_norm": 1.9351552918347021, "language_loss": 0.85734904, "learning_rate": 3.943266413407827e-06, "loss": 0.88078725, "num_input_tokens_seen": 18367170, "step": 863, "time_per_iteration": 2.6590418815612793 }, { "auxiliary_loss_clip": 0.01306361, "auxiliary_loss_mlp": 0.01041022, "balance_loss_clip": 1.06937075, "balance_loss_mlp": 1.03106189, "epoch": 0.1038898575121746, "flos": 25807382818560.0, "grad_norm": 2.153519410291218, "language_loss": 0.85054392, "learning_rate": 3.94308204471987e-06, "loss": 0.87401772, "num_input_tokens_seen": 18386185, "step": 864, "time_per_iteration": 2.7443456649780273 }, { "auxiliary_loss_clip": 0.01382104, "auxiliary_loss_mlp": 0.01035388, "balance_loss_clip": 1.06143212, "balance_loss_mlp": 1.02535045, "epoch": 0.10401010040281368, "flos": 19062425900160.0, "grad_norm": 11.008755945457288, "language_loss": 0.74760067, "learning_rate": 3.942897381267912e-06, "loss": 0.7717756, "num_input_tokens_seen": 18402550, "step": 865, "time_per_iteration": 2.7408041954040527 }, { "auxiliary_loss_clip": 0.01303018, "auxiliary_loss_mlp": 0.01037687, "balance_loss_clip": 1.06917453, "balance_loss_mlp": 1.02650535, "epoch": 0.10413034329345278, "flos": 16355119962240.0, "grad_norm": 3.221429661219543, "language_loss": 0.6607008, "learning_rate": 3.942712423079965e-06, "loss": 0.68410784, "num_input_tokens_seen": 18418940, "step": 866, "time_per_iteration": 2.6933605670928955 }, { "auxiliary_loss_clip": 0.01387212, "auxiliary_loss_mlp": 0.01031468, "balance_loss_clip": 1.05687702, "balance_loss_mlp": 1.02167463, "epoch": 0.10425058618409186, "flos": 17236511890560.0, "grad_norm": 2.380158919463994, "language_loss": 0.90033174, "learning_rate": 3.942527170184088e-06, "loss": 0.92451859, "num_input_tokens_seen": 18435560, "step": 867, "time_per_iteration": 2.7243762016296387 }, { "auxiliary_loss_clip": 0.01254056, "auxiliary_loss_mlp": 0.01039697, "balance_loss_clip": 1.06963706, "balance_loss_mlp": 1.02982068, "epoch": 0.10437082907473096, "flos": 17967365919360.0, "grad_norm": 2.895594894395919, "language_loss": 0.77428818, "learning_rate": 3.942341622608385e-06, "loss": 0.79722571, "num_input_tokens_seen": 18452590, "step": 868, "time_per_iteration": 2.6456151008605957 }, { "auxiliary_loss_clip": 0.01353222, "auxiliary_loss_mlp": 0.01041592, "balance_loss_clip": 1.07053971, "balance_loss_mlp": 1.03128088, "epoch": 0.10449107196537005, "flos": 36283315374720.0, "grad_norm": 1.5070420982343802, "language_loss": 0.77549112, "learning_rate": 3.942155780381001e-06, "loss": 0.79943919, "num_input_tokens_seen": 18476325, "step": 869, "time_per_iteration": 2.876343011856079 }, { "auxiliary_loss_clip": 0.01356272, "auxiliary_loss_mlp": 0.0104173, "balance_loss_clip": 1.0665282, "balance_loss_mlp": 1.0313468, "epoch": 0.10461131485600914, "flos": 23802095266560.0, "grad_norm": 2.4674539349140363, "language_loss": 0.75990868, "learning_rate": 3.94196964353013e-06, "loss": 0.7838887, "num_input_tokens_seen": 18495775, "step": 870, "time_per_iteration": 2.6793954372406006 }, { "auxiliary_loss_clip": 0.01337256, "auxiliary_loss_mlp": 0.02581252, "balance_loss_clip": 1.0646832, "balance_loss_mlp": 1.00022149, "epoch": 0.10473155774664823, "flos": 18405476104320.0, "grad_norm": 1.9733478114754022, "language_loss": 0.80862761, "learning_rate": 3.941783212084008e-06, "loss": 0.84781265, "num_input_tokens_seen": 18513530, "step": 871, "time_per_iteration": 2.706925868988037 }, { "auxiliary_loss_clip": 0.01338429, "auxiliary_loss_mlp": 0.01042873, "balance_loss_clip": 1.06592727, "balance_loss_mlp": 1.03160191, "epoch": 0.10485180063728732, "flos": 25592637358080.0, "grad_norm": 2.5133753991572845, "language_loss": 0.7929424, "learning_rate": 3.941596486070916e-06, "loss": 0.81675535, "num_input_tokens_seen": 18531575, "step": 872, "time_per_iteration": 2.7938344478607178 }, { "auxiliary_loss_clip": 0.01441097, "auxiliary_loss_mlp": 0.01035388, "balance_loss_clip": 1.06493735, "balance_loss_mlp": 1.02515984, "epoch": 0.10497204352792641, "flos": 27088747666560.0, "grad_norm": 4.045320762522045, "language_loss": 0.58563644, "learning_rate": 3.941409465519182e-06, "loss": 0.61040133, "num_input_tokens_seen": 18552100, "step": 873, "time_per_iteration": 2.7772326469421387 }, { "auxiliary_loss_clip": 0.01298892, "auxiliary_loss_mlp": 0.01035669, "balance_loss_clip": 1.06715918, "balance_loss_mlp": 1.02555442, "epoch": 0.10509228641856551, "flos": 32858479353600.0, "grad_norm": 1.8877114873019658, "language_loss": 0.85442543, "learning_rate": 3.941222150457176e-06, "loss": 0.87777102, "num_input_tokens_seen": 18575355, "step": 874, "time_per_iteration": 3.7072579860687256 }, { "auxiliary_loss_clip": 0.01306494, "auxiliary_loss_mlp": 0.01043968, "balance_loss_clip": 1.06779838, "balance_loss_mlp": 1.03397274, "epoch": 0.10521252930920459, "flos": 14319165173760.0, "grad_norm": 2.5392838585738007, "language_loss": 0.71609318, "learning_rate": 3.941034540913311e-06, "loss": 0.7395978, "num_input_tokens_seen": 18592885, "step": 875, "time_per_iteration": 2.659313440322876 }, { "auxiliary_loss_clip": 0.01305398, "auxiliary_loss_mlp": 0.02584788, "balance_loss_clip": 1.07000494, "balance_loss_mlp": 1.00018096, "epoch": 0.10533277219984369, "flos": 21687028773120.0, "grad_norm": 1.756753641284867, "language_loss": 0.82390672, "learning_rate": 3.940846636916051e-06, "loss": 0.86280859, "num_input_tokens_seen": 18612920, "step": 876, "time_per_iteration": 2.7088160514831543 }, { "auxiliary_loss_clip": 0.01351354, "auxiliary_loss_mlp": 0.01038978, "balance_loss_clip": 1.07007122, "balance_loss_mlp": 1.02857065, "epoch": 0.10545301509048277, "flos": 22269787027200.0, "grad_norm": 2.2313118991040506, "language_loss": 0.8656652, "learning_rate": 3.940658438493899e-06, "loss": 0.88956857, "num_input_tokens_seen": 18630765, "step": 877, "time_per_iteration": 4.1792895793914795 }, { "auxiliary_loss_clip": 0.01256471, "auxiliary_loss_mlp": 0.01034991, "balance_loss_clip": 1.06894863, "balance_loss_mlp": 1.02447128, "epoch": 0.10557325798112187, "flos": 22199725549440.0, "grad_norm": 3.264153738361369, "language_loss": 0.76168668, "learning_rate": 3.940469945675405e-06, "loss": 0.78460133, "num_input_tokens_seen": 18649150, "step": 878, "time_per_iteration": 3.4910080432891846 }, { "auxiliary_loss_clip": 0.01430099, "auxiliary_loss_mlp": 0.01039579, "balance_loss_clip": 1.05929315, "balance_loss_mlp": 1.02998304, "epoch": 0.10569350087176095, "flos": 25775889569280.0, "grad_norm": 1.9744735553123471, "language_loss": 0.91745734, "learning_rate": 3.940281158489163e-06, "loss": 0.94215417, "num_input_tokens_seen": 18668380, "step": 879, "time_per_iteration": 2.8823490142822266 }, { "auxiliary_loss_clip": 0.01496624, "auxiliary_loss_mlp": 0.01034615, "balance_loss_clip": 1.05835819, "balance_loss_mlp": 1.02481604, "epoch": 0.10581374376240005, "flos": 17311385790720.0, "grad_norm": 1.9712096341276695, "language_loss": 0.82552469, "learning_rate": 3.940092076963812e-06, "loss": 0.85083711, "num_input_tokens_seen": 18685875, "step": 880, "time_per_iteration": 2.883492946624756 }, { "auxiliary_loss_clip": 0.0135262, "auxiliary_loss_mlp": 0.01034728, "balance_loss_clip": 1.06509137, "balance_loss_mlp": 1.02413607, "epoch": 0.10593398665303914, "flos": 34349454017280.0, "grad_norm": 2.223088259957407, "language_loss": 0.7897352, "learning_rate": 3.9399027011280355e-06, "loss": 0.81360865, "num_input_tokens_seen": 18707970, "step": 881, "time_per_iteration": 3.9809439182281494 }, { "auxiliary_loss_clip": 0.01350961, "auxiliary_loss_mlp": 0.0104135, "balance_loss_clip": 1.06838095, "balance_loss_mlp": 1.0306747, "epoch": 0.10605422954367823, "flos": 23257977068160.0, "grad_norm": 2.0186631158563375, "language_loss": 0.77477062, "learning_rate": 3.939713031010561e-06, "loss": 0.79869378, "num_input_tokens_seen": 18726335, "step": 882, "time_per_iteration": 2.7577338218688965 }, { "auxiliary_loss_clip": 0.01401742, "auxiliary_loss_mlp": 0.01040346, "balance_loss_clip": 1.06661057, "balance_loss_mlp": 1.02924144, "epoch": 0.10617447243431732, "flos": 22820118278400.0, "grad_norm": 2.5893077292787297, "language_loss": 0.78048599, "learning_rate": 3.939523066640163e-06, "loss": 0.80490685, "num_input_tokens_seen": 18745230, "step": 883, "time_per_iteration": 2.725170373916626 }, { "auxiliary_loss_clip": 0.01288289, "auxiliary_loss_mlp": 0.01038096, "balance_loss_clip": 1.06752968, "balance_loss_mlp": 1.02765334, "epoch": 0.10629471532495641, "flos": 24386577373440.0, "grad_norm": 2.06984676202614, "language_loss": 0.81154305, "learning_rate": 3.939332808045657e-06, "loss": 0.83480692, "num_input_tokens_seen": 18764880, "step": 884, "time_per_iteration": 2.726757764816284 }, { "auxiliary_loss_clip": 0.01399036, "auxiliary_loss_mlp": 0.01044102, "balance_loss_clip": 1.06401467, "balance_loss_mlp": 1.03298545, "epoch": 0.1064149582155955, "flos": 21105491581440.0, "grad_norm": 12.649308647013715, "language_loss": 0.84463727, "learning_rate": 3.939142255255906e-06, "loss": 0.86906862, "num_input_tokens_seen": 18785765, "step": 885, "time_per_iteration": 2.7322347164154053 }, { "auxiliary_loss_clip": 0.01299615, "auxiliary_loss_mlp": 0.0103791, "balance_loss_clip": 1.06775844, "balance_loss_mlp": 1.02842712, "epoch": 0.1065352011062346, "flos": 20702035042560.0, "grad_norm": 1.813703040013049, "language_loss": 0.86977959, "learning_rate": 3.938951408299817e-06, "loss": 0.89315486, "num_input_tokens_seen": 18804605, "step": 886, "time_per_iteration": 2.7396903038024902 }, { "auxiliary_loss_clip": 0.0137153, "auxiliary_loss_mlp": 0.01042594, "balance_loss_clip": 1.05712545, "balance_loss_mlp": 1.03968525, "epoch": 0.10665544399687368, "flos": 62659632689280.0, "grad_norm": 0.7983060358596532, "language_loss": 0.54427731, "learning_rate": 3.938760267206342e-06, "loss": 0.5684185, "num_input_tokens_seen": 18866425, "step": 887, "time_per_iteration": 3.38413667678833 }, { "auxiliary_loss_clip": 0.01252577, "auxiliary_loss_mlp": 0.01034485, "balance_loss_clip": 1.07047093, "balance_loss_mlp": 1.02364874, "epoch": 0.10677568688751278, "flos": 26140382830080.0, "grad_norm": 2.1781077724017046, "language_loss": 0.78759515, "learning_rate": 3.938568832004475e-06, "loss": 0.81046569, "num_input_tokens_seen": 18885130, "step": 888, "time_per_iteration": 2.794687509536743 }, { "auxiliary_loss_clip": 0.01339379, "auxiliary_loss_mlp": 0.01041576, "balance_loss_clip": 1.060691, "balance_loss_mlp": 1.0306505, "epoch": 0.10689592977815186, "flos": 12786533712000.0, "grad_norm": 2.042285640585899, "language_loss": 0.75300163, "learning_rate": 3.938377102723257e-06, "loss": 0.77681112, "num_input_tokens_seen": 18902265, "step": 889, "time_per_iteration": 2.735180377960205 }, { "auxiliary_loss_clip": 0.01436911, "auxiliary_loss_mlp": 0.01036884, "balance_loss_clip": 1.06015277, "balance_loss_mlp": 1.02579105, "epoch": 0.10701617266879096, "flos": 22126683242880.0, "grad_norm": 4.156283339285816, "language_loss": 0.83418858, "learning_rate": 3.938185079391774e-06, "loss": 0.85892648, "num_input_tokens_seen": 18919310, "step": 890, "time_per_iteration": 2.8454277515411377 }, { "auxiliary_loss_clip": 0.01249651, "auxiliary_loss_mlp": 0.01035494, "balance_loss_clip": 1.06715846, "balance_loss_mlp": 1.02519393, "epoch": 0.10713641555943004, "flos": 19745625559680.0, "grad_norm": 3.2083081356734438, "language_loss": 1.05870104, "learning_rate": 3.937992762039157e-06, "loss": 1.08155251, "num_input_tokens_seen": 18932635, "step": 891, "time_per_iteration": 0.005930185317993164 }, { "auxiliary_loss_clip": 0.01297979, "auxiliary_loss_mlp": 0.01043442, "balance_loss_clip": 1.06634617, "balance_loss_mlp": 1.03342199, "epoch": 0.10725665845006914, "flos": 23952992302080.0, "grad_norm": 1.7548662569013032, "language_loss": 0.80428374, "learning_rate": 3.937800150694577e-06, "loss": 0.82769799, "num_input_tokens_seen": 18953810, "step": 892, "time_per_iteration": 2.703380823135376 }, { "auxiliary_loss_clip": 0.01446861, "auxiliary_loss_mlp": 0.01033506, "balance_loss_clip": 1.06270194, "balance_loss_mlp": 1.02312875, "epoch": 0.10737690134070824, "flos": 18551704371840.0, "grad_norm": 2.4541229606539363, "language_loss": 0.76317835, "learning_rate": 3.937607245387255e-06, "loss": 0.78798205, "num_input_tokens_seen": 18973175, "step": 893, "time_per_iteration": 2.785367965698242 }, { "auxiliary_loss_clip": 0.01355567, "auxiliary_loss_mlp": 0.01043047, "balance_loss_clip": 1.06224954, "balance_loss_mlp": 1.03402328, "epoch": 0.10749714423134732, "flos": 22707609903360.0, "grad_norm": 2.02342478890887, "language_loss": 0.72013092, "learning_rate": 3.937414046146455e-06, "loss": 0.74411708, "num_input_tokens_seen": 18991130, "step": 894, "time_per_iteration": 2.687171459197998 }, { "auxiliary_loss_clip": 0.01257222, "auxiliary_loss_mlp": 0.01046536, "balance_loss_clip": 1.071931, "balance_loss_mlp": 1.03655815, "epoch": 0.10761738712198642, "flos": 21106066199040.0, "grad_norm": 2.5719204794241484, "language_loss": 0.75634623, "learning_rate": 3.9372205530014845e-06, "loss": 0.77938378, "num_input_tokens_seen": 19009610, "step": 895, "time_per_iteration": 2.7022111415863037 }, { "auxiliary_loss_clip": 0.01252011, "auxiliary_loss_mlp": 0.01040888, "balance_loss_clip": 1.06602263, "balance_loss_mlp": 1.02930081, "epoch": 0.1077376300126255, "flos": 23766723348480.0, "grad_norm": 4.61498356580739, "language_loss": 0.71150112, "learning_rate": 3.937026765981696e-06, "loss": 0.73443007, "num_input_tokens_seen": 19029680, "step": 896, "time_per_iteration": 2.6587748527526855 }, { "auxiliary_loss_clip": 0.01404176, "auxiliary_loss_mlp": 0.01054295, "balance_loss_clip": 1.06795788, "balance_loss_mlp": 1.04330361, "epoch": 0.1078578729032646, "flos": 20919581763840.0, "grad_norm": 1.9289846450607246, "language_loss": 0.79881126, "learning_rate": 3.936832685116488e-06, "loss": 0.82339597, "num_input_tokens_seen": 19047775, "step": 897, "time_per_iteration": 2.7055270671844482 }, { "auxiliary_loss_clip": 0.01253639, "auxiliary_loss_mlp": 0.0104183, "balance_loss_clip": 1.06968713, "balance_loss_mlp": 1.03195953, "epoch": 0.10797811579390369, "flos": 14829886702080.0, "grad_norm": 3.3024750994554832, "language_loss": 0.90090346, "learning_rate": 3.936638310435301e-06, "loss": 0.92385823, "num_input_tokens_seen": 19065640, "step": 898, "time_per_iteration": 2.61527681350708 }, { "auxiliary_loss_clip": 0.01306109, "auxiliary_loss_mlp": 0.01039513, "balance_loss_clip": 1.06809473, "balance_loss_mlp": 1.0296005, "epoch": 0.10809835868454278, "flos": 19536985411200.0, "grad_norm": 2.0540341080333335, "language_loss": 0.81639665, "learning_rate": 3.936443641967623e-06, "loss": 0.83985287, "num_input_tokens_seen": 19084470, "step": 899, "time_per_iteration": 2.733175754547119 }, { "auxiliary_loss_clip": 0.01354343, "auxiliary_loss_mlp": 0.01040532, "balance_loss_clip": 1.06639707, "balance_loss_mlp": 1.02992201, "epoch": 0.10821860157518187, "flos": 18442320480000.0, "grad_norm": 2.117914242693852, "language_loss": 0.83914173, "learning_rate": 3.936248679742983e-06, "loss": 0.86309052, "num_input_tokens_seen": 19102965, "step": 900, "time_per_iteration": 3.7367053031921387 }, { "auxiliary_loss_clip": 0.01308046, "auxiliary_loss_mlp": 0.01009966, "balance_loss_clip": 1.04276538, "balance_loss_mlp": 1.00724781, "epoch": 0.10833884446582095, "flos": 49359468447360.0, "grad_norm": 1.2811927196034485, "language_loss": 0.70163208, "learning_rate": 3.936053423790959e-06, "loss": 0.72481221, "num_input_tokens_seen": 19151285, "step": 901, "time_per_iteration": 3.0485939979553223 }, { "auxiliary_loss_clip": 0.01253927, "auxiliary_loss_mlp": 0.01042886, "balance_loss_clip": 1.07057929, "balance_loss_mlp": 1.03256285, "epoch": 0.10845908735646005, "flos": 20411912891520.0, "grad_norm": 7.687201216450959, "language_loss": 0.77794921, "learning_rate": 3.935857874141168e-06, "loss": 0.80091739, "num_input_tokens_seen": 19170120, "step": 902, "time_per_iteration": 2.7428719997406006 }, { "auxiliary_loss_clip": 0.01348146, "auxiliary_loss_mlp": 0.01044077, "balance_loss_clip": 1.06641579, "balance_loss_mlp": 1.0340575, "epoch": 0.10857933024709913, "flos": 14027750133120.0, "grad_norm": 2.678966799820402, "language_loss": 0.83846188, "learning_rate": 3.935662030823279e-06, "loss": 0.86238408, "num_input_tokens_seen": 19186305, "step": 903, "time_per_iteration": 4.71454119682312 }, { "auxiliary_loss_clip": 0.0130413, "auxiliary_loss_mlp": 0.01033813, "balance_loss_clip": 1.06641448, "balance_loss_mlp": 1.0237875, "epoch": 0.10869957313773823, "flos": 13369004657280.0, "grad_norm": 3.310856095184647, "language_loss": 0.72771907, "learning_rate": 3.935465893866998e-06, "loss": 0.75109857, "num_input_tokens_seen": 19204530, "step": 904, "time_per_iteration": 2.6407790184020996 }, { "auxiliary_loss_clip": 0.01349835, "auxiliary_loss_mlp": 0.01037543, "balance_loss_clip": 1.06442356, "balance_loss_mlp": 1.027035, "epoch": 0.10881981602837733, "flos": 25807095509760.0, "grad_norm": 4.04348022385036, "language_loss": 0.8005845, "learning_rate": 3.935269463302079e-06, "loss": 0.82445836, "num_input_tokens_seen": 19222735, "step": 905, "time_per_iteration": 2.873183012008667 }, { "auxiliary_loss_clip": 0.01308453, "auxiliary_loss_mlp": 0.01043504, "balance_loss_clip": 1.0701263, "balance_loss_mlp": 1.03274548, "epoch": 0.10894005891901641, "flos": 20777555387520.0, "grad_norm": 2.996045955200057, "language_loss": 0.76714158, "learning_rate": 3.935072739158322e-06, "loss": 0.7906611, "num_input_tokens_seen": 19242445, "step": 906, "time_per_iteration": 2.6403756141662598 }, { "auxiliary_loss_clip": 0.01352057, "auxiliary_loss_mlp": 0.01037713, "balance_loss_clip": 1.06362033, "balance_loss_mlp": 1.02671564, "epoch": 0.10906030180965551, "flos": 26649883296000.0, "grad_norm": 1.71955212295564, "language_loss": 0.8003453, "learning_rate": 3.934875721465569e-06, "loss": 0.82424307, "num_input_tokens_seen": 19262865, "step": 907, "time_per_iteration": 3.7000839710235596 }, { "auxiliary_loss_clip": 0.01349796, "auxiliary_loss_mlp": 0.01037671, "balance_loss_clip": 1.06330407, "balance_loss_mlp": 1.02692461, "epoch": 0.10918054470029459, "flos": 36534402420480.0, "grad_norm": 4.759305846650769, "language_loss": 0.72060323, "learning_rate": 3.9346784102537076e-06, "loss": 0.74447793, "num_input_tokens_seen": 19285000, "step": 908, "time_per_iteration": 2.8910715579986572 }, { "auxiliary_loss_clip": 0.01252803, "auxiliary_loss_mlp": 0.01039202, "balance_loss_clip": 1.06769538, "balance_loss_mlp": 1.02926588, "epoch": 0.10930078759093369, "flos": 21762549118080.0, "grad_norm": 2.65967196188039, "language_loss": 0.78405321, "learning_rate": 3.934480805552669e-06, "loss": 0.80697328, "num_input_tokens_seen": 19306010, "step": 909, "time_per_iteration": 2.6137969493865967 }, { "auxiliary_loss_clip": 0.01254546, "auxiliary_loss_mlp": 0.02587315, "balance_loss_clip": 1.07030749, "balance_loss_mlp": 1.00029302, "epoch": 0.10942103048157277, "flos": 22601781457920.0, "grad_norm": 2.0403150471959886, "language_loss": 0.88165593, "learning_rate": 3.93428290739243e-06, "loss": 0.92007458, "num_input_tokens_seen": 19325380, "step": 910, "time_per_iteration": 2.737029552459717 }, { "auxiliary_loss_clip": 0.01354058, "auxiliary_loss_mlp": 0.01044498, "balance_loss_clip": 1.06546724, "balance_loss_mlp": 1.0332799, "epoch": 0.10954127337221187, "flos": 15045781397760.0, "grad_norm": 2.408002519192761, "language_loss": 0.79609638, "learning_rate": 3.9340847158030125e-06, "loss": 0.82008195, "num_input_tokens_seen": 19338960, "step": 911, "time_per_iteration": 2.6169111728668213 }, { "auxiliary_loss_clip": 0.01304539, "auxiliary_loss_mlp": 0.01034871, "balance_loss_clip": 1.06643391, "balance_loss_mlp": 1.02443993, "epoch": 0.10966151626285096, "flos": 21650974496640.0, "grad_norm": 1.7248700382725897, "language_loss": 0.75394684, "learning_rate": 3.9338862308144814e-06, "loss": 0.77734089, "num_input_tokens_seen": 19357780, "step": 912, "time_per_iteration": 2.6591861248016357 }, { "auxiliary_loss_clip": 0.01250936, "auxiliary_loss_mlp": 0.01035504, "balance_loss_clip": 1.06947553, "balance_loss_mlp": 1.02498984, "epoch": 0.10978175915349005, "flos": 20121359777280.0, "grad_norm": 1.6797692910536006, "language_loss": 0.84625626, "learning_rate": 3.933687452456946e-06, "loss": 0.8691206, "num_input_tokens_seen": 19377680, "step": 913, "time_per_iteration": 2.5765480995178223 }, { "auxiliary_loss_clip": 0.01403028, "auxiliary_loss_mlp": 0.01039924, "balance_loss_clip": 1.06268191, "balance_loss_mlp": 1.02856314, "epoch": 0.10990200204412914, "flos": 20412667077120.0, "grad_norm": 2.6919168172385097, "language_loss": 0.86240536, "learning_rate": 3.933488380760562e-06, "loss": 0.88683492, "num_input_tokens_seen": 19397040, "step": 914, "time_per_iteration": 2.7172205448150635 }, { "auxiliary_loss_clip": 0.01252563, "auxiliary_loss_mlp": 0.02588384, "balance_loss_clip": 1.06820977, "balance_loss_mlp": 1.00037944, "epoch": 0.11002224493476823, "flos": 17530117660800.0, "grad_norm": 2.1949284673590483, "language_loss": 0.87353969, "learning_rate": 3.9332890157555286e-06, "loss": 0.9119491, "num_input_tokens_seen": 19413975, "step": 915, "time_per_iteration": 2.6271986961364746 }, { "auxiliary_loss_clip": 0.01357455, "auxiliary_loss_mlp": 0.01043983, "balance_loss_clip": 1.06599438, "balance_loss_mlp": 1.03277731, "epoch": 0.11014248782540732, "flos": 12203093099520.0, "grad_norm": 1.9644286166609504, "language_loss": 0.76540124, "learning_rate": 3.933089357472088e-06, "loss": 0.78941566, "num_input_tokens_seen": 19432005, "step": 916, "time_per_iteration": 2.679608106613159 }, { "auxiliary_loss_clip": 0.01258447, "auxiliary_loss_mlp": 0.01043883, "balance_loss_clip": 1.07374263, "balance_loss_mlp": 1.03411984, "epoch": 0.11026273071604642, "flos": 22382977760640.0, "grad_norm": 2.9629813802026717, "language_loss": 0.85751271, "learning_rate": 3.932889405940529e-06, "loss": 0.88053596, "num_input_tokens_seen": 19450100, "step": 917, "time_per_iteration": 2.6873793601989746 }, { "auxiliary_loss_clip": 0.01354103, "auxiliary_loss_mlp": 0.0104218, "balance_loss_clip": 1.06997216, "balance_loss_mlp": 1.03103983, "epoch": 0.1103829736066855, "flos": 19829046896640.0, "grad_norm": 2.159129516461089, "language_loss": 0.80326855, "learning_rate": 3.932689161191184e-06, "loss": 0.82723135, "num_input_tokens_seen": 19467805, "step": 918, "time_per_iteration": 2.709218978881836 }, { "auxiliary_loss_clip": 0.01302088, "auxiliary_loss_mlp": 0.01039708, "balance_loss_clip": 1.06604373, "balance_loss_mlp": 1.02837729, "epoch": 0.1105032164973246, "flos": 22669616292480.0, "grad_norm": 2.3362183205206644, "language_loss": 0.87991738, "learning_rate": 3.93248862325443e-06, "loss": 0.90333539, "num_input_tokens_seen": 19486710, "step": 919, "time_per_iteration": 2.742094039916992 }, { "auxiliary_loss_clip": 0.01215488, "auxiliary_loss_mlp": 0.010044, "balance_loss_clip": 1.04336655, "balance_loss_mlp": 1.00147939, "epoch": 0.11062345938796368, "flos": 66483507876480.0, "grad_norm": 0.9358174680393039, "language_loss": 0.64536786, "learning_rate": 3.932287792160688e-06, "loss": 0.66756666, "num_input_tokens_seen": 19545170, "step": 920, "time_per_iteration": 3.1785061359405518 }, { "auxiliary_loss_clip": 0.01299929, "auxiliary_loss_mlp": 0.01038685, "balance_loss_clip": 1.06241608, "balance_loss_mlp": 1.02800417, "epoch": 0.11074370227860278, "flos": 21907771804800.0, "grad_norm": 3.3532758440619856, "language_loss": 0.80729139, "learning_rate": 3.932086667940424e-06, "loss": 0.83067751, "num_input_tokens_seen": 19561875, "step": 921, "time_per_iteration": 2.675508499145508 }, { "auxiliary_loss_clip": 0.01298808, "auxiliary_loss_mlp": 0.02586591, "balance_loss_clip": 1.06883168, "balance_loss_mlp": 1.00032222, "epoch": 0.11086394516924186, "flos": 28658115763200.0, "grad_norm": 1.9328651123693088, "language_loss": 0.81754071, "learning_rate": 3.93188525062415e-06, "loss": 0.85639471, "num_input_tokens_seen": 19582340, "step": 922, "time_per_iteration": 2.7350330352783203 }, { "auxiliary_loss_clip": 0.01299129, "auxiliary_loss_mlp": 0.01037398, "balance_loss_clip": 1.06542182, "balance_loss_mlp": 1.0263536, "epoch": 0.11098418805988096, "flos": 24535247765760.0, "grad_norm": 3.4743555845746177, "language_loss": 0.8586489, "learning_rate": 3.931683540242418e-06, "loss": 0.88201416, "num_input_tokens_seen": 19603405, "step": 923, "time_per_iteration": 2.680359363555908 }, { "auxiliary_loss_clip": 0.01297452, "auxiliary_loss_mlp": 0.01036827, "balance_loss_clip": 1.06566823, "balance_loss_mlp": 1.02632523, "epoch": 0.11110443095052006, "flos": 22960384888320.0, "grad_norm": 2.4994308453705467, "language_loss": 0.90992671, "learning_rate": 3.9314815368258295e-06, "loss": 0.9332695, "num_input_tokens_seen": 19619885, "step": 924, "time_per_iteration": 2.6304702758789062 }, { "auxiliary_loss_clip": 0.01305023, "auxiliary_loss_mlp": 0.01045593, "balance_loss_clip": 1.06990492, "balance_loss_mlp": 1.03580034, "epoch": 0.11122467384115914, "flos": 18950025265920.0, "grad_norm": 1.8093520709172253, "language_loss": 0.78959405, "learning_rate": 3.9312792404050275e-06, "loss": 0.81310022, "num_input_tokens_seen": 19637940, "step": 925, "time_per_iteration": 2.6697819232940674 }, { "auxiliary_loss_clip": 0.0125165, "auxiliary_loss_mlp": 0.01031898, "balance_loss_clip": 1.07032323, "balance_loss_mlp": 1.02244508, "epoch": 0.11134491673179824, "flos": 25082957324160.0, "grad_norm": 2.2350297273811406, "language_loss": 0.77398568, "learning_rate": 3.9310766510107e-06, "loss": 0.79682112, "num_input_tokens_seen": 19657115, "step": 926, "time_per_iteration": 3.56718373298645 }, { "auxiliary_loss_clip": 0.01397904, "auxiliary_loss_mlp": 0.01038653, "balance_loss_clip": 1.06008863, "balance_loss_mlp": 1.02721477, "epoch": 0.11146515962243732, "flos": 24499121662080.0, "grad_norm": 1.9892741705515435, "language_loss": 0.91848159, "learning_rate": 3.9308737686735806e-06, "loss": 0.94284719, "num_input_tokens_seen": 19677075, "step": 927, "time_per_iteration": 2.754479169845581 }, { "auxiliary_loss_clip": 0.01254744, "auxiliary_loss_mlp": 0.01037818, "balance_loss_clip": 1.07011282, "balance_loss_mlp": 1.02683854, "epoch": 0.11158540251307641, "flos": 22343763087360.0, "grad_norm": 2.8640778254264827, "language_loss": 0.82948834, "learning_rate": 3.9306705934244455e-06, "loss": 0.85241389, "num_input_tokens_seen": 19697155, "step": 928, "time_per_iteration": 2.616848945617676 }, { "auxiliary_loss_clip": 0.01338124, "auxiliary_loss_mlp": 0.01039975, "balance_loss_clip": 1.06263351, "balance_loss_mlp": 1.03052139, "epoch": 0.11170564540371551, "flos": 19902304684800.0, "grad_norm": 2.8047353156369828, "language_loss": 0.88405967, "learning_rate": 3.930467125294116e-06, "loss": 0.90784073, "num_input_tokens_seen": 19716705, "step": 929, "time_per_iteration": 3.7702300548553467 }, { "auxiliary_loss_clip": 0.01406965, "auxiliary_loss_mlp": 0.01007914, "balance_loss_clip": 1.03575706, "balance_loss_mlp": 1.00538635, "epoch": 0.1118258882943546, "flos": 64586239499520.0, "grad_norm": 0.9266534493837569, "language_loss": 0.60532421, "learning_rate": 3.930263364313458e-06, "loss": 0.62947297, "num_input_tokens_seen": 19767275, "step": 930, "time_per_iteration": 4.147348880767822 }, { "auxiliary_loss_clip": 0.01381977, "auxiliary_loss_mlp": 0.01049578, "balance_loss_clip": 1.06200063, "balance_loss_mlp": 1.03805602, "epoch": 0.11194613118499369, "flos": 17201965985280.0, "grad_norm": 1.9138689549311494, "language_loss": 0.82737923, "learning_rate": 3.930059310513384e-06, "loss": 0.85169482, "num_input_tokens_seen": 19786315, "step": 931, "time_per_iteration": 2.7907538414001465 }, { "auxiliary_loss_clip": 0.0138774, "auxiliary_loss_mlp": 0.02585835, "balance_loss_clip": 1.0601269, "balance_loss_mlp": 1.00035965, "epoch": 0.11206637407563277, "flos": 31863465728640.0, "grad_norm": 1.8509555225997754, "language_loss": 0.84270066, "learning_rate": 3.929854963924846e-06, "loss": 0.88243639, "num_input_tokens_seen": 19806580, "step": 932, "time_per_iteration": 2.8362209796905518 }, { "auxiliary_loss_clip": 0.01382428, "auxiliary_loss_mlp": 0.01044081, "balance_loss_clip": 1.06136036, "balance_loss_mlp": 1.0343833, "epoch": 0.11218661696627187, "flos": 21945621761280.0, "grad_norm": 2.1952153759155975, "language_loss": 0.77816975, "learning_rate": 3.929650324578845e-06, "loss": 0.80243486, "num_input_tokens_seen": 19826045, "step": 933, "time_per_iteration": 3.8158721923828125 }, { "auxiliary_loss_clip": 0.01354869, "auxiliary_loss_mlp": 0.01037489, "balance_loss_clip": 1.06653619, "balance_loss_mlp": 1.02600348, "epoch": 0.11230685985691095, "flos": 25878198481920.0, "grad_norm": 2.336349293100345, "language_loss": 0.82603884, "learning_rate": 3.929445392506423e-06, "loss": 0.84996247, "num_input_tokens_seen": 19843985, "step": 934, "time_per_iteration": 2.7915406227111816 }, { "auxiliary_loss_clip": 0.01300007, "auxiliary_loss_mlp": 0.01040095, "balance_loss_clip": 1.06908929, "balance_loss_mlp": 1.02998042, "epoch": 0.11242710274755005, "flos": 22231506107520.0, "grad_norm": 4.115367789100844, "language_loss": 0.76038957, "learning_rate": 3.92924016773867e-06, "loss": 0.78379059, "num_input_tokens_seen": 19860480, "step": 935, "time_per_iteration": 2.631925344467163 }, { "auxiliary_loss_clip": 0.01343505, "auxiliary_loss_mlp": 0.02581806, "balance_loss_clip": 1.06093228, "balance_loss_mlp": 1.00039411, "epoch": 0.11254734563818915, "flos": 17712184723200.0, "grad_norm": 2.4182269981929774, "language_loss": 0.73939502, "learning_rate": 3.9290346503067175e-06, "loss": 0.77864814, "num_input_tokens_seen": 19877145, "step": 936, "time_per_iteration": 2.6903791427612305 }, { "auxiliary_loss_clip": 0.01301275, "auxiliary_loss_mlp": 0.01055257, "balance_loss_clip": 1.06337106, "balance_loss_mlp": 1.04448092, "epoch": 0.11266758852882823, "flos": 54930397334400.0, "grad_norm": 2.7735922374672946, "language_loss": 0.78785551, "learning_rate": 3.9288288402417415e-06, "loss": 0.8114208, "num_input_tokens_seen": 19903405, "step": 937, "time_per_iteration": 2.935434341430664 }, { "auxiliary_loss_clip": 0.01297928, "auxiliary_loss_mlp": 0.01041564, "balance_loss_clip": 1.06534338, "balance_loss_mlp": 1.03049505, "epoch": 0.11278783141946733, "flos": 18878132194560.0, "grad_norm": 2.7759500971653805, "language_loss": 0.70588887, "learning_rate": 3.928622737574964e-06, "loss": 0.72928381, "num_input_tokens_seen": 19918740, "step": 938, "time_per_iteration": 2.66365122795105 }, { "auxiliary_loss_clip": 0.01349009, "auxiliary_loss_mlp": 0.01038214, "balance_loss_clip": 1.06198382, "balance_loss_mlp": 1.02767634, "epoch": 0.11290807431010641, "flos": 26469252777600.0, "grad_norm": 2.0556386216432414, "language_loss": 0.91194969, "learning_rate": 3.928416342337652e-06, "loss": 0.93582189, "num_input_tokens_seen": 19938475, "step": 939, "time_per_iteration": 2.772516965866089 }, { "auxiliary_loss_clip": 0.01326717, "auxiliary_loss_mlp": 0.01036958, "balance_loss_clip": 1.06284881, "balance_loss_mlp": 1.02695084, "epoch": 0.1130283172007455, "flos": 22710590732160.0, "grad_norm": 1.98440891036142, "language_loss": 0.82979137, "learning_rate": 3.928209654561113e-06, "loss": 0.85342813, "num_input_tokens_seen": 19959310, "step": 940, "time_per_iteration": 2.7124621868133545 }, { "auxiliary_loss_clip": 0.01342596, "auxiliary_loss_mlp": 0.01039945, "balance_loss_clip": 1.06473947, "balance_loss_mlp": 1.0308013, "epoch": 0.1131485600913846, "flos": 23219911630080.0, "grad_norm": 3.360713355129871, "language_loss": 0.81731641, "learning_rate": 3.928002674276703e-06, "loss": 0.84114182, "num_input_tokens_seen": 19978700, "step": 941, "time_per_iteration": 2.716331958770752 }, { "auxiliary_loss_clip": 0.01426759, "auxiliary_loss_mlp": 0.01034156, "balance_loss_clip": 1.05110753, "balance_loss_mlp": 1.02463746, "epoch": 0.11326880298202369, "flos": 14064271286400.0, "grad_norm": 2.743292811870845, "language_loss": 0.76105726, "learning_rate": 3.92779540151582e-06, "loss": 0.78566641, "num_input_tokens_seen": 19995785, "step": 942, "time_per_iteration": 2.704352378845215 }, { "auxiliary_loss_clip": 0.01345558, "auxiliary_loss_mlp": 0.0104216, "balance_loss_clip": 1.06354511, "balance_loss_mlp": 1.0319972, "epoch": 0.11338904587266278, "flos": 16325386479360.0, "grad_norm": 1.7849195130141053, "language_loss": 0.85699081, "learning_rate": 3.927587836309907e-06, "loss": 0.88086796, "num_input_tokens_seen": 20013615, "step": 943, "time_per_iteration": 2.6844851970672607 }, { "auxiliary_loss_clip": 0.0133807, "auxiliary_loss_mlp": 0.01036427, "balance_loss_clip": 1.05982256, "balance_loss_mlp": 1.02607322, "epoch": 0.11350928876330187, "flos": 24426258923520.0, "grad_norm": 2.065523851801602, "language_loss": 0.78108442, "learning_rate": 3.927379978690452e-06, "loss": 0.80482936, "num_input_tokens_seen": 20032880, "step": 944, "time_per_iteration": 2.7149362564086914 }, { "auxiliary_loss_clip": 0.01387649, "auxiliary_loss_mlp": 0.01033203, "balance_loss_clip": 1.054883, "balance_loss_mlp": 1.02371383, "epoch": 0.11362953165394096, "flos": 24497074586880.0, "grad_norm": 3.35780923108966, "language_loss": 0.87341297, "learning_rate": 3.927171828688987e-06, "loss": 0.89762151, "num_input_tokens_seen": 20052405, "step": 945, "time_per_iteration": 2.792544364929199 }, { "auxiliary_loss_clip": 0.0124917, "auxiliary_loss_mlp": 0.01037912, "balance_loss_clip": 1.06823444, "balance_loss_mlp": 1.02778506, "epoch": 0.11374977454458005, "flos": 24060831909120.0, "grad_norm": 2.015304643851002, "language_loss": 0.82252961, "learning_rate": 3.926963386337088e-06, "loss": 0.84540039, "num_input_tokens_seen": 20070635, "step": 946, "time_per_iteration": 2.6608996391296387 }, { "auxiliary_loss_clip": 0.01249864, "auxiliary_loss_mlp": 0.01038116, "balance_loss_clip": 1.06800818, "balance_loss_mlp": 1.02733946, "epoch": 0.11387001743521914, "flos": 39457638967680.0, "grad_norm": 2.718466988154677, "language_loss": 0.7013818, "learning_rate": 3.926754651666375e-06, "loss": 0.72426164, "num_input_tokens_seen": 20091195, "step": 947, "time_per_iteration": 2.773157835006714 }, { "auxiliary_loss_clip": 0.01375241, "auxiliary_loss_mlp": 0.01039547, "balance_loss_clip": 1.06284499, "balance_loss_mlp": 1.02912164, "epoch": 0.11399026032585824, "flos": 25082454533760.0, "grad_norm": 2.443921416376691, "language_loss": 0.78027958, "learning_rate": 3.926545624708513e-06, "loss": 0.80442744, "num_input_tokens_seen": 20110435, "step": 948, "time_per_iteration": 2.741490125656128 }, { "auxiliary_loss_clip": 0.01393597, "auxiliary_loss_mlp": 0.01034675, "balance_loss_clip": 1.05948734, "balance_loss_mlp": 1.02458978, "epoch": 0.11411050321649732, "flos": 17961835224960.0, "grad_norm": 2.7358797762620553, "language_loss": 0.85522538, "learning_rate": 3.926336305495213e-06, "loss": 0.87950814, "num_input_tokens_seen": 20128995, "step": 949, "time_per_iteration": 2.7508292198181152 }, { "auxiliary_loss_clip": 0.01391163, "auxiliary_loss_mlp": 0.01039156, "balance_loss_clip": 1.06106138, "balance_loss_mlp": 1.0291543, "epoch": 0.11423074610713642, "flos": 22455409536000.0, "grad_norm": 2.3064357631925274, "language_loss": 0.89140981, "learning_rate": 3.926126694058226e-06, "loss": 0.91571301, "num_input_tokens_seen": 20148145, "step": 950, "time_per_iteration": 2.8023548126220703 }, { "auxiliary_loss_clip": 0.01440016, "auxiliary_loss_mlp": 0.01034602, "balance_loss_clip": 1.06248295, "balance_loss_mlp": 1.02488673, "epoch": 0.1143509889977755, "flos": 19717687756800.0, "grad_norm": 1.5640929664457042, "language_loss": 0.82092214, "learning_rate": 3.92591679042935e-06, "loss": 0.84566832, "num_input_tokens_seen": 20168035, "step": 951, "time_per_iteration": 2.8165667057037354 }, { "auxiliary_loss_clip": 0.01298285, "auxiliary_loss_mlp": 0.01038142, "balance_loss_clip": 1.06765199, "balance_loss_mlp": 1.02779436, "epoch": 0.1144712318884146, "flos": 19822869757440.0, "grad_norm": 2.63934890929552, "language_loss": 0.82050836, "learning_rate": 3.92570659464043e-06, "loss": 0.84387267, "num_input_tokens_seen": 20186095, "step": 952, "time_per_iteration": 3.5639734268188477 }, { "auxiliary_loss_clip": 0.01295415, "auxiliary_loss_mlp": 0.02583641, "balance_loss_clip": 1.06744957, "balance_loss_mlp": 1.00030112, "epoch": 0.1145914747790537, "flos": 14939198766720.0, "grad_norm": 1.9574977883334987, "language_loss": 0.8002255, "learning_rate": 3.925496106723349e-06, "loss": 0.83901608, "num_input_tokens_seen": 20203535, "step": 953, "time_per_iteration": 2.6000442504882812 }, { "auxiliary_loss_clip": 0.01299873, "auxiliary_loss_mlp": 0.010404, "balance_loss_clip": 1.06606269, "balance_loss_mlp": 1.030249, "epoch": 0.11471171766969278, "flos": 19865029345920.0, "grad_norm": 4.152941620204811, "language_loss": 0.83817637, "learning_rate": 3.9252853267100405e-06, "loss": 0.86157912, "num_input_tokens_seen": 20222780, "step": 954, "time_per_iteration": 2.695502281188965 }, { "auxiliary_loss_clip": 0.01387751, "auxiliary_loss_mlp": 0.0103965, "balance_loss_clip": 1.06066716, "balance_loss_mlp": 1.03048277, "epoch": 0.11483196056033187, "flos": 22526476594560.0, "grad_norm": 4.298973238795976, "language_loss": 0.8370533, "learning_rate": 3.9250742546324786e-06, "loss": 0.86132735, "num_input_tokens_seen": 20243015, "step": 955, "time_per_iteration": 3.6716456413269043 }, { "auxiliary_loss_clip": 0.01344091, "auxiliary_loss_mlp": 0.01046641, "balance_loss_clip": 1.06067109, "balance_loss_mlp": 1.03683639, "epoch": 0.11495220345097096, "flos": 28220292887040.0, "grad_norm": 1.8357602498808363, "language_loss": 0.86737049, "learning_rate": 3.924862890522683e-06, "loss": 0.89127779, "num_input_tokens_seen": 20263025, "step": 956, "time_per_iteration": 3.6321189403533936 }, { "auxiliary_loss_clip": 0.01299107, "auxiliary_loss_mlp": 0.01044323, "balance_loss_clip": 1.06559253, "balance_loss_mlp": 1.03359985, "epoch": 0.11507244634161005, "flos": 17492267704320.0, "grad_norm": 2.112150001004441, "language_loss": 0.86056089, "learning_rate": 3.9246512344127174e-06, "loss": 0.88399523, "num_input_tokens_seen": 20280685, "step": 957, "time_per_iteration": 2.6986746788024902 }, { "auxiliary_loss_clip": 0.01475284, "auxiliary_loss_mlp": 0.01046149, "balance_loss_clip": 1.05355847, "balance_loss_mlp": 1.0356344, "epoch": 0.11519268923224914, "flos": 22564937082240.0, "grad_norm": 2.1456747362369946, "language_loss": 0.81974339, "learning_rate": 3.9244392863346895e-06, "loss": 0.84495771, "num_input_tokens_seen": 20300090, "step": 958, "time_per_iteration": 2.7901992797851562 }, { "auxiliary_loss_clip": 0.01351107, "auxiliary_loss_mlp": 0.0104524, "balance_loss_clip": 1.06710267, "balance_loss_mlp": 1.03456521, "epoch": 0.11531293212288823, "flos": 16982839065600.0, "grad_norm": 1.9924834759980157, "language_loss": 0.92308819, "learning_rate": 3.9242270463207524e-06, "loss": 0.94705164, "num_input_tokens_seen": 20318480, "step": 959, "time_per_iteration": 3.6206157207489014 }, { "auxiliary_loss_clip": 0.01430773, "auxiliary_loss_mlp": 0.01040875, "balance_loss_clip": 1.05670798, "balance_loss_mlp": 1.03087962, "epoch": 0.11543317501352733, "flos": 12422004537600.0, "grad_norm": 2.6525342704501482, "language_loss": 0.852036, "learning_rate": 3.924014514403102e-06, "loss": 0.8767525, "num_input_tokens_seen": 20334635, "step": 960, "time_per_iteration": 2.78056001663208 }, { "auxiliary_loss_clip": 0.01435381, "auxiliary_loss_mlp": 0.01037067, "balance_loss_clip": 1.05815315, "balance_loss_mlp": 1.02589762, "epoch": 0.11555341790416641, "flos": 19821648695040.0, "grad_norm": 2.333066277874281, "language_loss": 0.91453505, "learning_rate": 3.92380169061398e-06, "loss": 0.93925947, "num_input_tokens_seen": 20352415, "step": 961, "time_per_iteration": 2.7320709228515625 }, { "auxiliary_loss_clip": 0.01385034, "auxiliary_loss_mlp": 0.02582366, "balance_loss_clip": 1.05436361, "balance_loss_mlp": 1.00032258, "epoch": 0.11567366079480551, "flos": 25738865625600.0, "grad_norm": 1.9749347459797657, "language_loss": 0.84159851, "learning_rate": 3.9235885749856705e-06, "loss": 0.88127249, "num_input_tokens_seen": 20371095, "step": 962, "time_per_iteration": 2.8205618858337402 }, { "auxiliary_loss_clip": 0.01341162, "auxiliary_loss_mlp": 0.01041957, "balance_loss_clip": 1.06419349, "balance_loss_mlp": 1.03163397, "epoch": 0.1157939036854446, "flos": 18223301301120.0, "grad_norm": 1.9472174103942141, "language_loss": 0.82664955, "learning_rate": 3.9233751675505035e-06, "loss": 0.85048079, "num_input_tokens_seen": 20389805, "step": 963, "time_per_iteration": 2.6750898361206055 }, { "auxiliary_loss_clip": 0.01351986, "auxiliary_loss_mlp": 0.0103712, "balance_loss_clip": 1.07004118, "balance_loss_mlp": 1.02702928, "epoch": 0.11591414657608369, "flos": 23073755189760.0, "grad_norm": 3.6312600889808446, "language_loss": 0.85303044, "learning_rate": 3.923161468340853e-06, "loss": 0.87692147, "num_input_tokens_seen": 20409640, "step": 964, "time_per_iteration": 2.6828417778015137 }, { "auxiliary_loss_clip": 0.01436744, "auxiliary_loss_mlp": 0.01042089, "balance_loss_clip": 1.05400097, "balance_loss_mlp": 1.03075266, "epoch": 0.11603438946672277, "flos": 19461716461440.0, "grad_norm": 1.7240451646107409, "language_loss": 0.81613648, "learning_rate": 3.9229474773891374e-06, "loss": 0.84092486, "num_input_tokens_seen": 20428180, "step": 965, "time_per_iteration": 2.766862154006958 }, { "auxiliary_loss_clip": 0.01400963, "auxiliary_loss_mlp": 0.01035064, "balance_loss_clip": 1.05882752, "balance_loss_mlp": 1.02462173, "epoch": 0.11615463235736187, "flos": 26831986272000.0, "grad_norm": 1.976973075956202, "language_loss": 0.83586073, "learning_rate": 3.922733194727818e-06, "loss": 0.86022103, "num_input_tokens_seen": 20447975, "step": 966, "time_per_iteration": 2.7730624675750732 }, { "auxiliary_loss_clip": 0.01299521, "auxiliary_loss_mlp": 0.01037239, "balance_loss_clip": 1.06662512, "balance_loss_mlp": 1.02696276, "epoch": 0.11627487524800097, "flos": 18580324533120.0, "grad_norm": 2.6808299479858557, "language_loss": 0.87582767, "learning_rate": 3.922518620389402e-06, "loss": 0.89919531, "num_input_tokens_seen": 20464840, "step": 967, "time_per_iteration": 2.6121182441711426 }, { "auxiliary_loss_clip": 0.01522015, "auxiliary_loss_mlp": 0.01045597, "balance_loss_clip": 1.05368376, "balance_loss_mlp": 1.03622687, "epoch": 0.11639511813864005, "flos": 18150474476160.0, "grad_norm": 1.7355461590200734, "language_loss": 0.89496952, "learning_rate": 3.922303754406439e-06, "loss": 0.92064565, "num_input_tokens_seen": 20482680, "step": 968, "time_per_iteration": 3.053508758544922 }, { "auxiliary_loss_clip": 0.01379253, "auxiliary_loss_mlp": 0.01036488, "balance_loss_clip": 1.05303955, "balance_loss_mlp": 1.02584815, "epoch": 0.11651536102927915, "flos": 20922023888640.0, "grad_norm": 1.8721983851070452, "language_loss": 0.79133415, "learning_rate": 3.922088596811526e-06, "loss": 0.81549156, "num_input_tokens_seen": 20501810, "step": 969, "time_per_iteration": 2.9730894565582275 }, { "auxiliary_loss_clip": 0.01289138, "auxiliary_loss_mlp": 0.01040348, "balance_loss_clip": 1.06461978, "balance_loss_mlp": 1.03024507, "epoch": 0.11663560391991823, "flos": 16508602776960.0, "grad_norm": 3.410259027726554, "language_loss": 0.8675102, "learning_rate": 3.9218731476373e-06, "loss": 0.89080501, "num_input_tokens_seen": 20517995, "step": 970, "time_per_iteration": 2.6441404819488525 }, { "auxiliary_loss_clip": 0.01303018, "auxiliary_loss_mlp": 0.01043084, "balance_loss_clip": 1.06910992, "balance_loss_mlp": 1.0314312, "epoch": 0.11675584681055733, "flos": 19865029345920.0, "grad_norm": 1.9213309372151195, "language_loss": 0.84530175, "learning_rate": 3.9216574069164455e-06, "loss": 0.86876273, "num_input_tokens_seen": 20536970, "step": 971, "time_per_iteration": 2.7098774909973145 }, { "auxiliary_loss_clip": 0.01245158, "auxiliary_loss_mlp": 0.01037939, "balance_loss_clip": 1.06823993, "balance_loss_mlp": 1.02820015, "epoch": 0.11687608970119642, "flos": 21944364785280.0, "grad_norm": 1.7724453994663179, "language_loss": 0.79902267, "learning_rate": 3.921441374681691e-06, "loss": 0.82185364, "num_input_tokens_seen": 20557030, "step": 972, "time_per_iteration": 2.655426025390625 }, { "auxiliary_loss_clip": 0.01340726, "auxiliary_loss_mlp": 0.01039599, "balance_loss_clip": 1.06498873, "balance_loss_mlp": 1.03050923, "epoch": 0.1169963325918355, "flos": 24061155131520.0, "grad_norm": 1.9599601951144572, "language_loss": 0.6481657, "learning_rate": 3.921225050965808e-06, "loss": 0.67196894, "num_input_tokens_seen": 20576915, "step": 973, "time_per_iteration": 2.731170177459717 }, { "auxiliary_loss_clip": 0.01392906, "auxiliary_loss_mlp": 0.01034822, "balance_loss_clip": 1.06260562, "balance_loss_mlp": 1.02460587, "epoch": 0.1171165754824746, "flos": 23368151059200.0, "grad_norm": 1.998440154152543, "language_loss": 0.75075018, "learning_rate": 3.921008435801612e-06, "loss": 0.77502751, "num_input_tokens_seen": 20596000, "step": 974, "time_per_iteration": 2.762303352355957 }, { "auxiliary_loss_clip": 0.01288288, "auxiliary_loss_mlp": 0.01035025, "balance_loss_clip": 1.06354105, "balance_loss_mlp": 1.02501702, "epoch": 0.11723681837311369, "flos": 18552243075840.0, "grad_norm": 2.0966722694374735, "language_loss": 0.75541955, "learning_rate": 3.920791529221963e-06, "loss": 0.77865267, "num_input_tokens_seen": 20614675, "step": 975, "time_per_iteration": 2.6846323013305664 }, { "auxiliary_loss_clip": 0.0134597, "auxiliary_loss_mlp": 0.02582581, "balance_loss_clip": 1.06344736, "balance_loss_mlp": 1.00033689, "epoch": 0.11735706126375278, "flos": 23550541344000.0, "grad_norm": 1.7053129909385294, "language_loss": 0.76786637, "learning_rate": 3.920574331259768e-06, "loss": 0.80715191, "num_input_tokens_seen": 20635875, "step": 976, "time_per_iteration": 2.8217179775238037 }, { "auxiliary_loss_clip": 0.01333384, "auxiliary_loss_mlp": 0.0104544, "balance_loss_clip": 1.06053913, "balance_loss_mlp": 1.03543222, "epoch": 0.11747730415439187, "flos": 22381541216640.0, "grad_norm": 2.4338673890569114, "language_loss": 0.80027485, "learning_rate": 3.9203568419479716e-06, "loss": 0.82406306, "num_input_tokens_seen": 20656430, "step": 977, "time_per_iteration": 4.004366636276245 }, { "auxiliary_loss_clip": 0.01318425, "auxiliary_loss_mlp": 0.0103215, "balance_loss_clip": 1.06452751, "balance_loss_mlp": 1.02276218, "epoch": 0.11759754704503096, "flos": 22200731130240.0, "grad_norm": 1.803150030159134, "language_loss": 0.75081146, "learning_rate": 3.92013906131957e-06, "loss": 0.7743172, "num_input_tokens_seen": 20675360, "step": 978, "time_per_iteration": 2.820523500442505 }, { "auxiliary_loss_clip": 0.01393745, "auxiliary_loss_mlp": 0.01039788, "balance_loss_clip": 1.06206059, "balance_loss_mlp": 1.03015018, "epoch": 0.11771778993567006, "flos": 22309755886080.0, "grad_norm": 3.5798691031246963, "language_loss": 0.82059491, "learning_rate": 3.9199209894076e-06, "loss": 0.84493023, "num_input_tokens_seen": 20695675, "step": 979, "time_per_iteration": 2.8071773052215576 }, { "auxiliary_loss_clip": 0.01245779, "auxiliary_loss_mlp": 0.01036261, "balance_loss_clip": 1.06479585, "balance_loss_mlp": 1.026057, "epoch": 0.11783803282630914, "flos": 21288169175040.0, "grad_norm": 1.9029937296986277, "language_loss": 0.90398026, "learning_rate": 3.919702626245142e-06, "loss": 0.92680061, "num_input_tokens_seen": 20715330, "step": 980, "time_per_iteration": 2.648801565170288 }, { "auxiliary_loss_clip": 0.01331912, "auxiliary_loss_mlp": 0.01041752, "balance_loss_clip": 1.05808067, "balance_loss_mlp": 1.03161287, "epoch": 0.11795827571694824, "flos": 25371535190400.0, "grad_norm": 2.204839533090296, "language_loss": 0.66159213, "learning_rate": 3.919483971865322e-06, "loss": 0.68532872, "num_input_tokens_seen": 20735325, "step": 981, "time_per_iteration": 3.7246944904327393 }, { "auxiliary_loss_clip": 0.01344606, "auxiliary_loss_mlp": 0.01046283, "balance_loss_clip": 1.06710267, "balance_loss_mlp": 1.036412, "epoch": 0.11807851860758732, "flos": 23622218933760.0, "grad_norm": 4.963403555527744, "language_loss": 0.87475848, "learning_rate": 3.91926502630131e-06, "loss": 0.89866734, "num_input_tokens_seen": 20755940, "step": 982, "time_per_iteration": 2.68692946434021 }, { "auxiliary_loss_clip": 0.01297968, "auxiliary_loss_mlp": 0.01044138, "balance_loss_clip": 1.06936026, "balance_loss_mlp": 1.03451765, "epoch": 0.11819876149822642, "flos": 24972496024320.0, "grad_norm": 1.7667037418856715, "language_loss": 0.72240376, "learning_rate": 3.91904578958632e-06, "loss": 0.74582481, "num_input_tokens_seen": 20775355, "step": 983, "time_per_iteration": 2.748490333557129 }, { "auxiliary_loss_clip": 0.01247141, "auxiliary_loss_mlp": 0.01038992, "balance_loss_clip": 1.06711888, "balance_loss_mlp": 1.02873993, "epoch": 0.11831900438886551, "flos": 23003226835200.0, "grad_norm": 2.028104802745636, "language_loss": 0.84126282, "learning_rate": 3.918826261753608e-06, "loss": 0.86412418, "num_input_tokens_seen": 20794935, "step": 984, "time_per_iteration": 2.6228108406066895 }, { "auxiliary_loss_clip": 0.01342457, "auxiliary_loss_mlp": 0.01042555, "balance_loss_clip": 1.06367612, "balance_loss_mlp": 1.03381729, "epoch": 0.1184392472795046, "flos": 27965147604480.0, "grad_norm": 2.752864549590666, "language_loss": 0.71450686, "learning_rate": 3.918606442836478e-06, "loss": 0.73835695, "num_input_tokens_seen": 20817155, "step": 985, "time_per_iteration": 3.6273419857025146 }, { "auxiliary_loss_clip": 0.01295527, "auxiliary_loss_mlp": 0.01032982, "balance_loss_clip": 1.06925452, "balance_loss_mlp": 1.02304566, "epoch": 0.1185594901701437, "flos": 19898497843200.0, "grad_norm": 1.7983104691525906, "language_loss": 0.77599549, "learning_rate": 3.918386332868277e-06, "loss": 0.79928058, "num_input_tokens_seen": 20835125, "step": 986, "time_per_iteration": 2.6986594200134277 }, { "auxiliary_loss_clip": 0.01288109, "auxiliary_loss_mlp": 0.01037349, "balance_loss_clip": 1.06375456, "balance_loss_mlp": 1.0277046, "epoch": 0.11867973306078278, "flos": 18912354877440.0, "grad_norm": 2.1975139163730986, "language_loss": 0.94674289, "learning_rate": 3.918165931882394e-06, "loss": 0.96999753, "num_input_tokens_seen": 20853525, "step": 987, "time_per_iteration": 2.6833527088165283 }, { "auxiliary_loss_clip": 0.01482065, "auxiliary_loss_mlp": 0.01039819, "balance_loss_clip": 1.05339384, "balance_loss_mlp": 1.03036571, "epoch": 0.11879997595142187, "flos": 16982803152000.0, "grad_norm": 2.813460551244486, "language_loss": 0.75454199, "learning_rate": 3.917945239912264e-06, "loss": 0.77976084, "num_input_tokens_seen": 20871000, "step": 988, "time_per_iteration": 2.865227699279785 }, { "auxiliary_loss_clip": 0.01437958, "auxiliary_loss_mlp": 0.01036448, "balance_loss_clip": 1.05950737, "balance_loss_mlp": 1.02760267, "epoch": 0.11892021884206096, "flos": 17530369056000.0, "grad_norm": 2.16015198006088, "language_loss": 0.75424045, "learning_rate": 3.917724256991367e-06, "loss": 0.77898455, "num_input_tokens_seen": 20889745, "step": 989, "time_per_iteration": 2.923170328140259 }, { "auxiliary_loss_clip": 0.01334385, "auxiliary_loss_mlp": 0.01037677, "balance_loss_clip": 1.06268024, "balance_loss_mlp": 1.02790189, "epoch": 0.11904046173270005, "flos": 30955895763840.0, "grad_norm": 1.9134824509477115, "language_loss": 0.81484413, "learning_rate": 3.9175029831532245e-06, "loss": 0.83856475, "num_input_tokens_seen": 20909260, "step": 990, "time_per_iteration": 2.776963233947754 }, { "auxiliary_loss_clip": 0.0139553, "auxiliary_loss_mlp": 0.01035932, "balance_loss_clip": 1.06298769, "balance_loss_mlp": 1.02626419, "epoch": 0.11916070462333915, "flos": 20157234485760.0, "grad_norm": 1.998877878452502, "language_loss": 0.88408726, "learning_rate": 3.917281418431404e-06, "loss": 0.90840197, "num_input_tokens_seen": 20928305, "step": 991, "time_per_iteration": 2.6764466762542725 }, { "auxiliary_loss_clip": 0.01342878, "auxiliary_loss_mlp": 0.01039267, "balance_loss_clip": 1.06689668, "balance_loss_mlp": 1.02998018, "epoch": 0.11928094751397823, "flos": 23551115961600.0, "grad_norm": 2.8704474798227673, "language_loss": 0.76994747, "learning_rate": 3.917059562859516e-06, "loss": 0.79376888, "num_input_tokens_seen": 20947630, "step": 992, "time_per_iteration": 2.7464077472686768 }, { "auxiliary_loss_clip": 0.01335828, "auxiliary_loss_mlp": 0.01036728, "balance_loss_clip": 1.06329429, "balance_loss_mlp": 1.02782345, "epoch": 0.11940119040461733, "flos": 23908426502400.0, "grad_norm": 2.12705092378322, "language_loss": 0.8850342, "learning_rate": 3.916837416471218e-06, "loss": 0.90875977, "num_input_tokens_seen": 20964250, "step": 993, "time_per_iteration": 2.7478911876678467 }, { "auxiliary_loss_clip": 0.01289526, "auxiliary_loss_mlp": 0.01034342, "balance_loss_clip": 1.06192303, "balance_loss_mlp": 1.02556229, "epoch": 0.11952143329525641, "flos": 13844533835520.0, "grad_norm": 2.792484801835597, "language_loss": 0.73033917, "learning_rate": 3.916614979300207e-06, "loss": 0.75357783, "num_input_tokens_seen": 20979095, "step": 994, "time_per_iteration": 2.659621238708496 }, { "auxiliary_loss_clip": 0.01435876, "auxiliary_loss_mlp": 0.01040258, "balance_loss_clip": 1.05965388, "balance_loss_mlp": 1.0308764, "epoch": 0.11964167618589551, "flos": 27015525792000.0, "grad_norm": 1.723592125533583, "language_loss": 0.78637284, "learning_rate": 3.9163922513802274e-06, "loss": 0.81113416, "num_input_tokens_seen": 21001430, "step": 995, "time_per_iteration": 2.8458261489868164 }, { "auxiliary_loss_clip": 0.01247724, "auxiliary_loss_mlp": 0.0103438, "balance_loss_clip": 1.06698012, "balance_loss_mlp": 1.02375245, "epoch": 0.1197619190765346, "flos": 12567622273920.0, "grad_norm": 3.0957854067896995, "language_loss": 0.83295041, "learning_rate": 3.916169232745067e-06, "loss": 0.85577142, "num_input_tokens_seen": 21019105, "step": 996, "time_per_iteration": 2.569119691848755 }, { "auxiliary_loss_clip": 0.0133529, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.06155181, "balance_loss_mlp": 1.02631319, "epoch": 0.11988216196717369, "flos": 16909437623040.0, "grad_norm": 2.657099925408867, "language_loss": 0.92236924, "learning_rate": 3.915945923428559e-06, "loss": 0.94609296, "num_input_tokens_seen": 21035630, "step": 997, "time_per_iteration": 2.7391390800476074 }, { "auxiliary_loss_clip": 0.01291913, "auxiliary_loss_mlp": 0.01036491, "balance_loss_clip": 1.0622673, "balance_loss_mlp": 1.02692449, "epoch": 0.12000240485781279, "flos": 16216577205120.0, "grad_norm": 2.5313805168709886, "language_loss": 0.83239079, "learning_rate": 3.915722323464577e-06, "loss": 0.8556748, "num_input_tokens_seen": 21054235, "step": 998, "time_per_iteration": 2.6309993267059326 }, { "auxiliary_loss_clip": 0.0129161, "auxiliary_loss_mlp": 0.01038423, "balance_loss_clip": 1.06459343, "balance_loss_mlp": 1.02897549, "epoch": 0.12012264774845187, "flos": 49344887525760.0, "grad_norm": 3.853325400645241, "language_loss": 0.70694077, "learning_rate": 3.91549843288704e-06, "loss": 0.73024106, "num_input_tokens_seen": 21077915, "step": 999, "time_per_iteration": 2.907522678375244 }, { "auxiliary_loss_clip": 0.01393864, "auxiliary_loss_mlp": 0.02582726, "balance_loss_clip": 1.05837131, "balance_loss_mlp": 1.00040722, "epoch": 0.12024289063909097, "flos": 26979435601920.0, "grad_norm": 2.886786809362589, "language_loss": 0.79168689, "learning_rate": 3.915274251729916e-06, "loss": 0.83145285, "num_input_tokens_seen": 21099205, "step": 1000, "time_per_iteration": 2.7574703693389893 }, { "auxiliary_loss_clip": 0.01398874, "auxiliary_loss_mlp": 0.01038683, "balance_loss_clip": 1.06489754, "balance_loss_mlp": 1.02866936, "epoch": 0.12036313352973005, "flos": 19537308633600.0, "grad_norm": 2.029983095546796, "language_loss": 0.90217364, "learning_rate": 3.91504978002721e-06, "loss": 0.9265492, "num_input_tokens_seen": 21118260, "step": 1001, "time_per_iteration": 2.913487672805786 }, { "auxiliary_loss_clip": 0.01319676, "auxiliary_loss_mlp": 0.02583087, "balance_loss_clip": 1.06075978, "balance_loss_mlp": 1.00035501, "epoch": 0.12048337642036915, "flos": 17268256535040.0, "grad_norm": 1.9447313997119695, "language_loss": 0.76220214, "learning_rate": 3.914825017812974e-06, "loss": 0.80122972, "num_input_tokens_seen": 21134910, "step": 1002, "time_per_iteration": 2.6729581356048584 }, { "auxiliary_loss_clip": 0.01317772, "auxiliary_loss_mlp": 0.01039911, "balance_loss_clip": 1.06616306, "balance_loss_mlp": 1.03001702, "epoch": 0.12060361931100824, "flos": 22856962654080.0, "grad_norm": 2.2885039956428392, "language_loss": 0.72545922, "learning_rate": 3.9145999651213065e-06, "loss": 0.74903607, "num_input_tokens_seen": 21154150, "step": 1003, "time_per_iteration": 3.735759973526001 }, { "auxiliary_loss_clip": 0.01293444, "auxiliary_loss_mlp": 0.01035621, "balance_loss_clip": 1.06507087, "balance_loss_mlp": 1.02579808, "epoch": 0.12072386220164733, "flos": 16726795943040.0, "grad_norm": 2.2614816570193716, "language_loss": 0.88570404, "learning_rate": 3.9143746219863465e-06, "loss": 0.90899467, "num_input_tokens_seen": 21171255, "step": 1004, "time_per_iteration": 2.672090768814087 }, { "auxiliary_loss_clip": 0.01278261, "auxiliary_loss_mlp": 0.0100214, "balance_loss_clip": 1.05397677, "balance_loss_mlp": 0.99982768, "epoch": 0.12084410509228642, "flos": 55144176105600.0, "grad_norm": 0.9189653375814852, "language_loss": 0.64760828, "learning_rate": 3.914148988442278e-06, "loss": 0.6704123, "num_input_tokens_seen": 21227045, "step": 1005, "time_per_iteration": 3.197863817214966 }, { "auxiliary_loss_clip": 0.01337729, "auxiliary_loss_mlp": 0.01038662, "balance_loss_clip": 1.06509173, "balance_loss_mlp": 1.0276711, "epoch": 0.1209643479829255, "flos": 26760236855040.0, "grad_norm": 2.640161586922222, "language_loss": 0.9527272, "learning_rate": 3.91392306452333e-06, "loss": 0.97649121, "num_input_tokens_seen": 21244120, "step": 1006, "time_per_iteration": 2.7459659576416016 }, { "auxiliary_loss_clip": 0.01251836, "auxiliary_loss_mlp": 0.0103907, "balance_loss_clip": 1.06962454, "balance_loss_mlp": 1.02856779, "epoch": 0.1210845908735646, "flos": 11035026725760.0, "grad_norm": 3.4409086786914687, "language_loss": 0.66485387, "learning_rate": 3.913696850263774e-06, "loss": 0.68776298, "num_input_tokens_seen": 21258485, "step": 1007, "time_per_iteration": 3.503448247909546 }, { "auxiliary_loss_clip": 0.01294444, "auxiliary_loss_mlp": 0.01036672, "balance_loss_clip": 1.06367779, "balance_loss_mlp": 1.02633619, "epoch": 0.1212048337642037, "flos": 20484631975680.0, "grad_norm": 2.5282719155222253, "language_loss": 0.79048109, "learning_rate": 3.913470345697929e-06, "loss": 0.81379229, "num_input_tokens_seen": 21277115, "step": 1008, "time_per_iteration": 3.547529935836792 }, { "auxiliary_loss_clip": 0.01419247, "auxiliary_loss_mlp": 0.01041107, "balance_loss_clip": 1.06251788, "balance_loss_mlp": 1.0303843, "epoch": 0.12132507665484278, "flos": 22346061557760.0, "grad_norm": 2.3257032300983194, "language_loss": 0.85947615, "learning_rate": 3.913243550860153e-06, "loss": 0.88407969, "num_input_tokens_seen": 21294880, "step": 1009, "time_per_iteration": 2.7789196968078613 }, { "auxiliary_loss_clip": 0.01295145, "auxiliary_loss_mlp": 0.0102929, "balance_loss_clip": 1.06855869, "balance_loss_mlp": 1.01896012, "epoch": 0.12144531954548188, "flos": 29314957818240.0, "grad_norm": 2.1232707265482045, "language_loss": 0.7597481, "learning_rate": 3.913016465784852e-06, "loss": 0.78299248, "num_input_tokens_seen": 21315555, "step": 1010, "time_per_iteration": 2.715496301651001 }, { "auxiliary_loss_clip": 0.0143896, "auxiliary_loss_mlp": 0.0103346, "balance_loss_clip": 1.05747843, "balance_loss_mlp": 1.02323818, "epoch": 0.12156556243612096, "flos": 20485242506880.0, "grad_norm": 2.3670575654094903, "language_loss": 0.7224142, "learning_rate": 3.912789090506474e-06, "loss": 0.74713838, "num_input_tokens_seen": 21334815, "step": 1011, "time_per_iteration": 3.6896867752075195 }, { "auxiliary_loss_clip": 0.01393311, "auxiliary_loss_mlp": 0.01035101, "balance_loss_clip": 1.05642164, "balance_loss_mlp": 1.02501631, "epoch": 0.12168580532676006, "flos": 16472009796480.0, "grad_norm": 2.215584114019043, "language_loss": 0.72113407, "learning_rate": 3.9125614250595114e-06, "loss": 0.74541819, "num_input_tokens_seen": 21351025, "step": 1012, "time_per_iteration": 2.8120834827423096 }, { "auxiliary_loss_clip": 0.01298827, "auxiliary_loss_mlp": 0.01037799, "balance_loss_clip": 1.06705296, "balance_loss_mlp": 1.02724862, "epoch": 0.12180604821739914, "flos": 15341290588800.0, "grad_norm": 2.5152550728034018, "language_loss": 0.88804054, "learning_rate": 3.912333469478502e-06, "loss": 0.91140687, "num_input_tokens_seen": 21368990, "step": 1013, "time_per_iteration": 2.6352038383483887 }, { "auxiliary_loss_clip": 0.0134446, "auxiliary_loss_mlp": 0.01040491, "balance_loss_clip": 1.06117713, "balance_loss_mlp": 1.03021526, "epoch": 0.12192629110803824, "flos": 19318038059520.0, "grad_norm": 2.598584918695924, "language_loss": 0.78111452, "learning_rate": 3.912105223798025e-06, "loss": 0.80496407, "num_input_tokens_seen": 21388410, "step": 1014, "time_per_iteration": 2.7253355979919434 }, { "auxiliary_loss_clip": 0.01319478, "auxiliary_loss_mlp": 0.01004546, "balance_loss_clip": 1.04120696, "balance_loss_mlp": 1.0022099, "epoch": 0.12204653399867733, "flos": 47725354085760.0, "grad_norm": 1.0006643638663886, "language_loss": 0.67604887, "learning_rate": 3.9118766880527065e-06, "loss": 0.69928914, "num_input_tokens_seen": 21442845, "step": 1015, "time_per_iteration": 3.2389609813690186 }, { "auxiliary_loss_clip": 0.01439881, "auxiliary_loss_mlp": 0.01041503, "balance_loss_clip": 1.06053257, "balance_loss_mlp": 1.03163886, "epoch": 0.12216677688931642, "flos": 18221936584320.0, "grad_norm": 1.716749675042218, "language_loss": 0.73764169, "learning_rate": 3.9116478622772145e-06, "loss": 0.76245558, "num_input_tokens_seen": 21461420, "step": 1016, "time_per_iteration": 2.8211042881011963 }, { "auxiliary_loss_clip": 0.01295985, "auxiliary_loss_mlp": 0.01042186, "balance_loss_clip": 1.06858826, "balance_loss_mlp": 1.03234506, "epoch": 0.12228701977995551, "flos": 27525636789120.0, "grad_norm": 1.723780567005092, "language_loss": 0.880144, "learning_rate": 3.911418746506261e-06, "loss": 0.90352571, "num_input_tokens_seen": 21481550, "step": 1017, "time_per_iteration": 2.732898473739624 }, { "auxiliary_loss_clip": 0.01299059, "auxiliary_loss_mlp": 0.01040168, "balance_loss_clip": 1.06894219, "balance_loss_mlp": 1.02965951, "epoch": 0.1224072626705946, "flos": 21798136517760.0, "grad_norm": 1.930967321459311, "language_loss": 0.78558606, "learning_rate": 3.911189340774604e-06, "loss": 0.80897838, "num_input_tokens_seen": 21501680, "step": 1018, "time_per_iteration": 2.631235361099243 }, { "auxiliary_loss_clip": 0.01353583, "auxiliary_loss_mlp": 0.01036554, "balance_loss_clip": 1.0645206, "balance_loss_mlp": 1.02711308, "epoch": 0.1225275055612337, "flos": 20703758895360.0, "grad_norm": 1.7230595805222502, "language_loss": 0.79995632, "learning_rate": 3.910959645117043e-06, "loss": 0.82385767, "num_input_tokens_seen": 21521015, "step": 1019, "time_per_iteration": 2.7133617401123047 }, { "auxiliary_loss_clip": 0.01260387, "auxiliary_loss_mlp": 0.02535217, "balance_loss_clip": 1.04663801, "balance_loss_mlp": 0.99988663, "epoch": 0.12264774845187278, "flos": 57745294462080.0, "grad_norm": 0.8181347583838138, "language_loss": 0.56718558, "learning_rate": 3.910729659568423e-06, "loss": 0.6051417, "num_input_tokens_seen": 21578200, "step": 1020, "time_per_iteration": 3.2254464626312256 }, { "auxiliary_loss_clip": 0.01343084, "auxiliary_loss_mlp": 0.0103881, "balance_loss_clip": 1.06438541, "balance_loss_mlp": 1.02861798, "epoch": 0.12276799134251187, "flos": 26396282298240.0, "grad_norm": 2.22206034318736, "language_loss": 0.8200112, "learning_rate": 3.9104993841636344e-06, "loss": 0.84383011, "num_input_tokens_seen": 21598770, "step": 1021, "time_per_iteration": 2.7387354373931885 }, { "auxiliary_loss_clip": 0.01340704, "auxiliary_loss_mlp": 0.0257778, "balance_loss_clip": 1.06590962, "balance_loss_mlp": 1.00028014, "epoch": 0.12288823423315097, "flos": 21064193919360.0, "grad_norm": 1.9110389560717111, "language_loss": 0.80865687, "learning_rate": 3.910268818937608e-06, "loss": 0.84784174, "num_input_tokens_seen": 21616925, "step": 1022, "time_per_iteration": 2.6528477668762207 }, { "auxiliary_loss_clip": 0.01406612, "auxiliary_loss_mlp": 0.01046553, "balance_loss_clip": 1.0599643, "balance_loss_mlp": 1.03695643, "epoch": 0.12300847712379005, "flos": 12312441077760.0, "grad_norm": 2.7178772481316575, "language_loss": 0.87827396, "learning_rate": 3.9100379639253196e-06, "loss": 0.90280557, "num_input_tokens_seen": 21633645, "step": 1023, "time_per_iteration": 2.7831780910491943 }, { "auxiliary_loss_clip": 0.01340408, "auxiliary_loss_mlp": 0.01035153, "balance_loss_clip": 1.05965173, "balance_loss_mlp": 1.02589619, "epoch": 0.12312872001442915, "flos": 16762239688320.0, "grad_norm": 2.990457049798616, "language_loss": 0.86351806, "learning_rate": 3.909806819161791e-06, "loss": 0.88727373, "num_input_tokens_seen": 21649120, "step": 1024, "time_per_iteration": 2.639815330505371 }, { "auxiliary_loss_clip": 0.01396582, "auxiliary_loss_mlp": 0.01036494, "balance_loss_clip": 1.06094265, "balance_loss_mlp": 1.02584219, "epoch": 0.12324896290506823, "flos": 18404937400320.0, "grad_norm": 1.9629491966316894, "language_loss": 0.86425287, "learning_rate": 3.909575384682086e-06, "loss": 0.8885836, "num_input_tokens_seen": 21668000, "step": 1025, "time_per_iteration": 2.740975856781006 }, { "auxiliary_loss_clip": 0.01296766, "auxiliary_loss_mlp": 0.01039074, "balance_loss_clip": 1.06305289, "balance_loss_mlp": 1.02964473, "epoch": 0.12336920579570733, "flos": 18915407533440.0, "grad_norm": 2.1534715785076934, "language_loss": 0.69360334, "learning_rate": 3.9093436605213144e-06, "loss": 0.71696168, "num_input_tokens_seen": 21688500, "step": 1026, "time_per_iteration": 2.7517013549804688 }, { "auxiliary_loss_clip": 0.01343613, "auxiliary_loss_mlp": 0.01038462, "balance_loss_clip": 1.06474471, "balance_loss_mlp": 1.02866864, "epoch": 0.12348944868634643, "flos": 23878369797120.0, "grad_norm": 2.176266823838163, "language_loss": 0.79669642, "learning_rate": 3.909111646714627e-06, "loss": 0.82051718, "num_input_tokens_seen": 21709345, "step": 1027, "time_per_iteration": 2.797309160232544 }, { "auxiliary_loss_clip": 0.01240356, "auxiliary_loss_mlp": 0.01035325, "balance_loss_clip": 1.06516373, "balance_loss_mlp": 1.02622962, "epoch": 0.12360969157698551, "flos": 19026084314880.0, "grad_norm": 2.910540673638394, "language_loss": 0.72376823, "learning_rate": 3.9088793432972206e-06, "loss": 0.74652505, "num_input_tokens_seen": 21728165, "step": 1028, "time_per_iteration": 2.705413579940796 }, { "auxiliary_loss_clip": 0.01440337, "auxiliary_loss_mlp": 0.01037581, "balance_loss_clip": 1.06041324, "balance_loss_mlp": 1.02833629, "epoch": 0.1237299344676246, "flos": 13224607983360.0, "grad_norm": 2.4391487258827564, "language_loss": 0.8223623, "learning_rate": 3.908646750304336e-06, "loss": 0.8471415, "num_input_tokens_seen": 21745850, "step": 1029, "time_per_iteration": 3.6682639122009277 }, { "auxiliary_loss_clip": 0.01345953, "auxiliary_loss_mlp": 0.0103923, "balance_loss_clip": 1.06654406, "balance_loss_mlp": 1.02977097, "epoch": 0.12385017735826369, "flos": 20485673470080.0, "grad_norm": 1.7917288627512735, "language_loss": 0.87332952, "learning_rate": 3.908413867771257e-06, "loss": 0.89718133, "num_input_tokens_seen": 21764760, "step": 1030, "time_per_iteration": 2.7442803382873535 }, { "auxiliary_loss_clip": 0.01294487, "auxiliary_loss_mlp": 0.01041184, "balance_loss_clip": 1.06720817, "balance_loss_mlp": 1.03126585, "epoch": 0.12397042024890279, "flos": 17347835116800.0, "grad_norm": 6.32031896179411, "language_loss": 0.80504519, "learning_rate": 3.908180695733311e-06, "loss": 0.82840192, "num_input_tokens_seen": 21784250, "step": 1031, "time_per_iteration": 2.6223134994506836 }, { "auxiliary_loss_clip": 0.01421523, "auxiliary_loss_mlp": 0.01048768, "balance_loss_clip": 1.05300879, "balance_loss_mlp": 1.03929675, "epoch": 0.12409066313954187, "flos": 20412343854720.0, "grad_norm": 2.2884537232718083, "language_loss": 0.83121616, "learning_rate": 3.907947234225871e-06, "loss": 0.85591906, "num_input_tokens_seen": 21803260, "step": 1032, "time_per_iteration": 2.75187349319458 }, { "auxiliary_loss_clip": 0.01482051, "auxiliary_loss_mlp": 0.01035867, "balance_loss_clip": 1.05372047, "balance_loss_mlp": 1.02591896, "epoch": 0.12421090603018096, "flos": 20736688688640.0, "grad_norm": 1.8535359309129105, "language_loss": 0.87115133, "learning_rate": 3.907713483284352e-06, "loss": 0.8963306, "num_input_tokens_seen": 21822735, "step": 1033, "time_per_iteration": 4.691546440124512 }, { "auxiliary_loss_clip": 0.01537814, "auxiliary_loss_mlp": 0.0103973, "balance_loss_clip": 1.05278254, "balance_loss_mlp": 1.03002071, "epoch": 0.12433114892082006, "flos": 24498834353280.0, "grad_norm": 2.500585422991512, "language_loss": 0.97483373, "learning_rate": 3.907479442944216e-06, "loss": 1.00060916, "num_input_tokens_seen": 21841140, "step": 1034, "time_per_iteration": 3.0827810764312744 }, { "auxiliary_loss_clip": 0.01290684, "auxiliary_loss_mlp": 0.01039132, "balance_loss_clip": 1.0669347, "balance_loss_mlp": 1.02969122, "epoch": 0.12445139181145914, "flos": 19682315838720.0, "grad_norm": 4.553894732239057, "language_loss": 0.92392284, "learning_rate": 3.907245113240963e-06, "loss": 0.94722092, "num_input_tokens_seen": 21859260, "step": 1035, "time_per_iteration": 3.0423970222473145 }, { "auxiliary_loss_clip": 0.01387064, "auxiliary_loss_mlp": 0.01035412, "balance_loss_clip": 1.0558157, "balance_loss_mlp": 1.02555382, "epoch": 0.12457163470209824, "flos": 46423087522560.0, "grad_norm": 2.3557287102610713, "language_loss": 0.7379967, "learning_rate": 3.907010494210144e-06, "loss": 0.76222152, "num_input_tokens_seen": 21881920, "step": 1036, "time_per_iteration": 3.0001912117004395 }, { "auxiliary_loss_clip": 0.01297196, "auxiliary_loss_mlp": 0.01037616, "balance_loss_clip": 1.06637919, "balance_loss_mlp": 1.02653003, "epoch": 0.12469187759273732, "flos": 20376289578240.0, "grad_norm": 2.3300754326467414, "language_loss": 0.92199647, "learning_rate": 3.9067755858873495e-06, "loss": 0.94534463, "num_input_tokens_seen": 21898720, "step": 1037, "time_per_iteration": 3.639268398284912 }, { "auxiliary_loss_clip": 0.01306794, "auxiliary_loss_mlp": 0.01005386, "balance_loss_clip": 1.03875816, "balance_loss_mlp": 1.00314522, "epoch": 0.12481212048337642, "flos": 69224641447680.0, "grad_norm": 0.863598851473667, "language_loss": 0.62823081, "learning_rate": 3.906540388308214e-06, "loss": 0.65135264, "num_input_tokens_seen": 21958305, "step": 1038, "time_per_iteration": 3.353687286376953 }, { "auxiliary_loss_clip": 0.01429277, "auxiliary_loss_mlp": 0.01040911, "balance_loss_clip": 1.05987155, "balance_loss_mlp": 1.03143406, "epoch": 0.12493236337401552, "flos": 18223696350720.0, "grad_norm": 3.5622364525151813, "language_loss": 0.81696403, "learning_rate": 3.906304901508417e-06, "loss": 0.84166598, "num_input_tokens_seen": 21977205, "step": 1039, "time_per_iteration": 2.8104772567749023 }, { "auxiliary_loss_clip": 0.01299577, "auxiliary_loss_mlp": 0.01035889, "balance_loss_clip": 1.07145238, "balance_loss_mlp": 1.02619112, "epoch": 0.12505260626465461, "flos": 30044375303040.0, "grad_norm": 2.951532629988132, "language_loss": 0.75706816, "learning_rate": 3.9060691255236835e-06, "loss": 0.78042287, "num_input_tokens_seen": 21997770, "step": 1040, "time_per_iteration": 2.770315170288086 }, { "auxiliary_loss_clip": 0.01289806, "auxiliary_loss_mlp": 0.01041617, "balance_loss_clip": 1.06177926, "balance_loss_mlp": 1.03169298, "epoch": 0.1251728491552937, "flos": 24433980347520.0, "grad_norm": 2.426636070807813, "language_loss": 0.80838907, "learning_rate": 3.905833060389778e-06, "loss": 0.83170331, "num_input_tokens_seen": 22021890, "step": 1041, "time_per_iteration": 2.883533239364624 }, { "auxiliary_loss_clip": 0.0124667, "auxiliary_loss_mlp": 0.02577704, "balance_loss_clip": 1.06891298, "balance_loss_mlp": 1.00033617, "epoch": 0.12529309204593278, "flos": 27119809952640.0, "grad_norm": 4.732655078779728, "language_loss": 0.78474116, "learning_rate": 3.905596706142513e-06, "loss": 0.82298487, "num_input_tokens_seen": 22043300, "step": 1042, "time_per_iteration": 2.696477174758911 }, { "auxiliary_loss_clip": 0.01383292, "auxiliary_loss_mlp": 0.01037714, "balance_loss_clip": 1.05595922, "balance_loss_mlp": 1.02753377, "epoch": 0.12541333493657186, "flos": 30774151923840.0, "grad_norm": 1.835497403340833, "language_loss": 0.86225051, "learning_rate": 3.9053600628177435e-06, "loss": 0.88646054, "num_input_tokens_seen": 22062910, "step": 1043, "time_per_iteration": 2.8718841075897217 }, { "auxiliary_loss_clip": 0.01242723, "auxiliary_loss_mlp": 0.01036548, "balance_loss_clip": 1.06495392, "balance_loss_mlp": 1.02730894, "epoch": 0.12553357782721097, "flos": 23659566099840.0, "grad_norm": 2.477610630537506, "language_loss": 0.85024589, "learning_rate": 3.905123130451367e-06, "loss": 0.87303859, "num_input_tokens_seen": 22084010, "step": 1044, "time_per_iteration": 2.6648738384246826 }, { "auxiliary_loss_clip": 0.01245588, "auxiliary_loss_mlp": 0.01039003, "balance_loss_clip": 1.06789231, "balance_loss_mlp": 1.02896523, "epoch": 0.12565382071785006, "flos": 24863758577280.0, "grad_norm": 2.2705900016151537, "language_loss": 0.79511189, "learning_rate": 3.904885909079326e-06, "loss": 0.81795776, "num_input_tokens_seen": 22102795, "step": 1045, "time_per_iteration": 2.675853967666626 }, { "auxiliary_loss_clip": 0.01291028, "auxiliary_loss_mlp": 0.01031709, "balance_loss_clip": 1.0649246, "balance_loss_mlp": 1.02214789, "epoch": 0.12577406360848914, "flos": 21360780518400.0, "grad_norm": 2.7702429910888524, "language_loss": 0.78026116, "learning_rate": 3.904648398737607e-06, "loss": 0.80348861, "num_input_tokens_seen": 22121360, "step": 1046, "time_per_iteration": 2.7044944763183594 }, { "auxiliary_loss_clip": 0.01243404, "auxiliary_loss_mlp": 0.01036418, "balance_loss_clip": 1.06525397, "balance_loss_mlp": 1.02743542, "epoch": 0.12589430649912825, "flos": 36138056774400.0, "grad_norm": 3.119109639960784, "language_loss": 0.781142, "learning_rate": 3.9044105994622406e-06, "loss": 0.80394018, "num_input_tokens_seen": 22142505, "step": 1047, "time_per_iteration": 2.6837165355682373 }, { "auxiliary_loss_clip": 0.01312788, "auxiliary_loss_mlp": 0.02583297, "balance_loss_clip": 1.06260026, "balance_loss_mlp": 1.00042391, "epoch": 0.12601454938976733, "flos": 25337671643520.0, "grad_norm": 4.36570031952575, "language_loss": 0.81893015, "learning_rate": 3.9041725112893005e-06, "loss": 0.85789096, "num_input_tokens_seen": 22163730, "step": 1048, "time_per_iteration": 2.7359445095062256 }, { "auxiliary_loss_clip": 0.01385725, "auxiliary_loss_mlp": 0.0103136, "balance_loss_clip": 1.06263566, "balance_loss_mlp": 1.02148938, "epoch": 0.12613479228040642, "flos": 15560094286080.0, "grad_norm": 6.287642404789657, "language_loss": 0.75046349, "learning_rate": 3.903934134254904e-06, "loss": 0.77463436, "num_input_tokens_seen": 22181520, "step": 1049, "time_per_iteration": 2.6382410526275635 }, { "auxiliary_loss_clip": 0.01301169, "auxiliary_loss_mlp": 0.01046265, "balance_loss_clip": 1.06569946, "balance_loss_mlp": 1.03574431, "epoch": 0.1262550351710455, "flos": 21470595373440.0, "grad_norm": 3.299026546041682, "language_loss": 0.84800333, "learning_rate": 3.903695468395213e-06, "loss": 0.8714776, "num_input_tokens_seen": 22199390, "step": 1050, "time_per_iteration": 2.672031879425049 }, { "auxiliary_loss_clip": 0.01348669, "auxiliary_loss_mlp": 0.01037689, "balance_loss_clip": 1.06135941, "balance_loss_mlp": 1.02748513, "epoch": 0.1263752780616846, "flos": 31576719456000.0, "grad_norm": 5.15939727483319, "language_loss": 0.55985272, "learning_rate": 3.903456513746434e-06, "loss": 0.58371627, "num_input_tokens_seen": 22220365, "step": 1051, "time_per_iteration": 2.756950855255127 }, { "auxiliary_loss_clip": 0.01242176, "auxiliary_loss_mlp": 0.01033624, "balance_loss_clip": 1.066082, "balance_loss_mlp": 1.02480888, "epoch": 0.1264955209523237, "flos": 28768217927040.0, "grad_norm": 1.85684324983257, "language_loss": 0.87579155, "learning_rate": 3.903217270344815e-06, "loss": 0.8985495, "num_input_tokens_seen": 22240615, "step": 1052, "time_per_iteration": 2.691006660461426 }, { "auxiliary_loss_clip": 0.01381825, "auxiliary_loss_mlp": 0.0103412, "balance_loss_clip": 1.05729461, "balance_loss_mlp": 1.02449942, "epoch": 0.12661576384296278, "flos": 29241125412480.0, "grad_norm": 2.0501026122530264, "language_loss": 0.82653546, "learning_rate": 3.902977738226648e-06, "loss": 0.85069495, "num_input_tokens_seen": 22261350, "step": 1053, "time_per_iteration": 2.772590398788452 }, { "auxiliary_loss_clip": 0.01295569, "auxiliary_loss_mlp": 0.01042874, "balance_loss_clip": 1.06772745, "balance_loss_mlp": 1.03218114, "epoch": 0.12673600673360189, "flos": 20850346298880.0, "grad_norm": 1.8253796299615879, "language_loss": 0.90757728, "learning_rate": 3.902737917428273e-06, "loss": 0.93096161, "num_input_tokens_seen": 22279515, "step": 1054, "time_per_iteration": 2.724503993988037 }, { "auxiliary_loss_clip": 0.01245023, "auxiliary_loss_mlp": 0.0103661, "balance_loss_clip": 1.06803191, "balance_loss_mlp": 1.02710342, "epoch": 0.12685624962424097, "flos": 25263695583360.0, "grad_norm": 1.833856767039498, "language_loss": 0.84042144, "learning_rate": 3.902497807986068e-06, "loss": 0.86323774, "num_input_tokens_seen": 22299535, "step": 1055, "time_per_iteration": 3.622091770172119 }, { "auxiliary_loss_clip": 0.01393145, "auxiliary_loss_mlp": 0.01034182, "balance_loss_clip": 1.05693316, "balance_loss_mlp": 1.0242517, "epoch": 0.12697649251488005, "flos": 27527109246720.0, "grad_norm": 1.6805193593399024, "language_loss": 0.84150159, "learning_rate": 3.902257409936458e-06, "loss": 0.86577487, "num_input_tokens_seen": 22320300, "step": 1056, "time_per_iteration": 2.7717368602752686 }, { "auxiliary_loss_clip": 0.01346749, "auxiliary_loss_mlp": 0.01037126, "balance_loss_clip": 1.06822801, "balance_loss_mlp": 1.02807844, "epoch": 0.12709673540551916, "flos": 21251863503360.0, "grad_norm": 3.7135101945624873, "language_loss": 0.83820093, "learning_rate": 3.902016723315912e-06, "loss": 0.86203969, "num_input_tokens_seen": 22338240, "step": 1057, "time_per_iteration": 2.7000856399536133 }, { "auxiliary_loss_clip": 0.01292485, "auxiliary_loss_mlp": 0.01040398, "balance_loss_clip": 1.06305468, "balance_loss_mlp": 1.03108215, "epoch": 0.12721697829615825, "flos": 25337707557120.0, "grad_norm": 4.111478008468384, "language_loss": 0.69559968, "learning_rate": 3.901775748160941e-06, "loss": 0.71892852, "num_input_tokens_seen": 22357420, "step": 1058, "time_per_iteration": 2.6608502864837646 }, { "auxiliary_loss_clip": 0.01247511, "auxiliary_loss_mlp": 0.01007918, "balance_loss_clip": 1.04084027, "balance_loss_mlp": 1.0056529, "epoch": 0.12733722118679733, "flos": 61943287754880.0, "grad_norm": 0.8025903075024308, "language_loss": 0.60786015, "learning_rate": 3.901534484508101e-06, "loss": 0.63041455, "num_input_tokens_seen": 22420095, "step": 1059, "time_per_iteration": 4.565524101257324 }, { "auxiliary_loss_clip": 0.01333275, "auxiliary_loss_mlp": 0.01036571, "balance_loss_clip": 1.05886221, "balance_loss_mlp": 1.02633119, "epoch": 0.1274574640774364, "flos": 26976742081920.0, "grad_norm": 1.9943701327762455, "language_loss": 0.74765861, "learning_rate": 3.901292932393991e-06, "loss": 0.77135706, "num_input_tokens_seen": 22438975, "step": 1060, "time_per_iteration": 2.795952081680298 }, { "auxiliary_loss_clip": 0.01249728, "auxiliary_loss_mlp": 0.01038241, "balance_loss_clip": 1.06968594, "balance_loss_mlp": 1.02874017, "epoch": 0.12757770696807552, "flos": 22236318529920.0, "grad_norm": 2.181677906686555, "language_loss": 0.8502875, "learning_rate": 3.9010510918552555e-06, "loss": 0.87316716, "num_input_tokens_seen": 22458050, "step": 1061, "time_per_iteration": 2.6524336338043213 }, { "auxiliary_loss_clip": 0.01344603, "auxiliary_loss_mlp": 0.01039221, "balance_loss_clip": 1.06109703, "balance_loss_mlp": 1.02865338, "epoch": 0.1276979498587146, "flos": 28547905858560.0, "grad_norm": 2.508868136373472, "language_loss": 0.74464768, "learning_rate": 3.900808962928581e-06, "loss": 0.7684859, "num_input_tokens_seen": 22475665, "step": 1062, "time_per_iteration": 2.764497995376587 }, { "auxiliary_loss_clip": 0.01244101, "auxiliary_loss_mlp": 0.01032045, "balance_loss_clip": 1.06895685, "balance_loss_mlp": 1.02283931, "epoch": 0.1278181927493537, "flos": 17420338719360.0, "grad_norm": 2.2708282896771976, "language_loss": 0.8938669, "learning_rate": 3.900566545650698e-06, "loss": 0.91662842, "num_input_tokens_seen": 22493335, "step": 1063, "time_per_iteration": 3.578157901763916 }, { "auxiliary_loss_clip": 0.01293241, "auxiliary_loss_mlp": 0.01044841, "balance_loss_clip": 1.06752348, "balance_loss_mlp": 1.03528619, "epoch": 0.1279384356399928, "flos": 21138636856320.0, "grad_norm": 2.341749103151016, "language_loss": 0.81419802, "learning_rate": 3.900323840058381e-06, "loss": 0.83757889, "num_input_tokens_seen": 22511045, "step": 1064, "time_per_iteration": 2.7035741806030273 }, { "auxiliary_loss_clip": 0.01291601, "auxiliary_loss_mlp": 0.01037516, "balance_loss_clip": 1.06175828, "balance_loss_mlp": 1.02855158, "epoch": 0.12805867853063188, "flos": 26576733248640.0, "grad_norm": 2.759902771815658, "language_loss": 0.81919968, "learning_rate": 3.900080846188449e-06, "loss": 0.84249091, "num_input_tokens_seen": 22529635, "step": 1065, "time_per_iteration": 2.765500068664551 }, { "auxiliary_loss_clip": 0.01245315, "auxiliary_loss_mlp": 0.01037051, "balance_loss_clip": 1.06710136, "balance_loss_mlp": 1.02746022, "epoch": 0.12817892142127096, "flos": 16436206915200.0, "grad_norm": 1.9865444586121985, "language_loss": 0.81153405, "learning_rate": 3.8998375640777625e-06, "loss": 0.83435774, "num_input_tokens_seen": 22547505, "step": 1066, "time_per_iteration": 2.6787545680999756 }, { "auxiliary_loss_clip": 0.01243247, "auxiliary_loss_mlp": 0.01016915, "balance_loss_clip": 1.03191364, "balance_loss_mlp": 1.01469827, "epoch": 0.12829916431191005, "flos": 60757049099520.0, "grad_norm": 0.706508092372491, "language_loss": 0.52649945, "learning_rate": 3.899593993763229e-06, "loss": 0.54910111, "num_input_tokens_seen": 22608465, "step": 1067, "time_per_iteration": 3.256310224533081 }, { "auxiliary_loss_clip": 0.01386423, "auxiliary_loss_mlp": 0.01038669, "balance_loss_clip": 1.06077981, "balance_loss_mlp": 1.02845263, "epoch": 0.12841940720254916, "flos": 29786895636480.0, "grad_norm": 2.264765518305721, "language_loss": 0.81282687, "learning_rate": 3.899350135281796e-06, "loss": 0.8370778, "num_input_tokens_seen": 22629465, "step": 1068, "time_per_iteration": 2.8394534587860107 }, { "auxiliary_loss_clip": 0.01390352, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.06016493, "balance_loss_mlp": 1.03188825, "epoch": 0.12853965009318824, "flos": 25951851319680.0, "grad_norm": 2.694704887766841, "language_loss": 0.79806817, "learning_rate": 3.8991059886704585e-06, "loss": 0.82238591, "num_input_tokens_seen": 22648970, "step": 1069, "time_per_iteration": 2.803236246109009 }, { "auxiliary_loss_clip": 0.01384311, "auxiliary_loss_mlp": 0.0104697, "balance_loss_clip": 1.0608021, "balance_loss_mlp": 1.03774309, "epoch": 0.12865989298382732, "flos": 30846871008000.0, "grad_norm": 2.0358647036535307, "language_loss": 0.83353686, "learning_rate": 3.898861553966252e-06, "loss": 0.85784972, "num_input_tokens_seen": 22668620, "step": 1070, "time_per_iteration": 2.8915727138519287 }, { "auxiliary_loss_clip": 0.01538432, "auxiliary_loss_mlp": 0.01041515, "balance_loss_clip": 1.05210805, "balance_loss_mlp": 1.03181744, "epoch": 0.12878013587446643, "flos": 25885776251520.0, "grad_norm": 2.3744056053138234, "language_loss": 0.88227868, "learning_rate": 3.898616831206257e-06, "loss": 0.90807819, "num_input_tokens_seen": 22689045, "step": 1071, "time_per_iteration": 3.0063436031341553 }, { "auxiliary_loss_clip": 0.01386603, "auxiliary_loss_mlp": 0.01035663, "balance_loss_clip": 1.05653954, "balance_loss_mlp": 1.02469587, "epoch": 0.12890037876510552, "flos": 23333138277120.0, "grad_norm": 2.226549465521729, "language_loss": 0.76570749, "learning_rate": 3.8983718204276e-06, "loss": 0.78993011, "num_input_tokens_seen": 22711265, "step": 1072, "time_per_iteration": 3.017740249633789 }, { "auxiliary_loss_clip": 0.01339223, "auxiliary_loss_mlp": 0.01032606, "balance_loss_clip": 1.06218982, "balance_loss_mlp": 1.02377903, "epoch": 0.1290206216557446, "flos": 23587242065280.0, "grad_norm": 1.7555074247924802, "language_loss": 0.82716227, "learning_rate": 3.898126521667446e-06, "loss": 0.8508805, "num_input_tokens_seen": 22731420, "step": 1073, "time_per_iteration": 2.7249915599823 }, { "auxiliary_loss_clip": 0.01290552, "auxiliary_loss_mlp": 0.01040039, "balance_loss_clip": 1.0618577, "balance_loss_mlp": 1.03010917, "epoch": 0.12914086454638368, "flos": 24170610850560.0, "grad_norm": 1.7604790071450669, "language_loss": 0.83473051, "learning_rate": 3.897880934963007e-06, "loss": 0.8580364, "num_input_tokens_seen": 22750970, "step": 1074, "time_per_iteration": 2.685483694076538 }, { "auxiliary_loss_clip": 0.01341828, "auxiliary_loss_mlp": 0.01037824, "balance_loss_clip": 1.06099582, "balance_loss_mlp": 1.02768552, "epoch": 0.1292611074370228, "flos": 20267157081600.0, "grad_norm": 2.491623084497036, "language_loss": 0.7849015, "learning_rate": 3.89763506035154e-06, "loss": 0.80869806, "num_input_tokens_seen": 22768820, "step": 1075, "time_per_iteration": 2.670273780822754 }, { "auxiliary_loss_clip": 0.01279084, "auxiliary_loss_mlp": 0.01041654, "balance_loss_clip": 1.05962682, "balance_loss_mlp": 1.03246915, "epoch": 0.12938135032766188, "flos": 27377684668800.0, "grad_norm": 1.7727017415757433, "language_loss": 0.8121444, "learning_rate": 3.897388897870343e-06, "loss": 0.83535182, "num_input_tokens_seen": 22789460, "step": 1076, "time_per_iteration": 2.716249465942383 }, { "auxiliary_loss_clip": 0.0135028, "auxiliary_loss_mlp": 0.01037792, "balance_loss_clip": 1.06110168, "balance_loss_mlp": 1.02727795, "epoch": 0.12950159321830096, "flos": 29277107861760.0, "grad_norm": 2.3028365028123305, "language_loss": 0.7471751, "learning_rate": 3.89714244755676e-06, "loss": 0.77105582, "num_input_tokens_seen": 22810820, "step": 1077, "time_per_iteration": 2.8057966232299805 }, { "auxiliary_loss_clip": 0.01397167, "auxiliary_loss_mlp": 0.01040059, "balance_loss_clip": 1.05386829, "balance_loss_mlp": 1.03000951, "epoch": 0.12962183610894007, "flos": 24534888629760.0, "grad_norm": 2.683895381081054, "language_loss": 0.86059237, "learning_rate": 3.896895709448175e-06, "loss": 0.88496464, "num_input_tokens_seen": 22830570, "step": 1078, "time_per_iteration": 2.7501142024993896 }, { "auxiliary_loss_clip": 0.01480562, "auxiliary_loss_mlp": 0.01040894, "balance_loss_clip": 1.05170631, "balance_loss_mlp": 1.03092206, "epoch": 0.12974207899957915, "flos": 11215944552960.0, "grad_norm": 3.63666254667062, "language_loss": 0.77384937, "learning_rate": 3.896648683582019e-06, "loss": 0.79906392, "num_input_tokens_seen": 22845905, "step": 1079, "time_per_iteration": 2.824110507965088 }, { "auxiliary_loss_clip": 0.0143582, "auxiliary_loss_mlp": 0.01034323, "balance_loss_clip": 1.06185532, "balance_loss_mlp": 1.02476835, "epoch": 0.12986232189021824, "flos": 24717889445760.0, "grad_norm": 2.018688304957884, "language_loss": 0.80809033, "learning_rate": 3.896401369995766e-06, "loss": 0.83279181, "num_input_tokens_seen": 22865710, "step": 1080, "time_per_iteration": 2.866117238998413 }, { "auxiliary_loss_clip": 0.01244394, "auxiliary_loss_mlp": 0.01030486, "balance_loss_clip": 1.06917, "balance_loss_mlp": 1.02111578, "epoch": 0.12998256478085732, "flos": 23915357827200.0, "grad_norm": 3.6439006008602894, "language_loss": 0.79551256, "learning_rate": 3.896153768726932e-06, "loss": 0.81826138, "num_input_tokens_seen": 22886020, "step": 1081, "time_per_iteration": 3.6568238735198975 }, { "auxiliary_loss_clip": 0.0129366, "auxiliary_loss_mlp": 0.01052043, "balance_loss_clip": 1.06803632, "balance_loss_mlp": 1.0422914, "epoch": 0.13010280767149643, "flos": 18624207974400.0, "grad_norm": 2.6892359351592074, "language_loss": 0.87502116, "learning_rate": 3.8959058798130806e-06, "loss": 0.89847815, "num_input_tokens_seen": 22903995, "step": 1082, "time_per_iteration": 2.648695230484009 }, { "auxiliary_loss_clip": 0.01344599, "auxiliary_loss_mlp": 0.02582366, "balance_loss_clip": 1.06214261, "balance_loss_mlp": 1.00052166, "epoch": 0.1302230505621355, "flos": 22783992174720.0, "grad_norm": 2.0663287598758355, "language_loss": 0.75120175, "learning_rate": 3.895657703291814e-06, "loss": 0.79047143, "num_input_tokens_seen": 22924100, "step": 1083, "time_per_iteration": 2.7483255863189697 }, { "auxiliary_loss_clip": 0.01349112, "auxiliary_loss_mlp": 0.01037536, "balance_loss_clip": 1.06229353, "balance_loss_mlp": 1.02684295, "epoch": 0.1303432934527746, "flos": 21323612920320.0, "grad_norm": 4.7239555414760135, "language_loss": 0.80034864, "learning_rate": 3.895409239200781e-06, "loss": 0.82421505, "num_input_tokens_seen": 22939985, "step": 1084, "time_per_iteration": 2.767587661743164 }, { "auxiliary_loss_clip": 0.012871, "auxiliary_loss_mlp": 0.0104056, "balance_loss_clip": 1.06101418, "balance_loss_mlp": 1.03051686, "epoch": 0.1304635363434137, "flos": 20922490765440.0, "grad_norm": 2.3488092712653548, "language_loss": 0.90904444, "learning_rate": 3.895160487577673e-06, "loss": 0.93232107, "num_input_tokens_seen": 22957555, "step": 1085, "time_per_iteration": 4.5021679401397705 }, { "auxiliary_loss_clip": 0.01197107, "auxiliary_loss_mlp": 0.01002446, "balance_loss_clip": 1.04086483, "balance_loss_mlp": 1.00032449, "epoch": 0.1305837792340528, "flos": 63245659080960.0, "grad_norm": 0.7868703211544962, "language_loss": 0.60932207, "learning_rate": 3.894911448460226e-06, "loss": 0.63131762, "num_input_tokens_seen": 23016870, "step": 1086, "time_per_iteration": 3.1641018390655518 }, { "auxiliary_loss_clip": 0.01517629, "auxiliary_loss_mlp": 0.01038907, "balance_loss_clip": 1.05172706, "balance_loss_mlp": 1.0284946, "epoch": 0.13070402212469187, "flos": 26428852955520.0, "grad_norm": 1.9519395581814911, "language_loss": 0.73089933, "learning_rate": 3.8946621218862195e-06, "loss": 0.75646472, "num_input_tokens_seen": 23037870, "step": 1087, "time_per_iteration": 3.0084848403930664 }, { "auxiliary_loss_clip": 0.01388439, "auxiliary_loss_mlp": 0.01032002, "balance_loss_clip": 1.06041622, "balance_loss_mlp": 1.02241194, "epoch": 0.13082426501533098, "flos": 27673409341440.0, "grad_norm": 2.2181223464362514, "language_loss": 0.89003354, "learning_rate": 3.894412507893475e-06, "loss": 0.91423792, "num_input_tokens_seen": 23058150, "step": 1088, "time_per_iteration": 2.9290642738342285 }, { "auxiliary_loss_clip": 0.01442545, "auxiliary_loss_mlp": 0.01042855, "balance_loss_clip": 1.05853152, "balance_loss_mlp": 1.03234053, "epoch": 0.13094450790597006, "flos": 24826770547200.0, "grad_norm": 2.0665356349500223, "language_loss": 0.72040093, "learning_rate": 3.894162606519859e-06, "loss": 0.74525487, "num_input_tokens_seen": 23077100, "step": 1089, "time_per_iteration": 3.6544559001922607 }, { "auxiliary_loss_clip": 0.01437543, "auxiliary_loss_mlp": 0.01046246, "balance_loss_clip": 1.06011796, "balance_loss_mlp": 1.0356065, "epoch": 0.13106475079660915, "flos": 19062605468160.0, "grad_norm": 3.078926725040933, "language_loss": 0.77237177, "learning_rate": 3.893912417803282e-06, "loss": 0.79720962, "num_input_tokens_seen": 23096815, "step": 1090, "time_per_iteration": 2.8977482318878174 }, { "auxiliary_loss_clip": 0.01407611, "auxiliary_loss_mlp": 0.01041664, "balance_loss_clip": 1.05550361, "balance_loss_mlp": 1.03141212, "epoch": 0.13118499368724823, "flos": 28913189218560.0, "grad_norm": 2.292341275730407, "language_loss": 0.77185667, "learning_rate": 3.8936619417816975e-06, "loss": 0.79634947, "num_input_tokens_seen": 23117145, "step": 1091, "time_per_iteration": 2.858304023742676 }, { "auxiliary_loss_clip": 0.01389388, "auxiliary_loss_mlp": 0.01039521, "balance_loss_clip": 1.06498075, "balance_loss_mlp": 1.02975202, "epoch": 0.13130523657788734, "flos": 14283398206080.0, "grad_norm": 1.8584823688509549, "language_loss": 0.7175324, "learning_rate": 3.8934111784931015e-06, "loss": 0.74182153, "num_input_tokens_seen": 23134595, "step": 1092, "time_per_iteration": 3.019113779067993 }, { "auxiliary_loss_clip": 0.01252166, "auxiliary_loss_mlp": 0.0100253, "balance_loss_clip": 1.04038727, "balance_loss_mlp": 1.00036037, "epoch": 0.13142547946852642, "flos": 70174155519360.0, "grad_norm": 1.0256689483183399, "language_loss": 0.59036577, "learning_rate": 3.893160127975535e-06, "loss": 0.61291277, "num_input_tokens_seen": 23195285, "step": 1093, "time_per_iteration": 3.4826316833496094 }, { "auxiliary_loss_clip": 0.01447332, "auxiliary_loss_mlp": 0.01034904, "balance_loss_clip": 1.06019521, "balance_loss_mlp": 1.02484894, "epoch": 0.1315457223591655, "flos": 45805998844800.0, "grad_norm": 2.9717537657156368, "language_loss": 0.8137148, "learning_rate": 3.8929087902670826e-06, "loss": 0.83853722, "num_input_tokens_seen": 23216915, "step": 1094, "time_per_iteration": 3.0307607650756836 }, { "auxiliary_loss_clip": 0.01200045, "auxiliary_loss_mlp": 0.01002839, "balance_loss_clip": 1.03841281, "balance_loss_mlp": 1.00075293, "epoch": 0.13166596524980462, "flos": 62881165820160.0, "grad_norm": 0.917975844055761, "language_loss": 0.60622758, "learning_rate": 3.8926571654058715e-06, "loss": 0.62825638, "num_input_tokens_seen": 23273560, "step": 1095, "time_per_iteration": 3.2036795616149902 }, { "auxiliary_loss_clip": 0.01387298, "auxiliary_loss_mlp": 0.0103693, "balance_loss_clip": 1.06277812, "balance_loss_mlp": 1.02709484, "epoch": 0.1317862081404437, "flos": 23586523793280.0, "grad_norm": 2.720720070806504, "language_loss": 0.77518898, "learning_rate": 3.892405253430074e-06, "loss": 0.79943132, "num_input_tokens_seen": 23291080, "step": 1096, "time_per_iteration": 2.7363550662994385 }, { "auxiliary_loss_clip": 0.01346619, "auxiliary_loss_mlp": 0.02581243, "balance_loss_clip": 1.06531835, "balance_loss_mlp": 1.00053859, "epoch": 0.13190645103108278, "flos": 20260764460800.0, "grad_norm": 1.9115761244534688, "language_loss": 0.82445014, "learning_rate": 3.892153054377904e-06, "loss": 0.86372876, "num_input_tokens_seen": 23308485, "step": 1097, "time_per_iteration": 2.73370623588562 }, { "auxiliary_loss_clip": 0.01395237, "auxiliary_loss_mlp": 0.01008192, "balance_loss_clip": 1.03317952, "balance_loss_mlp": 1.00578403, "epoch": 0.13202669392172187, "flos": 53455440136320.0, "grad_norm": 0.9306993470695897, "language_loss": 0.59371156, "learning_rate": 3.891900568287619e-06, "loss": 0.61774588, "num_input_tokens_seen": 23360870, "step": 1098, "time_per_iteration": 3.2817232608795166 }, { "auxiliary_loss_clip": 0.01362348, "auxiliary_loss_mlp": 0.01044328, "balance_loss_clip": 1.0637722, "balance_loss_mlp": 1.03246641, "epoch": 0.13214693681236098, "flos": 15851293845120.0, "grad_norm": 2.6858614546150945, "language_loss": 0.72370529, "learning_rate": 3.891647795197523e-06, "loss": 0.7477721, "num_input_tokens_seen": 23376910, "step": 1099, "time_per_iteration": 2.949690103530884 }, { "auxiliary_loss_clip": 0.01395689, "auxiliary_loss_mlp": 0.01039604, "balance_loss_clip": 1.05908823, "balance_loss_mlp": 1.02773058, "epoch": 0.13226717970300006, "flos": 19353840940800.0, "grad_norm": 2.558989441374826, "language_loss": 0.68553627, "learning_rate": 3.8913947351459605e-06, "loss": 0.70988917, "num_input_tokens_seen": 23394450, "step": 1100, "time_per_iteration": 2.802950143814087 }, { "auxiliary_loss_clip": 0.01241028, "auxiliary_loss_mlp": 0.01044203, "balance_loss_clip": 1.06605172, "balance_loss_mlp": 1.03517866, "epoch": 0.13238742259363914, "flos": 20698084546560.0, "grad_norm": 2.174380955770455, "language_loss": 0.67966318, "learning_rate": 3.89114138817132e-06, "loss": 0.70251554, "num_input_tokens_seen": 23411115, "step": 1101, "time_per_iteration": 2.695899486541748 }, { "auxiliary_loss_clip": 0.01290253, "auxiliary_loss_mlp": 0.01032284, "balance_loss_clip": 1.06748486, "balance_loss_mlp": 1.02367115, "epoch": 0.13250766548427825, "flos": 21032449274880.0, "grad_norm": 1.899023120367045, "language_loss": 0.84300721, "learning_rate": 3.890887754312035e-06, "loss": 0.86623257, "num_input_tokens_seen": 23429360, "step": 1102, "time_per_iteration": 2.718205690383911 }, { "auxiliary_loss_clip": 0.0133688, "auxiliary_loss_mlp": 0.01032485, "balance_loss_clip": 1.06145787, "balance_loss_mlp": 1.02266788, "epoch": 0.13262790837491734, "flos": 22637871648000.0, "grad_norm": 2.9351202053739063, "language_loss": 0.87825203, "learning_rate": 3.890633833606581e-06, "loss": 0.90194571, "num_input_tokens_seen": 23449050, "step": 1103, "time_per_iteration": 2.6689302921295166 }, { "auxiliary_loss_clip": 0.01292518, "auxiliary_loss_mlp": 0.01041183, "balance_loss_clip": 1.07025146, "balance_loss_mlp": 1.03108621, "epoch": 0.13274815126555642, "flos": 19683141851520.0, "grad_norm": 2.117535459900211, "language_loss": 0.69396079, "learning_rate": 3.890379626093477e-06, "loss": 0.71729779, "num_input_tokens_seen": 23468800, "step": 1104, "time_per_iteration": 2.6978399753570557 }, { "auxiliary_loss_clip": 0.01429205, "auxiliary_loss_mlp": 0.01038335, "balance_loss_clip": 1.05842233, "balance_loss_mlp": 1.02817857, "epoch": 0.1328683941561955, "flos": 21317687176320.0, "grad_norm": 2.0546260101673686, "language_loss": 0.92404813, "learning_rate": 3.890125131811287e-06, "loss": 0.94872355, "num_input_tokens_seen": 23486850, "step": 1105, "time_per_iteration": 2.7551114559173584 }, { "auxiliary_loss_clip": 0.01329829, "auxiliary_loss_mlp": 0.01037718, "balance_loss_clip": 1.05689156, "balance_loss_mlp": 1.02849102, "epoch": 0.1329886370468346, "flos": 13699131580800.0, "grad_norm": 3.321306453692712, "language_loss": 0.75449508, "learning_rate": 3.889870350798618e-06, "loss": 0.77817053, "num_input_tokens_seen": 23504195, "step": 1106, "time_per_iteration": 2.751676321029663 }, { "auxiliary_loss_clip": 0.01242234, "auxiliary_loss_mlp": 0.01036722, "balance_loss_clip": 1.06645429, "balance_loss_mlp": 1.0270009, "epoch": 0.1331088799374737, "flos": 21032413361280.0, "grad_norm": 2.4947137202395644, "language_loss": 0.7866652, "learning_rate": 3.889615283094119e-06, "loss": 0.80945474, "num_input_tokens_seen": 23523385, "step": 1107, "time_per_iteration": 3.630387306213379 }, { "auxiliary_loss_clip": 0.012478, "auxiliary_loss_mlp": 0.01040493, "balance_loss_clip": 1.06724739, "balance_loss_mlp": 1.02953792, "epoch": 0.13322912282811278, "flos": 18260432985600.0, "grad_norm": 2.726398752553281, "language_loss": 0.84623492, "learning_rate": 3.889359928736485e-06, "loss": 0.86911786, "num_input_tokens_seen": 23541330, "step": 1108, "time_per_iteration": 2.5981311798095703 }, { "auxiliary_loss_clip": 0.01338555, "auxiliary_loss_mlp": 0.02578688, "balance_loss_clip": 1.06523967, "balance_loss_mlp": 1.00053966, "epoch": 0.1333493657187519, "flos": 24460876656000.0, "grad_norm": 3.4998102597324343, "language_loss": 0.91094184, "learning_rate": 3.889104287764451e-06, "loss": 0.95011425, "num_input_tokens_seen": 23561705, "step": 1109, "time_per_iteration": 2.7944018840789795 }, { "auxiliary_loss_clip": 0.01334355, "auxiliary_loss_mlp": 0.0104451, "balance_loss_clip": 1.06362414, "balance_loss_mlp": 1.03456187, "epoch": 0.13346960860939097, "flos": 22158930677760.0, "grad_norm": 3.276376705547758, "language_loss": 0.90604055, "learning_rate": 3.888848360216798e-06, "loss": 0.92982924, "num_input_tokens_seen": 23579350, "step": 1110, "time_per_iteration": 3.7408578395843506 }, { "auxiliary_loss_clip": 0.01263262, "auxiliary_loss_mlp": 0.01005174, "balance_loss_clip": 1.04512239, "balance_loss_mlp": 1.00290871, "epoch": 0.13358985150003005, "flos": 67931212608000.0, "grad_norm": 0.7963980704956061, "language_loss": 0.56586409, "learning_rate": 3.888592146132351e-06, "loss": 0.58854842, "num_input_tokens_seen": 23640620, "step": 1111, "time_per_iteration": 4.333545207977295 }, { "auxiliary_loss_clip": 0.01294156, "auxiliary_loss_mlp": 0.01034811, "balance_loss_clip": 1.06857014, "balance_loss_mlp": 1.0257448, "epoch": 0.13371009439066917, "flos": 26834284742400.0, "grad_norm": 1.8954746084083058, "language_loss": 0.78733325, "learning_rate": 3.888335645549978e-06, "loss": 0.81062299, "num_input_tokens_seen": 23661040, "step": 1112, "time_per_iteration": 2.7247891426086426 }, { "auxiliary_loss_clip": 0.0124572, "auxiliary_loss_mlp": 0.01047801, "balance_loss_clip": 1.06932831, "balance_loss_mlp": 1.03893173, "epoch": 0.13383033728130825, "flos": 26322844942080.0, "grad_norm": 32.738588449575495, "language_loss": 0.81451762, "learning_rate": 3.888078858508588e-06, "loss": 0.83745283, "num_input_tokens_seen": 23680900, "step": 1113, "time_per_iteration": 2.640674114227295 }, { "auxiliary_loss_clip": 0.01340642, "auxiliary_loss_mlp": 0.01036466, "balance_loss_clip": 1.06740451, "balance_loss_mlp": 1.02620745, "epoch": 0.13395058017194733, "flos": 22563931501440.0, "grad_norm": 1.7774901554392817, "language_loss": 0.8419975, "learning_rate": 3.8878217850471365e-06, "loss": 0.86576855, "num_input_tokens_seen": 23700815, "step": 1114, "time_per_iteration": 2.7824339866638184 }, { "auxiliary_loss_clip": 0.01250378, "auxiliary_loss_mlp": 0.01040336, "balance_loss_clip": 1.07203531, "balance_loss_mlp": 1.03069162, "epoch": 0.13407082306258641, "flos": 25810938264960.0, "grad_norm": 2.7712476798183645, "language_loss": 0.74165183, "learning_rate": 3.887564425204621e-06, "loss": 0.76455897, "num_input_tokens_seen": 23722500, "step": 1115, "time_per_iteration": 3.6334645748138428 }, { "auxiliary_loss_clip": 0.01294442, "auxiliary_loss_mlp": 0.01002468, "balance_loss_clip": 1.03172398, "balance_loss_mlp": 1.00020313, "epoch": 0.13419106595322552, "flos": 68338365269760.0, "grad_norm": 0.8452824771640849, "language_loss": 0.54692215, "learning_rate": 3.887306779020083e-06, "loss": 0.56989115, "num_input_tokens_seen": 23777155, "step": 1116, "time_per_iteration": 3.2159583568573 }, { "auxiliary_loss_clip": 0.01300448, "auxiliary_loss_mlp": 0.01042108, "balance_loss_clip": 1.07105505, "balance_loss_mlp": 1.03230309, "epoch": 0.1343113088438646, "flos": 20449080489600.0, "grad_norm": 2.4310785462084366, "language_loss": 0.70732743, "learning_rate": 3.887048846532608e-06, "loss": 0.730753, "num_input_tokens_seen": 23794130, "step": 1117, "time_per_iteration": 2.6651527881622314 }, { "auxiliary_loss_clip": 0.01301163, "auxiliary_loss_mlp": 0.01005687, "balance_loss_clip": 1.03343248, "balance_loss_mlp": 1.00357652, "epoch": 0.1344315517345037, "flos": 67389784951680.0, "grad_norm": 0.7555369418193514, "language_loss": 0.58107632, "learning_rate": 3.8867906277813224e-06, "loss": 0.60414481, "num_input_tokens_seen": 23852285, "step": 1118, "time_per_iteration": 3.203640937805176 }, { "auxiliary_loss_clip": 0.01298724, "auxiliary_loss_mlp": 0.02578231, "balance_loss_clip": 1.06760991, "balance_loss_mlp": 1.00058818, "epoch": 0.1345517946251428, "flos": 40734442788480.0, "grad_norm": 2.2775733342100564, "language_loss": 0.74062473, "learning_rate": 3.886532122805399e-06, "loss": 0.77939427, "num_input_tokens_seen": 23874765, "step": 1119, "time_per_iteration": 2.8434252738952637 }, { "auxiliary_loss_clip": 0.01475784, "auxiliary_loss_mlp": 0.01036824, "balance_loss_clip": 1.0535419, "balance_loss_mlp": 1.0266192, "epoch": 0.13467203751578188, "flos": 22816850140800.0, "grad_norm": 2.0565895518162423, "language_loss": 0.89625829, "learning_rate": 3.886273331644053e-06, "loss": 0.92138433, "num_input_tokens_seen": 23893635, "step": 1120, "time_per_iteration": 2.9582810401916504 }, { "auxiliary_loss_clip": 0.0143476, "auxiliary_loss_mlp": 0.01038936, "balance_loss_clip": 1.06272984, "balance_loss_mlp": 1.02922606, "epoch": 0.13479228040642097, "flos": 17091576512640.0, "grad_norm": 8.289654547913319, "language_loss": 0.82399464, "learning_rate": 3.886014254336542e-06, "loss": 0.84873164, "num_input_tokens_seen": 23910110, "step": 1121, "time_per_iteration": 2.7270584106445312 }, { "auxiliary_loss_clip": 0.0129195, "auxiliary_loss_mlp": 0.0103932, "balance_loss_clip": 1.06513476, "balance_loss_mlp": 1.02961087, "epoch": 0.13491252329706005, "flos": 23730525417600.0, "grad_norm": 1.8137252214759514, "language_loss": 0.92645037, "learning_rate": 3.885754890922168e-06, "loss": 0.94976306, "num_input_tokens_seen": 23930440, "step": 1122, "time_per_iteration": 2.7240183353424072 }, { "auxiliary_loss_clip": 0.01530572, "auxiliary_loss_mlp": 0.01041192, "balance_loss_clip": 1.0564605, "balance_loss_mlp": 1.03132761, "epoch": 0.13503276618769916, "flos": 34127058960000.0, "grad_norm": 1.8820510642790582, "language_loss": 0.78514177, "learning_rate": 3.885495241440277e-06, "loss": 0.81085944, "num_input_tokens_seen": 23954535, "step": 1123, "time_per_iteration": 3.0027265548706055 }, { "auxiliary_loss_clip": 0.01244227, "auxiliary_loss_mlp": 0.01036593, "balance_loss_clip": 1.0683558, "balance_loss_mlp": 1.02727056, "epoch": 0.13515300907833824, "flos": 17712328377600.0, "grad_norm": 2.528916332846907, "language_loss": 0.74473965, "learning_rate": 3.885235305930257e-06, "loss": 0.76754791, "num_input_tokens_seen": 23972735, "step": 1124, "time_per_iteration": 2.8175761699676514 }, { "auxiliary_loss_clip": 0.01387828, "auxiliary_loss_mlp": 0.01044543, "balance_loss_clip": 1.06174064, "balance_loss_mlp": 1.03526831, "epoch": 0.13527325196897733, "flos": 20260872201600.0, "grad_norm": 2.393704098493691, "language_loss": 0.85246992, "learning_rate": 3.884975084431539e-06, "loss": 0.87679362, "num_input_tokens_seen": 23987685, "step": 1125, "time_per_iteration": 2.7183263301849365 }, { "auxiliary_loss_clip": 0.01290188, "auxiliary_loss_mlp": 0.02580475, "balance_loss_clip": 1.06737602, "balance_loss_mlp": 1.00065815, "epoch": 0.13539349485961644, "flos": 18186492839040.0, "grad_norm": 4.44622373560651, "language_loss": 0.91513228, "learning_rate": 3.8847145769836e-06, "loss": 0.95383888, "num_input_tokens_seen": 24004105, "step": 1126, "time_per_iteration": 2.811180830001831 }, { "auxiliary_loss_clip": 0.01246984, "auxiliary_loss_mlp": 0.01045395, "balance_loss_clip": 1.06983042, "balance_loss_mlp": 1.03609681, "epoch": 0.13551373775025552, "flos": 19317463441920.0, "grad_norm": 2.702149799799932, "language_loss": 0.66657043, "learning_rate": 3.884453783625959e-06, "loss": 0.68949425, "num_input_tokens_seen": 24021715, "step": 1127, "time_per_iteration": 2.609877109527588 }, { "auxiliary_loss_clip": 0.01336712, "auxiliary_loss_mlp": 0.01038214, "balance_loss_clip": 1.06534612, "balance_loss_mlp": 1.02898693, "epoch": 0.1356339806408946, "flos": 20850813175680.0, "grad_norm": 7.8254915271544165, "language_loss": 0.84967041, "learning_rate": 3.884192704398176e-06, "loss": 0.87341964, "num_input_tokens_seen": 24038915, "step": 1128, "time_per_iteration": 2.665424346923828 }, { "auxiliary_loss_clip": 0.01259249, "auxiliary_loss_mlp": 0.01036644, "balance_loss_clip": 1.06604838, "balance_loss_mlp": 1.02666056, "epoch": 0.13575422353153369, "flos": 50476037696640.0, "grad_norm": 3.168482599447299, "language_loss": 0.74991333, "learning_rate": 3.883931339339858e-06, "loss": 0.77287233, "num_input_tokens_seen": 24063300, "step": 1129, "time_per_iteration": 2.907115936279297 }, { "auxiliary_loss_clip": 0.0129769, "auxiliary_loss_mlp": 0.01032827, "balance_loss_clip": 1.06683099, "balance_loss_mlp": 1.02283132, "epoch": 0.1358744664221728, "flos": 18150797698560.0, "grad_norm": 1.8512742124744874, "language_loss": 0.78848314, "learning_rate": 3.883669688490654e-06, "loss": 0.81178832, "num_input_tokens_seen": 24081070, "step": 1130, "time_per_iteration": 2.621971607208252 }, { "auxiliary_loss_clip": 0.01329653, "auxiliary_loss_mlp": 0.02580181, "balance_loss_clip": 1.06033778, "balance_loss_mlp": 1.00064003, "epoch": 0.13599470931281188, "flos": 18442966924800.0, "grad_norm": 2.881299195851193, "language_loss": 0.85314608, "learning_rate": 3.883407751890256e-06, "loss": 0.89224434, "num_input_tokens_seen": 24099675, "step": 1131, "time_per_iteration": 2.658878803253174 }, { "auxiliary_loss_clip": 0.01352713, "auxiliary_loss_mlp": 0.01042585, "balance_loss_clip": 1.05960345, "balance_loss_mlp": 1.03220224, "epoch": 0.13611495220345096, "flos": 26680766014080.0, "grad_norm": 1.6040664008497794, "language_loss": 0.85595262, "learning_rate": 3.8831455295783994e-06, "loss": 0.87990558, "num_input_tokens_seen": 24118925, "step": 1132, "time_per_iteration": 2.739625930786133 }, { "auxiliary_loss_clip": 0.01335061, "auxiliary_loss_mlp": 0.01039131, "balance_loss_clip": 1.06263649, "balance_loss_mlp": 1.02980864, "epoch": 0.13623519509409007, "flos": 21686238673920.0, "grad_norm": 1.6711946218511649, "language_loss": 0.74401051, "learning_rate": 3.882883021594864e-06, "loss": 0.76775241, "num_input_tokens_seen": 24137065, "step": 1133, "time_per_iteration": 3.591701030731201 }, { "auxiliary_loss_clip": 0.01385784, "auxiliary_loss_mlp": 0.01039081, "balance_loss_clip": 1.0633471, "balance_loss_mlp": 1.02975917, "epoch": 0.13635543798472916, "flos": 14830389492480.0, "grad_norm": 2.027740822976571, "language_loss": 0.87268341, "learning_rate": 3.8826202279794705e-06, "loss": 0.89693201, "num_input_tokens_seen": 24154125, "step": 1134, "time_per_iteration": 2.738779306411743 }, { "auxiliary_loss_clip": 0.01246832, "auxiliary_loss_mlp": 0.01046328, "balance_loss_clip": 1.06996536, "balance_loss_mlp": 1.03548598, "epoch": 0.13647568087536824, "flos": 22890323410560.0, "grad_norm": 3.02277750481882, "language_loss": 0.70237637, "learning_rate": 3.882357148772085e-06, "loss": 0.725308, "num_input_tokens_seen": 24171550, "step": 1135, "time_per_iteration": 2.636211395263672 }, { "auxiliary_loss_clip": 0.01379104, "auxiliary_loss_mlp": 0.01039354, "balance_loss_clip": 1.06166828, "balance_loss_mlp": 1.02944744, "epoch": 0.13659592376600732, "flos": 19937927998080.0, "grad_norm": 2.7748438390763632, "language_loss": 0.84489805, "learning_rate": 3.882093784012617e-06, "loss": 0.86908257, "num_input_tokens_seen": 24190190, "step": 1136, "time_per_iteration": 3.5408265590667725 }, { "auxiliary_loss_clip": 0.01341196, "auxiliary_loss_mlp": 0.01037447, "balance_loss_clip": 1.0654825, "balance_loss_mlp": 1.02824426, "epoch": 0.13671616665664643, "flos": 21428579439360.0, "grad_norm": 2.0493509896375954, "language_loss": 0.84032804, "learning_rate": 3.881830133741019e-06, "loss": 0.86411446, "num_input_tokens_seen": 24209055, "step": 1137, "time_per_iteration": 3.60079288482666 }, { "auxiliary_loss_clip": 0.01396176, "auxiliary_loss_mlp": 0.01038188, "balance_loss_clip": 1.06800008, "balance_loss_mlp": 1.02826369, "epoch": 0.13683640954728551, "flos": 22778138257920.0, "grad_norm": 2.0032663930666934, "language_loss": 0.76268166, "learning_rate": 3.881566197997285e-06, "loss": 0.78702533, "num_input_tokens_seen": 24225490, "step": 1138, "time_per_iteration": 2.82951021194458 }, { "auxiliary_loss_clip": 0.01335771, "auxiliary_loss_mlp": 0.01040205, "balance_loss_clip": 1.06614339, "balance_loss_mlp": 1.03161037, "epoch": 0.1369566524379246, "flos": 21725884310400.0, "grad_norm": 2.5884105700516966, "language_loss": 0.74954677, "learning_rate": 3.881301976821456e-06, "loss": 0.77330649, "num_input_tokens_seen": 24245520, "step": 1139, "time_per_iteration": 2.7211484909057617 }, { "auxiliary_loss_clip": 0.012875, "auxiliary_loss_mlp": 0.01033169, "balance_loss_clip": 1.06480312, "balance_loss_mlp": 1.02394176, "epoch": 0.1370768953285637, "flos": 18624459369600.0, "grad_norm": 2.362134967045639, "language_loss": 0.90862304, "learning_rate": 3.881037470253612e-06, "loss": 0.93182969, "num_input_tokens_seen": 24265035, "step": 1140, "time_per_iteration": 3.5152883529663086 }, { "auxiliary_loss_clip": 0.01436222, "auxiliary_loss_mlp": 0.01047304, "balance_loss_clip": 1.05953455, "balance_loss_mlp": 1.03738558, "epoch": 0.1371971382192028, "flos": 14939521989120.0, "grad_norm": 2.736708997979135, "language_loss": 0.79619569, "learning_rate": 3.88077267833388e-06, "loss": 0.82103097, "num_input_tokens_seen": 24281550, "step": 1141, "time_per_iteration": 2.78802752494812 }, { "auxiliary_loss_clip": 0.0143371, "auxiliary_loss_mlp": 0.01037254, "balance_loss_clip": 1.05917835, "balance_loss_mlp": 1.02729368, "epoch": 0.13731738110984187, "flos": 19023785844480.0, "grad_norm": 2.096980003319845, "language_loss": 0.84009826, "learning_rate": 3.880507601102427e-06, "loss": 0.86480784, "num_input_tokens_seen": 24299485, "step": 1142, "time_per_iteration": 2.7342684268951416 }, { "auxiliary_loss_clip": 0.0124504, "auxiliary_loss_mlp": 0.01037369, "balance_loss_clip": 1.07255149, "balance_loss_mlp": 1.02808857, "epoch": 0.13743762400048098, "flos": 18187462506240.0, "grad_norm": 2.351620964382535, "language_loss": 0.81967211, "learning_rate": 3.880242238599467e-06, "loss": 0.84249622, "num_input_tokens_seen": 24316010, "step": 1143, "time_per_iteration": 2.6631782054901123 }, { "auxiliary_loss_clip": 0.01242106, "auxiliary_loss_mlp": 0.01034842, "balance_loss_clip": 1.06793737, "balance_loss_mlp": 1.02451801, "epoch": 0.13755786689112007, "flos": 21031982398080.0, "grad_norm": 1.663897023011702, "language_loss": 0.83078283, "learning_rate": 3.879976590865254e-06, "loss": 0.85355234, "num_input_tokens_seen": 24335465, "step": 1144, "time_per_iteration": 2.661515712738037 }, { "auxiliary_loss_clip": 0.01337964, "auxiliary_loss_mlp": 0.01039966, "balance_loss_clip": 1.06714511, "balance_loss_mlp": 1.02972579, "epoch": 0.13767810978175915, "flos": 21360636864000.0, "grad_norm": 2.4542638780236175, "language_loss": 0.8701216, "learning_rate": 3.879710657940087e-06, "loss": 0.89390087, "num_input_tokens_seen": 24354415, "step": 1145, "time_per_iteration": 2.793872833251953 }, { "auxiliary_loss_clip": 0.01287181, "auxiliary_loss_mlp": 0.01032683, "balance_loss_clip": 1.06588161, "balance_loss_mlp": 1.02316403, "epoch": 0.13779835267239823, "flos": 30592084861440.0, "grad_norm": 2.0911328907835656, "language_loss": 0.70162308, "learning_rate": 3.879444439864308e-06, "loss": 0.72482175, "num_input_tokens_seen": 24373990, "step": 1146, "time_per_iteration": 2.7725989818573 }, { "auxiliary_loss_clip": 0.01288982, "auxiliary_loss_mlp": 0.0258227, "balance_loss_clip": 1.06322682, "balance_loss_mlp": 1.00084102, "epoch": 0.13791859556303734, "flos": 22669867687680.0, "grad_norm": 2.12524975082109, "language_loss": 0.85832077, "learning_rate": 3.879177936678301e-06, "loss": 0.89703327, "num_input_tokens_seen": 24392995, "step": 1147, "time_per_iteration": 2.654008626937866 }, { "auxiliary_loss_clip": 0.01340593, "auxiliary_loss_mlp": 0.01034247, "balance_loss_clip": 1.06325948, "balance_loss_mlp": 1.02479386, "epoch": 0.13803883845367643, "flos": 35224166016000.0, "grad_norm": 1.9719479432006661, "language_loss": 0.77208126, "learning_rate": 3.878911148422496e-06, "loss": 0.79582959, "num_input_tokens_seen": 24414470, "step": 1148, "time_per_iteration": 2.8433077335357666 }, { "auxiliary_loss_clip": 0.01295613, "auxiliary_loss_mlp": 0.0104426, "balance_loss_clip": 1.06974959, "balance_loss_mlp": 1.03485417, "epoch": 0.1381590813443155, "flos": 32014542332160.0, "grad_norm": 3.2570887080985433, "language_loss": 0.70508271, "learning_rate": 3.878644075137364e-06, "loss": 0.72848141, "num_input_tokens_seen": 24435120, "step": 1149, "time_per_iteration": 2.7706148624420166 }, { "auxiliary_loss_clip": 0.01370702, "auxiliary_loss_mlp": 0.01040581, "balance_loss_clip": 1.05762637, "balance_loss_mlp": 1.03106785, "epoch": 0.13827932423495462, "flos": 17821855923840.0, "grad_norm": 2.759840580936066, "language_loss": 0.79550618, "learning_rate": 3.878376716863418e-06, "loss": 0.819619, "num_input_tokens_seen": 24451420, "step": 1150, "time_per_iteration": 2.7194623947143555 }, { "auxiliary_loss_clip": 0.01335412, "auxiliary_loss_mlp": 0.01037742, "balance_loss_clip": 1.06176054, "balance_loss_mlp": 1.02768052, "epoch": 0.1383995671255937, "flos": 19427098728960.0, "grad_norm": 2.1997708888708853, "language_loss": 0.71952426, "learning_rate": 3.878109073641219e-06, "loss": 0.74325579, "num_input_tokens_seen": 24470450, "step": 1151, "time_per_iteration": 2.6973438262939453 }, { "auxiliary_loss_clip": 0.01436015, "auxiliary_loss_mlp": 0.01044445, "balance_loss_clip": 1.06308901, "balance_loss_mlp": 1.03545022, "epoch": 0.13851981001623279, "flos": 28296603331200.0, "grad_norm": 1.6537559566663962, "language_loss": 0.81019962, "learning_rate": 3.877841145511366e-06, "loss": 0.83500421, "num_input_tokens_seen": 24493190, "step": 1152, "time_per_iteration": 2.8671274185180664 }, { "auxiliary_loss_clip": 0.01254527, "auxiliary_loss_mlp": 0.01044144, "balance_loss_clip": 1.06618059, "balance_loss_mlp": 1.03416646, "epoch": 0.13864005290687187, "flos": 21213079793280.0, "grad_norm": 2.01112195580312, "language_loss": 0.82961023, "learning_rate": 3.8775729325145035e-06, "loss": 0.852597, "num_input_tokens_seen": 24512425, "step": 1153, "time_per_iteration": 2.6836555004119873 }, { "auxiliary_loss_clip": 0.01286731, "auxiliary_loss_mlp": 0.01002439, "balance_loss_clip": 1.03682041, "balance_loss_mlp": 0.99997121, "epoch": 0.13876029579751098, "flos": 71653389413760.0, "grad_norm": 0.7999059977067157, "language_loss": 0.64701462, "learning_rate": 3.877304434691321e-06, "loss": 0.66990626, "num_input_tokens_seen": 24579275, "step": 1154, "time_per_iteration": 3.422255277633667 }, { "auxiliary_loss_clip": 0.01391284, "auxiliary_loss_mlp": 0.01037007, "balance_loss_clip": 1.06381285, "balance_loss_mlp": 1.02792358, "epoch": 0.13888053868815006, "flos": 21941348042880.0, "grad_norm": 1.778548295453032, "language_loss": 0.7955125, "learning_rate": 3.877035652082548e-06, "loss": 0.81979543, "num_input_tokens_seen": 24598720, "step": 1155, "time_per_iteration": 2.8107852935791016 }, { "auxiliary_loss_clip": 0.0133376, "auxiliary_loss_mlp": 0.0105045, "balance_loss_clip": 1.06445801, "balance_loss_mlp": 1.04034066, "epoch": 0.13900078157878915, "flos": 19608627087360.0, "grad_norm": 1.7463407663893655, "language_loss": 0.85311484, "learning_rate": 3.87676658472896e-06, "loss": 0.87695694, "num_input_tokens_seen": 24617530, "step": 1156, "time_per_iteration": 2.6930294036865234 }, { "auxiliary_loss_clip": 0.01288544, "auxiliary_loss_mlp": 0.01042347, "balance_loss_clip": 1.0626229, "balance_loss_mlp": 1.03214836, "epoch": 0.13912102446942826, "flos": 22638051216000.0, "grad_norm": 1.993002499743212, "language_loss": 0.85373634, "learning_rate": 3.876497232671372e-06, "loss": 0.87704527, "num_input_tokens_seen": 24637485, "step": 1157, "time_per_iteration": 2.6358494758605957 }, { "auxiliary_loss_clip": 0.01438611, "auxiliary_loss_mlp": 0.01037087, "balance_loss_clip": 1.05801392, "balance_loss_mlp": 1.02762747, "epoch": 0.13924126736006734, "flos": 29643324975360.0, "grad_norm": 2.3670781864294397, "language_loss": 0.83453226, "learning_rate": 3.876227595950647e-06, "loss": 0.85928917, "num_input_tokens_seen": 24656915, "step": 1158, "time_per_iteration": 3.7373085021972656 }, { "auxiliary_loss_clip": 0.0123785, "auxiliary_loss_mlp": 0.01039352, "balance_loss_clip": 1.06601238, "balance_loss_mlp": 1.02989876, "epoch": 0.13936151025070642, "flos": 27417653527680.0, "grad_norm": 1.934696429293343, "language_loss": 0.78893954, "learning_rate": 3.875957674607686e-06, "loss": 0.81171155, "num_input_tokens_seen": 24679190, "step": 1159, "time_per_iteration": 2.6655707359313965 }, { "auxiliary_loss_clip": 0.01282803, "auxiliary_loss_mlp": 0.0258444, "balance_loss_clip": 1.06079459, "balance_loss_mlp": 1.00089645, "epoch": 0.1394817531413455, "flos": 16399326625920.0, "grad_norm": 2.304027128232615, "language_loss": 0.88142115, "learning_rate": 3.8756874686834386e-06, "loss": 0.9200936, "num_input_tokens_seen": 24697405, "step": 1160, "time_per_iteration": 2.6889147758483887 }, { "auxiliary_loss_clip": 0.01289091, "auxiliary_loss_mlp": 0.02583655, "balance_loss_clip": 1.06243479, "balance_loss_mlp": 1.00085592, "epoch": 0.13960199603198462, "flos": 30922319525760.0, "grad_norm": 1.7938193879242268, "language_loss": 0.80449772, "learning_rate": 3.875416978218893e-06, "loss": 0.84322518, "num_input_tokens_seen": 24720600, "step": 1161, "time_per_iteration": 2.7121808528900146 }, { "auxiliary_loss_clip": 0.01357067, "auxiliary_loss_mlp": 0.01039016, "balance_loss_clip": 1.05940795, "balance_loss_mlp": 1.02931213, "epoch": 0.1397222389226237, "flos": 18113773754880.0, "grad_norm": 2.334614924009087, "language_loss": 0.82525563, "learning_rate": 3.8751462032550835e-06, "loss": 0.8492164, "num_input_tokens_seen": 24737605, "step": 1162, "time_per_iteration": 2.7338407039642334 }, { "auxiliary_loss_clip": 0.01330737, "auxiliary_loss_mlp": 0.01046115, "balance_loss_clip": 1.06478524, "balance_loss_mlp": 1.03644109, "epoch": 0.13984248181326278, "flos": 16872772815360.0, "grad_norm": 2.113999324237088, "language_loss": 0.83327878, "learning_rate": 3.874875143833085e-06, "loss": 0.8570472, "num_input_tokens_seen": 24755845, "step": 1163, "time_per_iteration": 4.455169439315796 }, { "auxiliary_loss_clip": 0.01290528, "auxiliary_loss_mlp": 0.01034141, "balance_loss_clip": 1.06657314, "balance_loss_mlp": 1.02421141, "epoch": 0.1399627247039019, "flos": 54121401267840.0, "grad_norm": 1.7945904792970173, "language_loss": 0.68931532, "learning_rate": 3.874603799994019e-06, "loss": 0.71256196, "num_input_tokens_seen": 24779380, "step": 1164, "time_per_iteration": 3.0180506706237793 }, { "auxiliary_loss_clip": 0.01379173, "auxiliary_loss_mlp": 0.01039319, "balance_loss_clip": 1.05703187, "balance_loss_mlp": 1.03050971, "epoch": 0.14008296759454097, "flos": 11765521618560.0, "grad_norm": 5.239547867787885, "language_loss": 0.86828625, "learning_rate": 3.874332171779046e-06, "loss": 0.89247113, "num_input_tokens_seen": 24794260, "step": 1165, "time_per_iteration": 2.8194854259490967 }, { "auxiliary_loss_clip": 0.01382823, "auxiliary_loss_mlp": 0.01034814, "balance_loss_clip": 1.05824637, "balance_loss_mlp": 1.02583742, "epoch": 0.14020321048518006, "flos": 22017514832640.0, "grad_norm": 2.464453077999299, "language_loss": 0.75727749, "learning_rate": 3.874060259229373e-06, "loss": 0.78145385, "num_input_tokens_seen": 24815835, "step": 1166, "time_per_iteration": 3.653068780899048 }, { "auxiliary_loss_clip": 0.01290366, "auxiliary_loss_mlp": 0.01042018, "balance_loss_clip": 1.06762862, "balance_loss_mlp": 1.03233767, "epoch": 0.14032345337581917, "flos": 23404313076480.0, "grad_norm": 2.2608457006764167, "language_loss": 0.93589455, "learning_rate": 3.873788062386249e-06, "loss": 0.95921838, "num_input_tokens_seen": 24834095, "step": 1167, "time_per_iteration": 2.7514114379882812 }, { "auxiliary_loss_clip": 0.01392689, "auxiliary_loss_mlp": 0.0104125, "balance_loss_clip": 1.06525838, "balance_loss_mlp": 1.03193998, "epoch": 0.14044369626645825, "flos": 29645767100160.0, "grad_norm": 3.0531584789909028, "language_loss": 0.82086897, "learning_rate": 3.873515581290965e-06, "loss": 0.84520835, "num_input_tokens_seen": 24858900, "step": 1168, "time_per_iteration": 2.8313028812408447 }, { "auxiliary_loss_clip": 0.01386521, "auxiliary_loss_mlp": 0.01039813, "balance_loss_clip": 1.0647521, "balance_loss_mlp": 1.03089666, "epoch": 0.14056393915709733, "flos": 18332972501760.0, "grad_norm": 2.1869151629319292, "language_loss": 0.7503643, "learning_rate": 3.8732428159848575e-06, "loss": 0.77462763, "num_input_tokens_seen": 24877875, "step": 1169, "time_per_iteration": 2.686420440673828 }, { "auxiliary_loss_clip": 0.0129065, "auxiliary_loss_mlp": 0.01040543, "balance_loss_clip": 1.06954193, "balance_loss_mlp": 1.03059506, "epoch": 0.14068418204773642, "flos": 26687517770880.0, "grad_norm": 1.92410383940294, "language_loss": 0.78333652, "learning_rate": 3.872969766509304e-06, "loss": 0.80664843, "num_input_tokens_seen": 24898430, "step": 1170, "time_per_iteration": 2.7926411628723145 }, { "auxiliary_loss_clip": 0.01287467, "auxiliary_loss_mlp": 0.01002406, "balance_loss_clip": 1.03307402, "balance_loss_mlp": 1.00034356, "epoch": 0.14080442493837553, "flos": 65259314501760.0, "grad_norm": 0.7654609407625333, "language_loss": 0.55628324, "learning_rate": 3.872696432905726e-06, "loss": 0.57918197, "num_input_tokens_seen": 24959250, "step": 1171, "time_per_iteration": 3.2620954513549805 }, { "auxiliary_loss_clip": 0.01290963, "auxiliary_loss_mlp": 0.01039397, "balance_loss_clip": 1.06282949, "balance_loss_mlp": 1.02927065, "epoch": 0.1409246678290146, "flos": 25776715582080.0, "grad_norm": 3.5929861316836407, "language_loss": 0.7155661, "learning_rate": 3.872422815215589e-06, "loss": 0.73886967, "num_input_tokens_seen": 24978330, "step": 1172, "time_per_iteration": 2.7150886058807373 }, { "auxiliary_loss_clip": 0.0127894, "auxiliary_loss_mlp": 0.01036096, "balance_loss_clip": 1.05651462, "balance_loss_mlp": 1.02580249, "epoch": 0.1410449107196537, "flos": 21868521217920.0, "grad_norm": 2.0028469374832722, "language_loss": 0.74479181, "learning_rate": 3.8721489134803994e-06, "loss": 0.76794219, "num_input_tokens_seen": 24997120, "step": 1173, "time_per_iteration": 2.6243033409118652 }, { "auxiliary_loss_clip": 0.01286288, "auxiliary_loss_mlp": 0.01036533, "balance_loss_clip": 1.06552982, "balance_loss_mlp": 1.02700782, "epoch": 0.1411651536102928, "flos": 16684133564160.0, "grad_norm": 2.123088185214369, "language_loss": 0.72481573, "learning_rate": 3.871874727741707e-06, "loss": 0.74804395, "num_input_tokens_seen": 25014350, "step": 1174, "time_per_iteration": 2.6778087615966797 }, { "auxiliary_loss_clip": 0.0128473, "auxiliary_loss_mlp": 0.01033832, "balance_loss_clip": 1.06749964, "balance_loss_mlp": 1.02511191, "epoch": 0.1412853965009319, "flos": 20992264934400.0, "grad_norm": 1.7678846339249012, "language_loss": 0.96685755, "learning_rate": 3.871600258041108e-06, "loss": 0.99004316, "num_input_tokens_seen": 25033875, "step": 1175, "time_per_iteration": 2.7081143856048584 }, { "auxiliary_loss_clip": 0.01334787, "auxiliary_loss_mlp": 0.01039373, "balance_loss_clip": 1.06158495, "balance_loss_mlp": 1.03000891, "epoch": 0.14140563939157097, "flos": 20335279224960.0, "grad_norm": 3.7198824037387426, "language_loss": 0.86381757, "learning_rate": 3.871325504420238e-06, "loss": 0.88755918, "num_input_tokens_seen": 25052865, "step": 1176, "time_per_iteration": 2.701585292816162 }, { "auxiliary_loss_clip": 0.01237713, "auxiliary_loss_mlp": 0.01032708, "balance_loss_clip": 1.0676465, "balance_loss_mlp": 1.02349329, "epoch": 0.14152588228221005, "flos": 21068826773760.0, "grad_norm": 2.1896544135266907, "language_loss": 0.82061362, "learning_rate": 3.871050466920776e-06, "loss": 0.84331781, "num_input_tokens_seen": 25072770, "step": 1177, "time_per_iteration": 2.795872449874878 }, { "auxiliary_loss_clip": 0.01379444, "auxiliary_loss_mlp": 0.01044283, "balance_loss_clip": 1.057868, "balance_loss_mlp": 1.03518176, "epoch": 0.14164612517284916, "flos": 18223157646720.0, "grad_norm": 1.8838415088974632, "language_loss": 0.79517615, "learning_rate": 3.870775145584447e-06, "loss": 0.81941342, "num_input_tokens_seen": 25090550, "step": 1178, "time_per_iteration": 2.684641122817993 }, { "auxiliary_loss_clip": 0.01343159, "auxiliary_loss_mlp": 0.01040195, "balance_loss_clip": 1.06396604, "balance_loss_mlp": 1.03021681, "epoch": 0.14176636806348825, "flos": 22744454279040.0, "grad_norm": 3.699326328707347, "language_loss": 0.64871299, "learning_rate": 3.8704995404530145e-06, "loss": 0.67254657, "num_input_tokens_seen": 25106175, "step": 1179, "time_per_iteration": 2.7411813735961914 }, { "auxiliary_loss_clip": 0.01235081, "auxiliary_loss_mlp": 0.01035946, "balance_loss_clip": 1.06747746, "balance_loss_mlp": 1.02782178, "epoch": 0.14188661095412733, "flos": 22091095843200.0, "grad_norm": 2.071450114916175, "language_loss": 0.84899968, "learning_rate": 3.87022365156829e-06, "loss": 0.87170994, "num_input_tokens_seen": 25126890, "step": 1180, "time_per_iteration": 2.701616048812866 }, { "auxiliary_loss_clip": 0.01485697, "auxiliary_loss_mlp": 0.01033921, "balance_loss_clip": 1.05690098, "balance_loss_mlp": 1.02513504, "epoch": 0.14200685384476644, "flos": 24352390604160.0, "grad_norm": 2.441751801008685, "language_loss": 0.81036997, "learning_rate": 3.869947478972123e-06, "loss": 0.83556616, "num_input_tokens_seen": 25147915, "step": 1181, "time_per_iteration": 2.921440601348877 }, { "auxiliary_loss_clip": 0.01284749, "auxiliary_loss_mlp": 0.01034407, "balance_loss_clip": 1.06326127, "balance_loss_mlp": 1.02522779, "epoch": 0.14212709673540552, "flos": 24022048199040.0, "grad_norm": 2.1189825072708097, "language_loss": 0.823874, "learning_rate": 3.869671022706412e-06, "loss": 0.84706557, "num_input_tokens_seen": 25166645, "step": 1182, "time_per_iteration": 2.7932496070861816 }, { "auxiliary_loss_clip": 0.01420674, "auxiliary_loss_mlp": 0.01035352, "balance_loss_clip": 1.05251217, "balance_loss_mlp": 1.02600586, "epoch": 0.1422473396260446, "flos": 26431797870720.0, "grad_norm": 2.0395695182890727, "language_loss": 0.65423143, "learning_rate": 3.869394282813092e-06, "loss": 0.6787917, "num_input_tokens_seen": 25185845, "step": 1183, "time_per_iteration": 2.797970771789551 }, { "auxiliary_loss_clip": 0.01392359, "auxiliary_loss_mlp": 0.01035943, "balance_loss_clip": 1.05871284, "balance_loss_mlp": 1.0263468, "epoch": 0.1423675825166837, "flos": 17055306754560.0, "grad_norm": 2.72778973616459, "language_loss": 0.89143813, "learning_rate": 3.869117259334147e-06, "loss": 0.91572118, "num_input_tokens_seen": 25203770, "step": 1184, "time_per_iteration": 2.6758124828338623 }, { "auxiliary_loss_clip": 0.01283126, "auxiliary_loss_mlp": 0.01035984, "balance_loss_clip": 1.06228673, "balance_loss_mlp": 1.02685261, "epoch": 0.1424878254073228, "flos": 17929480049280.0, "grad_norm": 1.7474066776101012, "language_loss": 0.82094824, "learning_rate": 3.868839952311599e-06, "loss": 0.8441394, "num_input_tokens_seen": 25221725, "step": 1185, "time_per_iteration": 3.6009488105773926 }, { "auxiliary_loss_clip": 0.01331146, "auxiliary_loss_mlp": 0.01036365, "balance_loss_clip": 1.06433296, "balance_loss_mlp": 1.02755547, "epoch": 0.14260806829796188, "flos": 20303606407680.0, "grad_norm": 2.255238709545601, "language_loss": 0.80629849, "learning_rate": 3.868562361787516e-06, "loss": 0.82997364, "num_input_tokens_seen": 25240855, "step": 1186, "time_per_iteration": 2.715620279312134 }, { "auxiliary_loss_clip": 0.0148524, "auxiliary_loss_mlp": 0.01032002, "balance_loss_clip": 1.05165255, "balance_loss_mlp": 1.02273393, "epoch": 0.14272831118860096, "flos": 23185724860800.0, "grad_norm": 3.40921173515509, "language_loss": 0.69124246, "learning_rate": 3.868284487804009e-06, "loss": 0.71641487, "num_input_tokens_seen": 25260085, "step": 1187, "time_per_iteration": 2.86678409576416 }, { "auxiliary_loss_clip": 0.01340745, "auxiliary_loss_mlp": 0.01037187, "balance_loss_clip": 1.06055748, "balance_loss_mlp": 1.02844858, "epoch": 0.14284855407924008, "flos": 27232210586880.0, "grad_norm": 1.8487722618513007, "language_loss": 0.78104866, "learning_rate": 3.86800633040323e-06, "loss": 0.80482799, "num_input_tokens_seen": 25280675, "step": 1188, "time_per_iteration": 4.1762495040893555 }, { "auxiliary_loss_clip": 0.01334226, "auxiliary_loss_mlp": 0.02582126, "balance_loss_clip": 1.06450117, "balance_loss_mlp": 1.00067997, "epoch": 0.14296879696987916, "flos": 28184202696960.0, "grad_norm": 2.2095262040210706, "language_loss": 0.78005183, "learning_rate": 3.867727889627376e-06, "loss": 0.81921536, "num_input_tokens_seen": 25300290, "step": 1189, "time_per_iteration": 3.595963716506958 }, { "auxiliary_loss_clip": 0.01373432, "auxiliary_loss_mlp": 0.01037122, "balance_loss_clip": 1.0601095, "balance_loss_mlp": 1.02755582, "epoch": 0.14308903986051824, "flos": 19390290266880.0, "grad_norm": 2.2178316519889654, "language_loss": 0.78478098, "learning_rate": 3.867449165518687e-06, "loss": 0.80888653, "num_input_tokens_seen": 25316760, "step": 1190, "time_per_iteration": 2.6924540996551514 }, { "auxiliary_loss_clip": 0.01240928, "auxiliary_loss_mlp": 0.02584965, "balance_loss_clip": 1.06707132, "balance_loss_mlp": 1.00068355, "epoch": 0.14320928275115732, "flos": 17457506317440.0, "grad_norm": 1.8692882727744065, "language_loss": 0.71139455, "learning_rate": 3.867170158119444e-06, "loss": 0.74965346, "num_input_tokens_seen": 25335760, "step": 1191, "time_per_iteration": 2.648721933364868 }, { "auxiliary_loss_clip": 0.01241237, "auxiliary_loss_mlp": 0.01033708, "balance_loss_clip": 1.06700253, "balance_loss_mlp": 1.02449346, "epoch": 0.14332952564179643, "flos": 21466070259840.0, "grad_norm": 2.019109682857975, "language_loss": 0.76028681, "learning_rate": 3.866890867471972e-06, "loss": 0.78303623, "num_input_tokens_seen": 25354230, "step": 1192, "time_per_iteration": 3.5189919471740723 }, { "auxiliary_loss_clip": 0.01288866, "auxiliary_loss_mlp": 0.0103715, "balance_loss_clip": 1.05779338, "balance_loss_mlp": 1.02711248, "epoch": 0.14344976853243552, "flos": 16396992241920.0, "grad_norm": 2.6527082825223007, "language_loss": 0.89793551, "learning_rate": 3.86661129361864e-06, "loss": 0.92119563, "num_input_tokens_seen": 25368720, "step": 1193, "time_per_iteration": 2.671086311340332 }, { "auxiliary_loss_clip": 0.01336608, "auxiliary_loss_mlp": 0.01042944, "balance_loss_clip": 1.06439483, "balance_loss_mlp": 1.03294206, "epoch": 0.1435700114230746, "flos": 18916736336640.0, "grad_norm": 3.026880728578261, "language_loss": 0.86078882, "learning_rate": 3.866331436601859e-06, "loss": 0.88458431, "num_input_tokens_seen": 25386715, "step": 1194, "time_per_iteration": 2.637600898742676 }, { "auxiliary_loss_clip": 0.01241476, "auxiliary_loss_mlp": 0.01035615, "balance_loss_clip": 1.06974196, "balance_loss_mlp": 1.02612615, "epoch": 0.1436902543137137, "flos": 19755394058880.0, "grad_norm": 2.4480150113749892, "language_loss": 0.73761648, "learning_rate": 3.866051296464083e-06, "loss": 0.76038742, "num_input_tokens_seen": 25405550, "step": 1195, "time_per_iteration": 2.59543776512146 }, { "auxiliary_loss_clip": 0.01238812, "auxiliary_loss_mlp": 0.02584345, "balance_loss_clip": 1.06471753, "balance_loss_mlp": 1.0006423, "epoch": 0.1438104972043528, "flos": 14684807669760.0, "grad_norm": 3.962081774742827, "language_loss": 0.85376573, "learning_rate": 3.86577087324781e-06, "loss": 0.89199728, "num_input_tokens_seen": 25422040, "step": 1196, "time_per_iteration": 2.6189918518066406 }, { "auxiliary_loss_clip": 0.01283554, "auxiliary_loss_mlp": 0.01038931, "balance_loss_clip": 1.06562877, "balance_loss_mlp": 1.0294714, "epoch": 0.14393074009499188, "flos": 17092330698240.0, "grad_norm": 2.404079574651244, "language_loss": 0.77677053, "learning_rate": 3.865490166995578e-06, "loss": 0.79999542, "num_input_tokens_seen": 25440270, "step": 1197, "time_per_iteration": 2.6092305183410645 }, { "auxiliary_loss_clip": 0.01291806, "auxiliary_loss_mlp": 0.01039918, "balance_loss_clip": 1.06795406, "balance_loss_mlp": 1.03132296, "epoch": 0.144050982985631, "flos": 30476200608000.0, "grad_norm": 2.172372413655442, "language_loss": 0.84431541, "learning_rate": 3.86520917774997e-06, "loss": 0.86763275, "num_input_tokens_seen": 25459705, "step": 1198, "time_per_iteration": 2.774808406829834 }, { "auxiliary_loss_clip": 0.01283228, "auxiliary_loss_mlp": 0.01043019, "balance_loss_clip": 1.06383193, "balance_loss_mlp": 1.03331578, "epoch": 0.14417122587627007, "flos": 17858484817920.0, "grad_norm": 2.0715497942054673, "language_loss": 0.75300509, "learning_rate": 3.864927905553614e-06, "loss": 0.77626753, "num_input_tokens_seen": 25477615, "step": 1199, "time_per_iteration": 2.6093714237213135 }, { "auxiliary_loss_clip": 0.01379511, "auxiliary_loss_mlp": 0.01034295, "balance_loss_clip": 1.05815601, "balance_loss_mlp": 1.0253365, "epoch": 0.14429146876690915, "flos": 21613914639360.0, "grad_norm": 2.0717603965476314, "language_loss": 0.89040101, "learning_rate": 3.8646463504491765e-06, "loss": 0.9145391, "num_input_tokens_seen": 25497750, "step": 1200, "time_per_iteration": 2.7505252361297607 }, { "auxiliary_loss_clip": 0.01289627, "auxiliary_loss_mlp": 0.01039482, "balance_loss_clip": 1.06812155, "balance_loss_mlp": 1.02977204, "epoch": 0.14441171165754824, "flos": 23258120722560.0, "grad_norm": 1.9460681098553418, "language_loss": 0.83176601, "learning_rate": 3.8643645124793705e-06, "loss": 0.85505712, "num_input_tokens_seen": 25516650, "step": 1201, "time_per_iteration": 2.7777130603790283 }, { "auxiliary_loss_clip": 0.0128356, "auxiliary_loss_mlp": 0.01042942, "balance_loss_clip": 1.0636878, "balance_loss_mlp": 1.0327673, "epoch": 0.14453195454818735, "flos": 42854213963520.0, "grad_norm": 1.9836350940892595, "language_loss": 0.74993527, "learning_rate": 3.8640823916869515e-06, "loss": 0.77320033, "num_input_tokens_seen": 25540960, "step": 1202, "time_per_iteration": 2.906296491622925 }, { "auxiliary_loss_clip": 0.01237574, "auxiliary_loss_mlp": 0.01038281, "balance_loss_clip": 1.06519413, "balance_loss_mlp": 1.02864313, "epoch": 0.14465219743882643, "flos": 27235873774080.0, "grad_norm": 1.557451218160784, "language_loss": 0.7850765, "learning_rate": 3.863799988114714e-06, "loss": 0.8078351, "num_input_tokens_seen": 25562990, "step": 1203, "time_per_iteration": 2.7187600135803223 }, { "auxiliary_loss_clip": 0.01240407, "auxiliary_loss_mlp": 0.01037358, "balance_loss_clip": 1.06620598, "balance_loss_mlp": 1.02755904, "epoch": 0.1447724403294655, "flos": 16690705752960.0, "grad_norm": 2.561181110356706, "language_loss": 0.71076787, "learning_rate": 3.863517301805502e-06, "loss": 0.73354554, "num_input_tokens_seen": 25581380, "step": 1204, "time_per_iteration": 2.5897533893585205 }, { "auxiliary_loss_clip": 0.013899, "auxiliary_loss_mlp": 0.01045692, "balance_loss_clip": 1.06531847, "balance_loss_mlp": 1.03548789, "epoch": 0.14489268322010462, "flos": 20073741321600.0, "grad_norm": 10.09589567685479, "language_loss": 0.9699474, "learning_rate": 3.863234332802196e-06, "loss": 0.99430335, "num_input_tokens_seen": 25593585, "step": 1205, "time_per_iteration": 2.7155702114105225 }, { "auxiliary_loss_clip": 0.01333188, "auxiliary_loss_mlp": 0.01040054, "balance_loss_clip": 1.06071663, "balance_loss_mlp": 1.03090513, "epoch": 0.1450129261107437, "flos": 27125627955840.0, "grad_norm": 2.0610439801734213, "language_loss": 0.74060857, "learning_rate": 3.862951081147723e-06, "loss": 0.764341, "num_input_tokens_seen": 25613750, "step": 1206, "time_per_iteration": 2.71290922164917 }, { "auxiliary_loss_clip": 0.01289154, "auxiliary_loss_mlp": 0.01035815, "balance_loss_clip": 1.06605053, "balance_loss_mlp": 1.02662373, "epoch": 0.1451331690013828, "flos": 25702344472320.0, "grad_norm": 2.3162500647257684, "language_loss": 0.78373778, "learning_rate": 3.862667546885053e-06, "loss": 0.80698746, "num_input_tokens_seen": 25632300, "step": 1207, "time_per_iteration": 2.7533113956451416 }, { "auxiliary_loss_clip": 0.01294747, "auxiliary_loss_mlp": 0.01036086, "balance_loss_clip": 1.06095052, "balance_loss_mlp": 1.02632904, "epoch": 0.14525341189202187, "flos": 25737393168000.0, "grad_norm": 1.9553351165239061, "language_loss": 0.73854578, "learning_rate": 3.8623837300571965e-06, "loss": 0.76185411, "num_input_tokens_seen": 25651285, "step": 1208, "time_per_iteration": 2.6811482906341553 }, { "auxiliary_loss_clip": 0.01237654, "auxiliary_loss_mlp": 0.01039142, "balance_loss_clip": 1.06615734, "balance_loss_mlp": 1.02914584, "epoch": 0.14537365478266098, "flos": 23073898844160.0, "grad_norm": 2.642302963726878, "language_loss": 0.84119207, "learning_rate": 3.8620996307072085e-06, "loss": 0.86396003, "num_input_tokens_seen": 25671990, "step": 1209, "time_per_iteration": 2.6845502853393555 }, { "auxiliary_loss_clip": 0.01390198, "auxiliary_loss_mlp": 0.01035919, "balance_loss_clip": 1.05860829, "balance_loss_mlp": 1.02662098, "epoch": 0.14549389767330007, "flos": 20595021448320.0, "grad_norm": 1.7730608522890507, "language_loss": 0.64036536, "learning_rate": 3.861815248878188e-06, "loss": 0.66462648, "num_input_tokens_seen": 25689475, "step": 1210, "time_per_iteration": 3.741718053817749 }, { "auxiliary_loss_clip": 0.01331009, "auxiliary_loss_mlp": 0.01036279, "balance_loss_clip": 1.06417418, "balance_loss_mlp": 1.02659333, "epoch": 0.14561414056393915, "flos": 15121804533120.0, "grad_norm": 3.5015605283598275, "language_loss": 0.80084765, "learning_rate": 3.861530584613274e-06, "loss": 0.82452053, "num_input_tokens_seen": 25707475, "step": 1211, "time_per_iteration": 2.6540687084198 }, { "auxiliary_loss_clip": 0.01290143, "auxiliary_loss_mlp": 0.02585434, "balance_loss_clip": 1.06601119, "balance_loss_mlp": 1.00063109, "epoch": 0.14573438345457826, "flos": 19427493778560.0, "grad_norm": 4.807352151556901, "language_loss": 0.82139975, "learning_rate": 3.86124563795565e-06, "loss": 0.86015546, "num_input_tokens_seen": 25726290, "step": 1212, "time_per_iteration": 2.7357795238494873 }, { "auxiliary_loss_clip": 0.012402, "auxiliary_loss_mlp": 0.01040501, "balance_loss_clip": 1.06979012, "balance_loss_mlp": 1.0320785, "epoch": 0.14585462634521734, "flos": 24828422572800.0, "grad_norm": 1.6737852191696254, "language_loss": 0.70229781, "learning_rate": 3.860960408948543e-06, "loss": 0.72510481, "num_input_tokens_seen": 25748040, "step": 1213, "time_per_iteration": 2.7664380073547363 }, { "auxiliary_loss_clip": 0.01283924, "auxiliary_loss_mlp": 0.01031818, "balance_loss_clip": 1.06750286, "balance_loss_mlp": 1.02298427, "epoch": 0.14597486923585642, "flos": 15448627405440.0, "grad_norm": 3.7078280720642085, "language_loss": 0.8970589, "learning_rate": 3.860674897635222e-06, "loss": 0.92021632, "num_input_tokens_seen": 25764525, "step": 1214, "time_per_iteration": 2.635326385498047 }, { "auxiliary_loss_clip": 0.01286425, "auxiliary_loss_mlp": 0.01035565, "balance_loss_clip": 1.06633711, "balance_loss_mlp": 1.02594435, "epoch": 0.1460951121264955, "flos": 16655154266880.0, "grad_norm": 2.5095445956774887, "language_loss": 0.8354702, "learning_rate": 3.860389104058998e-06, "loss": 0.85869008, "num_input_tokens_seen": 25782755, "step": 1215, "time_per_iteration": 4.399419546127319 }, { "auxiliary_loss_clip": 0.01337964, "auxiliary_loss_mlp": 0.01038921, "balance_loss_clip": 1.06399441, "balance_loss_mlp": 1.02915156, "epoch": 0.14621535501713462, "flos": 24863291700480.0, "grad_norm": 2.1435437670291035, "language_loss": 0.73102176, "learning_rate": 3.860103028263227e-06, "loss": 0.7547906, "num_input_tokens_seen": 25805860, "step": 1216, "time_per_iteration": 2.699784994125366 }, { "auxiliary_loss_clip": 0.01431964, "auxiliary_loss_mlp": 0.01040365, "balance_loss_clip": 1.05524695, "balance_loss_mlp": 1.03151965, "epoch": 0.1463355979077737, "flos": 25228000442880.0, "grad_norm": 2.1355929116300127, "language_loss": 0.69877476, "learning_rate": 3.859816670291304e-06, "loss": 0.72349805, "num_input_tokens_seen": 25824955, "step": 1217, "time_per_iteration": 2.819456100463867 }, { "auxiliary_loss_clip": 0.01473884, "auxiliary_loss_mlp": 0.01052378, "balance_loss_clip": 1.05504346, "balance_loss_mlp": 1.04328883, "epoch": 0.14645584079841278, "flos": 22054143726720.0, "grad_norm": 2.2332065926758875, "language_loss": 0.90002853, "learning_rate": 3.859530030186672e-06, "loss": 0.92529118, "num_input_tokens_seen": 25841965, "step": 1218, "time_per_iteration": 3.7252860069274902 }, { "auxiliary_loss_clip": 0.01336165, "auxiliary_loss_mlp": 0.01036646, "balance_loss_clip": 1.06451321, "balance_loss_mlp": 1.02765751, "epoch": 0.1465760836890519, "flos": 23623870959360.0, "grad_norm": 2.390659511522882, "language_loss": 0.83282697, "learning_rate": 3.859243107992813e-06, "loss": 0.85655504, "num_input_tokens_seen": 25860770, "step": 1219, "time_per_iteration": 2.7622504234313965 }, { "auxiliary_loss_clip": 0.01387028, "auxiliary_loss_mlp": 0.01040941, "balance_loss_clip": 1.05645728, "balance_loss_mlp": 1.03146958, "epoch": 0.14669632657969098, "flos": 37407893356800.0, "grad_norm": 3.7430065036172753, "language_loss": 0.78426373, "learning_rate": 3.858955903753252e-06, "loss": 0.80854344, "num_input_tokens_seen": 25879410, "step": 1220, "time_per_iteration": 2.8809471130371094 }, { "auxiliary_loss_clip": 0.01289125, "auxiliary_loss_mlp": 0.01042682, "balance_loss_clip": 1.06560457, "balance_loss_mlp": 1.03398573, "epoch": 0.14681656947033006, "flos": 28365910623360.0, "grad_norm": 1.6013364309516696, "language_loss": 0.83576238, "learning_rate": 3.858668417511559e-06, "loss": 0.85908043, "num_input_tokens_seen": 25902160, "step": 1221, "time_per_iteration": 2.716440200805664 }, { "auxiliary_loss_clip": 0.01291286, "auxiliary_loss_mlp": 0.01041402, "balance_loss_clip": 1.06487775, "balance_loss_mlp": 1.03263974, "epoch": 0.14693681236096917, "flos": 18479488078080.0, "grad_norm": 4.524548881011971, "language_loss": 0.76439846, "learning_rate": 3.8583806493113445e-06, "loss": 0.78772527, "num_input_tokens_seen": 25920505, "step": 1222, "time_per_iteration": 2.7597532272338867 }, { "auxiliary_loss_clip": 0.01285186, "auxiliary_loss_mlp": 0.01036044, "balance_loss_clip": 1.06541753, "balance_loss_mlp": 1.02581024, "epoch": 0.14705705525160825, "flos": 20777806782720.0, "grad_norm": 2.9250873667435044, "language_loss": 0.82193816, "learning_rate": 3.858092599196263e-06, "loss": 0.84515047, "num_input_tokens_seen": 25938460, "step": 1223, "time_per_iteration": 2.6675233840942383 }, { "auxiliary_loss_clip": 0.01284022, "auxiliary_loss_mlp": 0.0103671, "balance_loss_clip": 1.06514502, "balance_loss_mlp": 1.0274241, "epoch": 0.14717729814224734, "flos": 29932944336000.0, "grad_norm": 2.2231358249103628, "language_loss": 0.82467389, "learning_rate": 3.857804267210012e-06, "loss": 0.8478812, "num_input_tokens_seen": 25957760, "step": 1224, "time_per_iteration": 2.7411625385284424 }, { "auxiliary_loss_clip": 0.01375078, "auxiliary_loss_mlp": 0.01036827, "balance_loss_clip": 1.05695152, "balance_loss_mlp": 1.02770734, "epoch": 0.14729754103288642, "flos": 20047491457920.0, "grad_norm": 1.9654467995624836, "language_loss": 0.88021928, "learning_rate": 3.857515653396331e-06, "loss": 0.9043383, "num_input_tokens_seen": 25974970, "step": 1225, "time_per_iteration": 2.8067467212677 }, { "auxiliary_loss_clip": 0.01432606, "auxiliary_loss_mlp": 0.01035694, "balance_loss_clip": 1.0593617, "balance_loss_mlp": 1.0266943, "epoch": 0.14741778392352553, "flos": 19281516906240.0, "grad_norm": 3.0240811004013146, "language_loss": 0.8740201, "learning_rate": 3.857226757799002e-06, "loss": 0.8987031, "num_input_tokens_seen": 25992525, "step": 1226, "time_per_iteration": 2.808821439743042 }, { "auxiliary_loss_clip": 0.01331745, "auxiliary_loss_mlp": 0.0103507, "balance_loss_clip": 1.06107366, "balance_loss_mlp": 1.02593255, "epoch": 0.1475380268141646, "flos": 25411108999680.0, "grad_norm": 2.5126777809336196, "language_loss": 0.74403095, "learning_rate": 3.85693758046185e-06, "loss": 0.76769912, "num_input_tokens_seen": 26010815, "step": 1227, "time_per_iteration": 2.684375524520874 }, { "auxiliary_loss_clip": 0.01240143, "auxiliary_loss_mlp": 0.01035665, "balance_loss_clip": 1.07155538, "balance_loss_mlp": 1.02684903, "epoch": 0.1476582697048037, "flos": 20847652778880.0, "grad_norm": 1.8123919454370871, "language_loss": 0.82718837, "learning_rate": 3.8566481214287435e-06, "loss": 0.8499465, "num_input_tokens_seen": 26028935, "step": 1228, "time_per_iteration": 2.6630842685699463 }, { "auxiliary_loss_clip": 0.01378927, "auxiliary_loss_mlp": 0.01040086, "balance_loss_clip": 1.05604076, "balance_loss_mlp": 1.03160405, "epoch": 0.1477785125954428, "flos": 14028109269120.0, "grad_norm": 2.131197169845581, "language_loss": 0.90740615, "learning_rate": 3.8563583807435935e-06, "loss": 0.93159628, "num_input_tokens_seen": 26045080, "step": 1229, "time_per_iteration": 2.6914825439453125 }, { "auxiliary_loss_clip": 0.01284137, "auxiliary_loss_mlp": 0.025789, "balance_loss_clip": 1.0639894, "balance_loss_mlp": 1.00055838, "epoch": 0.1478987554860819, "flos": 20516699842560.0, "grad_norm": 2.136072441899214, "language_loss": 0.77680862, "learning_rate": 3.856068358450353e-06, "loss": 0.81543899, "num_input_tokens_seen": 26065030, "step": 1230, "time_per_iteration": 2.6888210773468018 }, { "auxiliary_loss_clip": 0.01331095, "auxiliary_loss_mlp": 0.0103843, "balance_loss_clip": 1.06599247, "balance_loss_mlp": 1.02911973, "epoch": 0.14801899837672097, "flos": 17857012360320.0, "grad_norm": 2.478797027916171, "language_loss": 0.86279243, "learning_rate": 3.8557780545930186e-06, "loss": 0.88648766, "num_input_tokens_seen": 26083445, "step": 1231, "time_per_iteration": 2.8432765007019043 }, { "auxiliary_loss_clip": 0.01328873, "auxiliary_loss_mlp": 0.01034752, "balance_loss_clip": 1.06472468, "balance_loss_mlp": 1.02556145, "epoch": 0.14813924126736006, "flos": 20881408584960.0, "grad_norm": 2.1068793273522757, "language_loss": 0.79667693, "learning_rate": 3.855487469215628e-06, "loss": 0.82031322, "num_input_tokens_seen": 26102375, "step": 1232, "time_per_iteration": 2.71262526512146 }, { "auxiliary_loss_clip": 0.01382615, "auxiliary_loss_mlp": 0.010288, "balance_loss_clip": 1.06217873, "balance_loss_mlp": 1.02003813, "epoch": 0.14825948415799917, "flos": 37414070496000.0, "grad_norm": 2.374938820733192, "language_loss": 0.72836989, "learning_rate": 3.855196602362264e-06, "loss": 0.75248408, "num_input_tokens_seen": 26125295, "step": 1233, "time_per_iteration": 2.9065310955047607 }, { "auxiliary_loss_clip": 0.01286021, "auxiliary_loss_mlp": 0.01033669, "balance_loss_clip": 1.06385767, "balance_loss_mlp": 1.0247283, "epoch": 0.14837972704863825, "flos": 22014641744640.0, "grad_norm": 2.2281952228649615, "language_loss": 0.94359267, "learning_rate": 3.854905454077051e-06, "loss": 0.96678954, "num_input_tokens_seen": 26142905, "step": 1234, "time_per_iteration": 2.662846565246582 }, { "auxiliary_loss_clip": 0.01531749, "auxiliary_loss_mlp": 0.01031093, "balance_loss_clip": 1.05383658, "balance_loss_mlp": 1.02128839, "epoch": 0.14849996993927733, "flos": 20996323171200.0, "grad_norm": 1.8989005867453117, "language_loss": 0.88522178, "learning_rate": 3.854614024404155e-06, "loss": 0.91085017, "num_input_tokens_seen": 26161215, "step": 1235, "time_per_iteration": 2.954106092453003 }, { "auxiliary_loss_clip": 0.01320923, "auxiliary_loss_mlp": 0.010311, "balance_loss_clip": 1.05892706, "balance_loss_mlp": 1.02184939, "epoch": 0.14862021282991644, "flos": 20047994248320.0, "grad_norm": 1.9356969094457368, "language_loss": 0.89508677, "learning_rate": 3.8543223133877865e-06, "loss": 0.918607, "num_input_tokens_seen": 26179810, "step": 1236, "time_per_iteration": 3.817535161972046 }, { "auxiliary_loss_clip": 0.01321774, "auxiliary_loss_mlp": 0.01035339, "balance_loss_clip": 1.05684519, "balance_loss_mlp": 1.02602863, "epoch": 0.14874045572055553, "flos": 22712027276160.0, "grad_norm": 1.9616459328957105, "language_loss": 0.88229436, "learning_rate": 3.854030321072198e-06, "loss": 0.90586555, "num_input_tokens_seen": 26199715, "step": 1237, "time_per_iteration": 2.6981470584869385 }, { "auxiliary_loss_clip": 0.01386768, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.05939555, "balance_loss_mlp": 1.03241873, "epoch": 0.1488606986111946, "flos": 25411288567680.0, "grad_norm": 1.9467577296585745, "language_loss": 0.73856282, "learning_rate": 3.853738047501682e-06, "loss": 0.76284158, "num_input_tokens_seen": 26220275, "step": 1238, "time_per_iteration": 2.797975540161133 }, { "auxiliary_loss_clip": 0.01285931, "auxiliary_loss_mlp": 0.01035936, "balance_loss_clip": 1.06816077, "balance_loss_mlp": 1.02632177, "epoch": 0.1489809415018337, "flos": 17018749687680.0, "grad_norm": 3.1553173087265423, "language_loss": 0.77528411, "learning_rate": 3.85344549272058e-06, "loss": 0.7985028, "num_input_tokens_seen": 26238255, "step": 1239, "time_per_iteration": 2.6182901859283447 }, { "auxiliary_loss_clip": 0.01282754, "auxiliary_loss_mlp": 0.01040567, "balance_loss_clip": 1.06411147, "balance_loss_mlp": 1.03111982, "epoch": 0.1491011843924728, "flos": 33659394860160.0, "grad_norm": 1.8313018169484425, "language_loss": 0.82387269, "learning_rate": 3.853152656773269e-06, "loss": 0.84710586, "num_input_tokens_seen": 26259690, "step": 1240, "time_per_iteration": 3.6577577590942383 }, { "auxiliary_loss_clip": 0.01328274, "auxiliary_loss_mlp": 0.01037213, "balance_loss_clip": 1.0633142, "balance_loss_mlp": 1.02846956, "epoch": 0.14922142728311188, "flos": 21179000764800.0, "grad_norm": 2.1634207404515347, "language_loss": 0.85119659, "learning_rate": 3.852859539704174e-06, "loss": 0.87485141, "num_input_tokens_seen": 26278990, "step": 1241, "time_per_iteration": 3.788844108581543 }, { "auxiliary_loss_clip": 0.01425485, "auxiliary_loss_mlp": 0.01036267, "balance_loss_clip": 1.05336213, "balance_loss_mlp": 1.02637863, "epoch": 0.14934167017375097, "flos": 29860548474240.0, "grad_norm": 2.077542323553634, "language_loss": 0.76443899, "learning_rate": 3.85256614155776e-06, "loss": 0.78905654, "num_input_tokens_seen": 26299120, "step": 1242, "time_per_iteration": 2.8875086307525635 }, { "auxiliary_loss_clip": 0.01281766, "auxiliary_loss_mlp": 0.01030474, "balance_loss_clip": 1.06191826, "balance_loss_mlp": 1.02115774, "epoch": 0.14946191306439008, "flos": 17019216564480.0, "grad_norm": 2.463049957009014, "language_loss": 0.74444985, "learning_rate": 3.852272462378535e-06, "loss": 0.76757222, "num_input_tokens_seen": 26316995, "step": 1243, "time_per_iteration": 2.736884593963623 }, { "auxiliary_loss_clip": 0.01330488, "auxiliary_loss_mlp": 0.01037703, "balance_loss_clip": 1.06280923, "balance_loss_mlp": 1.02846384, "epoch": 0.14958215595502916, "flos": 15669047214720.0, "grad_norm": 2.0269881645736425, "language_loss": 0.77646244, "learning_rate": 3.85197850221105e-06, "loss": 0.80014443, "num_input_tokens_seen": 26333295, "step": 1244, "time_per_iteration": 3.6059176921844482 }, { "auxiliary_loss_clip": 0.01282137, "auxiliary_loss_mlp": 0.01041973, "balance_loss_clip": 1.06547904, "balance_loss_mlp": 1.03369963, "epoch": 0.14970239884566824, "flos": 33108560818560.0, "grad_norm": 1.8513158643483836, "language_loss": 0.75996572, "learning_rate": 3.851684261099899e-06, "loss": 0.78320682, "num_input_tokens_seen": 26355035, "step": 1245, "time_per_iteration": 2.7528088092803955 }, { "auxiliary_loss_clip": 0.01284781, "auxiliary_loss_mlp": 0.01032778, "balance_loss_clip": 1.06150281, "balance_loss_mlp": 1.02361703, "epoch": 0.14982264173630733, "flos": 17821245392640.0, "grad_norm": 1.8973559913669202, "language_loss": 0.8675645, "learning_rate": 3.851389739089718e-06, "loss": 0.89074004, "num_input_tokens_seen": 26371655, "step": 1246, "time_per_iteration": 2.8658058643341064 }, { "auxiliary_loss_clip": 0.01289471, "auxiliary_loss_mlp": 0.0103602, "balance_loss_clip": 1.06908345, "balance_loss_mlp": 1.02660811, "epoch": 0.14994288462694644, "flos": 32409559175040.0, "grad_norm": 3.290388447774354, "language_loss": 0.80188715, "learning_rate": 3.851094936225186e-06, "loss": 0.82514203, "num_input_tokens_seen": 26392540, "step": 1247, "time_per_iteration": 2.7324016094207764 }, { "auxiliary_loss_clip": 0.01327916, "auxiliary_loss_mlp": 0.01031666, "balance_loss_clip": 1.0635041, "balance_loss_mlp": 1.02302909, "epoch": 0.15006312751758552, "flos": 31794661226880.0, "grad_norm": 2.8517386515997267, "language_loss": 0.76600474, "learning_rate": 3.850799852551024e-06, "loss": 0.78960061, "num_input_tokens_seen": 26414960, "step": 1248, "time_per_iteration": 2.8486580848693848 }, { "auxiliary_loss_clip": 0.01277687, "auxiliary_loss_mlp": 0.0103293, "balance_loss_clip": 1.0611825, "balance_loss_mlp": 1.02412629, "epoch": 0.1501833704082246, "flos": 16618022582400.0, "grad_norm": 2.3131962230661958, "language_loss": 0.86290193, "learning_rate": 3.850504488111995e-06, "loss": 0.88600808, "num_input_tokens_seen": 26431635, "step": 1249, "time_per_iteration": 2.6721627712249756 }, { "auxiliary_loss_clip": 0.01327197, "auxiliary_loss_mlp": 0.01037162, "balance_loss_clip": 1.06218314, "balance_loss_mlp": 1.02846539, "epoch": 0.15030361329886371, "flos": 23471178243840.0, "grad_norm": 2.3470908732082436, "language_loss": 0.82672793, "learning_rate": 3.850208842952907e-06, "loss": 0.85037148, "num_input_tokens_seen": 26450440, "step": 1250, "time_per_iteration": 2.7788538932800293 }, { "auxiliary_loss_clip": 0.01442084, "auxiliary_loss_mlp": 0.01031407, "balance_loss_clip": 1.05891192, "balance_loss_mlp": 1.02289534, "epoch": 0.1504238561895028, "flos": 25629409906560.0, "grad_norm": 2.2925063004402197, "language_loss": 0.79170775, "learning_rate": 3.849912917118608e-06, "loss": 0.81644261, "num_input_tokens_seen": 26471480, "step": 1251, "time_per_iteration": 2.8455350399017334 }, { "auxiliary_loss_clip": 0.01205944, "auxiliary_loss_mlp": 0.01002563, "balance_loss_clip": 1.04749799, "balance_loss_mlp": 1.00059569, "epoch": 0.15054409908014188, "flos": 52095146129280.0, "grad_norm": 0.8693981341527335, "language_loss": 0.5925225, "learning_rate": 3.849616710653992e-06, "loss": 0.61460757, "num_input_tokens_seen": 26532950, "step": 1252, "time_per_iteration": 3.25661039352417 }, { "auxiliary_loss_clip": 0.01278964, "auxiliary_loss_mlp": 0.01032178, "balance_loss_clip": 1.0639286, "balance_loss_mlp": 1.02318358, "epoch": 0.150664341970781, "flos": 18880251096960.0, "grad_norm": 1.7104668879663563, "language_loss": 0.74536121, "learning_rate": 3.84932022360399e-06, "loss": 0.76847267, "num_input_tokens_seen": 26551615, "step": 1253, "time_per_iteration": 2.7308239936828613 }, { "auxiliary_loss_clip": 0.01331268, "auxiliary_loss_mlp": 0.01034415, "balance_loss_clip": 1.06563735, "balance_loss_mlp": 1.02553415, "epoch": 0.15078458486142007, "flos": 22163240309760.0, "grad_norm": 3.394743881819202, "language_loss": 0.85095757, "learning_rate": 3.849023456013581e-06, "loss": 0.87461436, "num_input_tokens_seen": 26569175, "step": 1254, "time_per_iteration": 2.7142319679260254 }, { "auxiliary_loss_clip": 0.01290243, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.06519735, "balance_loss_mlp": 1.02459288, "epoch": 0.15090482775205916, "flos": 26651894457600.0, "grad_norm": 4.932024067200915, "language_loss": 0.62132788, "learning_rate": 3.848726407927784e-06, "loss": 0.6445626, "num_input_tokens_seen": 26589560, "step": 1255, "time_per_iteration": 2.739807367324829 }, { "auxiliary_loss_clip": 0.01332685, "auxiliary_loss_mlp": 0.01037767, "balance_loss_clip": 1.06440568, "balance_loss_mlp": 1.02904654, "epoch": 0.15102507064269824, "flos": 21798998444160.0, "grad_norm": 2.3848949943682616, "language_loss": 0.8636992, "learning_rate": 3.84842907939166e-06, "loss": 0.88740373, "num_input_tokens_seen": 26608785, "step": 1256, "time_per_iteration": 2.7361133098602295 }, { "auxiliary_loss_clip": 0.0137436, "auxiliary_loss_mlp": 0.01029942, "balance_loss_clip": 1.05844975, "balance_loss_mlp": 1.02124608, "epoch": 0.15114531353333735, "flos": 22820908377600.0, "grad_norm": 2.9445732153360287, "language_loss": 0.71107441, "learning_rate": 3.8481314704503146e-06, "loss": 0.73511744, "num_input_tokens_seen": 26628615, "step": 1257, "time_per_iteration": 2.838121175765991 }, { "auxiliary_loss_clip": 0.01281888, "auxiliary_loss_mlp": 0.01038625, "balance_loss_clip": 1.0687418, "balance_loss_mlp": 1.02962422, "epoch": 0.15126555642397643, "flos": 19682674974720.0, "grad_norm": 2.2248988210543548, "language_loss": 0.87907481, "learning_rate": 3.847833581148895e-06, "loss": 0.90227991, "num_input_tokens_seen": 26647525, "step": 1258, "time_per_iteration": 2.712095260620117 }, { "auxiliary_loss_clip": 0.01232326, "auxiliary_loss_mlp": 0.01035707, "balance_loss_clip": 1.06383932, "balance_loss_mlp": 1.02629519, "epoch": 0.15138579931461552, "flos": 28726022424960.0, "grad_norm": 2.2883283198862077, "language_loss": 0.81124437, "learning_rate": 3.84753541153259e-06, "loss": 0.83392471, "num_input_tokens_seen": 26667095, "step": 1259, "time_per_iteration": 2.6916379928588867 }, { "auxiliary_loss_clip": 0.01287838, "auxiliary_loss_mlp": 0.0103804, "balance_loss_clip": 1.06967163, "balance_loss_mlp": 1.02834833, "epoch": 0.15150604220525463, "flos": 22127006465280.0, "grad_norm": 1.8718624313807875, "language_loss": 0.8314001, "learning_rate": 3.847236961646633e-06, "loss": 0.85465896, "num_input_tokens_seen": 26686075, "step": 1260, "time_per_iteration": 2.683565855026245 }, { "auxiliary_loss_clip": 0.01329639, "auxiliary_loss_mlp": 0.01035362, "balance_loss_clip": 1.06103563, "balance_loss_mlp": 1.02615321, "epoch": 0.1516262850958937, "flos": 12968708515200.0, "grad_norm": 3.4190190394145135, "language_loss": 0.7810303, "learning_rate": 3.846938231536296e-06, "loss": 0.80468029, "num_input_tokens_seen": 26701695, "step": 1261, "time_per_iteration": 2.6184017658233643 }, { "auxiliary_loss_clip": 0.01290661, "auxiliary_loss_mlp": 0.01047674, "balance_loss_clip": 1.07206845, "balance_loss_mlp": 1.03863204, "epoch": 0.1517465279865328, "flos": 21797130936960.0, "grad_norm": 2.387667107899127, "language_loss": 0.80802053, "learning_rate": 3.8466392212468995e-06, "loss": 0.83140385, "num_input_tokens_seen": 26721885, "step": 1262, "time_per_iteration": 2.6544251441955566 }, { "auxiliary_loss_clip": 0.01251422, "auxiliary_loss_mlp": 0.01002828, "balance_loss_clip": 1.04331076, "balance_loss_mlp": 1.00070596, "epoch": 0.15186677087717187, "flos": 58174569901440.0, "grad_norm": 0.8193800226589002, "language_loss": 0.61911356, "learning_rate": 3.8463399308238e-06, "loss": 0.64165604, "num_input_tokens_seen": 26780990, "step": 1263, "time_per_iteration": 4.148198366165161 }, { "auxiliary_loss_clip": 0.01285216, "auxiliary_loss_mlp": 0.01040149, "balance_loss_clip": 1.06724072, "balance_loss_mlp": 1.0310595, "epoch": 0.15198701376781099, "flos": 32669696448000.0, "grad_norm": 4.934203572011821, "language_loss": 0.64282298, "learning_rate": 3.846040360312402e-06, "loss": 0.6660766, "num_input_tokens_seen": 26804250, "step": 1264, "time_per_iteration": 2.7288947105407715 }, { "auxiliary_loss_clip": 0.01233709, "auxiliary_loss_mlp": 0.01039396, "balance_loss_clip": 1.06581473, "balance_loss_mlp": 1.03021121, "epoch": 0.15210725665845007, "flos": 28402575431040.0, "grad_norm": 2.632003086834704, "language_loss": 0.81144047, "learning_rate": 3.8457405097581485e-06, "loss": 0.83417153, "num_input_tokens_seen": 26823240, "step": 1265, "time_per_iteration": 2.637934923171997 }, { "auxiliary_loss_clip": 0.0138376, "auxiliary_loss_mlp": 0.0103866, "balance_loss_clip": 1.05629253, "balance_loss_mlp": 1.02924848, "epoch": 0.15222749954908915, "flos": 19938179393280.0, "grad_norm": 1.820237649603486, "language_loss": 0.7818346, "learning_rate": 3.8454403792065275e-06, "loss": 0.80605888, "num_input_tokens_seen": 26842060, "step": 1266, "time_per_iteration": 2.796902894973755 }, { "auxiliary_loss_clip": 0.01373436, "auxiliary_loss_mlp": 0.01035947, "balance_loss_clip": 1.06015229, "balance_loss_mlp": 1.02698231, "epoch": 0.15234774243972826, "flos": 21324223451520.0, "grad_norm": 2.4353702546048805, "language_loss": 0.85318828, "learning_rate": 3.845139968703068e-06, "loss": 0.87728208, "num_input_tokens_seen": 26859580, "step": 1267, "time_per_iteration": 4.460983991622925 }, { "auxiliary_loss_clip": 0.01429193, "auxiliary_loss_mlp": 0.01036547, "balance_loss_clip": 1.05777586, "balance_loss_mlp": 1.02734411, "epoch": 0.15246798533036734, "flos": 25957812977280.0, "grad_norm": 2.3088722542902422, "language_loss": 0.8301543, "learning_rate": 3.844839278293342e-06, "loss": 0.85481167, "num_input_tokens_seen": 26880430, "step": 1268, "time_per_iteration": 2.7997350692749023 }, { "auxiliary_loss_clip": 0.01235774, "auxiliary_loss_mlp": 0.01038632, "balance_loss_clip": 1.06774616, "balance_loss_mlp": 1.02927959, "epoch": 0.15258822822100643, "flos": 25811907932160.0, "grad_norm": 2.300275896209008, "language_loss": 0.76704943, "learning_rate": 3.8445383080229654e-06, "loss": 0.78979349, "num_input_tokens_seen": 26896445, "step": 1269, "time_per_iteration": 2.6947295665740967 }, { "auxiliary_loss_clip": 0.01323038, "auxiliary_loss_mlp": 0.01030913, "balance_loss_clip": 1.05876672, "balance_loss_mlp": 1.02171612, "epoch": 0.1527084711116455, "flos": 25265455349760.0, "grad_norm": 2.192400280543031, "language_loss": 0.73582244, "learning_rate": 3.844237057937593e-06, "loss": 0.75936198, "num_input_tokens_seen": 26915450, "step": 1270, "time_per_iteration": 3.7945406436920166 }, { "auxiliary_loss_clip": 0.01287719, "auxiliary_loss_mlp": 0.01032413, "balance_loss_clip": 1.06391239, "balance_loss_mlp": 1.0231086, "epoch": 0.15282871400228462, "flos": 29240227572480.0, "grad_norm": 3.6322052938273752, "language_loss": 0.77863926, "learning_rate": 3.843935528082926e-06, "loss": 0.8018406, "num_input_tokens_seen": 26936475, "step": 1271, "time_per_iteration": 2.722851037979126 }, { "auxiliary_loss_clip": 0.01286608, "auxiliary_loss_mlp": 0.01031483, "balance_loss_clip": 1.06618452, "balance_loss_mlp": 1.02207696, "epoch": 0.1529489568929237, "flos": 20882952869760.0, "grad_norm": 2.1454606462957972, "language_loss": 0.85204029, "learning_rate": 3.843633718504704e-06, "loss": 0.87522119, "num_input_tokens_seen": 26954920, "step": 1272, "time_per_iteration": 2.7190608978271484 }, { "auxiliary_loss_clip": 0.01385459, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.06245196, "balance_loss_mlp": 1.02550375, "epoch": 0.1530691997835628, "flos": 20083833043200.0, "grad_norm": 2.828490769257473, "language_loss": 0.9030388, "learning_rate": 3.843331629248715e-06, "loss": 0.92723358, "num_input_tokens_seen": 26972520, "step": 1273, "time_per_iteration": 2.7166967391967773 }, { "auxiliary_loss_clip": 0.01235482, "auxiliary_loss_mlp": 0.01029961, "balance_loss_clip": 1.0691092, "balance_loss_mlp": 1.02097893, "epoch": 0.1531894426742019, "flos": 28759814144640.0, "grad_norm": 10.010637470633105, "language_loss": 0.76398003, "learning_rate": 3.843029260360782e-06, "loss": 0.78663445, "num_input_tokens_seen": 26990890, "step": 1274, "time_per_iteration": 2.755166530609131 }, { "auxiliary_loss_clip": 0.01281336, "auxiliary_loss_mlp": 0.01037585, "balance_loss_clip": 1.06740057, "balance_loss_mlp": 1.02878165, "epoch": 0.15330968556484098, "flos": 22236282616320.0, "grad_norm": 1.9089467747231852, "language_loss": 0.78815025, "learning_rate": 3.8427266118867755e-06, "loss": 0.8113395, "num_input_tokens_seen": 27010640, "step": 1275, "time_per_iteration": 2.6692371368408203 }, { "auxiliary_loss_clip": 0.01331817, "auxiliary_loss_mlp": 0.01035359, "balance_loss_clip": 1.06432247, "balance_loss_mlp": 1.02583456, "epoch": 0.15342992845548006, "flos": 27527504296320.0, "grad_norm": 2.0310424635959947, "language_loss": 0.82980818, "learning_rate": 3.842423683872608e-06, "loss": 0.85347992, "num_input_tokens_seen": 27031215, "step": 1276, "time_per_iteration": 2.771913528442383 }, { "auxiliary_loss_clip": 0.01283903, "auxiliary_loss_mlp": 0.01037528, "balance_loss_clip": 1.06307197, "balance_loss_mlp": 1.02809858, "epoch": 0.15355017134611917, "flos": 19609596754560.0, "grad_norm": 2.465832112917977, "language_loss": 0.77809888, "learning_rate": 3.842120476364232e-06, "loss": 0.80131316, "num_input_tokens_seen": 27049665, "step": 1277, "time_per_iteration": 2.683453321456909 }, { "auxiliary_loss_clip": 0.01290452, "auxiliary_loss_mlp": 0.01034119, "balance_loss_clip": 1.06536198, "balance_loss_mlp": 1.02476704, "epoch": 0.15367041423675826, "flos": 18478590238080.0, "grad_norm": 2.064162225671212, "language_loss": 0.84058762, "learning_rate": 3.841816989407644e-06, "loss": 0.86383337, "num_input_tokens_seen": 27065155, "step": 1278, "time_per_iteration": 2.622837543487549 }, { "auxiliary_loss_clip": 0.01378981, "auxiliary_loss_mlp": 0.01034266, "balance_loss_clip": 1.05927801, "balance_loss_mlp": 1.02505708, "epoch": 0.15379065712739734, "flos": 41427662342400.0, "grad_norm": 2.6658999650565134, "language_loss": 0.76535511, "learning_rate": 3.841513223048884e-06, "loss": 0.7894876, "num_input_tokens_seen": 27085840, "step": 1279, "time_per_iteration": 2.9073240756988525 }, { "auxiliary_loss_clip": 0.01378031, "auxiliary_loss_mlp": 0.01031041, "balance_loss_clip": 1.05942321, "balance_loss_mlp": 1.02187932, "epoch": 0.15391090001803642, "flos": 22054215553920.0, "grad_norm": 2.2025093539821516, "language_loss": 0.78705734, "learning_rate": 3.841209177334031e-06, "loss": 0.81114805, "num_input_tokens_seen": 27104200, "step": 1280, "time_per_iteration": 2.760091781616211 }, { "auxiliary_loss_clip": 0.01276602, "auxiliary_loss_mlp": 0.01032171, "balance_loss_clip": 1.06195199, "balance_loss_mlp": 1.02324235, "epoch": 0.15403114290867553, "flos": 15450351258240.0, "grad_norm": 1.809465540091898, "language_loss": 0.75136304, "learning_rate": 3.84090485230921e-06, "loss": 0.77445078, "num_input_tokens_seen": 27122440, "step": 1281, "time_per_iteration": 2.6565163135528564 }, { "auxiliary_loss_clip": 0.01232905, "auxiliary_loss_mlp": 0.01036367, "balance_loss_clip": 1.06640267, "balance_loss_mlp": 1.02697909, "epoch": 0.15415138579931462, "flos": 17929156826880.0, "grad_norm": 2.637601227383634, "language_loss": 0.76425481, "learning_rate": 3.840600248020588e-06, "loss": 0.78694749, "num_input_tokens_seen": 27139380, "step": 1282, "time_per_iteration": 2.617185592651367 }, { "auxiliary_loss_clip": 0.01284536, "auxiliary_loss_mlp": 0.01032458, "balance_loss_clip": 1.05996239, "balance_loss_mlp": 1.02308238, "epoch": 0.1542716286899537, "flos": 11429325296640.0, "grad_norm": 2.291131101403316, "language_loss": 0.7985394, "learning_rate": 3.840295364514371e-06, "loss": 0.82170933, "num_input_tokens_seen": 27156760, "step": 1283, "time_per_iteration": 2.6870031356811523 }, { "auxiliary_loss_clip": 0.01329524, "auxiliary_loss_mlp": 0.01030738, "balance_loss_clip": 1.06125283, "balance_loss_mlp": 1.02216721, "epoch": 0.1543918715805928, "flos": 17420338719360.0, "grad_norm": 4.77614452864955, "language_loss": 0.78475344, "learning_rate": 3.83999020183681e-06, "loss": 0.80835605, "num_input_tokens_seen": 27175455, "step": 1284, "time_per_iteration": 2.624479293823242 }, { "auxiliary_loss_clip": 0.01471883, "auxiliary_loss_mlp": 0.01039055, "balance_loss_clip": 1.05558324, "balance_loss_mlp": 1.0299654, "epoch": 0.1545121144712319, "flos": 17786376264960.0, "grad_norm": 2.1370821521120957, "language_loss": 0.7893225, "learning_rate": 3.839684760034199e-06, "loss": 0.81443185, "num_input_tokens_seen": 27193660, "step": 1285, "time_per_iteration": 2.9282948970794678 }, { "auxiliary_loss_clip": 0.0137665, "auxiliary_loss_mlp": 0.01038149, "balance_loss_clip": 1.05820179, "balance_loss_mlp": 1.02848744, "epoch": 0.15463235736187098, "flos": 28220185146240.0, "grad_norm": 3.3642662922835522, "language_loss": 0.65255916, "learning_rate": 3.8393790391528716e-06, "loss": 0.67670715, "num_input_tokens_seen": 27214355, "step": 1286, "time_per_iteration": 2.8821663856506348 }, { "auxiliary_loss_clip": 0.01324243, "auxiliary_loss_mlp": 0.01040224, "balance_loss_clip": 1.05603313, "balance_loss_mlp": 1.03059161, "epoch": 0.15475260025251006, "flos": 22856890826880.0, "grad_norm": 2.3068168745835407, "language_loss": 0.89176273, "learning_rate": 3.8390730392392075e-06, "loss": 0.91540742, "num_input_tokens_seen": 27234335, "step": 1287, "time_per_iteration": 2.7800090312957764 }, { "auxiliary_loss_clip": 0.01231388, "auxiliary_loss_mlp": 0.01039611, "balance_loss_clip": 1.06329501, "balance_loss_mlp": 1.02977586, "epoch": 0.15487284314314917, "flos": 17602872658560.0, "grad_norm": 3.2399917745098046, "language_loss": 0.79293954, "learning_rate": 3.838766760339626e-06, "loss": 0.81564951, "num_input_tokens_seen": 27252860, "step": 1288, "time_per_iteration": 3.467919111251831 }, { "auxiliary_loss_clip": 0.01365407, "auxiliary_loss_mlp": 0.01035084, "balance_loss_clip": 1.05487871, "balance_loss_mlp": 1.02610207, "epoch": 0.15499308603378825, "flos": 20082037363200.0, "grad_norm": 3.437782493172087, "language_loss": 0.79120666, "learning_rate": 3.838460202500587e-06, "loss": 0.81521153, "num_input_tokens_seen": 27268650, "step": 1289, "time_per_iteration": 2.711481809616089 }, { "auxiliary_loss_clip": 0.01378218, "auxiliary_loss_mlp": 0.01035625, "balance_loss_clip": 1.06444907, "balance_loss_mlp": 1.02608228, "epoch": 0.15511332892442733, "flos": 15918051271680.0, "grad_norm": 2.3198769579902834, "language_loss": 0.74456012, "learning_rate": 3.838153365768599e-06, "loss": 0.76869857, "num_input_tokens_seen": 27285160, "step": 1290, "time_per_iteration": 2.7140684127807617 }, { "auxiliary_loss_clip": 0.01375851, "auxiliary_loss_mlp": 0.01040282, "balance_loss_clip": 1.06320298, "balance_loss_mlp": 1.03087687, "epoch": 0.15523357181506645, "flos": 41282475569280.0, "grad_norm": 2.9721809637243304, "language_loss": 0.75019288, "learning_rate": 3.837846250190206e-06, "loss": 0.77435416, "num_input_tokens_seen": 27308025, "step": 1291, "time_per_iteration": 2.8715617656707764 }, { "auxiliary_loss_clip": 0.01377837, "auxiliary_loss_mlp": 0.0257811, "balance_loss_clip": 1.05885744, "balance_loss_mlp": 1.0004431, "epoch": 0.15535381470570553, "flos": 18478769806080.0, "grad_norm": 2.433205534423664, "language_loss": 0.76702583, "learning_rate": 3.837538855811998e-06, "loss": 0.80658531, "num_input_tokens_seen": 27326200, "step": 1292, "time_per_iteration": 2.8650379180908203 }, { "auxiliary_loss_clip": 0.01282065, "auxiliary_loss_mlp": 0.01038453, "balance_loss_clip": 1.0639776, "balance_loss_mlp": 1.02939939, "epoch": 0.1554740575963446, "flos": 13918150759680.0, "grad_norm": 2.191543632034166, "language_loss": 0.71115327, "learning_rate": 3.837231182680606e-06, "loss": 0.73435843, "num_input_tokens_seen": 27344165, "step": 1293, "time_per_iteration": 4.477279901504517 }, { "auxiliary_loss_clip": 0.01285266, "auxiliary_loss_mlp": 0.01032428, "balance_loss_clip": 1.06308329, "balance_loss_mlp": 1.0228858, "epoch": 0.1555943004869837, "flos": 20847078161280.0, "grad_norm": 1.6301760033462263, "language_loss": 0.76124537, "learning_rate": 3.836923230842706e-06, "loss": 0.78442234, "num_input_tokens_seen": 27363280, "step": 1294, "time_per_iteration": 2.6569464206695557 }, { "auxiliary_loss_clip": 0.01428574, "auxiliary_loss_mlp": 0.01029691, "balance_loss_clip": 1.05233049, "balance_loss_mlp": 1.02089918, "epoch": 0.1557145433776228, "flos": 22085888371200.0, "grad_norm": 2.2314971780355277, "language_loss": 0.80597359, "learning_rate": 3.836615000345011e-06, "loss": 0.83055627, "num_input_tokens_seen": 27381460, "step": 1295, "time_per_iteration": 2.801683187484741 }, { "auxiliary_loss_clip": 0.01233676, "auxiliary_loss_mlp": 0.01029989, "balance_loss_clip": 1.06662083, "balance_loss_mlp": 1.02149594, "epoch": 0.1558347862682619, "flos": 19791987039360.0, "grad_norm": 2.3540270533789887, "language_loss": 0.77829301, "learning_rate": 3.836306491234282e-06, "loss": 0.80092967, "num_input_tokens_seen": 27399310, "step": 1296, "time_per_iteration": 3.6349635124206543 }, { "auxiliary_loss_clip": 0.01330871, "auxiliary_loss_mlp": 0.01038972, "balance_loss_clip": 1.06803143, "balance_loss_mlp": 1.0299716, "epoch": 0.15595502915890097, "flos": 17237086508160.0, "grad_norm": 2.400441496495865, "language_loss": 0.75380945, "learning_rate": 3.835997703557317e-06, "loss": 0.7775079, "num_input_tokens_seen": 27416050, "step": 1297, "time_per_iteration": 2.6636579036712646 }, { "auxiliary_loss_clip": 0.01431565, "auxiliary_loss_mlp": 0.01039764, "balance_loss_clip": 1.0528115, "balance_loss_mlp": 1.03038239, "epoch": 0.15607527204954008, "flos": 19719519350400.0, "grad_norm": 1.9004147349007385, "language_loss": 0.79842657, "learning_rate": 3.83568863736096e-06, "loss": 0.82313991, "num_input_tokens_seen": 27434920, "step": 1298, "time_per_iteration": 2.8169009685516357 }, { "auxiliary_loss_clip": 0.01378401, "auxiliary_loss_mlp": 0.01036066, "balance_loss_clip": 1.05424786, "balance_loss_mlp": 1.02665997, "epoch": 0.15619551494017916, "flos": 18515650095360.0, "grad_norm": 3.4242557774599702, "language_loss": 0.89311433, "learning_rate": 3.8353792926920975e-06, "loss": 0.91725904, "num_input_tokens_seen": 27453570, "step": 1299, "time_per_iteration": 2.730025291442871 }, { "auxiliary_loss_clip": 0.0128577, "auxiliary_loss_mlp": 0.01040378, "balance_loss_clip": 1.06470609, "balance_loss_mlp": 1.03104448, "epoch": 0.15631575783081825, "flos": 19902125116800.0, "grad_norm": 2.588427768614712, "language_loss": 0.81978399, "learning_rate": 3.835069669597655e-06, "loss": 0.84304547, "num_input_tokens_seen": 27471960, "step": 1300, "time_per_iteration": 2.7109203338623047 }, { "auxiliary_loss_clip": 0.01286047, "auxiliary_loss_mlp": 0.02583776, "balance_loss_clip": 1.06326497, "balance_loss_mlp": 1.00048923, "epoch": 0.15643600072145733, "flos": 20777663128320.0, "grad_norm": 2.193896963856943, "language_loss": 0.80007756, "learning_rate": 3.834759768124603e-06, "loss": 0.83877581, "num_input_tokens_seen": 27490835, "step": 1301, "time_per_iteration": 2.692371129989624 }, { "auxiliary_loss_clip": 0.01330644, "auxiliary_loss_mlp": 0.01042164, "balance_loss_clip": 1.06353426, "balance_loss_mlp": 1.03250861, "epoch": 0.15655624361209644, "flos": 18546389159040.0, "grad_norm": 2.8769111755542345, "language_loss": 0.7627638, "learning_rate": 3.834449588319953e-06, "loss": 0.78649187, "num_input_tokens_seen": 27508870, "step": 1302, "time_per_iteration": 2.716305732727051 }, { "auxiliary_loss_clip": 0.01279761, "auxiliary_loss_mlp": 0.01044207, "balance_loss_clip": 1.06582379, "balance_loss_mlp": 1.03441405, "epoch": 0.15667648650273552, "flos": 25229544727680.0, "grad_norm": 2.641452399583895, "language_loss": 0.85210168, "learning_rate": 3.834139130230758e-06, "loss": 0.87534136, "num_input_tokens_seen": 27528175, "step": 1303, "time_per_iteration": 2.7111051082611084 }, { "auxiliary_loss_clip": 0.01330579, "auxiliary_loss_mlp": 0.01032484, "balance_loss_clip": 1.05793762, "balance_loss_mlp": 1.02297115, "epoch": 0.1567967293933746, "flos": 24827093769600.0, "grad_norm": 1.7101404893171706, "language_loss": 0.80882078, "learning_rate": 3.833828393904117e-06, "loss": 0.8324514, "num_input_tokens_seen": 27548455, "step": 1304, "time_per_iteration": 2.7442586421966553 }, { "auxiliary_loss_clip": 0.01370672, "auxiliary_loss_mlp": 0.01035117, "balance_loss_clip": 1.05352974, "balance_loss_mlp": 1.02538908, "epoch": 0.15691697228401372, "flos": 19164555244800.0, "grad_norm": 2.4676504496153657, "language_loss": 0.77581191, "learning_rate": 3.833517379387165e-06, "loss": 0.79986978, "num_input_tokens_seen": 27564910, "step": 1305, "time_per_iteration": 2.764780282974243 }, { "auxiliary_loss_clip": 0.01285458, "auxiliary_loss_mlp": 0.01033272, "balance_loss_clip": 1.06300914, "balance_loss_mlp": 1.02392578, "epoch": 0.1570372151746528, "flos": 24790931752320.0, "grad_norm": 2.012425762480874, "language_loss": 0.88897502, "learning_rate": 3.833206086727085e-06, "loss": 0.9121623, "num_input_tokens_seen": 27584260, "step": 1306, "time_per_iteration": 2.69722318649292 }, { "auxiliary_loss_clip": 0.01380577, "auxiliary_loss_mlp": 0.01032653, "balance_loss_clip": 1.0545516, "balance_loss_mlp": 1.02340877, "epoch": 0.15715745806529188, "flos": 24863650836480.0, "grad_norm": 2.9263861552076897, "language_loss": 0.70528698, "learning_rate": 3.8328945159710994e-06, "loss": 0.72941935, "num_input_tokens_seen": 27604440, "step": 1307, "time_per_iteration": 2.7881438732147217 }, { "auxiliary_loss_clip": 0.01291257, "auxiliary_loss_mlp": 0.02576995, "balance_loss_clip": 1.06774449, "balance_loss_mlp": 1.00047493, "epoch": 0.157277700955931, "flos": 21872148491520.0, "grad_norm": 3.1076971473298705, "language_loss": 0.88859296, "learning_rate": 3.832582667166473e-06, "loss": 0.92727548, "num_input_tokens_seen": 27624250, "step": 1308, "time_per_iteration": 2.673759698867798 }, { "auxiliary_loss_clip": 0.01333181, "auxiliary_loss_mlp": 0.01032512, "balance_loss_clip": 1.05963063, "balance_loss_mlp": 1.02355933, "epoch": 0.15739794384657008, "flos": 24533344344960.0, "grad_norm": 2.066051630357901, "language_loss": 0.81504214, "learning_rate": 3.8322705403605125e-06, "loss": 0.83869898, "num_input_tokens_seen": 27644595, "step": 1309, "time_per_iteration": 2.768216133117676 }, { "auxiliary_loss_clip": 0.01321548, "auxiliary_loss_mlp": 0.01028344, "balance_loss_clip": 1.05857456, "balance_loss_mlp": 1.01973712, "epoch": 0.15751818673720916, "flos": 17745329998080.0, "grad_norm": 8.743601966666455, "language_loss": 0.81456852, "learning_rate": 3.831958135600568e-06, "loss": 0.83806741, "num_input_tokens_seen": 27662145, "step": 1310, "time_per_iteration": 2.6298184394836426 }, { "auxiliary_loss_clip": 0.01284794, "auxiliary_loss_mlp": 0.01034947, "balance_loss_clip": 1.06404781, "balance_loss_mlp": 1.02619696, "epoch": 0.15763842962784824, "flos": 17858520731520.0, "grad_norm": 2.2638046756072807, "language_loss": 0.79516357, "learning_rate": 3.831645452934032e-06, "loss": 0.81836092, "num_input_tokens_seen": 27680575, "step": 1311, "time_per_iteration": 2.6748552322387695 }, { "auxiliary_loss_clip": 0.0123524, "auxiliary_loss_mlp": 0.01032081, "balance_loss_clip": 1.0685606, "balance_loss_mlp": 1.02309227, "epoch": 0.15775867251848735, "flos": 26980908059520.0, "grad_norm": 2.1443480424563712, "language_loss": 0.80262864, "learning_rate": 3.831332492408336e-06, "loss": 0.82530177, "num_input_tokens_seen": 27701985, "step": 1312, "time_per_iteration": 2.6757545471191406 }, { "auxiliary_loss_clip": 0.01325107, "auxiliary_loss_mlp": 0.01031397, "balance_loss_clip": 1.05749011, "balance_loss_mlp": 1.02225924, "epoch": 0.15787891540912644, "flos": 19240398812160.0, "grad_norm": 2.4916802772552997, "language_loss": 0.69355345, "learning_rate": 3.831019254070957e-06, "loss": 0.71711844, "num_input_tokens_seen": 27719770, "step": 1313, "time_per_iteration": 2.7027721405029297 }, { "auxiliary_loss_clip": 0.0142848, "auxiliary_loss_mlp": 0.01036322, "balance_loss_clip": 1.05631256, "balance_loss_mlp": 1.02692795, "epoch": 0.15799915829976552, "flos": 27271102037760.0, "grad_norm": 8.7029703845329, "language_loss": 0.95156813, "learning_rate": 3.8307057379694135e-06, "loss": 0.9762162, "num_input_tokens_seen": 27739105, "step": 1314, "time_per_iteration": 3.7809805870056152 }, { "auxiliary_loss_clip": 0.01233608, "auxiliary_loss_mlp": 0.01035256, "balance_loss_clip": 1.06346524, "balance_loss_mlp": 1.02626133, "epoch": 0.15811940119040463, "flos": 20405520270720.0, "grad_norm": 2.191699177632541, "language_loss": 0.8265574, "learning_rate": 3.830391944151264e-06, "loss": 0.84924603, "num_input_tokens_seen": 27754985, "step": 1315, "time_per_iteration": 2.592554807662964 }, { "auxiliary_loss_clip": 0.0132956, "auxiliary_loss_mlp": 0.0103389, "balance_loss_clip": 1.0574857, "balance_loss_mlp": 1.02500927, "epoch": 0.1582396440810437, "flos": 32599347661440.0, "grad_norm": 1.8851320641489395, "language_loss": 0.67523563, "learning_rate": 3.830077872664114e-06, "loss": 0.69887018, "num_input_tokens_seen": 27776110, "step": 1316, "time_per_iteration": 2.829266309738159 }, { "auxiliary_loss_clip": 0.01482384, "auxiliary_loss_mlp": 0.01031921, "balance_loss_clip": 1.05430198, "balance_loss_mlp": 1.02275431, "epoch": 0.1583598869716828, "flos": 33800559310080.0, "grad_norm": 3.6867462552200396, "language_loss": 0.72891682, "learning_rate": 3.829763523555604e-06, "loss": 0.75405991, "num_input_tokens_seen": 27796510, "step": 1317, "time_per_iteration": 2.8823046684265137 }, { "auxiliary_loss_clip": 0.01281644, "auxiliary_loss_mlp": 0.01037702, "balance_loss_clip": 1.06758046, "balance_loss_mlp": 1.02914262, "epoch": 0.15848012986232188, "flos": 24681332378880.0, "grad_norm": 3.3872992549735907, "language_loss": 0.7811588, "learning_rate": 3.829448896873423e-06, "loss": 0.80435222, "num_input_tokens_seen": 27815610, "step": 1318, "time_per_iteration": 2.7282354831695557 }, { "auxiliary_loss_clip": 0.01422146, "auxiliary_loss_mlp": 0.02582895, "balance_loss_clip": 1.05675626, "balance_loss_mlp": 1.00039256, "epoch": 0.158600372752961, "flos": 22602068766720.0, "grad_norm": 8.471460245681248, "language_loss": 0.79153746, "learning_rate": 3.829133992665299e-06, "loss": 0.83158785, "num_input_tokens_seen": 27834735, "step": 1319, "time_per_iteration": 3.7018277645111084 }, { "auxiliary_loss_clip": 0.01273733, "auxiliary_loss_mlp": 0.01033919, "balance_loss_clip": 1.06145644, "balance_loss_mlp": 1.02481782, "epoch": 0.15872061564360007, "flos": 27927944092800.0, "grad_norm": 4.905872243925333, "language_loss": 0.88928574, "learning_rate": 3.828818810979002e-06, "loss": 0.91236234, "num_input_tokens_seen": 27853065, "step": 1320, "time_per_iteration": 2.653956413269043 }, { "auxiliary_loss_clip": 0.01233672, "auxiliary_loss_mlp": 0.01029245, "balance_loss_clip": 1.06754696, "balance_loss_mlp": 1.02020872, "epoch": 0.15884085853423915, "flos": 23696805525120.0, "grad_norm": 2.799270267978755, "language_loss": 0.80452502, "learning_rate": 3.8285033518623454e-06, "loss": 0.82715416, "num_input_tokens_seen": 27873315, "step": 1321, "time_per_iteration": 2.657195568084717 }, { "auxiliary_loss_clip": 0.01285611, "auxiliary_loss_mlp": 0.01034838, "balance_loss_clip": 1.06374288, "balance_loss_mlp": 1.02444303, "epoch": 0.15896110142487826, "flos": 23112359331840.0, "grad_norm": 3.2742172720887703, "language_loss": 0.81316459, "learning_rate": 3.8281876153631845e-06, "loss": 0.8363691, "num_input_tokens_seen": 27890070, "step": 1322, "time_per_iteration": 3.5065791606903076 }, { "auxiliary_loss_clip": 0.01430983, "auxiliary_loss_mlp": 0.01036167, "balance_loss_clip": 1.05630386, "balance_loss_mlp": 1.02680945, "epoch": 0.15908134431551735, "flos": 14685238632960.0, "grad_norm": 2.1720105810767274, "language_loss": 0.64689624, "learning_rate": 3.827871601529416e-06, "loss": 0.67156768, "num_input_tokens_seen": 27908590, "step": 1323, "time_per_iteration": 2.757573366165161 }, { "auxiliary_loss_clip": 0.01322457, "auxiliary_loss_mlp": 0.01031855, "balance_loss_clip": 1.05934525, "balance_loss_mlp": 1.02311122, "epoch": 0.15920158720615643, "flos": 20193611984640.0, "grad_norm": 2.945869454663328, "language_loss": 0.80504751, "learning_rate": 3.827555310408979e-06, "loss": 0.82859063, "num_input_tokens_seen": 27927985, "step": 1324, "time_per_iteration": 2.7074549198150635 }, { "auxiliary_loss_clip": 0.01382594, "auxiliary_loss_mlp": 0.01037817, "balance_loss_clip": 1.06506586, "balance_loss_mlp": 1.02848887, "epoch": 0.1593218300967955, "flos": 24826626892800.0, "grad_norm": 1.8510701401337846, "language_loss": 0.83186847, "learning_rate": 3.827238742049854e-06, "loss": 0.85607255, "num_input_tokens_seen": 27948280, "step": 1325, "time_per_iteration": 2.7707784175872803 }, { "auxiliary_loss_clip": 0.01230111, "auxiliary_loss_mlp": 0.01035096, "balance_loss_clip": 1.0629375, "balance_loss_mlp": 1.02619743, "epoch": 0.15944207298743462, "flos": 28328707111680.0, "grad_norm": 1.8039251262164828, "language_loss": 0.51599193, "learning_rate": 3.826921896500066e-06, "loss": 0.53864402, "num_input_tokens_seen": 27969565, "step": 1326, "time_per_iteration": 2.70924973487854 }, { "auxiliary_loss_clip": 0.01384886, "auxiliary_loss_mlp": 0.01032668, "balance_loss_clip": 1.05969238, "balance_loss_mlp": 1.0235728, "epoch": 0.1595623158780737, "flos": 22964838174720.0, "grad_norm": 9.32373691443938, "language_loss": 0.78048062, "learning_rate": 3.826604773807678e-06, "loss": 0.80465615, "num_input_tokens_seen": 27987540, "step": 1327, "time_per_iteration": 2.732954263687134 }, { "auxiliary_loss_clip": 0.01329517, "auxiliary_loss_mlp": 0.01031047, "balance_loss_clip": 1.05680537, "balance_loss_mlp": 1.02148604, "epoch": 0.1596825587687128, "flos": 19710540950400.0, "grad_norm": 2.9162553108940767, "language_loss": 0.74556386, "learning_rate": 3.826287374020798e-06, "loss": 0.76916951, "num_input_tokens_seen": 28002345, "step": 1328, "time_per_iteration": 2.6842048168182373 }, { "auxiliary_loss_clip": 0.01233291, "auxiliary_loss_mlp": 0.01042677, "balance_loss_clip": 1.06747687, "balance_loss_mlp": 1.03303885, "epoch": 0.1598028016593519, "flos": 22637727993600.0, "grad_norm": 1.9630859155810778, "language_loss": 0.8246367, "learning_rate": 3.825969697187575e-06, "loss": 0.84739631, "num_input_tokens_seen": 28021675, "step": 1329, "time_per_iteration": 2.638132095336914 }, { "auxiliary_loss_clip": 0.01380289, "auxiliary_loss_mlp": 0.01029814, "balance_loss_clip": 1.05624676, "balance_loss_mlp": 1.02040219, "epoch": 0.15992304454999098, "flos": 20482908122880.0, "grad_norm": 4.49563349981601, "language_loss": 0.69360447, "learning_rate": 3.8256517433562015e-06, "loss": 0.71770549, "num_input_tokens_seen": 28039615, "step": 1330, "time_per_iteration": 2.7775821685791016 }, { "auxiliary_loss_clip": 0.01229613, "auxiliary_loss_mlp": 0.01038841, "balance_loss_clip": 1.06391239, "balance_loss_mlp": 1.02928007, "epoch": 0.16004328744063007, "flos": 17676094533120.0, "grad_norm": 2.3032069003246276, "language_loss": 0.92024672, "learning_rate": 3.82533351257491e-06, "loss": 0.94293118, "num_input_tokens_seen": 28057565, "step": 1331, "time_per_iteration": 2.6200575828552246 }, { "auxiliary_loss_clip": 0.0128074, "auxiliary_loss_mlp": 0.01035266, "balance_loss_clip": 1.06636178, "balance_loss_mlp": 1.02685618, "epoch": 0.16016353033126918, "flos": 24098717779200.0, "grad_norm": 1.8563699719828266, "language_loss": 0.88798207, "learning_rate": 3.825015004891975e-06, "loss": 0.91114211, "num_input_tokens_seen": 28076305, "step": 1332, "time_per_iteration": 2.6794424057006836 }, { "auxiliary_loss_clip": 0.01277258, "auxiliary_loss_mlp": 0.01033486, "balance_loss_clip": 1.06096172, "balance_loss_mlp": 1.02519464, "epoch": 0.16028377322190826, "flos": 27634841112960.0, "grad_norm": 1.7247970479428871, "language_loss": 0.75883764, "learning_rate": 3.824696220355716e-06, "loss": 0.78194505, "num_input_tokens_seen": 28097895, "step": 1333, "time_per_iteration": 2.666212797164917 }, { "auxiliary_loss_clip": 0.01325778, "auxiliary_loss_mlp": 0.01031723, "balance_loss_clip": 1.06196439, "balance_loss_mlp": 1.02335417, "epoch": 0.16040401611254734, "flos": 20961202648320.0, "grad_norm": 1.5470809826311183, "language_loss": 0.78951478, "learning_rate": 3.824377159014491e-06, "loss": 0.81308985, "num_input_tokens_seen": 28118790, "step": 1334, "time_per_iteration": 2.7269270420074463 }, { "auxiliary_loss_clip": 0.01280596, "auxiliary_loss_mlp": 0.01029063, "balance_loss_clip": 1.06420064, "balance_loss_mlp": 1.02055132, "epoch": 0.16052425900318643, "flos": 21247051080960.0, "grad_norm": 2.388011250340376, "language_loss": 0.85179365, "learning_rate": 3.824057820916702e-06, "loss": 0.87489027, "num_input_tokens_seen": 28135995, "step": 1335, "time_per_iteration": 2.613114833831787 }, { "auxiliary_loss_clip": 0.01332182, "auxiliary_loss_mlp": 0.01032302, "balance_loss_clip": 1.0617857, "balance_loss_mlp": 1.02268159, "epoch": 0.16064450189382554, "flos": 15524004096000.0, "grad_norm": 3.524332487290133, "language_loss": 0.71980077, "learning_rate": 3.8237382061107904e-06, "loss": 0.74344563, "num_input_tokens_seen": 28152715, "step": 1336, "time_per_iteration": 2.6544723510742188 }, { "auxiliary_loss_clip": 0.01517902, "auxiliary_loss_mlp": 0.01035999, "balance_loss_clip": 1.05059099, "balance_loss_mlp": 1.02672482, "epoch": 0.16076474478446462, "flos": 21178497974400.0, "grad_norm": 1.981289981892937, "language_loss": 0.78701448, "learning_rate": 3.823418314645243e-06, "loss": 0.81255352, "num_input_tokens_seen": 28171590, "step": 1337, "time_per_iteration": 2.9089746475219727 }, { "auxiliary_loss_clip": 0.01414444, "auxiliary_loss_mlp": 0.0103463, "balance_loss_clip": 1.05757141, "balance_loss_mlp": 1.02576637, "epoch": 0.1608849876751037, "flos": 18366476912640.0, "grad_norm": 2.3325519818736833, "language_loss": 0.74994159, "learning_rate": 3.823098146568588e-06, "loss": 0.7744323, "num_input_tokens_seen": 28191295, "step": 1338, "time_per_iteration": 3.1610448360443115 }, { "auxiliary_loss_clip": 0.01283805, "auxiliary_loss_mlp": 0.01035517, "balance_loss_clip": 1.06595826, "balance_loss_mlp": 1.02729166, "epoch": 0.1610052305657428, "flos": 29497024880640.0, "grad_norm": 2.0324186041133743, "language_loss": 0.7135278, "learning_rate": 3.822777701929394e-06, "loss": 0.73672098, "num_input_tokens_seen": 28213120, "step": 1339, "time_per_iteration": 2.7361841201782227 }, { "auxiliary_loss_clip": 0.01272554, "auxiliary_loss_mlp": 0.01035505, "balance_loss_clip": 1.05939221, "balance_loss_mlp": 1.02621293, "epoch": 0.1611254734563819, "flos": 26797871329920.0, "grad_norm": 1.8103763322318076, "language_loss": 0.73702335, "learning_rate": 3.8224569807762714e-06, "loss": 0.76010394, "num_input_tokens_seen": 28232440, "step": 1340, "time_per_iteration": 3.606426477432251 }, { "auxiliary_loss_clip": 0.01418114, "auxiliary_loss_mlp": 0.01034961, "balance_loss_clip": 1.05243778, "balance_loss_mlp": 1.02609789, "epoch": 0.16124571634702098, "flos": 22419570741120.0, "grad_norm": 2.131192271002674, "language_loss": 0.76563418, "learning_rate": 3.822135983157873e-06, "loss": 0.79016483, "num_input_tokens_seen": 28251715, "step": 1341, "time_per_iteration": 2.7617552280426025 }, { "auxiliary_loss_clip": 0.01227626, "auxiliary_loss_mlp": 0.0257551, "balance_loss_clip": 1.06319153, "balance_loss_mlp": 1.00036538, "epoch": 0.16136595923766006, "flos": 10999116103680.0, "grad_norm": 4.175435257609612, "language_loss": 0.84447563, "learning_rate": 3.821814709122896e-06, "loss": 0.88250697, "num_input_tokens_seen": 28269765, "step": 1342, "time_per_iteration": 2.57891845703125 }, { "auxiliary_loss_clip": 0.01326795, "auxiliary_loss_mlp": 0.01031932, "balance_loss_clip": 1.05970883, "balance_loss_mlp": 1.02324772, "epoch": 0.16148620212829917, "flos": 21214983214080.0, "grad_norm": 2.3952863262451802, "language_loss": 0.84989846, "learning_rate": 3.821493158720076e-06, "loss": 0.87348574, "num_input_tokens_seen": 28288870, "step": 1343, "time_per_iteration": 2.720737934112549 }, { "auxiliary_loss_clip": 0.0137564, "auxiliary_loss_mlp": 0.01043055, "balance_loss_clip": 1.05417824, "balance_loss_mlp": 1.0335362, "epoch": 0.16160644501893826, "flos": 16758468760320.0, "grad_norm": 2.86833282292341, "language_loss": 0.73568332, "learning_rate": 3.821171331998191e-06, "loss": 0.75987029, "num_input_tokens_seen": 28305400, "step": 1344, "time_per_iteration": 2.6849477291107178 }, { "auxiliary_loss_clip": 0.01293706, "auxiliary_loss_mlp": 0.01003809, "balance_loss_clip": 1.03328085, "balance_loss_mlp": 1.00175858, "epoch": 0.16172668790957734, "flos": 64444967308800.0, "grad_norm": 0.7093532955435258, "language_loss": 0.54454035, "learning_rate": 3.820849229006064e-06, "loss": 0.56751549, "num_input_tokens_seen": 28373150, "step": 1345, "time_per_iteration": 5.272507190704346 }, { "auxiliary_loss_clip": 0.01231247, "auxiliary_loss_mlp": 0.01029043, "balance_loss_clip": 1.06429744, "balance_loss_mlp": 1.02055502, "epoch": 0.16184693080021645, "flos": 23257689759360.0, "grad_norm": 2.609384963228575, "language_loss": 0.70974267, "learning_rate": 3.8205268497925564e-06, "loss": 0.73234558, "num_input_tokens_seen": 28393620, "step": 1346, "time_per_iteration": 2.689685821533203 }, { "auxiliary_loss_clip": 0.01231356, "auxiliary_loss_mlp": 0.01032488, "balance_loss_clip": 1.06617773, "balance_loss_mlp": 1.02321935, "epoch": 0.16196717369085553, "flos": 17451113696640.0, "grad_norm": 2.7736455609514943, "language_loss": 0.78439784, "learning_rate": 3.8202041944065725e-06, "loss": 0.80703628, "num_input_tokens_seen": 28409440, "step": 1347, "time_per_iteration": 2.640169143676758 }, { "auxiliary_loss_clip": 0.01233787, "auxiliary_loss_mlp": 0.01036774, "balance_loss_clip": 1.06659627, "balance_loss_mlp": 1.02852464, "epoch": 0.16208741658149461, "flos": 23873377806720.0, "grad_norm": 2.1441172797281034, "language_loss": 0.73803592, "learning_rate": 3.819881262897061e-06, "loss": 0.76074159, "num_input_tokens_seen": 28427575, "step": 1348, "time_per_iteration": 3.5313470363616943 }, { "auxiliary_loss_clip": 0.01378338, "auxiliary_loss_mlp": 0.01035449, "balance_loss_clip": 1.06304884, "balance_loss_mlp": 1.02542365, "epoch": 0.1622076594721337, "flos": 25884806584320.0, "grad_norm": 2.4312463585454047, "language_loss": 0.73557925, "learning_rate": 3.819558055313008e-06, "loss": 0.75971711, "num_input_tokens_seen": 28448260, "step": 1349, "time_per_iteration": 2.7929446697235107 }, { "auxiliary_loss_clip": 0.01286584, "auxiliary_loss_mlp": 0.01035855, "balance_loss_clip": 1.0644536, "balance_loss_mlp": 1.02756429, "epoch": 0.1623279023627728, "flos": 21539759011200.0, "grad_norm": 3.7028190230734266, "language_loss": 0.77542746, "learning_rate": 3.819234571703444e-06, "loss": 0.79865181, "num_input_tokens_seen": 28467085, "step": 1350, "time_per_iteration": 2.6728122234344482 }, { "auxiliary_loss_clip": 0.01279061, "auxiliary_loss_mlp": 0.01037537, "balance_loss_clip": 1.06313562, "balance_loss_mlp": 1.02857804, "epoch": 0.1624481452534119, "flos": 22085421494400.0, "grad_norm": 4.1682241401011675, "language_loss": 0.85605836, "learning_rate": 3.8189108121174435e-06, "loss": 0.8792243, "num_input_tokens_seen": 28486850, "step": 1351, "time_per_iteration": 2.6614482402801514 }, { "auxiliary_loss_clip": 0.01371447, "auxiliary_loss_mlp": 0.01038982, "balance_loss_clip": 1.06116128, "balance_loss_mlp": 1.02982044, "epoch": 0.16256838814405097, "flos": 27087490690560.0, "grad_norm": 1.679656163009191, "language_loss": 0.83329082, "learning_rate": 3.818586776604118e-06, "loss": 0.85739517, "num_input_tokens_seen": 28507490, "step": 1352, "time_per_iteration": 2.776592254638672 }, { "auxiliary_loss_clip": 0.0126841, "auxiliary_loss_mlp": 0.01036316, "balance_loss_clip": 1.05934453, "balance_loss_mlp": 1.02793539, "epoch": 0.16268863103469008, "flos": 20120354196480.0, "grad_norm": 2.1391077356467805, "language_loss": 0.61657262, "learning_rate": 3.818262465212625e-06, "loss": 0.63961995, "num_input_tokens_seen": 28527615, "step": 1353, "time_per_iteration": 2.6782467365264893 }, { "auxiliary_loss_clip": 0.01274271, "auxiliary_loss_mlp": 0.01038994, "balance_loss_clip": 1.06207228, "balance_loss_mlp": 1.02980864, "epoch": 0.16280887392532917, "flos": 18332792933760.0, "grad_norm": 3.256805700350427, "language_loss": 0.77046371, "learning_rate": 3.817937877992161e-06, "loss": 0.79359627, "num_input_tokens_seen": 28544910, "step": 1354, "time_per_iteration": 2.6224617958068848 }, { "auxiliary_loss_clip": 0.01378882, "auxiliary_loss_mlp": 0.02578875, "balance_loss_clip": 1.05547607, "balance_loss_mlp": 1.00029302, "epoch": 0.16292911681596825, "flos": 11874330892800.0, "grad_norm": 2.4942228738212586, "language_loss": 0.85647738, "learning_rate": 3.817613014991967e-06, "loss": 0.89605492, "num_input_tokens_seen": 28561050, "step": 1355, "time_per_iteration": 2.689931869506836 }, { "auxiliary_loss_clip": 0.01370313, "auxiliary_loss_mlp": 0.0103693, "balance_loss_clip": 1.05550718, "balance_loss_mlp": 1.02797103, "epoch": 0.16304935970660733, "flos": 26103466627200.0, "grad_norm": 2.118298044736565, "language_loss": 0.76568907, "learning_rate": 3.817287876261323e-06, "loss": 0.78976142, "num_input_tokens_seen": 28581385, "step": 1356, "time_per_iteration": 2.759584903717041 }, { "auxiliary_loss_clip": 0.01324114, "auxiliary_loss_mlp": 0.0103106, "balance_loss_clip": 1.06007493, "balance_loss_mlp": 1.02217269, "epoch": 0.16316960259724644, "flos": 29351945848320.0, "grad_norm": 2.6198324112894724, "language_loss": 0.80074048, "learning_rate": 3.816962461849553e-06, "loss": 0.82429224, "num_input_tokens_seen": 28603255, "step": 1357, "time_per_iteration": 2.728184700012207 }, { "auxiliary_loss_clip": 0.01329297, "auxiliary_loss_mlp": 0.01040325, "balance_loss_clip": 1.06240022, "balance_loss_mlp": 1.03057361, "epoch": 0.16328984548788553, "flos": 20886759711360.0, "grad_norm": 4.034562719425338, "language_loss": 0.84148502, "learning_rate": 3.8166367718060235e-06, "loss": 0.86518121, "num_input_tokens_seen": 28623145, "step": 1358, "time_per_iteration": 2.701451539993286 }, { "auxiliary_loss_clip": 0.0127648, "auxiliary_loss_mlp": 0.0103362, "balance_loss_clip": 1.05931163, "balance_loss_mlp": 1.02497113, "epoch": 0.1634100883785246, "flos": 18041090584320.0, "grad_norm": 2.1171706625049334, "language_loss": 0.77173173, "learning_rate": 3.816310806180139e-06, "loss": 0.79483271, "num_input_tokens_seen": 28641555, "step": 1359, "time_per_iteration": 2.6479380130767822 }, { "auxiliary_loss_clip": 0.0132635, "auxiliary_loss_mlp": 0.01028696, "balance_loss_clip": 1.06278741, "balance_loss_mlp": 1.01955914, "epoch": 0.16353033126916372, "flos": 24572128055040.0, "grad_norm": 2.808294009587239, "language_loss": 0.81263828, "learning_rate": 3.81598456502135e-06, "loss": 0.83618879, "num_input_tokens_seen": 28661575, "step": 1360, "time_per_iteration": 2.7264108657836914 }, { "auxiliary_loss_clip": 0.01323126, "auxiliary_loss_mlp": 0.01035453, "balance_loss_clip": 1.06015241, "balance_loss_mlp": 1.02640462, "epoch": 0.1636505741598028, "flos": 19892895321600.0, "grad_norm": 4.532433074698137, "language_loss": 0.87126297, "learning_rate": 3.8156580483791455e-06, "loss": 0.8948487, "num_input_tokens_seen": 28676765, "step": 1361, "time_per_iteration": 2.6319878101348877 }, { "auxiliary_loss_clip": 0.01228946, "auxiliary_loss_mlp": 0.01032256, "balance_loss_clip": 1.06340516, "balance_loss_mlp": 1.0236547, "epoch": 0.16377081705044189, "flos": 28402611344640.0, "grad_norm": 2.2440290358212276, "language_loss": 0.77353114, "learning_rate": 3.815331256303059e-06, "loss": 0.79614317, "num_input_tokens_seen": 28696795, "step": 1362, "time_per_iteration": 2.7727062702178955 }, { "auxiliary_loss_clip": 0.01377444, "auxiliary_loss_mlp": 0.01037516, "balance_loss_clip": 1.0627085, "balance_loss_mlp": 1.02886152, "epoch": 0.163891059941081, "flos": 21908059113600.0, "grad_norm": 1.968770215264132, "language_loss": 0.77295232, "learning_rate": 3.815004188842665e-06, "loss": 0.79710197, "num_input_tokens_seen": 28714835, "step": 1363, "time_per_iteration": 2.729576826095581 }, { "auxiliary_loss_clip": 0.01267794, "auxiliary_loss_mlp": 0.01030819, "balance_loss_clip": 1.05802822, "balance_loss_mlp": 1.02208674, "epoch": 0.16401130283172008, "flos": 26797619934720.0, "grad_norm": 1.7036351604068152, "language_loss": 0.7988646, "learning_rate": 3.814676846047578e-06, "loss": 0.82185078, "num_input_tokens_seen": 28735710, "step": 1364, "time_per_iteration": 2.731799840927124 }, { "auxiliary_loss_clip": 0.01278412, "auxiliary_loss_mlp": 0.01036004, "balance_loss_clip": 1.06098545, "balance_loss_mlp": 1.02739084, "epoch": 0.16413154572235916, "flos": 32997417160320.0, "grad_norm": 2.38383171730575, "language_loss": 0.69991207, "learning_rate": 3.8143492279674565e-06, "loss": 0.72305626, "num_input_tokens_seen": 28758405, "step": 1365, "time_per_iteration": 2.773869037628174 }, { "auxiliary_loss_clip": 0.0129603, "auxiliary_loss_mlp": 0.01003146, "balance_loss_clip": 1.03746367, "balance_loss_mlp": 1.00120282, "epoch": 0.16425178861299825, "flos": 40113622074240.0, "grad_norm": 0.8691801545769928, "language_loss": 0.58374286, "learning_rate": 3.8140213346519997e-06, "loss": 0.60673463, "num_input_tokens_seen": 28809000, "step": 1366, "time_per_iteration": 3.9021623134613037 }, { "auxiliary_loss_clip": 0.01370211, "auxiliary_loss_mlp": 0.01038144, "balance_loss_clip": 1.05739474, "balance_loss_mlp": 1.0290246, "epoch": 0.16437203150363736, "flos": 25447486498560.0, "grad_norm": 2.6528773441096662, "language_loss": 0.77067602, "learning_rate": 3.813693166150948e-06, "loss": 0.79475957, "num_input_tokens_seen": 28829210, "step": 1367, "time_per_iteration": 2.755739688873291 }, { "auxiliary_loss_clip": 0.01368592, "auxiliary_loss_mlp": 0.010326, "balance_loss_clip": 1.0564177, "balance_loss_mlp": 1.02341509, "epoch": 0.16449227439427644, "flos": 23476888506240.0, "grad_norm": 2.9574240021718645, "language_loss": 0.85433143, "learning_rate": 3.813364722514086e-06, "loss": 0.87834334, "num_input_tokens_seen": 28847545, "step": 1368, "time_per_iteration": 2.726825475692749 }, { "auxiliary_loss_clip": 0.01280225, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.06129801, "balance_loss_mlp": 1.0251298, "epoch": 0.16461251728491552, "flos": 13545217802880.0, "grad_norm": 2.0077819912349884, "language_loss": 0.80468172, "learning_rate": 3.8130360037912368e-06, "loss": 0.82782865, "num_input_tokens_seen": 28863990, "step": 1369, "time_per_iteration": 2.620246171951294 }, { "auxiliary_loss_clip": 0.01276554, "auxiliary_loss_mlp": 0.01030796, "balance_loss_clip": 1.06079602, "balance_loss_mlp": 1.02206421, "epoch": 0.16473276017555463, "flos": 23003298662400.0, "grad_norm": 2.0782542221248335, "language_loss": 0.81526929, "learning_rate": 3.812707010032268e-06, "loss": 0.83834279, "num_input_tokens_seen": 28883045, "step": 1370, "time_per_iteration": 3.606328248977661 }, { "auxiliary_loss_clip": 0.01285956, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 1.06590033, "balance_loss_mlp": 1.02439666, "epoch": 0.16485300306619372, "flos": 24790680357120.0, "grad_norm": 1.7908994318781268, "language_loss": 0.79382765, "learning_rate": 3.8123777412870863e-06, "loss": 0.81702256, "num_input_tokens_seen": 28902545, "step": 1371, "time_per_iteration": 3.5357794761657715 }, { "auxiliary_loss_clip": 0.01331162, "auxiliary_loss_mlp": 0.01040135, "balance_loss_clip": 1.06000888, "balance_loss_mlp": 1.03118217, "epoch": 0.1649732459568328, "flos": 21106497162240.0, "grad_norm": 2.4068089358760885, "language_loss": 0.7863096, "learning_rate": 3.812048197605643e-06, "loss": 0.81002259, "num_input_tokens_seen": 28921440, "step": 1372, "time_per_iteration": 2.7050952911376953 }, { "auxiliary_loss_clip": 0.01276115, "auxiliary_loss_mlp": 0.01029492, "balance_loss_clip": 1.06050324, "balance_loss_mlp": 1.02070594, "epoch": 0.16509348884747188, "flos": 20266726118400.0, "grad_norm": 3.9038257596586323, "language_loss": 0.81834865, "learning_rate": 3.8117183790379277e-06, "loss": 0.84140474, "num_input_tokens_seen": 28939890, "step": 1373, "time_per_iteration": 2.691882371902466 }, { "auxiliary_loss_clip": 0.01230745, "auxiliary_loss_mlp": 0.01030397, "balance_loss_clip": 1.0629456, "balance_loss_mlp": 1.02125919, "epoch": 0.165213731738111, "flos": 11035493602560.0, "grad_norm": 3.9364081705900267, "language_loss": 0.94427872, "learning_rate": 3.811388285633976e-06, "loss": 0.9668901, "num_input_tokens_seen": 28955875, "step": 1374, "time_per_iteration": 3.511197090148926 }, { "auxiliary_loss_clip": 0.01433386, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.05755186, "balance_loss_mlp": 1.02650523, "epoch": 0.16533397462875007, "flos": 29972051268480.0, "grad_norm": 5.1406726980669895, "language_loss": 0.6191659, "learning_rate": 3.811057917443861e-06, "loss": 0.64385504, "num_input_tokens_seen": 28975140, "step": 1375, "time_per_iteration": 2.8306100368499756 }, { "auxiliary_loss_clip": 0.01247254, "auxiliary_loss_mlp": 0.01001162, "balance_loss_clip": 1.04051113, "balance_loss_mlp": 0.99921858, "epoch": 0.16545421751938916, "flos": 65556763027200.0, "grad_norm": 0.8668394399207702, "language_loss": 0.68290526, "learning_rate": 3.8107272745177e-06, "loss": 0.70538938, "num_input_tokens_seen": 29047470, "step": 1376, "time_per_iteration": 3.381443500518799 }, { "auxiliary_loss_clip": 0.01377092, "auxiliary_loss_mlp": 0.01032269, "balance_loss_clip": 1.05926013, "balance_loss_mlp": 1.02332819, "epoch": 0.16557446041002827, "flos": 22492361652480.0, "grad_norm": 2.109304403719126, "language_loss": 0.78849912, "learning_rate": 3.8103963569056513e-06, "loss": 0.81259263, "num_input_tokens_seen": 29066605, "step": 1377, "time_per_iteration": 2.7423925399780273 }, { "auxiliary_loss_clip": 0.01323787, "auxiliary_loss_mlp": 0.01032525, "balance_loss_clip": 1.05692124, "balance_loss_mlp": 1.02517319, "epoch": 0.16569470330066735, "flos": 24602723464320.0, "grad_norm": 8.862682285935083, "language_loss": 0.88408411, "learning_rate": 3.8100651646579146e-06, "loss": 0.90764719, "num_input_tokens_seen": 29085815, "step": 1378, "time_per_iteration": 2.6791677474975586 }, { "auxiliary_loss_clip": 0.01325321, "auxiliary_loss_mlp": 0.01036667, "balance_loss_clip": 1.05716634, "balance_loss_mlp": 1.02731514, "epoch": 0.16581494619130643, "flos": 15006207588480.0, "grad_norm": 2.0908257537622346, "language_loss": 0.92784464, "learning_rate": 3.8097336978247317e-06, "loss": 0.95146447, "num_input_tokens_seen": 29102520, "step": 1379, "time_per_iteration": 2.6184322834014893 }, { "auxiliary_loss_clip": 0.01321651, "auxiliary_loss_mlp": 0.01033068, "balance_loss_clip": 1.06055295, "balance_loss_mlp": 1.0235846, "epoch": 0.16593518908194552, "flos": 17420338719360.0, "grad_norm": 2.767275555459153, "language_loss": 0.88983786, "learning_rate": 3.8094019564563854e-06, "loss": 0.91338503, "num_input_tokens_seen": 29119450, "step": 1380, "time_per_iteration": 2.704077959060669 }, { "auxiliary_loss_clip": 0.01232559, "auxiliary_loss_mlp": 0.0257866, "balance_loss_clip": 1.06525016, "balance_loss_mlp": 1.00035286, "epoch": 0.16605543197258463, "flos": 20412631163520.0, "grad_norm": 4.393308060981328, "language_loss": 0.75363612, "learning_rate": 3.809069940603201e-06, "loss": 0.79174829, "num_input_tokens_seen": 29137405, "step": 1381, "time_per_iteration": 2.5840277671813965 }, { "auxiliary_loss_clip": 0.01318229, "auxiliary_loss_mlp": 0.01035074, "balance_loss_clip": 1.0578053, "balance_loss_mlp": 1.02625895, "epoch": 0.1661756748632237, "flos": 14209745368320.0, "grad_norm": 3.2210893601946053, "language_loss": 0.78440088, "learning_rate": 3.8087376503155452e-06, "loss": 0.80793393, "num_input_tokens_seen": 29154890, "step": 1382, "time_per_iteration": 2.6589772701263428 }, { "auxiliary_loss_clip": 0.01241837, "auxiliary_loss_mlp": 0.01001648, "balance_loss_clip": 1.04067314, "balance_loss_mlp": 0.99964488, "epoch": 0.1662959177538628, "flos": 66080877350400.0, "grad_norm": 0.9074726612821137, "language_loss": 0.56240004, "learning_rate": 3.808405085643826e-06, "loss": 0.58483493, "num_input_tokens_seen": 29219770, "step": 1383, "time_per_iteration": 3.336135149002075 }, { "auxiliary_loss_clip": 0.01230112, "auxiliary_loss_mlp": 0.02575365, "balance_loss_clip": 1.06481862, "balance_loss_mlp": 1.0003345, "epoch": 0.1664161606445019, "flos": 20740567357440.0, "grad_norm": 3.1221511844361975, "language_loss": 0.89087605, "learning_rate": 3.8080722466384925e-06, "loss": 0.92893088, "num_input_tokens_seen": 29237620, "step": 1384, "time_per_iteration": 2.752171754837036 }, { "auxiliary_loss_clip": 0.0123057, "auxiliary_loss_mlp": 0.01033335, "balance_loss_clip": 1.0617429, "balance_loss_mlp": 1.02391756, "epoch": 0.166536403535141, "flos": 25260930236160.0, "grad_norm": 4.016002099062441, "language_loss": 0.71142459, "learning_rate": 3.8077391333500376e-06, "loss": 0.73406363, "num_input_tokens_seen": 29256760, "step": 1385, "time_per_iteration": 2.7324962615966797 }, { "auxiliary_loss_clip": 0.01270157, "auxiliary_loss_mlp": 0.0103126, "balance_loss_clip": 1.06391621, "balance_loss_mlp": 1.02279055, "epoch": 0.16665664642578007, "flos": 25447450584960.0, "grad_norm": 1.8113989028613198, "language_loss": 0.7698428, "learning_rate": 3.8074057458289934e-06, "loss": 0.79285705, "num_input_tokens_seen": 29277450, "step": 1386, "time_per_iteration": 2.785930633544922 }, { "auxiliary_loss_clip": 0.01328631, "auxiliary_loss_mlp": 0.01033165, "balance_loss_clip": 1.0598228, "balance_loss_mlp": 1.02447474, "epoch": 0.16677688931641918, "flos": 22200767043840.0, "grad_norm": 2.3134030922763844, "language_loss": 0.82510674, "learning_rate": 3.807072084125934e-06, "loss": 0.84872472, "num_input_tokens_seen": 29299300, "step": 1387, "time_per_iteration": 2.757077217102051 }, { "auxiliary_loss_clip": 0.01326704, "auxiliary_loss_mlp": 0.01034627, "balance_loss_clip": 1.06400728, "balance_loss_mlp": 1.02567422, "epoch": 0.16689713220705826, "flos": 16945958776320.0, "grad_norm": 17.119619939963457, "language_loss": 0.80491465, "learning_rate": 3.806738148291477e-06, "loss": 0.82852793, "num_input_tokens_seen": 29316125, "step": 1388, "time_per_iteration": 2.6498639583587646 }, { "auxiliary_loss_clip": 0.01422768, "auxiliary_loss_mlp": 0.01034761, "balance_loss_clip": 1.05524158, "balance_loss_mlp": 1.02642798, "epoch": 0.16701737509769735, "flos": 36244423923840.0, "grad_norm": 2.037399429473232, "language_loss": 0.71091229, "learning_rate": 3.8064039383762793e-06, "loss": 0.73548758, "num_input_tokens_seen": 29338490, "step": 1389, "time_per_iteration": 3.0642402172088623 }, { "auxiliary_loss_clip": 0.01281916, "auxiliary_loss_mlp": 0.01034609, "balance_loss_clip": 1.06413352, "balance_loss_mlp": 1.02586555, "epoch": 0.16713761798833643, "flos": 23258659426560.0, "grad_norm": 3.6825954905976284, "language_loss": 0.76738441, "learning_rate": 3.8060694544310396e-06, "loss": 0.7905497, "num_input_tokens_seen": 29357000, "step": 1390, "time_per_iteration": 3.0161969661712646 }, { "auxiliary_loss_clip": 0.01230769, "auxiliary_loss_mlp": 0.01034817, "balance_loss_clip": 1.06369638, "balance_loss_mlp": 1.02537537, "epoch": 0.16725786087897554, "flos": 25302515207040.0, "grad_norm": 2.0100229154452225, "language_loss": 0.78872997, "learning_rate": 3.8057346965065006e-06, "loss": 0.81138581, "num_input_tokens_seen": 29378230, "step": 1391, "time_per_iteration": 2.7953455448150635 }, { "auxiliary_loss_clip": 0.01327006, "auxiliary_loss_mlp": 0.01031508, "balance_loss_clip": 1.06265378, "balance_loss_mlp": 1.02227533, "epoch": 0.16737810376961462, "flos": 31831541516160.0, "grad_norm": 1.785046352310323, "language_loss": 0.8443464, "learning_rate": 3.805399664653443e-06, "loss": 0.86793154, "num_input_tokens_seen": 29400370, "step": 1392, "time_per_iteration": 4.227256536483765 }, { "auxiliary_loss_clip": 0.01231523, "auxiliary_loss_mlp": 0.01032348, "balance_loss_clip": 1.06425273, "balance_loss_mlp": 1.02293062, "epoch": 0.1674983466602537, "flos": 27961843553280.0, "grad_norm": 2.6667295247208007, "language_loss": 0.74768591, "learning_rate": 3.805064358922692e-06, "loss": 0.77032471, "num_input_tokens_seen": 29418660, "step": 1393, "time_per_iteration": 2.6381728649139404 }, { "auxiliary_loss_clip": 0.01283363, "auxiliary_loss_mlp": 0.01027359, "balance_loss_clip": 1.06229734, "balance_loss_mlp": 1.01874006, "epoch": 0.16761858955089282, "flos": 21762656858880.0, "grad_norm": 1.8568149339510323, "language_loss": 0.80850488, "learning_rate": 3.8047287793651136e-06, "loss": 0.83161211, "num_input_tokens_seen": 29440105, "step": 1394, "time_per_iteration": 2.665480613708496 }, { "auxiliary_loss_clip": 0.01322928, "auxiliary_loss_mlp": 0.01039564, "balance_loss_clip": 1.06143725, "balance_loss_mlp": 1.03037918, "epoch": 0.1677388324415319, "flos": 23805507058560.0, "grad_norm": 1.9310781550355527, "language_loss": 0.89056718, "learning_rate": 3.8043929260316137e-06, "loss": 0.91419208, "num_input_tokens_seen": 29458260, "step": 1395, "time_per_iteration": 2.823216676712036 }, { "auxiliary_loss_clip": 0.01334578, "auxiliary_loss_mlp": 0.01039489, "balance_loss_clip": 1.06752706, "balance_loss_mlp": 1.03035176, "epoch": 0.16785907533217098, "flos": 20558859431040.0, "grad_norm": 4.065094253931392, "language_loss": 0.83532095, "learning_rate": 3.8040567989731417e-06, "loss": 0.8590616, "num_input_tokens_seen": 29476205, "step": 1396, "time_per_iteration": 2.7605812549591064 }, { "auxiliary_loss_clip": 0.01274933, "auxiliary_loss_mlp": 0.01027778, "balance_loss_clip": 1.06297469, "balance_loss_mlp": 1.01928997, "epoch": 0.16797931822281006, "flos": 15669657745920.0, "grad_norm": 2.796588744566628, "language_loss": 0.79629719, "learning_rate": 3.8037203982406876e-06, "loss": 0.81932425, "num_input_tokens_seen": 29494370, "step": 1397, "time_per_iteration": 4.5333638191223145 }, { "auxiliary_loss_clip": 0.01233029, "auxiliary_loss_mlp": 0.01033491, "balance_loss_clip": 1.06613564, "balance_loss_mlp": 1.02441859, "epoch": 0.16809956111344918, "flos": 16541101607040.0, "grad_norm": 2.1847789795572052, "language_loss": 0.73215342, "learning_rate": 3.8033837238852835e-06, "loss": 0.75481856, "num_input_tokens_seen": 29511070, "step": 1398, "time_per_iteration": 2.646944046020508 }, { "auxiliary_loss_clip": 0.01320538, "auxiliary_loss_mlp": 0.01032496, "balance_loss_clip": 1.05707765, "balance_loss_mlp": 1.02355492, "epoch": 0.16821980400408826, "flos": 23258084808960.0, "grad_norm": 1.7863393374140721, "language_loss": 0.69368023, "learning_rate": 3.8030467759580017e-06, "loss": 0.71721053, "num_input_tokens_seen": 29531990, "step": 1399, "time_per_iteration": 2.7319626808166504 }, { "auxiliary_loss_clip": 0.01220783, "auxiliary_loss_mlp": 0.01033297, "balance_loss_clip": 1.06212199, "balance_loss_mlp": 1.024261, "epoch": 0.16834004689472734, "flos": 20774754126720.0, "grad_norm": 2.051379853411012, "language_loss": 0.87261915, "learning_rate": 3.802709554509958e-06, "loss": 0.89515996, "num_input_tokens_seen": 29549790, "step": 1400, "time_per_iteration": 3.604795455932617 }, { "auxiliary_loss_clip": 0.01322843, "auxiliary_loss_mlp": 0.01031495, "balance_loss_clip": 1.05666697, "balance_loss_mlp": 1.02257788, "epoch": 0.16846028978536645, "flos": 26687302289280.0, "grad_norm": 2.956830380752571, "language_loss": 0.79395187, "learning_rate": 3.8023720595923083e-06, "loss": 0.81749523, "num_input_tokens_seen": 29569045, "step": 1401, "time_per_iteration": 2.703732490539551 }, { "auxiliary_loss_clip": 0.01430256, "auxiliary_loss_mlp": 0.01035442, "balance_loss_clip": 1.05605042, "balance_loss_mlp": 1.0265727, "epoch": 0.16858053267600553, "flos": 18843298980480.0, "grad_norm": 2.2678685078038807, "language_loss": 0.87638593, "learning_rate": 3.80203429125625e-06, "loss": 0.90104294, "num_input_tokens_seen": 29587220, "step": 1402, "time_per_iteration": 2.7590084075927734 }, { "auxiliary_loss_clip": 0.01412985, "auxiliary_loss_mlp": 0.01030071, "balance_loss_clip": 1.05732322, "balance_loss_mlp": 1.02129138, "epoch": 0.16870077556664462, "flos": 27744548227200.0, "grad_norm": 1.743983132025934, "language_loss": 0.70349479, "learning_rate": 3.8016962495530225e-06, "loss": 0.72792542, "num_input_tokens_seen": 29606410, "step": 1403, "time_per_iteration": 2.830263376235962 }, { "auxiliary_loss_clip": 0.01233261, "auxiliary_loss_mlp": 0.01033845, "balance_loss_clip": 1.06473422, "balance_loss_mlp": 1.02486277, "epoch": 0.1688210184572837, "flos": 13730768484480.0, "grad_norm": 2.4113715700009255, "language_loss": 0.77217418, "learning_rate": 3.8013579345339063e-06, "loss": 0.79484522, "num_input_tokens_seen": 29621275, "step": 1404, "time_per_iteration": 2.573124885559082 }, { "auxiliary_loss_clip": 0.01379021, "auxiliary_loss_mlp": 0.01036727, "balance_loss_clip": 1.05882061, "balance_loss_mlp": 1.0263083, "epoch": 0.1689412613479228, "flos": 26468785900800.0, "grad_norm": 3.6265667428202706, "language_loss": 0.6984179, "learning_rate": 3.801019346250224e-06, "loss": 0.72257537, "num_input_tokens_seen": 29641420, "step": 1405, "time_per_iteration": 2.708192825317383 }, { "auxiliary_loss_clip": 0.01281499, "auxiliary_loss_mlp": 0.01030866, "balance_loss_clip": 1.0642736, "balance_loss_mlp": 1.02175879, "epoch": 0.1690615042385619, "flos": 21138852337920.0, "grad_norm": 2.852237601067135, "language_loss": 0.83655792, "learning_rate": 3.8006804847533395e-06, "loss": 0.85968161, "num_input_tokens_seen": 29660935, "step": 1406, "time_per_iteration": 2.6203277111053467 }, { "auxiliary_loss_clip": 0.01229686, "auxiliary_loss_mlp": 0.01028251, "balance_loss_clip": 1.06347167, "balance_loss_mlp": 1.01940513, "epoch": 0.16918174712920098, "flos": 20849340718080.0, "grad_norm": 2.2757413123417116, "language_loss": 0.85402352, "learning_rate": 3.8003413500946556e-06, "loss": 0.87660289, "num_input_tokens_seen": 29681045, "step": 1407, "time_per_iteration": 2.523566484451294 }, { "auxiliary_loss_clip": 0.01332174, "auxiliary_loss_mlp": 0.01033824, "balance_loss_clip": 1.06249094, "balance_loss_mlp": 1.02406645, "epoch": 0.1693019900198401, "flos": 16983270028800.0, "grad_norm": 2.9593504804894266, "language_loss": 0.83205879, "learning_rate": 3.8000019423256216e-06, "loss": 0.85571885, "num_input_tokens_seen": 29698810, "step": 1408, "time_per_iteration": 2.5316426753997803 }, { "auxiliary_loss_clip": 0.01318934, "auxiliary_loss_mlp": 0.01033374, "balance_loss_clip": 1.05895126, "balance_loss_mlp": 1.02440321, "epoch": 0.16942223291047917, "flos": 26796901662720.0, "grad_norm": 2.3292068967330466, "language_loss": 0.87989616, "learning_rate": 3.7996622614977234e-06, "loss": 0.90341926, "num_input_tokens_seen": 29720000, "step": 1409, "time_per_iteration": 2.6160335540771484 }, { "auxiliary_loss_clip": 0.01269479, "auxiliary_loss_mlp": 0.01031093, "balance_loss_clip": 1.06537724, "balance_loss_mlp": 1.02196789, "epoch": 0.16954247580111825, "flos": 18583700411520.0, "grad_norm": 2.160624113817759, "language_loss": 0.79175413, "learning_rate": 3.799322307662492e-06, "loss": 0.81475985, "num_input_tokens_seen": 29737820, "step": 1410, "time_per_iteration": 2.6830177307128906 }, { "auxiliary_loss_clip": 0.01430809, "auxiliary_loss_mlp": 0.01034722, "balance_loss_clip": 1.05563259, "balance_loss_mlp": 1.02517378, "epoch": 0.16966271869175734, "flos": 13983651210240.0, "grad_norm": 2.3744398478380195, "language_loss": 0.84154308, "learning_rate": 3.798982080871496e-06, "loss": 0.86619842, "num_input_tokens_seen": 29752960, "step": 1411, "time_per_iteration": 2.6599204540252686 }, { "auxiliary_loss_clip": 0.01233852, "auxiliary_loss_mlp": 0.01033342, "balance_loss_clip": 1.06452203, "balance_loss_mlp": 1.02422285, "epoch": 0.16978296158239645, "flos": 37487328284160.0, "grad_norm": 2.3792963468768393, "language_loss": 0.68113679, "learning_rate": 3.798641581176349e-06, "loss": 0.70380872, "num_input_tokens_seen": 29775240, "step": 1412, "time_per_iteration": 2.765388250350952 }, { "auxiliary_loss_clip": 0.01328943, "auxiliary_loss_mlp": 0.01025971, "balance_loss_clip": 1.05843067, "balance_loss_mlp": 1.01674414, "epoch": 0.16990320447303553, "flos": 28328958506880.0, "grad_norm": 1.8363684498923152, "language_loss": 0.7470938, "learning_rate": 3.7983008086287044e-06, "loss": 0.770643, "num_input_tokens_seen": 29796560, "step": 1413, "time_per_iteration": 2.7862298488616943 }, { "auxiliary_loss_clip": 0.0132596, "auxiliary_loss_mlp": 0.0103402, "balance_loss_clip": 1.0577507, "balance_loss_mlp": 1.02419114, "epoch": 0.1700234473636746, "flos": 20188189031040.0, "grad_norm": 2.7489870971190875, "language_loss": 0.78935724, "learning_rate": 3.797959763280257e-06, "loss": 0.81295705, "num_input_tokens_seen": 29815245, "step": 1414, "time_per_iteration": 2.8263020515441895 }, { "auxiliary_loss_clip": 0.01282915, "auxiliary_loss_mlp": 0.01035126, "balance_loss_clip": 1.06363797, "balance_loss_mlp": 1.02631021, "epoch": 0.17014369025431372, "flos": 24858658846080.0, "grad_norm": 2.084845545057407, "language_loss": 0.79135329, "learning_rate": 3.797618445182743e-06, "loss": 0.81453365, "num_input_tokens_seen": 29836640, "step": 1415, "time_per_iteration": 2.7929599285125732 }, { "auxiliary_loss_clip": 0.01424049, "auxiliary_loss_mlp": 0.0103686, "balance_loss_clip": 1.05481732, "balance_loss_mlp": 1.02799678, "epoch": 0.1702639331449528, "flos": 16467233287680.0, "grad_norm": 2.1060684280672124, "language_loss": 0.84544581, "learning_rate": 3.79727685438794e-06, "loss": 0.87005484, "num_input_tokens_seen": 29850830, "step": 1416, "time_per_iteration": 2.672224521636963 }, { "auxiliary_loss_clip": 0.01189329, "auxiliary_loss_mlp": 0.01009172, "balance_loss_clip": 1.03551054, "balance_loss_mlp": 1.00720525, "epoch": 0.1703841760355919, "flos": 52508870979840.0, "grad_norm": 0.8327373179116119, "language_loss": 0.61593682, "learning_rate": 3.796934990947667e-06, "loss": 0.63792181, "num_input_tokens_seen": 29912515, "step": 1417, "time_per_iteration": 3.2633559703826904 }, { "auxiliary_loss_clip": 0.01191734, "auxiliary_loss_mlp": 0.01007347, "balance_loss_clip": 1.03567338, "balance_loss_mlp": 1.00537968, "epoch": 0.170504418926231, "flos": 49370637576960.0, "grad_norm": 0.8797444660114971, "language_loss": 0.6249702, "learning_rate": 3.7965928549137854e-06, "loss": 0.64696097, "num_input_tokens_seen": 29969330, "step": 1418, "time_per_iteration": 4.340043544769287 }, { "auxiliary_loss_clip": 0.01384555, "auxiliary_loss_mlp": 0.01033061, "balance_loss_clip": 1.05453324, "balance_loss_mlp": 1.02309489, "epoch": 0.17062466181687008, "flos": 25849219184640.0, "grad_norm": 2.0965106122078465, "language_loss": 0.7730605, "learning_rate": 3.7962504463381953e-06, "loss": 0.79723662, "num_input_tokens_seen": 29990820, "step": 1419, "time_per_iteration": 2.8291544914245605 }, { "auxiliary_loss_clip": 0.01331987, "auxiliary_loss_mlp": 0.02585676, "balance_loss_clip": 1.06393385, "balance_loss_mlp": 1.00024652, "epoch": 0.17074490470750917, "flos": 20960412549120.0, "grad_norm": 1.9377327529497037, "language_loss": 0.78539962, "learning_rate": 3.7959077652728412e-06, "loss": 0.82457626, "num_input_tokens_seen": 30009275, "step": 1420, "time_per_iteration": 2.825260877609253 }, { "auxiliary_loss_clip": 0.01328875, "auxiliary_loss_mlp": 0.01036362, "balance_loss_clip": 1.05876064, "balance_loss_mlp": 1.02599716, "epoch": 0.17086514759814825, "flos": 20959766104320.0, "grad_norm": 2.467483229887004, "language_loss": 0.77355587, "learning_rate": 3.795564811769707e-06, "loss": 0.79720831, "num_input_tokens_seen": 30027630, "step": 1421, "time_per_iteration": 2.7389471530914307 }, { "auxiliary_loss_clip": 0.01331775, "auxiliary_loss_mlp": 0.0103186, "balance_loss_clip": 1.0655421, "balance_loss_mlp": 1.02197194, "epoch": 0.17098539048878736, "flos": 28474073452800.0, "grad_norm": 2.062660702623551, "language_loss": 0.7816745, "learning_rate": 3.795221585880818e-06, "loss": 0.80531085, "num_input_tokens_seen": 30048310, "step": 1422, "time_per_iteration": 2.8747854232788086 }, { "auxiliary_loss_clip": 0.01378656, "auxiliary_loss_mlp": 0.01041352, "balance_loss_clip": 1.06398749, "balance_loss_mlp": 1.03200555, "epoch": 0.17110563337942644, "flos": 16290014561280.0, "grad_norm": 1.9944262246827513, "language_loss": 0.91877079, "learning_rate": 3.794878087658242e-06, "loss": 0.94297087, "num_input_tokens_seen": 30066080, "step": 1423, "time_per_iteration": 4.531185865402222 }, { "auxiliary_loss_clip": 0.01281077, "auxiliary_loss_mlp": 0.01037342, "balance_loss_clip": 1.06195426, "balance_loss_mlp": 1.02851427, "epoch": 0.17122587627006552, "flos": 29674207693440.0, "grad_norm": 2.0050941179441644, "language_loss": 0.78331375, "learning_rate": 3.7945343171540873e-06, "loss": 0.80649799, "num_input_tokens_seen": 30086955, "step": 1424, "time_per_iteration": 2.7561933994293213 }, { "auxiliary_loss_clip": 0.01229119, "auxiliary_loss_mlp": 0.01030148, "balance_loss_clip": 1.06204057, "balance_loss_mlp": 1.02064133, "epoch": 0.17134611916070464, "flos": 25338389915520.0, "grad_norm": 1.967433975963455, "language_loss": 0.7903229, "learning_rate": 3.7941902744205033e-06, "loss": 0.81291556, "num_input_tokens_seen": 30107990, "step": 1425, "time_per_iteration": 2.698808193206787 }, { "auxiliary_loss_clip": 0.01331752, "auxiliary_loss_mlp": 0.01032373, "balance_loss_clip": 1.05994487, "balance_loss_mlp": 1.02282381, "epoch": 0.17146636205134372, "flos": 13953845900160.0, "grad_norm": 1.946977014345416, "language_loss": 0.83613271, "learning_rate": 3.7938459595096817e-06, "loss": 0.85977399, "num_input_tokens_seen": 30126535, "step": 1426, "time_per_iteration": 3.62626576423645 }, { "auxiliary_loss_clip": 0.01287796, "auxiliary_loss_mlp": 0.01039952, "balance_loss_clip": 1.06281924, "balance_loss_mlp": 1.03043365, "epoch": 0.1715866049419828, "flos": 23915214172800.0, "grad_norm": 2.0153613405672353, "language_loss": 0.8643223, "learning_rate": 3.7935013724738545e-06, "loss": 0.88759983, "num_input_tokens_seen": 30147035, "step": 1427, "time_per_iteration": 2.7154006958007812 }, { "auxiliary_loss_clip": 0.01274629, "auxiliary_loss_mlp": 0.01035458, "balance_loss_clip": 1.0625391, "balance_loss_mlp": 1.02591562, "epoch": 0.17170684783262188, "flos": 22709369669760.0, "grad_norm": 1.9030502695278126, "language_loss": 0.78028899, "learning_rate": 3.7931565133652945e-06, "loss": 0.80338985, "num_input_tokens_seen": 30167110, "step": 1428, "time_per_iteration": 2.8286454677581787 }, { "auxiliary_loss_clip": 0.01229737, "auxiliary_loss_mlp": 0.01032546, "balance_loss_clip": 1.06264019, "balance_loss_mlp": 1.02311015, "epoch": 0.171827090723261, "flos": 26613290315520.0, "grad_norm": 2.875852798619069, "language_loss": 0.67820776, "learning_rate": 3.792811382236317e-06, "loss": 0.70083052, "num_input_tokens_seen": 30185620, "step": 1429, "time_per_iteration": 2.724975109100342 }, { "auxiliary_loss_clip": 0.0128143, "auxiliary_loss_mlp": 0.01035698, "balance_loss_clip": 1.05968606, "balance_loss_mlp": 1.02623856, "epoch": 0.17194733361390008, "flos": 28148507556480.0, "grad_norm": 3.505298686708056, "language_loss": 0.7834971, "learning_rate": 3.792465979139279e-06, "loss": 0.8066684, "num_input_tokens_seen": 30208225, "step": 1430, "time_per_iteration": 2.783107042312622 }, { "auxiliary_loss_clip": 0.012871, "auxiliary_loss_mlp": 0.01002842, "balance_loss_clip": 1.03154159, "balance_loss_mlp": 1.00069618, "epoch": 0.17206757650453916, "flos": 65530689753600.0, "grad_norm": 0.9437133131710864, "language_loss": 0.65640104, "learning_rate": 3.792120304126576e-06, "loss": 0.67930049, "num_input_tokens_seen": 30271600, "step": 1431, "time_per_iteration": 3.3497447967529297 }, { "auxiliary_loss_clip": 0.01525756, "auxiliary_loss_mlp": 0.01034707, "balance_loss_clip": 1.05229616, "balance_loss_mlp": 1.02548051, "epoch": 0.17218781939517827, "flos": 22273486128000.0, "grad_norm": 2.0439850660908325, "language_loss": 0.83531308, "learning_rate": 3.791774357250649e-06, "loss": 0.86091769, "num_input_tokens_seen": 30290430, "step": 1432, "time_per_iteration": 3.1649739742279053 }, { "auxiliary_loss_clip": 0.01327832, "auxiliary_loss_mlp": 0.01049302, "balance_loss_clip": 1.059618, "balance_loss_mlp": 1.03935361, "epoch": 0.17230806228581735, "flos": 14137313592960.0, "grad_norm": 3.6732116521001625, "language_loss": 0.79117751, "learning_rate": 3.7914281385639757e-06, "loss": 0.81494892, "num_input_tokens_seen": 30308305, "step": 1433, "time_per_iteration": 3.1704022884368896 }, { "auxiliary_loss_clip": 0.01280593, "auxiliary_loss_mlp": 0.01030205, "balance_loss_clip": 1.05975413, "balance_loss_mlp": 1.02079368, "epoch": 0.17242830517645644, "flos": 20704836303360.0, "grad_norm": 2.0697825607142826, "language_loss": 0.79114068, "learning_rate": 3.7910816481190784e-06, "loss": 0.81424862, "num_input_tokens_seen": 30328120, "step": 1434, "time_per_iteration": 2.6796376705169678 }, { "auxiliary_loss_clip": 0.01324435, "auxiliary_loss_mlp": 0.0103235, "balance_loss_clip": 1.05942321, "balance_loss_mlp": 1.02246141, "epoch": 0.17254854806709552, "flos": 30774582887040.0, "grad_norm": 2.120165652512844, "language_loss": 0.75059319, "learning_rate": 3.7907348859685193e-06, "loss": 0.7741611, "num_input_tokens_seen": 30349825, "step": 1435, "time_per_iteration": 2.8358755111694336 }, { "auxiliary_loss_clip": 0.01280503, "auxiliary_loss_mlp": 0.01037086, "balance_loss_clip": 1.06641614, "balance_loss_mlp": 1.02813923, "epoch": 0.17266879095773463, "flos": 26614726859520.0, "grad_norm": 2.5968870836870868, "language_loss": 0.80766618, "learning_rate": 3.790387852164902e-06, "loss": 0.83084202, "num_input_tokens_seen": 30370555, "step": 1436, "time_per_iteration": 2.7645745277404785 }, { "auxiliary_loss_clip": 0.01279847, "auxiliary_loss_mlp": 0.01042619, "balance_loss_clip": 1.06176698, "balance_loss_mlp": 1.03295755, "epoch": 0.1727890338483737, "flos": 20266295155200.0, "grad_norm": 1.9270882459162937, "language_loss": 0.76630449, "learning_rate": 3.7900405467608707e-06, "loss": 0.7895292, "num_input_tokens_seen": 30390100, "step": 1437, "time_per_iteration": 2.7616662979125977 }, { "auxiliary_loss_clip": 0.01416816, "auxiliary_loss_mlp": 0.01037918, "balance_loss_clip": 1.05074549, "balance_loss_mlp": 1.02867317, "epoch": 0.1729092767390128, "flos": 18179812909440.0, "grad_norm": 3.559547932625824, "language_loss": 0.79638648, "learning_rate": 3.7896929698091114e-06, "loss": 0.82093382, "num_input_tokens_seen": 30402915, "step": 1438, "time_per_iteration": 2.8744747638702393 }, { "auxiliary_loss_clip": 0.01237108, "auxiliary_loss_mlp": 0.01039056, "balance_loss_clip": 1.06854546, "balance_loss_mlp": 1.02987099, "epoch": 0.1730295196296519, "flos": 26759518583040.0, "grad_norm": 6.135528435683419, "language_loss": 0.67972445, "learning_rate": 3.7893451213623518e-06, "loss": 0.7024861, "num_input_tokens_seen": 30420145, "step": 1439, "time_per_iteration": 2.710486650466919 }, { "auxiliary_loss_clip": 0.01283155, "auxiliary_loss_mlp": 0.02582461, "balance_loss_clip": 1.06663108, "balance_loss_mlp": 1.00024033, "epoch": 0.173149762520291, "flos": 23842531002240.0, "grad_norm": 3.6494198189163, "language_loss": 0.82770598, "learning_rate": 3.7889970014733606e-06, "loss": 0.86636209, "num_input_tokens_seen": 30439250, "step": 1440, "time_per_iteration": 2.7459568977355957 }, { "auxiliary_loss_clip": 0.01416156, "auxiliary_loss_mlp": 0.01029651, "balance_loss_clip": 1.05253935, "balance_loss_mlp": 1.0205791, "epoch": 0.17327000541093007, "flos": 23368186972800.0, "grad_norm": 1.9110900615034798, "language_loss": 0.77854097, "learning_rate": 3.7886486101949463e-06, "loss": 0.80299902, "num_input_tokens_seen": 30460430, "step": 1441, "time_per_iteration": 2.8128662109375 }, { "auxiliary_loss_clip": 0.01359489, "auxiliary_loss_mlp": 0.01033202, "balance_loss_clip": 1.05326009, "balance_loss_mlp": 1.02501774, "epoch": 0.17339024830156918, "flos": 18221290139520.0, "grad_norm": 7.54654069924172, "language_loss": 0.88405758, "learning_rate": 3.7882999475799594e-06, "loss": 0.9079845, "num_input_tokens_seen": 30478465, "step": 1442, "time_per_iteration": 2.762700319290161 }, { "auxiliary_loss_clip": 0.01411436, "auxiliary_loss_mlp": 0.01032719, "balance_loss_clip": 1.05333281, "balance_loss_mlp": 1.02327204, "epoch": 0.17351049119220827, "flos": 23332024955520.0, "grad_norm": 2.0511615383156094, "language_loss": 0.81366193, "learning_rate": 3.787951013681293e-06, "loss": 0.83810353, "num_input_tokens_seen": 30496510, "step": 1443, "time_per_iteration": 2.786680221557617 }, { "auxiliary_loss_clip": 0.01279086, "auxiliary_loss_mlp": 0.010364, "balance_loss_clip": 1.05981517, "balance_loss_mlp": 1.02626693, "epoch": 0.17363073408284735, "flos": 23803495896960.0, "grad_norm": 2.8292372765244984, "language_loss": 0.77748531, "learning_rate": 3.787601808551879e-06, "loss": 0.80064017, "num_input_tokens_seen": 30516325, "step": 1444, "time_per_iteration": 4.357468605041504 }, { "auxiliary_loss_clip": 0.0131381, "auxiliary_loss_mlp": 0.01041944, "balance_loss_clip": 1.05935144, "balance_loss_mlp": 1.0326097, "epoch": 0.17375097697348643, "flos": 18515290959360.0, "grad_norm": 2.3494301329433216, "language_loss": 0.84023106, "learning_rate": 3.7872523322446926e-06, "loss": 0.8637886, "num_input_tokens_seen": 30535210, "step": 1445, "time_per_iteration": 2.7460668087005615 }, { "auxiliary_loss_clip": 0.01429757, "auxiliary_loss_mlp": 0.01032326, "balance_loss_clip": 1.05117369, "balance_loss_mlp": 1.02283072, "epoch": 0.17387121986412554, "flos": 38877897456000.0, "grad_norm": 2.158983530936075, "language_loss": 0.60267901, "learning_rate": 3.7869025848127478e-06, "loss": 0.62729985, "num_input_tokens_seen": 30559405, "step": 1446, "time_per_iteration": 2.9756698608398438 }, { "auxiliary_loss_clip": 0.01280821, "auxiliary_loss_mlp": 0.01034756, "balance_loss_clip": 1.0599947, "balance_loss_mlp": 1.02549386, "epoch": 0.17399146275476463, "flos": 20375714960640.0, "grad_norm": 2.3422628008226907, "language_loss": 0.80590487, "learning_rate": 3.786552566309102e-06, "loss": 0.82906067, "num_input_tokens_seen": 30577615, "step": 1447, "time_per_iteration": 2.6814067363739014 }, { "auxiliary_loss_clip": 0.01323468, "auxiliary_loss_mlp": 0.02577754, "balance_loss_clip": 1.06352997, "balance_loss_mlp": 1.00016236, "epoch": 0.1741117056454037, "flos": 19164339763200.0, "grad_norm": 2.4694341390728476, "language_loss": 0.86268437, "learning_rate": 3.7862022767868517e-06, "loss": 0.90169656, "num_input_tokens_seen": 30595205, "step": 1448, "time_per_iteration": 2.6798505783081055 }, { "auxiliary_loss_clip": 0.01380991, "auxiliary_loss_mlp": 0.01035738, "balance_loss_clip": 1.06357431, "balance_loss_mlp": 1.02664852, "epoch": 0.17423194853604282, "flos": 25374300537600.0, "grad_norm": 2.309479643719411, "language_loss": 0.84343112, "learning_rate": 3.7858517162991367e-06, "loss": 0.86759841, "num_input_tokens_seen": 30615280, "step": 1449, "time_per_iteration": 4.5342793464660645 }, { "auxiliary_loss_clip": 0.01381007, "auxiliary_loss_mlp": 0.01037722, "balance_loss_clip": 1.05650949, "balance_loss_mlp": 1.02828097, "epoch": 0.1743521914266819, "flos": 25191874339200.0, "grad_norm": 2.428141744567282, "language_loss": 0.60936451, "learning_rate": 3.7855008848991363e-06, "loss": 0.63355184, "num_input_tokens_seen": 30633485, "step": 1450, "time_per_iteration": 2.87052059173584 }, { "auxiliary_loss_clip": 0.01324713, "auxiliary_loss_mlp": 0.01032673, "balance_loss_clip": 1.06288934, "balance_loss_mlp": 1.02365494, "epoch": 0.17447243431732098, "flos": 25666577504640.0, "grad_norm": 1.9383111455127349, "language_loss": 0.77895725, "learning_rate": 3.7851497826400714e-06, "loss": 0.80253106, "num_input_tokens_seen": 30653625, "step": 1451, "time_per_iteration": 2.9122369289398193 }, { "auxiliary_loss_clip": 0.01230362, "auxiliary_loss_mlp": 0.0103688, "balance_loss_clip": 1.06435442, "balance_loss_mlp": 1.02710462, "epoch": 0.17459267720796007, "flos": 36281950657920.0, "grad_norm": 1.8451749998410498, "language_loss": 0.76041055, "learning_rate": 3.7847984095752034e-06, "loss": 0.78308296, "num_input_tokens_seen": 30677080, "step": 1452, "time_per_iteration": 3.7654407024383545 }, { "auxiliary_loss_clip": 0.01226718, "auxiliary_loss_mlp": 0.01030387, "balance_loss_clip": 1.06063378, "balance_loss_mlp": 1.02087402, "epoch": 0.17471292009859918, "flos": 20011113959040.0, "grad_norm": 2.2462920665764208, "language_loss": 0.80244434, "learning_rate": 3.784446765757836e-06, "loss": 0.82501543, "num_input_tokens_seen": 30695725, "step": 1453, "time_per_iteration": 2.63551926612854 }, { "auxiliary_loss_clip": 0.0136218, "auxiliary_loss_mlp": 0.01032053, "balance_loss_clip": 1.05565524, "balance_loss_mlp": 1.02250457, "epoch": 0.17483316298923826, "flos": 27819242559360.0, "grad_norm": 5.9131343569252115, "language_loss": 0.77814299, "learning_rate": 3.7840948512413133e-06, "loss": 0.80208534, "num_input_tokens_seen": 30713310, "step": 1454, "time_per_iteration": 2.7685084342956543 }, { "auxiliary_loss_clip": 0.01377878, "auxiliary_loss_mlp": 0.01036984, "balance_loss_clip": 1.06110775, "balance_loss_mlp": 1.02738714, "epoch": 0.17495340587987734, "flos": 44017934791680.0, "grad_norm": 1.8663053936351666, "language_loss": 0.78624678, "learning_rate": 3.7837426660790196e-06, "loss": 0.81039536, "num_input_tokens_seen": 30734725, "step": 1455, "time_per_iteration": 2.924888849258423 }, { "auxiliary_loss_clip": 0.01226096, "auxiliary_loss_mlp": 0.01035743, "balance_loss_clip": 1.06146288, "balance_loss_mlp": 1.02673697, "epoch": 0.17507364877051645, "flos": 20885825957760.0, "grad_norm": 2.1751820922183778, "language_loss": 0.82056499, "learning_rate": 3.783390210324382e-06, "loss": 0.8431834, "num_input_tokens_seen": 30754450, "step": 1456, "time_per_iteration": 2.6471166610717773 }, { "auxiliary_loss_clip": 0.01309027, "auxiliary_loss_mlp": 0.01029267, "balance_loss_clip": 1.05836439, "balance_loss_mlp": 1.02049327, "epoch": 0.17519389166115554, "flos": 24717602136960.0, "grad_norm": 1.9389041578147124, "language_loss": 0.72767115, "learning_rate": 3.7830374840308676e-06, "loss": 0.75105405, "num_input_tokens_seen": 30774605, "step": 1457, "time_per_iteration": 2.7452762126922607 }, { "auxiliary_loss_clip": 0.01279798, "auxiliary_loss_mlp": 0.01030999, "balance_loss_clip": 1.06423616, "balance_loss_mlp": 1.02156973, "epoch": 0.17531413455179462, "flos": 23798144770560.0, "grad_norm": 3.0841208009712586, "language_loss": 0.82497811, "learning_rate": 3.7826844872519842e-06, "loss": 0.84808606, "num_input_tokens_seen": 30792460, "step": 1458, "time_per_iteration": 2.6837852001190186 }, { "auxiliary_loss_clip": 0.01324512, "auxiliary_loss_mlp": 0.0103364, "balance_loss_clip": 1.06009066, "balance_loss_mlp": 1.02422881, "epoch": 0.1754343774424337, "flos": 24572379450240.0, "grad_norm": 5.234716046956519, "language_loss": 0.73220539, "learning_rate": 3.782331220041282e-06, "loss": 0.75578701, "num_input_tokens_seen": 30812525, "step": 1459, "time_per_iteration": 2.7159008979797363 }, { "auxiliary_loss_clip": 0.01322, "auxiliary_loss_mlp": 0.01032036, "balance_loss_clip": 1.05945849, "balance_loss_mlp": 1.02302384, "epoch": 0.17555462033307281, "flos": 18114599767680.0, "grad_norm": 2.3606295721581856, "language_loss": 0.82972789, "learning_rate": 3.7819776824523504e-06, "loss": 0.85326827, "num_input_tokens_seen": 30830390, "step": 1460, "time_per_iteration": 2.7380359172821045 }, { "auxiliary_loss_clip": 0.01336112, "auxiliary_loss_mlp": 0.01030694, "balance_loss_clip": 1.05964303, "balance_loss_mlp": 1.02169955, "epoch": 0.1756748632237119, "flos": 28366018364160.0, "grad_norm": 2.092563332977659, "language_loss": 0.8420521, "learning_rate": 3.7816238745388213e-06, "loss": 0.86572015, "num_input_tokens_seen": 30849935, "step": 1461, "time_per_iteration": 2.7182483673095703 }, { "auxiliary_loss_clip": 0.0133174, "auxiliary_loss_mlp": 0.01037427, "balance_loss_clip": 1.05693817, "balance_loss_mlp": 1.02833712, "epoch": 0.17579510611435098, "flos": 25732939881600.0, "grad_norm": 2.7356513669948233, "language_loss": 0.87058604, "learning_rate": 3.781269796354367e-06, "loss": 0.89427769, "num_input_tokens_seen": 30869555, "step": 1462, "time_per_iteration": 2.72864031791687 }, { "auxiliary_loss_clip": 0.01327602, "auxiliary_loss_mlp": 0.01038699, "balance_loss_clip": 1.05981028, "balance_loss_mlp": 1.02959692, "epoch": 0.1759153490049901, "flos": 18588081870720.0, "grad_norm": 6.5309277094432066, "language_loss": 0.86036015, "learning_rate": 3.7809154479527006e-06, "loss": 0.88402319, "num_input_tokens_seen": 30888760, "step": 1463, "time_per_iteration": 2.706270933151245 }, { "auxiliary_loss_clip": 0.01370317, "auxiliary_loss_mlp": 0.01030407, "balance_loss_clip": 1.05862761, "balance_loss_mlp": 1.02120447, "epoch": 0.17603559189562917, "flos": 18619323724800.0, "grad_norm": 2.482541566122146, "language_loss": 0.84143746, "learning_rate": 3.780560829387577e-06, "loss": 0.86544466, "num_input_tokens_seen": 30907260, "step": 1464, "time_per_iteration": 2.7423558235168457 }, { "auxiliary_loss_clip": 0.01183969, "auxiliary_loss_mlp": 0.01001732, "balance_loss_clip": 1.03580415, "balance_loss_mlp": 0.99988431, "epoch": 0.17615583478626826, "flos": 60530775373440.0, "grad_norm": 0.8550779020794265, "language_loss": 0.57860905, "learning_rate": 3.7802059407127915e-06, "loss": 0.60046607, "num_input_tokens_seen": 30965810, "step": 1465, "time_per_iteration": 3.272942066192627 }, { "auxiliary_loss_clip": 0.01325191, "auxiliary_loss_mlp": 0.01035508, "balance_loss_clip": 1.05676091, "balance_loss_mlp": 1.02654338, "epoch": 0.17627607767690734, "flos": 23616221362560.0, "grad_norm": 2.75144633195439, "language_loss": 0.86459816, "learning_rate": 3.7798507819821797e-06, "loss": 0.88820517, "num_input_tokens_seen": 30982935, "step": 1466, "time_per_iteration": 2.75236177444458 }, { "auxiliary_loss_clip": 0.01368655, "auxiliary_loss_mlp": 0.01034988, "balance_loss_clip": 1.05847907, "balance_loss_mlp": 1.02532029, "epoch": 0.17639632056754645, "flos": 17639070589440.0, "grad_norm": 2.7058859849176984, "language_loss": 0.79429662, "learning_rate": 3.7794953532496197e-06, "loss": 0.81833303, "num_input_tokens_seen": 30998840, "step": 1467, "time_per_iteration": 2.7081191539764404 }, { "auxiliary_loss_clip": 0.01328593, "auxiliary_loss_mlp": 0.02533877, "balance_loss_clip": 1.0282867, "balance_loss_mlp": 1.00021386, "epoch": 0.17651656345818553, "flos": 57932604910080.0, "grad_norm": 0.8551365450203078, "language_loss": 0.57899261, "learning_rate": 3.7791396545690295e-06, "loss": 0.61761737, "num_input_tokens_seen": 31060075, "step": 1468, "time_per_iteration": 3.3379290103912354 }, { "auxiliary_loss_clip": 0.01279255, "auxiliary_loss_mlp": 0.0103494, "balance_loss_clip": 1.06454599, "balance_loss_mlp": 1.0253377, "epoch": 0.17663680634882462, "flos": 22929502170240.0, "grad_norm": 4.09781220569232, "language_loss": 0.80684823, "learning_rate": 3.7787836859943685e-06, "loss": 0.82999021, "num_input_tokens_seen": 31078800, "step": 1469, "time_per_iteration": 2.825531244277954 }, { "auxiliary_loss_clip": 0.01279296, "auxiliary_loss_mlp": 0.01029743, "balance_loss_clip": 1.06315148, "balance_loss_mlp": 1.02018869, "epoch": 0.17675704923946373, "flos": 22637979388800.0, "grad_norm": 3.454837933064007, "language_loss": 0.79401231, "learning_rate": 3.7784274475796363e-06, "loss": 0.81710267, "num_input_tokens_seen": 31097430, "step": 1470, "time_per_iteration": 3.9424755573272705 }, { "auxiliary_loss_clip": 0.0131137, "auxiliary_loss_mlp": 0.01037808, "balance_loss_clip": 1.05769062, "balance_loss_mlp": 1.0275383, "epoch": 0.1768772921301028, "flos": 27126525795840.0, "grad_norm": 2.175526359020434, "language_loss": 0.76652741, "learning_rate": 3.7780709393788745e-06, "loss": 0.79001927, "num_input_tokens_seen": 31117905, "step": 1471, "time_per_iteration": 2.7441909313201904 }, { "auxiliary_loss_clip": 0.01231478, "auxiliary_loss_mlp": 0.01041386, "balance_loss_clip": 1.06627774, "balance_loss_mlp": 1.03264189, "epoch": 0.1769975350207419, "flos": 19172133014400.0, "grad_norm": 3.984130105632141, "language_loss": 0.75012577, "learning_rate": 3.777714161446165e-06, "loss": 0.77285445, "num_input_tokens_seen": 31137610, "step": 1472, "time_per_iteration": 2.667877197265625 }, { "auxiliary_loss_clip": 0.0128352, "auxiliary_loss_mlp": 0.01036374, "balance_loss_clip": 1.06637788, "balance_loss_mlp": 1.02768397, "epoch": 0.177117777911381, "flos": 36134932291200.0, "grad_norm": 3.441428095454451, "language_loss": 0.69460058, "learning_rate": 3.7773571138356304e-06, "loss": 0.71779954, "num_input_tokens_seen": 31157780, "step": 1473, "time_per_iteration": 2.7804720401763916 }, { "auxiliary_loss_clip": 0.01417972, "auxiliary_loss_mlp": 0.010243, "balance_loss_clip": 1.05823505, "balance_loss_mlp": 1.01508522, "epoch": 0.17723802080202009, "flos": 22090593052800.0, "grad_norm": 2.93087618770942, "language_loss": 0.89364392, "learning_rate": 3.776999796601435e-06, "loss": 0.91806662, "num_input_tokens_seen": 31176540, "step": 1474, "time_per_iteration": 2.779010772705078 }, { "auxiliary_loss_clip": 0.01282214, "auxiliary_loss_mlp": 0.01033518, "balance_loss_clip": 1.06229401, "balance_loss_mlp": 1.02456534, "epoch": 0.17735826369265917, "flos": 30222671437440.0, "grad_norm": 2.0777515325293487, "language_loss": 0.72702146, "learning_rate": 3.776642209797783e-06, "loss": 0.75017881, "num_input_tokens_seen": 31198370, "step": 1475, "time_per_iteration": 4.547362804412842 }, { "auxiliary_loss_clip": 0.0127396, "auxiliary_loss_mlp": 0.01030903, "balance_loss_clip": 1.05999541, "balance_loss_mlp": 1.02158737, "epoch": 0.17747850658329825, "flos": 21397588980480.0, "grad_norm": 2.598507690838947, "language_loss": 0.78010935, "learning_rate": 3.7762843534789205e-06, "loss": 0.80315799, "num_input_tokens_seen": 31217120, "step": 1476, "time_per_iteration": 2.6544530391693115 }, { "auxiliary_loss_clip": 0.01333503, "auxiliary_loss_mlp": 0.01042011, "balance_loss_clip": 1.05945373, "balance_loss_mlp": 1.03259945, "epoch": 0.17759874947393736, "flos": 16983341856000.0, "grad_norm": 2.9584299073374756, "language_loss": 0.8837437, "learning_rate": 3.7759262276991343e-06, "loss": 0.90749884, "num_input_tokens_seen": 31234730, "step": 1477, "time_per_iteration": 2.7369773387908936 }, { "auxiliary_loss_clip": 0.01336548, "auxiliary_loss_mlp": 0.01033024, "balance_loss_clip": 1.06436145, "balance_loss_mlp": 1.02320075, "epoch": 0.17771899236457644, "flos": 11546107390080.0, "grad_norm": 2.3484327079737475, "language_loss": 0.81126523, "learning_rate": 3.7755678325127506e-06, "loss": 0.83496094, "num_input_tokens_seen": 31252410, "step": 1478, "time_per_iteration": 3.569138765335083 }, { "auxiliary_loss_clip": 0.01424315, "auxiliary_loss_mlp": 0.0103567, "balance_loss_clip": 1.06265521, "balance_loss_mlp": 1.02626491, "epoch": 0.17783923525521553, "flos": 18807747494400.0, "grad_norm": 1.706897067019515, "language_loss": 0.75820994, "learning_rate": 3.7752091679741393e-06, "loss": 0.78280979, "num_input_tokens_seen": 31270200, "step": 1479, "time_per_iteration": 2.767169237136841 }, { "auxiliary_loss_clip": 0.01278074, "auxiliary_loss_mlp": 0.01032693, "balance_loss_clip": 1.06252933, "balance_loss_mlp": 1.02276897, "epoch": 0.17795947814585464, "flos": 30408365773440.0, "grad_norm": 3.2580270892307, "language_loss": 0.7714209, "learning_rate": 3.774850234137708e-06, "loss": 0.7945286, "num_input_tokens_seen": 31287495, "step": 1480, "time_per_iteration": 2.746973752975464 }, { "auxiliary_loss_clip": 0.01277366, "auxiliary_loss_mlp": 0.01033607, "balance_loss_clip": 1.06201053, "balance_loss_mlp": 1.02490473, "epoch": 0.17807972103649372, "flos": 24389055411840.0, "grad_norm": 2.2510960557049056, "language_loss": 0.82458127, "learning_rate": 3.7744910310579076e-06, "loss": 0.84769094, "num_input_tokens_seen": 31306420, "step": 1481, "time_per_iteration": 2.6865270137786865 }, { "auxiliary_loss_clip": 0.01228513, "auxiliary_loss_mlp": 0.01032905, "balance_loss_clip": 1.06552815, "balance_loss_mlp": 1.02451277, "epoch": 0.1781999639271328, "flos": 20301559332480.0, "grad_norm": 2.0675181027770204, "language_loss": 0.84833181, "learning_rate": 3.774131558789229e-06, "loss": 0.87094605, "num_input_tokens_seen": 31325750, "step": 1482, "time_per_iteration": 2.6404054164886475 }, { "auxiliary_loss_clip": 0.01230497, "auxiliary_loss_mlp": 0.0257497, "balance_loss_clip": 1.06610131, "balance_loss_mlp": 1.00002873, "epoch": 0.1783202068177719, "flos": 15924479806080.0, "grad_norm": 2.375245839528329, "language_loss": 0.69722152, "learning_rate": 3.773771817386203e-06, "loss": 0.73527616, "num_input_tokens_seen": 31343080, "step": 1483, "time_per_iteration": 2.6037697792053223 }, { "auxiliary_loss_clip": 0.01324356, "auxiliary_loss_mlp": 0.01032946, "balance_loss_clip": 1.05843139, "balance_loss_mlp": 1.02367175, "epoch": 0.178440449708411, "flos": 20631758083200.0, "grad_norm": 1.7506668385089612, "language_loss": 0.79284656, "learning_rate": 3.773411806903403e-06, "loss": 0.8164196, "num_input_tokens_seen": 31362160, "step": 1484, "time_per_iteration": 2.7362802028656006 }, { "auxiliary_loss_clip": 0.01415006, "auxiliary_loss_mlp": 0.01030874, "balance_loss_clip": 1.05692434, "balance_loss_mlp": 1.02250516, "epoch": 0.17856069259905008, "flos": 21686059105920.0, "grad_norm": 2.2418675071114933, "language_loss": 0.94888389, "learning_rate": 3.7730515273954415e-06, "loss": 0.97334266, "num_input_tokens_seen": 31380770, "step": 1485, "time_per_iteration": 2.820146083831787 }, { "auxiliary_loss_clip": 0.01227595, "auxiliary_loss_mlp": 0.01036676, "balance_loss_clip": 1.06530988, "balance_loss_mlp": 1.02785408, "epoch": 0.17868093548968916, "flos": 26572962320640.0, "grad_norm": 2.145603132708544, "language_loss": 0.84891021, "learning_rate": 3.772690978916973e-06, "loss": 0.87155294, "num_input_tokens_seen": 31400525, "step": 1486, "time_per_iteration": 2.7018401622772217 }, { "auxiliary_loss_clip": 0.01277051, "auxiliary_loss_mlp": 0.01032149, "balance_loss_clip": 1.06218028, "balance_loss_mlp": 1.02205753, "epoch": 0.17880117838032827, "flos": 18581006891520.0, "grad_norm": 2.146817649113855, "language_loss": 0.86520934, "learning_rate": 3.772330161522693e-06, "loss": 0.88830137, "num_input_tokens_seen": 31418435, "step": 1487, "time_per_iteration": 2.5915093421936035 }, { "auxiliary_loss_clip": 0.01325626, "auxiliary_loss_mlp": 0.01031545, "balance_loss_clip": 1.06602716, "balance_loss_mlp": 1.02179337, "epoch": 0.17892142127096736, "flos": 26541217676160.0, "grad_norm": 2.1058117640023535, "language_loss": 0.80232644, "learning_rate": 3.7719690752673365e-06, "loss": 0.82589811, "num_input_tokens_seen": 31439230, "step": 1488, "time_per_iteration": 2.6561591625213623 }, { "auxiliary_loss_clip": 0.01376163, "auxiliary_loss_mlp": 0.01030897, "balance_loss_clip": 1.06050229, "balance_loss_mlp": 1.02243304, "epoch": 0.17904166416160644, "flos": 23872623621120.0, "grad_norm": 2.182595468529718, "language_loss": 0.78594196, "learning_rate": 3.7716077202056796e-06, "loss": 0.81001258, "num_input_tokens_seen": 31457705, "step": 1489, "time_per_iteration": 2.676953077316284 }, { "auxiliary_loss_clip": 0.01319287, "auxiliary_loss_mlp": 0.01036791, "balance_loss_clip": 1.06159067, "balance_loss_mlp": 1.02798772, "epoch": 0.17916190705224552, "flos": 19134426712320.0, "grad_norm": 2.4980134345616536, "language_loss": 0.93483341, "learning_rate": 3.7712460963925404e-06, "loss": 0.95839423, "num_input_tokens_seen": 31473645, "step": 1490, "time_per_iteration": 2.73061203956604 }, { "auxiliary_loss_clip": 0.01325806, "auxiliary_loss_mlp": 0.01028861, "balance_loss_clip": 1.06025743, "balance_loss_mlp": 1.02021813, "epoch": 0.17928214994288463, "flos": 25152120961920.0, "grad_norm": 21.113277242333755, "language_loss": 0.75687009, "learning_rate": 3.7708842038827775e-06, "loss": 0.78041679, "num_input_tokens_seen": 31492605, "step": 1491, "time_per_iteration": 2.732940435409546 }, { "auxiliary_loss_clip": 0.01275935, "auxiliary_loss_mlp": 0.01037, "balance_loss_clip": 1.06141734, "balance_loss_mlp": 1.02812457, "epoch": 0.17940239283352372, "flos": 22384629786240.0, "grad_norm": 1.9439926822514368, "language_loss": 0.86165112, "learning_rate": 3.770522042731288e-06, "loss": 0.88478047, "num_input_tokens_seen": 31514500, "step": 1492, "time_per_iteration": 2.774278402328491 }, { "auxiliary_loss_clip": 0.01420072, "auxiliary_loss_mlp": 0.01037153, "balance_loss_clip": 1.0578171, "balance_loss_mlp": 1.02764058, "epoch": 0.1795226357241628, "flos": 23178685795200.0, "grad_norm": 2.4550559052659797, "language_loss": 0.88114321, "learning_rate": 3.7701596129930122e-06, "loss": 0.90571541, "num_input_tokens_seen": 31533225, "step": 1493, "time_per_iteration": 2.791640520095825 }, { "auxiliary_loss_clip": 0.01315812, "auxiliary_loss_mlp": 0.01036042, "balance_loss_clip": 1.06088328, "balance_loss_mlp": 1.02698779, "epoch": 0.1796428786148019, "flos": 22090413484800.0, "grad_norm": 2.2327119589176863, "language_loss": 0.73243594, "learning_rate": 3.7697969147229315e-06, "loss": 0.7559545, "num_input_tokens_seen": 31551385, "step": 1494, "time_per_iteration": 2.7167084217071533 }, { "auxiliary_loss_clip": 0.01279408, "auxiliary_loss_mlp": 0.01035046, "balance_loss_clip": 1.06406069, "balance_loss_mlp": 1.02680862, "epoch": 0.179763121505441, "flos": 21324618501120.0, "grad_norm": 2.603731084848324, "language_loss": 0.85635018, "learning_rate": 3.7694339479760647e-06, "loss": 0.87949467, "num_input_tokens_seen": 31570415, "step": 1495, "time_per_iteration": 2.6924164295196533 }, { "auxiliary_loss_clip": 0.01235765, "auxiliary_loss_mlp": 0.01000301, "balance_loss_clip": 1.03686571, "balance_loss_mlp": 0.99840564, "epoch": 0.17988336439608008, "flos": 68161864815360.0, "grad_norm": 0.7704122207748717, "language_loss": 0.57317054, "learning_rate": 3.769070712807476e-06, "loss": 0.59553123, "num_input_tokens_seen": 31632445, "step": 1496, "time_per_iteration": 4.483577013015747 }, { "auxiliary_loss_clip": 0.01464299, "auxiliary_loss_mlp": 0.01043424, "balance_loss_clip": 1.05487633, "balance_loss_mlp": 1.03425062, "epoch": 0.18000360728671919, "flos": 21945047143680.0, "grad_norm": 2.377599223397255, "language_loss": 0.79074788, "learning_rate": 3.768707209272266e-06, "loss": 0.8158251, "num_input_tokens_seen": 31652575, "step": 1497, "time_per_iteration": 2.7813940048217773 }, { "auxiliary_loss_clip": 0.01325336, "auxiliary_loss_mlp": 0.01033055, "balance_loss_clip": 1.05929589, "balance_loss_mlp": 1.02384579, "epoch": 0.18012385017735827, "flos": 18986330937600.0, "grad_norm": 2.8792183391443995, "language_loss": 0.76986337, "learning_rate": 3.768343437425579e-06, "loss": 0.79344738, "num_input_tokens_seen": 31671145, "step": 1498, "time_per_iteration": 2.6971933841705322 }, { "auxiliary_loss_clip": 0.0152518, "auxiliary_loss_mlp": 0.01041441, "balance_loss_clip": 1.05379033, "balance_loss_mlp": 1.0318445, "epoch": 0.18024409306799735, "flos": 19748103598080.0, "grad_norm": 3.4281237012024977, "language_loss": 0.86073148, "learning_rate": 3.7679793973225987e-06, "loss": 0.88639772, "num_input_tokens_seen": 31686955, "step": 1499, "time_per_iteration": 2.8400216102600098 }, { "auxiliary_loss_clip": 0.01330292, "auxiliary_loss_mlp": 0.0100339, "balance_loss_clip": 1.02846074, "balance_loss_mlp": 1.00150609, "epoch": 0.18036433595863643, "flos": 67227183060480.0, "grad_norm": 0.8535107532193602, "language_loss": 0.6158998, "learning_rate": 3.767615089018549e-06, "loss": 0.63923669, "num_input_tokens_seen": 31749300, "step": 1500, "time_per_iteration": 3.4407029151916504 }, { "auxiliary_loss_clip": 0.01321488, "auxiliary_loss_mlp": 0.01032341, "balance_loss_clip": 1.06123066, "balance_loss_mlp": 1.02304316, "epoch": 0.18048457884927555, "flos": 18181464935040.0, "grad_norm": 2.1523205918471113, "language_loss": 0.86611426, "learning_rate": 3.7672505125686966e-06, "loss": 0.88965255, "num_input_tokens_seen": 31765665, "step": 1501, "time_per_iteration": 4.705912828445435 }, { "auxiliary_loss_clip": 0.01422103, "auxiliary_loss_mlp": 0.01034116, "balance_loss_clip": 1.05455267, "balance_loss_mlp": 1.02449608, "epoch": 0.18060482173991463, "flos": 15813767111040.0, "grad_norm": 3.3889831094584224, "language_loss": 0.84828508, "learning_rate": 3.7668856680283455e-06, "loss": 0.87284732, "num_input_tokens_seen": 31782690, "step": 1502, "time_per_iteration": 2.8574633598327637 }, { "auxiliary_loss_clip": 0.01334648, "auxiliary_loss_mlp": 0.01031809, "balance_loss_clip": 1.06349707, "balance_loss_mlp": 1.02294552, "epoch": 0.1807250646305537, "flos": 18587399512320.0, "grad_norm": 1.894444001222212, "language_loss": 0.82782638, "learning_rate": 3.7665205554528437e-06, "loss": 0.85149097, "num_input_tokens_seen": 31802045, "step": 1503, "time_per_iteration": 3.7274699211120605 }, { "auxiliary_loss_clip": 0.01323054, "auxiliary_loss_mlp": 0.01035062, "balance_loss_clip": 1.06204021, "balance_loss_mlp": 1.02591252, "epoch": 0.18084530752119282, "flos": 23149131880320.0, "grad_norm": 1.8444919059563152, "language_loss": 0.74240965, "learning_rate": 3.7661551748975782e-06, "loss": 0.76599079, "num_input_tokens_seen": 31820220, "step": 1504, "time_per_iteration": 2.7413785457611084 }, { "auxiliary_loss_clip": 0.01230291, "auxiliary_loss_mlp": 0.01006947, "balance_loss_clip": 1.03304541, "balance_loss_mlp": 1.00501585, "epoch": 0.1809655504118319, "flos": 59803153568640.0, "grad_norm": 0.807651729145999, "language_loss": 0.60462391, "learning_rate": 3.7657895264179772e-06, "loss": 0.62699634, "num_input_tokens_seen": 31876195, "step": 1505, "time_per_iteration": 3.2627556324005127 }, { "auxiliary_loss_clip": 0.01252354, "auxiliary_loss_mlp": 0.01029796, "balance_loss_clip": 1.05801511, "balance_loss_mlp": 1.02038455, "epoch": 0.181085793302471, "flos": 44201941188480.0, "grad_norm": 1.9605460161251065, "language_loss": 0.74890459, "learning_rate": 3.765423610069509e-06, "loss": 0.77172601, "num_input_tokens_seen": 31901585, "step": 1506, "time_per_iteration": 2.9200267791748047 }, { "auxiliary_loss_clip": 0.0132971, "auxiliary_loss_mlp": 0.01037796, "balance_loss_clip": 1.06527495, "balance_loss_mlp": 1.02856946, "epoch": 0.18120603619311007, "flos": 34898384638080.0, "grad_norm": 1.8586833885302507, "language_loss": 0.72465575, "learning_rate": 3.765057425907683e-06, "loss": 0.74833083, "num_input_tokens_seen": 31923045, "step": 1507, "time_per_iteration": 2.823455572128296 }, { "auxiliary_loss_clip": 0.01281601, "auxiliary_loss_mlp": 0.01030099, "balance_loss_clip": 1.0624876, "balance_loss_mlp": 1.02076519, "epoch": 0.18132627908374918, "flos": 21506757390720.0, "grad_norm": 1.8608123048885692, "language_loss": 0.78719658, "learning_rate": 3.764690973988048e-06, "loss": 0.81031358, "num_input_tokens_seen": 31943385, "step": 1508, "time_per_iteration": 2.6938164234161377 }, { "auxiliary_loss_clip": 0.01312622, "auxiliary_loss_mlp": 0.01035017, "balance_loss_clip": 1.06147575, "balance_loss_mlp": 1.02615356, "epoch": 0.18144652197438826, "flos": 29057693633280.0, "grad_norm": 1.8445285071634894, "language_loss": 0.73922729, "learning_rate": 3.7643242543661967e-06, "loss": 0.76270372, "num_input_tokens_seen": 31966045, "step": 1509, "time_per_iteration": 2.8014402389526367 }, { "auxiliary_loss_clip": 0.01219825, "auxiliary_loss_mlp": 0.01009274, "balance_loss_clip": 1.02712035, "balance_loss_mlp": 1.00750983, "epoch": 0.18156676486502735, "flos": 68675064382080.0, "grad_norm": 0.8902578050279709, "language_loss": 0.60464346, "learning_rate": 3.7639572670977573e-06, "loss": 0.62693447, "num_input_tokens_seen": 32021540, "step": 1510, "time_per_iteration": 3.122131109237671 }, { "auxiliary_loss_clip": 0.01380819, "auxiliary_loss_mlp": 0.01030365, "balance_loss_clip": 1.0601542, "balance_loss_mlp": 1.02110267, "epoch": 0.18168700775566646, "flos": 26471515334400.0, "grad_norm": 1.7432783812717951, "language_loss": 0.76802933, "learning_rate": 3.7635900122384042e-06, "loss": 0.79214108, "num_input_tokens_seen": 32044535, "step": 1511, "time_per_iteration": 2.9694597721099854 }, { "auxiliary_loss_clip": 0.01259324, "auxiliary_loss_mlp": 0.01032201, "balance_loss_clip": 1.05944681, "balance_loss_mlp": 1.02214003, "epoch": 0.18180725064630554, "flos": 15005668884480.0, "grad_norm": 2.1993748090054592, "language_loss": 0.87071532, "learning_rate": 3.7632224898438477e-06, "loss": 0.89363062, "num_input_tokens_seen": 32061010, "step": 1512, "time_per_iteration": 2.6210787296295166 }, { "auxiliary_loss_clip": 0.01374456, "auxiliary_loss_mlp": 0.01033062, "balance_loss_clip": 1.05495, "balance_loss_mlp": 1.02475929, "epoch": 0.18192749353694462, "flos": 19682387665920.0, "grad_norm": 1.6299524774624237, "language_loss": 0.78973544, "learning_rate": 3.762854699969842e-06, "loss": 0.81381065, "num_input_tokens_seen": 32081520, "step": 1513, "time_per_iteration": 2.8520002365112305 }, { "auxiliary_loss_clip": 0.01274512, "auxiliary_loss_mlp": 0.01025952, "balance_loss_clip": 1.06360447, "balance_loss_mlp": 1.01757753, "epoch": 0.1820477364275837, "flos": 20702717400960.0, "grad_norm": 1.8337879940906343, "language_loss": 0.73211563, "learning_rate": 3.762486642672179e-06, "loss": 0.75512028, "num_input_tokens_seen": 32098460, "step": 1514, "time_per_iteration": 2.666271448135376 }, { "auxiliary_loss_clip": 0.01329466, "auxiliary_loss_mlp": 0.01035185, "balance_loss_clip": 1.06320477, "balance_loss_mlp": 1.02629209, "epoch": 0.18216797931822282, "flos": 17128708197120.0, "grad_norm": 6.837352379259871, "language_loss": 0.86674875, "learning_rate": 3.7621183180066946e-06, "loss": 0.89039528, "num_input_tokens_seen": 32116420, "step": 1515, "time_per_iteration": 2.679243803024292 }, { "auxiliary_loss_clip": 0.01327198, "auxiliary_loss_mlp": 0.01032149, "balance_loss_clip": 1.0589962, "balance_loss_mlp": 1.02317834, "epoch": 0.1822882222088619, "flos": 29242561956480.0, "grad_norm": 1.555622723963142, "language_loss": 0.74022561, "learning_rate": 3.7617497260292625e-06, "loss": 0.7638191, "num_input_tokens_seen": 32138475, "step": 1516, "time_per_iteration": 2.794055223464966 }, { "auxiliary_loss_clip": 0.01326379, "auxiliary_loss_mlp": 0.01039129, "balance_loss_clip": 1.06300831, "balance_loss_mlp": 1.02898479, "epoch": 0.18240846509950098, "flos": 17702739446400.0, "grad_norm": 3.440362490795796, "language_loss": 0.78786051, "learning_rate": 3.7613808667957967e-06, "loss": 0.81151557, "num_input_tokens_seen": 32151165, "step": 1517, "time_per_iteration": 2.639190673828125 }, { "auxiliary_loss_clip": 0.01325516, "auxiliary_loss_mlp": 0.01033521, "balance_loss_clip": 1.05935669, "balance_loss_mlp": 1.0238235, "epoch": 0.1825287079901401, "flos": 14790025584000.0, "grad_norm": 2.4320221355112346, "language_loss": 0.91265273, "learning_rate": 3.7610117403622547e-06, "loss": 0.93624306, "num_input_tokens_seen": 32167725, "step": 1518, "time_per_iteration": 2.6486666202545166 }, { "auxiliary_loss_clip": 0.01374512, "auxiliary_loss_mlp": 0.01034596, "balance_loss_clip": 1.057302, "balance_loss_mlp": 1.02586424, "epoch": 0.18264895088077918, "flos": 21946232292480.0, "grad_norm": 1.8486312174951298, "language_loss": 0.90422314, "learning_rate": 3.7606423467846313e-06, "loss": 0.92831421, "num_input_tokens_seen": 32187330, "step": 1519, "time_per_iteration": 2.764415740966797 }, { "auxiliary_loss_clip": 0.01384077, "auxiliary_loss_mlp": 0.01040867, "balance_loss_clip": 1.06224668, "balance_loss_mlp": 1.03089476, "epoch": 0.18276919377141826, "flos": 20886759711360.0, "grad_norm": 1.6012033846428653, "language_loss": 0.79535741, "learning_rate": 3.760272686118964e-06, "loss": 0.81960684, "num_input_tokens_seen": 32205550, "step": 1520, "time_per_iteration": 2.7111802101135254 }, { "auxiliary_loss_clip": 0.01325306, "auxiliary_loss_mlp": 0.01032745, "balance_loss_clip": 1.05963123, "balance_loss_mlp": 1.02451921, "epoch": 0.18288943666205737, "flos": 21469877101440.0, "grad_norm": 2.2251339963712895, "language_loss": 0.92423701, "learning_rate": 3.7599027584213297e-06, "loss": 0.94781762, "num_input_tokens_seen": 32224430, "step": 1521, "time_per_iteration": 2.6793038845062256 }, { "auxiliary_loss_clip": 0.01281577, "auxiliary_loss_mlp": 0.01036016, "balance_loss_clip": 1.06198847, "balance_loss_mlp": 1.02630079, "epoch": 0.18300967955269645, "flos": 21539363961600.0, "grad_norm": 1.9237510364129342, "language_loss": 0.78241706, "learning_rate": 3.7595325637478465e-06, "loss": 0.80559301, "num_input_tokens_seen": 32242455, "step": 1522, "time_per_iteration": 3.5520341396331787 }, { "auxiliary_loss_clip": 0.01321207, "auxiliary_loss_mlp": 0.01037334, "balance_loss_clip": 1.05958033, "balance_loss_mlp": 1.02759516, "epoch": 0.18312992244333554, "flos": 28876237102080.0, "grad_norm": 2.1851659901385907, "language_loss": 0.81681836, "learning_rate": 3.7591621021546723e-06, "loss": 0.84040374, "num_input_tokens_seen": 32264450, "step": 1523, "time_per_iteration": 2.77325701713562 }, { "auxiliary_loss_clip": 0.01273079, "auxiliary_loss_mlp": 0.01035937, "balance_loss_clip": 1.05924845, "balance_loss_mlp": 1.02667463, "epoch": 0.18325016533397462, "flos": 20120102801280.0, "grad_norm": 1.7928902046503656, "language_loss": 0.81236994, "learning_rate": 3.7587913736980062e-06, "loss": 0.83546007, "num_input_tokens_seen": 32284090, "step": 1524, "time_per_iteration": 2.6930010318756104 }, { "auxiliary_loss_clip": 0.01462155, "auxiliary_loss_mlp": 0.01035101, "balance_loss_clip": 1.05108762, "balance_loss_mlp": 1.02611256, "epoch": 0.18337040822461373, "flos": 23329187781120.0, "grad_norm": 1.8905173833146378, "language_loss": 0.84347904, "learning_rate": 3.7584203784340865e-06, "loss": 0.8684516, "num_input_tokens_seen": 32303260, "step": 1525, "time_per_iteration": 2.8417577743530273 }, { "auxiliary_loss_clip": 0.013225, "auxiliary_loss_mlp": 0.01034627, "balance_loss_clip": 1.05911112, "balance_loss_mlp": 1.02526927, "epoch": 0.1834906511152528, "flos": 25009555881600.0, "grad_norm": 1.9928666285120804, "language_loss": 0.85959959, "learning_rate": 3.7580491164191938e-06, "loss": 0.88317084, "num_input_tokens_seen": 32321570, "step": 1526, "time_per_iteration": 3.542262315750122 }, { "auxiliary_loss_clip": 0.01178038, "auxiliary_loss_mlp": 0.01001822, "balance_loss_clip": 1.0310992, "balance_loss_mlp": 0.99997389, "epoch": 0.1836108940058919, "flos": 67251493589760.0, "grad_norm": 0.7406119319739408, "language_loss": 0.61165136, "learning_rate": 3.757677587709648e-06, "loss": 0.63344997, "num_input_tokens_seen": 32384835, "step": 1527, "time_per_iteration": 4.1913769245147705 }, { "auxiliary_loss_clip": 0.01368846, "auxiliary_loss_mlp": 0.01036281, "balance_loss_clip": 1.06002259, "balance_loss_mlp": 1.02695334, "epoch": 0.183731136896531, "flos": 25738721971200.0, "grad_norm": 3.821276520090281, "language_loss": 0.7559768, "learning_rate": 3.7573057923618095e-06, "loss": 0.7800281, "num_input_tokens_seen": 32404930, "step": 1528, "time_per_iteration": 2.8863253593444824 }, { "auxiliary_loss_clip": 0.01421987, "auxiliary_loss_mlp": 0.01027926, "balance_loss_clip": 1.05286813, "balance_loss_mlp": 1.01890206, "epoch": 0.1838513797871701, "flos": 20449403712000.0, "grad_norm": 2.1144114731080488, "language_loss": 0.74298739, "learning_rate": 3.7569337304320793e-06, "loss": 0.76748651, "num_input_tokens_seen": 32424515, "step": 1529, "time_per_iteration": 2.712099075317383 }, { "auxiliary_loss_clip": 0.01219284, "auxiliary_loss_mlp": 0.01000522, "balance_loss_clip": 1.02473307, "balance_loss_mlp": 0.99880499, "epoch": 0.18397162267780917, "flos": 68565141786240.0, "grad_norm": 0.8485929580686579, "language_loss": 0.64457363, "learning_rate": 3.756561401976899e-06, "loss": 0.66677171, "num_input_tokens_seen": 32484220, "step": 1530, "time_per_iteration": 4.008121728897095 }, { "auxiliary_loss_clip": 0.01229928, "auxiliary_loss_mlp": 0.01029648, "balance_loss_clip": 1.0649941, "balance_loss_mlp": 1.02086854, "epoch": 0.18409186556844825, "flos": 31941104976000.0, "grad_norm": 2.1764686778484577, "language_loss": 0.82584298, "learning_rate": 3.7561888070527514e-06, "loss": 0.8484388, "num_input_tokens_seen": 32506260, "step": 1531, "time_per_iteration": 2.7367424964904785 }, { "auxiliary_loss_clip": 0.0136011, "auxiliary_loss_mlp": 0.02578488, "balance_loss_clip": 1.05562639, "balance_loss_mlp": 0.99994671, "epoch": 0.18421210845908736, "flos": 20120533764480.0, "grad_norm": 2.6240605053419586, "language_loss": 0.80218089, "learning_rate": 3.7558159457161577e-06, "loss": 0.84156692, "num_input_tokens_seen": 32524225, "step": 1532, "time_per_iteration": 2.6696903705596924 }, { "auxiliary_loss_clip": 0.0132728, "auxiliary_loss_mlp": 0.02577833, "balance_loss_clip": 1.06068873, "balance_loss_mlp": 0.99995685, "epoch": 0.18433235134972645, "flos": 23110491824640.0, "grad_norm": 2.1945317712228833, "language_loss": 0.7777375, "learning_rate": 3.755442818023681e-06, "loss": 0.81678867, "num_input_tokens_seen": 32543850, "step": 1533, "time_per_iteration": 2.7071237564086914 }, { "auxiliary_loss_clip": 0.01373499, "auxiliary_loss_mlp": 0.01028619, "balance_loss_clip": 1.05788624, "balance_loss_mlp": 1.01941013, "epoch": 0.18445259424036553, "flos": 18291351617280.0, "grad_norm": 2.1350919472001957, "language_loss": 0.76207221, "learning_rate": 3.7550694240319246e-06, "loss": 0.78609335, "num_input_tokens_seen": 32561725, "step": 1534, "time_per_iteration": 2.7147843837738037 }, { "auxiliary_loss_clip": 0.01279674, "auxiliary_loss_mlp": 0.0102897, "balance_loss_clip": 1.06064558, "balance_loss_mlp": 1.01908135, "epoch": 0.18457283713100464, "flos": 21324079797120.0, "grad_norm": 11.83530599556708, "language_loss": 0.76325041, "learning_rate": 3.7546957637975326e-06, "loss": 0.78633684, "num_input_tokens_seen": 32579135, "step": 1535, "time_per_iteration": 2.6583712100982666 }, { "auxiliary_loss_clip": 0.01469449, "auxiliary_loss_mlp": 0.01033146, "balance_loss_clip": 1.0501281, "balance_loss_mlp": 1.02443135, "epoch": 0.18469308002164372, "flos": 20375679047040.0, "grad_norm": 2.1637520201808003, "language_loss": 0.74233985, "learning_rate": 3.7543218373771873e-06, "loss": 0.76736581, "num_input_tokens_seen": 32598460, "step": 1536, "time_per_iteration": 2.7301926612854004 }, { "auxiliary_loss_clip": 0.01472319, "auxiliary_loss_mlp": 0.02578391, "balance_loss_clip": 1.05297709, "balance_loss_mlp": 0.99994677, "epoch": 0.1848133229122828, "flos": 26435892021120.0, "grad_norm": 1.4098276241501269, "language_loss": 0.78143287, "learning_rate": 3.753947644827615e-06, "loss": 0.82194, "num_input_tokens_seen": 32621920, "step": 1537, "time_per_iteration": 2.875382423400879 }, { "auxiliary_loss_clip": 0.0122207, "auxiliary_loss_mlp": 0.01005798, "balance_loss_clip": 1.02246785, "balance_loss_mlp": 1.00403333, "epoch": 0.1849335658029219, "flos": 70547447612160.0, "grad_norm": 0.9067280771338199, "language_loss": 0.57149976, "learning_rate": 3.753573186205579e-06, "loss": 0.59377843, "num_input_tokens_seen": 32690040, "step": 1538, "time_per_iteration": 3.3460488319396973 }, { "auxiliary_loss_clip": 0.01318767, "auxiliary_loss_mlp": 0.02576949, "balance_loss_clip": 1.05703688, "balance_loss_mlp": 0.99989891, "epoch": 0.185053808693561, "flos": 17384140788480.0, "grad_norm": 2.8222685286886913, "language_loss": 0.78159559, "learning_rate": 3.753198461567885e-06, "loss": 0.82055271, "num_input_tokens_seen": 32707285, "step": 1539, "time_per_iteration": 2.641657829284668 }, { "auxiliary_loss_clip": 0.01366638, "auxiliary_loss_mlp": 0.01034207, "balance_loss_clip": 1.05907369, "balance_loss_mlp": 1.02586246, "epoch": 0.18517405158420008, "flos": 28986159697920.0, "grad_norm": 2.042039610549057, "language_loss": 0.92178935, "learning_rate": 3.7528234709713783e-06, "loss": 0.9457978, "num_input_tokens_seen": 32730030, "step": 1540, "time_per_iteration": 2.8332266807556152 }, { "auxiliary_loss_clip": 0.01281269, "auxiliary_loss_mlp": 0.01039842, "balance_loss_clip": 1.06507349, "balance_loss_mlp": 1.03153944, "epoch": 0.18529429447483917, "flos": 26794962328320.0, "grad_norm": 2.6237903879913365, "language_loss": 0.8452059, "learning_rate": 3.7524482144729447e-06, "loss": 0.86841702, "num_input_tokens_seen": 32749485, "step": 1541, "time_per_iteration": 2.673372507095337 }, { "auxiliary_loss_clip": 0.0137359, "auxiliary_loss_mlp": 0.01033125, "balance_loss_clip": 1.05581796, "balance_loss_mlp": 1.02433372, "epoch": 0.18541453736547828, "flos": 13581595301760.0, "grad_norm": 2.851785837179984, "language_loss": 0.84144133, "learning_rate": 3.7520726921295106e-06, "loss": 0.8655085, "num_input_tokens_seen": 32766205, "step": 1542, "time_per_iteration": 2.7095768451690674 }, { "auxiliary_loss_clip": 0.01273229, "auxiliary_loss_mlp": 0.01032754, "balance_loss_clip": 1.05966067, "balance_loss_mlp": 1.02327073, "epoch": 0.18553478025611736, "flos": 24025424077440.0, "grad_norm": 2.6985039941937567, "language_loss": 0.72387791, "learning_rate": 3.751696903998042e-06, "loss": 0.74693769, "num_input_tokens_seen": 32784840, "step": 1543, "time_per_iteration": 2.661353826522827 }, { "auxiliary_loss_clip": 0.01273794, "auxiliary_loss_mlp": 0.0103548, "balance_loss_clip": 1.06329775, "balance_loss_mlp": 1.02657521, "epoch": 0.18565502314675644, "flos": 25885165720320.0, "grad_norm": 1.8304767926306873, "language_loss": 0.69988662, "learning_rate": 3.7513208501355456e-06, "loss": 0.72297937, "num_input_tokens_seen": 32805945, "step": 1544, "time_per_iteration": 2.7166810035705566 }, { "auxiliary_loss_clip": 0.01320864, "auxiliary_loss_mlp": 0.01029782, "balance_loss_clip": 1.06008613, "balance_loss_mlp": 1.02145529, "epoch": 0.18577526603739553, "flos": 19610063631360.0, "grad_norm": 1.9804650390671747, "language_loss": 0.83614463, "learning_rate": 3.750944530599069e-06, "loss": 0.85965109, "num_input_tokens_seen": 32825515, "step": 1545, "time_per_iteration": 2.6373379230499268 }, { "auxiliary_loss_clip": 0.01285243, "auxiliary_loss_mlp": 0.0103491, "balance_loss_clip": 1.06614494, "balance_loss_mlp": 1.02515316, "epoch": 0.18589550892803464, "flos": 18474891137280.0, "grad_norm": 2.793967263722739, "language_loss": 0.80846059, "learning_rate": 3.7505679454456992e-06, "loss": 0.83166206, "num_input_tokens_seen": 32842125, "step": 1546, "time_per_iteration": 2.6117494106292725 }, { "auxiliary_loss_clip": 0.01443704, "auxiliary_loss_mlp": 0.0103668, "balance_loss_clip": 1.05179381, "balance_loss_mlp": 1.02766824, "epoch": 0.18601575181867372, "flos": 23549966726400.0, "grad_norm": 1.8699654799144776, "language_loss": 0.70292699, "learning_rate": 3.750191094732564e-06, "loss": 0.72773081, "num_input_tokens_seen": 32862990, "step": 1547, "time_per_iteration": 2.9491324424743652 }, { "auxiliary_loss_clip": 0.01447481, "auxiliary_loss_mlp": 0.02578792, "balance_loss_clip": 1.05027676, "balance_loss_mlp": 0.9999249, "epoch": 0.1861359947093128, "flos": 26360192108160.0, "grad_norm": 1.7879162356467515, "language_loss": 0.75239527, "learning_rate": 3.7498139785168313e-06, "loss": 0.79265803, "num_input_tokens_seen": 32883595, "step": 1548, "time_per_iteration": 4.036811828613281 }, { "auxiliary_loss_clip": 0.01274634, "auxiliary_loss_mlp": 0.01034899, "balance_loss_clip": 1.06395555, "balance_loss_mlp": 1.02578533, "epoch": 0.1862562375999519, "flos": 23331198942720.0, "grad_norm": 2.008091668910084, "language_loss": 0.76872468, "learning_rate": 3.749436596855709e-06, "loss": 0.79182005, "num_input_tokens_seen": 32902895, "step": 1549, "time_per_iteration": 3.0681862831115723 }, { "auxiliary_loss_clip": 0.01276631, "auxiliary_loss_mlp": 0.01034499, "balance_loss_clip": 1.0627265, "balance_loss_mlp": 1.0253737, "epoch": 0.186376480490591, "flos": 16648222942080.0, "grad_norm": 2.264092314501058, "language_loss": 0.9042663, "learning_rate": 3.749058949806446e-06, "loss": 0.92737758, "num_input_tokens_seen": 32919620, "step": 1550, "time_per_iteration": 2.6655731201171875 }, { "auxiliary_loss_clip": 0.01277722, "auxiliary_loss_mlp": 0.01030356, "balance_loss_clip": 1.06226456, "balance_loss_mlp": 1.02103996, "epoch": 0.18649672338123008, "flos": 21468656039040.0, "grad_norm": 1.8719135861937704, "language_loss": 0.84353256, "learning_rate": 3.748681037426331e-06, "loss": 0.86661333, "num_input_tokens_seen": 32938830, "step": 1551, "time_per_iteration": 2.6278204917907715 }, { "auxiliary_loss_clip": 0.01228268, "auxiliary_loss_mlp": 0.01031413, "balance_loss_clip": 1.06573129, "balance_loss_mlp": 1.02174485, "epoch": 0.1866169662718692, "flos": 12312728386560.0, "grad_norm": 2.1626783701733974, "language_loss": 0.91755849, "learning_rate": 3.7483028597726936e-06, "loss": 0.94015533, "num_input_tokens_seen": 32955600, "step": 1552, "time_per_iteration": 3.497960329055786 }, { "auxiliary_loss_clip": 0.01380861, "auxiliary_loss_mlp": 0.01040657, "balance_loss_clip": 1.06593573, "balance_loss_mlp": 1.03062522, "epoch": 0.18673720916250827, "flos": 23581280407680.0, "grad_norm": 2.4835270216512053, "language_loss": 0.62320334, "learning_rate": 3.7479244169029017e-06, "loss": 0.6474185, "num_input_tokens_seen": 32975390, "step": 1553, "time_per_iteration": 3.617387533187866 }, { "auxiliary_loss_clip": 0.01277126, "auxiliary_loss_mlp": 0.01030487, "balance_loss_clip": 1.05933881, "balance_loss_mlp": 1.02196395, "epoch": 0.18685745205314735, "flos": 19718370115200.0, "grad_norm": 2.5170237714154116, "language_loss": 0.73421705, "learning_rate": 3.7475457088743658e-06, "loss": 0.75729316, "num_input_tokens_seen": 32992640, "step": 1554, "time_per_iteration": 2.6822307109832764 }, { "auxiliary_loss_clip": 0.01325739, "auxiliary_loss_mlp": 0.01034925, "balance_loss_clip": 1.06418347, "balance_loss_mlp": 1.02548397, "epoch": 0.18697769494378644, "flos": 34204123589760.0, "grad_norm": 1.8731931201011762, "language_loss": 0.74698305, "learning_rate": 3.7471667357445348e-06, "loss": 0.77058971, "num_input_tokens_seen": 33012470, "step": 1555, "time_per_iteration": 2.8088340759277344 }, { "auxiliary_loss_clip": 0.01483212, "auxiliary_loss_mlp": 0.01036432, "balance_loss_clip": 1.06145239, "balance_loss_mlp": 1.02636504, "epoch": 0.18709793783442555, "flos": 34241327101440.0, "grad_norm": 1.7741834837185226, "language_loss": 0.72532117, "learning_rate": 3.7467874975709e-06, "loss": 0.75051767, "num_input_tokens_seen": 33033275, "step": 1556, "time_per_iteration": 4.111306428909302 }, { "auxiliary_loss_clip": 0.01289295, "auxiliary_loss_mlp": 0.01034544, "balance_loss_clip": 1.07083786, "balance_loss_mlp": 1.02473295, "epoch": 0.18721818072506463, "flos": 40734550529280.0, "grad_norm": 4.185189063590591, "language_loss": 0.78387523, "learning_rate": 3.7464079944109904e-06, "loss": 0.80711365, "num_input_tokens_seen": 33055135, "step": 1557, "time_per_iteration": 2.927212953567505 }, { "auxiliary_loss_clip": 0.01311181, "auxiliary_loss_mlp": 0.01039043, "balance_loss_clip": 1.06198168, "balance_loss_mlp": 1.02992952, "epoch": 0.18733842361570371, "flos": 22157386392960.0, "grad_norm": 1.929568055163094, "language_loss": 0.77803081, "learning_rate": 3.746028226322376e-06, "loss": 0.80153304, "num_input_tokens_seen": 33071015, "step": 1558, "time_per_iteration": 2.7257416248321533 }, { "auxiliary_loss_clip": 0.01326512, "auxiliary_loss_mlp": 0.01038565, "balance_loss_clip": 1.06436563, "balance_loss_mlp": 1.0296067, "epoch": 0.18745866650634282, "flos": 18914940656640.0, "grad_norm": 1.9280740904146816, "language_loss": 0.75701046, "learning_rate": 3.745648193362669e-06, "loss": 0.78066123, "num_input_tokens_seen": 33090370, "step": 1559, "time_per_iteration": 2.6474809646606445 }, { "auxiliary_loss_clip": 0.01323091, "auxiliary_loss_mlp": 0.01036826, "balance_loss_clip": 1.06116152, "balance_loss_mlp": 1.02811801, "epoch": 0.1875789093969819, "flos": 19314626267520.0, "grad_norm": 2.6860096544240983, "language_loss": 0.72458071, "learning_rate": 3.745267895589518e-06, "loss": 0.74817991, "num_input_tokens_seen": 33108910, "step": 1560, "time_per_iteration": 2.7224202156066895 }, { "auxiliary_loss_clip": 0.01331074, "auxiliary_loss_mlp": 0.01034607, "balance_loss_clip": 1.06516814, "balance_loss_mlp": 1.02506447, "epoch": 0.187699152287621, "flos": 17018965169280.0, "grad_norm": 2.6479964561143676, "language_loss": 0.81782252, "learning_rate": 3.7448873330606154e-06, "loss": 0.8414793, "num_input_tokens_seen": 33126680, "step": 1561, "time_per_iteration": 2.6960654258728027 }, { "auxiliary_loss_clip": 0.01372802, "auxiliary_loss_mlp": 0.01033651, "balance_loss_clip": 1.05973029, "balance_loss_mlp": 1.02432847, "epoch": 0.18781939517826007, "flos": 22346384780160.0, "grad_norm": 2.028076361895609, "language_loss": 0.87238455, "learning_rate": 3.7445065058336914e-06, "loss": 0.89644909, "num_input_tokens_seen": 33145550, "step": 1562, "time_per_iteration": 2.7259116172790527 }, { "auxiliary_loss_clip": 0.01422023, "auxiliary_loss_mlp": 0.01033386, "balance_loss_clip": 1.0561223, "balance_loss_mlp": 1.02426612, "epoch": 0.18793963806889918, "flos": 14611478054400.0, "grad_norm": 1.8881918190024216, "language_loss": 0.86516607, "learning_rate": 3.7441254139665176e-06, "loss": 0.8897202, "num_input_tokens_seen": 33161735, "step": 1563, "time_per_iteration": 2.730700731277466 }, { "auxiliary_loss_clip": 0.01232062, "auxiliary_loss_mlp": 0.01033435, "balance_loss_clip": 1.06983304, "balance_loss_mlp": 1.02498877, "epoch": 0.18805988095953827, "flos": 17457075354240.0, "grad_norm": 2.7528215602557604, "language_loss": 0.82633573, "learning_rate": 3.743744057516905e-06, "loss": 0.84899068, "num_input_tokens_seen": 33179795, "step": 1564, "time_per_iteration": 2.6147055625915527 }, { "auxiliary_loss_clip": 0.01427303, "auxiliary_loss_mlp": 0.01037238, "balance_loss_clip": 1.05951345, "balance_loss_mlp": 1.02748036, "epoch": 0.18818012385017735, "flos": 15043877976960.0, "grad_norm": 4.024404710843535, "language_loss": 0.87740254, "learning_rate": 3.743362436542706e-06, "loss": 0.90204787, "num_input_tokens_seen": 33194485, "step": 1565, "time_per_iteration": 2.6966302394866943 }, { "auxiliary_loss_clip": 0.01229658, "auxiliary_loss_mlp": 0.01026655, "balance_loss_clip": 1.06744361, "balance_loss_mlp": 1.01736259, "epoch": 0.18830036674081646, "flos": 47551975136640.0, "grad_norm": 2.296866298136529, "language_loss": 0.76854807, "learning_rate": 3.7429805511018115e-06, "loss": 0.79111123, "num_input_tokens_seen": 33216145, "step": 1566, "time_per_iteration": 2.8523616790771484 }, { "auxiliary_loss_clip": 0.01373912, "auxiliary_loss_mlp": 0.02580092, "balance_loss_clip": 1.0603056, "balance_loss_mlp": 0.99986959, "epoch": 0.18842060963145554, "flos": 30044626698240.0, "grad_norm": 2.2875972480383595, "language_loss": 0.7796433, "learning_rate": 3.7425984012521524e-06, "loss": 0.81918329, "num_input_tokens_seen": 33236345, "step": 1567, "time_per_iteration": 2.77474308013916 }, { "auxiliary_loss_clip": 0.01276623, "auxiliary_loss_mlp": 0.02529073, "balance_loss_clip": 1.03056574, "balance_loss_mlp": 0.99972039, "epoch": 0.18854085252209463, "flos": 70318372625280.0, "grad_norm": 0.7634359496930964, "language_loss": 0.60425007, "learning_rate": 3.7422159870517025e-06, "loss": 0.64230704, "num_input_tokens_seen": 33301600, "step": 1568, "time_per_iteration": 3.4091663360595703 }, { "auxiliary_loss_clip": 0.0132778, "auxiliary_loss_mlp": 0.01030303, "balance_loss_clip": 1.06297874, "balance_loss_mlp": 1.02161908, "epoch": 0.1886610954127337, "flos": 21289318410240.0, "grad_norm": 1.5446992642519313, "language_loss": 0.78944993, "learning_rate": 3.7418333085584717e-06, "loss": 0.81303084, "num_input_tokens_seen": 33322785, "step": 1569, "time_per_iteration": 2.845991611480713 }, { "auxiliary_loss_clip": 0.01374423, "auxiliary_loss_mlp": 0.0103068, "balance_loss_clip": 1.06226826, "balance_loss_mlp": 1.02194154, "epoch": 0.18878133830337282, "flos": 17266819991040.0, "grad_norm": 2.7182896402171663, "language_loss": 0.90426016, "learning_rate": 3.7414503658305128e-06, "loss": 0.92831123, "num_input_tokens_seen": 33340020, "step": 1570, "time_per_iteration": 2.695324182510376 }, { "auxiliary_loss_clip": 0.01429116, "auxiliary_loss_mlp": 0.01029436, "balance_loss_clip": 1.05713606, "balance_loss_mlp": 1.02033472, "epoch": 0.1889015811940119, "flos": 25775207210880.0, "grad_norm": 2.4301028557683844, "language_loss": 0.77743316, "learning_rate": 3.7410671589259185e-06, "loss": 0.80201864, "num_input_tokens_seen": 33358620, "step": 1571, "time_per_iteration": 2.7833962440490723 }, { "auxiliary_loss_clip": 0.01231503, "auxiliary_loss_mlp": 0.01033017, "balance_loss_clip": 1.06830454, "balance_loss_mlp": 1.02376604, "epoch": 0.18902182408465099, "flos": 21032197879680.0, "grad_norm": 2.3977315170668527, "language_loss": 0.79730392, "learning_rate": 3.7406836879028205e-06, "loss": 0.81994915, "num_input_tokens_seen": 33378845, "step": 1572, "time_per_iteration": 2.5999042987823486 }, { "auxiliary_loss_clip": 0.01272948, "auxiliary_loss_mlp": 0.01033986, "balance_loss_clip": 1.0637815, "balance_loss_mlp": 1.02555788, "epoch": 0.1891420669752901, "flos": 22272121411200.0, "grad_norm": 6.638150757592697, "language_loss": 0.7694236, "learning_rate": 3.7402999528193907e-06, "loss": 0.79249287, "num_input_tokens_seen": 33398345, "step": 1573, "time_per_iteration": 2.694312334060669 }, { "auxiliary_loss_clip": 0.01365883, "auxiliary_loss_mlp": 0.02577262, "balance_loss_clip": 1.06030464, "balance_loss_mlp": 0.99991941, "epoch": 0.18926230986592918, "flos": 22017802141440.0, "grad_norm": 2.4635209118923482, "language_loss": 0.85688537, "learning_rate": 3.739915953733842e-06, "loss": 0.89631683, "num_input_tokens_seen": 33416390, "step": 1574, "time_per_iteration": 3.7370975017547607 }, { "auxiliary_loss_clip": 0.01225516, "auxiliary_loss_mlp": 0.01035934, "balance_loss_clip": 1.06477952, "balance_loss_mlp": 1.02666593, "epoch": 0.18938255275656826, "flos": 24462672336000.0, "grad_norm": 1.7157930581080236, "language_loss": 0.82129699, "learning_rate": 3.7395316907044264e-06, "loss": 0.84391147, "num_input_tokens_seen": 33437175, "step": 1575, "time_per_iteration": 2.68109393119812 }, { "auxiliary_loss_clip": 0.01275929, "auxiliary_loss_mlp": 0.01034341, "balance_loss_clip": 1.06317997, "balance_loss_mlp": 1.02525711, "epoch": 0.18950279564720737, "flos": 24427049022720.0, "grad_norm": 1.8989705432895705, "language_loss": 0.80085397, "learning_rate": 3.7391471637894364e-06, "loss": 0.82395667, "num_input_tokens_seen": 33459440, "step": 1576, "time_per_iteration": 2.725973129272461 }, { "auxiliary_loss_clip": 0.01375919, "auxiliary_loss_mlp": 0.01031865, "balance_loss_clip": 1.05624294, "balance_loss_mlp": 1.02338934, "epoch": 0.18962303853784646, "flos": 19756291898880.0, "grad_norm": 4.6811578748429765, "language_loss": 0.85144961, "learning_rate": 3.738762373047205e-06, "loss": 0.87552738, "num_input_tokens_seen": 33479360, "step": 1577, "time_per_iteration": 2.6754956245422363 }, { "auxiliary_loss_clip": 0.01374837, "auxiliary_loss_mlp": 0.01030239, "balance_loss_clip": 1.05881, "balance_loss_mlp": 1.02120245, "epoch": 0.18974328142848554, "flos": 21032054225280.0, "grad_norm": 2.0678455594463143, "language_loss": 0.83273494, "learning_rate": 3.738377318536103e-06, "loss": 0.85678566, "num_input_tokens_seen": 33499245, "step": 1578, "time_per_iteration": 3.641477108001709 }, { "auxiliary_loss_clip": 0.01220775, "auxiliary_loss_mlp": 0.01028772, "balance_loss_clip": 1.0635829, "balance_loss_mlp": 1.02039766, "epoch": 0.18986352431912462, "flos": 12966122736000.0, "grad_norm": 2.17665743176533, "language_loss": 0.70919532, "learning_rate": 3.7379920003145447e-06, "loss": 0.73169082, "num_input_tokens_seen": 33513520, "step": 1579, "time_per_iteration": 3.533215045928955 }, { "auxiliary_loss_clip": 0.01322061, "auxiliary_loss_mlp": 0.01040614, "balance_loss_clip": 1.06325781, "balance_loss_mlp": 1.03133333, "epoch": 0.18998376720976373, "flos": 23767908497280.0, "grad_norm": 1.7974393843664607, "language_loss": 0.83723629, "learning_rate": 3.7376064184409817e-06, "loss": 0.86086309, "num_input_tokens_seen": 33533100, "step": 1580, "time_per_iteration": 2.6971888542175293 }, { "auxiliary_loss_clip": 0.01322602, "auxiliary_loss_mlp": 0.01028458, "balance_loss_clip": 1.06209135, "balance_loss_mlp": 1.01984477, "epoch": 0.19010401010040281, "flos": 22966023323520.0, "grad_norm": 1.4550947657259516, "language_loss": 0.86998487, "learning_rate": 3.7372205729739063e-06, "loss": 0.89349544, "num_input_tokens_seen": 33554915, "step": 1581, "time_per_iteration": 2.718827724456787 }, { "auxiliary_loss_clip": 0.01277619, "auxiliary_loss_mlp": 0.01038229, "balance_loss_clip": 1.06349051, "balance_loss_mlp": 1.02885306, "epoch": 0.1902242529910419, "flos": 19135647774720.0, "grad_norm": 1.916926591146347, "language_loss": 0.71600741, "learning_rate": 3.7368344639718514e-06, "loss": 0.7391659, "num_input_tokens_seen": 33572850, "step": 1582, "time_per_iteration": 3.58052396774292 }, { "auxiliary_loss_clip": 0.01275669, "auxiliary_loss_mlp": 0.01031884, "balance_loss_clip": 1.06131983, "balance_loss_mlp": 1.02306294, "epoch": 0.190344495881681, "flos": 25483935824640.0, "grad_norm": 1.7540758258451201, "language_loss": 0.80553615, "learning_rate": 3.7364480914933895e-06, "loss": 0.82861173, "num_input_tokens_seen": 33593090, "step": 1583, "time_per_iteration": 2.7210943698883057 }, { "auxiliary_loss_clip": 0.01417488, "auxiliary_loss_mlp": 0.02576758, "balance_loss_clip": 1.05579543, "balance_loss_mlp": 0.99984217, "epoch": 0.1904647387723201, "flos": 26792843425920.0, "grad_norm": 2.0976335900764274, "language_loss": 0.80683792, "learning_rate": 3.7360614555971325e-06, "loss": 0.84678042, "num_input_tokens_seen": 33612745, "step": 1584, "time_per_iteration": 2.79917049407959 }, { "auxiliary_loss_clip": 0.01274418, "auxiliary_loss_mlp": 0.02574393, "balance_loss_clip": 1.06484199, "balance_loss_mlp": 0.99984276, "epoch": 0.19058498166295917, "flos": 23987753688960.0, "grad_norm": 2.069059938470754, "language_loss": 0.85134065, "learning_rate": 3.735674556341733e-06, "loss": 0.88982874, "num_input_tokens_seen": 33632360, "step": 1585, "time_per_iteration": 2.798358917236328 }, { "auxiliary_loss_clip": 0.01320847, "auxiliary_loss_mlp": 0.01037913, "balance_loss_clip": 1.06157863, "balance_loss_mlp": 1.02907956, "epoch": 0.19070522455359826, "flos": 28293299280000.0, "grad_norm": 2.0077150732941993, "language_loss": 0.82935834, "learning_rate": 3.7352873937858835e-06, "loss": 0.85294592, "num_input_tokens_seen": 33653895, "step": 1586, "time_per_iteration": 2.7389304637908936 }, { "auxiliary_loss_clip": 0.01370965, "auxiliary_loss_mlp": 0.02577213, "balance_loss_clip": 1.06114793, "balance_loss_mlp": 0.9998225, "epoch": 0.19082546744423737, "flos": 25660220797440.0, "grad_norm": 2.0879217892397812, "language_loss": 0.71900427, "learning_rate": 3.734899967988316e-06, "loss": 0.75848603, "num_input_tokens_seen": 33672075, "step": 1587, "time_per_iteration": 2.7403852939605713 }, { "auxiliary_loss_clip": 0.01366832, "auxiliary_loss_mlp": 0.01035594, "balance_loss_clip": 1.0576272, "balance_loss_mlp": 1.02707028, "epoch": 0.19094571033487645, "flos": 19719483436800.0, "grad_norm": 2.0060733557821995, "language_loss": 0.84025788, "learning_rate": 3.7345122790078026e-06, "loss": 0.86428213, "num_input_tokens_seen": 33689640, "step": 1588, "time_per_iteration": 2.734964609146118 }, { "auxiliary_loss_clip": 0.01273735, "auxiliary_loss_mlp": 0.01040315, "balance_loss_clip": 1.06355274, "balance_loss_mlp": 1.03087986, "epoch": 0.19106595322551553, "flos": 21616320850560.0, "grad_norm": 4.807057013860972, "language_loss": 0.92631257, "learning_rate": 3.7341243269031556e-06, "loss": 0.94945312, "num_input_tokens_seen": 33708630, "step": 1589, "time_per_iteration": 2.6260786056518555 }, { "auxiliary_loss_clip": 0.0131939, "auxiliary_loss_mlp": 0.01033097, "balance_loss_clip": 1.06278276, "balance_loss_mlp": 1.02510405, "epoch": 0.19118619611615464, "flos": 29896890059520.0, "grad_norm": 1.5946069264220581, "language_loss": 0.77520257, "learning_rate": 3.7337361117332275e-06, "loss": 0.79872739, "num_input_tokens_seen": 33730370, "step": 1590, "time_per_iteration": 2.7638638019561768 }, { "auxiliary_loss_clip": 0.01379286, "auxiliary_loss_mlp": 0.01035832, "balance_loss_clip": 1.06074703, "balance_loss_mlp": 1.0273149, "epoch": 0.19130643900679373, "flos": 17273428093440.0, "grad_norm": 5.559809598946598, "language_loss": 0.77221191, "learning_rate": 3.7333476335569087e-06, "loss": 0.79636306, "num_input_tokens_seen": 33748370, "step": 1591, "time_per_iteration": 2.686599016189575 }, { "auxiliary_loss_clip": 0.01323911, "auxiliary_loss_mlp": 0.01028741, "balance_loss_clip": 1.06160259, "balance_loss_mlp": 1.01994348, "epoch": 0.1914266818974328, "flos": 24826339584000.0, "grad_norm": 6.269812036376255, "language_loss": 0.67183095, "learning_rate": 3.7329588924331325e-06, "loss": 0.69535744, "num_input_tokens_seen": 33769575, "step": 1592, "time_per_iteration": 2.6736905574798584 }, { "auxiliary_loss_clip": 0.01368752, "auxiliary_loss_mlp": 0.010328, "balance_loss_clip": 1.0564301, "balance_loss_mlp": 1.02357888, "epoch": 0.1915469247880719, "flos": 18952467390720.0, "grad_norm": 2.704053204865874, "language_loss": 0.82586849, "learning_rate": 3.732569888420871e-06, "loss": 0.84988403, "num_input_tokens_seen": 33789110, "step": 1593, "time_per_iteration": 2.698427200317383 }, { "auxiliary_loss_clip": 0.01228268, "auxiliary_loss_mlp": 0.01033261, "balance_loss_clip": 1.0644908, "balance_loss_mlp": 1.02399242, "epoch": 0.191667167678711, "flos": 21032952065280.0, "grad_norm": 4.257591176240382, "language_loss": 0.82573867, "learning_rate": 3.732180621579134e-06, "loss": 0.84835398, "num_input_tokens_seen": 33808325, "step": 1594, "time_per_iteration": 2.624579668045044 }, { "auxiliary_loss_clip": 0.01390604, "auxiliary_loss_mlp": 0.01042865, "balance_loss_clip": 1.06684625, "balance_loss_mlp": 1.03349495, "epoch": 0.1917874105693501, "flos": 34237663914240.0, "grad_norm": 2.2733804876581423, "language_loss": 0.81549966, "learning_rate": 3.7317910919669745e-06, "loss": 0.83983439, "num_input_tokens_seen": 33829520, "step": 1595, "time_per_iteration": 2.830038547515869 }, { "auxiliary_loss_clip": 0.0127079, "auxiliary_loss_mlp": 0.01035446, "balance_loss_clip": 1.06108594, "balance_loss_mlp": 1.02562916, "epoch": 0.19190765345998917, "flos": 23550613171200.0, "grad_norm": 2.3499102044321534, "language_loss": 0.76747012, "learning_rate": 3.7314012996434826e-06, "loss": 0.79053247, "num_input_tokens_seen": 33848250, "step": 1596, "time_per_iteration": 2.6713778972625732 }, { "auxiliary_loss_clip": 0.01245501, "auxiliary_loss_mlp": 0.01035543, "balance_loss_clip": 1.05698276, "balance_loss_mlp": 1.02710867, "epoch": 0.19202789635062828, "flos": 19861330245120.0, "grad_norm": 2.0370847736743967, "language_loss": 0.81233031, "learning_rate": 3.7310112446677907e-06, "loss": 0.83514071, "num_input_tokens_seen": 33866160, "step": 1597, "time_per_iteration": 2.6779978275299072 }, { "auxiliary_loss_clip": 0.01230201, "auxiliary_loss_mlp": 0.01034018, "balance_loss_clip": 1.06773126, "balance_loss_mlp": 1.02460647, "epoch": 0.19214813924126736, "flos": 20922957642240.0, "grad_norm": 2.199490494354691, "language_loss": 0.69151396, "learning_rate": 3.7306209270990695e-06, "loss": 0.71415615, "num_input_tokens_seen": 33884165, "step": 1598, "time_per_iteration": 2.574169874191284 }, { "auxiliary_loss_clip": 0.0132718, "auxiliary_loss_mlp": 0.01038498, "balance_loss_clip": 1.06380296, "balance_loss_mlp": 1.02983153, "epoch": 0.19226838213190645, "flos": 26359725231360.0, "grad_norm": 1.794130281661613, "language_loss": 0.8664614, "learning_rate": 3.7302303469965292e-06, "loss": 0.89011812, "num_input_tokens_seen": 33903705, "step": 1599, "time_per_iteration": 2.7408499717712402 }, { "auxiliary_loss_clip": 0.01278515, "auxiliary_loss_mlp": 0.01037513, "balance_loss_clip": 1.06643903, "balance_loss_mlp": 1.02807152, "epoch": 0.19238862502254553, "flos": 20850525866880.0, "grad_norm": 1.9654174354860336, "language_loss": 0.7055974, "learning_rate": 3.7298395044194206e-06, "loss": 0.72875774, "num_input_tokens_seen": 33922515, "step": 1600, "time_per_iteration": 3.581028461456299 }, { "auxiliary_loss_clip": 0.01229848, "auxiliary_loss_mlp": 0.01035222, "balance_loss_clip": 1.06911755, "balance_loss_mlp": 1.0268594, "epoch": 0.19250886791318464, "flos": 21726063878400.0, "grad_norm": 1.9923742829773876, "language_loss": 0.94494545, "learning_rate": 3.7294483994270356e-06, "loss": 0.96759617, "num_input_tokens_seen": 33940840, "step": 1601, "time_per_iteration": 2.599031448364258 }, { "auxiliary_loss_clip": 0.01416569, "auxiliary_loss_mlp": 0.01036626, "balance_loss_clip": 1.05908775, "balance_loss_mlp": 1.02723217, "epoch": 0.19262911080382372, "flos": 23367827836800.0, "grad_norm": 2.0765211928812137, "language_loss": 0.78458393, "learning_rate": 3.7290570320787033e-06, "loss": 0.80911589, "num_input_tokens_seen": 33960420, "step": 1602, "time_per_iteration": 2.7741239070892334 }, { "auxiliary_loss_clip": 0.01282915, "auxiliary_loss_mlp": 0.01037471, "balance_loss_clip": 1.06945622, "balance_loss_mlp": 1.02817893, "epoch": 0.1927493536944628, "flos": 21943502858880.0, "grad_norm": 2.249383112238618, "language_loss": 0.71324074, "learning_rate": 3.728665402433793e-06, "loss": 0.73644465, "num_input_tokens_seen": 33978990, "step": 1603, "time_per_iteration": 2.6608128547668457 }, { "auxiliary_loss_clip": 0.01330562, "auxiliary_loss_mlp": 0.01033054, "balance_loss_clip": 1.06753898, "balance_loss_mlp": 1.02451897, "epoch": 0.19286959658510192, "flos": 16545590807040.0, "grad_norm": 2.740697480506498, "language_loss": 0.86061269, "learning_rate": 3.7282735105517164e-06, "loss": 0.88424885, "num_input_tokens_seen": 33997115, "step": 1604, "time_per_iteration": 3.606609344482422 }, { "auxiliary_loss_clip": 0.01430999, "auxiliary_loss_mlp": 0.01035888, "balance_loss_clip": 1.05964351, "balance_loss_mlp": 1.02656007, "epoch": 0.192989839475741, "flos": 21616967295360.0, "grad_norm": 2.136332928491254, "language_loss": 0.67508149, "learning_rate": 3.727881356491922e-06, "loss": 0.6997503, "num_input_tokens_seen": 34015525, "step": 1605, "time_per_iteration": 3.616151809692383 }, { "auxiliary_loss_clip": 0.01230506, "auxiliary_loss_mlp": 0.01035701, "balance_loss_clip": 1.07175064, "balance_loss_mlp": 1.02735639, "epoch": 0.19311008236638008, "flos": 19281516906240.0, "grad_norm": 1.984762290856617, "language_loss": 0.75520182, "learning_rate": 3.7274889403139002e-06, "loss": 0.77786386, "num_input_tokens_seen": 34033150, "step": 1606, "time_per_iteration": 2.615187883377075 }, { "auxiliary_loss_clip": 0.01414748, "auxiliary_loss_mlp": 0.01031529, "balance_loss_clip": 1.06115532, "balance_loss_mlp": 1.02323806, "epoch": 0.1932303252570192, "flos": 28652369587200.0, "grad_norm": 2.8677020948390517, "language_loss": 0.78804463, "learning_rate": 3.727096262077179e-06, "loss": 0.81250739, "num_input_tokens_seen": 34052145, "step": 1607, "time_per_iteration": 3.7641241550445557 }, { "auxiliary_loss_clip": 0.01276723, "auxiliary_loss_mlp": 0.01030042, "balance_loss_clip": 1.06651807, "balance_loss_mlp": 1.0208981, "epoch": 0.19335056814765827, "flos": 18368990864640.0, "grad_norm": 2.010241591249997, "language_loss": 0.85330868, "learning_rate": 3.7267033218413285e-06, "loss": 0.87637627, "num_input_tokens_seen": 34069940, "step": 1608, "time_per_iteration": 2.5748157501220703 }, { "auxiliary_loss_clip": 0.01475616, "auxiliary_loss_mlp": 0.01034079, "balance_loss_clip": 1.0514549, "balance_loss_mlp": 1.02539456, "epoch": 0.19347081103829736, "flos": 13260877741440.0, "grad_norm": 2.4044617244439808, "language_loss": 0.80971146, "learning_rate": 3.726310119665957e-06, "loss": 0.83480841, "num_input_tokens_seen": 34086275, "step": 1609, "time_per_iteration": 2.7809226512908936 }, { "auxiliary_loss_clip": 0.01274572, "auxiliary_loss_mlp": 0.01031666, "balance_loss_clip": 1.06615829, "balance_loss_mlp": 1.02294564, "epoch": 0.19359105392893644, "flos": 20300122788480.0, "grad_norm": 1.766424966613472, "language_loss": 0.85799384, "learning_rate": 3.725916655610713e-06, "loss": 0.88105619, "num_input_tokens_seen": 34105605, "step": 1610, "time_per_iteration": 2.670405387878418 }, { "auxiliary_loss_clip": 0.01320499, "auxiliary_loss_mlp": 0.01035335, "balance_loss_clip": 1.06014431, "balance_loss_mlp": 1.02659059, "epoch": 0.19371129681957555, "flos": 20484596062080.0, "grad_norm": 2.890815172117339, "language_loss": 0.75856501, "learning_rate": 3.725522929735284e-06, "loss": 0.78212333, "num_input_tokens_seen": 34122540, "step": 1611, "time_per_iteration": 2.6856460571289062 }, { "auxiliary_loss_clip": 0.01331041, "auxiliary_loss_mlp": 0.01030726, "balance_loss_clip": 1.06009614, "balance_loss_mlp": 1.02177978, "epoch": 0.19383153971021463, "flos": 30445497457920.0, "grad_norm": 1.9622279938092642, "language_loss": 0.74600101, "learning_rate": 3.725128942099399e-06, "loss": 0.76961869, "num_input_tokens_seen": 34142940, "step": 1612, "time_per_iteration": 2.7510533332824707 }, { "auxiliary_loss_clip": 0.01321311, "auxiliary_loss_mlp": 0.01035563, "balance_loss_clip": 1.06112444, "balance_loss_mlp": 1.02689695, "epoch": 0.19395178260085372, "flos": 24569937325440.0, "grad_norm": 1.6575902897539643, "language_loss": 0.79901987, "learning_rate": 3.7247346927628245e-06, "loss": 0.82258856, "num_input_tokens_seen": 34162875, "step": 1613, "time_per_iteration": 2.7569332122802734 }, { "auxiliary_loss_clip": 0.01323743, "auxiliary_loss_mlp": 0.02578953, "balance_loss_clip": 1.06364202, "balance_loss_mlp": 0.99982178, "epoch": 0.19407202549149283, "flos": 28950608211840.0, "grad_norm": 7.808048207736728, "language_loss": 0.78962046, "learning_rate": 3.7243401817853694e-06, "loss": 0.82864743, "num_input_tokens_seen": 34183565, "step": 1614, "time_per_iteration": 2.7305638790130615 }, { "auxiliary_loss_clip": 0.01272251, "auxiliary_loss_mlp": 0.01035937, "balance_loss_clip": 1.06453562, "balance_loss_mlp": 1.02728796, "epoch": 0.1941922683821319, "flos": 18004497603840.0, "grad_norm": 2.708856297993703, "language_loss": 0.72124892, "learning_rate": 3.723945409226879e-06, "loss": 0.74433082, "num_input_tokens_seen": 34202055, "step": 1615, "time_per_iteration": 2.7009339332580566 }, { "auxiliary_loss_clip": 0.01278101, "auxiliary_loss_mlp": 0.01028321, "balance_loss_clip": 1.06508887, "balance_loss_mlp": 1.0190047, "epoch": 0.194312511272771, "flos": 9720337034880.0, "grad_norm": 3.2855454468495067, "language_loss": 0.79849112, "learning_rate": 3.723550375147241e-06, "loss": 0.82155532, "num_input_tokens_seen": 34216830, "step": 1616, "time_per_iteration": 2.688101053237915 }, { "auxiliary_loss_clip": 0.01365717, "auxiliary_loss_mlp": 0.01034614, "balance_loss_clip": 1.05835581, "balance_loss_mlp": 1.0247252, "epoch": 0.19443275416341008, "flos": 27016208150400.0, "grad_norm": 1.815903514794724, "language_loss": 0.80272162, "learning_rate": 3.7231550796063816e-06, "loss": 0.82672489, "num_input_tokens_seen": 34236840, "step": 1617, "time_per_iteration": 2.7260186672210693 }, { "auxiliary_loss_clip": 0.01334483, "auxiliary_loss_mlp": 0.01035505, "balance_loss_clip": 1.06733775, "balance_loss_mlp": 1.02643895, "epoch": 0.1945529970540492, "flos": 15846625077120.0, "grad_norm": 2.6935295279768154, "language_loss": 0.65229195, "learning_rate": 3.722759522664266e-06, "loss": 0.67599177, "num_input_tokens_seen": 34254140, "step": 1618, "time_per_iteration": 2.6493189334869385 }, { "auxiliary_loss_clip": 0.01427362, "auxiliary_loss_mlp": 0.01032593, "balance_loss_clip": 1.05990553, "balance_loss_mlp": 1.02409327, "epoch": 0.19467323994468827, "flos": 19314985403520.0, "grad_norm": 2.960070663531769, "language_loss": 0.82022285, "learning_rate": 3.7223637043809016e-06, "loss": 0.84482241, "num_input_tokens_seen": 34273120, "step": 1619, "time_per_iteration": 2.6997365951538086 }, { "auxiliary_loss_clip": 0.01380111, "auxiliary_loss_mlp": 0.01039721, "balance_loss_clip": 1.06456208, "balance_loss_mlp": 1.03097677, "epoch": 0.19479348283532735, "flos": 24133227770880.0, "grad_norm": 2.626150399216743, "language_loss": 0.86686182, "learning_rate": 3.7219676248163322e-06, "loss": 0.89106017, "num_input_tokens_seen": 34290285, "step": 1620, "time_per_iteration": 2.724510669708252 }, { "auxiliary_loss_clip": 0.0128569, "auxiliary_loss_mlp": 0.01045906, "balance_loss_clip": 1.06960034, "balance_loss_mlp": 1.03691792, "epoch": 0.19491372572596646, "flos": 25775638174080.0, "grad_norm": 2.076605864202758, "language_loss": 0.93719983, "learning_rate": 3.721571284030643e-06, "loss": 0.9605158, "num_input_tokens_seen": 34310095, "step": 1621, "time_per_iteration": 2.6683905124664307 }, { "auxiliary_loss_clip": 0.01279396, "auxiliary_loss_mlp": 0.01030326, "balance_loss_clip": 1.06515706, "balance_loss_mlp": 1.02140903, "epoch": 0.19503396861660555, "flos": 19645220067840.0, "grad_norm": 2.1842344338348005, "language_loss": 0.79052651, "learning_rate": 3.7211746820839587e-06, "loss": 0.81362373, "num_input_tokens_seen": 34327190, "step": 1622, "time_per_iteration": 2.7129218578338623 }, { "auxiliary_loss_clip": 0.01508868, "auxiliary_loss_mlp": 0.0103688, "balance_loss_clip": 1.05419111, "balance_loss_mlp": 1.02774239, "epoch": 0.19515421150724463, "flos": 21033023892480.0, "grad_norm": 1.7772317286182924, "language_loss": 0.80885893, "learning_rate": 3.7207778190364437e-06, "loss": 0.83431643, "num_input_tokens_seen": 34345615, "step": 1623, "time_per_iteration": 2.8700013160705566 }, { "auxiliary_loss_clip": 0.01462349, "auxiliary_loss_mlp": 0.01035747, "balance_loss_clip": 1.05476546, "balance_loss_mlp": 1.02653193, "epoch": 0.1952744543978837, "flos": 32961255143040.0, "grad_norm": 1.6583199020224215, "language_loss": 0.74294996, "learning_rate": 3.720380694948302e-06, "loss": 0.76793087, "num_input_tokens_seen": 34368500, "step": 1624, "time_per_iteration": 2.9747817516326904 }, { "auxiliary_loss_clip": 0.01354666, "auxiliary_loss_mlp": 0.01009032, "balance_loss_clip": 1.04805779, "balance_loss_mlp": 1.00704098, "epoch": 0.19539469728852282, "flos": 64044312030720.0, "grad_norm": 1.0347040211516259, "language_loss": 0.71264768, "learning_rate": 3.719983309879777e-06, "loss": 0.73628461, "num_input_tokens_seen": 34428280, "step": 1625, "time_per_iteration": 4.256620407104492 }, { "auxiliary_loss_clip": 0.0136844, "auxiliary_loss_mlp": 0.01032764, "balance_loss_clip": 1.06002927, "balance_loss_mlp": 1.02412772, "epoch": 0.1955149401791619, "flos": 13370908078080.0, "grad_norm": 1.9409172921653, "language_loss": 0.77587306, "learning_rate": 3.719585663891151e-06, "loss": 0.79988509, "num_input_tokens_seen": 34445815, "step": 1626, "time_per_iteration": 2.721174955368042 }, { "auxiliary_loss_clip": 0.01423042, "auxiliary_loss_mlp": 0.01041042, "balance_loss_clip": 1.06337428, "balance_loss_mlp": 1.03182101, "epoch": 0.195635183069801, "flos": 18728887184640.0, "grad_norm": 2.4068151477579645, "language_loss": 0.78923512, "learning_rate": 3.719187757042747e-06, "loss": 0.81387591, "num_input_tokens_seen": 34463635, "step": 1627, "time_per_iteration": 2.728510856628418 }, { "auxiliary_loss_clip": 0.01238639, "auxiliary_loss_mlp": 0.01007779, "balance_loss_clip": 1.04216862, "balance_loss_mlp": 1.00589585, "epoch": 0.1957554259604401, "flos": 69313952615040.0, "grad_norm": 0.7315007079120539, "language_loss": 0.54955041, "learning_rate": 3.7187895893949275e-06, "loss": 0.57201457, "num_input_tokens_seen": 34530105, "step": 1628, "time_per_iteration": 3.32606840133667 }, { "auxiliary_loss_clip": 0.0141532, "auxiliary_loss_mlp": 0.0103375, "balance_loss_clip": 1.05637193, "balance_loss_mlp": 1.02519691, "epoch": 0.19587566885107918, "flos": 21069257736960.0, "grad_norm": 2.995131503901059, "language_loss": 0.76030356, "learning_rate": 3.7183911610080937e-06, "loss": 0.78479421, "num_input_tokens_seen": 34546970, "step": 1629, "time_per_iteration": 3.6623995304107666 }, { "auxiliary_loss_clip": 0.01305366, "auxiliary_loss_mlp": 0.01035309, "balance_loss_clip": 1.06120396, "balance_loss_mlp": 1.02604651, "epoch": 0.19599591174171827, "flos": 22194661731840.0, "grad_norm": 4.1566697818360225, "language_loss": 0.75111079, "learning_rate": 3.7179924719426872e-06, "loss": 0.77451754, "num_input_tokens_seen": 34564865, "step": 1630, "time_per_iteration": 2.7885754108428955 }, { "auxiliary_loss_clip": 0.01278838, "auxiliary_loss_mlp": 0.01041541, "balance_loss_clip": 1.06625915, "balance_loss_mlp": 1.03202832, "epoch": 0.19611615463235738, "flos": 23768375374080.0, "grad_norm": 2.9022533482444013, "language_loss": 0.76047027, "learning_rate": 3.7175935222591885e-06, "loss": 0.783674, "num_input_tokens_seen": 34584165, "step": 1631, "time_per_iteration": 3.607504367828369 }, { "auxiliary_loss_clip": 0.013311, "auxiliary_loss_mlp": 0.01041831, "balance_loss_clip": 1.06975675, "balance_loss_mlp": 1.03293777, "epoch": 0.19623639752299646, "flos": 28618218731520.0, "grad_norm": 2.0646939994658036, "language_loss": 0.74516457, "learning_rate": 3.717194312018118e-06, "loss": 0.7688939, "num_input_tokens_seen": 34603150, "step": 1632, "time_per_iteration": 2.7685353755950928 }, { "auxiliary_loss_clip": 0.01277834, "auxiliary_loss_mlp": 0.01037987, "balance_loss_clip": 1.06596112, "balance_loss_mlp": 1.02944529, "epoch": 0.19635664041363554, "flos": 21032700670080.0, "grad_norm": 2.747004545600834, "language_loss": 0.76131326, "learning_rate": 3.716794841280036e-06, "loss": 0.78447139, "num_input_tokens_seen": 34621855, "step": 1633, "time_per_iteration": 3.582960605621338 }, { "auxiliary_loss_clip": 0.01282731, "auxiliary_loss_mlp": 0.01033988, "balance_loss_clip": 1.06494069, "balance_loss_mlp": 1.02458835, "epoch": 0.19647688330427462, "flos": 18879748306560.0, "grad_norm": 2.0224364333353804, "language_loss": 0.77362412, "learning_rate": 3.7163951101055407e-06, "loss": 0.79679132, "num_input_tokens_seen": 34639915, "step": 1634, "time_per_iteration": 2.652224063873291 }, { "auxiliary_loss_clip": 0.0132899, "auxiliary_loss_mlp": 0.01036812, "balance_loss_clip": 1.06788564, "balance_loss_mlp": 1.027812, "epoch": 0.19659712619491373, "flos": 24242503921920.0, "grad_norm": 2.713023112800421, "language_loss": 0.7873801, "learning_rate": 3.715995118555273e-06, "loss": 0.81103814, "num_input_tokens_seen": 34659890, "step": 1635, "time_per_iteration": 2.7692902088165283 }, { "auxiliary_loss_clip": 0.01355697, "auxiliary_loss_mlp": 0.01032775, "balance_loss_clip": 1.06057656, "balance_loss_mlp": 1.02292252, "epoch": 0.19671736908555282, "flos": 24717422568960.0, "grad_norm": 2.20729684441871, "language_loss": 0.8589884, "learning_rate": 3.71559486668991e-06, "loss": 0.88287312, "num_input_tokens_seen": 34678750, "step": 1636, "time_per_iteration": 2.9372501373291016 }, { "auxiliary_loss_clip": 0.01282083, "auxiliary_loss_mlp": 0.02578211, "balance_loss_clip": 1.06735468, "balance_loss_mlp": 0.99987519, "epoch": 0.1968376119761919, "flos": 23842279607040.0, "grad_norm": 1.62807709885512, "language_loss": 0.77539194, "learning_rate": 3.715194354570169e-06, "loss": 0.81399488, "num_input_tokens_seen": 34698755, "step": 1637, "time_per_iteration": 2.7777137756347656 }, { "auxiliary_loss_clip": 0.0127829, "auxiliary_loss_mlp": 0.01038615, "balance_loss_clip": 1.07143891, "balance_loss_mlp": 1.02998996, "epoch": 0.196957854866831, "flos": 18113917409280.0, "grad_norm": 1.9818268899689517, "language_loss": 0.83549643, "learning_rate": 3.714793582256809e-06, "loss": 0.85866547, "num_input_tokens_seen": 34715820, "step": 1638, "time_per_iteration": 2.720048189163208 }, { "auxiliary_loss_clip": 0.01225942, "auxiliary_loss_mlp": 0.01036275, "balance_loss_clip": 1.06679404, "balance_loss_mlp": 1.02684557, "epoch": 0.1970780977574701, "flos": 21653129312640.0, "grad_norm": 2.946475094882912, "language_loss": 0.84997672, "learning_rate": 3.7143925498106253e-06, "loss": 0.87259889, "num_input_tokens_seen": 34734360, "step": 1639, "time_per_iteration": 2.6627140045166016 }, { "auxiliary_loss_clip": 0.01325471, "auxiliary_loss_mlp": 0.01040324, "balance_loss_clip": 1.05932939, "balance_loss_mlp": 1.03091812, "epoch": 0.19719834064810918, "flos": 20811813984000.0, "grad_norm": 1.991505543512618, "language_loss": 0.79373944, "learning_rate": 3.7139912572924558e-06, "loss": 0.81739742, "num_input_tokens_seen": 34753390, "step": 1640, "time_per_iteration": 2.6914827823638916 }, { "auxiliary_loss_clip": 0.01273705, "auxiliary_loss_mlp": 0.01029837, "balance_loss_clip": 1.06144691, "balance_loss_mlp": 1.02092612, "epoch": 0.19731858353874826, "flos": 23434800744960.0, "grad_norm": 2.5315872197725526, "language_loss": 0.8065201, "learning_rate": 3.7135897047631744e-06, "loss": 0.82955551, "num_input_tokens_seen": 34771275, "step": 1641, "time_per_iteration": 2.6427090167999268 }, { "auxiliary_loss_clip": 0.01334248, "auxiliary_loss_mlp": 0.01030876, "balance_loss_clip": 1.06787193, "balance_loss_mlp": 1.02139902, "epoch": 0.19743882642938737, "flos": 23988184652160.0, "grad_norm": 2.9806369794888865, "language_loss": 0.76329261, "learning_rate": 3.713187892283698e-06, "loss": 0.78694391, "num_input_tokens_seen": 34790885, "step": 1642, "time_per_iteration": 2.693169593811035 }, { "auxiliary_loss_clip": 0.01420425, "auxiliary_loss_mlp": 0.01036055, "balance_loss_clip": 1.05612993, "balance_loss_mlp": 1.02741218, "epoch": 0.19755906932002645, "flos": 15004340081280.0, "grad_norm": 3.154642741200047, "language_loss": 0.87062871, "learning_rate": 3.71278581991498e-06, "loss": 0.89519358, "num_input_tokens_seen": 34806745, "step": 1643, "time_per_iteration": 2.7449069023132324 }, { "auxiliary_loss_clip": 0.01384026, "auxiliary_loss_mlp": 0.02582457, "balance_loss_clip": 1.07087326, "balance_loss_mlp": 0.99984628, "epoch": 0.19767931221066554, "flos": 19494466686720.0, "grad_norm": 1.7908206521950611, "language_loss": 0.78996474, "learning_rate": 3.712383487718015e-06, "loss": 0.82962954, "num_input_tokens_seen": 34824985, "step": 1644, "time_per_iteration": 2.752018928527832 }, { "auxiliary_loss_clip": 0.01390973, "auxiliary_loss_mlp": 0.01031379, "balance_loss_clip": 1.05664515, "balance_loss_mlp": 1.02343392, "epoch": 0.19779955510130465, "flos": 25737895958400.0, "grad_norm": 1.9011143634136871, "language_loss": 0.87350762, "learning_rate": 3.7119808957538365e-06, "loss": 0.89773113, "num_input_tokens_seen": 34843980, "step": 1645, "time_per_iteration": 2.755253791809082 }, { "auxiliary_loss_clip": 0.01321177, "auxiliary_loss_mlp": 0.01032107, "balance_loss_clip": 1.05944324, "balance_loss_mlp": 1.02311814, "epoch": 0.19791979799194373, "flos": 20777699041920.0, "grad_norm": 12.864956822226443, "language_loss": 0.80397975, "learning_rate": 3.711578044083517e-06, "loss": 0.82751256, "num_input_tokens_seen": 34860780, "step": 1646, "time_per_iteration": 2.693596601486206 }, { "auxiliary_loss_clip": 0.01319741, "auxiliary_loss_mlp": 0.01038244, "balance_loss_clip": 1.05999136, "balance_loss_mlp": 1.02958381, "epoch": 0.1980400408825828, "flos": 25589010084480.0, "grad_norm": 2.434915707221235, "language_loss": 0.74726009, "learning_rate": 3.7111749327681698e-06, "loss": 0.77083993, "num_input_tokens_seen": 34880815, "step": 1647, "time_per_iteration": 2.7016055583953857 }, { "auxiliary_loss_clip": 0.01281165, "auxiliary_loss_mlp": 0.01032551, "balance_loss_clip": 1.0677743, "balance_loss_mlp": 1.02434909, "epoch": 0.1981602837732219, "flos": 23513840622720.0, "grad_norm": 1.9499021584300493, "language_loss": 0.86662006, "learning_rate": 3.7107715618689455e-06, "loss": 0.88975728, "num_input_tokens_seen": 34899790, "step": 1648, "time_per_iteration": 2.6840555667877197 }, { "auxiliary_loss_clip": 0.01272329, "auxiliary_loss_mlp": 0.0103058, "balance_loss_clip": 1.06497228, "balance_loss_mlp": 1.02189565, "epoch": 0.198280526663861, "flos": 23185365724800.0, "grad_norm": 1.7629934197173565, "language_loss": 0.83423412, "learning_rate": 3.710367931447035e-06, "loss": 0.85726321, "num_input_tokens_seen": 34921570, "step": 1649, "time_per_iteration": 2.709613084793091 }, { "auxiliary_loss_clip": 0.01285358, "auxiliary_loss_mlp": 0.01038143, "balance_loss_clip": 1.06688917, "balance_loss_mlp": 1.02912486, "epoch": 0.1984007695545001, "flos": 21689470897920.0, "grad_norm": 2.9420838386285064, "language_loss": 0.86822867, "learning_rate": 3.70996404156367e-06, "loss": 0.89146376, "num_input_tokens_seen": 34941205, "step": 1650, "time_per_iteration": 2.597578287124634 }, { "auxiliary_loss_clip": 0.0141286, "auxiliary_loss_mlp": 0.01032898, "balance_loss_clip": 1.05712652, "balance_loss_mlp": 1.02388549, "epoch": 0.19852101244513917, "flos": 36064008887040.0, "grad_norm": 1.7857287795391952, "language_loss": 0.72997165, "learning_rate": 3.7095598922801187e-06, "loss": 0.75442922, "num_input_tokens_seen": 34963280, "step": 1651, "time_per_iteration": 3.8049657344818115 }, { "auxiliary_loss_clip": 0.01226515, "auxiliary_loss_mlp": 0.01034985, "balance_loss_clip": 1.06663799, "balance_loss_mlp": 1.02659845, "epoch": 0.19864125533577828, "flos": 23105894883840.0, "grad_norm": 3.9779927920367473, "language_loss": 0.76350474, "learning_rate": 3.7091554836576914e-06, "loss": 0.7861197, "num_input_tokens_seen": 34979955, "step": 1652, "time_per_iteration": 2.592480182647705 }, { "auxiliary_loss_clip": 0.01267206, "auxiliary_loss_mlp": 0.02574943, "balance_loss_clip": 1.05986881, "balance_loss_mlp": 0.99982131, "epoch": 0.19876149822641737, "flos": 24608505553920.0, "grad_norm": 1.8463627967460794, "language_loss": 0.83127427, "learning_rate": 3.708750815757736e-06, "loss": 0.86969572, "num_input_tokens_seen": 35000725, "step": 1653, "time_per_iteration": 2.7091140747070312 }, { "auxiliary_loss_clip": 0.01274152, "auxiliary_loss_mlp": 0.01036041, "balance_loss_clip": 1.06216621, "balance_loss_mlp": 1.02693319, "epoch": 0.19888174111705645, "flos": 32196645308160.0, "grad_norm": 4.111203368968098, "language_loss": 0.73352361, "learning_rate": 3.7083458886416407e-06, "loss": 0.75662553, "num_input_tokens_seen": 35019920, "step": 1654, "time_per_iteration": 2.7219035625457764 }, { "auxiliary_loss_clip": 0.01397142, "auxiliary_loss_mlp": 0.01036578, "balance_loss_clip": 1.05861056, "balance_loss_mlp": 1.02748227, "epoch": 0.19900198400769553, "flos": 24608469640320.0, "grad_norm": 4.82672370187564, "language_loss": 0.88096631, "learning_rate": 3.707940702370832e-06, "loss": 0.90530348, "num_input_tokens_seen": 35040765, "step": 1655, "time_per_iteration": 3.7567856311798096 }, { "auxiliary_loss_clip": 0.01183619, "auxiliary_loss_mlp": 0.01000082, "balance_loss_clip": 1.04266131, "balance_loss_mlp": 0.99844855, "epoch": 0.19912222689833464, "flos": 67915805673600.0, "grad_norm": 0.7545459574257879, "language_loss": 0.58276856, "learning_rate": 3.707535257006777e-06, "loss": 0.60460556, "num_input_tokens_seen": 35106390, "step": 1656, "time_per_iteration": 3.3170125484466553 }, { "auxiliary_loss_clip": 0.01334399, "auxiliary_loss_mlp": 0.01036105, "balance_loss_clip": 1.06816506, "balance_loss_mlp": 1.02637768, "epoch": 0.19924246978897373, "flos": 15742340916480.0, "grad_norm": 5.52228309801078, "language_loss": 0.88618791, "learning_rate": 3.707129552610981e-06, "loss": 0.90989298, "num_input_tokens_seen": 35125040, "step": 1657, "time_per_iteration": 3.5554146766662598 }, { "auxiliary_loss_clip": 0.01315483, "auxiliary_loss_mlp": 0.01032999, "balance_loss_clip": 1.05961525, "balance_loss_mlp": 1.02472031, "epoch": 0.1993627126796128, "flos": 17566566986880.0, "grad_norm": 2.0146780058519314, "language_loss": 0.73835731, "learning_rate": 3.70672358924499e-06, "loss": 0.76184213, "num_input_tokens_seen": 35144280, "step": 1658, "time_per_iteration": 2.643749713897705 }, { "auxiliary_loss_clip": 0.01375914, "auxiliary_loss_mlp": 0.01035854, "balance_loss_clip": 1.06568003, "balance_loss_mlp": 1.02759886, "epoch": 0.19948295557025192, "flos": 40843826680320.0, "grad_norm": 2.017569368247558, "language_loss": 0.78778106, "learning_rate": 3.706317366970386e-06, "loss": 0.81189871, "num_input_tokens_seen": 35165280, "step": 1659, "time_per_iteration": 3.826295852661133 }, { "auxiliary_loss_clip": 0.01226937, "auxiliary_loss_mlp": 0.02579213, "balance_loss_clip": 1.06432962, "balance_loss_mlp": 0.99989778, "epoch": 0.199603198460891, "flos": 25082418620160.0, "grad_norm": 2.0891626123938574, "language_loss": 0.83409524, "learning_rate": 3.705910885848795e-06, "loss": 0.87215674, "num_input_tokens_seen": 35183655, "step": 1660, "time_per_iteration": 2.634070634841919 }, { "auxiliary_loss_clip": 0.01275128, "auxiliary_loss_mlp": 0.01036516, "balance_loss_clip": 1.06553149, "balance_loss_mlp": 1.02777171, "epoch": 0.19972344135153008, "flos": 20084120352000.0, "grad_norm": 2.3362508821818038, "language_loss": 0.84930742, "learning_rate": 3.705504145941879e-06, "loss": 0.87242383, "num_input_tokens_seen": 35201825, "step": 1661, "time_per_iteration": 2.6389496326446533 }, { "auxiliary_loss_clip": 0.01222216, "auxiliary_loss_mlp": 0.0103209, "balance_loss_clip": 1.06341696, "balance_loss_mlp": 1.02309525, "epoch": 0.1998436842421692, "flos": 23727472761600.0, "grad_norm": 3.1470355402084804, "language_loss": 0.78875494, "learning_rate": 3.7050971473113403e-06, "loss": 0.81129801, "num_input_tokens_seen": 35221600, "step": 1662, "time_per_iteration": 2.6208863258361816 }, { "auxiliary_loss_clip": 0.01272177, "auxiliary_loss_mlp": 0.02573699, "balance_loss_clip": 1.06165528, "balance_loss_mlp": 0.99982476, "epoch": 0.19996392713280828, "flos": 36102361633920.0, "grad_norm": 2.0991158106972394, "language_loss": 0.79944628, "learning_rate": 3.7046898900189196e-06, "loss": 0.83790505, "num_input_tokens_seen": 35245935, "step": 1663, "time_per_iteration": 2.7715420722961426 }, { "auxiliary_loss_clip": 0.01377963, "auxiliary_loss_mlp": 0.01039795, "balance_loss_clip": 1.0650878, "balance_loss_mlp": 1.03024614, "epoch": 0.20008417002344736, "flos": 23657662679040.0, "grad_norm": 2.9491536421111877, "language_loss": 0.82873875, "learning_rate": 3.704282374126398e-06, "loss": 0.8529163, "num_input_tokens_seen": 35265615, "step": 1664, "time_per_iteration": 2.7438595294952393 }, { "auxiliary_loss_clip": 0.01377481, "auxiliary_loss_mlp": 0.01033123, "balance_loss_clip": 1.06274319, "balance_loss_mlp": 1.02404535, "epoch": 0.20020441291408644, "flos": 21872076664320.0, "grad_norm": 1.7475760122731514, "language_loss": 0.87413543, "learning_rate": 3.7038745996955954e-06, "loss": 0.89824146, "num_input_tokens_seen": 35284960, "step": 1665, "time_per_iteration": 2.8300352096557617 }, { "auxiliary_loss_clip": 0.01304695, "auxiliary_loss_mlp": 0.0103899, "balance_loss_clip": 1.06222439, "balance_loss_mlp": 1.02949524, "epoch": 0.20032465580472555, "flos": 23179691376000.0, "grad_norm": 3.4241071825257094, "language_loss": 0.72360092, "learning_rate": 3.703466566788371e-06, "loss": 0.74703783, "num_input_tokens_seen": 35304090, "step": 1666, "time_per_iteration": 2.846301794052124 }, { "auxiliary_loss_clip": 0.01328142, "auxiliary_loss_mlp": 0.0104111, "balance_loss_clip": 1.06756091, "balance_loss_mlp": 1.03160346, "epoch": 0.20044489869536464, "flos": 23873521461120.0, "grad_norm": 2.008750459134699, "language_loss": 0.74827456, "learning_rate": 3.703058275466622e-06, "loss": 0.77196711, "num_input_tokens_seen": 35323325, "step": 1667, "time_per_iteration": 2.814880609512329 }, { "auxiliary_loss_clip": 0.01320041, "auxiliary_loss_mlp": 0.01031392, "balance_loss_clip": 1.05873942, "balance_loss_mlp": 1.02314234, "epoch": 0.20056514158600372, "flos": 21945226711680.0, "grad_norm": 7.347942489737876, "language_loss": 0.777843, "learning_rate": 3.7026497257922877e-06, "loss": 0.80135733, "num_input_tokens_seen": 35343635, "step": 1668, "time_per_iteration": 2.750514030456543 }, { "auxiliary_loss_clip": 0.01422254, "auxiliary_loss_mlp": 0.01034763, "balance_loss_clip": 1.05843413, "balance_loss_mlp": 1.02586436, "epoch": 0.20068538447664283, "flos": 23879159896320.0, "grad_norm": 1.7411298520704428, "language_loss": 0.85087669, "learning_rate": 3.7022409178273436e-06, "loss": 0.87544692, "num_input_tokens_seen": 35364615, "step": 1669, "time_per_iteration": 2.7990002632141113 }, { "auxiliary_loss_clip": 0.01268626, "auxiliary_loss_mlp": 0.01034228, "balance_loss_clip": 1.05951357, "balance_loss_mlp": 1.02538848, "epoch": 0.2008056273672819, "flos": 18442823270400.0, "grad_norm": 2.9427474532139666, "language_loss": 0.78642505, "learning_rate": 3.7018318516338054e-06, "loss": 0.80945355, "num_input_tokens_seen": 35383775, "step": 1670, "time_per_iteration": 2.627643346786499 }, { "auxiliary_loss_clip": 0.0128135, "auxiliary_loss_mlp": 0.01028256, "balance_loss_clip": 1.06601119, "balance_loss_mlp": 1.02003026, "epoch": 0.200925870257921, "flos": 23659530186240.0, "grad_norm": 2.5854287198896504, "language_loss": 0.81203669, "learning_rate": 3.7014225272737284e-06, "loss": 0.83513272, "num_input_tokens_seen": 35403000, "step": 1671, "time_per_iteration": 2.6971592903137207 }, { "auxiliary_loss_clip": 0.01267941, "auxiliary_loss_mlp": 0.01033244, "balance_loss_clip": 1.06192994, "balance_loss_mlp": 1.0241127, "epoch": 0.20104611314856008, "flos": 16217115909120.0, "grad_norm": 2.8196039912423387, "language_loss": 0.74118567, "learning_rate": 3.701012944809207e-06, "loss": 0.76419753, "num_input_tokens_seen": 35420115, "step": 1672, "time_per_iteration": 2.65588641166687 }, { "auxiliary_loss_clip": 0.01325154, "auxiliary_loss_mlp": 0.02577064, "balance_loss_clip": 1.06522584, "balance_loss_mlp": 0.99983704, "epoch": 0.2011663560391992, "flos": 21397373498880.0, "grad_norm": 2.3434051998572745, "language_loss": 0.78958309, "learning_rate": 3.700603104302374e-06, "loss": 0.82860529, "num_input_tokens_seen": 35439925, "step": 1673, "time_per_iteration": 2.6895415782928467 }, { "auxiliary_loss_clip": 0.01264681, "auxiliary_loss_mlp": 0.01001912, "balance_loss_clip": 1.0291481, "balance_loss_mlp": 1.00010014, "epoch": 0.20128659892983827, "flos": 62229459409920.0, "grad_norm": 0.9200876188886893, "language_loss": 0.55956268, "learning_rate": 3.7001930058154027e-06, "loss": 0.58222866, "num_input_tokens_seen": 35504885, "step": 1674, "time_per_iteration": 3.3157219886779785 }, { "auxiliary_loss_clip": 0.01370544, "auxiliary_loss_mlp": 0.01041009, "balance_loss_clip": 1.05983293, "balance_loss_mlp": 1.03156161, "epoch": 0.20140684182047736, "flos": 28438737448320.0, "grad_norm": 2.8489277070641745, "language_loss": 0.79951513, "learning_rate": 3.6997826494105037e-06, "loss": 0.82363057, "num_input_tokens_seen": 35525330, "step": 1675, "time_per_iteration": 2.75981068611145 }, { "auxiliary_loss_clip": 0.0132397, "auxiliary_loss_mlp": 0.01031985, "balance_loss_clip": 1.06301785, "balance_loss_mlp": 1.02215636, "epoch": 0.20152708471111647, "flos": 28074064619520.0, "grad_norm": 4.850010425315004, "language_loss": 0.69830894, "learning_rate": 3.6993720351499286e-06, "loss": 0.72186852, "num_input_tokens_seen": 35546455, "step": 1676, "time_per_iteration": 2.8245737552642822 }, { "auxiliary_loss_clip": 0.01331477, "auxiliary_loss_mlp": 0.01033736, "balance_loss_clip": 1.07280171, "balance_loss_mlp": 1.02454519, "epoch": 0.20164732760175555, "flos": 23549751244800.0, "grad_norm": 2.0451117016223592, "language_loss": 0.76975119, "learning_rate": 3.6989611630959666e-06, "loss": 0.79340327, "num_input_tokens_seen": 35565010, "step": 1677, "time_per_iteration": 3.7269303798675537 }, { "auxiliary_loss_clip": 0.01181521, "auxiliary_loss_mlp": 0.01004492, "balance_loss_clip": 1.04058337, "balance_loss_mlp": 1.00270355, "epoch": 0.20176757049239463, "flos": 71100616037760.0, "grad_norm": 0.7100113937801776, "language_loss": 0.58250046, "learning_rate": 3.6985500333109474e-06, "loss": 0.60436058, "num_input_tokens_seen": 35633340, "step": 1678, "time_per_iteration": 3.284301280975342 }, { "auxiliary_loss_clip": 0.01365833, "auxiliary_loss_mlp": 0.01032169, "balance_loss_clip": 1.05786407, "balance_loss_mlp": 1.0231986, "epoch": 0.20188781338303372, "flos": 21430159637760.0, "grad_norm": 3.604574505055815, "language_loss": 0.76035565, "learning_rate": 3.6981386458572385e-06, "loss": 0.78433567, "num_input_tokens_seen": 35651315, "step": 1679, "time_per_iteration": 3.665034294128418 }, { "auxiliary_loss_clip": 0.01371209, "auxiliary_loss_mlp": 0.01032261, "balance_loss_clip": 1.05997419, "balance_loss_mlp": 1.02144873, "epoch": 0.20200805627367283, "flos": 11546215130880.0, "grad_norm": 2.2914695103364973, "language_loss": 0.76098937, "learning_rate": 3.6977270007972468e-06, "loss": 0.78502405, "num_input_tokens_seen": 35668850, "step": 1680, "time_per_iteration": 2.7360188961029053 }, { "auxiliary_loss_clip": 0.01327373, "auxiliary_loss_mlp": 0.01045641, "balance_loss_clip": 1.06296182, "balance_loss_mlp": 1.0368607, "epoch": 0.2021282991643119, "flos": 28545391906560.0, "grad_norm": 2.2790244707231264, "language_loss": 0.72214818, "learning_rate": 3.6973150981934196e-06, "loss": 0.74587834, "num_input_tokens_seen": 35690080, "step": 1681, "time_per_iteration": 2.723375082015991 }, { "auxiliary_loss_clip": 0.01229956, "auxiliary_loss_mlp": 0.01042086, "balance_loss_clip": 1.06610656, "balance_loss_mlp": 1.03337789, "epoch": 0.202248542054951, "flos": 17923446564480.0, "grad_norm": 6.85945107841545, "language_loss": 0.83658177, "learning_rate": 3.6969029381082415e-06, "loss": 0.85930216, "num_input_tokens_seen": 35706075, "step": 1682, "time_per_iteration": 3.5132176876068115 }, { "auxiliary_loss_clip": 0.01244998, "auxiliary_loss_mlp": 0.01034252, "balance_loss_clip": 1.06225336, "balance_loss_mlp": 1.02570462, "epoch": 0.2023687849455901, "flos": 19864634296320.0, "grad_norm": 1.691394996584591, "language_loss": 0.79480517, "learning_rate": 3.696490520604237e-06, "loss": 0.81759763, "num_input_tokens_seen": 35724765, "step": 1683, "time_per_iteration": 2.688344955444336 }, { "auxiliary_loss_clip": 0.0127087, "auxiliary_loss_mlp": 0.01030676, "balance_loss_clip": 1.06498289, "balance_loss_mlp": 1.02289701, "epoch": 0.20248902783622919, "flos": 22564721600640.0, "grad_norm": 2.070584048190453, "language_loss": 0.80574334, "learning_rate": 3.696077845743968e-06, "loss": 0.82875878, "num_input_tokens_seen": 35744355, "step": 1684, "time_per_iteration": 2.696791887283325 }, { "auxiliary_loss_clip": 0.01225769, "auxiliary_loss_mlp": 0.01030843, "balance_loss_clip": 1.06374061, "balance_loss_mlp": 1.02058482, "epoch": 0.20260927072686827, "flos": 22709728805760.0, "grad_norm": 3.246539925304922, "language_loss": 0.73417181, "learning_rate": 3.69566491359004e-06, "loss": 0.75673795, "num_input_tokens_seen": 35761000, "step": 1685, "time_per_iteration": 3.5936810970306396 }, { "auxiliary_loss_clip": 0.01321669, "auxiliary_loss_mlp": 0.01034412, "balance_loss_clip": 1.05793643, "balance_loss_mlp": 1.02516687, "epoch": 0.20272951361750738, "flos": 51023998650240.0, "grad_norm": 1.9953262378128434, "language_loss": 0.69378853, "learning_rate": 3.695251724205092e-06, "loss": 0.71734929, "num_input_tokens_seen": 35785360, "step": 1686, "time_per_iteration": 2.927419662475586 }, { "auxiliary_loss_clip": 0.01225291, "auxiliary_loss_mlp": 0.01038524, "balance_loss_clip": 1.0660789, "balance_loss_mlp": 1.02948213, "epoch": 0.20284975650814646, "flos": 26578133879040.0, "grad_norm": 2.082580894639418, "language_loss": 0.86496037, "learning_rate": 3.6948382776518054e-06, "loss": 0.88759851, "num_input_tokens_seen": 35806065, "step": 1687, "time_per_iteration": 2.7014248371124268 }, { "auxiliary_loss_clip": 0.01381268, "auxiliary_loss_mlp": 0.0103289, "balance_loss_clip": 1.05974865, "balance_loss_mlp": 1.02362156, "epoch": 0.20296999939878554, "flos": 16034222833920.0, "grad_norm": 2.629379662004305, "language_loss": 0.79582012, "learning_rate": 3.6944245739929e-06, "loss": 0.81996167, "num_input_tokens_seen": 35822225, "step": 1688, "time_per_iteration": 2.804232120513916 }, { "auxiliary_loss_clip": 0.01280102, "auxiliary_loss_mlp": 0.01033204, "balance_loss_clip": 1.06789589, "balance_loss_mlp": 1.02369118, "epoch": 0.20309024228942463, "flos": 19203374868480.0, "grad_norm": 2.2709526950643717, "language_loss": 0.71940595, "learning_rate": 3.6940106132911332e-06, "loss": 0.74253905, "num_input_tokens_seen": 35839410, "step": 1689, "time_per_iteration": 2.668020009994507 }, { "auxiliary_loss_clip": 0.012764, "auxiliary_loss_mlp": 0.01034824, "balance_loss_clip": 1.06584299, "balance_loss_mlp": 1.02663982, "epoch": 0.20321048518006374, "flos": 22821087945600.0, "grad_norm": 2.0071278942120307, "language_loss": 0.88753408, "learning_rate": 3.6935963956093037e-06, "loss": 0.91064632, "num_input_tokens_seen": 35859495, "step": 1690, "time_per_iteration": 2.625235080718994 }, { "auxiliary_loss_clip": 0.01267848, "auxiliary_loss_mlp": 0.0103717, "balance_loss_clip": 1.06277204, "balance_loss_mlp": 1.02881968, "epoch": 0.20333072807070282, "flos": 19096397187840.0, "grad_norm": 2.4890665604988675, "language_loss": 0.6918695, "learning_rate": 3.6931819210102474e-06, "loss": 0.71491969, "num_input_tokens_seen": 35878890, "step": 1691, "time_per_iteration": 2.699068546295166 }, { "auxiliary_loss_clip": 0.01226903, "auxiliary_loss_mlp": 0.01032456, "balance_loss_clip": 1.06556296, "balance_loss_mlp": 1.02343166, "epoch": 0.2034509709613419, "flos": 18180962144640.0, "grad_norm": 2.184465144649324, "language_loss": 0.8461051, "learning_rate": 3.6927671895568402e-06, "loss": 0.86869866, "num_input_tokens_seen": 35897950, "step": 1692, "time_per_iteration": 2.5956993103027344 }, { "auxiliary_loss_clip": 0.01230696, "auxiliary_loss_mlp": 0.01030919, "balance_loss_clip": 1.06898785, "balance_loss_mlp": 1.02206206, "epoch": 0.20357121385198101, "flos": 22923899648640.0, "grad_norm": 2.021867251389478, "language_loss": 0.87210697, "learning_rate": 3.692352201311996e-06, "loss": 0.89472318, "num_input_tokens_seen": 35916800, "step": 1693, "time_per_iteration": 2.6266164779663086 }, { "auxiliary_loss_clip": 0.01370027, "auxiliary_loss_mlp": 0.01032275, "balance_loss_clip": 1.06009388, "balance_loss_mlp": 1.02337027, "epoch": 0.2036914567426201, "flos": 20922131629440.0, "grad_norm": 1.8460726947749244, "language_loss": 0.76840031, "learning_rate": 3.6919369563386687e-06, "loss": 0.79242337, "num_input_tokens_seen": 35936600, "step": 1694, "time_per_iteration": 2.729376792907715 }, { "auxiliary_loss_clip": 0.01319847, "auxiliary_loss_mlp": 0.01032456, "balance_loss_clip": 1.06499648, "balance_loss_mlp": 1.02411771, "epoch": 0.20381169963325918, "flos": 15519155760000.0, "grad_norm": 2.304111998423511, "language_loss": 0.7890687, "learning_rate": 3.69152145469985e-06, "loss": 0.81259179, "num_input_tokens_seen": 35953645, "step": 1695, "time_per_iteration": 2.678624153137207 }, { "auxiliary_loss_clip": 0.01423939, "auxiliary_loss_mlp": 0.01034151, "balance_loss_clip": 1.05842352, "balance_loss_mlp": 1.02434587, "epoch": 0.20393194252389826, "flos": 28833143760000.0, "grad_norm": 3.46805242552658, "language_loss": 0.82432044, "learning_rate": 3.691105696458572e-06, "loss": 0.84890127, "num_input_tokens_seen": 35970940, "step": 1696, "time_per_iteration": 2.7951457500457764 }, { "auxiliary_loss_clip": 0.01229361, "auxiliary_loss_mlp": 0.01029436, "balance_loss_clip": 1.07042587, "balance_loss_mlp": 1.02061415, "epoch": 0.20405218541453737, "flos": 22488554810880.0, "grad_norm": 3.35183288827725, "language_loss": 0.68442309, "learning_rate": 3.690689681677904e-06, "loss": 0.70701104, "num_input_tokens_seen": 35989410, "step": 1697, "time_per_iteration": 2.6266632080078125 }, { "auxiliary_loss_clip": 0.013245, "auxiliary_loss_mlp": 0.01034465, "balance_loss_clip": 1.06032741, "balance_loss_mlp": 1.02598894, "epoch": 0.20417242830517646, "flos": 25374408278400.0, "grad_norm": 1.7559516687438865, "language_loss": 0.88647115, "learning_rate": 3.690273410420956e-06, "loss": 0.91006076, "num_input_tokens_seen": 36009175, "step": 1698, "time_per_iteration": 2.749053955078125 }, { "auxiliary_loss_clip": 0.01271157, "auxiliary_loss_mlp": 0.01033254, "balance_loss_clip": 1.06364775, "balance_loss_mlp": 1.02458191, "epoch": 0.20429267119581554, "flos": 14793078240000.0, "grad_norm": 2.5971573943267963, "language_loss": 0.76430291, "learning_rate": 3.689856882750875e-06, "loss": 0.78734702, "num_input_tokens_seen": 36024375, "step": 1699, "time_per_iteration": 2.6545238494873047 }, { "auxiliary_loss_clip": 0.01270598, "auxiliary_loss_mlp": 0.01037213, "balance_loss_clip": 1.0669558, "balance_loss_mlp": 1.02877331, "epoch": 0.20441291408645465, "flos": 17781851151360.0, "grad_norm": 1.7647510006711382, "language_loss": 0.7874831, "learning_rate": 3.6894400987308486e-06, "loss": 0.81056118, "num_input_tokens_seen": 36041895, "step": 1700, "time_per_iteration": 2.6769585609436035 }, { "auxiliary_loss_clip": 0.01277441, "auxiliary_loss_mlp": 0.01030548, "balance_loss_clip": 1.06326032, "balance_loss_mlp": 1.02172017, "epoch": 0.20453315697709373, "flos": 16435668211200.0, "grad_norm": 2.9430100475385808, "language_loss": 0.85330898, "learning_rate": 3.6890230584241024e-06, "loss": 0.87638885, "num_input_tokens_seen": 36058825, "step": 1701, "time_per_iteration": 2.62534761428833 }, { "auxiliary_loss_clip": 0.01130801, "auxiliary_loss_mlp": 0.01002604, "balance_loss_clip": 1.04127932, "balance_loss_mlp": 1.00095928, "epoch": 0.20465339986773282, "flos": 66713085653760.0, "grad_norm": 1.0804683706377751, "language_loss": 0.66401911, "learning_rate": 3.6886057618939016e-06, "loss": 0.68535316, "num_input_tokens_seen": 36121645, "step": 1702, "time_per_iteration": 3.2759385108947754 }, { "auxiliary_loss_clip": 0.01298187, "auxiliary_loss_mlp": 0.01034362, "balance_loss_clip": 1.06118989, "balance_loss_mlp": 1.02530789, "epoch": 0.2047736427583719, "flos": 41974114924800.0, "grad_norm": 2.0316128916545564, "language_loss": 0.6887942, "learning_rate": 3.6881882092035492e-06, "loss": 0.71211964, "num_input_tokens_seen": 36143030, "step": 1703, "time_per_iteration": 3.7965002059936523 }, { "auxiliary_loss_clip": 0.0129342, "auxiliary_loss_mlp": 0.02528347, "balance_loss_clip": 1.03952289, "balance_loss_mlp": 0.9996891, "epoch": 0.204893885649011, "flos": 69940878641280.0, "grad_norm": 0.9393771280061165, "language_loss": 0.61222589, "learning_rate": 3.6877704004163873e-06, "loss": 0.65044355, "num_input_tokens_seen": 36203435, "step": 1704, "time_per_iteration": 3.385798454284668 }, { "auxiliary_loss_clip": 0.01227521, "auxiliary_loss_mlp": 0.01033135, "balance_loss_clip": 1.06591773, "balance_loss_mlp": 1.02388477, "epoch": 0.2050141285396501, "flos": 22200012858240.0, "grad_norm": 1.8395011068794145, "language_loss": 0.77852625, "learning_rate": 3.6873523355957984e-06, "loss": 0.8011328, "num_input_tokens_seen": 36222435, "step": 1705, "time_per_iteration": 3.649529218673706 }, { "auxiliary_loss_clip": 0.0112819, "auxiliary_loss_mlp": 0.01000645, "balance_loss_clip": 1.03906333, "balance_loss_mlp": 0.99896395, "epoch": 0.20513437143028918, "flos": 46283721730560.0, "grad_norm": 1.0372227072027458, "language_loss": 0.64085948, "learning_rate": 3.686934014805201e-06, "loss": 0.66214776, "num_input_tokens_seen": 36273065, "step": 1706, "time_per_iteration": 3.050250291824341 }, { "auxiliary_loss_clip": 0.01275895, "auxiliary_loss_mlp": 0.01042837, "balance_loss_clip": 1.06679106, "balance_loss_mlp": 1.03373551, "epoch": 0.20525461432092829, "flos": 21904324099200.0, "grad_norm": 1.8270061935013222, "language_loss": 0.81094134, "learning_rate": 3.6865154381080552e-06, "loss": 0.83412862, "num_input_tokens_seen": 36293750, "step": 1707, "time_per_iteration": 2.6549713611602783 }, { "auxiliary_loss_clip": 0.01519603, "auxiliary_loss_mlp": 0.01033156, "balance_loss_clip": 1.05395532, "balance_loss_mlp": 1.02441764, "epoch": 0.20537485721156737, "flos": 21214264942080.0, "grad_norm": 2.5054879899293216, "language_loss": 0.82816321, "learning_rate": 3.6860966055678585e-06, "loss": 0.8536908, "num_input_tokens_seen": 36310105, "step": 1708, "time_per_iteration": 3.7794735431671143 }, { "auxiliary_loss_clip": 0.01275493, "auxiliary_loss_mlp": 0.01031655, "balance_loss_clip": 1.06605613, "balance_loss_mlp": 1.02210593, "epoch": 0.20549510010220645, "flos": 20191205773440.0, "grad_norm": 2.1401690789310157, "language_loss": 0.8653003, "learning_rate": 3.685677517248147e-06, "loss": 0.88837171, "num_input_tokens_seen": 36328995, "step": 1709, "time_per_iteration": 2.682203531265259 }, { "auxiliary_loss_clip": 0.01320501, "auxiliary_loss_mlp": 0.0257455, "balance_loss_clip": 1.06568813, "balance_loss_mlp": 0.99987012, "epoch": 0.20561534299284553, "flos": 17016702612480.0, "grad_norm": 9.79805664118204, "language_loss": 0.80551362, "learning_rate": 3.6852581732124967e-06, "loss": 0.84446412, "num_input_tokens_seen": 36346340, "step": 1710, "time_per_iteration": 2.67453670501709 }, { "auxiliary_loss_clip": 0.01275807, "auxiliary_loss_mlp": 0.01033037, "balance_loss_clip": 1.06626153, "balance_loss_mlp": 1.02371526, "epoch": 0.20573558588348465, "flos": 22890467064960.0, "grad_norm": 1.8610296955554615, "language_loss": 0.76307273, "learning_rate": 3.6848385735245213e-06, "loss": 0.78616118, "num_input_tokens_seen": 36365430, "step": 1711, "time_per_iteration": 3.618485927581787 }, { "auxiliary_loss_clip": 0.012624, "auxiliary_loss_mlp": 0.01036655, "balance_loss_clip": 1.0578469, "balance_loss_mlp": 1.02794111, "epoch": 0.20585582877412373, "flos": 24643123286400.0, "grad_norm": 1.9909103560763126, "language_loss": 0.861525, "learning_rate": 3.6844187182478734e-06, "loss": 0.88451558, "num_input_tokens_seen": 36386285, "step": 1712, "time_per_iteration": 2.7200582027435303 }, { "auxiliary_loss_clip": 0.01317113, "auxiliary_loss_mlp": 0.01030743, "balance_loss_clip": 1.05878687, "balance_loss_mlp": 1.02161729, "epoch": 0.2059760716647628, "flos": 24206952435840.0, "grad_norm": 1.7870360130062108, "language_loss": 0.75278431, "learning_rate": 3.683998607446246e-06, "loss": 0.77626288, "num_input_tokens_seen": 36404935, "step": 1713, "time_per_iteration": 2.7512638568878174 }, { "auxiliary_loss_clip": 0.01279654, "auxiliary_loss_mlp": 0.0103416, "balance_loss_clip": 1.06811404, "balance_loss_mlp": 1.02565992, "epoch": 0.20609631455540192, "flos": 20229522606720.0, "grad_norm": 2.493863199302456, "language_loss": 0.75440609, "learning_rate": 3.6835782411833686e-06, "loss": 0.77754414, "num_input_tokens_seen": 36424455, "step": 1714, "time_per_iteration": 2.6189229488372803 }, { "auxiliary_loss_clip": 0.0136037, "auxiliary_loss_mlp": 0.01034588, "balance_loss_clip": 1.05821443, "balance_loss_mlp": 1.02603459, "epoch": 0.206216557446041, "flos": 19864957518720.0, "grad_norm": 1.9286203832992679, "language_loss": 0.74052942, "learning_rate": 3.68315761952301e-06, "loss": 0.76447904, "num_input_tokens_seen": 36441685, "step": 1715, "time_per_iteration": 2.770328998565674 }, { "auxiliary_loss_clip": 0.01226084, "auxiliary_loss_mlp": 0.01037958, "balance_loss_clip": 1.06686676, "balance_loss_mlp": 1.02930927, "epoch": 0.2063368003366801, "flos": 24096311568000.0, "grad_norm": 2.0262083934546036, "language_loss": 0.83388329, "learning_rate": 3.6827367425289797e-06, "loss": 0.85652369, "num_input_tokens_seen": 36461460, "step": 1716, "time_per_iteration": 2.6359527111053467 }, { "auxiliary_loss_clip": 0.0131973, "auxiliary_loss_mlp": 0.01030916, "balance_loss_clip": 1.06310081, "balance_loss_mlp": 1.02182651, "epoch": 0.2064570432273192, "flos": 20340163474560.0, "grad_norm": 2.643337097838561, "language_loss": 0.72670609, "learning_rate": 3.6823156102651225e-06, "loss": 0.75021255, "num_input_tokens_seen": 36479615, "step": 1717, "time_per_iteration": 2.7509500980377197 }, { "auxiliary_loss_clip": 0.01458158, "auxiliary_loss_mlp": 0.0103724, "balance_loss_clip": 1.05575335, "balance_loss_mlp": 1.02832305, "epoch": 0.20657728611795828, "flos": 20520363029760.0, "grad_norm": 2.222896996465956, "language_loss": 0.70807743, "learning_rate": 3.6818942227953257e-06, "loss": 0.73303139, "num_input_tokens_seen": 36500160, "step": 1718, "time_per_iteration": 2.8121814727783203 }, { "auxiliary_loss_clip": 0.01375539, "auxiliary_loss_mlp": 0.01036572, "balance_loss_clip": 1.0619061, "balance_loss_mlp": 1.02808428, "epoch": 0.20669752900859736, "flos": 21799285752960.0, "grad_norm": 2.1305746952922076, "language_loss": 0.68848747, "learning_rate": 3.681472580183512e-06, "loss": 0.71260858, "num_input_tokens_seen": 36518810, "step": 1719, "time_per_iteration": 2.779588222503662 }, { "auxiliary_loss_clip": 0.01270087, "auxiliary_loss_mlp": 0.01034506, "balance_loss_clip": 1.06375289, "balance_loss_mlp": 1.02615559, "epoch": 0.20681777189923645, "flos": 15122020014720.0, "grad_norm": 2.0407202710539676, "language_loss": 0.86124218, "learning_rate": 3.6810506824936455e-06, "loss": 0.88428807, "num_input_tokens_seen": 36536890, "step": 1720, "time_per_iteration": 2.6859185695648193 }, { "auxiliary_loss_clip": 0.01223899, "auxiliary_loss_mlp": 0.01004349, "balance_loss_clip": 1.03211451, "balance_loss_mlp": 1.00276375, "epoch": 0.20693801478987556, "flos": 56481021509760.0, "grad_norm": 1.0253389557418644, "language_loss": 0.62522817, "learning_rate": 3.680628529789726e-06, "loss": 0.64751065, "num_input_tokens_seen": 36589300, "step": 1721, "time_per_iteration": 3.207810878753662 }, { "auxiliary_loss_clip": 0.01230421, "auxiliary_loss_mlp": 0.01038388, "balance_loss_clip": 1.06694508, "balance_loss_mlp": 1.02863717, "epoch": 0.20705825768051464, "flos": 21614201948160.0, "grad_norm": 2.7459276978122915, "language_loss": 0.862643, "learning_rate": 3.680206122135796e-06, "loss": 0.88533109, "num_input_tokens_seen": 36609905, "step": 1722, "time_per_iteration": 2.6467299461364746 }, { "auxiliary_loss_clip": 0.01428142, "auxiliary_loss_mlp": 0.01040387, "balance_loss_clip": 1.06271458, "balance_loss_mlp": 1.03179204, "epoch": 0.20717850057115372, "flos": 25848895962240.0, "grad_norm": 1.9247565899619996, "language_loss": 0.78343666, "learning_rate": 3.6797834595959323e-06, "loss": 0.80812204, "num_input_tokens_seen": 36629805, "step": 1723, "time_per_iteration": 2.878404140472412 }, { "auxiliary_loss_clip": 0.01416573, "auxiliary_loss_mlp": 0.01036331, "balance_loss_clip": 1.05715394, "balance_loss_mlp": 1.02639461, "epoch": 0.20729874346179283, "flos": 29130807767040.0, "grad_norm": 2.7373245652356264, "language_loss": 0.7773127, "learning_rate": 3.679360542234254e-06, "loss": 0.80184174, "num_input_tokens_seen": 36649150, "step": 1724, "time_per_iteration": 2.8337326049804688 }, { "auxiliary_loss_clip": 0.01315276, "auxiliary_loss_mlp": 0.02576445, "balance_loss_clip": 1.05540848, "balance_loss_mlp": 0.99999416, "epoch": 0.20741898635243192, "flos": 29023363209600.0, "grad_norm": 2.299169074848696, "language_loss": 0.72686625, "learning_rate": 3.678937370114916e-06, "loss": 0.76578343, "num_input_tokens_seen": 36668955, "step": 1725, "time_per_iteration": 2.7990612983703613 }, { "auxiliary_loss_clip": 0.01317617, "auxiliary_loss_mlp": 0.01031434, "balance_loss_clip": 1.06241095, "balance_loss_mlp": 1.02370334, "epoch": 0.207539229243071, "flos": 15559447841280.0, "grad_norm": 2.037022919483095, "language_loss": 0.78655446, "learning_rate": 3.678513943302114e-06, "loss": 0.810045, "num_input_tokens_seen": 36685730, "step": 1726, "time_per_iteration": 2.8272628784179688 }, { "auxiliary_loss_clip": 0.01222643, "auxiliary_loss_mlp": 0.01036626, "balance_loss_clip": 1.06505609, "balance_loss_mlp": 1.02749503, "epoch": 0.20765947213371008, "flos": 20521081301760.0, "grad_norm": 3.7254250067592234, "language_loss": 0.85263479, "learning_rate": 3.678090261860082e-06, "loss": 0.87522739, "num_input_tokens_seen": 36705460, "step": 1727, "time_per_iteration": 2.671719551086426 }, { "auxiliary_loss_clip": 0.01372209, "auxiliary_loss_mlp": 0.01035283, "balance_loss_clip": 1.05554938, "balance_loss_mlp": 1.02622271, "epoch": 0.2077797150243492, "flos": 19354415558400.0, "grad_norm": 2.9632718390182693, "language_loss": 0.77425575, "learning_rate": 3.6776663258530906e-06, "loss": 0.79833066, "num_input_tokens_seen": 36724110, "step": 1728, "time_per_iteration": 2.771486759185791 }, { "auxiliary_loss_clip": 0.01275482, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.06305742, "balance_loss_mlp": 1.03100252, "epoch": 0.20789995791498828, "flos": 21829952989440.0, "grad_norm": 2.471891837246355, "language_loss": 0.7140522, "learning_rate": 3.6772421353454516e-06, "loss": 0.73719996, "num_input_tokens_seen": 36742705, "step": 1729, "time_per_iteration": 3.7716257572174072 }, { "auxiliary_loss_clip": 0.01272886, "auxiliary_loss_mlp": 0.01035307, "balance_loss_clip": 1.06518805, "balance_loss_mlp": 1.02635396, "epoch": 0.20802020080562736, "flos": 23148844571520.0, "grad_norm": 1.8708959961133214, "language_loss": 0.88950652, "learning_rate": 3.6768176904015153e-06, "loss": 0.91258848, "num_input_tokens_seen": 36762510, "step": 1730, "time_per_iteration": 2.638502836227417 }, { "auxiliary_loss_clip": 0.0127689, "auxiliary_loss_mlp": 0.0103397, "balance_loss_clip": 1.06461203, "balance_loss_mlp": 1.02439129, "epoch": 0.20814044369626647, "flos": 23072677781760.0, "grad_norm": 2.4868299715985174, "language_loss": 0.6011585, "learning_rate": 3.6763929910856674e-06, "loss": 0.6242671, "num_input_tokens_seen": 36780960, "step": 1731, "time_per_iteration": 3.643238067626953 }, { "auxiliary_loss_clip": 0.01273704, "auxiliary_loss_mlp": 0.01039885, "balance_loss_clip": 1.06446779, "balance_loss_mlp": 1.03055739, "epoch": 0.20826068658690555, "flos": 19608016556160.0, "grad_norm": 2.6329506134499754, "language_loss": 0.77624631, "learning_rate": 3.6759680374623365e-06, "loss": 0.79938215, "num_input_tokens_seen": 36798875, "step": 1732, "time_per_iteration": 2.7076990604400635 }, { "auxiliary_loss_clip": 0.01226388, "auxiliary_loss_mlp": 0.01028146, "balance_loss_clip": 1.06867456, "balance_loss_mlp": 1.0195868, "epoch": 0.20838092947754464, "flos": 25374049142400.0, "grad_norm": 2.3900421590955125, "language_loss": 0.75257939, "learning_rate": 3.675542829595986e-06, "loss": 0.77512479, "num_input_tokens_seen": 36818540, "step": 1733, "time_per_iteration": 2.628584861755371 }, { "auxiliary_loss_clip": 0.01320551, "auxiliary_loss_mlp": 0.01033545, "balance_loss_clip": 1.06103384, "balance_loss_mlp": 1.02508092, "epoch": 0.20850117236818372, "flos": 24061729749120.0, "grad_norm": 1.881764362917191, "language_loss": 0.79538709, "learning_rate": 3.6751173675511213e-06, "loss": 0.81892806, "num_input_tokens_seen": 36840585, "step": 1734, "time_per_iteration": 3.8192715644836426 }, { "auxiliary_loss_clip": 0.01322181, "auxiliary_loss_mlp": 0.01036236, "balance_loss_clip": 1.05705404, "balance_loss_mlp": 1.02705646, "epoch": 0.20862141525882283, "flos": 20077799558400.0, "grad_norm": 2.1222476218305077, "language_loss": 0.8762387, "learning_rate": 3.674691651392283e-06, "loss": 0.89982289, "num_input_tokens_seen": 36858255, "step": 1735, "time_per_iteration": 2.744683265686035 }, { "auxiliary_loss_clip": 0.01328806, "auxiliary_loss_mlp": 0.0103925, "balance_loss_clip": 1.06422114, "balance_loss_mlp": 1.0298506, "epoch": 0.2087416581494619, "flos": 39015183237120.0, "grad_norm": 2.194188202382927, "language_loss": 0.75737637, "learning_rate": 3.674265681184053e-06, "loss": 0.78105694, "num_input_tokens_seen": 36881515, "step": 1736, "time_per_iteration": 2.864872932434082 }, { "auxiliary_loss_clip": 0.01320968, "auxiliary_loss_mlp": 0.01032233, "balance_loss_clip": 1.05824804, "balance_loss_mlp": 1.02269077, "epoch": 0.208861901040101, "flos": 26101994169600.0, "grad_norm": 2.5555526631259307, "language_loss": 0.86441398, "learning_rate": 3.6738394569910504e-06, "loss": 0.88794601, "num_input_tokens_seen": 36902055, "step": 1737, "time_per_iteration": 3.6651058197021484 }, { "auxiliary_loss_clip": 0.01280282, "auxiliary_loss_mlp": 0.01036092, "balance_loss_clip": 1.06819701, "balance_loss_mlp": 1.02647805, "epoch": 0.2089821439307401, "flos": 28398732675840.0, "grad_norm": 1.9846856410028844, "language_loss": 0.82845634, "learning_rate": 3.6734129788779333e-06, "loss": 0.85162008, "num_input_tokens_seen": 36921230, "step": 1738, "time_per_iteration": 2.665830612182617 }, { "auxiliary_loss_clip": 0.01293219, "auxiliary_loss_mlp": 0.0103398, "balance_loss_clip": 1.06160712, "balance_loss_mlp": 1.02554011, "epoch": 0.2091023868213792, "flos": 21069616872960.0, "grad_norm": 2.014672139354798, "language_loss": 0.90324295, "learning_rate": 3.6729862469093976e-06, "loss": 0.92651498, "num_input_tokens_seen": 36940325, "step": 1739, "time_per_iteration": 2.7729225158691406 }, { "auxiliary_loss_clip": 0.01308765, "auxiliary_loss_mlp": 0.01030164, "balance_loss_clip": 1.0561111, "balance_loss_mlp": 1.02099729, "epoch": 0.20922262971201827, "flos": 22455481363200.0, "grad_norm": 2.3547151240556716, "language_loss": 0.83040273, "learning_rate": 3.6725592611501782e-06, "loss": 0.85379201, "num_input_tokens_seen": 36959000, "step": 1740, "time_per_iteration": 2.695065975189209 }, { "auxiliary_loss_clip": 0.01273416, "auxiliary_loss_mlp": 0.01030318, "balance_loss_clip": 1.06140423, "balance_loss_mlp": 1.02129424, "epoch": 0.20934287260265738, "flos": 27852244179840.0, "grad_norm": 2.35073843927977, "language_loss": 0.76103365, "learning_rate": 3.6721320216650496e-06, "loss": 0.78407097, "num_input_tokens_seen": 36979615, "step": 1741, "time_per_iteration": 2.767033576965332 }, { "auxiliary_loss_clip": 0.01316229, "auxiliary_loss_mlp": 0.01028846, "balance_loss_clip": 1.06263232, "balance_loss_mlp": 1.01997137, "epoch": 0.20946311549329646, "flos": 16435309075200.0, "grad_norm": 2.9019244404021993, "language_loss": 0.8373152, "learning_rate": 3.6717045285188215e-06, "loss": 0.86076593, "num_input_tokens_seen": 36997310, "step": 1742, "time_per_iteration": 2.7059032917022705 }, { "auxiliary_loss_clip": 0.01409121, "auxiliary_loss_mlp": 0.01042718, "balance_loss_clip": 1.05169153, "balance_loss_mlp": 1.03334212, "epoch": 0.20958335838393555, "flos": 22492720788480.0, "grad_norm": 2.311275593555789, "language_loss": 0.87058568, "learning_rate": 3.671276781776346e-06, "loss": 0.89510405, "num_input_tokens_seen": 37015965, "step": 1743, "time_per_iteration": 2.754453420639038 }, { "auxiliary_loss_clip": 0.01300288, "auxiliary_loss_mlp": 0.01038066, "balance_loss_clip": 1.055583, "balance_loss_mlp": 1.02933383, "epoch": 0.20970360127457463, "flos": 25224768218880.0, "grad_norm": 3.253123060543526, "language_loss": 0.67245382, "learning_rate": 3.6708487815025128e-06, "loss": 0.69583738, "num_input_tokens_seen": 37036545, "step": 1744, "time_per_iteration": 2.7577831745147705 }, { "auxiliary_loss_clip": 0.01370625, "auxiliary_loss_mlp": 0.0103339, "balance_loss_clip": 1.0582931, "balance_loss_mlp": 1.02372193, "epoch": 0.20982384416521374, "flos": 18479164855680.0, "grad_norm": 2.385000999865213, "language_loss": 0.74370337, "learning_rate": 3.6704205277622463e-06, "loss": 0.76774347, "num_input_tokens_seen": 37054985, "step": 1745, "time_per_iteration": 2.7095906734466553 }, { "auxiliary_loss_clip": 0.01326247, "auxiliary_loss_mlp": 0.01032229, "balance_loss_clip": 1.05931377, "balance_loss_mlp": 1.02260232, "epoch": 0.20994408705585282, "flos": 25373546352000.0, "grad_norm": 1.8693245127873834, "language_loss": 0.80210394, "learning_rate": 3.6699920206205146e-06, "loss": 0.82568872, "num_input_tokens_seen": 37075725, "step": 1746, "time_per_iteration": 2.788283586502075 }, { "auxiliary_loss_clip": 0.01273124, "auxiliary_loss_mlp": 0.01038257, "balance_loss_clip": 1.0612787, "balance_loss_mlp": 1.02982283, "epoch": 0.2100643299464919, "flos": 21320955313920.0, "grad_norm": 3.0883696773192018, "language_loss": 0.81937522, "learning_rate": 3.669563260142321e-06, "loss": 0.842489, "num_input_tokens_seen": 37094615, "step": 1747, "time_per_iteration": 2.6362459659576416 }, { "auxiliary_loss_clip": 0.01318426, "auxiliary_loss_mlp": 0.01028119, "balance_loss_clip": 1.06335521, "balance_loss_mlp": 1.01949978, "epoch": 0.21018457283713102, "flos": 19354379644800.0, "grad_norm": 4.399219998747569, "language_loss": 0.84299892, "learning_rate": 3.6691342463927083e-06, "loss": 0.86646438, "num_input_tokens_seen": 37113610, "step": 1748, "time_per_iteration": 2.739823818206787 }, { "auxiliary_loss_clip": 0.01375696, "auxiliary_loss_mlp": 0.01040669, "balance_loss_clip": 1.05984259, "balance_loss_mlp": 1.03117371, "epoch": 0.2103048157277701, "flos": 28330035914880.0, "grad_norm": 2.022125697180014, "language_loss": 0.81959039, "learning_rate": 3.668704979436758e-06, "loss": 0.84375405, "num_input_tokens_seen": 37133705, "step": 1749, "time_per_iteration": 2.7498390674591064 }, { "auxiliary_loss_clip": 0.01319126, "auxiliary_loss_mlp": 0.01035904, "balance_loss_clip": 1.05865085, "balance_loss_mlp": 1.02682626, "epoch": 0.21042505861840918, "flos": 17457290835840.0, "grad_norm": 2.950291104536783, "language_loss": 0.78632414, "learning_rate": 3.668275459339588e-06, "loss": 0.80987442, "num_input_tokens_seen": 37152185, "step": 1750, "time_per_iteration": 2.7106075286865234 }, { "auxiliary_loss_clip": 0.01224472, "auxiliary_loss_mlp": 0.01031767, "balance_loss_clip": 1.06659126, "balance_loss_mlp": 1.02351809, "epoch": 0.21054530150904827, "flos": 14209817195520.0, "grad_norm": 1.9582133532635921, "language_loss": 0.80320483, "learning_rate": 3.667845686166358e-06, "loss": 0.82576722, "num_input_tokens_seen": 37169110, "step": 1751, "time_per_iteration": 2.623673439025879 }, { "auxiliary_loss_clip": 0.01361072, "auxiliary_loss_mlp": 0.01035274, "balance_loss_clip": 1.056499, "balance_loss_mlp": 1.02604079, "epoch": 0.21066554439968738, "flos": 18618210403200.0, "grad_norm": 5.166637474835836, "language_loss": 0.86176252, "learning_rate": 3.6674156599822634e-06, "loss": 0.88572598, "num_input_tokens_seen": 37184905, "step": 1752, "time_per_iteration": 2.6811368465423584 }, { "auxiliary_loss_clip": 0.01422186, "auxiliary_loss_mlp": 0.01039946, "balance_loss_clip": 1.0534488, "balance_loss_mlp": 1.03048062, "epoch": 0.21078578729032646, "flos": 23658883741440.0, "grad_norm": 2.527886186663741, "language_loss": 0.81879407, "learning_rate": 3.666985380852539e-06, "loss": 0.84341538, "num_input_tokens_seen": 37203910, "step": 1753, "time_per_iteration": 2.7897238731384277 }, { "auxiliary_loss_clip": 0.01322127, "auxiliary_loss_mlp": 0.01035222, "balance_loss_clip": 1.06275153, "balance_loss_mlp": 1.02513671, "epoch": 0.21090603018096554, "flos": 29346379240320.0, "grad_norm": 2.7505427351975666, "language_loss": 0.74410188, "learning_rate": 3.6665548488424576e-06, "loss": 0.76767534, "num_input_tokens_seen": 37222670, "step": 1754, "time_per_iteration": 2.7297680377960205 }, { "auxiliary_loss_clip": 0.0122621, "auxiliary_loss_mlp": 0.01032277, "balance_loss_clip": 1.06643772, "balance_loss_mlp": 1.02313972, "epoch": 0.21102627307160465, "flos": 23261245205760.0, "grad_norm": 1.7734979551404007, "language_loss": 0.88196695, "learning_rate": 3.6661240640173307e-06, "loss": 0.9045518, "num_input_tokens_seen": 37244140, "step": 1755, "time_per_iteration": 3.6571662425994873 }, { "auxiliary_loss_clip": 0.01201777, "auxiliary_loss_mlp": 0.01005858, "balance_loss_clip": 1.03724074, "balance_loss_mlp": 1.00436199, "epoch": 0.21114651596224374, "flos": 54633454577280.0, "grad_norm": 0.8568274907729799, "language_loss": 0.57889402, "learning_rate": 3.6656930264425085e-06, "loss": 0.60097039, "num_input_tokens_seen": 37308185, "step": 1756, "time_per_iteration": 3.3091979026794434 }, { "auxiliary_loss_clip": 0.01226057, "auxiliary_loss_mlp": 0.01030088, "balance_loss_clip": 1.06609023, "balance_loss_mlp": 1.0210458, "epoch": 0.21126675885288282, "flos": 21543314457600.0, "grad_norm": 2.367687893141832, "language_loss": 0.75797862, "learning_rate": 3.665261736183378e-06, "loss": 0.78054011, "num_input_tokens_seen": 37328220, "step": 1757, "time_per_iteration": 3.5261120796203613 }, { "auxiliary_loss_clip": 0.0138645, "auxiliary_loss_mlp": 0.01031441, "balance_loss_clip": 1.0698204, "balance_loss_mlp": 1.02267945, "epoch": 0.2113870017435219, "flos": 10961876678400.0, "grad_norm": 2.49880337742844, "language_loss": 0.88979357, "learning_rate": 3.664830193305366e-06, "loss": 0.91397244, "num_input_tokens_seen": 37345995, "step": 1758, "time_per_iteration": 2.8925654888153076 }, { "auxiliary_loss_clip": 0.01369972, "auxiliary_loss_mlp": 0.01039115, "balance_loss_clip": 1.05845833, "balance_loss_mlp": 1.03024006, "epoch": 0.211507244634161, "flos": 16653825463680.0, "grad_norm": 2.8754511319670812, "language_loss": 0.77223563, "learning_rate": 3.6643983978739373e-06, "loss": 0.79632652, "num_input_tokens_seen": 37362610, "step": 1759, "time_per_iteration": 2.6272199153900146 }, { "auxiliary_loss_clip": 0.01322306, "auxiliary_loss_mlp": 0.01039777, "balance_loss_clip": 1.06404722, "balance_loss_mlp": 1.03074741, "epoch": 0.2116274875248001, "flos": 20954091755520.0, "grad_norm": 1.8574461690617623, "language_loss": 0.82494563, "learning_rate": 3.663966349954596e-06, "loss": 0.84856641, "num_input_tokens_seen": 37382790, "step": 1760, "time_per_iteration": 3.5971145629882812 }, { "auxiliary_loss_clip": 0.01178462, "auxiliary_loss_mlp": 0.01004776, "balance_loss_clip": 1.034729, "balance_loss_mlp": 1.00329232, "epoch": 0.21174773041543918, "flos": 68196949424640.0, "grad_norm": 0.7875044742188703, "language_loss": 0.59662235, "learning_rate": 3.6635340496128816e-06, "loss": 0.61845481, "num_input_tokens_seen": 37439720, "step": 1761, "time_per_iteration": 3.1651768684387207 }, { "auxiliary_loss_clip": 0.01422838, "auxiliary_loss_mlp": 0.01035138, "balance_loss_clip": 1.05956697, "balance_loss_mlp": 1.02633429, "epoch": 0.2118679733060783, "flos": 20668315150080.0, "grad_norm": 1.673745513944039, "language_loss": 0.92819566, "learning_rate": 3.6631014969143747e-06, "loss": 0.95277542, "num_input_tokens_seen": 37459410, "step": 1762, "time_per_iteration": 2.8077399730682373 }, { "auxiliary_loss_clip": 0.01274521, "auxiliary_loss_mlp": 0.01041595, "balance_loss_clip": 1.06449914, "balance_loss_mlp": 1.03196263, "epoch": 0.21198821619671737, "flos": 23223431162880.0, "grad_norm": 1.8820962280805233, "language_loss": 0.88953674, "learning_rate": 3.662668691924693e-06, "loss": 0.91269779, "num_input_tokens_seen": 37480460, "step": 1763, "time_per_iteration": 3.620624303817749 }, { "auxiliary_loss_clip": 0.01375487, "auxiliary_loss_mlp": 0.01038924, "balance_loss_clip": 1.05902386, "balance_loss_mlp": 1.0291307, "epoch": 0.21210845908735645, "flos": 24498547044480.0, "grad_norm": 2.263020172934962, "language_loss": 0.71367347, "learning_rate": 3.6622356347094927e-06, "loss": 0.73781765, "num_input_tokens_seen": 37502025, "step": 1764, "time_per_iteration": 2.842888116836548 }, { "auxiliary_loss_clip": 0.01303834, "auxiliary_loss_mlp": 0.01041219, "balance_loss_clip": 1.06034708, "balance_loss_mlp": 1.03154516, "epoch": 0.21222870197799554, "flos": 27089789160960.0, "grad_norm": 2.1624631395346507, "language_loss": 0.78514129, "learning_rate": 3.6618023253344684e-06, "loss": 0.80859184, "num_input_tokens_seen": 37520885, "step": 1765, "time_per_iteration": 2.7139394283294678 }, { "auxiliary_loss_clip": 0.01275103, "auxiliary_loss_mlp": 0.01040112, "balance_loss_clip": 1.0625062, "balance_loss_mlp": 1.02979493, "epoch": 0.21234894486863465, "flos": 16873850223360.0, "grad_norm": 1.7301384679887115, "language_loss": 0.83933461, "learning_rate": 3.6613687638653527e-06, "loss": 0.86248678, "num_input_tokens_seen": 37539055, "step": 1766, "time_per_iteration": 2.7071187496185303 }, { "auxiliary_loss_clip": 0.01322323, "auxiliary_loss_mlp": 0.01033067, "balance_loss_clip": 1.06804407, "balance_loss_mlp": 1.02425098, "epoch": 0.21246918775927373, "flos": 23474949171840.0, "grad_norm": 3.004519927478719, "language_loss": 0.7764371, "learning_rate": 3.660934950367916e-06, "loss": 0.79999101, "num_input_tokens_seen": 37558300, "step": 1767, "time_per_iteration": 2.7264814376831055 }, { "auxiliary_loss_clip": 0.01276578, "auxiliary_loss_mlp": 0.01034168, "balance_loss_clip": 1.06478167, "balance_loss_mlp": 1.0248636, "epoch": 0.21258943064991281, "flos": 22382295402240.0, "grad_norm": 1.7269809657744628, "language_loss": 0.8302356, "learning_rate": 3.660500884907968e-06, "loss": 0.85334313, "num_input_tokens_seen": 37579040, "step": 1768, "time_per_iteration": 2.719905376434326 }, { "auxiliary_loss_clip": 0.01260619, "auxiliary_loss_mlp": 0.01004029, "balance_loss_clip": 1.02463698, "balance_loss_mlp": 1.00212145, "epoch": 0.21270967354055192, "flos": 59440168679040.0, "grad_norm": 0.8289654695877348, "language_loss": 0.59951389, "learning_rate": 3.660066567551356e-06, "loss": 0.62216038, "num_input_tokens_seen": 37639185, "step": 1769, "time_per_iteration": 3.2155396938323975 }, { "auxiliary_loss_clip": 0.01276763, "auxiliary_loss_mlp": 0.02580387, "balance_loss_clip": 1.06637144, "balance_loss_mlp": 1.00010633, "epoch": 0.212829916431191, "flos": 21544032729600.0, "grad_norm": 6.515510965562395, "language_loss": 0.84543657, "learning_rate": 3.6596319983639657e-06, "loss": 0.88400805, "num_input_tokens_seen": 37657765, "step": 1770, "time_per_iteration": 2.7066287994384766 }, { "auxiliary_loss_clip": 0.01371283, "auxiliary_loss_mlp": 0.02580169, "balance_loss_clip": 1.06236625, "balance_loss_mlp": 1.00003147, "epoch": 0.2129501593218301, "flos": 28987739896320.0, "grad_norm": 1.681172293775958, "language_loss": 0.86436832, "learning_rate": 3.6591971774117214e-06, "loss": 0.90388286, "num_input_tokens_seen": 37680740, "step": 1771, "time_per_iteration": 2.80973219871521 }, { "auxiliary_loss_clip": 0.01280346, "auxiliary_loss_mlp": 0.01033758, "balance_loss_clip": 1.06702852, "balance_loss_mlp": 1.02479959, "epoch": 0.2130704022124692, "flos": 18806993308800.0, "grad_norm": 2.1846579218978452, "language_loss": 0.80594134, "learning_rate": 3.6587621047605833e-06, "loss": 0.82908237, "num_input_tokens_seen": 37697910, "step": 1772, "time_per_iteration": 2.6686975955963135 }, { "auxiliary_loss_clip": 0.01274324, "auxiliary_loss_mlp": 0.01029977, "balance_loss_clip": 1.06544495, "balance_loss_mlp": 1.02082753, "epoch": 0.21319064510310828, "flos": 13918150759680.0, "grad_norm": 2.2534928364758273, "language_loss": 0.86625165, "learning_rate": 3.6583267804765542e-06, "loss": 0.88929462, "num_input_tokens_seen": 37712245, "step": 1773, "time_per_iteration": 2.6781694889068604 }, { "auxiliary_loss_clip": 0.01267512, "auxiliary_loss_mlp": 0.01030817, "balance_loss_clip": 1.06133509, "balance_loss_mlp": 1.02188253, "epoch": 0.21331088799374737, "flos": 20959694277120.0, "grad_norm": 2.3494531725510535, "language_loss": 0.85925657, "learning_rate": 3.6578912046256702e-06, "loss": 0.88223982, "num_input_tokens_seen": 37730765, "step": 1774, "time_per_iteration": 2.6669938564300537 }, { "auxiliary_loss_clip": 0.01369505, "auxiliary_loss_mlp": 0.0104284, "balance_loss_clip": 1.05793381, "balance_loss_mlp": 1.03374469, "epoch": 0.21343113088438645, "flos": 18624638937600.0, "grad_norm": 2.547695201039595, "language_loss": 0.76169783, "learning_rate": 3.6574553772740083e-06, "loss": 0.78582132, "num_input_tokens_seen": 37748695, "step": 1775, "time_per_iteration": 2.7556629180908203 }, { "auxiliary_loss_clip": 0.01177854, "auxiliary_loss_mlp": 0.0101472, "balance_loss_clip": 1.0407443, "balance_loss_mlp": 1.01326525, "epoch": 0.21355137377502556, "flos": 67413128791680.0, "grad_norm": 0.8609051581600449, "language_loss": 0.61823809, "learning_rate": 3.657019298487684e-06, "loss": 0.64016384, "num_input_tokens_seen": 37813705, "step": 1776, "time_per_iteration": 3.246567726135254 }, { "auxiliary_loss_clip": 0.0120199, "auxiliary_loss_mlp": 0.02581356, "balance_loss_clip": 1.06397784, "balance_loss_mlp": 1.00004864, "epoch": 0.21367161666566464, "flos": 34532095697280.0, "grad_norm": 2.0677815715548196, "language_loss": 0.83927566, "learning_rate": 3.6565829683328495e-06, "loss": 0.87710917, "num_input_tokens_seen": 37836330, "step": 1777, "time_per_iteration": 2.7719578742980957 }, { "auxiliary_loss_clip": 0.01278678, "auxiliary_loss_mlp": 0.01040513, "balance_loss_clip": 1.07033849, "balance_loss_mlp": 1.03157246, "epoch": 0.21379185955630373, "flos": 18989347680000.0, "grad_norm": 2.072539695510629, "language_loss": 0.86121464, "learning_rate": 3.6561463868756965e-06, "loss": 0.88440657, "num_input_tokens_seen": 37855030, "step": 1778, "time_per_iteration": 2.6473135948181152 }, { "auxiliary_loss_clip": 0.01271631, "auxiliary_loss_mlp": 0.01036237, "balance_loss_clip": 1.06641829, "balance_loss_mlp": 1.02670574, "epoch": 0.21391210244694284, "flos": 28218497207040.0, "grad_norm": 2.5916173600739283, "language_loss": 0.78083807, "learning_rate": 3.655709554182452e-06, "loss": 0.80391675, "num_input_tokens_seen": 37875370, "step": 1779, "time_per_iteration": 2.7059826850891113 }, { "auxiliary_loss_clip": 0.01277951, "auxiliary_loss_mlp": 0.01036481, "balance_loss_clip": 1.06457853, "balance_loss_mlp": 1.02692032, "epoch": 0.21403234533758192, "flos": 17455064192640.0, "grad_norm": 2.151908928040915, "language_loss": 0.84718746, "learning_rate": 3.6552724703193855e-06, "loss": 0.87033176, "num_input_tokens_seen": 37892560, "step": 1780, "time_per_iteration": 2.6796207427978516 }, { "auxiliary_loss_clip": 0.0132744, "auxiliary_loss_mlp": 0.01001632, "balance_loss_clip": 1.02319264, "balance_loss_mlp": 1.00009429, "epoch": 0.214152588228221, "flos": 51637606686720.0, "grad_norm": 0.7921044272079738, "language_loss": 0.55883908, "learning_rate": 3.654835135352801e-06, "loss": 0.58212984, "num_input_tokens_seen": 37947370, "step": 1781, "time_per_iteration": 4.1659018993377686 }, { "auxiliary_loss_clip": 0.01425297, "auxiliary_loss_mlp": 0.01031328, "balance_loss_clip": 1.05423832, "balance_loss_mlp": 1.02214921, "epoch": 0.21427283111886009, "flos": 19496154625920.0, "grad_norm": 3.9191011373234037, "language_loss": 0.87221694, "learning_rate": 3.654397549349043e-06, "loss": 0.89678323, "num_input_tokens_seen": 37964745, "step": 1782, "time_per_iteration": 2.755384922027588 }, { "auxiliary_loss_clip": 0.01318316, "auxiliary_loss_mlp": 0.01032081, "balance_loss_clip": 1.0649972, "balance_loss_mlp": 1.02341413, "epoch": 0.2143930740094992, "flos": 20084802710400.0, "grad_norm": 2.989746319018865, "language_loss": 0.75754344, "learning_rate": 3.653959712374491e-06, "loss": 0.7810474, "num_input_tokens_seen": 37982850, "step": 1783, "time_per_iteration": 3.627288818359375 }, { "auxiliary_loss_clip": 0.01373825, "auxiliary_loss_mlp": 0.01040906, "balance_loss_clip": 1.06853032, "balance_loss_mlp": 1.03058839, "epoch": 0.21451331690013828, "flos": 21798603394560.0, "grad_norm": 2.0498277978259023, "language_loss": 0.83139491, "learning_rate": 3.6535216244955663e-06, "loss": 0.85554218, "num_input_tokens_seen": 38002745, "step": 1784, "time_per_iteration": 2.7062625885009766 }, { "auxiliary_loss_clip": 0.01319471, "auxiliary_loss_mlp": 0.01031265, "balance_loss_clip": 1.06164813, "balance_loss_mlp": 1.02239037, "epoch": 0.21463355979077736, "flos": 32853882412800.0, "grad_norm": 1.8729542045869425, "language_loss": 0.70876324, "learning_rate": 3.653083285778726e-06, "loss": 0.7322706, "num_input_tokens_seen": 38024115, "step": 1785, "time_per_iteration": 2.827552556991577 }, { "auxiliary_loss_clip": 0.01279897, "auxiliary_loss_mlp": 0.01038334, "balance_loss_clip": 1.06548309, "balance_loss_mlp": 1.02904749, "epoch": 0.21475380268141647, "flos": 21543817248000.0, "grad_norm": 4.430154575242656, "language_loss": 0.81321549, "learning_rate": 3.6526446962904653e-06, "loss": 0.83639783, "num_input_tokens_seen": 38042830, "step": 1786, "time_per_iteration": 3.536332368850708 }, { "auxiliary_loss_clip": 0.01270112, "auxiliary_loss_mlp": 0.01036712, "balance_loss_clip": 1.06536269, "balance_loss_mlp": 1.02787876, "epoch": 0.21487404557205556, "flos": 32159082660480.0, "grad_norm": 1.5936039418731456, "language_loss": 0.74281693, "learning_rate": 3.652205856097318e-06, "loss": 0.76588523, "num_input_tokens_seen": 38066015, "step": 1787, "time_per_iteration": 2.7212398052215576 }, { "auxiliary_loss_clip": 0.01385551, "auxiliary_loss_mlp": 0.02581464, "balance_loss_clip": 1.06244278, "balance_loss_mlp": 1.00004101, "epoch": 0.21499428846269464, "flos": 12673091583360.0, "grad_norm": 2.096367683624753, "language_loss": 0.79048938, "learning_rate": 3.651766765265856e-06, "loss": 0.83015954, "num_input_tokens_seen": 38083025, "step": 1788, "time_per_iteration": 2.6626946926116943 }, { "auxiliary_loss_clip": 0.01317496, "auxiliary_loss_mlp": 0.01036262, "balance_loss_clip": 1.05823255, "balance_loss_mlp": 1.02668929, "epoch": 0.21511453135333372, "flos": 23471573293440.0, "grad_norm": 2.7871301142360396, "language_loss": 0.81330216, "learning_rate": 3.65132742386269e-06, "loss": 0.83683974, "num_input_tokens_seen": 38098245, "step": 1789, "time_per_iteration": 2.685608148574829 }, { "auxiliary_loss_clip": 0.01223384, "auxiliary_loss_mlp": 0.010395, "balance_loss_clip": 1.06453812, "balance_loss_mlp": 1.02989173, "epoch": 0.21523477424397283, "flos": 26943560893440.0, "grad_norm": 1.8271341856337657, "language_loss": 0.84700918, "learning_rate": 3.6508878319544656e-06, "loss": 0.86963803, "num_input_tokens_seen": 38118460, "step": 1790, "time_per_iteration": 3.5692155361175537 }, { "auxiliary_loss_clip": 0.01317693, "auxiliary_loss_mlp": 0.0105159, "balance_loss_clip": 1.06213307, "balance_loss_mlp": 1.04167211, "epoch": 0.21535501713461191, "flos": 18916161719040.0, "grad_norm": 3.496962010025046, "language_loss": 0.81780076, "learning_rate": 3.65044798960787e-06, "loss": 0.84149361, "num_input_tokens_seen": 38136800, "step": 1791, "time_per_iteration": 2.6545181274414062 }, { "auxiliary_loss_clip": 0.01361306, "auxiliary_loss_mlp": 0.01030397, "balance_loss_clip": 1.0561043, "balance_loss_mlp": 1.0217483, "epoch": 0.215475260025251, "flos": 17895113712000.0, "grad_norm": 1.9567917602834946, "language_loss": 0.7835598, "learning_rate": 3.650007896889627e-06, "loss": 0.80747688, "num_input_tokens_seen": 38155380, "step": 1792, "time_per_iteration": 2.758579969406128 }, { "auxiliary_loss_clip": 0.0122394, "auxiliary_loss_mlp": 0.01038767, "balance_loss_clip": 1.06836724, "balance_loss_mlp": 1.02967739, "epoch": 0.2155955029158901, "flos": 16654292340480.0, "grad_norm": 2.0858369676241, "language_loss": 0.8107267, "learning_rate": 3.6495675538664974e-06, "loss": 0.83335376, "num_input_tokens_seen": 38174395, "step": 1793, "time_per_iteration": 2.549421548843384 }, { "auxiliary_loss_clip": 0.01322159, "auxiliary_loss_mlp": 0.01033342, "balance_loss_clip": 1.05783868, "balance_loss_mlp": 1.02453816, "epoch": 0.2157157458065292, "flos": 23621213352960.0, "grad_norm": 1.8043421653627878, "language_loss": 0.82531834, "learning_rate": 3.649126960605282e-06, "loss": 0.84887332, "num_input_tokens_seen": 38195380, "step": 1794, "time_per_iteration": 2.7396738529205322 }, { "auxiliary_loss_clip": 0.01327114, "auxiliary_loss_mlp": 0.01036224, "balance_loss_clip": 1.06694138, "balance_loss_mlp": 1.02718806, "epoch": 0.21583598869716827, "flos": 22127078292480.0, "grad_norm": 3.8218788601248934, "language_loss": 0.83437389, "learning_rate": 3.6486861171728174e-06, "loss": 0.85800731, "num_input_tokens_seen": 38213775, "step": 1795, "time_per_iteration": 2.8259174823760986 }, { "auxiliary_loss_clip": 0.01375823, "auxiliary_loss_mlp": 0.01036619, "balance_loss_clip": 1.05755234, "balance_loss_mlp": 1.02748787, "epoch": 0.21595623158780738, "flos": 23441229279360.0, "grad_norm": 2.4148424152212438, "language_loss": 0.78591049, "learning_rate": 3.6482450236359803e-06, "loss": 0.81003493, "num_input_tokens_seen": 38235630, "step": 1796, "time_per_iteration": 2.803398609161377 }, { "auxiliary_loss_clip": 0.01271812, "auxiliary_loss_mlp": 0.0102835, "balance_loss_clip": 1.06546664, "balance_loss_mlp": 1.01981449, "epoch": 0.21607647447844647, "flos": 26906501036160.0, "grad_norm": 2.561215370409979, "language_loss": 0.77874839, "learning_rate": 3.647803680061683e-06, "loss": 0.80175, "num_input_tokens_seen": 38256045, "step": 1797, "time_per_iteration": 2.722527027130127 }, { "auxiliary_loss_clip": 0.01321405, "auxiliary_loss_mlp": 0.01033037, "balance_loss_clip": 1.06302953, "balance_loss_mlp": 1.02364278, "epoch": 0.21619671736908555, "flos": 14495378319360.0, "grad_norm": 4.040781519588101, "language_loss": 0.74851084, "learning_rate": 3.6473620865168776e-06, "loss": 0.77205527, "num_input_tokens_seen": 38272915, "step": 1798, "time_per_iteration": 2.6588895320892334 }, { "auxiliary_loss_clip": 0.01323789, "auxiliary_loss_mlp": 0.01037547, "balance_loss_clip": 1.06502819, "balance_loss_mlp": 1.02877951, "epoch": 0.21631696025972463, "flos": 17931096161280.0, "grad_norm": 1.8637174041370437, "language_loss": 0.81647867, "learning_rate": 3.646920243068554e-06, "loss": 0.84009206, "num_input_tokens_seen": 38290810, "step": 1799, "time_per_iteration": 2.698209762573242 }, { "auxiliary_loss_clip": 0.01311413, "auxiliary_loss_mlp": 0.01025272, "balance_loss_clip": 1.0587709, "balance_loss_mlp": 1.01662982, "epoch": 0.21643720315036374, "flos": 24462385027200.0, "grad_norm": 6.690382426527499, "language_loss": 0.74545413, "learning_rate": 3.6464781497837384e-06, "loss": 0.768821, "num_input_tokens_seen": 38312785, "step": 1800, "time_per_iteration": 2.7022581100463867 }, { "auxiliary_loss_clip": 0.0132416, "auxiliary_loss_mlp": 0.01037102, "balance_loss_clip": 1.05836141, "balance_loss_mlp": 1.02804792, "epoch": 0.21655744604100283, "flos": 28474432588800.0, "grad_norm": 1.8000329786039209, "language_loss": 0.7290144, "learning_rate": 3.6460358067294965e-06, "loss": 0.75262702, "num_input_tokens_seen": 38334015, "step": 1801, "time_per_iteration": 2.779560089111328 }, { "auxiliary_loss_clip": 0.01224774, "auxiliary_loss_mlp": 0.01031283, "balance_loss_clip": 1.06364369, "balance_loss_mlp": 1.02151382, "epoch": 0.2166776889316419, "flos": 20152960767360.0, "grad_norm": 2.121907900622631, "language_loss": 0.77874351, "learning_rate": 3.645593213972932e-06, "loss": 0.8013041, "num_input_tokens_seen": 38352920, "step": 1802, "time_per_iteration": 2.623281478881836 }, { "auxiliary_loss_clip": 0.01270112, "auxiliary_loss_mlp": 0.01044163, "balance_loss_clip": 1.06137514, "balance_loss_mlp": 1.0345602, "epoch": 0.21679793182228102, "flos": 15193482122880.0, "grad_norm": 2.1023726878575846, "language_loss": 0.80270457, "learning_rate": 3.6451503715811852e-06, "loss": 0.82584739, "num_input_tokens_seen": 38371230, "step": 1803, "time_per_iteration": 2.6429011821746826 }, { "auxiliary_loss_clip": 0.01321374, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.06406641, "balance_loss_mlp": 1.02505672, "epoch": 0.2169181747129201, "flos": 17384464010880.0, "grad_norm": 1.9040539132475822, "language_loss": 0.80028355, "learning_rate": 3.6447072796214345e-06, "loss": 0.82383746, "num_input_tokens_seen": 38389795, "step": 1804, "time_per_iteration": 2.6334779262542725 }, { "auxiliary_loss_clip": 0.01329649, "auxiliary_loss_mlp": 0.01001847, "balance_loss_clip": 1.02841902, "balance_loss_mlp": 1.00020206, "epoch": 0.21703841760355919, "flos": 58760955429120.0, "grad_norm": 0.9049969286103443, "language_loss": 0.63119602, "learning_rate": 3.644263938160898e-06, "loss": 0.65451097, "num_input_tokens_seen": 38445760, "step": 1805, "time_per_iteration": 3.2207343578338623 }, { "auxiliary_loss_clip": 0.01373416, "auxiliary_loss_mlp": 0.01040969, "balance_loss_clip": 1.05969465, "balance_loss_mlp": 1.03090167, "epoch": 0.21715866049419827, "flos": 22418457419520.0, "grad_norm": 2.2940225935224445, "language_loss": 0.71886533, "learning_rate": 3.6438203472668293e-06, "loss": 0.74300915, "num_input_tokens_seen": 38465405, "step": 1806, "time_per_iteration": 3.667471408843994 }, { "auxiliary_loss_clip": 0.01325371, "auxiliary_loss_mlp": 0.01038249, "balance_loss_clip": 1.06387711, "balance_loss_mlp": 1.0295825, "epoch": 0.21727890338483738, "flos": 17237732952960.0, "grad_norm": 3.761815281131092, "language_loss": 0.82234895, "learning_rate": 3.6433765070065206e-06, "loss": 0.84598511, "num_input_tokens_seen": 38483195, "step": 1807, "time_per_iteration": 2.7338051795959473 }, { "auxiliary_loss_clip": 0.01224926, "auxiliary_loss_mlp": 0.0103414, "balance_loss_clip": 1.06554794, "balance_loss_mlp": 1.02522922, "epoch": 0.21739914627547646, "flos": 13434792416640.0, "grad_norm": 3.1218814819774416, "language_loss": 0.87782538, "learning_rate": 3.6429324174473025e-06, "loss": 0.90041608, "num_input_tokens_seen": 38496735, "step": 1808, "time_per_iteration": 2.6756317615509033 }, { "auxiliary_loss_clip": 0.01273087, "auxiliary_loss_mlp": 0.01034979, "balance_loss_clip": 1.06221628, "balance_loss_mlp": 1.02589536, "epoch": 0.21751938916611555, "flos": 20959514709120.0, "grad_norm": 3.0161572708815245, "language_loss": 0.84781528, "learning_rate": 3.6424880786565425e-06, "loss": 0.87089598, "num_input_tokens_seen": 38512880, "step": 1809, "time_per_iteration": 2.6617579460144043 }, { "auxiliary_loss_clip": 0.01412873, "auxiliary_loss_mlp": 0.01034733, "balance_loss_clip": 1.05963922, "balance_loss_mlp": 1.02527356, "epoch": 0.21763963205675466, "flos": 27599936071680.0, "grad_norm": 2.8105043503279843, "language_loss": 0.79709983, "learning_rate": 3.6420434907016482e-06, "loss": 0.82157588, "num_input_tokens_seen": 38532570, "step": 1810, "time_per_iteration": 3.701144218444824 }, { "auxiliary_loss_clip": 0.01274401, "auxiliary_loss_mlp": 0.01034123, "balance_loss_clip": 1.06681943, "balance_loss_mlp": 1.0250628, "epoch": 0.21775987494739374, "flos": 21430411032960.0, "grad_norm": 3.5978555649867676, "language_loss": 0.81395048, "learning_rate": 3.6415986536500606e-06, "loss": 0.83703572, "num_input_tokens_seen": 38550900, "step": 1811, "time_per_iteration": 2.6591572761535645 }, { "auxiliary_loss_clip": 0.01412531, "auxiliary_loss_mlp": 0.01037765, "balance_loss_clip": 1.06119049, "balance_loss_mlp": 1.02909303, "epoch": 0.21788011783803282, "flos": 18332972501760.0, "grad_norm": 2.342628866402087, "language_loss": 0.80443585, "learning_rate": 3.641153567569263e-06, "loss": 0.82893884, "num_input_tokens_seen": 38569215, "step": 1812, "time_per_iteration": 3.694887161254883 }, { "auxiliary_loss_clip": 0.01266325, "auxiliary_loss_mlp": 0.01035373, "balance_loss_clip": 1.0613656, "balance_loss_mlp": 1.02620554, "epoch": 0.2180003607286719, "flos": 30262748037120.0, "grad_norm": 2.5794797298738183, "language_loss": 0.95532787, "learning_rate": 3.640708232526774e-06, "loss": 0.9783448, "num_input_tokens_seen": 38587870, "step": 1813, "time_per_iteration": 2.736747980117798 }, { "auxiliary_loss_clip": 0.0146356, "auxiliary_loss_mlp": 0.01040572, "balance_loss_clip": 1.04798591, "balance_loss_mlp": 1.03146994, "epoch": 0.21812060361931102, "flos": 25480272637440.0, "grad_norm": 2.3396340931039066, "language_loss": 0.78917521, "learning_rate": 3.6402626485901504e-06, "loss": 0.81421655, "num_input_tokens_seen": 38606965, "step": 1814, "time_per_iteration": 2.797799825668335 }, { "auxiliary_loss_clip": 0.01265921, "auxiliary_loss_mlp": 0.01038536, "balance_loss_clip": 1.06284189, "balance_loss_mlp": 1.0298574, "epoch": 0.2182408465099501, "flos": 21908166854400.0, "grad_norm": 2.3710009202224533, "language_loss": 0.78312647, "learning_rate": 3.639816815826988e-06, "loss": 0.806171, "num_input_tokens_seen": 38626290, "step": 1815, "time_per_iteration": 2.7242581844329834 }, { "auxiliary_loss_clip": 0.01317931, "auxiliary_loss_mlp": 0.01036168, "balance_loss_clip": 1.06298518, "balance_loss_mlp": 1.02760911, "epoch": 0.21836108940058918, "flos": 23657339456640.0, "grad_norm": 1.8858278050058819, "language_loss": 0.7815882, "learning_rate": 3.6393707343049176e-06, "loss": 0.80512917, "num_input_tokens_seen": 38646620, "step": 1816, "time_per_iteration": 3.6603691577911377 }, { "auxiliary_loss_clip": 0.01274608, "auxiliary_loss_mlp": 0.01038893, "balance_loss_clip": 1.0620513, "balance_loss_mlp": 1.02955294, "epoch": 0.2184813322912283, "flos": 24681009156480.0, "grad_norm": 2.5102095147729004, "language_loss": 0.735116, "learning_rate": 3.6389244040916104e-06, "loss": 0.75825107, "num_input_tokens_seen": 38665695, "step": 1817, "time_per_iteration": 2.7004237174987793 }, { "auxiliary_loss_clip": 0.01316257, "auxiliary_loss_mlp": 0.02579937, "balance_loss_clip": 1.05892324, "balance_loss_mlp": 1.00002372, "epoch": 0.21860157518186737, "flos": 26574650259840.0, "grad_norm": 2.223379214500999, "language_loss": 0.79566258, "learning_rate": 3.6384778252547747e-06, "loss": 0.83462453, "num_input_tokens_seen": 38681575, "step": 1818, "time_per_iteration": 2.7978434562683105 }, { "auxiliary_loss_clip": 0.01314885, "auxiliary_loss_mlp": 0.02580902, "balance_loss_clip": 1.0615567, "balance_loss_mlp": 1.00004733, "epoch": 0.21872181807250646, "flos": 20886292834560.0, "grad_norm": 2.285920552183603, "language_loss": 0.78202534, "learning_rate": 3.638030997862155e-06, "loss": 0.82098323, "num_input_tokens_seen": 38700510, "step": 1819, "time_per_iteration": 2.671473741531372 }, { "auxiliary_loss_clip": 0.01216162, "auxiliary_loss_mlp": 0.01008649, "balance_loss_clip": 1.03184485, "balance_loss_mlp": 1.00694418, "epoch": 0.21884206096314554, "flos": 61209452897280.0, "grad_norm": 0.7564528170685185, "language_loss": 0.59432018, "learning_rate": 3.6375839219815356e-06, "loss": 0.61656833, "num_input_tokens_seen": 38758310, "step": 1820, "time_per_iteration": 3.1525533199310303 }, { "auxiliary_loss_clip": 0.01222009, "auxiliary_loss_mlp": 0.01032649, "balance_loss_clip": 1.06364989, "balance_loss_mlp": 1.02298164, "epoch": 0.21896230385378465, "flos": 23473835850240.0, "grad_norm": 2.0161550397297696, "language_loss": 0.82601744, "learning_rate": 3.6371365976807375e-06, "loss": 0.84856403, "num_input_tokens_seen": 38778705, "step": 1821, "time_per_iteration": 2.6714985370635986 }, { "auxiliary_loss_clip": 0.01416506, "auxiliary_loss_mlp": 0.01041943, "balance_loss_clip": 1.06102824, "balance_loss_mlp": 1.0325433, "epoch": 0.21908254674442373, "flos": 25081915829760.0, "grad_norm": 1.6779311393265888, "language_loss": 0.83610618, "learning_rate": 3.6366890250276185e-06, "loss": 0.86069071, "num_input_tokens_seen": 38799660, "step": 1822, "time_per_iteration": 2.777338743209839 }, { "auxiliary_loss_clip": 0.01221653, "auxiliary_loss_mlp": 0.01033858, "balance_loss_clip": 1.06375146, "balance_loss_mlp": 1.02465546, "epoch": 0.21920278963506282, "flos": 23513768795520.0, "grad_norm": 2.387016392903714, "language_loss": 0.89780402, "learning_rate": 3.6362412040900764e-06, "loss": 0.92035913, "num_input_tokens_seen": 38819450, "step": 1823, "time_per_iteration": 2.6886239051818848 }, { "auxiliary_loss_clip": 0.0127732, "auxiliary_loss_mlp": 0.01036313, "balance_loss_clip": 1.06378341, "balance_loss_mlp": 1.02676427, "epoch": 0.21932303252570193, "flos": 29242238734080.0, "grad_norm": 1.864789241091381, "language_loss": 0.80332911, "learning_rate": 3.635793134936044e-06, "loss": 0.82646543, "num_input_tokens_seen": 38840460, "step": 1824, "time_per_iteration": 2.703505039215088 }, { "auxiliary_loss_clip": 0.01268723, "auxiliary_loss_mlp": 0.01034521, "balance_loss_clip": 1.06213379, "balance_loss_mlp": 1.02583683, "epoch": 0.219443275416341, "flos": 20806857907200.0, "grad_norm": 1.8725702383660616, "language_loss": 0.72995639, "learning_rate": 3.635344817633494e-06, "loss": 0.75298882, "num_input_tokens_seen": 38859775, "step": 1825, "time_per_iteration": 2.6812338829040527 }, { "auxiliary_loss_clip": 0.01269274, "auxiliary_loss_mlp": 0.01029747, "balance_loss_clip": 1.06391335, "balance_loss_mlp": 1.02088928, "epoch": 0.2195635183069801, "flos": 14501555458560.0, "grad_norm": 2.0191732481649467, "language_loss": 0.75573444, "learning_rate": 3.634896252250436e-06, "loss": 0.77872461, "num_input_tokens_seen": 38876540, "step": 1826, "time_per_iteration": 2.7083253860473633 }, { "auxiliary_loss_clip": 0.01225352, "auxiliary_loss_mlp": 0.01037689, "balance_loss_clip": 1.06650662, "balance_loss_mlp": 1.0278002, "epoch": 0.2196837611976192, "flos": 24243473589120.0, "grad_norm": 1.7698607574185932, "language_loss": 0.82270294, "learning_rate": 3.6344474388549157e-06, "loss": 0.8453334, "num_input_tokens_seen": 38896195, "step": 1827, "time_per_iteration": 2.6956613063812256 }, { "auxiliary_loss_clip": 0.01274283, "auxiliary_loss_mlp": 0.01030954, "balance_loss_clip": 1.06459033, "balance_loss_mlp": 1.02148294, "epoch": 0.2198040040882583, "flos": 18074523168000.0, "grad_norm": 2.3342130709076074, "language_loss": 0.80255377, "learning_rate": 3.6339983775150183e-06, "loss": 0.82560611, "num_input_tokens_seen": 38912755, "step": 1828, "time_per_iteration": 2.625328302383423 }, { "auxiliary_loss_clip": 0.01269461, "auxiliary_loss_mlp": 0.01029305, "balance_loss_clip": 1.06438029, "balance_loss_mlp": 1.02045393, "epoch": 0.21992424697889737, "flos": 17784185535360.0, "grad_norm": 2.7970604649599644, "language_loss": 0.84078288, "learning_rate": 3.6335490682988664e-06, "loss": 0.8637706, "num_input_tokens_seen": 38928365, "step": 1829, "time_per_iteration": 2.6860878467559814 }, { "auxiliary_loss_clip": 0.01469119, "auxiliary_loss_mlp": 0.01037673, "balance_loss_clip": 1.05670798, "balance_loss_mlp": 1.02802289, "epoch": 0.22004448986953645, "flos": 17638495971840.0, "grad_norm": 2.9914652258743852, "language_loss": 0.82970464, "learning_rate": 3.63309951127462e-06, "loss": 0.85477257, "num_input_tokens_seen": 38945275, "step": 1830, "time_per_iteration": 2.7630081176757812 }, { "auxiliary_loss_clip": 0.01371227, "auxiliary_loss_mlp": 0.01033191, "balance_loss_clip": 1.0628891, "balance_loss_mlp": 1.02401781, "epoch": 0.22016473276017556, "flos": 22275533203200.0, "grad_norm": 2.1403053935800065, "language_loss": 0.75179166, "learning_rate": 3.6326497065104757e-06, "loss": 0.77583587, "num_input_tokens_seen": 38965740, "step": 1831, "time_per_iteration": 2.7554643154144287 }, { "auxiliary_loss_clip": 0.01279803, "auxiliary_loss_mlp": 0.01035738, "balance_loss_clip": 1.06540871, "balance_loss_mlp": 1.02693462, "epoch": 0.22028497565081465, "flos": 25556259859200.0, "grad_norm": 2.3441794960813147, "language_loss": 0.78161693, "learning_rate": 3.6321996540746697e-06, "loss": 0.80477238, "num_input_tokens_seen": 38984815, "step": 1832, "time_per_iteration": 3.67094349861145 }, { "auxiliary_loss_clip": 0.01370664, "auxiliary_loss_mlp": 0.01027243, "balance_loss_clip": 1.06219411, "balance_loss_mlp": 1.0185349, "epoch": 0.22040521854145373, "flos": 36247332925440.0, "grad_norm": 1.8522488254730218, "language_loss": 0.80658162, "learning_rate": 3.6317493540354733e-06, "loss": 0.83056068, "num_input_tokens_seen": 39008230, "step": 1833, "time_per_iteration": 2.827941656112671 }, { "auxiliary_loss_clip": 0.01269258, "auxiliary_loss_mlp": 0.0103112, "balance_loss_clip": 1.06122804, "balance_loss_mlp": 1.02160096, "epoch": 0.22052546143209284, "flos": 11838420270720.0, "grad_norm": 3.363740250902558, "language_loss": 0.7705164, "learning_rate": 3.6312988064611976e-06, "loss": 0.79352015, "num_input_tokens_seen": 39026540, "step": 1834, "time_per_iteration": 2.70715069770813 }, { "auxiliary_loss_clip": 0.01369863, "auxiliary_loss_mlp": 0.01029318, "balance_loss_clip": 1.05544591, "balance_loss_mlp": 1.02060425, "epoch": 0.22064570432273192, "flos": 24209250906240.0, "grad_norm": 1.7508629114528453, "language_loss": 0.81353599, "learning_rate": 3.6308480114201896e-06, "loss": 0.83752781, "num_input_tokens_seen": 39048460, "step": 1835, "time_per_iteration": 3.5960114002227783 }, { "auxiliary_loss_clip": 0.01226517, "auxiliary_loss_mlp": 0.01045347, "balance_loss_clip": 1.06802845, "balance_loss_mlp": 1.0354526, "epoch": 0.220765947213371, "flos": 17931347556480.0, "grad_norm": 1.9589345506656368, "language_loss": 0.76258695, "learning_rate": 3.630396968980835e-06, "loss": 0.78530562, "num_input_tokens_seen": 39066335, "step": 1836, "time_per_iteration": 2.6338889598846436 }, { "auxiliary_loss_clip": 0.01322303, "auxiliary_loss_mlp": 0.0103645, "balance_loss_clip": 1.06048012, "balance_loss_mlp": 1.02738392, "epoch": 0.2208861901040101, "flos": 26757040544640.0, "grad_norm": 3.20191906944085, "language_loss": 0.83128929, "learning_rate": 3.6299456792115575e-06, "loss": 0.85487688, "num_input_tokens_seen": 39087590, "step": 1837, "time_per_iteration": 2.823408365249634 }, { "auxiliary_loss_clip": 0.01426012, "auxiliary_loss_mlp": 0.0103245, "balance_loss_clip": 1.04846036, "balance_loss_mlp": 1.02321124, "epoch": 0.2210064329946492, "flos": 17817977255040.0, "grad_norm": 2.7530071133951695, "language_loss": 0.80904317, "learning_rate": 3.629494142180815e-06, "loss": 0.83362782, "num_input_tokens_seen": 39106335, "step": 1838, "time_per_iteration": 3.7246651649475098 }, { "auxiliary_loss_clip": 0.01223889, "auxiliary_loss_mlp": 0.01032127, "balance_loss_clip": 1.06633472, "balance_loss_mlp": 1.02261984, "epoch": 0.22112667588528828, "flos": 17967401832960.0, "grad_norm": 2.1913291336338014, "language_loss": 0.85129225, "learning_rate": 3.6290423579571075e-06, "loss": 0.87385237, "num_input_tokens_seen": 39122875, "step": 1839, "time_per_iteration": 2.6867613792419434 }, { "auxiliary_loss_clip": 0.0126897, "auxiliary_loss_mlp": 0.0103319, "balance_loss_clip": 1.06325495, "balance_loss_mlp": 1.02430928, "epoch": 0.22124691877592736, "flos": 18369206346240.0, "grad_norm": 1.696262651886668, "language_loss": 0.80342662, "learning_rate": 3.6285903266089694e-06, "loss": 0.82644826, "num_input_tokens_seen": 39142150, "step": 1840, "time_per_iteration": 2.6539618968963623 }, { "auxiliary_loss_clip": 0.01325917, "auxiliary_loss_mlp": 0.01037076, "balance_loss_clip": 1.06407619, "balance_loss_mlp": 1.027843, "epoch": 0.22136716166656648, "flos": 20813286441600.0, "grad_norm": 1.962005155059415, "language_loss": 0.77145129, "learning_rate": 3.628138048204974e-06, "loss": 0.79508126, "num_input_tokens_seen": 39162835, "step": 1841, "time_per_iteration": 2.684108257293701 }, { "auxiliary_loss_clip": 0.01418558, "auxiliary_loss_mlp": 0.01033344, "balance_loss_clip": 1.05913782, "balance_loss_mlp": 1.02258527, "epoch": 0.22148740455720556, "flos": 17675699483520.0, "grad_norm": 3.6966878753413464, "language_loss": 0.76377082, "learning_rate": 3.6276855228137304e-06, "loss": 0.78828979, "num_input_tokens_seen": 39181040, "step": 1842, "time_per_iteration": 3.703460693359375 }, { "auxiliary_loss_clip": 0.01222313, "auxiliary_loss_mlp": 0.02581577, "balance_loss_clip": 1.06520689, "balance_loss_mlp": 0.99998903, "epoch": 0.22160764744784464, "flos": 21726710323200.0, "grad_norm": 2.286886766137286, "language_loss": 0.81913334, "learning_rate": 3.6272327505038874e-06, "loss": 0.85717225, "num_input_tokens_seen": 39197505, "step": 1843, "time_per_iteration": 2.640068531036377 }, { "auxiliary_loss_clip": 0.01347407, "auxiliary_loss_mlp": 0.0102958, "balance_loss_clip": 1.0582788, "balance_loss_mlp": 1.02045465, "epoch": 0.22172789033848372, "flos": 23764712186880.0, "grad_norm": 3.0225514934789377, "language_loss": 0.78342295, "learning_rate": 3.626779731344131e-06, "loss": 0.80719286, "num_input_tokens_seen": 39217295, "step": 1844, "time_per_iteration": 2.803255558013916 }, { "auxiliary_loss_clip": 0.01218379, "auxiliary_loss_mlp": 0.01033887, "balance_loss_clip": 1.06287909, "balance_loss_mlp": 1.02512467, "epoch": 0.22184813322912283, "flos": 16982300361600.0, "grad_norm": 2.16371920075099, "language_loss": 0.85393989, "learning_rate": 3.6263264654031814e-06, "loss": 0.87646258, "num_input_tokens_seen": 39234195, "step": 1845, "time_per_iteration": 2.5834646224975586 }, { "auxiliary_loss_clip": 0.01268515, "auxiliary_loss_mlp": 0.01000562, "balance_loss_clip": 1.02855587, "balance_loss_mlp": 0.99905455, "epoch": 0.22196837611976192, "flos": 61823740314240.0, "grad_norm": 0.6932171599192984, "language_loss": 0.59074008, "learning_rate": 3.6258729527498008e-06, "loss": 0.61343086, "num_input_tokens_seen": 39295040, "step": 1846, "time_per_iteration": 3.2964723110198975 }, { "auxiliary_loss_clip": 0.01327028, "auxiliary_loss_mlp": 0.01034212, "balance_loss_clip": 1.06356025, "balance_loss_mlp": 1.02463913, "epoch": 0.222088619010401, "flos": 25558019625600.0, "grad_norm": 2.5143963170106027, "language_loss": 0.6476115, "learning_rate": 3.6254191934527854e-06, "loss": 0.67122394, "num_input_tokens_seen": 39314395, "step": 1847, "time_per_iteration": 2.7232534885406494 }, { "auxiliary_loss_clip": 0.01379144, "auxiliary_loss_mlp": 0.01034664, "balance_loss_clip": 1.06766224, "balance_loss_mlp": 1.02559829, "epoch": 0.2222088619010401, "flos": 19318612677120.0, "grad_norm": 2.5656449517258086, "language_loss": 0.64896661, "learning_rate": 3.6249651875809715e-06, "loss": 0.67310464, "num_input_tokens_seen": 39334275, "step": 1848, "time_per_iteration": 2.7850353717803955 }, { "auxiliary_loss_clip": 0.01316132, "auxiliary_loss_mlp": 0.01033504, "balance_loss_clip": 1.06382978, "balance_loss_mlp": 1.02548718, "epoch": 0.2223291047916792, "flos": 19099342103040.0, "grad_norm": 2.3796383790200477, "language_loss": 0.8915965, "learning_rate": 3.62451093520323e-06, "loss": 0.91509283, "num_input_tokens_seen": 39352180, "step": 1849, "time_per_iteration": 2.6955721378326416 }, { "auxiliary_loss_clip": 0.01420753, "auxiliary_loss_mlp": 0.01037859, "balance_loss_clip": 1.05746925, "balance_loss_mlp": 1.02904391, "epoch": 0.22244934768231828, "flos": 20850418126080.0, "grad_norm": 3.026755904331516, "language_loss": 0.90478647, "learning_rate": 3.6240564363884714e-06, "loss": 0.92937261, "num_input_tokens_seen": 39372125, "step": 1850, "time_per_iteration": 2.731663942337036 }, { "auxiliary_loss_clip": 0.01279598, "auxiliary_loss_mlp": 0.01035225, "balance_loss_clip": 1.06434083, "balance_loss_mlp": 1.0252353, "epoch": 0.2225695905729574, "flos": 15632921111040.0, "grad_norm": 2.556576510665642, "language_loss": 0.70534098, "learning_rate": 3.623601691205643e-06, "loss": 0.72848928, "num_input_tokens_seen": 39391200, "step": 1851, "time_per_iteration": 2.677340030670166 }, { "auxiliary_loss_clip": 0.01270425, "auxiliary_loss_mlp": 0.01033046, "balance_loss_clip": 1.06165981, "balance_loss_mlp": 1.02423072, "epoch": 0.22268983346359647, "flos": 25373582265600.0, "grad_norm": 2.738875591268448, "language_loss": 0.81741476, "learning_rate": 3.623146699723729e-06, "loss": 0.84044945, "num_input_tokens_seen": 39410660, "step": 1852, "time_per_iteration": 2.656926155090332 }, { "auxiliary_loss_clip": 0.0131753, "auxiliary_loss_mlp": 0.01039355, "balance_loss_clip": 1.06518447, "balance_loss_mlp": 1.03019929, "epoch": 0.22281007635423555, "flos": 13261452359040.0, "grad_norm": 1.8387116214174863, "language_loss": 0.77685106, "learning_rate": 3.6226914620117507e-06, "loss": 0.80041993, "num_input_tokens_seen": 39429280, "step": 1853, "time_per_iteration": 2.6997530460357666 }, { "auxiliary_loss_clip": 0.01288104, "auxiliary_loss_mlp": 0.01029154, "balance_loss_clip": 1.05038667, "balance_loss_mlp": 1.02056468, "epoch": 0.22293031924487464, "flos": 15340536403200.0, "grad_norm": 2.4870314769279425, "language_loss": 0.8087306, "learning_rate": 3.622235978138768e-06, "loss": 0.83190322, "num_input_tokens_seen": 39446905, "step": 1854, "time_per_iteration": 2.731600284576416 }, { "auxiliary_loss_clip": 0.01271268, "auxiliary_loss_mlp": 0.01031622, "balance_loss_clip": 1.06600428, "balance_loss_mlp": 1.02274144, "epoch": 0.22305056213551375, "flos": 22564649773440.0, "grad_norm": 1.8860092181182444, "language_loss": 0.8156479, "learning_rate": 3.621780248173877e-06, "loss": 0.83867681, "num_input_tokens_seen": 39465105, "step": 1855, "time_per_iteration": 2.7144312858581543 }, { "auxiliary_loss_clip": 0.01171349, "auxiliary_loss_mlp": 0.01000355, "balance_loss_clip": 1.03004885, "balance_loss_mlp": 0.99881679, "epoch": 0.22317080502615283, "flos": 64880419887360.0, "grad_norm": 0.8316270463594376, "language_loss": 0.61016047, "learning_rate": 3.6213242721862125e-06, "loss": 0.63187754, "num_input_tokens_seen": 39523560, "step": 1856, "time_per_iteration": 3.2136545181274414 }, { "auxiliary_loss_clip": 0.01317244, "auxiliary_loss_mlp": 0.01036428, "balance_loss_clip": 1.06292582, "balance_loss_mlp": 1.02690911, "epoch": 0.2232910479167919, "flos": 25775997310080.0, "grad_norm": 1.699694044303913, "language_loss": 0.75320333, "learning_rate": 3.620868050244945e-06, "loss": 0.77674007, "num_input_tokens_seen": 39544040, "step": 1857, "time_per_iteration": 2.7618048191070557 }, { "auxiliary_loss_clip": 0.01321739, "auxiliary_loss_mlp": 0.01032043, "balance_loss_clip": 1.06208038, "balance_loss_mlp": 1.02226138, "epoch": 0.22341129080743102, "flos": 23251799928960.0, "grad_norm": 1.9547039535291633, "language_loss": 0.77669793, "learning_rate": 3.6204115824192817e-06, "loss": 0.80023575, "num_input_tokens_seen": 39561515, "step": 1858, "time_per_iteration": 3.595716953277588 }, { "auxiliary_loss_clip": 0.01313205, "auxiliary_loss_mlp": 0.01031488, "balance_loss_clip": 1.05587721, "balance_loss_mlp": 1.02262509, "epoch": 0.2235315336980701, "flos": 21214552250880.0, "grad_norm": 2.5876381746343995, "language_loss": 0.76561755, "learning_rate": 3.619954868778471e-06, "loss": 0.78906453, "num_input_tokens_seen": 39578210, "step": 1859, "time_per_iteration": 2.6792781352996826 }, { "auxiliary_loss_clip": 0.01319507, "auxiliary_loss_mlp": 0.01043254, "balance_loss_clip": 1.05817735, "balance_loss_mlp": 1.03422976, "epoch": 0.2236517765887092, "flos": 19901945548800.0, "grad_norm": 1.908814384156504, "language_loss": 0.83008993, "learning_rate": 3.6194979093917944e-06, "loss": 0.85371745, "num_input_tokens_seen": 39597625, "step": 1860, "time_per_iteration": 3.656123399734497 }, { "auxiliary_loss_clip": 0.01316235, "auxiliary_loss_mlp": 0.01031795, "balance_loss_clip": 1.05902982, "balance_loss_mlp": 1.02294922, "epoch": 0.22377201947934827, "flos": 23214847812480.0, "grad_norm": 2.888739745645323, "language_loss": 0.86972153, "learning_rate": 3.6190407043285724e-06, "loss": 0.89320183, "num_input_tokens_seen": 39615360, "step": 1861, "time_per_iteration": 2.675870656967163 }, { "auxiliary_loss_clip": 0.01227475, "auxiliary_loss_mlp": 0.01035521, "balance_loss_clip": 1.06779146, "balance_loss_mlp": 1.02618694, "epoch": 0.22389226236998738, "flos": 26794244056320.0, "grad_norm": 1.9823733642289447, "language_loss": 0.75730932, "learning_rate": 3.618583253658163e-06, "loss": 0.77993935, "num_input_tokens_seen": 39635460, "step": 1862, "time_per_iteration": 2.6585733890533447 }, { "auxiliary_loss_clip": 0.01423567, "auxiliary_loss_mlp": 0.02580019, "balance_loss_clip": 1.06051755, "balance_loss_mlp": 0.99990785, "epoch": 0.22401250526062647, "flos": 24170359455360.0, "grad_norm": 8.498673202335121, "language_loss": 0.86513865, "learning_rate": 3.618125557449961e-06, "loss": 0.90517449, "num_input_tokens_seen": 39653515, "step": 1863, "time_per_iteration": 3.713798761367798 }, { "auxiliary_loss_clip": 0.012702, "auxiliary_loss_mlp": 0.01028086, "balance_loss_clip": 1.0642978, "balance_loss_mlp": 1.01983643, "epoch": 0.22413274815126555, "flos": 16759761649920.0, "grad_norm": 2.091846162269351, "language_loss": 0.83193767, "learning_rate": 3.6176676157733983e-06, "loss": 0.85492051, "num_input_tokens_seen": 39668525, "step": 1864, "time_per_iteration": 2.6170475482940674 }, { "auxiliary_loss_clip": 0.01367701, "auxiliary_loss_mlp": 0.01035713, "balance_loss_clip": 1.0594089, "balance_loss_mlp": 1.02679074, "epoch": 0.22425299104190466, "flos": 21360205900800.0, "grad_norm": 2.763568460897376, "language_loss": 0.76400989, "learning_rate": 3.6172094286979443e-06, "loss": 0.7880441, "num_input_tokens_seen": 39685895, "step": 1865, "time_per_iteration": 2.7104973793029785 }, { "auxiliary_loss_clip": 0.01323299, "auxiliary_loss_mlp": 0.0102961, "balance_loss_clip": 1.06223083, "balance_loss_mlp": 1.02079415, "epoch": 0.22437323393254374, "flos": 32165547108480.0, "grad_norm": 1.80730714345949, "language_loss": 0.81481832, "learning_rate": 3.6167509962931064e-06, "loss": 0.83834743, "num_input_tokens_seen": 39711595, "step": 1866, "time_per_iteration": 2.7885870933532715 }, { "auxiliary_loss_clip": 0.01426716, "auxiliary_loss_mlp": 0.01032979, "balance_loss_clip": 1.06423235, "balance_loss_mlp": 1.02391338, "epoch": 0.22449347682318282, "flos": 18002809664640.0, "grad_norm": 4.971600565844564, "language_loss": 0.77124548, "learning_rate": 3.6162923186284276e-06, "loss": 0.79584241, "num_input_tokens_seen": 39727555, "step": 1867, "time_per_iteration": 3.689216375350952 }, { "auxiliary_loss_clip": 0.01319084, "auxiliary_loss_mlp": 0.01032336, "balance_loss_clip": 1.05751073, "balance_loss_mlp": 1.02381861, "epoch": 0.2246137197138219, "flos": 18697286194560.0, "grad_norm": 2.608587945673555, "language_loss": 0.85960579, "learning_rate": 3.6158333957734888e-06, "loss": 0.88312, "num_input_tokens_seen": 39746145, "step": 1868, "time_per_iteration": 2.6634373664855957 }, { "auxiliary_loss_clip": 0.01374505, "auxiliary_loss_mlp": 0.010315, "balance_loss_clip": 1.05964208, "balance_loss_mlp": 1.02297103, "epoch": 0.22473396260446102, "flos": 15590653781760.0, "grad_norm": 2.169609644411522, "language_loss": 0.83120674, "learning_rate": 3.6153742277979088e-06, "loss": 0.85526681, "num_input_tokens_seen": 39763575, "step": 1869, "time_per_iteration": 2.689760446548462 }, { "auxiliary_loss_clip": 0.01316822, "auxiliary_loss_mlp": 0.01037052, "balance_loss_clip": 1.05990291, "balance_loss_mlp": 1.02871335, "epoch": 0.2248542054951001, "flos": 14465501182080.0, "grad_norm": 2.4737512980458116, "language_loss": 0.78222299, "learning_rate": 3.6149148147713434e-06, "loss": 0.80576175, "num_input_tokens_seen": 39781810, "step": 1870, "time_per_iteration": 2.6986377239227295 }, { "auxiliary_loss_clip": 0.01280478, "auxiliary_loss_mlp": 0.01040224, "balance_loss_clip": 1.07044697, "balance_loss_mlp": 1.0320642, "epoch": 0.22497444838573918, "flos": 19243882431360.0, "grad_norm": 2.960107293850705, "language_loss": 0.86944038, "learning_rate": 3.614455156763484e-06, "loss": 0.89264739, "num_input_tokens_seen": 39800115, "step": 1871, "time_per_iteration": 2.6707797050476074 }, { "auxiliary_loss_clip": 0.01418687, "auxiliary_loss_mlp": 0.01029862, "balance_loss_clip": 1.05591345, "balance_loss_mlp": 1.0213027, "epoch": 0.2250946912763783, "flos": 16910299549440.0, "grad_norm": 2.224286838864007, "language_loss": 0.7122336, "learning_rate": 3.613995253844061e-06, "loss": 0.73671907, "num_input_tokens_seen": 39817795, "step": 1872, "time_per_iteration": 2.8312413692474365 }, { "auxiliary_loss_clip": 0.01275853, "auxiliary_loss_mlp": 0.01039833, "balance_loss_clip": 1.0692122, "balance_loss_mlp": 1.03011799, "epoch": 0.22521493416701738, "flos": 24681368292480.0, "grad_norm": 9.120987066373251, "language_loss": 0.80703247, "learning_rate": 3.6135351060828414e-06, "loss": 0.83018923, "num_input_tokens_seen": 39838270, "step": 1873, "time_per_iteration": 2.7239787578582764 }, { "auxiliary_loss_clip": 0.01228214, "auxiliary_loss_mlp": 0.01034711, "balance_loss_clip": 1.06929827, "balance_loss_mlp": 1.02521658, "epoch": 0.22533517705765646, "flos": 17821963664640.0, "grad_norm": 1.996554383918932, "language_loss": 0.69767499, "learning_rate": 3.6130747135496285e-06, "loss": 0.72030425, "num_input_tokens_seen": 39857270, "step": 1874, "time_per_iteration": 2.6796834468841553 }, { "auxiliary_loss_clip": 0.01221459, "auxiliary_loss_mlp": 0.01035503, "balance_loss_clip": 1.0647819, "balance_loss_mlp": 1.02675319, "epoch": 0.22545541994829554, "flos": 33691390899840.0, "grad_norm": 2.079552866384218, "language_loss": 0.66292602, "learning_rate": 3.6126140763142646e-06, "loss": 0.68549562, "num_input_tokens_seen": 39882300, "step": 1875, "time_per_iteration": 2.739823579788208 }, { "auxiliary_loss_clip": 0.01223835, "auxiliary_loss_mlp": 0.01038969, "balance_loss_clip": 1.06625664, "balance_loss_mlp": 1.03024292, "epoch": 0.22557566283893465, "flos": 19171594310400.0, "grad_norm": 7.872005608953601, "language_loss": 0.85812747, "learning_rate": 3.6121531944466275e-06, "loss": 0.88075542, "num_input_tokens_seen": 39899625, "step": 1876, "time_per_iteration": 2.6168830394744873 }, { "auxiliary_loss_clip": 0.01268277, "auxiliary_loss_mlp": 0.0102474, "balance_loss_clip": 1.06307912, "balance_loss_mlp": 1.01590037, "epoch": 0.22569590572957374, "flos": 20773281669120.0, "grad_norm": 2.5878705896651417, "language_loss": 0.78105211, "learning_rate": 3.611692068016633e-06, "loss": 0.80398232, "num_input_tokens_seen": 39915955, "step": 1877, "time_per_iteration": 2.7410264015197754 }, { "auxiliary_loss_clip": 0.01372222, "auxiliary_loss_mlp": 0.01041258, "balance_loss_clip": 1.05798399, "balance_loss_mlp": 1.03114319, "epoch": 0.22581614862021282, "flos": 18442715529600.0, "grad_norm": 2.3765302702046314, "language_loss": 0.75533044, "learning_rate": 3.611230697094233e-06, "loss": 0.77946526, "num_input_tokens_seen": 39932655, "step": 1878, "time_per_iteration": 2.707064628601074 }, { "auxiliary_loss_clip": 0.01325473, "auxiliary_loss_mlp": 0.01033619, "balance_loss_clip": 1.06293571, "balance_loss_mlp": 1.0247972, "epoch": 0.22593639151085193, "flos": 20048389297920.0, "grad_norm": 2.192801456587915, "language_loss": 0.87583548, "learning_rate": 3.6107690817494173e-06, "loss": 0.89942634, "num_input_tokens_seen": 39952875, "step": 1879, "time_per_iteration": 2.7143213748931885 }, { "auxiliary_loss_clip": 0.01409743, "auxiliary_loss_mlp": 0.01034254, "balance_loss_clip": 1.05304372, "balance_loss_mlp": 1.02531898, "epoch": 0.226056634401491, "flos": 13115116350720.0, "grad_norm": 3.3640416079368523, "language_loss": 0.71066928, "learning_rate": 3.6103072220522117e-06, "loss": 0.73510927, "num_input_tokens_seen": 39968405, "step": 1880, "time_per_iteration": 2.757096529006958 }, { "auxiliary_loss_clip": 0.01372335, "auxiliary_loss_mlp": 0.01032565, "balance_loss_clip": 1.05819595, "balance_loss_mlp": 1.02436352, "epoch": 0.2261768772921301, "flos": 18988378012800.0, "grad_norm": 2.496030038775358, "language_loss": 0.9186008, "learning_rate": 3.609845118072682e-06, "loss": 0.94264978, "num_input_tokens_seen": 39987075, "step": 1881, "time_per_iteration": 2.7222440242767334 }, { "auxiliary_loss_clip": 0.01275011, "auxiliary_loss_mlp": 0.02577282, "balance_loss_clip": 1.06329823, "balance_loss_mlp": 1.00004017, "epoch": 0.2262971201827692, "flos": 19974054101760.0, "grad_norm": 3.765272620739279, "language_loss": 0.79860616, "learning_rate": 3.6093827698809276e-06, "loss": 0.83712912, "num_input_tokens_seen": 40006175, "step": 1882, "time_per_iteration": 2.667694330215454 }, { "auxiliary_loss_clip": 0.0126834, "auxiliary_loss_mlp": 0.01037119, "balance_loss_clip": 1.05982089, "balance_loss_mlp": 1.02872109, "epoch": 0.2264173630734083, "flos": 16654543735680.0, "grad_norm": 2.247304075911097, "language_loss": 0.84504592, "learning_rate": 3.6089201775470864e-06, "loss": 0.86810052, "num_input_tokens_seen": 40021630, "step": 1883, "time_per_iteration": 3.621570587158203 }, { "auxiliary_loss_clip": 0.01357114, "auxiliary_loss_mlp": 0.01029989, "balance_loss_clip": 1.05574155, "balance_loss_mlp": 1.02106047, "epoch": 0.22653760596404737, "flos": 24389809597440.0, "grad_norm": 1.4723351130395, "language_loss": 0.77404165, "learning_rate": 3.6084573411413334e-06, "loss": 0.7979126, "num_input_tokens_seen": 40041025, "step": 1884, "time_per_iteration": 2.779315710067749 }, { "auxiliary_loss_clip": 0.01366764, "auxiliary_loss_mlp": 0.01031629, "balance_loss_clip": 1.05836153, "balance_loss_mlp": 1.02199674, "epoch": 0.22665784885468646, "flos": 18332541538560.0, "grad_norm": 1.9399862537407804, "language_loss": 0.8090809, "learning_rate": 3.607994260733881e-06, "loss": 0.83306485, "num_input_tokens_seen": 40060265, "step": 1885, "time_per_iteration": 2.6707701683044434 }, { "auxiliary_loss_clip": 0.01258592, "auxiliary_loss_mlp": 0.01028746, "balance_loss_clip": 1.05752087, "balance_loss_mlp": 1.02106321, "epoch": 0.22677809174532557, "flos": 24058102475520.0, "grad_norm": 1.6824783798474794, "language_loss": 0.74632347, "learning_rate": 3.6075309363949776e-06, "loss": 0.76919687, "num_input_tokens_seen": 40079435, "step": 1886, "time_per_iteration": 3.563429594039917 }, { "auxiliary_loss_clip": 0.01221824, "auxiliary_loss_mlp": 0.01034796, "balance_loss_clip": 1.06466126, "balance_loss_mlp": 1.02586675, "epoch": 0.22689833463596465, "flos": 20374242503040.0, "grad_norm": 1.9379266225566665, "language_loss": 0.81295007, "learning_rate": 3.6070673681949094e-06, "loss": 0.83551633, "num_input_tokens_seen": 40097800, "step": 1887, "time_per_iteration": 2.6333870887756348 }, { "auxiliary_loss_clip": 0.01320123, "auxiliary_loss_mlp": 0.02572672, "balance_loss_clip": 1.06190348, "balance_loss_mlp": 1.00002217, "epoch": 0.22701857752660373, "flos": 30120398438400.0, "grad_norm": 1.7518672529011046, "language_loss": 0.81588238, "learning_rate": 3.606603556203999e-06, "loss": 0.85481036, "num_input_tokens_seen": 40122745, "step": 1888, "time_per_iteration": 2.794067144393921 }, { "auxiliary_loss_clip": 0.01268978, "auxiliary_loss_mlp": 0.01036988, "balance_loss_clip": 1.06021714, "balance_loss_mlp": 1.02866077, "epoch": 0.22713882041724284, "flos": 22492182084480.0, "grad_norm": 2.1923081740506305, "language_loss": 0.83778906, "learning_rate": 3.6061395004926066e-06, "loss": 0.86084872, "num_input_tokens_seen": 40141680, "step": 1889, "time_per_iteration": 3.576988935470581 }, { "auxiliary_loss_clip": 0.01219981, "auxiliary_loss_mlp": 0.01032394, "balance_loss_clip": 1.06331861, "balance_loss_mlp": 1.02312541, "epoch": 0.22725906330788193, "flos": 20521548178560.0, "grad_norm": 2.8146734521628973, "language_loss": 0.84979779, "learning_rate": 3.605675201131129e-06, "loss": 0.87232149, "num_input_tokens_seen": 40160140, "step": 1890, "time_per_iteration": 2.58349347114563 }, { "auxiliary_loss_clip": 0.01196077, "auxiliary_loss_mlp": 0.01027653, "balance_loss_clip": 1.06601608, "balance_loss_mlp": 1.01912928, "epoch": 0.227379306198521, "flos": 18989922297600.0, "grad_norm": 2.1206896244923934, "language_loss": 0.7986542, "learning_rate": 3.60521065819e-06, "loss": 0.8208915, "num_input_tokens_seen": 40177450, "step": 1891, "time_per_iteration": 2.6721315383911133 }, { "auxiliary_loss_clip": 0.01319885, "auxiliary_loss_mlp": 0.01038562, "balance_loss_clip": 1.05863976, "balance_loss_mlp": 1.02986002, "epoch": 0.2274995490891601, "flos": 21798351999360.0, "grad_norm": 1.866319222661721, "language_loss": 0.87831426, "learning_rate": 3.60474587173969e-06, "loss": 0.90189874, "num_input_tokens_seen": 40195935, "step": 1892, "time_per_iteration": 2.716698169708252 }, { "auxiliary_loss_clip": 0.01267578, "auxiliary_loss_mlp": 0.01035471, "balance_loss_clip": 1.06370807, "balance_loss_mlp": 1.0267868, "epoch": 0.2276197919797992, "flos": 19058654972160.0, "grad_norm": 2.7984184007321597, "language_loss": 0.84179306, "learning_rate": 3.6042808418507084e-06, "loss": 0.86482358, "num_input_tokens_seen": 40213620, "step": 1893, "time_per_iteration": 3.5499744415283203 }, { "auxiliary_loss_clip": 0.01271383, "auxiliary_loss_mlp": 0.01031346, "balance_loss_clip": 1.06540823, "balance_loss_mlp": 1.02219701, "epoch": 0.22774003487043828, "flos": 18806777827200.0, "grad_norm": 2.3763668305747547, "language_loss": 0.77170593, "learning_rate": 3.6038155685935976e-06, "loss": 0.79473323, "num_input_tokens_seen": 40230190, "step": 1894, "time_per_iteration": 2.6338815689086914 }, { "auxiliary_loss_clip": 0.01268701, "auxiliary_loss_mlp": 0.01034074, "balance_loss_clip": 1.06223714, "balance_loss_mlp": 1.02463269, "epoch": 0.22786027776107737, "flos": 23002544476800.0, "grad_norm": 2.6789459480266427, "language_loss": 0.70868111, "learning_rate": 3.6033500520389404e-06, "loss": 0.73170882, "num_input_tokens_seen": 40246860, "step": 1895, "time_per_iteration": 2.806927442550659 }, { "auxiliary_loss_clip": 0.01326426, "auxiliary_loss_mlp": 0.01008635, "balance_loss_clip": 1.02681661, "balance_loss_mlp": 1.00710928, "epoch": 0.22798052065171648, "flos": 66706872600960.0, "grad_norm": 0.7982207930301609, "language_loss": 0.64735079, "learning_rate": 3.6028842922573553e-06, "loss": 0.67070138, "num_input_tokens_seen": 40311005, "step": 1896, "time_per_iteration": 3.383547782897949 }, { "auxiliary_loss_clip": 0.01282561, "auxiliary_loss_mlp": 0.02525571, "balance_loss_clip": 1.02764165, "balance_loss_mlp": 0.9996807, "epoch": 0.22810076354235556, "flos": 62080896758400.0, "grad_norm": 0.8534720171281481, "language_loss": 0.62867379, "learning_rate": 3.602418289319497e-06, "loss": 0.6667552, "num_input_tokens_seen": 40369560, "step": 1897, "time_per_iteration": 3.2421741485595703 }, { "auxiliary_loss_clip": 0.01417034, "auxiliary_loss_mlp": 0.010335, "balance_loss_clip": 1.05528259, "balance_loss_mlp": 1.02472591, "epoch": 0.22822100643299464, "flos": 23876358635520.0, "grad_norm": 2.272271658331969, "language_loss": 0.7371304, "learning_rate": 3.601952043296059e-06, "loss": 0.76163572, "num_input_tokens_seen": 40389555, "step": 1898, "time_per_iteration": 2.806100606918335 }, { "auxiliary_loss_clip": 0.01243993, "auxiliary_loss_mlp": 0.01033927, "balance_loss_clip": 1.05951512, "balance_loss_mlp": 1.02483153, "epoch": 0.22834124932363373, "flos": 20991331180800.0, "grad_norm": 2.018886016256577, "language_loss": 0.80679357, "learning_rate": 3.6014855542577696e-06, "loss": 0.8295728, "num_input_tokens_seen": 40406765, "step": 1899, "time_per_iteration": 2.653918743133545 }, { "auxiliary_loss_clip": 0.01320573, "auxiliary_loss_mlp": 0.01039071, "balance_loss_clip": 1.06154346, "balance_loss_mlp": 1.02979076, "epoch": 0.22846149221427284, "flos": 24901572620160.0, "grad_norm": 4.293655842748259, "language_loss": 0.84517992, "learning_rate": 3.6010188222753943e-06, "loss": 0.86877632, "num_input_tokens_seen": 40427535, "step": 1900, "time_per_iteration": 2.763356924057007 }, { "auxiliary_loss_clip": 0.01222633, "auxiliary_loss_mlp": 0.01004954, "balance_loss_clip": 1.03365898, "balance_loss_mlp": 1.00349987, "epoch": 0.22858173510491192, "flos": 56132294319360.0, "grad_norm": 0.8972490597652837, "language_loss": 0.64139819, "learning_rate": 3.6005518474197372e-06, "loss": 0.66367412, "num_input_tokens_seen": 40479580, "step": 1901, "time_per_iteration": 3.1565401554107666 }, { "auxiliary_loss_clip": 0.01269615, "auxiliary_loss_mlp": 0.01038613, "balance_loss_clip": 1.06427217, "balance_loss_mlp": 1.02885604, "epoch": 0.228701977995551, "flos": 24170826332160.0, "grad_norm": 2.7686144028701083, "language_loss": 0.78464198, "learning_rate": 3.6000846297616373e-06, "loss": 0.80772424, "num_input_tokens_seen": 40497880, "step": 1902, "time_per_iteration": 2.704374074935913 }, { "auxiliary_loss_clip": 0.01224562, "auxiliary_loss_mlp": 0.01039351, "balance_loss_clip": 1.06699908, "balance_loss_mlp": 1.02970707, "epoch": 0.22882222088619011, "flos": 21387892308480.0, "grad_norm": 2.367438596881115, "language_loss": 0.72540689, "learning_rate": 3.5996171693719717e-06, "loss": 0.74804604, "num_input_tokens_seen": 40513975, "step": 1903, "time_per_iteration": 2.6481473445892334 }, { "auxiliary_loss_clip": 0.01167727, "auxiliary_loss_mlp": 0.01006199, "balance_loss_clip": 1.02929807, "balance_loss_mlp": 1.00469673, "epoch": 0.2289424637768292, "flos": 64589615377920.0, "grad_norm": 0.8432532237632567, "language_loss": 0.64802945, "learning_rate": 3.5991494663216528e-06, "loss": 0.66976869, "num_input_tokens_seen": 40576960, "step": 1904, "time_per_iteration": 3.2695555686950684 }, { "auxiliary_loss_clip": 0.01222487, "auxiliary_loss_mlp": 0.01040668, "balance_loss_clip": 1.06617641, "balance_loss_mlp": 1.03197217, "epoch": 0.22906270666746828, "flos": 22163419877760.0, "grad_norm": 1.8875704472887151, "language_loss": 0.87689722, "learning_rate": 3.5986815206816314e-06, "loss": 0.89952874, "num_input_tokens_seen": 40595780, "step": 1905, "time_per_iteration": 2.6459643840789795 }, { "auxiliary_loss_clip": 0.01218542, "auxiliary_loss_mlp": 0.01029711, "balance_loss_clip": 1.06281257, "balance_loss_mlp": 1.02094293, "epoch": 0.2291829495581074, "flos": 25772334122880.0, "grad_norm": 1.7442029136648958, "language_loss": 0.74900228, "learning_rate": 3.598213332522895e-06, "loss": 0.77148473, "num_input_tokens_seen": 40615810, "step": 1906, "time_per_iteration": 2.6363868713378906 }, { "auxiliary_loss_clip": 0.01267545, "auxiliary_loss_mlp": 0.01033822, "balance_loss_clip": 1.06110835, "balance_loss_mlp": 1.02516115, "epoch": 0.22930319244874647, "flos": 31172760126720.0, "grad_norm": 1.905596044706425, "language_loss": 0.77201402, "learning_rate": 3.597744901916466e-06, "loss": 0.79502767, "num_input_tokens_seen": 40637095, "step": 1907, "time_per_iteration": 2.7319064140319824 }, { "auxiliary_loss_clip": 0.01221767, "auxiliary_loss_mlp": 0.01038801, "balance_loss_clip": 1.06146657, "balance_loss_mlp": 1.02899063, "epoch": 0.22942343533938556, "flos": 23254098399360.0, "grad_norm": 3.6923526987657236, "language_loss": 0.77040458, "learning_rate": 3.5972762289334058e-06, "loss": 0.79301023, "num_input_tokens_seen": 40656725, "step": 1908, "time_per_iteration": 2.6155734062194824 }, { "auxiliary_loss_clip": 0.01457842, "auxiliary_loss_mlp": 0.0103585, "balance_loss_clip": 1.05332804, "balance_loss_mlp": 1.02666473, "epoch": 0.22954367823002464, "flos": 14610903436800.0, "grad_norm": 2.8626261397450246, "language_loss": 0.85359251, "learning_rate": 3.5968073136448116e-06, "loss": 0.87852943, "num_input_tokens_seen": 40674745, "step": 1909, "time_per_iteration": 2.819629430770874 }, { "auxiliary_loss_clip": 0.0127405, "auxiliary_loss_mlp": 0.01035006, "balance_loss_clip": 1.06183195, "balance_loss_mlp": 1.02608967, "epoch": 0.22966392112066375, "flos": 16763604405120.0, "grad_norm": 1.6503690366170085, "language_loss": 0.91530478, "learning_rate": 3.596338156121818e-06, "loss": 0.93839526, "num_input_tokens_seen": 40693630, "step": 1910, "time_per_iteration": 3.6550562381744385 }, { "auxiliary_loss_clip": 0.01214533, "auxiliary_loss_mlp": 0.01011812, "balance_loss_clip": 1.02730894, "balance_loss_mlp": 1.01028621, "epoch": 0.22978416401130283, "flos": 67474247783040.0, "grad_norm": 0.7400251792914777, "language_loss": 0.59343737, "learning_rate": 3.595868756435595e-06, "loss": 0.61570084, "num_input_tokens_seen": 40761310, "step": 1911, "time_per_iteration": 3.3865480422973633 }, { "auxiliary_loss_clip": 0.01369881, "auxiliary_loss_mlp": 0.01038607, "balance_loss_clip": 1.06033862, "balance_loss_mlp": 1.0293982, "epoch": 0.22990440690194192, "flos": 19865137086720.0, "grad_norm": 2.454089329907637, "language_loss": 0.80313575, "learning_rate": 3.5953991146573504e-06, "loss": 0.82722062, "num_input_tokens_seen": 40779955, "step": 1912, "time_per_iteration": 2.7250075340270996 }, { "auxiliary_loss_clip": 0.01270642, "auxiliary_loss_mlp": 0.01034813, "balance_loss_clip": 1.05858624, "balance_loss_mlp": 1.02463865, "epoch": 0.23002464979258103, "flos": 13289246507520.0, "grad_norm": 2.630055236627394, "language_loss": 0.8331399, "learning_rate": 3.5949292308583294e-06, "loss": 0.8561945, "num_input_tokens_seen": 40793200, "step": 1913, "time_per_iteration": 3.4754066467285156 }, { "auxiliary_loss_clip": 0.01222972, "auxiliary_loss_mlp": 0.01035487, "balance_loss_clip": 1.06720245, "balance_loss_mlp": 1.02627814, "epoch": 0.2301448926832201, "flos": 22163779013760.0, "grad_norm": 2.758846829476453, "language_loss": 0.80932254, "learning_rate": 3.594459105109811e-06, "loss": 0.83190715, "num_input_tokens_seen": 40812380, "step": 1914, "time_per_iteration": 2.609114646911621 }, { "auxiliary_loss_clip": 0.01270105, "auxiliary_loss_mlp": 0.01034203, "balance_loss_clip": 1.06234014, "balance_loss_mlp": 1.02557206, "epoch": 0.2302651355738592, "flos": 20704477167360.0, "grad_norm": 2.4629271227468457, "language_loss": 0.81603265, "learning_rate": 3.593988737483115e-06, "loss": 0.83907574, "num_input_tokens_seen": 40832320, "step": 1915, "time_per_iteration": 3.6285440921783447 }, { "auxiliary_loss_clip": 0.01323002, "auxiliary_loss_mlp": 0.01031478, "balance_loss_clip": 1.06445265, "balance_loss_mlp": 1.02241254, "epoch": 0.23038537846449827, "flos": 18588943797120.0, "grad_norm": 2.0594492691988058, "language_loss": 0.78599274, "learning_rate": 3.5935181280495947e-06, "loss": 0.80953753, "num_input_tokens_seen": 40850900, "step": 1916, "time_per_iteration": 2.6871848106384277 }, { "auxiliary_loss_clip": 0.01208686, "auxiliary_loss_mlp": 0.00997473, "balance_loss_clip": 1.02456427, "balance_loss_mlp": 0.99604207, "epoch": 0.23050562135513739, "flos": 64224260190720.0, "grad_norm": 0.805585579546953, "language_loss": 0.54264432, "learning_rate": 3.5930472768806412e-06, "loss": 0.56470591, "num_input_tokens_seen": 40909570, "step": 1917, "time_per_iteration": 3.2540714740753174 }, { "auxiliary_loss_clip": 0.01220396, "auxiliary_loss_mlp": 0.01032388, "balance_loss_clip": 1.06514513, "balance_loss_mlp": 1.02279162, "epoch": 0.23062586424577647, "flos": 17313396952320.0, "grad_norm": 2.071968279026329, "language_loss": 0.7712605, "learning_rate": 3.5925761840476826e-06, "loss": 0.79378837, "num_input_tokens_seen": 40928180, "step": 1918, "time_per_iteration": 2.632484197616577 }, { "auxiliary_loss_clip": 0.01315792, "auxiliary_loss_mlp": 0.01031829, "balance_loss_clip": 1.06249249, "balance_loss_mlp": 1.02349651, "epoch": 0.23074610713641555, "flos": 27855979194240.0, "grad_norm": 2.0121463543404734, "language_loss": 0.81477618, "learning_rate": 3.592104849622183e-06, "loss": 0.83825231, "num_input_tokens_seen": 40950435, "step": 1919, "time_per_iteration": 3.6134703159332275 }, { "auxiliary_loss_clip": 0.01412288, "auxiliary_loss_mlp": 0.01027936, "balance_loss_clip": 1.05698705, "balance_loss_mlp": 1.01848853, "epoch": 0.23086635002705466, "flos": 28841798937600.0, "grad_norm": 1.4669158912866949, "language_loss": 0.73385346, "learning_rate": 3.591633273675644e-06, "loss": 0.75825572, "num_input_tokens_seen": 40972670, "step": 1920, "time_per_iteration": 2.78517746925354 }, { "auxiliary_loss_clip": 0.01265082, "auxiliary_loss_mlp": 0.01011973, "balance_loss_clip": 1.03677034, "balance_loss_mlp": 1.01042295, "epoch": 0.23098659291769374, "flos": 62923681566720.0, "grad_norm": 0.9079384273471515, "language_loss": 0.58161056, "learning_rate": 3.591161456279602e-06, "loss": 0.60438108, "num_input_tokens_seen": 41018215, "step": 1921, "time_per_iteration": 3.0940945148468018 }, { "auxiliary_loss_clip": 0.01322985, "auxiliary_loss_mlp": 0.01032734, "balance_loss_clip": 1.0601691, "balance_loss_mlp": 1.02340031, "epoch": 0.23110683580833283, "flos": 23476816679040.0, "grad_norm": 1.649944732958213, "language_loss": 0.80628037, "learning_rate": 3.590689397505633e-06, "loss": 0.82983756, "num_input_tokens_seen": 41039125, "step": 1922, "time_per_iteration": 2.7187061309814453 }, { "auxiliary_loss_clip": 0.01217488, "auxiliary_loss_mlp": 0.01026193, "balance_loss_clip": 1.06358457, "balance_loss_mlp": 1.01824772, "epoch": 0.2312270786989719, "flos": 27271066124160.0, "grad_norm": 1.7213519601113307, "language_loss": 0.86919832, "learning_rate": 3.590217097425347e-06, "loss": 0.89163512, "num_input_tokens_seen": 41059025, "step": 1923, "time_per_iteration": 2.7208385467529297 }, { "auxiliary_loss_clip": 0.01220258, "auxiliary_loss_mlp": 0.01034243, "balance_loss_clip": 1.06361926, "balance_loss_mlp": 1.02447414, "epoch": 0.23134732158961102, "flos": 13261344618240.0, "grad_norm": 3.071569785922368, "language_loss": 0.71327645, "learning_rate": 3.589744556110391e-06, "loss": 0.73582149, "num_input_tokens_seen": 41077015, "step": 1924, "time_per_iteration": 2.5946052074432373 }, { "auxiliary_loss_clip": 0.01317001, "auxiliary_loss_mlp": 0.01036347, "balance_loss_clip": 1.05834901, "balance_loss_mlp": 1.02670336, "epoch": 0.2314675644802501, "flos": 36977648250240.0, "grad_norm": 2.1133939564369784, "language_loss": 0.84680337, "learning_rate": 3.58927177363245e-06, "loss": 0.87033689, "num_input_tokens_seen": 41099840, "step": 1925, "time_per_iteration": 2.8584423065185547 }, { "auxiliary_loss_clip": 0.01284719, "auxiliary_loss_mlp": 0.01032632, "balance_loss_clip": 1.05574751, "balance_loss_mlp": 1.0230118, "epoch": 0.2315878073708892, "flos": 23842207779840.0, "grad_norm": 3.158271724916513, "language_loss": 0.7298606, "learning_rate": 3.5887987500632447e-06, "loss": 0.75303411, "num_input_tokens_seen": 41117845, "step": 1926, "time_per_iteration": 2.7693490982055664 }, { "auxiliary_loss_clip": 0.01372117, "auxiliary_loss_mlp": 0.01035433, "balance_loss_clip": 1.05811548, "balance_loss_mlp": 1.02774405, "epoch": 0.2317080502615283, "flos": 23039424766080.0, "grad_norm": 1.7248048440630026, "language_loss": 0.84626186, "learning_rate": 3.5883254854745325e-06, "loss": 0.87033737, "num_input_tokens_seen": 41136235, "step": 1927, "time_per_iteration": 2.7901201248168945 }, { "auxiliary_loss_clip": 0.01272883, "auxiliary_loss_mlp": 0.01035085, "balance_loss_clip": 1.06170475, "balance_loss_mlp": 1.02512527, "epoch": 0.23182829315216738, "flos": 11254656435840.0, "grad_norm": 2.30014954904484, "language_loss": 0.75218666, "learning_rate": 3.587851979938107e-06, "loss": 0.77526629, "num_input_tokens_seen": 41153125, "step": 1928, "time_per_iteration": 2.6089015007019043 }, { "auxiliary_loss_clip": 0.01267038, "auxiliary_loss_mlp": 0.01040678, "balance_loss_clip": 1.06240034, "balance_loss_mlp": 1.03110528, "epoch": 0.23194853604280646, "flos": 19828939155840.0, "grad_norm": 1.9445959771673278, "language_loss": 0.7733947, "learning_rate": 3.5873782335257985e-06, "loss": 0.79647183, "num_input_tokens_seen": 41171290, "step": 1929, "time_per_iteration": 2.6734976768493652 }, { "auxiliary_loss_clip": 0.01367525, "auxiliary_loss_mlp": 0.01031845, "balance_loss_clip": 1.06249404, "balance_loss_mlp": 1.02263606, "epoch": 0.23206877893344555, "flos": 15305020830720.0, "grad_norm": 2.0128483730448776, "language_loss": 0.78547311, "learning_rate": 3.5869042463094744e-06, "loss": 0.80946678, "num_input_tokens_seen": 41189005, "step": 1930, "time_per_iteration": 2.699697256088257 }, { "auxiliary_loss_clip": 0.01405283, "auxiliary_loss_mlp": 0.01036645, "balance_loss_clip": 1.05299592, "balance_loss_mlp": 1.0274601, "epoch": 0.23218902182408466, "flos": 22711488572160.0, "grad_norm": 2.00933247131545, "language_loss": 0.77351147, "learning_rate": 3.586430018361038e-06, "loss": 0.79793078, "num_input_tokens_seen": 41208775, "step": 1931, "time_per_iteration": 2.8040659427642822 }, { "auxiliary_loss_clip": 0.01311829, "auxiliary_loss_mlp": 0.01034633, "balance_loss_clip": 1.05833638, "balance_loss_mlp": 1.02535844, "epoch": 0.23230926471472374, "flos": 22710734386560.0, "grad_norm": 2.6798154731059967, "language_loss": 0.76530367, "learning_rate": 3.5859555497524283e-06, "loss": 0.78876829, "num_input_tokens_seen": 41226010, "step": 1932, "time_per_iteration": 2.7362613677978516 }, { "auxiliary_loss_clip": 0.01268825, "auxiliary_loss_mlp": 0.0103274, "balance_loss_clip": 1.06345952, "balance_loss_mlp": 1.02341795, "epoch": 0.23242950760536282, "flos": 20375499479040.0, "grad_norm": 2.134678306216583, "language_loss": 0.9222852, "learning_rate": 3.5854808405556237e-06, "loss": 0.94530094, "num_input_tokens_seen": 41245245, "step": 1933, "time_per_iteration": 2.64353346824646 }, { "auxiliary_loss_clip": 0.01367803, "auxiliary_loss_mlp": 0.0103399, "balance_loss_clip": 1.05652928, "balance_loss_mlp": 1.02586603, "epoch": 0.23254975049600193, "flos": 16908324301440.0, "grad_norm": 4.110975466271458, "language_loss": 0.75843859, "learning_rate": 3.5850058908426355e-06, "loss": 0.78245658, "num_input_tokens_seen": 41263795, "step": 1934, "time_per_iteration": 2.7357468605041504 }, { "auxiliary_loss_clip": 0.01319328, "auxiliary_loss_mlp": 0.01038723, "balance_loss_clip": 1.0573138, "balance_loss_mlp": 1.02886438, "epoch": 0.23266999338664102, "flos": 23294821443840.0, "grad_norm": 3.106339726433307, "language_loss": 0.85599416, "learning_rate": 3.584530700685514e-06, "loss": 0.87957466, "num_input_tokens_seen": 41284055, "step": 1935, "time_per_iteration": 3.5847437381744385 }, { "auxiliary_loss_clip": 0.01313342, "auxiliary_loss_mlp": 0.0103007, "balance_loss_clip": 1.06197453, "balance_loss_mlp": 1.02154064, "epoch": 0.2327902362772801, "flos": 19569987031680.0, "grad_norm": 2.773967743169451, "language_loss": 0.88719773, "learning_rate": 3.5840552701563448e-06, "loss": 0.91063178, "num_input_tokens_seen": 41300255, "step": 1936, "time_per_iteration": 2.640392780303955 }, { "auxiliary_loss_clip": 0.0121809, "auxiliary_loss_mlp": 0.01030158, "balance_loss_clip": 1.06332827, "balance_loss_mlp": 1.02087116, "epoch": 0.2329104791679192, "flos": 16727514215040.0, "grad_norm": 2.1616852845805195, "language_loss": 0.82291329, "learning_rate": 3.5835795993272513e-06, "loss": 0.8453958, "num_input_tokens_seen": 41318540, "step": 1937, "time_per_iteration": 2.5955958366394043 }, { "auxiliary_loss_clip": 0.01560411, "auxiliary_loss_mlp": 0.01038165, "balance_loss_clip": 1.04929137, "balance_loss_mlp": 1.02926636, "epoch": 0.2330307220585583, "flos": 22163743100160.0, "grad_norm": 1.8796297699986553, "language_loss": 0.71565819, "learning_rate": 3.583103688270391e-06, "loss": 0.74164391, "num_input_tokens_seen": 41338320, "step": 1938, "time_per_iteration": 4.017335653305054 }, { "auxiliary_loss_clip": 0.01310907, "auxiliary_loss_mlp": 0.01035524, "balance_loss_clip": 1.05752754, "balance_loss_mlp": 1.02649379, "epoch": 0.23315096494919738, "flos": 19317319787520.0, "grad_norm": 2.2487917354460043, "language_loss": 0.89787412, "learning_rate": 3.58262753705796e-06, "loss": 0.9213385, "num_input_tokens_seen": 41353210, "step": 1939, "time_per_iteration": 3.1744954586029053 }, { "auxiliary_loss_clip": 0.01203553, "auxiliary_loss_mlp": 0.0100709, "balance_loss_clip": 1.02303195, "balance_loss_mlp": 1.00573087, "epoch": 0.23327120783983646, "flos": 53031048946560.0, "grad_norm": 0.7681578756123835, "language_loss": 0.55518496, "learning_rate": 3.5821511457621902e-06, "loss": 0.57729137, "num_input_tokens_seen": 41410510, "step": 1940, "time_per_iteration": 3.305053949356079 }, { "auxiliary_loss_clip": 0.01315125, "auxiliary_loss_mlp": 0.0103521, "balance_loss_clip": 1.0605166, "balance_loss_mlp": 1.02479064, "epoch": 0.23339145073047557, "flos": 17126984344320.0, "grad_norm": 10.77120844767438, "language_loss": 0.81499714, "learning_rate": 3.5816745144553497e-06, "loss": 0.8385005, "num_input_tokens_seen": 41425830, "step": 1941, "time_per_iteration": 4.122735500335693 }, { "auxiliary_loss_clip": 0.01406752, "auxiliary_loss_mlp": 0.01035558, "balance_loss_clip": 1.05725837, "balance_loss_mlp": 1.02622938, "epoch": 0.23351169362111465, "flos": 13078918419840.0, "grad_norm": 2.022049390516949, "language_loss": 0.75497305, "learning_rate": 3.5811976432097424e-06, "loss": 0.77939612, "num_input_tokens_seen": 41443500, "step": 1942, "time_per_iteration": 2.7634754180908203 }, { "auxiliary_loss_clip": 0.01266659, "auxiliary_loss_mlp": 0.02574003, "balance_loss_clip": 1.06383562, "balance_loss_mlp": 1.00023746, "epoch": 0.23363193651175373, "flos": 15851257931520.0, "grad_norm": 2.2065232985677894, "language_loss": 0.85059702, "learning_rate": 3.58072053209771e-06, "loss": 0.88900363, "num_input_tokens_seen": 41460055, "step": 1943, "time_per_iteration": 2.597238063812256 }, { "auxiliary_loss_clip": 0.01314458, "auxiliary_loss_mlp": 0.01032224, "balance_loss_clip": 1.05587244, "balance_loss_mlp": 1.02298522, "epoch": 0.23375217940239285, "flos": 21025769345280.0, "grad_norm": 2.781758331341295, "language_loss": 0.7923435, "learning_rate": 3.5802431811916296e-06, "loss": 0.81581032, "num_input_tokens_seen": 41476665, "step": 1944, "time_per_iteration": 2.716317892074585 }, { "auxiliary_loss_clip": 0.01311858, "auxiliary_loss_mlp": 0.01037754, "balance_loss_clip": 1.05909705, "balance_loss_mlp": 1.02919507, "epoch": 0.23387242229303193, "flos": 20594698225920.0, "grad_norm": 1.7573575396385546, "language_loss": 0.80691469, "learning_rate": 3.579765590563916e-06, "loss": 0.83041084, "num_input_tokens_seen": 41496065, "step": 1945, "time_per_iteration": 2.6737658977508545 }, { "auxiliary_loss_clip": 0.01261556, "auxiliary_loss_mlp": 0.01044341, "balance_loss_clip": 1.05992639, "balance_loss_mlp": 1.03558517, "epoch": 0.233992665183671, "flos": 24279491952000.0, "grad_norm": 2.5308152510148147, "language_loss": 0.82059437, "learning_rate": 3.579287760287017e-06, "loss": 0.84365338, "num_input_tokens_seen": 41516815, "step": 1946, "time_per_iteration": 3.6124868392944336 }, { "auxiliary_loss_clip": 0.01265451, "auxiliary_loss_mlp": 0.01042863, "balance_loss_clip": 1.06311667, "balance_loss_mlp": 1.03421485, "epoch": 0.2341129080743101, "flos": 30154621121280.0, "grad_norm": 2.800109301967222, "language_loss": 0.72646242, "learning_rate": 3.578809690433421e-06, "loss": 0.74954557, "num_input_tokens_seen": 41538525, "step": 1947, "time_per_iteration": 2.7733728885650635 }, { "auxiliary_loss_clip": 0.01226059, "auxiliary_loss_mlp": 0.01030777, "balance_loss_clip": 1.06853926, "balance_loss_mlp": 1.021824, "epoch": 0.2342331509649492, "flos": 22784135829120.0, "grad_norm": 6.269405518569942, "language_loss": 0.81350422, "learning_rate": 3.578331381075651e-06, "loss": 0.83607256, "num_input_tokens_seen": 41559025, "step": 1948, "time_per_iteration": 2.6663758754730225 }, { "auxiliary_loss_clip": 0.01268693, "auxiliary_loss_mlp": 0.0103513, "balance_loss_clip": 1.06054354, "balance_loss_mlp": 1.02617121, "epoch": 0.2343533938555883, "flos": 23623152687360.0, "grad_norm": 2.1343837253037643, "language_loss": 0.70236021, "learning_rate": 3.5778528322862646e-06, "loss": 0.72539842, "num_input_tokens_seen": 41577845, "step": 1949, "time_per_iteration": 2.6780872344970703 }, { "auxiliary_loss_clip": 0.01192808, "auxiliary_loss_mlp": 0.01036014, "balance_loss_clip": 1.06351829, "balance_loss_mlp": 1.02643538, "epoch": 0.23447363674622737, "flos": 24570332375040.0, "grad_norm": 1.5636887199867735, "language_loss": 0.86689496, "learning_rate": 3.5773740441378585e-06, "loss": 0.88918316, "num_input_tokens_seen": 41598600, "step": 1950, "time_per_iteration": 2.7372612953186035 }, { "auxiliary_loss_clip": 0.01267018, "auxiliary_loss_mlp": 0.01037934, "balance_loss_clip": 1.06264508, "balance_loss_mlp": 1.0289638, "epoch": 0.23459387963686648, "flos": 53140322119680.0, "grad_norm": 1.8369772268409867, "language_loss": 0.73931539, "learning_rate": 3.5768950167030633e-06, "loss": 0.76236498, "num_input_tokens_seen": 41623300, "step": 1951, "time_per_iteration": 2.918297052383423 }, { "auxiliary_loss_clip": 0.01308948, "auxiliary_loss_mlp": 0.0103757, "balance_loss_clip": 1.05538547, "balance_loss_mlp": 1.02831984, "epoch": 0.23471412252750556, "flos": 23951412103680.0, "grad_norm": 2.1587274892082893, "language_loss": 0.78558046, "learning_rate": 3.576415750054548e-06, "loss": 0.80904567, "num_input_tokens_seen": 41643420, "step": 1952, "time_per_iteration": 2.755760431289673 }, { "auxiliary_loss_clip": 0.01310601, "auxiliary_loss_mlp": 0.01038935, "balance_loss_clip": 1.05669212, "balance_loss_mlp": 1.02987516, "epoch": 0.23483436541814465, "flos": 15706573948800.0, "grad_norm": 2.372522104521301, "language_loss": 0.85843158, "learning_rate": 3.5759362442650172e-06, "loss": 0.88192689, "num_input_tokens_seen": 41660170, "step": 1953, "time_per_iteration": 2.663092851638794 }, { "auxiliary_loss_clip": 0.01274665, "auxiliary_loss_mlp": 0.01036839, "balance_loss_clip": 1.06847405, "balance_loss_mlp": 1.02710593, "epoch": 0.23495460830878373, "flos": 24936262179840.0, "grad_norm": 3.307450071177933, "language_loss": 0.85340488, "learning_rate": 3.5754564994072113e-06, "loss": 0.87651998, "num_input_tokens_seen": 41679010, "step": 1954, "time_per_iteration": 2.74505877494812 }, { "auxiliary_loss_clip": 0.0131288, "auxiliary_loss_mlp": 0.01028638, "balance_loss_clip": 1.05630064, "balance_loss_mlp": 1.01975727, "epoch": 0.23507485119942284, "flos": 30482665056000.0, "grad_norm": 11.684594704474568, "language_loss": 0.60231233, "learning_rate": 3.5749765155539067e-06, "loss": 0.62572753, "num_input_tokens_seen": 41699495, "step": 1955, "time_per_iteration": 2.7644052505493164 }, { "auxiliary_loss_clip": 0.01359186, "auxiliary_loss_mlp": 0.01034059, "balance_loss_clip": 1.05582666, "balance_loss_mlp": 1.02411127, "epoch": 0.23519509409006192, "flos": 18329129746560.0, "grad_norm": 2.2800192627872304, "language_loss": 0.92112887, "learning_rate": 3.574496292777917e-06, "loss": 0.94506133, "num_input_tokens_seen": 41717705, "step": 1956, "time_per_iteration": 2.754769802093506 }, { "auxiliary_loss_clip": 0.01323638, "auxiliary_loss_mlp": 0.0103402, "balance_loss_clip": 1.06239367, "balance_loss_mlp": 1.0249002, "epoch": 0.235315336980701, "flos": 29643217234560.0, "grad_norm": 2.1288370011008677, "language_loss": 0.71735018, "learning_rate": 3.574015831152092e-06, "loss": 0.74092674, "num_input_tokens_seen": 41738120, "step": 1957, "time_per_iteration": 2.745117425918579 }, { "auxiliary_loss_clip": 0.01307505, "auxiliary_loss_mlp": 0.01029849, "balance_loss_clip": 1.05888104, "balance_loss_mlp": 1.02142727, "epoch": 0.23543557987134012, "flos": 18551704371840.0, "grad_norm": 5.404891455028431, "language_loss": 0.83219945, "learning_rate": 3.573535130749316e-06, "loss": 0.85557294, "num_input_tokens_seen": 41756070, "step": 1958, "time_per_iteration": 2.6845388412475586 }, { "auxiliary_loss_clip": 0.0131169, "auxiliary_loss_mlp": 0.01030623, "balance_loss_clip": 1.06130385, "balance_loss_mlp": 1.02159917, "epoch": 0.2355558227619792, "flos": 24679033908480.0, "grad_norm": 1.71806240589035, "language_loss": 0.73680335, "learning_rate": 3.5730541916425127e-06, "loss": 0.76022649, "num_input_tokens_seen": 41777550, "step": 1959, "time_per_iteration": 2.762404203414917 }, { "auxiliary_loss_clip": 0.01362397, "auxiliary_loss_mlp": 0.01030627, "balance_loss_clip": 1.05571818, "balance_loss_mlp": 1.02152562, "epoch": 0.23567606565261828, "flos": 21944795748480.0, "grad_norm": 2.231724518467874, "language_loss": 0.86245835, "learning_rate": 3.572573013904639e-06, "loss": 0.8863886, "num_input_tokens_seen": 41797460, "step": 1960, "time_per_iteration": 2.7309954166412354 }, { "auxiliary_loss_clip": 0.0121586, "auxiliary_loss_mlp": 0.01037409, "balance_loss_clip": 1.06296539, "balance_loss_mlp": 1.02857542, "epoch": 0.2357963085432574, "flos": 13589352639360.0, "grad_norm": 1.8784314343320545, "language_loss": 0.92044175, "learning_rate": 3.572091597608689e-06, "loss": 0.94297445, "num_input_tokens_seen": 41815585, "step": 1961, "time_per_iteration": 2.5806539058685303 }, { "auxiliary_loss_clip": 0.01319417, "auxiliary_loss_mlp": 0.01034203, "balance_loss_clip": 1.05955577, "balance_loss_mlp": 1.02547681, "epoch": 0.23591655143389648, "flos": 22088689632000.0, "grad_norm": 2.3151343868669114, "language_loss": 0.73922431, "learning_rate": 3.571609942827694e-06, "loss": 0.76276052, "num_input_tokens_seen": 41834700, "step": 1962, "time_per_iteration": 3.689704179763794 }, { "auxiliary_loss_clip": 0.01313408, "auxiliary_loss_mlp": 0.01033653, "balance_loss_clip": 1.05775845, "balance_loss_mlp": 1.02488494, "epoch": 0.23603679432453556, "flos": 17017349057280.0, "grad_norm": 2.315185929686582, "language_loss": 0.88805771, "learning_rate": 3.57112804963472e-06, "loss": 0.91152829, "num_input_tokens_seen": 41852915, "step": 1963, "time_per_iteration": 2.7121055126190186 }, { "auxiliary_loss_clip": 0.01358915, "auxiliary_loss_mlp": 0.01030618, "balance_loss_clip": 1.0601573, "balance_loss_mlp": 1.02232075, "epoch": 0.23615703721517464, "flos": 19171307001600.0, "grad_norm": 1.97287172454595, "language_loss": 0.76365477, "learning_rate": 3.57064591810287e-06, "loss": 0.78755009, "num_input_tokens_seen": 41870415, "step": 1964, "time_per_iteration": 3.5276012420654297 }, { "auxiliary_loss_clip": 0.01216366, "auxiliary_loss_mlp": 0.02573999, "balance_loss_clip": 1.06324303, "balance_loss_mlp": 1.00018704, "epoch": 0.23627728010581375, "flos": 19098803399040.0, "grad_norm": 2.2420860372933076, "language_loss": 0.80667353, "learning_rate": 3.570163548305284e-06, "loss": 0.84457719, "num_input_tokens_seen": 41889345, "step": 1965, "time_per_iteration": 2.691075563430786 }, { "auxiliary_loss_clip": 0.01312744, "auxiliary_loss_mlp": 0.01032523, "balance_loss_clip": 1.0587399, "balance_loss_mlp": 1.02330852, "epoch": 0.23639752299645284, "flos": 14282213057280.0, "grad_norm": 2.1560587593258442, "language_loss": 0.69773656, "learning_rate": 3.569680940315135e-06, "loss": 0.72118926, "num_input_tokens_seen": 41905745, "step": 1966, "time_per_iteration": 2.62320613861084 }, { "auxiliary_loss_clip": 0.01366355, "auxiliary_loss_mlp": 0.01035774, "balance_loss_clip": 1.05538809, "balance_loss_mlp": 1.02645183, "epoch": 0.23651776588709192, "flos": 22893411980160.0, "grad_norm": 1.8147274837027487, "language_loss": 0.81827688, "learning_rate": 3.5691980942056356e-06, "loss": 0.84229815, "num_input_tokens_seen": 41925115, "step": 1967, "time_per_iteration": 2.7448320388793945 }, { "auxiliary_loss_clip": 0.01267062, "auxiliary_loss_mlp": 0.0103266, "balance_loss_clip": 1.0581491, "balance_loss_mlp": 1.02358246, "epoch": 0.23663800877773103, "flos": 18624531196800.0, "grad_norm": 1.9691872527670764, "language_loss": 0.79817414, "learning_rate": 3.5687150100500332e-06, "loss": 0.8211714, "num_input_tokens_seen": 41944815, "step": 1968, "time_per_iteration": 3.673537015914917 }, { "auxiliary_loss_clip": 0.01270196, "auxiliary_loss_mlp": 0.01034175, "balance_loss_clip": 1.0625664, "balance_loss_mlp": 1.0252645, "epoch": 0.2367582516683701, "flos": 25555828896000.0, "grad_norm": 1.8866810138466232, "language_loss": 0.74434906, "learning_rate": 3.568231687921611e-06, "loss": 0.76739275, "num_input_tokens_seen": 41964990, "step": 1969, "time_per_iteration": 2.8294010162353516 }, { "auxiliary_loss_clip": 0.01212969, "auxiliary_loss_mlp": 0.01034756, "balance_loss_clip": 1.06124902, "balance_loss_mlp": 1.02666163, "epoch": 0.2368784945590092, "flos": 23295072839040.0, "grad_norm": 1.611411390724804, "language_loss": 0.8067534, "learning_rate": 3.5677481278936883e-06, "loss": 0.82923067, "num_input_tokens_seen": 41984570, "step": 1970, "time_per_iteration": 2.633720874786377 }, { "auxiliary_loss_clip": 0.01214663, "auxiliary_loss_mlp": 0.01001824, "balance_loss_clip": 1.02942157, "balance_loss_mlp": 1.00039911, "epoch": 0.23699873744964828, "flos": 69859291875840.0, "grad_norm": 0.8380039256783458, "language_loss": 0.57857513, "learning_rate": 3.5672643300396214e-06, "loss": 0.60073996, "num_input_tokens_seen": 42053715, "step": 1971, "time_per_iteration": 3.3568155765533447 }, { "auxiliary_loss_clip": 0.01361967, "auxiliary_loss_mlp": 0.01029079, "balance_loss_clip": 1.05771732, "balance_loss_mlp": 1.02097893, "epoch": 0.2371189803402874, "flos": 21835052720640.0, "grad_norm": 2.486289940663306, "language_loss": 0.67199159, "learning_rate": 3.566780294432802e-06, "loss": 0.69590205, "num_input_tokens_seen": 42070890, "step": 1972, "time_per_iteration": 3.578482151031494 }, { "auxiliary_loss_clip": 0.01212149, "auxiliary_loss_mlp": 0.01037378, "balance_loss_clip": 1.05975878, "balance_loss_mlp": 1.02877676, "epoch": 0.23723922323092647, "flos": 21908490076800.0, "grad_norm": 2.7585687578577693, "language_loss": 0.74966359, "learning_rate": 3.566296021146657e-06, "loss": 0.77215886, "num_input_tokens_seen": 42090270, "step": 1973, "time_per_iteration": 2.655172348022461 }, { "auxiliary_loss_clip": 0.01217586, "auxiliary_loss_mlp": 0.01038596, "balance_loss_clip": 1.06187582, "balance_loss_mlp": 1.02922583, "epoch": 0.23735946612156555, "flos": 32708803380480.0, "grad_norm": 2.0995161708564365, "language_loss": 0.73226601, "learning_rate": 3.565811510254652e-06, "loss": 0.7548278, "num_input_tokens_seen": 42111150, "step": 1974, "time_per_iteration": 2.792651414871216 }, { "auxiliary_loss_clip": 0.0116899, "auxiliary_loss_mlp": 0.01003555, "balance_loss_clip": 1.03577042, "balance_loss_mlp": 1.00207698, "epoch": 0.23747970901220466, "flos": 70546944821760.0, "grad_norm": 0.8417150792464798, "language_loss": 0.58221865, "learning_rate": 3.5653267618302845e-06, "loss": 0.60394406, "num_input_tokens_seen": 42178730, "step": 1975, "time_per_iteration": 3.2486581802368164 }, { "auxiliary_loss_clip": 0.01214581, "auxiliary_loss_mlp": 0.01028898, "balance_loss_clip": 1.06205821, "balance_loss_mlp": 1.02068472, "epoch": 0.23759995190284375, "flos": 20849807594880.0, "grad_norm": 2.020923440478174, "language_loss": 0.85912204, "learning_rate": 3.564841775947093e-06, "loss": 0.88155681, "num_input_tokens_seen": 42199620, "step": 1976, "time_per_iteration": 2.642845630645752 }, { "auxiliary_loss_clip": 0.01363986, "auxiliary_loss_mlp": 0.01038245, "balance_loss_clip": 1.05520058, "balance_loss_mlp": 1.02973986, "epoch": 0.23772019479348283, "flos": 32921645420160.0, "grad_norm": 2.106009236525713, "language_loss": 0.76187587, "learning_rate": 3.5643565526786475e-06, "loss": 0.78589821, "num_input_tokens_seen": 42219560, "step": 1977, "time_per_iteration": 2.781310796737671 }, { "auxiliary_loss_clip": 0.01218005, "auxiliary_loss_mlp": 0.01031576, "balance_loss_clip": 1.06349182, "balance_loss_mlp": 1.02338004, "epoch": 0.2378404376841219, "flos": 32342765834880.0, "grad_norm": 1.6646778667112287, "language_loss": 0.77335358, "learning_rate": 3.5638710920985574e-06, "loss": 0.79584944, "num_input_tokens_seen": 42241020, "step": 1978, "time_per_iteration": 2.7182579040527344 }, { "auxiliary_loss_clip": 0.01268689, "auxiliary_loss_mlp": 0.02575929, "balance_loss_clip": 1.05738401, "balance_loss_mlp": 1.00015271, "epoch": 0.23796068057476102, "flos": 22997624313600.0, "grad_norm": 2.2701671042219673, "language_loss": 0.82114089, "learning_rate": 3.5633853942804655e-06, "loss": 0.85958707, "num_input_tokens_seen": 42259345, "step": 1979, "time_per_iteration": 2.6937849521636963 }, { "auxiliary_loss_clip": 0.01358531, "auxiliary_loss_mlp": 0.01043075, "balance_loss_clip": 1.05047262, "balance_loss_mlp": 1.03332412, "epoch": 0.2380809234654001, "flos": 13480938414720.0, "grad_norm": 2.5441354146983355, "language_loss": 0.77011275, "learning_rate": 3.5628994592980527e-06, "loss": 0.79412889, "num_input_tokens_seen": 42277250, "step": 1980, "time_per_iteration": 2.6846225261688232 }, { "auxiliary_loss_clip": 0.01214338, "auxiliary_loss_mlp": 0.01029291, "balance_loss_clip": 1.06030369, "balance_loss_mlp": 1.02029061, "epoch": 0.2382011663560392, "flos": 16871803148160.0, "grad_norm": 1.881965822172395, "language_loss": 0.70491612, "learning_rate": 3.562413287225034e-06, "loss": 0.72735244, "num_input_tokens_seen": 42295360, "step": 1981, "time_per_iteration": 2.671572208404541 }, { "auxiliary_loss_clip": 0.01260361, "auxiliary_loss_mlp": 0.0103509, "balance_loss_clip": 1.05882716, "balance_loss_mlp": 1.02587533, "epoch": 0.2383214092466783, "flos": 18441135331200.0, "grad_norm": 2.507571590224718, "language_loss": 0.89286542, "learning_rate": 3.5619268781351623e-06, "loss": 0.91581994, "num_input_tokens_seen": 42313430, "step": 1982, "time_per_iteration": 2.736248731613159 }, { "auxiliary_loss_clip": 0.01306625, "auxiliary_loss_mlp": 0.01036559, "balance_loss_clip": 1.0591445, "balance_loss_mlp": 1.02762437, "epoch": 0.23844165213731738, "flos": 19755717281280.0, "grad_norm": 1.8050434164249045, "language_loss": 0.76920247, "learning_rate": 3.5614402321022256e-06, "loss": 0.79263437, "num_input_tokens_seen": 42331260, "step": 1983, "time_per_iteration": 2.777867317199707 }, { "auxiliary_loss_clip": 0.01405862, "auxiliary_loss_mlp": 0.01042337, "balance_loss_clip": 1.05546391, "balance_loss_mlp": 1.03346765, "epoch": 0.23856189502795647, "flos": 23367360960000.0, "grad_norm": 1.9057890217624283, "language_loss": 0.87069786, "learning_rate": 3.5609533492000463e-06, "loss": 0.89517975, "num_input_tokens_seen": 42350150, "step": 1984, "time_per_iteration": 2.8368828296661377 }, { "auxiliary_loss_clip": 0.01309934, "auxiliary_loss_mlp": 0.01027565, "balance_loss_clip": 1.06023574, "balance_loss_mlp": 1.01854038, "epoch": 0.23868213791859555, "flos": 23475056912640.0, "grad_norm": 2.2607608470244025, "language_loss": 0.78625858, "learning_rate": 3.560466229502485e-06, "loss": 0.80963349, "num_input_tokens_seen": 42369495, "step": 1985, "time_per_iteration": 2.8222124576568604 }, { "auxiliary_loss_clip": 0.01309288, "auxiliary_loss_mlp": 0.02572892, "balance_loss_clip": 1.05953538, "balance_loss_mlp": 1.00016475, "epoch": 0.23880238080923466, "flos": 16617340224000.0, "grad_norm": 2.097917112169926, "language_loss": 0.90552491, "learning_rate": 3.5599788730834384e-06, "loss": 0.94434667, "num_input_tokens_seen": 42387455, "step": 1986, "time_per_iteration": 2.7352869510650635 }, { "auxiliary_loss_clip": 0.01268022, "auxiliary_loss_mlp": 0.01028866, "balance_loss_clip": 1.05988789, "balance_loss_mlp": 1.01972818, "epoch": 0.23892262369987374, "flos": 17348409734400.0, "grad_norm": 2.11975285821781, "language_loss": 0.78729534, "learning_rate": 3.559491280016836e-06, "loss": 0.81026423, "num_input_tokens_seen": 42405400, "step": 1987, "time_per_iteration": 2.6619064807891846 }, { "auxiliary_loss_clip": 0.01309793, "auxiliary_loss_mlp": 0.01032637, "balance_loss_clip": 1.05692649, "balance_loss_mlp": 1.02419674, "epoch": 0.23904286659051283, "flos": 22309899540480.0, "grad_norm": 1.650851691020631, "language_loss": 0.70798135, "learning_rate": 3.5590034503766465e-06, "loss": 0.73140562, "num_input_tokens_seen": 42425065, "step": 1988, "time_per_iteration": 3.600346565246582 }, { "auxiliary_loss_clip": 0.01216045, "auxiliary_loss_mlp": 0.01032173, "balance_loss_clip": 1.06330311, "balance_loss_mlp": 1.02353644, "epoch": 0.23916310948115194, "flos": 21178246579200.0, "grad_norm": 2.0397553111468887, "language_loss": 0.81040114, "learning_rate": 3.558515384236874e-06, "loss": 0.83288336, "num_input_tokens_seen": 42442495, "step": 1989, "time_per_iteration": 2.5338199138641357 }, { "auxiliary_loss_clip": 0.01353023, "auxiliary_loss_mlp": 0.02574616, "balance_loss_clip": 1.05504668, "balance_loss_mlp": 1.00017416, "epoch": 0.23928335237179102, "flos": 14137349506560.0, "grad_norm": 1.8592027016786858, "language_loss": 0.84102112, "learning_rate": 3.558027081671556e-06, "loss": 0.88029748, "num_input_tokens_seen": 42459480, "step": 1990, "time_per_iteration": 3.3376665115356445 }, { "auxiliary_loss_clip": 0.01266204, "auxiliary_loss_mlp": 0.01030024, "balance_loss_clip": 1.05793333, "balance_loss_mlp": 1.02114844, "epoch": 0.2394035952624301, "flos": 23769596436480.0, "grad_norm": 1.6927789625105827, "language_loss": 0.6882571, "learning_rate": 3.557538542754769e-06, "loss": 0.71121937, "num_input_tokens_seen": 42479175, "step": 1991, "time_per_iteration": 2.5560810565948486 }, { "auxiliary_loss_clip": 0.0121992, "auxiliary_loss_mlp": 0.01035888, "balance_loss_clip": 1.06575692, "balance_loss_mlp": 1.02659559, "epoch": 0.2395238381530692, "flos": 24206198250240.0, "grad_norm": 2.379044128226998, "language_loss": 0.66700256, "learning_rate": 3.557049767560623e-06, "loss": 0.68956071, "num_input_tokens_seen": 42498090, "step": 1992, "time_per_iteration": 2.6965601444244385 }, { "auxiliary_loss_clip": 0.01416011, "auxiliary_loss_mlp": 0.01033342, "balance_loss_clip": 1.06122541, "balance_loss_mlp": 1.02488422, "epoch": 0.2396440810437083, "flos": 25295763450240.0, "grad_norm": 2.7165730784434112, "language_loss": 0.85916483, "learning_rate": 3.5565607561632655e-06, "loss": 0.88365829, "num_input_tokens_seen": 42516930, "step": 1993, "time_per_iteration": 3.685983419418335 }, { "auxiliary_loss_clip": 0.01311788, "auxiliary_loss_mlp": 0.01031145, "balance_loss_clip": 1.06021059, "balance_loss_mlp": 1.02232385, "epoch": 0.23976432393434738, "flos": 28543093436160.0, "grad_norm": 2.7150633278523983, "language_loss": 0.79951727, "learning_rate": 3.5560715086368787e-06, "loss": 0.82294667, "num_input_tokens_seen": 42534800, "step": 1994, "time_per_iteration": 2.750213623046875 }, { "auxiliary_loss_clip": 0.01309537, "auxiliary_loss_mlp": 0.0103627, "balance_loss_clip": 1.0612725, "balance_loss_mlp": 1.0273416, "epoch": 0.23988456682498646, "flos": 19494358945920.0, "grad_norm": 2.116510283387766, "language_loss": 0.82538503, "learning_rate": 3.5555820250556816e-06, "loss": 0.8488431, "num_input_tokens_seen": 42552000, "step": 1995, "time_per_iteration": 2.6314034461975098 }, { "auxiliary_loss_clip": 0.01241419, "auxiliary_loss_mlp": 0.0103151, "balance_loss_clip": 1.06253767, "balance_loss_mlp": 1.02220619, "epoch": 0.24000480971562557, "flos": 20266331068800.0, "grad_norm": 2.377665055768994, "language_loss": 0.69521999, "learning_rate": 3.5550923054939278e-06, "loss": 0.71794927, "num_input_tokens_seen": 42571455, "step": 1996, "time_per_iteration": 2.6452934741973877 }, { "auxiliary_loss_clip": 0.01338127, "auxiliary_loss_mlp": 0.01039711, "balance_loss_clip": 1.05845046, "balance_loss_mlp": 1.03147936, "epoch": 0.24012505260626466, "flos": 25443176866560.0, "grad_norm": 2.1074794213469943, "language_loss": 0.74738097, "learning_rate": 3.5546023500259083e-06, "loss": 0.77115935, "num_input_tokens_seen": 42592550, "step": 1997, "time_per_iteration": 2.725383758544922 }, { "auxiliary_loss_clip": 0.01418492, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.05386472, "balance_loss_mlp": 1.02637875, "epoch": 0.24024529549690374, "flos": 15553342529280.0, "grad_norm": 2.18626864242386, "language_loss": 0.8076126, "learning_rate": 3.5541121587259477e-06, "loss": 0.83214772, "num_input_tokens_seen": 42610385, "step": 1998, "time_per_iteration": 3.613677740097046 }, { "auxiliary_loss_clip": 0.01158875, "auxiliary_loss_mlp": 0.009993, "balance_loss_clip": 1.03155887, "balance_loss_mlp": 0.99806005, "epoch": 0.24036553838754285, "flos": 57122351867520.0, "grad_norm": 0.8471247679829159, "language_loss": 0.57873303, "learning_rate": 3.553621731668408e-06, "loss": 0.60031474, "num_input_tokens_seen": 42673595, "step": 1999, "time_per_iteration": 3.187566041946411 }, { "auxiliary_loss_clip": 0.01263527, "auxiliary_loss_mlp": 0.01030522, "balance_loss_clip": 1.05807412, "balance_loss_mlp": 1.02145588, "epoch": 0.24048578127818193, "flos": 24969946158720.0, "grad_norm": 1.9242364563531915, "language_loss": 0.83397371, "learning_rate": 3.553131068927688e-06, "loss": 0.85691416, "num_input_tokens_seen": 42692000, "step": 2000, "time_per_iteration": 2.661674737930298 }, { "auxiliary_loss_clip": 0.01362067, "auxiliary_loss_mlp": 0.01031467, "balance_loss_clip": 1.05913949, "balance_loss_mlp": 1.02263355, "epoch": 0.24060602416882101, "flos": 23330947547520.0, "grad_norm": 1.8435314135481022, "language_loss": 0.80364478, "learning_rate": 3.552640170578219e-06, "loss": 0.82758009, "num_input_tokens_seen": 42712250, "step": 2001, "time_per_iteration": 2.7319180965423584 }, { "auxiliary_loss_clip": 0.01312829, "auxiliary_loss_mlp": 0.01031744, "balance_loss_clip": 1.0576216, "balance_loss_mlp": 1.02279711, "epoch": 0.2407262670594601, "flos": 14173260128640.0, "grad_norm": 2.247766143998083, "language_loss": 0.78108573, "learning_rate": 3.5521490366944703e-06, "loss": 0.80453146, "num_input_tokens_seen": 42729900, "step": 2002, "time_per_iteration": 2.619098663330078 }, { "auxiliary_loss_clip": 0.01357781, "auxiliary_loss_mlp": 0.01038717, "balance_loss_clip": 1.05494809, "balance_loss_mlp": 1.030092, "epoch": 0.2408465099500992, "flos": 13663113217920.0, "grad_norm": 2.6306154895084375, "language_loss": 0.80094069, "learning_rate": 3.5516576673509474e-06, "loss": 0.82490563, "num_input_tokens_seen": 42747900, "step": 2003, "time_per_iteration": 2.7244203090667725 }, { "auxiliary_loss_clip": 0.01218792, "auxiliary_loss_mlp": 0.01035916, "balance_loss_clip": 1.06334496, "balance_loss_mlp": 1.02716589, "epoch": 0.2409667528407383, "flos": 31248029076480.0, "grad_norm": 1.9714584207925971, "language_loss": 0.86039865, "learning_rate": 3.5511660626221896e-06, "loss": 0.88294578, "num_input_tokens_seen": 42768540, "step": 2004, "time_per_iteration": 2.702199935913086 }, { "auxiliary_loss_clip": 0.01310774, "auxiliary_loss_mlp": 0.02576519, "balance_loss_clip": 1.06084776, "balance_loss_mlp": 1.00027585, "epoch": 0.24108699573137737, "flos": 22199941031040.0, "grad_norm": 2.362305996985324, "language_loss": 0.89356732, "learning_rate": 3.5506742225827744e-06, "loss": 0.93244016, "num_input_tokens_seen": 42785395, "step": 2005, "time_per_iteration": 2.7343859672546387 }, { "auxiliary_loss_clip": 0.01360394, "auxiliary_loss_mlp": 0.01035261, "balance_loss_clip": 1.05509853, "balance_loss_mlp": 1.02651143, "epoch": 0.24120723862201648, "flos": 26103035664000.0, "grad_norm": 2.2577006207401342, "language_loss": 0.90309417, "learning_rate": 3.5501821473073116e-06, "loss": 0.92705065, "num_input_tokens_seen": 42801980, "step": 2006, "time_per_iteration": 2.799283266067505 }, { "auxiliary_loss_clip": 0.0136762, "auxiliary_loss_mlp": 0.0103922, "balance_loss_clip": 1.06252253, "balance_loss_mlp": 1.02981424, "epoch": 0.24132748151265557, "flos": 18624926246400.0, "grad_norm": 2.355780744988493, "language_loss": 0.86886299, "learning_rate": 3.54968983687045e-06, "loss": 0.89293134, "num_input_tokens_seen": 42818850, "step": 2007, "time_per_iteration": 2.805720090866089 }, { "auxiliary_loss_clip": 0.01327076, "auxiliary_loss_mlp": 0.01034896, "balance_loss_clip": 1.06501758, "balance_loss_mlp": 1.02528739, "epoch": 0.24144772440329465, "flos": 15267673664640.0, "grad_norm": 4.704649427077454, "language_loss": 0.8913883, "learning_rate": 3.549197291346872e-06, "loss": 0.91500795, "num_input_tokens_seen": 42835375, "step": 2008, "time_per_iteration": 2.692507266998291 }, { "auxiliary_loss_clip": 0.01267802, "auxiliary_loss_mlp": 0.01033382, "balance_loss_clip": 1.05963373, "balance_loss_mlp": 1.0236429, "epoch": 0.24156796729393373, "flos": 24024274842240.0, "grad_norm": 2.553275814649865, "language_loss": 0.79225433, "learning_rate": 3.548704510811297e-06, "loss": 0.81526619, "num_input_tokens_seen": 42854570, "step": 2009, "time_per_iteration": 2.692596197128296 }, { "auxiliary_loss_clip": 0.01337367, "auxiliary_loss_mlp": 0.01034317, "balance_loss_clip": 1.0578258, "balance_loss_mlp": 1.02455926, "epoch": 0.24168821018457284, "flos": 26286790665600.0, "grad_norm": 3.780559895009379, "language_loss": 0.74971807, "learning_rate": 3.5482114953384787e-06, "loss": 0.77343494, "num_input_tokens_seen": 42873800, "step": 2010, "time_per_iteration": 2.83145809173584 }, { "auxiliary_loss_clip": 0.01267707, "auxiliary_loss_mlp": 0.01028913, "balance_loss_clip": 1.05987489, "balance_loss_mlp": 1.01998973, "epoch": 0.24180845307521193, "flos": 18223193560320.0, "grad_norm": 2.8863378163734335, "language_loss": 0.84732389, "learning_rate": 3.5477182450032077e-06, "loss": 0.87029004, "num_input_tokens_seen": 42892400, "step": 2011, "time_per_iteration": 2.62113356590271 }, { "auxiliary_loss_clip": 0.01264079, "auxiliary_loss_mlp": 0.01035719, "balance_loss_clip": 1.06093526, "balance_loss_mlp": 1.02550304, "epoch": 0.241928695965851, "flos": 20449260057600.0, "grad_norm": 2.4706478739861994, "language_loss": 0.83265519, "learning_rate": 3.5472247598803097e-06, "loss": 0.85565317, "num_input_tokens_seen": 42911745, "step": 2012, "time_per_iteration": 2.6657283306121826 }, { "auxiliary_loss_clip": 0.01216903, "auxiliary_loss_mlp": 0.0102583, "balance_loss_clip": 1.06109309, "balance_loss_mlp": 1.01607907, "epoch": 0.24204893885649012, "flos": 25556475340800.0, "grad_norm": 4.04297839284616, "language_loss": 0.85286772, "learning_rate": 3.546731040044645e-06, "loss": 0.87529504, "num_input_tokens_seen": 42926915, "step": 2013, "time_per_iteration": 2.614208459854126 }, { "auxiliary_loss_clip": 0.0121723, "auxiliary_loss_mlp": 0.01040813, "balance_loss_clip": 1.06363845, "balance_loss_mlp": 1.0311811, "epoch": 0.2421691817471292, "flos": 30660207004800.0, "grad_norm": 2.2108174196615433, "language_loss": 0.75252712, "learning_rate": 3.546237085571112e-06, "loss": 0.7751075, "num_input_tokens_seen": 42945350, "step": 2014, "time_per_iteration": 3.5687689781188965 }, { "auxiliary_loss_clip": 0.01268409, "auxiliary_loss_mlp": 0.01030749, "balance_loss_clip": 1.0627507, "balance_loss_mlp": 1.02164114, "epoch": 0.24228942463776829, "flos": 21945011230080.0, "grad_norm": 3.1361228160049675, "language_loss": 0.72493315, "learning_rate": 3.5457428965346425e-06, "loss": 0.74792469, "num_input_tokens_seen": 42964290, "step": 2015, "time_per_iteration": 2.6901602745056152 }, { "auxiliary_loss_clip": 0.01465297, "auxiliary_loss_mlp": 0.01032528, "balance_loss_clip": 1.05375671, "balance_loss_mlp": 1.02343798, "epoch": 0.2424096675284074, "flos": 33984493879680.0, "grad_norm": 1.6285959609266867, "language_loss": 0.74736071, "learning_rate": 3.545248473010205e-06, "loss": 0.77233893, "num_input_tokens_seen": 42987095, "step": 2016, "time_per_iteration": 3.8339364528656006 }, { "auxiliary_loss_clip": 0.01219904, "auxiliary_loss_mlp": 0.02582677, "balance_loss_clip": 1.06295061, "balance_loss_mlp": 1.00026536, "epoch": 0.24252991041904648, "flos": 21653416621440.0, "grad_norm": 1.8164999601366036, "language_loss": 0.880404, "learning_rate": 3.544753815072802e-06, "loss": 0.91842979, "num_input_tokens_seen": 43005750, "step": 2017, "time_per_iteration": 2.8230113983154297 }, { "auxiliary_loss_clip": 0.01514801, "auxiliary_loss_mlp": 0.01035001, "balance_loss_clip": 1.05293524, "balance_loss_mlp": 1.02533889, "epoch": 0.24265015330968556, "flos": 21870065502720.0, "grad_norm": 1.9165965199080641, "language_loss": 0.88270915, "learning_rate": 3.544258922797474e-06, "loss": 0.90820718, "num_input_tokens_seen": 43023870, "step": 2018, "time_per_iteration": 2.9394783973693848 }, { "auxiliary_loss_clip": 0.01214447, "auxiliary_loss_mlp": 0.01033969, "balance_loss_clip": 1.06110215, "balance_loss_mlp": 1.02478421, "epoch": 0.24277039620032465, "flos": 25628260671360.0, "grad_norm": 1.6914831417435723, "language_loss": 0.78350824, "learning_rate": 3.543763796259295e-06, "loss": 0.80599236, "num_input_tokens_seen": 43043825, "step": 2019, "time_per_iteration": 4.181771755218506 }, { "auxiliary_loss_clip": 0.01261113, "auxiliary_loss_mlp": 0.01037179, "balance_loss_clip": 1.05909586, "balance_loss_mlp": 1.02760696, "epoch": 0.24289063909096376, "flos": 26286575184000.0, "grad_norm": 1.8065392786198953, "language_loss": 0.90800297, "learning_rate": 3.5432684355333754e-06, "loss": 0.93098593, "num_input_tokens_seen": 43062480, "step": 2020, "time_per_iteration": 2.7367825508117676 }, { "auxiliary_loss_clip": 0.01265048, "auxiliary_loss_mlp": 0.01031704, "balance_loss_clip": 1.05884027, "balance_loss_mlp": 1.02251899, "epoch": 0.24301088198160284, "flos": 25075056332160.0, "grad_norm": 3.0508284778579164, "language_loss": 0.76728487, "learning_rate": 3.5427728406948613e-06, "loss": 0.79025239, "num_input_tokens_seen": 43081595, "step": 2021, "time_per_iteration": 2.6627392768859863 }, { "auxiliary_loss_clip": 0.01215785, "auxiliary_loss_mlp": 0.01001111, "balance_loss_clip": 1.03045678, "balance_loss_mlp": 0.99981135, "epoch": 0.24313112487224192, "flos": 69900948673920.0, "grad_norm": 0.7497471874378326, "language_loss": 0.57918411, "learning_rate": 3.542277011818934e-06, "loss": 0.60135305, "num_input_tokens_seen": 43145430, "step": 2022, "time_per_iteration": 3.4131600856781006 }, { "auxiliary_loss_clip": 0.01235061, "auxiliary_loss_mlp": 0.01034721, "balance_loss_clip": 1.0626725, "balance_loss_mlp": 1.02616763, "epoch": 0.24325136776288103, "flos": 40662334235520.0, "grad_norm": 2.014982228662073, "language_loss": 0.74118161, "learning_rate": 3.5417809489808104e-06, "loss": 0.76387942, "num_input_tokens_seen": 43167040, "step": 2023, "time_per_iteration": 2.872576951980591 }, { "auxiliary_loss_clip": 0.0126822, "auxiliary_loss_mlp": 0.01037005, "balance_loss_clip": 1.06218314, "balance_loss_mlp": 1.02793896, "epoch": 0.24337161065352012, "flos": 25046400257280.0, "grad_norm": 2.002194433395579, "language_loss": 0.72552025, "learning_rate": 3.5412846522557422e-06, "loss": 0.74857253, "num_input_tokens_seen": 43187930, "step": 2024, "time_per_iteration": 3.6466939449310303 }, { "auxiliary_loss_clip": 0.01218411, "auxiliary_loss_mlp": 0.01037313, "balance_loss_clip": 1.06427062, "balance_loss_mlp": 1.02787161, "epoch": 0.2434918535441592, "flos": 18661160090880.0, "grad_norm": 2.095112210630518, "language_loss": 0.73884177, "learning_rate": 3.540788121719018e-06, "loss": 0.76139903, "num_input_tokens_seen": 43206350, "step": 2025, "time_per_iteration": 2.598159074783325 }, { "auxiliary_loss_clip": 0.0136401, "auxiliary_loss_mlp": 0.01038537, "balance_loss_clip": 1.06395447, "balance_loss_mlp": 1.02960205, "epoch": 0.24361209643479828, "flos": 23915142345600.0, "grad_norm": 2.131651190888136, "language_loss": 0.82259047, "learning_rate": 3.5402913574459604e-06, "loss": 0.84661597, "num_input_tokens_seen": 43226255, "step": 2026, "time_per_iteration": 2.7055678367614746 }, { "auxiliary_loss_clip": 0.01377833, "auxiliary_loss_mlp": 0.01029067, "balance_loss_clip": 1.05007672, "balance_loss_mlp": 1.02025771, "epoch": 0.2437323393254374, "flos": 28657505232000.0, "grad_norm": 5.4704559427499175, "language_loss": 0.86235315, "learning_rate": 3.5397943595119297e-06, "loss": 0.88642216, "num_input_tokens_seen": 43247675, "step": 2027, "time_per_iteration": 2.838533878326416 }, { "auxiliary_loss_clip": 0.01312439, "auxiliary_loss_mlp": 0.01033824, "balance_loss_clip": 1.06176436, "balance_loss_mlp": 1.02503812, "epoch": 0.24385258221607647, "flos": 23550325862400.0, "grad_norm": 2.7597959833512826, "language_loss": 0.7724818, "learning_rate": 3.5392971279923177e-06, "loss": 0.79594433, "num_input_tokens_seen": 43265895, "step": 2028, "time_per_iteration": 2.691261053085327 }, { "auxiliary_loss_clip": 0.01361591, "auxiliary_loss_mlp": 0.01038411, "balance_loss_clip": 1.05517542, "balance_loss_mlp": 1.02910674, "epoch": 0.24397282510671556, "flos": 25336091445120.0, "grad_norm": 2.3025071263391492, "language_loss": 0.82986832, "learning_rate": 3.5387996629625557e-06, "loss": 0.85386837, "num_input_tokens_seen": 43283485, "step": 2029, "time_per_iteration": 2.8049063682556152 }, { "auxiliary_loss_clip": 0.01111175, "auxiliary_loss_mlp": 0.0100408, "balance_loss_clip": 1.02996111, "balance_loss_mlp": 1.00282192, "epoch": 0.24409306799735467, "flos": 65187421430400.0, "grad_norm": 0.805736003588865, "language_loss": 0.54988122, "learning_rate": 3.5383019644981083e-06, "loss": 0.57103372, "num_input_tokens_seen": 43347180, "step": 2030, "time_per_iteration": 3.201660394668579 }, { "auxiliary_loss_clip": 0.01309384, "auxiliary_loss_mlp": 0.010323, "balance_loss_clip": 1.05774641, "balance_loss_mlp": 1.02319813, "epoch": 0.24421331088799375, "flos": 19537093152000.0, "grad_norm": 2.577626830585538, "language_loss": 0.73181158, "learning_rate": 3.5378040326744763e-06, "loss": 0.7552284, "num_input_tokens_seen": 43366665, "step": 2031, "time_per_iteration": 2.637068748474121 }, { "auxiliary_loss_clip": 0.01358184, "auxiliary_loss_mlp": 0.01030341, "balance_loss_clip": 1.05798578, "balance_loss_mlp": 1.02159071, "epoch": 0.24433355377863283, "flos": 21068575378560.0, "grad_norm": 2.127481236335183, "language_loss": 0.86031979, "learning_rate": 3.5373058675671946e-06, "loss": 0.88420498, "num_input_tokens_seen": 43384670, "step": 2032, "time_per_iteration": 2.7284598350524902 }, { "auxiliary_loss_clip": 0.01408005, "auxiliary_loss_mlp": 0.0103152, "balance_loss_clip": 1.05329823, "balance_loss_mlp": 1.02277637, "epoch": 0.24445379666927192, "flos": 22637189289600.0, "grad_norm": 2.0772509916994375, "language_loss": 0.72653651, "learning_rate": 3.536807469251836e-06, "loss": 0.75093174, "num_input_tokens_seen": 43403825, "step": 2033, "time_per_iteration": 2.7652342319488525 }, { "auxiliary_loss_clip": 0.01288093, "auxiliary_loss_mlp": 0.01029565, "balance_loss_clip": 1.05492878, "balance_loss_mlp": 1.02048171, "epoch": 0.24457403955991103, "flos": 21251612108160.0, "grad_norm": 1.6904256824664252, "language_loss": 0.83272541, "learning_rate": 3.5363088378040055e-06, "loss": 0.85590196, "num_input_tokens_seen": 43422715, "step": 2034, "time_per_iteration": 2.7444818019866943 }, { "auxiliary_loss_clip": 0.01110614, "auxiliary_loss_mlp": 0.02524748, "balance_loss_clip": 1.02908707, "balance_loss_mlp": 0.99987918, "epoch": 0.2446942824505501, "flos": 66997820764800.0, "grad_norm": 0.7525979418515761, "language_loss": 0.64283001, "learning_rate": 3.5358099732993463e-06, "loss": 0.6791836, "num_input_tokens_seen": 43481825, "step": 2035, "time_per_iteration": 3.1334311962127686 }, { "auxiliary_loss_clip": 0.01321297, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 1.06030846, "balance_loss_mlp": 1.02157438, "epoch": 0.2448145253411892, "flos": 20411122792320.0, "grad_norm": 2.0804820566197324, "language_loss": 0.896474, "learning_rate": 3.535310875813535e-06, "loss": 0.91999304, "num_input_tokens_seen": 43500220, "step": 2036, "time_per_iteration": 2.6430015563964844 }, { "auxiliary_loss_clip": 0.01259242, "auxiliary_loss_mlp": 0.01034172, "balance_loss_clip": 1.05697691, "balance_loss_mlp": 1.02481365, "epoch": 0.2449347682318283, "flos": 28804739080320.0, "grad_norm": 2.345070387204629, "language_loss": 0.81579685, "learning_rate": 3.5348115454222843e-06, "loss": 0.83873093, "num_input_tokens_seen": 43522805, "step": 2037, "time_per_iteration": 2.8425345420837402 }, { "auxiliary_loss_clip": 0.01311352, "auxiliary_loss_mlp": 0.01024902, "balance_loss_clip": 1.0552671, "balance_loss_mlp": 1.01633704, "epoch": 0.2450550111224674, "flos": 22528990546560.0, "grad_norm": 2.1641272436421732, "language_loss": 0.86343288, "learning_rate": 3.5343119822013425e-06, "loss": 0.88679546, "num_input_tokens_seen": 43541915, "step": 2038, "time_per_iteration": 2.695773124694824 }, { "auxiliary_loss_clip": 0.01270193, "auxiliary_loss_mlp": 0.01039053, "balance_loss_clip": 1.06030273, "balance_loss_mlp": 1.02937317, "epoch": 0.24517525401310647, "flos": 21759137326080.0, "grad_norm": 1.9116681504461093, "language_loss": 0.77666271, "learning_rate": 3.533812186226493e-06, "loss": 0.7997551, "num_input_tokens_seen": 43562625, "step": 2039, "time_per_iteration": 2.7093608379364014 }, { "auxiliary_loss_clip": 0.01210774, "auxiliary_loss_mlp": 0.01031852, "balance_loss_clip": 1.05957568, "balance_loss_mlp": 1.02365041, "epoch": 0.24529549690374555, "flos": 25043311687680.0, "grad_norm": 1.8954267613516815, "language_loss": 0.76008409, "learning_rate": 3.5333121575735545e-06, "loss": 0.78251034, "num_input_tokens_seen": 43582265, "step": 2040, "time_per_iteration": 3.978466272354126 }, { "auxiliary_loss_clip": 0.01316516, "auxiliary_loss_mlp": 0.01037777, "balance_loss_clip": 1.06169999, "balance_loss_mlp": 1.0292536, "epoch": 0.24541573979438466, "flos": 32123638915200.0, "grad_norm": 2.1234129530789194, "language_loss": 0.75567067, "learning_rate": 3.532811896318381e-06, "loss": 0.77921355, "num_input_tokens_seen": 43604335, "step": 2041, "time_per_iteration": 3.6529383659362793 }, { "auxiliary_loss_clip": 0.01367139, "auxiliary_loss_mlp": 0.01037759, "balance_loss_clip": 1.05795801, "balance_loss_mlp": 1.02789426, "epoch": 0.24553598268502375, "flos": 31357556622720.0, "grad_norm": 3.9333314974963347, "language_loss": 0.81498367, "learning_rate": 3.5323114025368615e-06, "loss": 0.83903265, "num_input_tokens_seen": 43619400, "step": 2042, "time_per_iteration": 2.8374757766723633 }, { "auxiliary_loss_clip": 0.01260902, "auxiliary_loss_mlp": 0.01034766, "balance_loss_clip": 1.05554879, "balance_loss_mlp": 1.02573001, "epoch": 0.24565622557566283, "flos": 14027462824320.0, "grad_norm": 2.389929338981685, "language_loss": 0.81738389, "learning_rate": 3.53181067630492e-06, "loss": 0.84034055, "num_input_tokens_seen": 43636870, "step": 2043, "time_per_iteration": 2.6508355140686035 }, { "auxiliary_loss_clip": 0.01307375, "auxiliary_loss_mlp": 0.01036737, "balance_loss_clip": 1.05759656, "balance_loss_mlp": 1.02748013, "epoch": 0.24577646846630194, "flos": 16581465515520.0, "grad_norm": 2.600404985603515, "language_loss": 0.75845599, "learning_rate": 3.5313097176985175e-06, "loss": 0.78189707, "num_input_tokens_seen": 43655180, "step": 2044, "time_per_iteration": 2.6776084899902344 }, { "auxiliary_loss_clip": 0.01265244, "auxiliary_loss_mlp": 0.01030754, "balance_loss_clip": 1.06186438, "balance_loss_mlp": 1.02203989, "epoch": 0.24589671135694102, "flos": 18807424272000.0, "grad_norm": 1.747399534571349, "language_loss": 0.81243813, "learning_rate": 3.5308085267936482e-06, "loss": 0.83539808, "num_input_tokens_seen": 43672895, "step": 2045, "time_per_iteration": 3.5886030197143555 }, { "auxiliary_loss_clip": 0.01379747, "auxiliary_loss_mlp": 0.025748, "balance_loss_clip": 1.05474663, "balance_loss_mlp": 1.0002563, "epoch": 0.2460169542475801, "flos": 19938538529280.0, "grad_norm": 2.387163053043289, "language_loss": 0.8999666, "learning_rate": 3.530307103666342e-06, "loss": 0.93951201, "num_input_tokens_seen": 43691975, "step": 2046, "time_per_iteration": 2.801833391189575 }, { "auxiliary_loss_clip": 0.01289242, "auxiliary_loss_mlp": 0.01037837, "balance_loss_clip": 1.05882597, "balance_loss_mlp": 1.02866995, "epoch": 0.24613719713821922, "flos": 24171221381760.0, "grad_norm": 1.9452759067769267, "language_loss": 0.80524856, "learning_rate": 3.5298054483926658e-06, "loss": 0.82851934, "num_input_tokens_seen": 43712670, "step": 2047, "time_per_iteration": 2.756282329559326 }, { "auxiliary_loss_clip": 0.01272458, "auxiliary_loss_mlp": 0.01037605, "balance_loss_clip": 1.06160808, "balance_loss_mlp": 1.02802682, "epoch": 0.2462574400288583, "flos": 30221055325440.0, "grad_norm": 2.2521245691047396, "language_loss": 0.82735711, "learning_rate": 3.5293035610487187e-06, "loss": 0.85045779, "num_input_tokens_seen": 43732035, "step": 2048, "time_per_iteration": 2.7086665630340576 }, { "auxiliary_loss_clip": 0.0121328, "auxiliary_loss_mlp": 0.01017324, "balance_loss_clip": 1.02761531, "balance_loss_mlp": 1.01607251, "epoch": 0.24637768291949738, "flos": 68943030819840.0, "grad_norm": 0.725926517229073, "language_loss": 0.61961913, "learning_rate": 3.5288014417106374e-06, "loss": 0.64192522, "num_input_tokens_seen": 43798055, "step": 2049, "time_per_iteration": 3.3027215003967285 }, { "auxiliary_loss_clip": 0.01362284, "auxiliary_loss_mlp": 0.01030611, "balance_loss_clip": 1.05705214, "balance_loss_mlp": 1.02165222, "epoch": 0.24649792581013646, "flos": 34383999922560.0, "grad_norm": 1.9391540580481164, "language_loss": 0.76025289, "learning_rate": 3.528299090454593e-06, "loss": 0.78418183, "num_input_tokens_seen": 43818590, "step": 2050, "time_per_iteration": 3.712082624435425 }, { "auxiliary_loss_clip": 0.01265602, "auxiliary_loss_mlp": 0.01036677, "balance_loss_clip": 1.05806434, "balance_loss_mlp": 1.02678216, "epoch": 0.24661816870077558, "flos": 19680448331520.0, "grad_norm": 2.433731197839955, "language_loss": 0.82634252, "learning_rate": 3.527796507356792e-06, "loss": 0.84936529, "num_input_tokens_seen": 43832480, "step": 2051, "time_per_iteration": 2.8650405406951904 }, { "auxiliary_loss_clip": 0.01187206, "auxiliary_loss_mlp": 0.01033494, "balance_loss_clip": 1.06058455, "balance_loss_mlp": 1.02498877, "epoch": 0.24673841159141466, "flos": 20002279213440.0, "grad_norm": 3.5796500636831614, "language_loss": 0.89902198, "learning_rate": 3.527293692493475e-06, "loss": 0.921229, "num_input_tokens_seen": 43848345, "step": 2052, "time_per_iteration": 2.6718039512634277 }, { "auxiliary_loss_clip": 0.01268805, "auxiliary_loss_mlp": 0.01034749, "balance_loss_clip": 1.05908835, "balance_loss_mlp": 1.02586162, "epoch": 0.24685865448205374, "flos": 21646593037440.0, "grad_norm": 3.0697322782814473, "language_loss": 0.73400229, "learning_rate": 3.52679064594092e-06, "loss": 0.75703788, "num_input_tokens_seen": 43865685, "step": 2053, "time_per_iteration": 2.6493000984191895 }, { "auxiliary_loss_clip": 0.01405501, "auxiliary_loss_mlp": 0.01034306, "balance_loss_clip": 1.04668212, "balance_loss_mlp": 1.02524567, "epoch": 0.24697889737269285, "flos": 17960470508160.0, "grad_norm": 2.656216529934147, "language_loss": 0.74858856, "learning_rate": 3.5262873677754375e-06, "loss": 0.77298653, "num_input_tokens_seen": 43883690, "step": 2054, "time_per_iteration": 2.833987236022949 }, { "auxiliary_loss_clip": 0.01215841, "auxiliary_loss_mlp": 0.01030403, "balance_loss_clip": 1.06273711, "balance_loss_mlp": 1.02198696, "epoch": 0.24709914026333193, "flos": 27344611221120.0, "grad_norm": 2.3586044359649803, "language_loss": 0.80327135, "learning_rate": 3.5257838580733745e-06, "loss": 0.82573378, "num_input_tokens_seen": 43903295, "step": 2055, "time_per_iteration": 2.614351511001587 }, { "auxiliary_loss_clip": 0.01271289, "auxiliary_loss_mlp": 0.01029563, "balance_loss_clip": 1.06232548, "balance_loss_mlp": 1.02093208, "epoch": 0.24721938315397102, "flos": 19275519335040.0, "grad_norm": 1.8678067558067382, "language_loss": 0.872558, "learning_rate": 3.5252801169111138e-06, "loss": 0.89556658, "num_input_tokens_seen": 43920960, "step": 2056, "time_per_iteration": 2.6933176517486572 }, { "auxiliary_loss_clip": 0.01311357, "auxiliary_loss_mlp": 0.01032143, "balance_loss_clip": 1.06051016, "balance_loss_mlp": 1.02333951, "epoch": 0.2473396260446101, "flos": 23185796688000.0, "grad_norm": 1.881456010199446, "language_loss": 0.80060267, "learning_rate": 3.524776144365072e-06, "loss": 0.82403773, "num_input_tokens_seen": 43939415, "step": 2057, "time_per_iteration": 2.70794939994812 }, { "auxiliary_loss_clip": 0.01310598, "auxiliary_loss_mlp": 0.01039729, "balance_loss_clip": 1.06386697, "balance_loss_mlp": 1.03139591, "epoch": 0.2474598689352492, "flos": 21142443697920.0, "grad_norm": 1.5915427943763425, "language_loss": 0.79380101, "learning_rate": 3.5242719405117016e-06, "loss": 0.81730425, "num_input_tokens_seen": 43959220, "step": 2058, "time_per_iteration": 2.7502689361572266 }, { "auxiliary_loss_clip": 0.01325637, "auxiliary_loss_mlp": 0.02577951, "balance_loss_clip": 1.06519699, "balance_loss_mlp": 1.00025189, "epoch": 0.2475801118258883, "flos": 21648352803840.0, "grad_norm": 13.530613666607833, "language_loss": 0.751293, "learning_rate": 3.5237675054274893e-06, "loss": 0.79032892, "num_input_tokens_seen": 43978420, "step": 2059, "time_per_iteration": 2.7842822074890137 }, { "auxiliary_loss_clip": 0.01265437, "auxiliary_loss_mlp": 0.01035208, "balance_loss_clip": 1.0591104, "balance_loss_mlp": 1.02621365, "epoch": 0.24770035471652738, "flos": 22674500542080.0, "grad_norm": 1.9006991718800461, "language_loss": 0.80257404, "learning_rate": 3.5232628391889584e-06, "loss": 0.82558048, "num_input_tokens_seen": 43996710, "step": 2060, "time_per_iteration": 2.6563146114349365 }, { "auxiliary_loss_clip": 0.01406328, "auxiliary_loss_mlp": 0.01036875, "balance_loss_clip": 1.05485487, "balance_loss_mlp": 1.02862573, "epoch": 0.2478205976071665, "flos": 22163814927360.0, "grad_norm": 3.398369017724221, "language_loss": 0.64552951, "learning_rate": 3.522757941872666e-06, "loss": 0.66996145, "num_input_tokens_seen": 44014865, "step": 2061, "time_per_iteration": 2.80312180519104 }, { "auxiliary_loss_clip": 0.01216729, "auxiliary_loss_mlp": 0.02576031, "balance_loss_clip": 1.06388235, "balance_loss_mlp": 1.00015664, "epoch": 0.24794084049780557, "flos": 24973106555520.0, "grad_norm": 1.8342288610258561, "language_loss": 0.82701504, "learning_rate": 3.5222528135552042e-06, "loss": 0.86494255, "num_input_tokens_seen": 44036325, "step": 2062, "time_per_iteration": 2.6286051273345947 }, { "auxiliary_loss_clip": 0.01263599, "auxiliary_loss_mlp": 0.0103872, "balance_loss_clip": 1.06345177, "balance_loss_mlp": 1.02899241, "epoch": 0.24806108338844465, "flos": 18296379521280.0, "grad_norm": 1.884168704698469, "language_loss": 0.80819267, "learning_rate": 3.521747454313201e-06, "loss": 0.8312158, "num_input_tokens_seen": 44055005, "step": 2063, "time_per_iteration": 2.966130018234253 }, { "auxiliary_loss_clip": 0.01356995, "auxiliary_loss_mlp": 0.0104313, "balance_loss_clip": 1.05336332, "balance_loss_mlp": 1.03365314, "epoch": 0.24818132627908374, "flos": 19282163351040.0, "grad_norm": 2.2550812402142735, "language_loss": 0.66750771, "learning_rate": 3.521241864223319e-06, "loss": 0.69150895, "num_input_tokens_seen": 44073965, "step": 2064, "time_per_iteration": 2.7609410285949707 }, { "auxiliary_loss_clip": 0.01229684, "auxiliary_loss_mlp": 0.01005501, "balance_loss_clip": 1.03521538, "balance_loss_mlp": 1.00415981, "epoch": 0.24830156916972285, "flos": 70285837881600.0, "grad_norm": 0.7905224995353267, "language_loss": 0.61910224, "learning_rate": 3.5207360433622552e-06, "loss": 0.6414541, "num_input_tokens_seen": 44135965, "step": 2065, "time_per_iteration": 3.2126245498657227 }, { "auxiliary_loss_clip": 0.01307836, "auxiliary_loss_mlp": 0.01038162, "balance_loss_clip": 1.05751967, "balance_loss_mlp": 1.02943039, "epoch": 0.24842181206036193, "flos": 40409128287360.0, "grad_norm": 1.8369356770645453, "language_loss": 0.74975824, "learning_rate": 3.5202299918067437e-06, "loss": 0.77321815, "num_input_tokens_seen": 44159560, "step": 2066, "time_per_iteration": 3.8206698894500732 }, { "auxiliary_loss_clip": 0.01266233, "auxiliary_loss_mlp": 0.01036264, "balance_loss_clip": 1.06242728, "balance_loss_mlp": 1.02756763, "epoch": 0.248542054951001, "flos": 20082432412800.0, "grad_norm": 2.271287958378783, "language_loss": 0.69314259, "learning_rate": 3.519723709633551e-06, "loss": 0.71616757, "num_input_tokens_seen": 44178320, "step": 2067, "time_per_iteration": 3.4979257583618164 }, { "auxiliary_loss_clip": 0.01309233, "auxiliary_loss_mlp": 0.01036134, "balance_loss_clip": 1.05864716, "balance_loss_mlp": 1.02747929, "epoch": 0.24866229784164012, "flos": 23513948363520.0, "grad_norm": 3.596672403844209, "language_loss": 0.83592469, "learning_rate": 3.519217196919479e-06, "loss": 0.8593784, "num_input_tokens_seen": 44197305, "step": 2068, "time_per_iteration": 2.772091865539551 }, { "auxiliary_loss_clip": 0.01320331, "auxiliary_loss_mlp": 0.01039726, "balance_loss_clip": 1.06511903, "balance_loss_mlp": 1.03062487, "epoch": 0.2487825407322792, "flos": 19865101173120.0, "grad_norm": 2.040256574424427, "language_loss": 0.7362448, "learning_rate": 3.518710453741367e-06, "loss": 0.75984538, "num_input_tokens_seen": 44216505, "step": 2069, "time_per_iteration": 2.7082159519195557 }, { "auxiliary_loss_clip": 0.01306977, "auxiliary_loss_mlp": 0.02577202, "balance_loss_clip": 1.05878472, "balance_loss_mlp": 1.00013542, "epoch": 0.2489027836229183, "flos": 22017622573440.0, "grad_norm": 2.232891729698551, "language_loss": 0.68204629, "learning_rate": 3.518203480176086e-06, "loss": 0.72088808, "num_input_tokens_seen": 44235435, "step": 2070, "time_per_iteration": 2.714195489883423 }, { "auxiliary_loss_clip": 0.01515056, "auxiliary_loss_mlp": 0.01038182, "balance_loss_clip": 1.04883313, "balance_loss_mlp": 1.02938437, "epoch": 0.2490230265135574, "flos": 23294354567040.0, "grad_norm": 1.7380605899636838, "language_loss": 0.80938196, "learning_rate": 3.517696276300545e-06, "loss": 0.83491433, "num_input_tokens_seen": 44256975, "step": 2071, "time_per_iteration": 3.9173824787139893 }, { "auxiliary_loss_clip": 0.01267178, "auxiliary_loss_mlp": 0.01037671, "balance_loss_clip": 1.06422997, "balance_loss_mlp": 1.02857506, "epoch": 0.24914326940419648, "flos": 19826784339840.0, "grad_norm": 2.7710384864394872, "language_loss": 0.69061017, "learning_rate": 3.517188842191685e-06, "loss": 0.71365869, "num_input_tokens_seen": 44275125, "step": 2072, "time_per_iteration": 2.7400810718536377 }, { "auxiliary_loss_clip": 0.01263235, "auxiliary_loss_mlp": 0.01038811, "balance_loss_clip": 1.05906463, "balance_loss_mlp": 1.02979326, "epoch": 0.24926351229483557, "flos": 20229271211520.0, "grad_norm": 1.6441911484839018, "language_loss": 0.74405682, "learning_rate": 3.5166811779264837e-06, "loss": 0.76707733, "num_input_tokens_seen": 44295445, "step": 2073, "time_per_iteration": 2.7787890434265137 }, { "auxiliary_loss_clip": 0.01214691, "auxiliary_loss_mlp": 0.01037386, "balance_loss_clip": 1.06024885, "balance_loss_mlp": 1.0280931, "epoch": 0.24938375518547465, "flos": 23294570048640.0, "grad_norm": 2.931356372241926, "language_loss": 0.78199726, "learning_rate": 3.5161732835819545e-06, "loss": 0.80451804, "num_input_tokens_seen": 44314755, "step": 2074, "time_per_iteration": 2.659229278564453 }, { "auxiliary_loss_clip": 0.01217002, "auxiliary_loss_mlp": 0.01033564, "balance_loss_clip": 1.06382632, "balance_loss_mlp": 1.02474248, "epoch": 0.24950399807611376, "flos": 17311673099520.0, "grad_norm": 2.5786485815105347, "language_loss": 0.83692467, "learning_rate": 3.515665159235143e-06, "loss": 0.85943037, "num_input_tokens_seen": 44333640, "step": 2075, "time_per_iteration": 2.5719804763793945 }, { "auxiliary_loss_clip": 0.01313583, "auxiliary_loss_mlp": 0.01036494, "balance_loss_clip": 1.05682135, "balance_loss_mlp": 1.02792275, "epoch": 0.24962424096675284, "flos": 19024863252480.0, "grad_norm": 1.6004943147790314, "language_loss": 0.75116974, "learning_rate": 3.5151568049631318e-06, "loss": 0.77467048, "num_input_tokens_seen": 44352355, "step": 2076, "time_per_iteration": 3.5829272270202637 }, { "auxiliary_loss_clip": 0.01216331, "auxiliary_loss_mlp": 0.0103621, "balance_loss_clip": 1.0626719, "balance_loss_mlp": 1.02746558, "epoch": 0.24974448385739192, "flos": 33398790710400.0, "grad_norm": 1.8209681672205338, "language_loss": 0.80366945, "learning_rate": 3.5146482208430385e-06, "loss": 0.82619488, "num_input_tokens_seen": 44374185, "step": 2077, "time_per_iteration": 2.6953911781311035 }, { "auxiliary_loss_clip": 0.01451232, "auxiliary_loss_mlp": 0.01042046, "balance_loss_clip": 1.0471431, "balance_loss_mlp": 1.03296268, "epoch": 0.24986472674803104, "flos": 30007279532160.0, "grad_norm": 1.9934084007231414, "language_loss": 0.6764158, "learning_rate": 3.514139406952014e-06, "loss": 0.70134854, "num_input_tokens_seen": 44396210, "step": 2078, "time_per_iteration": 2.9102559089660645 }, { "auxiliary_loss_clip": 0.01262839, "auxiliary_loss_mlp": 0.0103704, "balance_loss_clip": 1.06114376, "balance_loss_mlp": 1.02856374, "epoch": 0.24998496963867012, "flos": 26613074833920.0, "grad_norm": 1.819495316238732, "language_loss": 0.83178645, "learning_rate": 3.5136303633672454e-06, "loss": 0.85478526, "num_input_tokens_seen": 44416340, "step": 2079, "time_per_iteration": 2.685487985610962 }, { "auxiliary_loss_clip": 0.01373109, "auxiliary_loss_mlp": 0.02577556, "balance_loss_clip": 1.05872834, "balance_loss_mlp": 1.00014973, "epoch": 0.25010521252930923, "flos": 23553989049600.0, "grad_norm": 1.984031145120208, "language_loss": 0.7449106, "learning_rate": 3.5131210901659544e-06, "loss": 0.78441727, "num_input_tokens_seen": 44438095, "step": 2080, "time_per_iteration": 2.7983696460723877 }, { "auxiliary_loss_clip": 0.0135926, "auxiliary_loss_mlp": 0.01030811, "balance_loss_clip": 1.05435407, "balance_loss_mlp": 1.02195418, "epoch": 0.2502254554199483, "flos": 23441193365760.0, "grad_norm": 2.5604702567001403, "language_loss": 0.82191527, "learning_rate": 3.5126115874253967e-06, "loss": 0.84581602, "num_input_tokens_seen": 44457650, "step": 2081, "time_per_iteration": 2.787261486053467 }, { "auxiliary_loss_clip": 0.01362858, "auxiliary_loss_mlp": 0.01034645, "balance_loss_clip": 1.06048417, "balance_loss_mlp": 1.02560043, "epoch": 0.2503456983105874, "flos": 28761681651840.0, "grad_norm": 2.275655687113159, "language_loss": 0.81241137, "learning_rate": 3.5121018552228644e-06, "loss": 0.83638638, "num_input_tokens_seen": 44476155, "step": 2082, "time_per_iteration": 2.72613525390625 }, { "auxiliary_loss_clip": 0.01358166, "auxiliary_loss_mlp": 0.01035561, "balance_loss_clip": 1.05580187, "balance_loss_mlp": 1.02657247, "epoch": 0.2504659412012265, "flos": 18770256673920.0, "grad_norm": 2.2809020667008215, "language_loss": 0.76461506, "learning_rate": 3.5115918936356827e-06, "loss": 0.7885524, "num_input_tokens_seen": 44492910, "step": 2083, "time_per_iteration": 2.704159736633301 }, { "auxiliary_loss_clip": 0.01347205, "auxiliary_loss_mlp": 0.01043672, "balance_loss_clip": 1.05248833, "balance_loss_mlp": 1.03479671, "epoch": 0.25058618409186556, "flos": 16873383346560.0, "grad_norm": 3.3356737692011627, "language_loss": 0.78992558, "learning_rate": 3.5110817027412123e-06, "loss": 0.81383431, "num_input_tokens_seen": 44512000, "step": 2084, "time_per_iteration": 2.7328057289123535 }, { "auxiliary_loss_clip": 0.01355304, "auxiliary_loss_mlp": 0.0103647, "balance_loss_clip": 1.05081332, "balance_loss_mlp": 1.02788138, "epoch": 0.25070642698250467, "flos": 24425540651520.0, "grad_norm": 7.720349247777, "language_loss": 0.69066954, "learning_rate": 3.5105712826168493e-06, "loss": 0.71458733, "num_input_tokens_seen": 44531650, "step": 2085, "time_per_iteration": 2.7854232788085938 }, { "auxiliary_loss_clip": 0.01262027, "auxiliary_loss_mlp": 0.02573487, "balance_loss_clip": 1.05865371, "balance_loss_mlp": 1.00013638, "epoch": 0.2508266698731437, "flos": 20260944028800.0, "grad_norm": 2.0461252280800974, "language_loss": 0.70707971, "learning_rate": 3.5100606333400235e-06, "loss": 0.74543488, "num_input_tokens_seen": 44548785, "step": 2086, "time_per_iteration": 2.8493502140045166 }, { "auxiliary_loss_clip": 0.01324959, "auxiliary_loss_mlp": 0.01040409, "balance_loss_clip": 1.05917573, "balance_loss_mlp": 1.03066349, "epoch": 0.25094691276378284, "flos": 19245318975360.0, "grad_norm": 2.381979659880762, "language_loss": 0.77453971, "learning_rate": 3.5095497549882006e-06, "loss": 0.79819334, "num_input_tokens_seen": 44567230, "step": 2087, "time_per_iteration": 2.706061840057373 }, { "auxiliary_loss_clip": 0.01262505, "auxiliary_loss_mlp": 0.01032039, "balance_loss_clip": 1.06166458, "balance_loss_mlp": 1.02312803, "epoch": 0.25106715565442195, "flos": 26943237671040.0, "grad_norm": 2.5951480949247347, "language_loss": 0.72836936, "learning_rate": 3.50903864763888e-06, "loss": 0.75131482, "num_input_tokens_seen": 44588020, "step": 2088, "time_per_iteration": 2.6468725204467773 }, { "auxiliary_loss_clip": 0.01267711, "auxiliary_loss_mlp": 0.01033946, "balance_loss_clip": 1.06054032, "balance_loss_mlp": 1.02467763, "epoch": 0.251187398545061, "flos": 48359570572800.0, "grad_norm": 3.341761868234344, "language_loss": 0.75963843, "learning_rate": 3.5085273113695965e-06, "loss": 0.782655, "num_input_tokens_seen": 44612590, "step": 2089, "time_per_iteration": 3.0599327087402344 }, { "auxiliary_loss_clip": 0.01215827, "auxiliary_loss_mlp": 0.0103577, "balance_loss_clip": 1.06173551, "balance_loss_mlp": 1.02600074, "epoch": 0.2513076414357001, "flos": 27016100409600.0, "grad_norm": 5.370533109864265, "language_loss": 0.78668475, "learning_rate": 3.508015746257919e-06, "loss": 0.80920076, "num_input_tokens_seen": 44631630, "step": 2090, "time_per_iteration": 2.658648729324341 }, { "auxiliary_loss_clip": 0.01367281, "auxiliary_loss_mlp": 0.01034568, "balance_loss_clip": 1.05711246, "balance_loss_mlp": 1.02547836, "epoch": 0.2514278843263392, "flos": 19463619882240.0, "grad_norm": 2.365317586953289, "language_loss": 0.83471346, "learning_rate": 3.5075039523814518e-06, "loss": 0.85873193, "num_input_tokens_seen": 44650820, "step": 2091, "time_per_iteration": 3.559981107711792 }, { "auxiliary_loss_clip": 0.01269705, "auxiliary_loss_mlp": 0.01036562, "balance_loss_clip": 1.05855513, "balance_loss_mlp": 1.02684093, "epoch": 0.2515481272169783, "flos": 16866092885760.0, "grad_norm": 2.5464428688938465, "language_loss": 0.81914479, "learning_rate": 3.506991929817834e-06, "loss": 0.84220743, "num_input_tokens_seen": 44667540, "step": 2092, "time_per_iteration": 2.6777875423431396 }, { "auxiliary_loss_clip": 0.01214903, "auxiliary_loss_mlp": 0.0103677, "balance_loss_clip": 1.06559289, "balance_loss_mlp": 1.02812123, "epoch": 0.2516683701076174, "flos": 23732464752000.0, "grad_norm": 1.7995155141542611, "language_loss": 0.82720071, "learning_rate": 3.506479678644738e-06, "loss": 0.84971744, "num_input_tokens_seen": 44687935, "step": 2093, "time_per_iteration": 3.496401071548462 }, { "auxiliary_loss_clip": 0.01404798, "auxiliary_loss_mlp": 0.01036301, "balance_loss_clip": 1.05598974, "balance_loss_mlp": 1.02740157, "epoch": 0.2517886129982565, "flos": 27635954434560.0, "grad_norm": 10.322329295364929, "language_loss": 0.74012482, "learning_rate": 3.505967198939873e-06, "loss": 0.76453578, "num_input_tokens_seen": 44704975, "step": 2094, "time_per_iteration": 2.777081251144409 }, { "auxiliary_loss_clip": 0.01309739, "auxiliary_loss_mlp": 0.01027965, "balance_loss_clip": 1.05637324, "balance_loss_mlp": 1.0193696, "epoch": 0.25190885588889556, "flos": 38104596529920.0, "grad_norm": 3.519736407577444, "language_loss": 0.7864567, "learning_rate": 3.5054544907809813e-06, "loss": 0.80983377, "num_input_tokens_seen": 44725475, "step": 2095, "time_per_iteration": 2.840287208557129 }, { "auxiliary_loss_clip": 0.0131405, "auxiliary_loss_mlp": 0.02578828, "balance_loss_clip": 1.06157148, "balance_loss_mlp": 1.00014591, "epoch": 0.25202909877953467, "flos": 22269894768000.0, "grad_norm": 2.0835205638558203, "language_loss": 0.80542094, "learning_rate": 3.50494155424584e-06, "loss": 0.84434968, "num_input_tokens_seen": 44744380, "step": 2096, "time_per_iteration": 2.691432237625122 }, { "auxiliary_loss_clip": 0.01269231, "auxiliary_loss_mlp": 0.01037925, "balance_loss_clip": 1.06118941, "balance_loss_mlp": 1.02913332, "epoch": 0.2521493416701738, "flos": 21761759018880.0, "grad_norm": 4.383030156319143, "language_loss": 0.83278072, "learning_rate": 3.504428389412262e-06, "loss": 0.85585225, "num_input_tokens_seen": 44765190, "step": 2097, "time_per_iteration": 3.5834972858428955 }, { "auxiliary_loss_clip": 0.01264733, "auxiliary_loss_mlp": 0.01034409, "balance_loss_clip": 1.05970764, "balance_loss_mlp": 1.02519965, "epoch": 0.25226958456081283, "flos": 27746738956800.0, "grad_norm": 1.9332186087415986, "language_loss": 0.73045325, "learning_rate": 3.5039149963580927e-06, "loss": 0.75344467, "num_input_tokens_seen": 44785210, "step": 2098, "time_per_iteration": 2.6844053268432617 }, { "auxiliary_loss_clip": 0.01308548, "auxiliary_loss_mlp": 0.01027493, "balance_loss_clip": 1.06223452, "balance_loss_mlp": 1.01862955, "epoch": 0.25238982745145194, "flos": 30732171903360.0, "grad_norm": 2.210299983510216, "language_loss": 0.70183492, "learning_rate": 3.503401375161215e-06, "loss": 0.72519535, "num_input_tokens_seen": 44804955, "step": 2099, "time_per_iteration": 2.7357375621795654 }, { "auxiliary_loss_clip": 0.01212898, "auxiliary_loss_mlp": 0.01032462, "balance_loss_clip": 1.06154084, "balance_loss_mlp": 1.02339637, "epoch": 0.252510070342091, "flos": 20266331068800.0, "grad_norm": 1.698581230253737, "language_loss": 0.83839226, "learning_rate": 3.502887525899544e-06, "loss": 0.8608458, "num_input_tokens_seen": 44823935, "step": 2100, "time_per_iteration": 2.6516335010528564 }, { "auxiliary_loss_clip": 0.01315169, "auxiliary_loss_mlp": 0.0103238, "balance_loss_clip": 1.05985165, "balance_loss_mlp": 1.02324879, "epoch": 0.2526303132327301, "flos": 22747399194240.0, "grad_norm": 1.9194913849806892, "language_loss": 0.8302691, "learning_rate": 3.50237344865103e-06, "loss": 0.85374463, "num_input_tokens_seen": 44844935, "step": 2101, "time_per_iteration": 2.732715606689453 }, { "auxiliary_loss_clip": 0.01214603, "auxiliary_loss_mlp": 0.01030996, "balance_loss_clip": 1.06175947, "balance_loss_mlp": 1.021608, "epoch": 0.2527505561233692, "flos": 30263466309120.0, "grad_norm": 3.222781087631139, "language_loss": 0.7661972, "learning_rate": 3.501859143493658e-06, "loss": 0.78865314, "num_input_tokens_seen": 44865565, "step": 2102, "time_per_iteration": 2.7172234058380127 }, { "auxiliary_loss_clip": 0.01111341, "auxiliary_loss_mlp": 0.01022349, "balance_loss_clip": 1.03062022, "balance_loss_mlp": 1.02121055, "epoch": 0.2528707990140083, "flos": 58492917164160.0, "grad_norm": 0.9308656554730081, "language_loss": 0.60547173, "learning_rate": 3.5013446105054488e-06, "loss": 0.62680864, "num_input_tokens_seen": 44918485, "step": 2103, "time_per_iteration": 3.869779586791992 }, { "auxiliary_loss_clip": 0.01349736, "auxiliary_loss_mlp": 0.01028391, "balance_loss_clip": 1.05434442, "balance_loss_mlp": 1.019593, "epoch": 0.2529910419046474, "flos": 24645134448000.0, "grad_norm": 2.4838000500464403, "language_loss": 0.7491219, "learning_rate": 3.5008298497644555e-06, "loss": 0.77290326, "num_input_tokens_seen": 44937530, "step": 2104, "time_per_iteration": 2.7732651233673096 }, { "auxiliary_loss_clip": 0.01282295, "auxiliary_loss_mlp": 0.0102852, "balance_loss_clip": 1.05800617, "balance_loss_mlp": 1.01984775, "epoch": 0.2531112847952865, "flos": 23842135952640.0, "grad_norm": 1.5680943777453873, "language_loss": 0.88067371, "learning_rate": 3.500314861348767e-06, "loss": 0.90378183, "num_input_tokens_seen": 44958165, "step": 2105, "time_per_iteration": 2.741002321243286 }, { "auxiliary_loss_clip": 0.01357787, "auxiliary_loss_mlp": 0.01031964, "balance_loss_clip": 1.05819452, "balance_loss_mlp": 1.02327919, "epoch": 0.25323152768592555, "flos": 16143822207360.0, "grad_norm": 3.0069140865927637, "language_loss": 0.77396315, "learning_rate": 3.499799645336507e-06, "loss": 0.79786068, "num_input_tokens_seen": 44975060, "step": 2106, "time_per_iteration": 2.7395431995391846 }, { "auxiliary_loss_clip": 0.01267866, "auxiliary_loss_mlp": 0.01032729, "balance_loss_clip": 1.06406355, "balance_loss_mlp": 1.02443802, "epoch": 0.25335177057656466, "flos": 28405161210240.0, "grad_norm": 1.806552502484042, "language_loss": 0.8708961, "learning_rate": 3.4992842018058336e-06, "loss": 0.89390206, "num_input_tokens_seen": 44997960, "step": 2107, "time_per_iteration": 2.699760675430298 }, { "auxiliary_loss_clip": 0.01366258, "auxiliary_loss_mlp": 0.0103431, "balance_loss_clip": 1.05774283, "balance_loss_mlp": 1.02613854, "epoch": 0.25347201346720377, "flos": 18799666934400.0, "grad_norm": 2.347927930181925, "language_loss": 0.88648367, "learning_rate": 3.4987685308349384e-06, "loss": 0.91048938, "num_input_tokens_seen": 45015690, "step": 2108, "time_per_iteration": 2.73976469039917 }, { "auxiliary_loss_clip": 0.01364231, "auxiliary_loss_mlp": 0.01030372, "balance_loss_clip": 1.0533793, "balance_loss_mlp": 1.02163339, "epoch": 0.2535922563578428, "flos": 15815490963840.0, "grad_norm": 1.9446106240353969, "language_loss": 0.61257857, "learning_rate": 3.4982526325020497e-06, "loss": 0.63652456, "num_input_tokens_seen": 45032660, "step": 2109, "time_per_iteration": 2.6917262077331543 }, { "auxiliary_loss_clip": 0.01231487, "auxiliary_loss_mlp": 0.01036687, "balance_loss_clip": 1.05866194, "balance_loss_mlp": 1.02841973, "epoch": 0.25371249924848194, "flos": 16318922031360.0, "grad_norm": 3.1136731344595012, "language_loss": 0.82037055, "learning_rate": 3.4977365068854273e-06, "loss": 0.84305227, "num_input_tokens_seen": 45048280, "step": 2110, "time_per_iteration": 2.644906759262085 }, { "auxiliary_loss_clip": 0.01303609, "auxiliary_loss_mlp": 0.01030153, "balance_loss_clip": 1.05538356, "balance_loss_mlp": 1.02087283, "epoch": 0.25383274213912105, "flos": 21761615364480.0, "grad_norm": 1.7421840485645965, "language_loss": 0.73786056, "learning_rate": 3.4972201540633676e-06, "loss": 0.76119816, "num_input_tokens_seen": 45067635, "step": 2111, "time_per_iteration": 2.7079527378082275 }, { "auxiliary_loss_clip": 0.01307567, "auxiliary_loss_mlp": 0.01032727, "balance_loss_clip": 1.05738401, "balance_loss_mlp": 1.02396488, "epoch": 0.2539529850297601, "flos": 21396870708480.0, "grad_norm": 2.001815661947714, "language_loss": 0.85154188, "learning_rate": 3.4967035741142008e-06, "loss": 0.87494487, "num_input_tokens_seen": 45086455, "step": 2112, "time_per_iteration": 2.7456154823303223 }, { "auxiliary_loss_clip": 0.01312543, "auxiliary_loss_mlp": 0.01031012, "balance_loss_clip": 1.06704307, "balance_loss_mlp": 1.02301931, "epoch": 0.2540732279203992, "flos": 25228467319680.0, "grad_norm": 1.9320800262121287, "language_loss": 0.81844884, "learning_rate": 3.4961867671162917e-06, "loss": 0.84188437, "num_input_tokens_seen": 45106385, "step": 2113, "time_per_iteration": 2.8515405654907227 }, { "auxiliary_loss_clip": 0.01216944, "auxiliary_loss_mlp": 0.01026929, "balance_loss_clip": 1.06266534, "balance_loss_mlp": 1.01783895, "epoch": 0.2541934708110383, "flos": 19427386037760.0, "grad_norm": 3.222156962600459, "language_loss": 0.77175558, "learning_rate": 3.4956697331480402e-06, "loss": 0.79419422, "num_input_tokens_seen": 45124955, "step": 2114, "time_per_iteration": 2.629101276397705 }, { "auxiliary_loss_clip": 0.01280014, "auxiliary_loss_mlp": 0.01033114, "balance_loss_clip": 1.05560708, "balance_loss_mlp": 1.02471566, "epoch": 0.2543137137016774, "flos": 23949436855680.0, "grad_norm": 2.4477968651293134, "language_loss": 0.8010484, "learning_rate": 3.495152472287879e-06, "loss": 0.82417965, "num_input_tokens_seen": 45145665, "step": 2115, "time_per_iteration": 2.8409862518310547 }, { "auxiliary_loss_clip": 0.01362869, "auxiliary_loss_mlp": 0.01031495, "balance_loss_clip": 1.05951631, "balance_loss_mlp": 1.02268529, "epoch": 0.2544339565923165, "flos": 25593283802880.0, "grad_norm": 2.1068506950526804, "language_loss": 0.73762524, "learning_rate": 3.4946349846142766e-06, "loss": 0.76156878, "num_input_tokens_seen": 45164805, "step": 2116, "time_per_iteration": 2.777482509613037 }, { "auxiliary_loss_clip": 0.01215275, "auxiliary_loss_mlp": 0.01030958, "balance_loss_clip": 1.06265545, "balance_loss_mlp": 1.02221417, "epoch": 0.25455419948295555, "flos": 21689470897920.0, "grad_norm": 2.0367403871445653, "language_loss": 0.76138771, "learning_rate": 3.4941172702057353e-06, "loss": 0.78385001, "num_input_tokens_seen": 45184865, "step": 2117, "time_per_iteration": 2.6294572353363037 }, { "auxiliary_loss_clip": 0.01311714, "auxiliary_loss_mlp": 0.01035357, "balance_loss_clip": 1.06073093, "balance_loss_mlp": 1.02690554, "epoch": 0.25467444237359466, "flos": 26250341339520.0, "grad_norm": 2.0375575724219854, "language_loss": 0.80608559, "learning_rate": 3.4935993291407924e-06, "loss": 0.82955635, "num_input_tokens_seen": 45203690, "step": 2118, "time_per_iteration": 3.7872841358184814 }, { "auxiliary_loss_clip": 0.01313282, "auxiliary_loss_mlp": 0.01037773, "balance_loss_clip": 1.0598073, "balance_loss_mlp": 1.02858758, "epoch": 0.25479468526423377, "flos": 26979686997120.0, "grad_norm": 2.693397627287194, "language_loss": 0.71834236, "learning_rate": 3.4930811614980183e-06, "loss": 0.74185288, "num_input_tokens_seen": 45225385, "step": 2119, "time_per_iteration": 3.549647331237793 }, { "auxiliary_loss_clip": 0.01261112, "auxiliary_loss_mlp": 0.01033255, "balance_loss_clip": 1.05947089, "balance_loss_mlp": 1.02412987, "epoch": 0.2549149281548728, "flos": 23475811098240.0, "grad_norm": 1.6329885786635523, "language_loss": 0.79339677, "learning_rate": 3.4925627673560198e-06, "loss": 0.81634045, "num_input_tokens_seen": 45246045, "step": 2120, "time_per_iteration": 2.7023160457611084 }, { "auxiliary_loss_clip": 0.01368019, "auxiliary_loss_mlp": 0.01038996, "balance_loss_clip": 1.06162167, "balance_loss_mlp": 1.03008473, "epoch": 0.25503517104551193, "flos": 25812302981760.0, "grad_norm": 1.748343717781985, "language_loss": 0.88977683, "learning_rate": 3.4920441467934357e-06, "loss": 0.91384697, "num_input_tokens_seen": 45266560, "step": 2121, "time_per_iteration": 2.7547953128814697 }, { "auxiliary_loss_clip": 0.0135153, "auxiliary_loss_mlp": 0.01025982, "balance_loss_clip": 1.05479753, "balance_loss_mlp": 1.0174768, "epoch": 0.25515541393615104, "flos": 26645106787200.0, "grad_norm": 3.414135646689568, "language_loss": 0.82969439, "learning_rate": 3.491525299888941e-06, "loss": 0.85346961, "num_input_tokens_seen": 45285405, "step": 2122, "time_per_iteration": 2.8044161796569824 }, { "auxiliary_loss_clip": 0.01204114, "auxiliary_loss_mlp": 0.02521632, "balance_loss_clip": 1.02987528, "balance_loss_mlp": 1.0000689, "epoch": 0.2552756568267901, "flos": 65955945847680.0, "grad_norm": 0.8780792939835298, "language_loss": 0.62616408, "learning_rate": 3.491006226721244e-06, "loss": 0.66342151, "num_input_tokens_seen": 45349615, "step": 2123, "time_per_iteration": 4.208857297897339 }, { "auxiliary_loss_clip": 0.01230002, "auxiliary_loss_mlp": 0.02571317, "balance_loss_clip": 1.06028676, "balance_loss_mlp": 1.00005519, "epoch": 0.2553958997174292, "flos": 17931096161280.0, "grad_norm": 2.0934304193269595, "language_loss": 0.77699196, "learning_rate": 3.4904869273690882e-06, "loss": 0.81500518, "num_input_tokens_seen": 45367505, "step": 2124, "time_per_iteration": 2.639619827270508 }, { "auxiliary_loss_clip": 0.01264669, "auxiliary_loss_mlp": 0.01033935, "balance_loss_clip": 1.0594703, "balance_loss_mlp": 1.0256319, "epoch": 0.2555161426080683, "flos": 23367791923200.0, "grad_norm": 1.953977431628158, "language_loss": 0.89135075, "learning_rate": 3.489967401911251e-06, "loss": 0.9143368, "num_input_tokens_seen": 45386805, "step": 2125, "time_per_iteration": 2.7364842891693115 }, { "auxiliary_loss_clip": 0.01217278, "auxiliary_loss_mlp": 0.01033901, "balance_loss_clip": 1.06302881, "balance_loss_mlp": 1.02516341, "epoch": 0.2556363854987074, "flos": 40625130723840.0, "grad_norm": 1.653358442155824, "language_loss": 0.69801664, "learning_rate": 3.4894476504265428e-06, "loss": 0.72052842, "num_input_tokens_seen": 45411045, "step": 2126, "time_per_iteration": 2.909564971923828 }, { "auxiliary_loss_clip": 0.0115686, "auxiliary_loss_mlp": 0.0100029, "balance_loss_clip": 1.02848577, "balance_loss_mlp": 0.99898452, "epoch": 0.2557566283893465, "flos": 68019443389440.0, "grad_norm": 0.7641333655239898, "language_loss": 0.54396951, "learning_rate": 3.4889276729938104e-06, "loss": 0.56554103, "num_input_tokens_seen": 45469575, "step": 2127, "time_per_iteration": 3.2201597690582275 }, { "auxiliary_loss_clip": 0.01308352, "auxiliary_loss_mlp": 0.01037915, "balance_loss_clip": 1.05929077, "balance_loss_mlp": 1.02974296, "epoch": 0.2558768712799856, "flos": 22635645004800.0, "grad_norm": 1.8839439402009803, "language_loss": 0.80878019, "learning_rate": 3.488407469691934e-06, "loss": 0.83224297, "num_input_tokens_seen": 45490270, "step": 2128, "time_per_iteration": 3.6668334007263184 }, { "auxiliary_loss_clip": 0.01310863, "auxiliary_loss_mlp": 0.01037452, "balance_loss_clip": 1.05634379, "balance_loss_mlp": 1.02807057, "epoch": 0.25599711417062465, "flos": 26396354125440.0, "grad_norm": 2.1059810600546074, "language_loss": 0.80860889, "learning_rate": 3.487887040599828e-06, "loss": 0.83209205, "num_input_tokens_seen": 45510070, "step": 2129, "time_per_iteration": 2.7118170261383057 }, { "auxiliary_loss_clip": 0.01216608, "auxiliary_loss_mlp": 0.01032992, "balance_loss_clip": 1.06557226, "balance_loss_mlp": 1.02480292, "epoch": 0.25611735706126376, "flos": 22852042490880.0, "grad_norm": 2.550759557262873, "language_loss": 0.76030374, "learning_rate": 3.4873663857964407e-06, "loss": 0.78279978, "num_input_tokens_seen": 45527285, "step": 2130, "time_per_iteration": 2.6365292072296143 }, { "auxiliary_loss_clip": 0.01416604, "auxiliary_loss_mlp": 0.01029932, "balance_loss_clip": 1.05992651, "balance_loss_mlp": 1.02133739, "epoch": 0.2562375999519028, "flos": 23367863750400.0, "grad_norm": 1.9092480929834914, "language_loss": 0.6660322, "learning_rate": 3.4868455053607556e-06, "loss": 0.69049752, "num_input_tokens_seen": 45546900, "step": 2131, "time_per_iteration": 2.8564846515655518 }, { "auxiliary_loss_clip": 0.0126667, "auxiliary_loss_mlp": 0.01032539, "balance_loss_clip": 1.05996919, "balance_loss_mlp": 1.02335978, "epoch": 0.2563578428425419, "flos": 22856962654080.0, "grad_norm": 5.453924127160086, "language_loss": 0.71404397, "learning_rate": 3.486324399371789e-06, "loss": 0.73703611, "num_input_tokens_seen": 45566200, "step": 2132, "time_per_iteration": 2.7104945182800293 }, { "auxiliary_loss_clip": 0.01354298, "auxiliary_loss_mlp": 0.01033022, "balance_loss_clip": 1.05785, "balance_loss_mlp": 1.02517796, "epoch": 0.25647808573318104, "flos": 21653883498240.0, "grad_norm": 1.9516594667820069, "language_loss": 0.78773093, "learning_rate": 3.485803067908593e-06, "loss": 0.81160414, "num_input_tokens_seen": 45585710, "step": 2133, "time_per_iteration": 2.849024534225464 }, { "auxiliary_loss_clip": 0.0144156, "auxiliary_loss_mlp": 0.01036042, "balance_loss_clip": 1.04723954, "balance_loss_mlp": 1.0268513, "epoch": 0.2565983286238201, "flos": 33730569659520.0, "grad_norm": 2.278550821955213, "language_loss": 0.7987932, "learning_rate": 3.485281511050253e-06, "loss": 0.82356924, "num_input_tokens_seen": 45607845, "step": 2134, "time_per_iteration": 2.8973453044891357 }, { "auxiliary_loss_clip": 0.01178337, "auxiliary_loss_mlp": 0.0102757, "balance_loss_clip": 1.05980527, "balance_loss_mlp": 1.01973486, "epoch": 0.2567185715144592, "flos": 16216002587520.0, "grad_norm": 3.351097008584336, "language_loss": 0.89820993, "learning_rate": 3.484759728875889e-06, "loss": 0.92026901, "num_input_tokens_seen": 45623210, "step": 2135, "time_per_iteration": 2.6023073196411133 }, { "auxiliary_loss_clip": 0.0140278, "auxiliary_loss_mlp": 0.01032962, "balance_loss_clip": 1.05456519, "balance_loss_mlp": 1.02492762, "epoch": 0.2568388144050983, "flos": 17458475984640.0, "grad_norm": 2.013686792094541, "language_loss": 0.81158507, "learning_rate": 3.4842377214646543e-06, "loss": 0.83594251, "num_input_tokens_seen": 45641505, "step": 2136, "time_per_iteration": 2.718172788619995 }, { "auxiliary_loss_clip": 0.01212453, "auxiliary_loss_mlp": 0.01033329, "balance_loss_clip": 1.06246853, "balance_loss_mlp": 1.02512753, "epoch": 0.25695905729573737, "flos": 20887442069760.0, "grad_norm": 2.212225082069032, "language_loss": 0.66970968, "learning_rate": 3.483715488895737e-06, "loss": 0.69216752, "num_input_tokens_seen": 45661835, "step": 2137, "time_per_iteration": 2.633713960647583 }, { "auxiliary_loss_clip": 0.01406867, "auxiliary_loss_mlp": 0.01028723, "balance_loss_clip": 1.05231833, "balance_loss_mlp": 1.02032495, "epoch": 0.2570793001863765, "flos": 24717278914560.0, "grad_norm": 1.8359237349706017, "language_loss": 0.78877747, "learning_rate": 3.48319303124836e-06, "loss": 0.81313348, "num_input_tokens_seen": 45682215, "step": 2138, "time_per_iteration": 2.7595648765563965 }, { "auxiliary_loss_clip": 0.01317327, "auxiliary_loss_mlp": 0.01034217, "balance_loss_clip": 1.06658173, "balance_loss_mlp": 1.02578926, "epoch": 0.2571995430770156, "flos": 26906896085760.0, "grad_norm": 2.057350873921361, "language_loss": 0.67038524, "learning_rate": 3.4826703486017798e-06, "loss": 0.6939007, "num_input_tokens_seen": 45701840, "step": 2139, "time_per_iteration": 2.6950290203094482 }, { "auxiliary_loss_clip": 0.0125704, "auxiliary_loss_mlp": 0.01028535, "balance_loss_clip": 1.06164169, "balance_loss_mlp": 1.02042866, "epoch": 0.25731978596765465, "flos": 19792561656960.0, "grad_norm": 1.6260119460030162, "language_loss": 0.76814294, "learning_rate": 3.4821474410352867e-06, "loss": 0.79099876, "num_input_tokens_seen": 45720500, "step": 2140, "time_per_iteration": 2.6531729698181152 }, { "auxiliary_loss_clip": 0.01197909, "auxiliary_loss_mlp": 0.01003125, "balance_loss_clip": 1.04515016, "balance_loss_mlp": 1.00176001, "epoch": 0.25744002885829376, "flos": 70564970471040.0, "grad_norm": 0.895463969643903, "language_loss": 0.62661147, "learning_rate": 3.481624308628205e-06, "loss": 0.6486218, "num_input_tokens_seen": 45781870, "step": 2141, "time_per_iteration": 3.3645691871643066 }, { "auxiliary_loss_clip": 0.01311375, "auxiliary_loss_mlp": 0.01030189, "balance_loss_clip": 1.05718839, "balance_loss_mlp": 1.0214088, "epoch": 0.25756027174893287, "flos": 18038181582720.0, "grad_norm": 2.6627487861583616, "language_loss": 1.00097632, "learning_rate": 3.481100951459893e-06, "loss": 1.02439189, "num_input_tokens_seen": 45794890, "step": 2142, "time_per_iteration": 2.7363739013671875 }, { "auxiliary_loss_clip": 0.01255906, "auxiliary_loss_mlp": 0.01028209, "balance_loss_clip": 1.05988514, "balance_loss_mlp": 1.01995385, "epoch": 0.2576805146395719, "flos": 22674069578880.0, "grad_norm": 1.614839190866429, "language_loss": 0.78698885, "learning_rate": 3.4805773696097453e-06, "loss": 0.80983007, "num_input_tokens_seen": 45815780, "step": 2143, "time_per_iteration": 3.542092800140381 }, { "auxiliary_loss_clip": 0.01311008, "auxiliary_loss_mlp": 0.01030036, "balance_loss_clip": 1.06189549, "balance_loss_mlp": 1.02152455, "epoch": 0.25780075753021103, "flos": 16472225278080.0, "grad_norm": 4.0727654795966455, "language_loss": 0.8760947, "learning_rate": 3.4800535631571874e-06, "loss": 0.89950514, "num_input_tokens_seen": 45831310, "step": 2144, "time_per_iteration": 2.670546293258667 }, { "auxiliary_loss_clip": 0.01312193, "auxiliary_loss_mlp": 0.01028991, "balance_loss_clip": 1.05746603, "balance_loss_mlp": 1.02037275, "epoch": 0.25792100042085014, "flos": 22820297846400.0, "grad_norm": 2.111043317527862, "language_loss": 0.76303661, "learning_rate": 3.4795295321816804e-06, "loss": 0.78644848, "num_input_tokens_seen": 45850135, "step": 2145, "time_per_iteration": 3.582796812057495 }, { "auxiliary_loss_clip": 0.01300538, "auxiliary_loss_mlp": 0.01026259, "balance_loss_clip": 1.0535419, "balance_loss_mlp": 1.01777732, "epoch": 0.2580412433114892, "flos": 18697286194560.0, "grad_norm": 2.218194824346025, "language_loss": 0.91104865, "learning_rate": 3.47900527676272e-06, "loss": 0.93431664, "num_input_tokens_seen": 45868470, "step": 2146, "time_per_iteration": 2.6335608959198 }, { "auxiliary_loss_clip": 0.01213455, "auxiliary_loss_mlp": 0.01032263, "balance_loss_clip": 1.06459355, "balance_loss_mlp": 1.02401996, "epoch": 0.2581614862021283, "flos": 14283146810880.0, "grad_norm": 2.3601264285681354, "language_loss": 0.8820107, "learning_rate": 3.478480796979835e-06, "loss": 0.90446788, "num_input_tokens_seen": 45886355, "step": 2147, "time_per_iteration": 2.586411952972412 }, { "auxiliary_loss_clip": 0.01305246, "auxiliary_loss_mlp": 0.01027692, "balance_loss_clip": 1.05886507, "balance_loss_mlp": 1.01950192, "epoch": 0.25828172909276736, "flos": 29498281856640.0, "grad_norm": 1.5552183395315247, "language_loss": 0.77801859, "learning_rate": 3.4779560929125894e-06, "loss": 0.80134797, "num_input_tokens_seen": 45907900, "step": 2148, "time_per_iteration": 2.750880241394043 }, { "auxiliary_loss_clip": 0.01260602, "auxiliary_loss_mlp": 0.0100086, "balance_loss_clip": 1.02807033, "balance_loss_mlp": 0.99957818, "epoch": 0.2584019719834065, "flos": 67114387376640.0, "grad_norm": 0.6644409971108939, "language_loss": 0.56896412, "learning_rate": 3.4774311646405783e-06, "loss": 0.59157872, "num_input_tokens_seen": 45977805, "step": 2149, "time_per_iteration": 4.318928480148315 }, { "auxiliary_loss_clip": 0.01350184, "auxiliary_loss_mlp": 0.01035975, "balance_loss_clip": 1.0511831, "balance_loss_mlp": 1.02812481, "epoch": 0.2585222148740456, "flos": 22893555634560.0, "grad_norm": 2.2999836092756785, "language_loss": 0.83569765, "learning_rate": 3.476906012243435e-06, "loss": 0.85955924, "num_input_tokens_seen": 45996715, "step": 2150, "time_per_iteration": 2.7770752906799316 }, { "auxiliary_loss_clip": 0.01256421, "auxiliary_loss_mlp": 0.01035196, "balance_loss_clip": 1.06003165, "balance_loss_mlp": 1.02651143, "epoch": 0.25864245776468464, "flos": 28909202808960.0, "grad_norm": 1.5561622398827049, "language_loss": 0.81080431, "learning_rate": 3.476380635800824e-06, "loss": 0.83372045, "num_input_tokens_seen": 46017915, "step": 2151, "time_per_iteration": 2.696713447570801 }, { "auxiliary_loss_clip": 0.01304782, "auxiliary_loss_mlp": 0.01036275, "balance_loss_clip": 1.05850255, "balance_loss_mlp": 1.02723861, "epoch": 0.25876270065532375, "flos": 14793185980800.0, "grad_norm": 5.912714476774329, "language_loss": 0.86369789, "learning_rate": 3.475855035392444e-06, "loss": 0.88710845, "num_input_tokens_seen": 46033235, "step": 2152, "time_per_iteration": 2.702138900756836 }, { "auxiliary_loss_clip": 0.01370675, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.05369925, "balance_loss_mlp": 1.02542794, "epoch": 0.25888294354596286, "flos": 60467821810560.0, "grad_norm": 2.0529203924295523, "language_loss": 0.71160114, "learning_rate": 3.475329211098029e-06, "loss": 0.73564482, "num_input_tokens_seen": 46056390, "step": 2153, "time_per_iteration": 3.126304864883423 }, { "auxiliary_loss_clip": 0.01411115, "auxiliary_loss_mlp": 0.01036465, "balance_loss_clip": 1.05805981, "balance_loss_mlp": 1.02750611, "epoch": 0.2590031864366019, "flos": 27851166771840.0, "grad_norm": 1.9289744925876198, "language_loss": 0.82287586, "learning_rate": 3.4748031629973453e-06, "loss": 0.84735161, "num_input_tokens_seen": 46077120, "step": 2154, "time_per_iteration": 3.882110118865967 }, { "auxiliary_loss_clip": 0.01306306, "auxiliary_loss_mlp": 0.00999177, "balance_loss_clip": 1.02674031, "balance_loss_mlp": 0.99797863, "epoch": 0.25912342932724103, "flos": 62422444206720.0, "grad_norm": 1.085221336813201, "language_loss": 0.56521481, "learning_rate": 3.4742768911701944e-06, "loss": 0.58826965, "num_input_tokens_seen": 46139815, "step": 2155, "time_per_iteration": 3.651494264602661 }, { "auxiliary_loss_clip": 0.0126657, "auxiliary_loss_mlp": 0.01033752, "balance_loss_clip": 1.06186259, "balance_loss_mlp": 1.02460885, "epoch": 0.25924367221788014, "flos": 12378839368320.0, "grad_norm": 3.928310236166233, "language_loss": 0.70375329, "learning_rate": 3.4737503956964113e-06, "loss": 0.72675651, "num_input_tokens_seen": 46152120, "step": 2156, "time_per_iteration": 2.722292900085449 }, { "auxiliary_loss_clip": 0.01308795, "auxiliary_loss_mlp": 0.01039177, "balance_loss_clip": 1.05649471, "balance_loss_mlp": 1.02971768, "epoch": 0.2593639151085192, "flos": 14575208296320.0, "grad_norm": 2.1934606914579065, "language_loss": 0.67327601, "learning_rate": 3.473223676655865e-06, "loss": 0.69675577, "num_input_tokens_seen": 46170120, "step": 2157, "time_per_iteration": 2.6231279373168945 }, { "auxiliary_loss_clip": 0.01308309, "auxiliary_loss_mlp": 0.01033456, "balance_loss_clip": 1.05446088, "balance_loss_mlp": 1.02422929, "epoch": 0.2594841579991583, "flos": 15230937029760.0, "grad_norm": 1.9500598531246045, "language_loss": 0.80168486, "learning_rate": 3.472696734128459e-06, "loss": 0.82510245, "num_input_tokens_seen": 46187985, "step": 2158, "time_per_iteration": 2.6766035556793213 }, { "auxiliary_loss_clip": 0.01260596, "auxiliary_loss_mlp": 0.01034863, "balance_loss_clip": 1.0606643, "balance_loss_mlp": 1.02574325, "epoch": 0.2596044008897974, "flos": 23623583650560.0, "grad_norm": 1.848576009106137, "language_loss": 0.75996858, "learning_rate": 3.4721695681941286e-06, "loss": 0.78292316, "num_input_tokens_seen": 46207025, "step": 2159, "time_per_iteration": 2.702679395675659 }, { "auxiliary_loss_clip": 0.01309242, "auxiliary_loss_mlp": 0.02575626, "balance_loss_clip": 1.05899811, "balance_loss_mlp": 1.00029778, "epoch": 0.25972464378043647, "flos": 13772281628160.0, "grad_norm": 2.8692325798842293, "language_loss": 0.8181957, "learning_rate": 3.471642178932845e-06, "loss": 0.8570444, "num_input_tokens_seen": 46225670, "step": 2160, "time_per_iteration": 2.769883871078491 }, { "auxiliary_loss_clip": 0.01309373, "auxiliary_loss_mlp": 0.01029832, "balance_loss_clip": 1.0546844, "balance_loss_mlp": 1.02102208, "epoch": 0.2598448866710756, "flos": 19573578391680.0, "grad_norm": 2.5708488598250714, "language_loss": 0.89839524, "learning_rate": 3.471114566424613e-06, "loss": 0.92178726, "num_input_tokens_seen": 46244130, "step": 2161, "time_per_iteration": 2.6563563346862793 }, { "auxiliary_loss_clip": 0.01307689, "auxiliary_loss_mlp": 0.01025996, "balance_loss_clip": 1.06100154, "balance_loss_mlp": 1.01767516, "epoch": 0.25996512956171464, "flos": 21653237053440.0, "grad_norm": 1.9000342737733447, "language_loss": 0.7587713, "learning_rate": 3.4705867307494715e-06, "loss": 0.78210819, "num_input_tokens_seen": 46263200, "step": 2162, "time_per_iteration": 2.7112088203430176 }, { "auxiliary_loss_clip": 0.01262551, "auxiliary_loss_mlp": 0.01031855, "balance_loss_clip": 1.05742741, "balance_loss_mlp": 1.02325368, "epoch": 0.26008537245235375, "flos": 18223480869120.0, "grad_norm": 2.2682545951961335, "language_loss": 0.84973633, "learning_rate": 3.470058671987492e-06, "loss": 0.87268043, "num_input_tokens_seen": 46281465, "step": 2163, "time_per_iteration": 2.6640963554382324 }, { "auxiliary_loss_clip": 0.01265128, "auxiliary_loss_mlp": 0.01034542, "balance_loss_clip": 1.058393, "balance_loss_mlp": 1.02576852, "epoch": 0.26020561534299286, "flos": 24645385843200.0, "grad_norm": 2.2252646183173064, "language_loss": 0.84596586, "learning_rate": 3.4695303902187805e-06, "loss": 0.86896259, "num_input_tokens_seen": 46301020, "step": 2164, "time_per_iteration": 2.6923372745513916 }, { "auxiliary_loss_clip": 0.01275174, "auxiliary_loss_mlp": 0.01033624, "balance_loss_clip": 1.05404139, "balance_loss_mlp": 1.0255239, "epoch": 0.2603258582336319, "flos": 25773662926080.0, "grad_norm": 2.10416482096449, "language_loss": 0.78618997, "learning_rate": 3.469001885523478e-06, "loss": 0.80927801, "num_input_tokens_seen": 46321740, "step": 2165, "time_per_iteration": 2.7587087154388428 }, { "auxiliary_loss_clip": 0.01211113, "auxiliary_loss_mlp": 0.01035955, "balance_loss_clip": 1.06160021, "balance_loss_mlp": 1.0275625, "epoch": 0.260446101124271, "flos": 28766314506240.0, "grad_norm": 1.9594474788474612, "language_loss": 0.81260365, "learning_rate": 3.4684731579817568e-06, "loss": 0.83507437, "num_input_tokens_seen": 46342730, "step": 2166, "time_per_iteration": 2.678591251373291 }, { "auxiliary_loss_clip": 0.0145181, "auxiliary_loss_mlp": 0.01031649, "balance_loss_clip": 1.05309439, "balance_loss_mlp": 1.02300656, "epoch": 0.26056634401491013, "flos": 25666757072640.0, "grad_norm": 1.6135391186986425, "language_loss": 0.76677185, "learning_rate": 3.4679442076738247e-06, "loss": 0.79160643, "num_input_tokens_seen": 46362445, "step": 2167, "time_per_iteration": 2.8043630123138428 }, { "auxiliary_loss_clip": 0.01212493, "auxiliary_loss_mlp": 0.01030858, "balance_loss_clip": 1.06074166, "balance_loss_mlp": 1.02132702, "epoch": 0.2606865869055492, "flos": 27052765217280.0, "grad_norm": 3.5028602317290782, "language_loss": 0.83535039, "learning_rate": 3.4674150346799245e-06, "loss": 0.85778391, "num_input_tokens_seen": 46382145, "step": 2168, "time_per_iteration": 2.655402183532715 }, { "auxiliary_loss_clip": 0.01310796, "auxiliary_loss_mlp": 0.01028799, "balance_loss_clip": 1.05978656, "balance_loss_mlp": 1.0195787, "epoch": 0.2608068297961883, "flos": 17712615686400.0, "grad_norm": 2.2155517150885915, "language_loss": 0.80461729, "learning_rate": 3.4668856390803295e-06, "loss": 0.82801324, "num_input_tokens_seen": 46400025, "step": 2169, "time_per_iteration": 3.6878297328948975 }, { "auxiliary_loss_clip": 0.01249905, "auxiliary_loss_mlp": 0.01028283, "balance_loss_clip": 1.05549598, "balance_loss_mlp": 1.01956844, "epoch": 0.2609270726868274, "flos": 18551632544640.0, "grad_norm": 2.0342034949662007, "language_loss": 0.89934254, "learning_rate": 3.4663560209553495e-06, "loss": 0.92212439, "num_input_tokens_seen": 46418090, "step": 2170, "time_per_iteration": 2.6436030864715576 }, { "auxiliary_loss_clip": 0.01306574, "auxiliary_loss_mlp": 0.01041673, "balance_loss_clip": 1.05713665, "balance_loss_mlp": 1.03242266, "epoch": 0.26104731557746647, "flos": 21835699165440.0, "grad_norm": 1.8700466334659973, "language_loss": 0.79644406, "learning_rate": 3.4658261803853267e-06, "loss": 0.81992656, "num_input_tokens_seen": 46436015, "step": 2171, "time_per_iteration": 3.6166367530822754 }, { "auxiliary_loss_clip": 0.01305467, "auxiliary_loss_mlp": 0.01032096, "balance_loss_clip": 1.0581708, "balance_loss_mlp": 1.02310765, "epoch": 0.2611675584681056, "flos": 21689650465920.0, "grad_norm": 4.396741786005645, "language_loss": 0.80692291, "learning_rate": 3.4652961174506383e-06, "loss": 0.8302986, "num_input_tokens_seen": 46455885, "step": 2172, "time_per_iteration": 2.6488633155822754 }, { "auxiliary_loss_clip": 0.01153297, "auxiliary_loss_mlp": 0.01000152, "balance_loss_clip": 1.02814722, "balance_loss_mlp": 0.99893612, "epoch": 0.2612878013587447, "flos": 71862101389440.0, "grad_norm": 0.9632052947704106, "language_loss": 0.58078551, "learning_rate": 3.464765832231694e-06, "loss": 0.60232008, "num_input_tokens_seen": 46510050, "step": 2173, "time_per_iteration": 3.1993191242218018 }, { "auxiliary_loss_clip": 0.01259552, "auxiliary_loss_mlp": 0.01030994, "balance_loss_clip": 1.05998707, "balance_loss_mlp": 1.02280402, "epoch": 0.26140804424938374, "flos": 20227511445120.0, "grad_norm": 1.896866464081546, "language_loss": 0.7088238, "learning_rate": 3.4642353248089373e-06, "loss": 0.73172927, "num_input_tokens_seen": 46528810, "step": 2174, "time_per_iteration": 2.6495962142944336 }, { "auxiliary_loss_clip": 0.01304748, "auxiliary_loss_mlp": 0.01036282, "balance_loss_clip": 1.05709219, "balance_loss_mlp": 1.0272398, "epoch": 0.26152828714002285, "flos": 25557085872000.0, "grad_norm": 1.904770715445126, "language_loss": 0.80498481, "learning_rate": 3.463704595262846e-06, "loss": 0.82839513, "num_input_tokens_seen": 46549690, "step": 2175, "time_per_iteration": 3.6258015632629395 }, { "auxiliary_loss_clip": 0.01357444, "auxiliary_loss_mlp": 0.01032524, "balance_loss_clip": 1.05771828, "balance_loss_mlp": 1.02434587, "epoch": 0.26164853003066196, "flos": 25446516831360.0, "grad_norm": 2.730367454747495, "language_loss": 0.70706838, "learning_rate": 3.463173643673931e-06, "loss": 0.73096806, "num_input_tokens_seen": 46572215, "step": 2176, "time_per_iteration": 2.8494210243225098 }, { "auxiliary_loss_clip": 0.01164179, "auxiliary_loss_mlp": 0.01001995, "balance_loss_clip": 1.02827978, "balance_loss_mlp": 1.00076151, "epoch": 0.261768772921301, "flos": 53944580568960.0, "grad_norm": 0.8990415152395262, "language_loss": 0.6352731, "learning_rate": 3.4626424701227387e-06, "loss": 0.65693486, "num_input_tokens_seen": 46627275, "step": 2177, "time_per_iteration": 3.1439626216888428 }, { "auxiliary_loss_clip": 0.01108413, "auxiliary_loss_mlp": 0.01002137, "balance_loss_clip": 1.029392, "balance_loss_mlp": 1.00090909, "epoch": 0.26188901581194013, "flos": 70687606481280.0, "grad_norm": 0.8255215866016168, "language_loss": 0.55822289, "learning_rate": 3.4621110746898452e-06, "loss": 0.57932842, "num_input_tokens_seen": 46695135, "step": 2178, "time_per_iteration": 3.3095543384552 }, { "auxiliary_loss_clip": 0.01263895, "auxiliary_loss_mlp": 0.01028785, "balance_loss_clip": 1.05957651, "balance_loss_mlp": 1.02010632, "epoch": 0.2620092587025792, "flos": 21069580959360.0, "grad_norm": 1.5898420584295405, "language_loss": 0.74800754, "learning_rate": 3.4615794574558654e-06, "loss": 0.77093434, "num_input_tokens_seen": 46714145, "step": 2179, "time_per_iteration": 2.6709303855895996 }, { "auxiliary_loss_clip": 0.01306139, "auxiliary_loss_mlp": 0.01029537, "balance_loss_clip": 1.05608845, "balance_loss_mlp": 1.02104378, "epoch": 0.2621295015932183, "flos": 18369601395840.0, "grad_norm": 4.342899274853047, "language_loss": 0.8427195, "learning_rate": 3.4610476185014436e-06, "loss": 0.86607623, "num_input_tokens_seen": 46731405, "step": 2180, "time_per_iteration": 3.6585946083068848 }, { "auxiliary_loss_clip": 0.01212546, "auxiliary_loss_mlp": 0.01028524, "balance_loss_clip": 1.06013799, "balance_loss_mlp": 1.01916647, "epoch": 0.2622497444838574, "flos": 23659997063040.0, "grad_norm": 2.3235009259842125, "language_loss": 0.79412639, "learning_rate": 3.4605155579072597e-06, "loss": 0.81653702, "num_input_tokens_seen": 46751260, "step": 2181, "time_per_iteration": 2.668334722518921 }, { "auxiliary_loss_clip": 0.01410866, "auxiliary_loss_mlp": 0.01028656, "balance_loss_clip": 1.05399191, "balance_loss_mlp": 1.01980472, "epoch": 0.26236998737449646, "flos": 22123810154880.0, "grad_norm": 1.8668416330993023, "language_loss": 0.71226531, "learning_rate": 3.459983275754027e-06, "loss": 0.73666054, "num_input_tokens_seen": 46770155, "step": 2182, "time_per_iteration": 2.822453498840332 }, { "auxiliary_loss_clip": 0.01209594, "auxiliary_loss_mlp": 0.01028298, "balance_loss_clip": 1.06019068, "balance_loss_mlp": 1.01945853, "epoch": 0.26249023026513557, "flos": 17895185539200.0, "grad_norm": 2.413608914997237, "language_loss": 0.79700464, "learning_rate": 3.4594507721224918e-06, "loss": 0.8193835, "num_input_tokens_seen": 46788805, "step": 2183, "time_per_iteration": 2.586663007736206 }, { "auxiliary_loss_clip": 0.01310299, "auxiliary_loss_mlp": 0.01032503, "balance_loss_clip": 1.05669332, "balance_loss_mlp": 1.02420604, "epoch": 0.2626104731557747, "flos": 18332936588160.0, "grad_norm": 1.6957093972117632, "language_loss": 0.82224262, "learning_rate": 3.4589180470934353e-06, "loss": 0.84567058, "num_input_tokens_seen": 46808670, "step": 2184, "time_per_iteration": 2.7564425468444824 }, { "auxiliary_loss_clip": 0.01178322, "auxiliary_loss_mlp": 0.01033737, "balance_loss_clip": 1.0558008, "balance_loss_mlp": 1.0244205, "epoch": 0.26273071604641374, "flos": 19317714837120.0, "grad_norm": 2.1181531397235376, "language_loss": 0.7675845, "learning_rate": 3.4583851007476713e-06, "loss": 0.78970516, "num_input_tokens_seen": 46827140, "step": 2185, "time_per_iteration": 2.6043379306793213 }, { "auxiliary_loss_clip": 0.01359876, "auxiliary_loss_mlp": 0.01032246, "balance_loss_clip": 1.05735862, "balance_loss_mlp": 1.02301311, "epoch": 0.26285095893705285, "flos": 18327477720960.0, "grad_norm": 2.191422171046224, "language_loss": 0.68511546, "learning_rate": 3.4578519331660464e-06, "loss": 0.70903671, "num_input_tokens_seen": 46844135, "step": 2186, "time_per_iteration": 2.7517526149749756 }, { "auxiliary_loss_clip": 0.01256012, "auxiliary_loss_mlp": 0.01030606, "balance_loss_clip": 1.06010413, "balance_loss_mlp": 1.02245498, "epoch": 0.26297120182769196, "flos": 20193827466240.0, "grad_norm": 2.020668483165944, "language_loss": 0.82321018, "learning_rate": 3.4573185444294426e-06, "loss": 0.84607637, "num_input_tokens_seen": 46862500, "step": 2187, "time_per_iteration": 2.6699087619781494 }, { "auxiliary_loss_clip": 0.01311753, "auxiliary_loss_mlp": 0.02576972, "balance_loss_clip": 1.05778396, "balance_loss_mlp": 1.00027156, "epoch": 0.263091444718331, "flos": 22418421505920.0, "grad_norm": 1.6966713552667727, "language_loss": 0.78527445, "learning_rate": 3.456784934618774e-06, "loss": 0.82416165, "num_input_tokens_seen": 46883665, "step": 2188, "time_per_iteration": 2.676276922225952 }, { "auxiliary_loss_clip": 0.01308881, "auxiliary_loss_mlp": 0.0103333, "balance_loss_clip": 1.05561686, "balance_loss_mlp": 1.02531922, "epoch": 0.2632116876089701, "flos": 19024827338880.0, "grad_norm": 2.5754527447692697, "language_loss": 0.80148149, "learning_rate": 3.4562511038149897e-06, "loss": 0.82490361, "num_input_tokens_seen": 46899160, "step": 2189, "time_per_iteration": 2.6550188064575195 }, { "auxiliary_loss_clip": 0.01312204, "auxiliary_loss_mlp": 0.01000158, "balance_loss_clip": 1.02466989, "balance_loss_mlp": 0.99899596, "epoch": 0.26333193049960923, "flos": 67308054531840.0, "grad_norm": 0.8585703582039322, "language_loss": 0.57713628, "learning_rate": 3.4557170520990705e-06, "loss": 0.6002599, "num_input_tokens_seen": 46959835, "step": 2190, "time_per_iteration": 3.32315993309021 }, { "auxiliary_loss_clip": 0.01253925, "auxiliary_loss_mlp": 0.01030144, "balance_loss_clip": 1.05826592, "balance_loss_mlp": 1.02206707, "epoch": 0.2634521733902483, "flos": 25048806468480.0, "grad_norm": 1.5871908819980611, "language_loss": 0.86337012, "learning_rate": 3.4551827795520324e-06, "loss": 0.8862108, "num_input_tokens_seen": 46982720, "step": 2191, "time_per_iteration": 2.7211551666259766 }, { "auxiliary_loss_clip": 0.01257008, "auxiliary_loss_mlp": 0.010293, "balance_loss_clip": 1.05553055, "balance_loss_mlp": 1.02084196, "epoch": 0.2635724162808874, "flos": 20594985534720.0, "grad_norm": 1.8309520368702337, "language_loss": 0.85086465, "learning_rate": 3.4546482862549226e-06, "loss": 0.87372768, "num_input_tokens_seen": 47003035, "step": 2192, "time_per_iteration": 2.6574840545654297 }, { "auxiliary_loss_clip": 0.01353524, "auxiliary_loss_mlp": 0.01032476, "balance_loss_clip": 1.05337071, "balance_loss_mlp": 1.02332044, "epoch": 0.2636926591715265, "flos": 19244636616960.0, "grad_norm": 2.6165937983710856, "language_loss": 0.78626543, "learning_rate": 3.4541135722888253e-06, "loss": 0.81012541, "num_input_tokens_seen": 47019625, "step": 2193, "time_per_iteration": 2.720285177230835 }, { "auxiliary_loss_clip": 0.01208786, "auxiliary_loss_mlp": 0.01032398, "balance_loss_clip": 1.05835497, "balance_loss_mlp": 1.02427971, "epoch": 0.26381290206216557, "flos": 28804882734720.0, "grad_norm": 1.7530555626194175, "language_loss": 0.80310857, "learning_rate": 3.453578637734854e-06, "loss": 0.8255204, "num_input_tokens_seen": 47040815, "step": 2194, "time_per_iteration": 2.6565017700195312 }, { "auxiliary_loss_clip": 0.0120903, "auxiliary_loss_mlp": 0.01034017, "balance_loss_clip": 1.06154275, "balance_loss_mlp": 1.02563095, "epoch": 0.2639331449528047, "flos": 25008909436800.0, "grad_norm": 7.8753897067186225, "language_loss": 0.78800148, "learning_rate": 3.4530434826741605e-06, "loss": 0.81043196, "num_input_tokens_seen": 47061755, "step": 2195, "time_per_iteration": 3.5613107681274414 }, { "auxiliary_loss_clip": 0.01298886, "auxiliary_loss_mlp": 0.01031723, "balance_loss_clip": 1.05559313, "balance_loss_mlp": 1.02409351, "epoch": 0.26405338784344373, "flos": 46535775465600.0, "grad_norm": 3.7609442697821556, "language_loss": 0.68898088, "learning_rate": 3.452508107187926e-06, "loss": 0.71228701, "num_input_tokens_seen": 47085130, "step": 2196, "time_per_iteration": 3.800170660018921 }, { "auxiliary_loss_clip": 0.01454196, "auxiliary_loss_mlp": 0.01027873, "balance_loss_clip": 1.04904962, "balance_loss_mlp": 1.01919472, "epoch": 0.26417363073408284, "flos": 21179467641600.0, "grad_norm": 1.9133199867494894, "language_loss": 0.77657568, "learning_rate": 3.451972511357366e-06, "loss": 0.80139637, "num_input_tokens_seen": 47104675, "step": 2197, "time_per_iteration": 2.819772481918335 }, { "auxiliary_loss_clip": 0.01255349, "auxiliary_loss_mlp": 0.01035673, "balance_loss_clip": 1.05920684, "balance_loss_mlp": 1.02766168, "epoch": 0.26429387362472195, "flos": 22674751937280.0, "grad_norm": 1.9125108629388943, "language_loss": 0.85526001, "learning_rate": 3.45143669526373e-06, "loss": 0.87817019, "num_input_tokens_seen": 47124435, "step": 2198, "time_per_iteration": 2.670285940170288 }, { "auxiliary_loss_clip": 0.01217042, "auxiliary_loss_mlp": 0.01003974, "balance_loss_clip": 1.02832842, "balance_loss_mlp": 1.00282407, "epoch": 0.264414116515361, "flos": 67180534272000.0, "grad_norm": 0.7945682572921017, "language_loss": 0.63162959, "learning_rate": 3.450900658988302e-06, "loss": 0.65383977, "num_input_tokens_seen": 47185985, "step": 2199, "time_per_iteration": 3.1840710639953613 }, { "auxiliary_loss_clip": 0.01303367, "auxiliary_loss_mlp": 0.01033878, "balance_loss_clip": 1.05733371, "balance_loss_mlp": 1.02520609, "epoch": 0.2645343594060001, "flos": 25664709997440.0, "grad_norm": 18.66571005151277, "language_loss": 0.77674276, "learning_rate": 3.450364402612397e-06, "loss": 0.80011523, "num_input_tokens_seen": 47203140, "step": 2200, "time_per_iteration": 2.7286102771759033 }, { "auxiliary_loss_clip": 0.0130618, "auxiliary_loss_mlp": 0.01036132, "balance_loss_clip": 1.05849516, "balance_loss_mlp": 1.02757335, "epoch": 0.26465460229663923, "flos": 22491822948480.0, "grad_norm": 2.262737306146407, "language_loss": 0.83795154, "learning_rate": 3.449827926217366e-06, "loss": 0.86137468, "num_input_tokens_seen": 47222575, "step": 2201, "time_per_iteration": 3.707426071166992 }, { "auxiliary_loss_clip": 0.0130825, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 1.05212545, "balance_loss_mlp": 1.02073169, "epoch": 0.2647748451872783, "flos": 29388036038400.0, "grad_norm": 1.9118968921945272, "language_loss": 0.80878794, "learning_rate": 3.449291229884591e-06, "loss": 0.83215964, "num_input_tokens_seen": 47243815, "step": 2202, "time_per_iteration": 2.735761880874634 }, { "auxiliary_loss_clip": 0.013586, "auxiliary_loss_mlp": 0.01028834, "balance_loss_clip": 1.05233216, "balance_loss_mlp": 1.02041173, "epoch": 0.2648950880779174, "flos": 26797799502720.0, "grad_norm": 2.3161687143870897, "language_loss": 0.86763632, "learning_rate": 3.4487543136954887e-06, "loss": 0.89151067, "num_input_tokens_seen": 47263435, "step": 2203, "time_per_iteration": 2.7610201835632324 }, { "auxiliary_loss_clip": 0.01356625, "auxiliary_loss_mlp": 0.01031958, "balance_loss_clip": 1.05595529, "balance_loss_mlp": 1.02379203, "epoch": 0.2650153309685565, "flos": 28841008838400.0, "grad_norm": 2.1005705068334652, "language_loss": 0.91323596, "learning_rate": 3.448217177731509e-06, "loss": 0.93712175, "num_input_tokens_seen": 47283920, "step": 2204, "time_per_iteration": 2.759852647781372 }, { "auxiliary_loss_clip": 0.01302662, "auxiliary_loss_mlp": 0.01029295, "balance_loss_clip": 1.05847597, "balance_loss_mlp": 1.02124202, "epoch": 0.26513557385919556, "flos": 20303247271680.0, "grad_norm": 2.8633939709614187, "language_loss": 0.77860755, "learning_rate": 3.4476798220741348e-06, "loss": 0.80192715, "num_input_tokens_seen": 47302800, "step": 2205, "time_per_iteration": 2.711859941482544 }, { "auxiliary_loss_clip": 0.01208512, "auxiliary_loss_mlp": 0.01029232, "balance_loss_clip": 1.06245565, "balance_loss_mlp": 1.02128673, "epoch": 0.26525581674983467, "flos": 17676274101120.0, "grad_norm": 1.8749670780893746, "language_loss": 0.78714418, "learning_rate": 3.4471422468048826e-06, "loss": 0.80952168, "num_input_tokens_seen": 47321525, "step": 2206, "time_per_iteration": 3.5018718242645264 }, { "auxiliary_loss_clip": 0.01256463, "auxiliary_loss_mlp": 0.01036007, "balance_loss_clip": 1.05937278, "balance_loss_mlp": 1.0279963, "epoch": 0.2653760596404738, "flos": 26833746038400.0, "grad_norm": 3.129493055962416, "language_loss": 0.73017782, "learning_rate": 3.4466044520053022e-06, "loss": 0.75310254, "num_input_tokens_seen": 47340530, "step": 2207, "time_per_iteration": 2.6898462772369385 }, { "auxiliary_loss_clip": 0.01300226, "auxiliary_loss_mlp": 0.0103309, "balance_loss_clip": 1.05374718, "balance_loss_mlp": 1.02438152, "epoch": 0.26549630253111284, "flos": 22782160581120.0, "grad_norm": 2.1676731050768248, "language_loss": 0.60782105, "learning_rate": 3.446066437756977e-06, "loss": 0.63115418, "num_input_tokens_seen": 47359735, "step": 2208, "time_per_iteration": 2.8188843727111816 }, { "auxiliary_loss_clip": 0.01301972, "auxiliary_loss_mlp": 0.01031407, "balance_loss_clip": 1.05732989, "balance_loss_mlp": 1.02300835, "epoch": 0.26561654542175195, "flos": 23550002640000.0, "grad_norm": 2.113123205853615, "language_loss": 0.75010455, "learning_rate": 3.4455282041415224e-06, "loss": 0.77343833, "num_input_tokens_seen": 47378945, "step": 2209, "time_per_iteration": 2.679535150527954 }, { "auxiliary_loss_clip": 0.01355235, "auxiliary_loss_mlp": 0.01027896, "balance_loss_clip": 1.054726, "balance_loss_mlp": 1.02022159, "epoch": 0.265736788312391, "flos": 26906680604160.0, "grad_norm": 2.1465409334470014, "language_loss": 0.87365985, "learning_rate": 3.4449897512405894e-06, "loss": 0.89749116, "num_input_tokens_seen": 47398095, "step": 2210, "time_per_iteration": 2.7511255741119385 }, { "auxiliary_loss_clip": 0.01460282, "auxiliary_loss_mlp": 0.02573596, "balance_loss_clip": 1.05364299, "balance_loss_mlp": 1.00032461, "epoch": 0.2658570312030301, "flos": 23477139901440.0, "grad_norm": 3.01021503377063, "language_loss": 0.75371206, "learning_rate": 3.444451079135859e-06, "loss": 0.79405081, "num_input_tokens_seen": 47417605, "step": 2211, "time_per_iteration": 2.7451672554016113 }, { "auxiliary_loss_clip": 0.01400127, "auxiliary_loss_mlp": 0.02576239, "balance_loss_clip": 1.05143046, "balance_loss_mlp": 1.00023222, "epoch": 0.2659772740936692, "flos": 21866402315520.0, "grad_norm": 2.9912683586121274, "language_loss": 0.74284518, "learning_rate": 3.4439121879090493e-06, "loss": 0.78260881, "num_input_tokens_seen": 47435385, "step": 2212, "time_per_iteration": 2.7600314617156982 }, { "auxiliary_loss_clip": 0.0131266, "auxiliary_loss_mlp": 0.01029471, "balance_loss_clip": 1.05809152, "balance_loss_mlp": 1.02130473, "epoch": 0.2660975169843083, "flos": 19793100360960.0, "grad_norm": 2.3896075704837663, "language_loss": 0.83249259, "learning_rate": 3.4433730776419082e-06, "loss": 0.85591388, "num_input_tokens_seen": 47454310, "step": 2213, "time_per_iteration": 2.6980814933776855 }, { "auxiliary_loss_clip": 0.0126074, "auxiliary_loss_mlp": 0.02578361, "balance_loss_clip": 1.05625486, "balance_loss_mlp": 1.00024557, "epoch": 0.2662177598749474, "flos": 29018981750400.0, "grad_norm": 4.325716000888862, "language_loss": 0.80308843, "learning_rate": 3.4428337484162183e-06, "loss": 0.84147942, "num_input_tokens_seen": 47475120, "step": 2214, "time_per_iteration": 2.714698553085327 }, { "auxiliary_loss_clip": 0.01310096, "auxiliary_loss_mlp": 0.01031248, "balance_loss_clip": 1.05698991, "balance_loss_mlp": 1.02292669, "epoch": 0.2663380027655865, "flos": 21762549118080.0, "grad_norm": 2.021624370153554, "language_loss": 0.84433997, "learning_rate": 3.442294200313797e-06, "loss": 0.86775339, "num_input_tokens_seen": 47493150, "step": 2215, "time_per_iteration": 2.694211959838867 }, { "auxiliary_loss_clip": 0.01108624, "auxiliary_loss_mlp": 0.01004433, "balance_loss_clip": 1.02962065, "balance_loss_mlp": 1.00327051, "epoch": 0.26645824565622556, "flos": 66980333819520.0, "grad_norm": 0.7751258591500861, "language_loss": 0.52676392, "learning_rate": 3.4417544334164916e-06, "loss": 0.54789448, "num_input_tokens_seen": 47557295, "step": 2216, "time_per_iteration": 3.2301928997039795 }, { "auxiliary_loss_clip": 0.01352753, "auxiliary_loss_mlp": 0.01028762, "balance_loss_clip": 1.05644512, "balance_loss_mlp": 1.02057242, "epoch": 0.26657848854686467, "flos": 25264198373760.0, "grad_norm": 1.677211483779638, "language_loss": 0.77777588, "learning_rate": 3.4412144478061854e-06, "loss": 0.80159104, "num_input_tokens_seen": 47579705, "step": 2217, "time_per_iteration": 2.7869880199432373 }, { "auxiliary_loss_clip": 0.01548902, "auxiliary_loss_mlp": 0.01029101, "balance_loss_clip": 1.04621625, "balance_loss_mlp": 1.01986861, "epoch": 0.2666987314375038, "flos": 23696769611520.0, "grad_norm": 1.961456495998587, "language_loss": 0.75445759, "learning_rate": 3.4406742435647925e-06, "loss": 0.78023767, "num_input_tokens_seen": 47599770, "step": 2218, "time_per_iteration": 2.9682061672210693 }, { "auxiliary_loss_clip": 0.0125605, "auxiliary_loss_mlp": 0.01028441, "balance_loss_clip": 1.06058288, "balance_loss_mlp": 1.02078235, "epoch": 0.26681897432814283, "flos": 27048958375680.0, "grad_norm": 1.9920701290668184, "language_loss": 0.79104972, "learning_rate": 3.440133820774263e-06, "loss": 0.81389463, "num_input_tokens_seen": 47619580, "step": 2219, "time_per_iteration": 2.7921273708343506 }, { "auxiliary_loss_clip": 0.01314106, "auxiliary_loss_mlp": 0.01029533, "balance_loss_clip": 1.06044245, "balance_loss_mlp": 1.02077138, "epoch": 0.26693921721878194, "flos": 28985944216320.0, "grad_norm": 2.622645677879936, "language_loss": 0.81705242, "learning_rate": 3.439593179516578e-06, "loss": 0.84048879, "num_input_tokens_seen": 47639490, "step": 2220, "time_per_iteration": 2.6976125240325928 }, { "auxiliary_loss_clip": 0.01225672, "auxiliary_loss_mlp": 0.0102837, "balance_loss_clip": 1.05829585, "balance_loss_mlp": 1.01961434, "epoch": 0.26705946010942105, "flos": 21507834798720.0, "grad_norm": 1.71356998757466, "language_loss": 0.80897051, "learning_rate": 3.4390523198737524e-06, "loss": 0.83151096, "num_input_tokens_seen": 47658650, "step": 2221, "time_per_iteration": 3.7354772090911865 }, { "auxiliary_loss_clip": 0.01213128, "auxiliary_loss_mlp": 0.02575314, "balance_loss_clip": 1.06489289, "balance_loss_mlp": 1.00025105, "epoch": 0.2671797030000601, "flos": 21471277731840.0, "grad_norm": 1.7745544422052872, "language_loss": 0.73662317, "learning_rate": 3.4385112419278333e-06, "loss": 0.77450764, "num_input_tokens_seen": 47679875, "step": 2222, "time_per_iteration": 2.675638198852539 }, { "auxiliary_loss_clip": 0.01162473, "auxiliary_loss_mlp": 0.01002386, "balance_loss_clip": 1.02819371, "balance_loss_mlp": 1.0012002, "epoch": 0.2672999458906992, "flos": 64189929767040.0, "grad_norm": 0.7959930301610104, "language_loss": 0.64838123, "learning_rate": 3.4379699457609033e-06, "loss": 0.67002988, "num_input_tokens_seen": 47737700, "step": 2223, "time_per_iteration": 4.362829923629761 }, { "auxiliary_loss_clip": 0.01300583, "auxiliary_loss_mlp": 0.01038313, "balance_loss_clip": 1.05372369, "balance_loss_mlp": 1.02946806, "epoch": 0.26742018878133833, "flos": 16909042573440.0, "grad_norm": 1.8409490324270186, "language_loss": 0.8975569, "learning_rate": 3.4374284314550755e-06, "loss": 0.92094576, "num_input_tokens_seen": 47756740, "step": 2224, "time_per_iteration": 2.6963257789611816 }, { "auxiliary_loss_clip": 0.01209684, "auxiliary_loss_mlp": 0.0103091, "balance_loss_clip": 1.06194115, "balance_loss_mlp": 1.0228579, "epoch": 0.2675404316719774, "flos": 20667560964480.0, "grad_norm": 2.108961661964907, "language_loss": 0.811652, "learning_rate": 3.436886699092498e-06, "loss": 0.83405799, "num_input_tokens_seen": 47775255, "step": 2225, "time_per_iteration": 2.6519532203674316 }, { "auxiliary_loss_clip": 0.0121392, "auxiliary_loss_mlp": 0.01021699, "balance_loss_clip": 1.06305563, "balance_loss_mlp": 1.01357532, "epoch": 0.2676606745626165, "flos": 17485013157120.0, "grad_norm": 2.584344598599819, "language_loss": 0.71643776, "learning_rate": 3.4363447487553502e-06, "loss": 0.73879391, "num_input_tokens_seen": 47788570, "step": 2226, "time_per_iteration": 2.5250654220581055 }, { "auxiliary_loss_clip": 0.01302372, "auxiliary_loss_mlp": 0.01028928, "balance_loss_clip": 1.05868101, "balance_loss_mlp": 1.02060175, "epoch": 0.26778091745325555, "flos": 27852675143040.0, "grad_norm": 4.503296894435826, "language_loss": 0.77824223, "learning_rate": 3.4358025805258455e-06, "loss": 0.80155528, "num_input_tokens_seen": 47808275, "step": 2227, "time_per_iteration": 3.674898386001587 }, { "auxiliary_loss_clip": 0.01408621, "auxiliary_loss_mlp": 0.01033418, "balance_loss_clip": 1.05205584, "balance_loss_mlp": 1.02504921, "epoch": 0.26790116034389466, "flos": 20955995176320.0, "grad_norm": 2.6743673785916853, "language_loss": 0.83585536, "learning_rate": 3.435260194486232e-06, "loss": 0.8602758, "num_input_tokens_seen": 47826245, "step": 2228, "time_per_iteration": 2.7341690063476562 }, { "auxiliary_loss_clip": 0.01304135, "auxiliary_loss_mlp": 0.01026301, "balance_loss_clip": 1.05797482, "balance_loss_mlp": 1.01749182, "epoch": 0.2680214032345338, "flos": 18040659621120.0, "grad_norm": 2.940056532592233, "language_loss": 0.8286407, "learning_rate": 3.4347175907187875e-06, "loss": 0.85194498, "num_input_tokens_seen": 47843235, "step": 2229, "time_per_iteration": 2.6963300704956055 }, { "auxiliary_loss_clip": 0.01255531, "auxiliary_loss_mlp": 0.010334, "balance_loss_clip": 1.06068635, "balance_loss_mlp": 1.02517414, "epoch": 0.26814164612517283, "flos": 22419427086720.0, "grad_norm": 2.042931775374046, "language_loss": 0.88015574, "learning_rate": 3.4341747693058254e-06, "loss": 0.90304512, "num_input_tokens_seen": 47861710, "step": 2230, "time_per_iteration": 2.662877082824707 }, { "auxiliary_loss_clip": 0.01462907, "auxiliary_loss_mlp": 0.01035685, "balance_loss_clip": 1.04955673, "balance_loss_mlp": 1.02779078, "epoch": 0.26826188901581194, "flos": 35627371159680.0, "grad_norm": 1.844620000122029, "language_loss": 0.77217197, "learning_rate": 3.4336317303296916e-06, "loss": 0.79715788, "num_input_tokens_seen": 47882685, "step": 2231, "time_per_iteration": 3.1450953483581543 }, { "auxiliary_loss_clip": 0.01252605, "auxiliary_loss_mlp": 0.01031132, "balance_loss_clip": 1.05693913, "balance_loss_mlp": 1.02300191, "epoch": 0.26838213190645105, "flos": 17639788861440.0, "grad_norm": 2.7667070184691696, "language_loss": 0.75921911, "learning_rate": 3.4330884738727635e-06, "loss": 0.78205645, "num_input_tokens_seen": 47900860, "step": 2232, "time_per_iteration": 3.925947427749634 }, { "auxiliary_loss_clip": 0.0139868, "auxiliary_loss_mlp": 0.01032236, "balance_loss_clip": 1.05292761, "balance_loss_mlp": 1.02353954, "epoch": 0.2685023747970901, "flos": 22674823764480.0, "grad_norm": 2.2646099026373974, "language_loss": 0.70839602, "learning_rate": 3.4325450000174535e-06, "loss": 0.73270518, "num_input_tokens_seen": 47917500, "step": 2233, "time_per_iteration": 2.7861697673797607 }, { "auxiliary_loss_clip": 0.01402246, "auxiliary_loss_mlp": 0.01041333, "balance_loss_clip": 1.05507231, "balance_loss_mlp": 1.03288722, "epoch": 0.2686226176877292, "flos": 20120533764480.0, "grad_norm": 1.9561881453842593, "language_loss": 0.74454737, "learning_rate": 3.4320013088462067e-06, "loss": 0.76898313, "num_input_tokens_seen": 47934860, "step": 2234, "time_per_iteration": 2.8341996669769287 }, { "auxiliary_loss_clip": 0.01270666, "auxiliary_loss_mlp": 0.01031864, "balance_loss_clip": 1.05266333, "balance_loss_mlp": 1.02397227, "epoch": 0.2687428605783683, "flos": 21872040750720.0, "grad_norm": 1.5810351264132023, "language_loss": 0.81526488, "learning_rate": 3.431457400441499e-06, "loss": 0.83829021, "num_input_tokens_seen": 47955255, "step": 2235, "time_per_iteration": 2.792424440383911 }, { "auxiliary_loss_clip": 0.01354092, "auxiliary_loss_mlp": 0.01003826, "balance_loss_clip": 1.02460051, "balance_loss_mlp": 1.00275922, "epoch": 0.2688631034690074, "flos": 69943320766080.0, "grad_norm": 0.9360237582998867, "language_loss": 0.60814869, "learning_rate": 3.4309132748858424e-06, "loss": 0.63172793, "num_input_tokens_seen": 48016245, "step": 2236, "time_per_iteration": 3.421807050704956 }, { "auxiliary_loss_clip": 0.01252879, "auxiliary_loss_mlp": 0.01030009, "balance_loss_clip": 1.05873132, "balance_loss_mlp": 1.02206326, "epoch": 0.2689833463596465, "flos": 22856639431680.0, "grad_norm": 1.680188141236984, "language_loss": 0.84151936, "learning_rate": 3.430368932261779e-06, "loss": 0.86434829, "num_input_tokens_seen": 48036600, "step": 2237, "time_per_iteration": 3.197848081588745 }, { "auxiliary_loss_clip": 0.01302308, "auxiliary_loss_mlp": 0.01041888, "balance_loss_clip": 1.05779815, "balance_loss_mlp": 1.03314447, "epoch": 0.2691035892502856, "flos": 17200242132480.0, "grad_norm": 2.5861482592588136, "language_loss": 0.75358331, "learning_rate": 3.429824372651886e-06, "loss": 0.77702528, "num_input_tokens_seen": 48054750, "step": 2238, "time_per_iteration": 2.6434123516082764 }, { "auxiliary_loss_clip": 0.01410504, "auxiliary_loss_mlp": 0.01031163, "balance_loss_clip": 1.05348134, "balance_loss_mlp": 1.02264571, "epoch": 0.26922383214092466, "flos": 17747484814080.0, "grad_norm": 1.9799642601153071, "language_loss": 0.8372665, "learning_rate": 3.4292795961387732e-06, "loss": 0.86168313, "num_input_tokens_seen": 48072650, "step": 2239, "time_per_iteration": 2.750469207763672 }, { "auxiliary_loss_clip": 0.01209318, "auxiliary_loss_mlp": 0.01027478, "balance_loss_clip": 1.06108594, "balance_loss_mlp": 1.01903248, "epoch": 0.26934407503156377, "flos": 16173376122240.0, "grad_norm": 2.2431645521100014, "language_loss": 0.87833065, "learning_rate": 3.4287346028050818e-06, "loss": 0.9006986, "num_input_tokens_seen": 48088720, "step": 2240, "time_per_iteration": 2.687189817428589 }, { "auxiliary_loss_clip": 0.01219999, "auxiliary_loss_mlp": 0.01026297, "balance_loss_clip": 1.05858684, "balance_loss_mlp": 1.0189178, "epoch": 0.2694643179222028, "flos": 23732895715200.0, "grad_norm": 1.6215028833952119, "language_loss": 0.79891664, "learning_rate": 3.4281893927334866e-06, "loss": 0.82137954, "num_input_tokens_seen": 48108630, "step": 2241, "time_per_iteration": 2.7871153354644775 }, { "auxiliary_loss_clip": 0.01258799, "auxiliary_loss_mlp": 0.01031401, "balance_loss_clip": 1.06021535, "balance_loss_mlp": 1.02278781, "epoch": 0.26958456081284193, "flos": 24718140840960.0, "grad_norm": 2.140392465923096, "language_loss": 0.75282848, "learning_rate": 3.4276439660066963e-06, "loss": 0.77573049, "num_input_tokens_seen": 48128330, "step": 2242, "time_per_iteration": 2.71162748336792 }, { "auxiliary_loss_clip": 0.01206099, "auxiliary_loss_mlp": 0.01035082, "balance_loss_clip": 1.05934763, "balance_loss_mlp": 1.02723765, "epoch": 0.26970480370348104, "flos": 18112588606080.0, "grad_norm": 2.346018234484839, "language_loss": 0.84284461, "learning_rate": 3.427098322707452e-06, "loss": 0.86525643, "num_input_tokens_seen": 48144295, "step": 2243, "time_per_iteration": 2.5823709964752197 }, { "auxiliary_loss_clip": 0.0126593, "auxiliary_loss_mlp": 0.01032172, "balance_loss_clip": 1.06856799, "balance_loss_mlp": 1.0236963, "epoch": 0.2698250465941201, "flos": 10816546250880.0, "grad_norm": 2.3044478383670848, "language_loss": 0.89533424, "learning_rate": 3.426552462918526e-06, "loss": 0.91831529, "num_input_tokens_seen": 48162230, "step": 2244, "time_per_iteration": 2.643319606781006 }, { "auxiliary_loss_clip": 0.01204853, "auxiliary_loss_mlp": 0.01041996, "balance_loss_clip": 1.06084585, "balance_loss_mlp": 1.03420591, "epoch": 0.2699452894847592, "flos": 17308117653120.0, "grad_norm": 3.4533351838510944, "language_loss": 0.73375797, "learning_rate": 3.426006386722726e-06, "loss": 0.75622654, "num_input_tokens_seen": 48180290, "step": 2245, "time_per_iteration": 2.5544040203094482 }, { "auxiliary_loss_clip": 0.01358604, "auxiliary_loss_mlp": 0.01032319, "balance_loss_clip": 1.06020951, "balance_loss_mlp": 1.02367592, "epoch": 0.2700655323753983, "flos": 18078150441600.0, "grad_norm": 2.285368716320637, "language_loss": 0.9237873, "learning_rate": 3.4254600942028914e-06, "loss": 0.94769651, "num_input_tokens_seen": 48198165, "step": 2246, "time_per_iteration": 2.785475969314575 }, { "auxiliary_loss_clip": 0.01304244, "auxiliary_loss_mlp": 0.0103101, "balance_loss_clip": 1.05891109, "balance_loss_mlp": 1.02291, "epoch": 0.2701857752660374, "flos": 18186636493440.0, "grad_norm": 2.028490073975747, "language_loss": 0.82885307, "learning_rate": 3.424913585441893e-06, "loss": 0.85220563, "num_input_tokens_seen": 48216000, "step": 2247, "time_per_iteration": 3.983516216278076 }, { "auxiliary_loss_clip": 0.0125288, "auxiliary_loss_mlp": 0.01032683, "balance_loss_clip": 1.05833101, "balance_loss_mlp": 1.02404642, "epoch": 0.2703060181566765, "flos": 16319496648960.0, "grad_norm": 5.292291579007133, "language_loss": 0.87729412, "learning_rate": 3.4243668605226374e-06, "loss": 0.9001497, "num_input_tokens_seen": 48233025, "step": 2248, "time_per_iteration": 3.4898061752319336 }, { "auxiliary_loss_clip": 0.01353987, "auxiliary_loss_mlp": 0.01029161, "balance_loss_clip": 1.05725956, "balance_loss_mlp": 1.02070296, "epoch": 0.2704262610473156, "flos": 19572357329280.0, "grad_norm": 5.895027134945218, "language_loss": 0.82784092, "learning_rate": 3.423819919528061e-06, "loss": 0.85167241, "num_input_tokens_seen": 48251110, "step": 2249, "time_per_iteration": 2.7674291133880615 }, { "auxiliary_loss_clip": 0.01408611, "auxiliary_loss_mlp": 0.01032921, "balance_loss_clip": 1.05144417, "balance_loss_mlp": 1.02413511, "epoch": 0.27054650393795465, "flos": 20740746925440.0, "grad_norm": 2.21814528994253, "language_loss": 0.78496587, "learning_rate": 3.4232727625411355e-06, "loss": 0.80938119, "num_input_tokens_seen": 48270215, "step": 2250, "time_per_iteration": 2.7340168952941895 }, { "auxiliary_loss_clip": 0.01451073, "auxiliary_loss_mlp": 0.0103203, "balance_loss_clip": 1.0532763, "balance_loss_mlp": 1.02412093, "epoch": 0.27066674682859376, "flos": 18658322916480.0, "grad_norm": 1.8205373090918058, "language_loss": 0.86430144, "learning_rate": 3.4227253896448626e-06, "loss": 0.88913238, "num_input_tokens_seen": 48288075, "step": 2251, "time_per_iteration": 2.8119969367980957 }, { "auxiliary_loss_clip": 0.01203579, "auxiliary_loss_mlp": 0.01029932, "balance_loss_clip": 1.05783296, "balance_loss_mlp": 1.02165902, "epoch": 0.2707869897192329, "flos": 23002759958400.0, "grad_norm": 3.064669861039408, "language_loss": 0.8238315, "learning_rate": 3.42217780092228e-06, "loss": 0.84616661, "num_input_tokens_seen": 48306415, "step": 2252, "time_per_iteration": 3.5525686740875244 }, { "auxiliary_loss_clip": 0.0127154, "auxiliary_loss_mlp": 0.00999432, "balance_loss_clip": 1.02813625, "balance_loss_mlp": 0.99826378, "epoch": 0.27090723260987193, "flos": 58323240293760.0, "grad_norm": 0.7969878011177948, "language_loss": 0.60292602, "learning_rate": 3.421629996456456e-06, "loss": 0.62563574, "num_input_tokens_seen": 48365035, "step": 2253, "time_per_iteration": 3.2010695934295654 }, { "auxiliary_loss_clip": 0.01255509, "auxiliary_loss_mlp": 0.0103376, "balance_loss_clip": 1.05698931, "balance_loss_mlp": 1.02588606, "epoch": 0.27102747550051104, "flos": 11984540797440.0, "grad_norm": 3.0264380539318503, "language_loss": 0.82891726, "learning_rate": 3.421081976330491e-06, "loss": 0.85180998, "num_input_tokens_seen": 48383550, "step": 2254, "time_per_iteration": 2.6774861812591553 }, { "auxiliary_loss_clip": 0.01302734, "auxiliary_loss_mlp": 0.01026405, "balance_loss_clip": 1.05590713, "balance_loss_mlp": 1.0182451, "epoch": 0.27114771839115015, "flos": 19900401264000.0, "grad_norm": 1.9296032878570766, "language_loss": 0.88061094, "learning_rate": 3.4205337406275207e-06, "loss": 0.90390235, "num_input_tokens_seen": 48403670, "step": 2255, "time_per_iteration": 2.6691739559173584 }, { "auxiliary_loss_clip": 0.01207446, "auxiliary_loss_mlp": 0.01027966, "balance_loss_clip": 1.05979681, "balance_loss_mlp": 1.01920366, "epoch": 0.2712679612817892, "flos": 18331966920960.0, "grad_norm": 2.5568780004767495, "language_loss": 0.75408423, "learning_rate": 3.4199852894307114e-06, "loss": 0.77643836, "num_input_tokens_seen": 48420420, "step": 2256, "time_per_iteration": 2.6249403953552246 }, { "auxiliary_loss_clip": 0.01462064, "auxiliary_loss_mlp": 0.01039577, "balance_loss_clip": 1.05843568, "balance_loss_mlp": 1.0310533, "epoch": 0.2713882041724283, "flos": 24460302038400.0, "grad_norm": 2.2771082053113543, "language_loss": 0.78607541, "learning_rate": 3.419436622823262e-06, "loss": 0.81109178, "num_input_tokens_seen": 48441140, "step": 2257, "time_per_iteration": 2.815152406692505 }, { "auxiliary_loss_clip": 0.0130543, "auxiliary_loss_mlp": 0.0102937, "balance_loss_clip": 1.05677223, "balance_loss_mlp": 1.02108467, "epoch": 0.27150844706306737, "flos": 23039317025280.0, "grad_norm": 1.941425020564697, "language_loss": 0.74291301, "learning_rate": 3.4188877408884063e-06, "loss": 0.76626098, "num_input_tokens_seen": 48461845, "step": 2258, "time_per_iteration": 3.6279125213623047 }, { "auxiliary_loss_clip": 0.01302597, "auxiliary_loss_mlp": 0.01029717, "balance_loss_clip": 1.05608165, "balance_loss_mlp": 1.02193832, "epoch": 0.2716286899537065, "flos": 22563644192640.0, "grad_norm": 2.6410487979879003, "language_loss": 0.64943308, "learning_rate": 3.4183386437094088e-06, "loss": 0.67275625, "num_input_tokens_seen": 48478510, "step": 2259, "time_per_iteration": 2.7657570838928223 }, { "auxiliary_loss_clip": 0.01307333, "auxiliary_loss_mlp": 0.01028425, "balance_loss_clip": 1.05589485, "balance_loss_mlp": 1.02043247, "epoch": 0.2717489328443456, "flos": 13115044523520.0, "grad_norm": 2.200527518705088, "language_loss": 0.82473743, "learning_rate": 3.417789331369565e-06, "loss": 0.84809506, "num_input_tokens_seen": 48494300, "step": 2260, "time_per_iteration": 2.629396438598633 }, { "auxiliary_loss_clip": 0.01210718, "auxiliary_loss_mlp": 0.01032266, "balance_loss_clip": 1.06191754, "balance_loss_mlp": 1.023862, "epoch": 0.27186917573498465, "flos": 29278688060160.0, "grad_norm": 11.862988576049597, "language_loss": 0.91308522, "learning_rate": 3.4172398039522088e-06, "loss": 0.93551505, "num_input_tokens_seen": 48515585, "step": 2261, "time_per_iteration": 2.6797196865081787 }, { "auxiliary_loss_clip": 0.01258655, "auxiliary_loss_mlp": 0.01031603, "balance_loss_clip": 1.05889058, "balance_loss_mlp": 1.02235258, "epoch": 0.27198941862562376, "flos": 26032220000640.0, "grad_norm": 5.097336061987405, "language_loss": 0.79734349, "learning_rate": 3.4166900615407e-06, "loss": 0.82024616, "num_input_tokens_seen": 48533500, "step": 2262, "time_per_iteration": 2.724379539489746 }, { "auxiliary_loss_clip": 0.01255626, "auxiliary_loss_mlp": 0.01028015, "balance_loss_clip": 1.06023502, "balance_loss_mlp": 1.0202719, "epoch": 0.27210966151626287, "flos": 32780983760640.0, "grad_norm": 2.174030734026544, "language_loss": 0.75576943, "learning_rate": 3.416140104218436e-06, "loss": 0.77860582, "num_input_tokens_seen": 48552865, "step": 2263, "time_per_iteration": 2.774895429611206 }, { "auxiliary_loss_clip": 0.01205469, "auxiliary_loss_mlp": 0.02520682, "balance_loss_clip": 1.02429676, "balance_loss_mlp": 0.9997322, "epoch": 0.2722299044069019, "flos": 65471043219840.0, "grad_norm": 0.8407574266846619, "language_loss": 0.69649374, "learning_rate": 3.4155899320688437e-06, "loss": 0.73375523, "num_input_tokens_seen": 48618940, "step": 2264, "time_per_iteration": 3.2911365032196045 }, { "auxiliary_loss_clip": 0.0145579, "auxiliary_loss_mlp": 0.01027826, "balance_loss_clip": 1.05636597, "balance_loss_mlp": 1.0197612, "epoch": 0.27235014729754103, "flos": 15334143782400.0, "grad_norm": 2.099892700934939, "language_loss": 0.74424922, "learning_rate": 3.415039545175384e-06, "loss": 0.76908535, "num_input_tokens_seen": 48634665, "step": 2265, "time_per_iteration": 2.716639518737793 }, { "auxiliary_loss_clip": 0.01257575, "auxiliary_loss_mlp": 0.01029652, "balance_loss_clip": 1.05835247, "balance_loss_mlp": 1.02154005, "epoch": 0.27247039018818014, "flos": 21872363973120.0, "grad_norm": 1.9833147999141445, "language_loss": 0.65270567, "learning_rate": 3.414488943621551e-06, "loss": 0.67557794, "num_input_tokens_seen": 48653330, "step": 2266, "time_per_iteration": 2.672616720199585 }, { "auxiliary_loss_clip": 0.0126816, "auxiliary_loss_mlp": 0.01037037, "balance_loss_clip": 1.06794882, "balance_loss_mlp": 1.02826309, "epoch": 0.2725906330788192, "flos": 18695490514560.0, "grad_norm": 2.2244409880949725, "language_loss": 0.73618203, "learning_rate": 3.41393812749087e-06, "loss": 0.75923395, "num_input_tokens_seen": 48671375, "step": 2267, "time_per_iteration": 2.6146247386932373 }, { "auxiliary_loss_clip": 0.0130513, "auxiliary_loss_mlp": 0.01033633, "balance_loss_clip": 1.06307554, "balance_loss_mlp": 1.02508616, "epoch": 0.2727108759694583, "flos": 17886099398400.0, "grad_norm": 2.294904595109277, "language_loss": 0.71653467, "learning_rate": 3.4133870968668984e-06, "loss": 0.73992234, "num_input_tokens_seen": 48686175, "step": 2268, "time_per_iteration": 2.831547498703003 }, { "auxiliary_loss_clip": 0.01305165, "auxiliary_loss_mlp": 0.01027742, "balance_loss_clip": 1.05695605, "balance_loss_mlp": 1.01972497, "epoch": 0.2728311188600974, "flos": 24461666755200.0, "grad_norm": 2.4127143825651167, "language_loss": 0.78516936, "learning_rate": 3.412835851833229e-06, "loss": 0.80849844, "num_input_tokens_seen": 48708370, "step": 2269, "time_per_iteration": 2.6964430809020996 }, { "auxiliary_loss_clip": 0.01261727, "auxiliary_loss_mlp": 0.01030723, "balance_loss_clip": 1.06378961, "balance_loss_mlp": 1.02195501, "epoch": 0.2729513617507365, "flos": 30993314757120.0, "grad_norm": 4.566889251546843, "language_loss": 0.78230941, "learning_rate": 3.4122843924734834e-06, "loss": 0.80523396, "num_input_tokens_seen": 48730670, "step": 2270, "time_per_iteration": 2.7146248817443848 }, { "auxiliary_loss_clip": 0.01303149, "auxiliary_loss_mlp": 0.01031736, "balance_loss_clip": 1.05495811, "balance_loss_mlp": 1.02194333, "epoch": 0.2730716046413756, "flos": 19094637421440.0, "grad_norm": 1.8370133314273813, "language_loss": 0.88351017, "learning_rate": 3.411732718871319e-06, "loss": 0.90685904, "num_input_tokens_seen": 48746510, "step": 2271, "time_per_iteration": 2.636880874633789 }, { "auxiliary_loss_clip": 0.01209648, "auxiliary_loss_mlp": 0.01031893, "balance_loss_clip": 1.06534481, "balance_loss_mlp": 1.02411449, "epoch": 0.27319184753201464, "flos": 26944566474240.0, "grad_norm": 1.9148348850873054, "language_loss": 0.78907281, "learning_rate": 3.4111808311104227e-06, "loss": 0.81148815, "num_input_tokens_seen": 48768825, "step": 2272, "time_per_iteration": 3.5168445110321045 }, { "auxiliary_loss_clip": 0.0131471, "auxiliary_loss_mlp": 0.01027018, "balance_loss_clip": 1.05677986, "balance_loss_mlp": 1.01817298, "epoch": 0.27331209042265375, "flos": 31759828012800.0, "grad_norm": 2.0663198542256955, "language_loss": 0.69673967, "learning_rate": 3.410628729274517e-06, "loss": 0.72015691, "num_input_tokens_seen": 48790345, "step": 2273, "time_per_iteration": 2.8559274673461914 }, { "auxiliary_loss_clip": 0.01304721, "auxiliary_loss_mlp": 0.02573047, "balance_loss_clip": 1.05755782, "balance_loss_mlp": 1.00029016, "epoch": 0.27343233331329286, "flos": 25739081107200.0, "grad_norm": 2.2425006950187187, "language_loss": 0.82739151, "learning_rate": 3.4100764134473546e-06, "loss": 0.86616921, "num_input_tokens_seen": 48809630, "step": 2274, "time_per_iteration": 3.6453490257263184 }, { "auxiliary_loss_clip": 0.01212133, "auxiliary_loss_mlp": 0.01030012, "balance_loss_clip": 1.06493711, "balance_loss_mlp": 1.02190626, "epoch": 0.2735525762039319, "flos": 24389414547840.0, "grad_norm": 2.4079408620392106, "language_loss": 0.85147882, "learning_rate": 3.4095238837127215e-06, "loss": 0.87390018, "num_input_tokens_seen": 48828770, "step": 2275, "time_per_iteration": 2.7708542346954346 }, { "auxiliary_loss_clip": 0.01352815, "auxiliary_loss_mlp": 0.01023646, "balance_loss_clip": 1.05451572, "balance_loss_mlp": 1.01553965, "epoch": 0.27367281909457103, "flos": 14465357527680.0, "grad_norm": 6.552964479297912, "language_loss": 0.80012208, "learning_rate": 3.4089711401544355e-06, "loss": 0.82388669, "num_input_tokens_seen": 48846365, "step": 2276, "time_per_iteration": 2.7555465698242188 }, { "auxiliary_loss_clip": 0.01259444, "auxiliary_loss_mlp": 0.0103378, "balance_loss_clip": 1.05923212, "balance_loss_mlp": 1.02514911, "epoch": 0.27379306198521014, "flos": 23476996247040.0, "grad_norm": 2.795441716519085, "language_loss": 0.67652833, "learning_rate": 3.4084181828563486e-06, "loss": 0.69946063, "num_input_tokens_seen": 48863085, "step": 2277, "time_per_iteration": 2.623030662536621 }, { "auxiliary_loss_clip": 0.01400126, "auxiliary_loss_mlp": 0.0103957, "balance_loss_clip": 1.05082119, "balance_loss_mlp": 1.03123116, "epoch": 0.2739133048758492, "flos": 17458152762240.0, "grad_norm": 1.8178333614470847, "language_loss": 0.70790237, "learning_rate": 3.4078650119023428e-06, "loss": 0.73229933, "num_input_tokens_seen": 48881400, "step": 2278, "time_per_iteration": 2.7602438926696777 }, { "auxiliary_loss_clip": 0.01375692, "auxiliary_loss_mlp": 0.01032397, "balance_loss_clip": 1.05107987, "balance_loss_mlp": 1.02329564, "epoch": 0.2740335477664883, "flos": 19273113123840.0, "grad_norm": 4.535963779593559, "language_loss": 0.74466717, "learning_rate": 3.4073116273763337e-06, "loss": 0.76874804, "num_input_tokens_seen": 48895845, "step": 2279, "time_per_iteration": 3.667773723602295 }, { "auxiliary_loss_clip": 0.01309626, "auxiliary_loss_mlp": 0.01036408, "balance_loss_clip": 1.05731463, "balance_loss_mlp": 1.0270853, "epoch": 0.2741537906571274, "flos": 26104723603200.0, "grad_norm": 2.148122388502644, "language_loss": 0.81014013, "learning_rate": 3.40675802936227e-06, "loss": 0.8336004, "num_input_tokens_seen": 48916630, "step": 2280, "time_per_iteration": 2.760514497756958 }, { "auxiliary_loss_clip": 0.01302452, "auxiliary_loss_mlp": 0.01035613, "balance_loss_clip": 1.05764866, "balance_loss_mlp": 1.02717853, "epoch": 0.27427403354776647, "flos": 34164190644480.0, "grad_norm": 2.4742830875087085, "language_loss": 0.71738565, "learning_rate": 3.4062042179441318e-06, "loss": 0.74076623, "num_input_tokens_seen": 48937100, "step": 2281, "time_per_iteration": 2.750361204147339 }, { "auxiliary_loss_clip": 0.01257185, "auxiliary_loss_mlp": 0.01031635, "balance_loss_clip": 1.0621258, "balance_loss_mlp": 1.02323115, "epoch": 0.2743942764384056, "flos": 18766988536320.0, "grad_norm": 2.4317219392131295, "language_loss": 0.8054533, "learning_rate": 3.4056501932059314e-06, "loss": 0.82834148, "num_input_tokens_seen": 48955175, "step": 2282, "time_per_iteration": 2.6658520698547363 }, { "auxiliary_loss_clip": 0.0110398, "auxiliary_loss_mlp": 0.01008248, "balance_loss_clip": 1.02697122, "balance_loss_mlp": 1.00701416, "epoch": 0.2745145193290447, "flos": 64904048058240.0, "grad_norm": 0.7670006561349062, "language_loss": 0.58118641, "learning_rate": 3.405095955231715e-06, "loss": 0.60230869, "num_input_tokens_seen": 49006830, "step": 2283, "time_per_iteration": 3.120814085006714 }, { "auxiliary_loss_clip": 0.01259286, "auxiliary_loss_mlp": 0.01031585, "balance_loss_clip": 1.0567596, "balance_loss_mlp": 1.02352643, "epoch": 0.27463476221968375, "flos": 16136926796160.0, "grad_norm": 2.6055711400925707, "language_loss": 0.94478101, "learning_rate": 3.4045415041055585e-06, "loss": 0.96768975, "num_input_tokens_seen": 49022470, "step": 2284, "time_per_iteration": 3.5337467193603516 }, { "auxiliary_loss_clip": 0.01308847, "auxiliary_loss_mlp": 0.01026607, "balance_loss_clip": 1.05852675, "balance_loss_mlp": 1.01736224, "epoch": 0.27475500511032286, "flos": 10376712213120.0, "grad_norm": 2.275972546818995, "language_loss": 0.77737677, "learning_rate": 3.4039868399115728e-06, "loss": 0.8007313, "num_input_tokens_seen": 49037110, "step": 2285, "time_per_iteration": 2.7079529762268066 }, { "auxiliary_loss_clip": 0.01459151, "auxiliary_loss_mlp": 0.01028782, "balance_loss_clip": 1.058079, "balance_loss_mlp": 1.02051532, "epoch": 0.27487524800096197, "flos": 17311062568320.0, "grad_norm": 1.8024362534546061, "language_loss": 0.80537164, "learning_rate": 3.4034319627339003e-06, "loss": 0.83025098, "num_input_tokens_seen": 49053975, "step": 2286, "time_per_iteration": 2.7310242652893066 }, { "auxiliary_loss_clip": 0.01309945, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.05929422, "balance_loss_mlp": 1.02293825, "epoch": 0.274995490891601, "flos": 27120205002240.0, "grad_norm": 2.787052680496394, "language_loss": 0.69610429, "learning_rate": 3.402876872656715e-06, "loss": 0.71951771, "num_input_tokens_seen": 49072295, "step": 2287, "time_per_iteration": 2.723740339279175 }, { "auxiliary_loss_clip": 0.01305445, "auxiliary_loss_mlp": 0.01028865, "balance_loss_clip": 1.06001043, "balance_loss_mlp": 1.02013922, "epoch": 0.27511573378224013, "flos": 23436093634560.0, "grad_norm": 2.2250891606119563, "language_loss": 0.89689851, "learning_rate": 3.402321569764223e-06, "loss": 0.92024171, "num_input_tokens_seen": 49091600, "step": 2288, "time_per_iteration": 2.705022096633911 }, { "auxiliary_loss_clip": 0.01403404, "auxiliary_loss_mlp": 0.02574682, "balance_loss_clip": 1.05406153, "balance_loss_mlp": 1.00021195, "epoch": 0.2752359766728792, "flos": 16722019434240.0, "grad_norm": 2.2690495251972256, "language_loss": 0.83429074, "learning_rate": 3.4017660541406635e-06, "loss": 0.8740716, "num_input_tokens_seen": 49107665, "step": 2289, "time_per_iteration": 2.8192358016967773 }, { "auxiliary_loss_clip": 0.01225469, "auxiliary_loss_mlp": 0.01034102, "balance_loss_clip": 1.05422103, "balance_loss_mlp": 1.02563787, "epoch": 0.2753562195635183, "flos": 25297738698240.0, "grad_norm": 1.729412416867722, "language_loss": 0.74231815, "learning_rate": 3.4012103258703092e-06, "loss": 0.76491392, "num_input_tokens_seen": 49126420, "step": 2290, "time_per_iteration": 2.691594123840332 }, { "auxiliary_loss_clip": 0.01359011, "auxiliary_loss_mlp": 0.01035597, "balance_loss_clip": 1.05726647, "balance_loss_mlp": 1.02627528, "epoch": 0.2754764624541574, "flos": 27338972785920.0, "grad_norm": 2.291411373900019, "language_loss": 0.82928795, "learning_rate": 3.4006543850374616e-06, "loss": 0.85323405, "num_input_tokens_seen": 49141470, "step": 2291, "time_per_iteration": 2.7444300651550293 }, { "auxiliary_loss_clip": 0.01261731, "auxiliary_loss_mlp": 0.010346, "balance_loss_clip": 1.05927229, "balance_loss_mlp": 1.02623785, "epoch": 0.27559670534479647, "flos": 17238379397760.0, "grad_norm": 1.9279865146350252, "language_loss": 0.74859905, "learning_rate": 3.400098231726458e-06, "loss": 0.77156234, "num_input_tokens_seen": 49158570, "step": 2292, "time_per_iteration": 2.625044584274292 }, { "auxiliary_loss_clip": 0.01361285, "auxiliary_loss_mlp": 0.01035542, "balance_loss_clip": 1.05387044, "balance_loss_mlp": 1.02661312, "epoch": 0.2757169482354356, "flos": 21939085486080.0, "grad_norm": 1.8380433769050215, "language_loss": 0.87242019, "learning_rate": 3.3995418660216657e-06, "loss": 0.89638841, "num_input_tokens_seen": 49176025, "step": 2293, "time_per_iteration": 2.6758062839508057 }, { "auxiliary_loss_clip": 0.01217116, "auxiliary_loss_mlp": 0.01030759, "balance_loss_clip": 1.06385732, "balance_loss_mlp": 1.02186596, "epoch": 0.2758371911260747, "flos": 20850669521280.0, "grad_norm": 3.223407809559123, "language_loss": 0.80401707, "learning_rate": 3.3989852880074848e-06, "loss": 0.82649589, "num_input_tokens_seen": 49197455, "step": 2294, "time_per_iteration": 2.6269705295562744 }, { "auxiliary_loss_clip": 0.01224672, "auxiliary_loss_mlp": 0.01005571, "balance_loss_clip": 1.03851163, "balance_loss_mlp": 1.00430763, "epoch": 0.27595743401671374, "flos": 69269063592960.0, "grad_norm": 0.7387363283164082, "language_loss": 0.60610211, "learning_rate": 3.398428497768348e-06, "loss": 0.62840456, "num_input_tokens_seen": 49262625, "step": 2295, "time_per_iteration": 3.3732237815856934 }, { "auxiliary_loss_clip": 0.0136203, "auxiliary_loss_mlp": 0.01039415, "balance_loss_clip": 1.05175531, "balance_loss_mlp": 1.03123176, "epoch": 0.27607767690735285, "flos": 21215019127680.0, "grad_norm": 2.016646926325579, "language_loss": 0.72628248, "learning_rate": 3.3978714953887205e-06, "loss": 0.75029695, "num_input_tokens_seen": 49282380, "step": 2296, "time_per_iteration": 2.7861533164978027 }, { "auxiliary_loss_clip": 0.01389584, "auxiliary_loss_mlp": 0.01031234, "balance_loss_clip": 1.04646075, "balance_loss_mlp": 1.02306199, "epoch": 0.27619791979799196, "flos": 24825334003200.0, "grad_norm": 2.1314316643615028, "language_loss": 0.86146021, "learning_rate": 3.397314280953098e-06, "loss": 0.8856684, "num_input_tokens_seen": 49303205, "step": 2297, "time_per_iteration": 2.7623417377471924 }, { "auxiliary_loss_clip": 0.01303483, "auxiliary_loss_mlp": 0.01030742, "balance_loss_clip": 1.05599487, "balance_loss_mlp": 1.02287972, "epoch": 0.276318162688631, "flos": 24753548672640.0, "grad_norm": 2.192641948407135, "language_loss": 0.80399495, "learning_rate": 3.3967568545460108e-06, "loss": 0.82733721, "num_input_tokens_seen": 49322745, "step": 2298, "time_per_iteration": 3.685112953186035 }, { "auxiliary_loss_clip": 0.0125449, "auxiliary_loss_mlp": 0.01029564, "balance_loss_clip": 1.05833173, "balance_loss_mlp": 1.02084351, "epoch": 0.27643840557927013, "flos": 18150007599360.0, "grad_norm": 1.9323695049660583, "language_loss": 0.80535918, "learning_rate": 3.3961992162520185e-06, "loss": 0.82819974, "num_input_tokens_seen": 49341370, "step": 2299, "time_per_iteration": 2.6405513286590576 }, { "auxiliary_loss_clip": 0.01255391, "auxiliary_loss_mlp": 0.01032465, "balance_loss_clip": 1.05762959, "balance_loss_mlp": 1.02444804, "epoch": 0.27655864846990924, "flos": 24823933372800.0, "grad_norm": 2.1535239087182534, "language_loss": 0.72019011, "learning_rate": 3.3956413661557156e-06, "loss": 0.7430687, "num_input_tokens_seen": 49361545, "step": 2300, "time_per_iteration": 2.6626012325286865 }, { "auxiliary_loss_clip": 0.01361281, "auxiliary_loss_mlp": 0.01032317, "balance_loss_clip": 1.05319476, "balance_loss_mlp": 1.02356696, "epoch": 0.2766788913605483, "flos": 20266582464000.0, "grad_norm": 2.1986072132945034, "language_loss": 0.66475987, "learning_rate": 3.3950833043417273e-06, "loss": 0.68869591, "num_input_tokens_seen": 49379690, "step": 2301, "time_per_iteration": 3.6959290504455566 }, { "auxiliary_loss_clip": 0.01262723, "auxiliary_loss_mlp": 0.01033516, "balance_loss_clip": 1.06317627, "balance_loss_mlp": 1.02501321, "epoch": 0.2767991342511874, "flos": 21470272151040.0, "grad_norm": 2.42803580789738, "language_loss": 0.73736453, "learning_rate": 3.3945250308947105e-06, "loss": 0.76032686, "num_input_tokens_seen": 49395995, "step": 2302, "time_per_iteration": 2.8495635986328125 }, { "auxiliary_loss_clip": 0.01162888, "auxiliary_loss_mlp": 0.01001522, "balance_loss_clip": 1.03273416, "balance_loss_mlp": 1.00038326, "epoch": 0.2769193771418265, "flos": 66002627571840.0, "grad_norm": 1.2825802560359032, "language_loss": 0.68265045, "learning_rate": 3.3939665458993556e-06, "loss": 0.7042945, "num_input_tokens_seen": 49450415, "step": 2303, "time_per_iteration": 3.1996092796325684 }, { "auxiliary_loss_clip": 0.01361572, "auxiliary_loss_mlp": 0.01034867, "balance_loss_clip": 1.05411601, "balance_loss_mlp": 1.02639771, "epoch": 0.27703962003246557, "flos": 20704441253760.0, "grad_norm": 2.4114730479094764, "language_loss": 0.77016473, "learning_rate": 3.3934078494403843e-06, "loss": 0.79412901, "num_input_tokens_seen": 49469990, "step": 2304, "time_per_iteration": 3.633247137069702 }, { "auxiliary_loss_clip": 0.01419895, "auxiliary_loss_mlp": 0.02577151, "balance_loss_clip": 1.05080283, "balance_loss_mlp": 1.00028372, "epoch": 0.2771598629231047, "flos": 22929897219840.0, "grad_norm": 2.2849158312904625, "language_loss": 0.81152499, "learning_rate": 3.3928489416025495e-06, "loss": 0.85149544, "num_input_tokens_seen": 49490835, "step": 2305, "time_per_iteration": 2.870325803756714 }, { "auxiliary_loss_clip": 0.0130234, "auxiliary_loss_mlp": 0.01033287, "balance_loss_clip": 1.05662942, "balance_loss_mlp": 1.0239892, "epoch": 0.27728010581374374, "flos": 18369457741440.0, "grad_norm": 2.379660543833536, "language_loss": 0.79250735, "learning_rate": 3.392289822470638e-06, "loss": 0.81586361, "num_input_tokens_seen": 49508815, "step": 2306, "time_per_iteration": 2.960207939147949 }, { "auxiliary_loss_clip": 0.01306111, "auxiliary_loss_mlp": 0.01034784, "balance_loss_clip": 1.05655837, "balance_loss_mlp": 1.02596283, "epoch": 0.27740034870438285, "flos": 19427637432960.0, "grad_norm": 2.1727557766948538, "language_loss": 0.75675231, "learning_rate": 3.3917304921294674e-06, "loss": 0.78016126, "num_input_tokens_seen": 49526980, "step": 2307, "time_per_iteration": 2.8224289417266846 }, { "auxiliary_loss_clip": 0.01258817, "auxiliary_loss_mlp": 0.01031764, "balance_loss_clip": 1.05819726, "balance_loss_mlp": 1.02248955, "epoch": 0.27752059159502196, "flos": 21614776565760.0, "grad_norm": 1.5237439060168039, "language_loss": 0.80743128, "learning_rate": 3.3911709506638876e-06, "loss": 0.83033705, "num_input_tokens_seen": 49546290, "step": 2308, "time_per_iteration": 2.633056402206421 }, { "auxiliary_loss_clip": 0.01346405, "auxiliary_loss_mlp": 0.02576368, "balance_loss_clip": 1.05051768, "balance_loss_mlp": 1.00025272, "epoch": 0.277640834485661, "flos": 26608011016320.0, "grad_norm": 3.6449421104696915, "language_loss": 0.80955911, "learning_rate": 3.390611198158781e-06, "loss": 0.84878677, "num_input_tokens_seen": 49564165, "step": 2309, "time_per_iteration": 2.7712318897247314 }, { "auxiliary_loss_clip": 0.0121057, "auxiliary_loss_mlp": 0.01034601, "balance_loss_clip": 1.06045449, "balance_loss_mlp": 1.02553558, "epoch": 0.2777610773763001, "flos": 19492814661120.0, "grad_norm": 2.394708770784709, "language_loss": 0.90106833, "learning_rate": 3.3900512346990612e-06, "loss": 0.92352009, "num_input_tokens_seen": 49580155, "step": 2310, "time_per_iteration": 3.7179393768310547 }, { "auxiliary_loss_clip": 0.01404057, "auxiliary_loss_mlp": 0.01030298, "balance_loss_clip": 1.04620528, "balance_loss_mlp": 1.02119064, "epoch": 0.27788132026693924, "flos": 38290650001920.0, "grad_norm": 1.9441428677055217, "language_loss": 0.65949106, "learning_rate": 3.389491060369674e-06, "loss": 0.68383461, "num_input_tokens_seen": 49605830, "step": 2311, "time_per_iteration": 2.9027559757232666 }, { "auxiliary_loss_clip": 0.01398195, "auxiliary_loss_mlp": 0.01036584, "balance_loss_clip": 1.05253983, "balance_loss_mlp": 1.02786398, "epoch": 0.2780015631575783, "flos": 22382546797440.0, "grad_norm": 2.1910172031858632, "language_loss": 0.90131491, "learning_rate": 3.388930675255598e-06, "loss": 0.9256627, "num_input_tokens_seen": 49625680, "step": 2312, "time_per_iteration": 2.8384487628936768 }, { "auxiliary_loss_clip": 0.01310313, "auxiliary_loss_mlp": 0.01035113, "balance_loss_clip": 1.05699801, "balance_loss_mlp": 1.02591586, "epoch": 0.2781218060482174, "flos": 12203200840320.0, "grad_norm": 2.428165474267086, "language_loss": 0.79657626, "learning_rate": 3.388370079441843e-06, "loss": 0.82003045, "num_input_tokens_seen": 49641195, "step": 2313, "time_per_iteration": 2.6483089923858643 }, { "auxiliary_loss_clip": 0.01354721, "auxiliary_loss_mlp": 0.01037969, "balance_loss_clip": 1.05797541, "balance_loss_mlp": 1.02970743, "epoch": 0.2782420489388565, "flos": 18107632529280.0, "grad_norm": 2.314469058331186, "language_loss": 0.9267109, "learning_rate": 3.3878092730134505e-06, "loss": 0.95063776, "num_input_tokens_seen": 49659180, "step": 2314, "time_per_iteration": 2.7238245010375977 }, { "auxiliary_loss_clip": 0.0125303, "auxiliary_loss_mlp": 0.01031078, "balance_loss_clip": 1.05626953, "balance_loss_mlp": 1.02211344, "epoch": 0.27836229182949557, "flos": 18514752255360.0, "grad_norm": 1.8742101149903125, "language_loss": 0.80758774, "learning_rate": 3.3872482560554947e-06, "loss": 0.83042884, "num_input_tokens_seen": 49677955, "step": 2315, "time_per_iteration": 2.624131441116333 }, { "auxiliary_loss_clip": 0.01164463, "auxiliary_loss_mlp": 0.00999785, "balance_loss_clip": 1.03176749, "balance_loss_mlp": 0.99856913, "epoch": 0.2784825347201347, "flos": 67079230940160.0, "grad_norm": 0.7954572740252815, "language_loss": 0.56912738, "learning_rate": 3.386687028653082e-06, "loss": 0.59076989, "num_input_tokens_seen": 49740800, "step": 2316, "time_per_iteration": 3.293079376220703 }, { "auxiliary_loss_clip": 0.01407443, "auxiliary_loss_mlp": 0.01034271, "balance_loss_clip": 1.05556273, "balance_loss_mlp": 1.0252943, "epoch": 0.2786027776107738, "flos": 22631119891200.0, "grad_norm": 2.3016803554896197, "language_loss": 0.85223299, "learning_rate": 3.386125590891349e-06, "loss": 0.87665009, "num_input_tokens_seen": 49757675, "step": 2317, "time_per_iteration": 2.7876710891723633 }, { "auxiliary_loss_clip": 0.01298577, "auxiliary_loss_mlp": 0.01028252, "balance_loss_clip": 1.05310607, "balance_loss_mlp": 1.01974022, "epoch": 0.27872302050141284, "flos": 15778826156160.0, "grad_norm": 3.4011118587612637, "language_loss": 0.83013904, "learning_rate": 3.3855639428554657e-06, "loss": 0.85340738, "num_input_tokens_seen": 49775205, "step": 2318, "time_per_iteration": 2.671074151992798 }, { "auxiliary_loss_clip": 0.01345252, "auxiliary_loss_mlp": 0.01032531, "balance_loss_clip": 1.05258155, "balance_loss_mlp": 1.02437758, "epoch": 0.27884326339205195, "flos": 22126970551680.0, "grad_norm": 3.497007402191324, "language_loss": 0.80490738, "learning_rate": 3.385002084630635e-06, "loss": 0.82868522, "num_input_tokens_seen": 49794175, "step": 2319, "time_per_iteration": 2.716866970062256 }, { "auxiliary_loss_clip": 0.01261586, "auxiliary_loss_mlp": 0.0103508, "balance_loss_clip": 1.05813766, "balance_loss_mlp": 1.02619934, "epoch": 0.278963506282691, "flos": 20558715776640.0, "grad_norm": 2.4598548283145116, "language_loss": 0.84977657, "learning_rate": 3.384440016302088e-06, "loss": 0.87274325, "num_input_tokens_seen": 49812850, "step": 2320, "time_per_iteration": 2.609208345413208 }, { "auxiliary_loss_clip": 0.01252737, "auxiliary_loss_mlp": 0.0103666, "balance_loss_clip": 1.05728042, "balance_loss_mlp": 1.027511, "epoch": 0.2790837491733301, "flos": 21942928241280.0, "grad_norm": 2.286317959728214, "language_loss": 0.62319183, "learning_rate": 3.3838777379550923e-06, "loss": 0.64608586, "num_input_tokens_seen": 49832295, "step": 2321, "time_per_iteration": 2.721670389175415 }, { "auxiliary_loss_clip": 0.01219754, "auxiliary_loss_mlp": 0.0103904, "balance_loss_clip": 1.05737388, "balance_loss_mlp": 1.03128529, "epoch": 0.27920399206396923, "flos": 26286790665600.0, "grad_norm": 3.0517686408914924, "language_loss": 0.78709197, "learning_rate": 3.383315249674944e-06, "loss": 0.80967993, "num_input_tokens_seen": 49850860, "step": 2322, "time_per_iteration": 2.7195305824279785 }, { "auxiliary_loss_clip": 0.01356609, "auxiliary_loss_mlp": 0.01035896, "balance_loss_clip": 1.05670846, "balance_loss_mlp": 1.02718163, "epoch": 0.2793242349546083, "flos": 25400981364480.0, "grad_norm": 2.139985678189235, "language_loss": 0.86035407, "learning_rate": 3.3827525515469715e-06, "loss": 0.88427913, "num_input_tokens_seen": 49865765, "step": 2323, "time_per_iteration": 2.7428858280181885 }, { "auxiliary_loss_clip": 0.01348653, "auxiliary_loss_mlp": 0.01025979, "balance_loss_clip": 1.05197883, "balance_loss_mlp": 1.01735973, "epoch": 0.2794444778452474, "flos": 20850346298880.0, "grad_norm": 1.9901661670421955, "language_loss": 0.70654619, "learning_rate": 3.3821896436565367e-06, "loss": 0.7302925, "num_input_tokens_seen": 49885425, "step": 2324, "time_per_iteration": 3.6036741733551025 }, { "auxiliary_loss_clip": 0.01259656, "auxiliary_loss_mlp": 0.01033121, "balance_loss_clip": 1.06063962, "balance_loss_mlp": 1.02511573, "epoch": 0.2795647207358865, "flos": 21576244250880.0, "grad_norm": 1.759896654550285, "language_loss": 0.70373416, "learning_rate": 3.381626526089032e-06, "loss": 0.72666192, "num_input_tokens_seen": 49904990, "step": 2325, "time_per_iteration": 2.6391217708587646 }, { "auxiliary_loss_clip": 0.0130509, "auxiliary_loss_mlp": 0.0103079, "balance_loss_clip": 1.05282092, "balance_loss_mlp": 1.02202177, "epoch": 0.27968496362652556, "flos": 21471744608640.0, "grad_norm": 1.9565186118586222, "language_loss": 0.78650677, "learning_rate": 3.3810631989298815e-06, "loss": 0.80986559, "num_input_tokens_seen": 49924600, "step": 2326, "time_per_iteration": 3.573195219039917 }, { "auxiliary_loss_clip": 0.01410541, "auxiliary_loss_mlp": 0.01039027, "balance_loss_clip": 1.05362976, "balance_loss_mlp": 1.02992511, "epoch": 0.2798052065171647, "flos": 23258695340160.0, "grad_norm": 2.5454961563517635, "language_loss": 0.84228992, "learning_rate": 3.3804996622645423e-06, "loss": 0.86678565, "num_input_tokens_seen": 49942600, "step": 2327, "time_per_iteration": 2.7171247005462646 }, { "auxiliary_loss_clip": 0.01205614, "auxiliary_loss_mlp": 0.01036762, "balance_loss_clip": 1.05858064, "balance_loss_mlp": 1.02807164, "epoch": 0.2799254494078038, "flos": 21539328048000.0, "grad_norm": 2.557392275389734, "language_loss": 0.89461386, "learning_rate": 3.3799359161785015e-06, "loss": 0.91703761, "num_input_tokens_seen": 49962250, "step": 2328, "time_per_iteration": 2.7149624824523926 }, { "auxiliary_loss_clip": 0.0125541, "auxiliary_loss_mlp": 0.01034846, "balance_loss_clip": 1.05908549, "balance_loss_mlp": 1.02589357, "epoch": 0.28004569229844284, "flos": 26393912000640.0, "grad_norm": 1.575765000828965, "language_loss": 0.85622519, "learning_rate": 3.3793719607572798e-06, "loss": 0.87912774, "num_input_tokens_seen": 49983215, "step": 2329, "time_per_iteration": 2.6982479095458984 }, { "auxiliary_loss_clip": 0.01295713, "auxiliary_loss_mlp": 0.0103595, "balance_loss_clip": 1.05271423, "balance_loss_mlp": 1.02753329, "epoch": 0.28016593518908195, "flos": 33547676584320.0, "grad_norm": 3.05355724607244, "language_loss": 0.77224147, "learning_rate": 3.378807796086428e-06, "loss": 0.79555809, "num_input_tokens_seen": 50006075, "step": 2330, "time_per_iteration": 2.7917513847351074 }, { "auxiliary_loss_clip": 0.01211686, "auxiliary_loss_mlp": 0.01034843, "balance_loss_clip": 1.06192243, "balance_loss_mlp": 1.02668953, "epoch": 0.28028617807972106, "flos": 15340823712000.0, "grad_norm": 2.5074126165458073, "language_loss": 0.77485156, "learning_rate": 3.37824342225153e-06, "loss": 0.79731679, "num_input_tokens_seen": 50022495, "step": 2331, "time_per_iteration": 3.6638550758361816 }, { "auxiliary_loss_clip": 0.01409096, "auxiliary_loss_mlp": 0.0103385, "balance_loss_clip": 1.05766916, "balance_loss_mlp": 1.02556455, "epoch": 0.2804064209703601, "flos": 25520277409920.0, "grad_norm": 2.1546448683821646, "language_loss": 0.77889991, "learning_rate": 3.3776788393382006e-06, "loss": 0.80332935, "num_input_tokens_seen": 50041975, "step": 2332, "time_per_iteration": 2.812812566757202 }, { "auxiliary_loss_clip": 0.01209542, "auxiliary_loss_mlp": 0.01031885, "balance_loss_clip": 1.06174195, "balance_loss_mlp": 1.02347517, "epoch": 0.2805266638609992, "flos": 29351766280320.0, "grad_norm": 3.7943022488935325, "language_loss": 0.76549011, "learning_rate": 3.3771140474320872e-06, "loss": 0.78790438, "num_input_tokens_seen": 50061925, "step": 2333, "time_per_iteration": 2.6724226474761963 }, { "auxiliary_loss_clip": 0.01362761, "auxiliary_loss_mlp": 0.01032231, "balance_loss_clip": 1.05763173, "balance_loss_mlp": 1.02324283, "epoch": 0.28064690675163834, "flos": 21463735875840.0, "grad_norm": 3.005314974448947, "language_loss": 0.79882467, "learning_rate": 3.3765490466188664e-06, "loss": 0.82277453, "num_input_tokens_seen": 50079325, "step": 2334, "time_per_iteration": 2.69590425491333 }, { "auxiliary_loss_clip": 0.01356357, "auxiliary_loss_mlp": 0.0103719, "balance_loss_clip": 1.05336332, "balance_loss_mlp": 1.02836204, "epoch": 0.2807671496422774, "flos": 20995640812800.0, "grad_norm": 2.2271239243823366, "language_loss": 0.73470414, "learning_rate": 3.3759838369842508e-06, "loss": 0.75863957, "num_input_tokens_seen": 50097400, "step": 2335, "time_per_iteration": 2.684208393096924 }, { "auxiliary_loss_clip": 0.01366047, "auxiliary_loss_mlp": 0.01031833, "balance_loss_clip": 1.06024361, "balance_loss_mlp": 1.02313113, "epoch": 0.2808873925329165, "flos": 21506577822720.0, "grad_norm": 2.06975658292684, "language_loss": 0.72830254, "learning_rate": 3.375418418613981e-06, "loss": 0.75228131, "num_input_tokens_seen": 50116425, "step": 2336, "time_per_iteration": 3.583998441696167 }, { "auxiliary_loss_clip": 0.01310142, "auxiliary_loss_mlp": 0.01037178, "balance_loss_clip": 1.06020069, "balance_loss_mlp": 1.02824295, "epoch": 0.28100763542355556, "flos": 16070815814400.0, "grad_norm": 2.257843173432874, "language_loss": 0.83678031, "learning_rate": 3.374852791593831e-06, "loss": 0.86025351, "num_input_tokens_seen": 50132625, "step": 2337, "time_per_iteration": 2.6838948726654053 }, { "auxiliary_loss_clip": 0.01328723, "auxiliary_loss_mlp": 0.01031246, "balance_loss_clip": 1.0542109, "balance_loss_mlp": 1.02272224, "epoch": 0.28112787831419467, "flos": 19062605468160.0, "grad_norm": 3.4944370699742398, "language_loss": 0.53974295, "learning_rate": 3.374286956009605e-06, "loss": 0.56334263, "num_input_tokens_seen": 50151190, "step": 2338, "time_per_iteration": 2.7462823390960693 }, { "auxiliary_loss_clip": 0.01256665, "auxiliary_loss_mlp": 0.01036326, "balance_loss_clip": 1.06091714, "balance_loss_mlp": 1.02814198, "epoch": 0.2812481212048338, "flos": 12823629482880.0, "grad_norm": 2.7179538726934354, "language_loss": 0.75567663, "learning_rate": 3.3737209119471405e-06, "loss": 0.77860653, "num_input_tokens_seen": 50167700, "step": 2339, "time_per_iteration": 2.6496145725250244 }, { "auxiliary_loss_clip": 0.01263882, "auxiliary_loss_mlp": 0.0103609, "balance_loss_clip": 1.05994201, "balance_loss_mlp": 1.0263803, "epoch": 0.28136836409547283, "flos": 15633064765440.0, "grad_norm": 2.4830097974249385, "language_loss": 0.63886046, "learning_rate": 3.373154659492306e-06, "loss": 0.66186011, "num_input_tokens_seen": 50185840, "step": 2340, "time_per_iteration": 2.660494089126587 }, { "auxiliary_loss_clip": 0.01224447, "auxiliary_loss_mlp": 0.01029482, "balance_loss_clip": 1.05896068, "balance_loss_mlp": 1.02156615, "epoch": 0.28148860698611194, "flos": 19933726106880.0, "grad_norm": 2.311958879624698, "language_loss": 0.8536588, "learning_rate": 3.3725881987310016e-06, "loss": 0.87619805, "num_input_tokens_seen": 50203375, "step": 2341, "time_per_iteration": 2.635190010070801 }, { "auxiliary_loss_clip": 0.01304577, "auxiliary_loss_mlp": 0.01029679, "balance_loss_clip": 1.05497563, "balance_loss_mlp": 1.02102447, "epoch": 0.28160884987675106, "flos": 17457219008640.0, "grad_norm": 2.0073503627286753, "language_loss": 0.87880731, "learning_rate": 3.372021529749159e-06, "loss": 0.90214992, "num_input_tokens_seen": 50222435, "step": 2342, "time_per_iteration": 2.7199554443359375 }, { "auxiliary_loss_clip": 0.01371502, "auxiliary_loss_mlp": 0.01028736, "balance_loss_clip": 1.05550575, "balance_loss_mlp": 1.02043939, "epoch": 0.2817290927673901, "flos": 16834743290880.0, "grad_norm": 2.1129819879504996, "language_loss": 0.92188132, "learning_rate": 3.3714546526327405e-06, "loss": 0.94588369, "num_input_tokens_seen": 50240435, "step": 2343, "time_per_iteration": 2.7650246620178223 }, { "auxiliary_loss_clip": 0.01359067, "auxiliary_loss_mlp": 0.01035228, "balance_loss_clip": 1.05386853, "balance_loss_mlp": 1.02663291, "epoch": 0.2818493356580292, "flos": 15414081500160.0, "grad_norm": 2.056495612434856, "language_loss": 0.88264632, "learning_rate": 3.3708875674677423e-06, "loss": 0.90658921, "num_input_tokens_seen": 50258410, "step": 2344, "time_per_iteration": 2.731826066970825 }, { "auxiliary_loss_clip": 0.01233475, "auxiliary_loss_mlp": 0.01031737, "balance_loss_clip": 1.06134665, "balance_loss_mlp": 1.02305818, "epoch": 0.28196957854866833, "flos": 20412451595520.0, "grad_norm": 2.0971311990259847, "language_loss": 0.83533764, "learning_rate": 3.37032027434019e-06, "loss": 0.85798979, "num_input_tokens_seen": 50277930, "step": 2345, "time_per_iteration": 2.6527647972106934 }, { "auxiliary_loss_clip": 0.01268151, "auxiliary_loss_mlp": 0.01034628, "balance_loss_clip": 1.0587728, "balance_loss_mlp": 1.02426863, "epoch": 0.2820898214393074, "flos": 19973120348160.0, "grad_norm": 207.15586357011608, "language_loss": 0.83190668, "learning_rate": 3.369752773336141e-06, "loss": 0.85493445, "num_input_tokens_seen": 50297410, "step": 2346, "time_per_iteration": 2.6819732189178467 }, { "auxiliary_loss_clip": 0.01307652, "auxiliary_loss_mlp": 0.01033354, "balance_loss_clip": 1.05686867, "balance_loss_mlp": 1.02435422, "epoch": 0.2822100643299465, "flos": 22528308188160.0, "grad_norm": 1.979928458911732, "language_loss": 0.78548181, "learning_rate": 3.3691850645416864e-06, "loss": 0.80889189, "num_input_tokens_seen": 50317120, "step": 2347, "time_per_iteration": 2.680032253265381 }, { "auxiliary_loss_clip": 0.01264289, "auxiliary_loss_mlp": 0.01034928, "balance_loss_clip": 1.05971336, "balance_loss_mlp": 1.02629781, "epoch": 0.2823303072205856, "flos": 11546682007680.0, "grad_norm": 2.6206834110930455, "language_loss": 0.83364558, "learning_rate": 3.368617148042945e-06, "loss": 0.85663772, "num_input_tokens_seen": 50334790, "step": 2348, "time_per_iteration": 2.666668176651001 }, { "auxiliary_loss_clip": 0.01305033, "auxiliary_loss_mlp": 0.01032551, "balance_loss_clip": 1.05292845, "balance_loss_mlp": 1.02354455, "epoch": 0.28245055011122466, "flos": 18259894281600.0, "grad_norm": 1.925541276046068, "language_loss": 0.84518659, "learning_rate": 3.368049023926071e-06, "loss": 0.86856246, "num_input_tokens_seen": 50353785, "step": 2349, "time_per_iteration": 2.654344081878662 }, { "auxiliary_loss_clip": 0.01260321, "auxiliary_loss_mlp": 0.01028092, "balance_loss_clip": 1.06191218, "balance_loss_mlp": 1.01944363, "epoch": 0.2825707930018638, "flos": 24608110504320.0, "grad_norm": 1.554198648494667, "language_loss": 0.83542854, "learning_rate": 3.3674806922772476e-06, "loss": 0.85831273, "num_input_tokens_seen": 50374670, "step": 2350, "time_per_iteration": 2.6809418201446533 }, { "auxiliary_loss_clip": 0.01273901, "auxiliary_loss_mlp": 0.01031755, "balance_loss_clip": 1.05596769, "balance_loss_mlp": 1.02360129, "epoch": 0.28269103589250283, "flos": 25226994862080.0, "grad_norm": 1.7361797599195932, "language_loss": 0.75082511, "learning_rate": 3.3669121531826904e-06, "loss": 0.77388167, "num_input_tokens_seen": 50395650, "step": 2351, "time_per_iteration": 3.707632303237915 }, { "auxiliary_loss_clip": 0.01356351, "auxiliary_loss_mlp": 0.01041806, "balance_loss_clip": 1.06139255, "balance_loss_mlp": 1.03282404, "epoch": 0.28281127878314194, "flos": 19281552819840.0, "grad_norm": 2.1884511172061925, "language_loss": 0.83159912, "learning_rate": 3.366343406728647e-06, "loss": 0.85558057, "num_input_tokens_seen": 50415100, "step": 2352, "time_per_iteration": 2.685370683670044 }, { "auxiliary_loss_clip": 0.01253362, "auxiliary_loss_mlp": 0.01034276, "balance_loss_clip": 1.05531955, "balance_loss_mlp": 1.02541256, "epoch": 0.28293152167378105, "flos": 23878405710720.0, "grad_norm": 2.2497039379319173, "language_loss": 0.68605727, "learning_rate": 3.3657744530013946e-06, "loss": 0.70893365, "num_input_tokens_seen": 50434335, "step": 2353, "time_per_iteration": 3.537548303604126 }, { "auxiliary_loss_clip": 0.01264944, "auxiliary_loss_mlp": 0.01038289, "balance_loss_clip": 1.06162953, "balance_loss_mlp": 1.02986097, "epoch": 0.2830517645644201, "flos": 43866965928960.0, "grad_norm": 2.28629676460402, "language_loss": 0.71539569, "learning_rate": 3.3652052920872437e-06, "loss": 0.738428, "num_input_tokens_seen": 50457200, "step": 2354, "time_per_iteration": 2.845522880554199 }, { "auxiliary_loss_clip": 0.01310438, "auxiliary_loss_mlp": 0.01028966, "balance_loss_clip": 1.05853653, "balance_loss_mlp": 1.0205977, "epoch": 0.2831720074550592, "flos": 26651750803200.0, "grad_norm": 2.061811263023273, "language_loss": 0.85616267, "learning_rate": 3.3646359240725355e-06, "loss": 0.87955666, "num_input_tokens_seen": 50476390, "step": 2355, "time_per_iteration": 2.688249111175537 }, { "auxiliary_loss_clip": 0.01260225, "auxiliary_loss_mlp": 0.02575047, "balance_loss_clip": 1.06037068, "balance_loss_mlp": 1.00029922, "epoch": 0.2832922503456983, "flos": 31029979564800.0, "grad_norm": 2.2623496620773973, "language_loss": 0.67490947, "learning_rate": 3.364066349043643e-06, "loss": 0.71326214, "num_input_tokens_seen": 50497595, "step": 2356, "time_per_iteration": 3.640286445617676 }, { "auxiliary_loss_clip": 0.01305896, "auxiliary_loss_mlp": 0.0102891, "balance_loss_clip": 1.05691886, "balance_loss_mlp": 1.02064848, "epoch": 0.2834124932363374, "flos": 20405699838720.0, "grad_norm": 1.6873160645560967, "language_loss": 0.82143593, "learning_rate": 3.363496567086969e-06, "loss": 0.84478396, "num_input_tokens_seen": 50514690, "step": 2357, "time_per_iteration": 2.6608974933624268 }, { "auxiliary_loss_clip": 0.01211635, "auxiliary_loss_mlp": 0.01029738, "balance_loss_clip": 1.06339359, "balance_loss_mlp": 1.02142954, "epoch": 0.2835327361269765, "flos": 39384848056320.0, "grad_norm": 2.01952887933366, "language_loss": 0.76015496, "learning_rate": 3.3629265782889506e-06, "loss": 0.78256869, "num_input_tokens_seen": 50536515, "step": 2358, "time_per_iteration": 2.7880594730377197 }, { "auxiliary_loss_clip": 0.01353771, "auxiliary_loss_mlp": 0.01033769, "balance_loss_clip": 1.05313921, "balance_loss_mlp": 1.02540386, "epoch": 0.2836529790176156, "flos": 30261598801920.0, "grad_norm": 2.3759832690772273, "language_loss": 0.72106922, "learning_rate": 3.362356382736054e-06, "loss": 0.74494469, "num_input_tokens_seen": 50557120, "step": 2359, "time_per_iteration": 2.774721384048462 }, { "auxiliary_loss_clip": 0.01270986, "auxiliary_loss_mlp": 0.01028556, "balance_loss_clip": 1.05237794, "balance_loss_mlp": 1.02022934, "epoch": 0.28377322190825466, "flos": 12677796264960.0, "grad_norm": 2.9278535885640187, "language_loss": 0.90567487, "learning_rate": 3.361785980514777e-06, "loss": 0.92867029, "num_input_tokens_seen": 50573320, "step": 2360, "time_per_iteration": 2.7116382122039795 }, { "auxiliary_loss_clip": 0.0145461, "auxiliary_loss_mlp": 0.0103851, "balance_loss_clip": 1.05715036, "balance_loss_mlp": 1.02909255, "epoch": 0.28389346479889377, "flos": 18296666830080.0, "grad_norm": 2.0317083771438935, "language_loss": 0.76494479, "learning_rate": 3.361215371711649e-06, "loss": 0.78987598, "num_input_tokens_seen": 50592415, "step": 2361, "time_per_iteration": 2.8017146587371826 }, { "auxiliary_loss_clip": 0.01347486, "auxiliary_loss_mlp": 0.01027308, "balance_loss_clip": 1.05471969, "balance_loss_mlp": 1.01906443, "epoch": 0.2840137076895329, "flos": 20406992728320.0, "grad_norm": 3.662069296918815, "language_loss": 0.83143044, "learning_rate": 3.3606445564132326e-06, "loss": 0.85517836, "num_input_tokens_seen": 50609710, "step": 2362, "time_per_iteration": 3.669661521911621 }, { "auxiliary_loss_clip": 0.01211894, "auxiliary_loss_mlp": 0.02571786, "balance_loss_clip": 1.0636127, "balance_loss_mlp": 1.00019491, "epoch": 0.28413395058017193, "flos": 20048030161920.0, "grad_norm": 2.3548569319525687, "language_loss": 0.81973541, "learning_rate": 3.360073534706118e-06, "loss": 0.8575722, "num_input_tokens_seen": 50626865, "step": 2363, "time_per_iteration": 2.625143527984619 }, { "auxiliary_loss_clip": 0.01313099, "auxiliary_loss_mlp": 0.01042116, "balance_loss_clip": 1.06110239, "balance_loss_mlp": 1.03316879, "epoch": 0.28425419347081105, "flos": 37663613256960.0, "grad_norm": 2.277143994979008, "language_loss": 0.76396263, "learning_rate": 3.35950230667693e-06, "loss": 0.78751481, "num_input_tokens_seen": 50648560, "step": 2364, "time_per_iteration": 2.826960325241089 }, { "auxiliary_loss_clip": 0.01262126, "auxiliary_loss_mlp": 0.01030701, "balance_loss_clip": 1.05997765, "balance_loss_mlp": 1.02282763, "epoch": 0.28437443636145016, "flos": 13845072539520.0, "grad_norm": 2.251863704565061, "language_loss": 0.86404681, "learning_rate": 3.358930872412323e-06, "loss": 0.88697505, "num_input_tokens_seen": 50665725, "step": 2365, "time_per_iteration": 2.618797779083252 }, { "auxiliary_loss_clip": 0.01259107, "auxiliary_loss_mlp": 0.01031576, "balance_loss_clip": 1.06147075, "balance_loss_mlp": 1.02328515, "epoch": 0.2844946792520892, "flos": 22747794243840.0, "grad_norm": 1.7900463829841267, "language_loss": 0.81065035, "learning_rate": 3.3583592319989825e-06, "loss": 0.83355719, "num_input_tokens_seen": 50685095, "step": 2366, "time_per_iteration": 2.6055939197540283 }, { "auxiliary_loss_clip": 0.01270014, "auxiliary_loss_mlp": 0.01031849, "balance_loss_clip": 1.06261873, "balance_loss_mlp": 1.0231111, "epoch": 0.2846149221427283, "flos": 32415987709440.0, "grad_norm": 2.1612976724064503, "language_loss": 0.68862122, "learning_rate": 3.357787385523627e-06, "loss": 0.71163988, "num_input_tokens_seen": 50706500, "step": 2367, "time_per_iteration": 2.803755521774292 }, { "auxiliary_loss_clip": 0.01461903, "auxiliary_loss_mlp": 0.01029467, "balance_loss_clip": 1.05118942, "balance_loss_mlp": 1.02070475, "epoch": 0.2847351650333674, "flos": 28475976873600.0, "grad_norm": 3.756720248948288, "language_loss": 0.82468861, "learning_rate": 3.3572153330730048e-06, "loss": 0.84960234, "num_input_tokens_seen": 50727595, "step": 2368, "time_per_iteration": 2.909616708755493 }, { "auxiliary_loss_clip": 0.01270167, "auxiliary_loss_mlp": 0.0099973, "balance_loss_clip": 1.03343463, "balance_loss_mlp": 0.99841255, "epoch": 0.2848554079240065, "flos": 55753399704960.0, "grad_norm": 1.707263767815247, "language_loss": 0.64647889, "learning_rate": 3.3566430747338956e-06, "loss": 0.66917789, "num_input_tokens_seen": 50782800, "step": 2369, "time_per_iteration": 3.2818734645843506 }, { "auxiliary_loss_clip": 0.01260064, "auxiliary_loss_mlp": 0.01033179, "balance_loss_clip": 1.05754399, "balance_loss_mlp": 1.02373815, "epoch": 0.2849756508146456, "flos": 11836875985920.0, "grad_norm": 2.1115862046720064, "language_loss": 0.8691352, "learning_rate": 3.35607061059311e-06, "loss": 0.89206767, "num_input_tokens_seen": 50797730, "step": 2370, "time_per_iteration": 2.690619707107544 }, { "auxiliary_loss_clip": 0.0121048, "auxiliary_loss_mlp": 0.01030834, "balance_loss_clip": 1.0635438, "balance_loss_mlp": 1.02238238, "epoch": 0.28509589370528465, "flos": 25155209531520.0, "grad_norm": 3.154948941316112, "language_loss": 0.75024807, "learning_rate": 3.3554979407374917e-06, "loss": 0.77266133, "num_input_tokens_seen": 50819840, "step": 2371, "time_per_iteration": 2.6628963947296143 }, { "auxiliary_loss_clip": 0.01259944, "auxiliary_loss_mlp": 0.01029477, "balance_loss_clip": 1.05869114, "balance_loss_mlp": 1.02143085, "epoch": 0.28521613659592376, "flos": 19974808287360.0, "grad_norm": 1.8169060607204155, "language_loss": 0.7394588, "learning_rate": 3.3549250652539134e-06, "loss": 0.76235294, "num_input_tokens_seen": 50838935, "step": 2372, "time_per_iteration": 2.6996805667877197 }, { "auxiliary_loss_clip": 0.01308177, "auxiliary_loss_mlp": 0.01033386, "balance_loss_clip": 1.0544796, "balance_loss_mlp": 1.02387333, "epoch": 0.2853363794865629, "flos": 23367971491200.0, "grad_norm": 1.897202019720728, "language_loss": 0.81528211, "learning_rate": 3.3543519842292794e-06, "loss": 0.83869773, "num_input_tokens_seen": 50858590, "step": 2373, "time_per_iteration": 2.688646078109741 }, { "auxiliary_loss_clip": 0.01212356, "auxiliary_loss_mlp": 0.02574624, "balance_loss_clip": 1.06385255, "balance_loss_mlp": 1.00024021, "epoch": 0.28545662237720193, "flos": 19861940776320.0, "grad_norm": 3.595441459188916, "language_loss": 0.83491933, "learning_rate": 3.353778697750527e-06, "loss": 0.87278914, "num_input_tokens_seen": 50876995, "step": 2374, "time_per_iteration": 2.6900582313537598 }, { "auxiliary_loss_clip": 0.0130175, "auxiliary_loss_mlp": 0.01028975, "balance_loss_clip": 1.05490732, "balance_loss_mlp": 1.02042723, "epoch": 0.28557686526784104, "flos": 23879016241920.0, "grad_norm": 5.455733538389054, "language_loss": 0.89568418, "learning_rate": 3.353205205904622e-06, "loss": 0.91899145, "num_input_tokens_seen": 50896105, "step": 2375, "time_per_iteration": 2.7007546424865723 }, { "auxiliary_loss_clip": 0.01303484, "auxiliary_loss_mlp": 0.01033649, "balance_loss_clip": 1.05763245, "balance_loss_mlp": 1.02474439, "epoch": 0.28569710815848015, "flos": 44890384233600.0, "grad_norm": 1.9745515813161378, "language_loss": 0.71873832, "learning_rate": 3.3526315087785637e-06, "loss": 0.74210954, "num_input_tokens_seen": 50917220, "step": 2376, "time_per_iteration": 2.8914403915405273 }, { "auxiliary_loss_clip": 0.01392683, "auxiliary_loss_mlp": 0.01034391, "balance_loss_clip": 1.0530982, "balance_loss_mlp": 1.02595675, "epoch": 0.2858173510491192, "flos": 26829759628800.0, "grad_norm": 1.9256781989474228, "language_loss": 0.8103832, "learning_rate": 3.3520576064593805e-06, "loss": 0.83465385, "num_input_tokens_seen": 50937175, "step": 2377, "time_per_iteration": 3.6553120613098145 }, { "auxiliary_loss_clip": 0.0126304, "auxiliary_loss_mlp": 0.01036505, "balance_loss_clip": 1.0605613, "balance_loss_mlp": 1.02798748, "epoch": 0.2859375939397583, "flos": 23148916398720.0, "grad_norm": 1.6546307272636671, "language_loss": 0.81805903, "learning_rate": 3.3514834990341337e-06, "loss": 0.8410545, "num_input_tokens_seen": 50957500, "step": 2378, "time_per_iteration": 2.6668803691864014 }, { "auxiliary_loss_clip": 0.0131261, "auxiliary_loss_mlp": 0.01033564, "balance_loss_clip": 1.05759442, "balance_loss_mlp": 1.02528524, "epoch": 0.2860578368303974, "flos": 12129799397760.0, "grad_norm": 2.926940839876949, "language_loss": 0.9271307, "learning_rate": 3.3509091865899144e-06, "loss": 0.95059246, "num_input_tokens_seen": 50972690, "step": 2379, "time_per_iteration": 3.524513006210327 }, { "auxiliary_loss_clip": 0.01210681, "auxiliary_loss_mlp": 0.01034632, "balance_loss_clip": 1.06133366, "balance_loss_mlp": 1.02554846, "epoch": 0.2861780797210365, "flos": 19938035738880.0, "grad_norm": 1.813035788219474, "language_loss": 0.70472074, "learning_rate": 3.350334669213846e-06, "loss": 0.72717386, "num_input_tokens_seen": 50990095, "step": 2380, "time_per_iteration": 2.6124229431152344 }, { "auxiliary_loss_clip": 0.01259296, "auxiliary_loss_mlp": 0.01035051, "balance_loss_clip": 1.06303334, "balance_loss_mlp": 1.02726126, "epoch": 0.2862983226116756, "flos": 27563127609600.0, "grad_norm": 3.7202335314661665, "language_loss": 0.75558496, "learning_rate": 3.3497599469930816e-06, "loss": 0.77852839, "num_input_tokens_seen": 51008305, "step": 2381, "time_per_iteration": 2.6928834915161133 }, { "auxiliary_loss_clip": 0.01207298, "auxiliary_loss_mlp": 0.01033607, "balance_loss_clip": 1.05916739, "balance_loss_mlp": 1.02513075, "epoch": 0.28641856550231465, "flos": 22053964158720.0, "grad_norm": 2.541910336602393, "language_loss": 0.83269465, "learning_rate": 3.349185020014807e-06, "loss": 0.85510367, "num_input_tokens_seen": 51025570, "step": 2382, "time_per_iteration": 2.6738297939300537 }, { "auxiliary_loss_clip": 0.01260768, "auxiliary_loss_mlp": 0.01035973, "balance_loss_clip": 1.05834126, "balance_loss_mlp": 1.02694893, "epoch": 0.28653880839295376, "flos": 22378775869440.0, "grad_norm": 1.8744138453808687, "language_loss": 0.74899179, "learning_rate": 3.348609888366237e-06, "loss": 0.77195925, "num_input_tokens_seen": 51044585, "step": 2383, "time_per_iteration": 3.6198854446411133 }, { "auxiliary_loss_clip": 0.01454885, "auxiliary_loss_mlp": 0.01032316, "balance_loss_clip": 1.05268824, "balance_loss_mlp": 1.0242995, "epoch": 0.28665905128359287, "flos": 23367971491200.0, "grad_norm": 2.0969070526246494, "language_loss": 0.62688816, "learning_rate": 3.348034552134619e-06, "loss": 0.65176022, "num_input_tokens_seen": 51063990, "step": 2384, "time_per_iteration": 2.865286111831665 }, { "auxiliary_loss_clip": 0.01403091, "auxiliary_loss_mlp": 0.01034169, "balance_loss_clip": 1.05743384, "balance_loss_mlp": 1.02609313, "epoch": 0.2867792941742319, "flos": 20881695893760.0, "grad_norm": 2.7576370384801936, "language_loss": 0.84305668, "learning_rate": 3.3474590114072316e-06, "loss": 0.86742932, "num_input_tokens_seen": 51081990, "step": 2385, "time_per_iteration": 2.777210235595703 }, { "auxiliary_loss_clip": 0.01347508, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.0555563, "balance_loss_mlp": 1.0283637, "epoch": 0.28689953706487104, "flos": 20664005518080.0, "grad_norm": 2.2219577042455394, "language_loss": 0.82996273, "learning_rate": 3.3468832662713836e-06, "loss": 0.85380757, "num_input_tokens_seen": 51100235, "step": 2386, "time_per_iteration": 2.7487494945526123 }, { "auxiliary_loss_clip": 0.01353168, "auxiliary_loss_mlp": 0.01031964, "balance_loss_clip": 1.0576055, "balance_loss_mlp": 1.0232321, "epoch": 0.28701977995551015, "flos": 12675533708160.0, "grad_norm": 2.524083249134626, "language_loss": 0.84134609, "learning_rate": 3.346307316814415e-06, "loss": 0.86519742, "num_input_tokens_seen": 51115405, "step": 2387, "time_per_iteration": 2.6626346111297607 }, { "auxiliary_loss_clip": 0.0125739, "auxiliary_loss_mlp": 0.01032143, "balance_loss_clip": 1.06147504, "balance_loss_mlp": 1.02348232, "epoch": 0.2871400228461492, "flos": 21252366293760.0, "grad_norm": 1.9854695061457712, "language_loss": 0.7589494, "learning_rate": 3.3457311631236965e-06, "loss": 0.78184474, "num_input_tokens_seen": 51136390, "step": 2388, "time_per_iteration": 3.5655486583709717 }, { "auxiliary_loss_clip": 0.01298833, "auxiliary_loss_mlp": 0.0103661, "balance_loss_clip": 1.05470157, "balance_loss_mlp": 1.02771139, "epoch": 0.2872602657367883, "flos": 25119262995840.0, "grad_norm": 1.935403461645632, "language_loss": 0.84262788, "learning_rate": 3.345154805286631e-06, "loss": 0.86598235, "num_input_tokens_seen": 51156650, "step": 2389, "time_per_iteration": 2.7266483306884766 }, { "auxiliary_loss_clip": 0.01253462, "auxiliary_loss_mlp": 0.01026231, "balance_loss_clip": 1.05624461, "balance_loss_mlp": 1.01826763, "epoch": 0.2873805086274274, "flos": 16646606830080.0, "grad_norm": 2.2601901057986913, "language_loss": 0.76521611, "learning_rate": 3.344578243390651e-06, "loss": 0.78801304, "num_input_tokens_seen": 51172210, "step": 2390, "time_per_iteration": 2.628401041030884 }, { "auxiliary_loss_clip": 0.01304331, "auxiliary_loss_mlp": 0.01034489, "balance_loss_clip": 1.05862379, "balance_loss_mlp": 1.0265733, "epoch": 0.2875007515180665, "flos": 17420123237760.0, "grad_norm": 2.256712608582304, "language_loss": 0.78350097, "learning_rate": 3.3440014775232206e-06, "loss": 0.80688918, "num_input_tokens_seen": 51190265, "step": 2391, "time_per_iteration": 2.682675838470459 }, { "auxiliary_loss_clip": 0.01271624, "auxiliary_loss_mlp": 0.01032977, "balance_loss_clip": 1.0566982, "balance_loss_mlp": 1.02525198, "epoch": 0.2876209944087056, "flos": 23434190213760.0, "grad_norm": 2.274558894428822, "language_loss": 0.71503782, "learning_rate": 3.343424507771834e-06, "loss": 0.73808384, "num_input_tokens_seen": 51208475, "step": 2392, "time_per_iteration": 2.722459316253662 }, { "auxiliary_loss_clip": 0.01351691, "auxiliary_loss_mlp": 0.01030223, "balance_loss_clip": 1.05509448, "balance_loss_mlp": 1.02261114, "epoch": 0.2877412372993447, "flos": 13735509079680.0, "grad_norm": 2.0733116085087766, "language_loss": 0.86842895, "learning_rate": 3.342847334224018e-06, "loss": 0.89224815, "num_input_tokens_seen": 51225875, "step": 2393, "time_per_iteration": 2.6888420581817627 }, { "auxiliary_loss_clip": 0.01163217, "auxiliary_loss_mlp": 0.01003986, "balance_loss_clip": 1.03518271, "balance_loss_mlp": 1.0027287, "epoch": 0.28786148018998375, "flos": 58079695104000.0, "grad_norm": 0.9562889300801051, "language_loss": 0.6238997, "learning_rate": 3.342269956967329e-06, "loss": 0.64557171, "num_input_tokens_seen": 51287780, "step": 2394, "time_per_iteration": 3.3532893657684326 }, { "auxiliary_loss_clip": 0.012604, "auxiliary_loss_mlp": 0.01031677, "balance_loss_clip": 1.05892479, "balance_loss_mlp": 1.0225991, "epoch": 0.28798172308062286, "flos": 23435052140160.0, "grad_norm": 2.7244210861774816, "language_loss": 0.71975374, "learning_rate": 3.341692376089355e-06, "loss": 0.74267447, "num_input_tokens_seen": 51303335, "step": 2395, "time_per_iteration": 2.677497386932373 }, { "auxiliary_loss_clip": 0.0125377, "auxiliary_loss_mlp": 0.01026823, "balance_loss_clip": 1.05914521, "balance_loss_mlp": 1.01900911, "epoch": 0.288101965971262, "flos": 25110033200640.0, "grad_norm": 3.1194070683483996, "language_loss": 0.84588116, "learning_rate": 3.3411145916777146e-06, "loss": 0.86868709, "num_input_tokens_seen": 51317495, "step": 2396, "time_per_iteration": 2.7279052734375 }, { "auxiliary_loss_clip": 0.01301978, "auxiliary_loss_mlp": 0.01030855, "balance_loss_clip": 1.05400419, "balance_loss_mlp": 1.02277887, "epoch": 0.28822220886190103, "flos": 16252559654400.0, "grad_norm": 2.327500724067234, "language_loss": 0.91304159, "learning_rate": 3.3405366038200566e-06, "loss": 0.9363699, "num_input_tokens_seen": 51336430, "step": 2397, "time_per_iteration": 2.6768481731414795 }, { "auxiliary_loss_clip": 0.01307516, "auxiliary_loss_mlp": 0.01036862, "balance_loss_clip": 1.0595144, "balance_loss_mlp": 1.02882755, "epoch": 0.28834245175254014, "flos": 24535642815360.0, "grad_norm": 2.8259063526231087, "language_loss": 0.85119009, "learning_rate": 3.3399584126040617e-06, "loss": 0.87463391, "num_input_tokens_seen": 51355930, "step": 2398, "time_per_iteration": 2.800168752670288 }, { "auxiliary_loss_clip": 0.01206942, "auxiliary_loss_mlp": 0.02567821, "balance_loss_clip": 1.05947375, "balance_loss_mlp": 1.0000639, "epoch": 0.2884626946431792, "flos": 24571445696640.0, "grad_norm": 2.1625489360353978, "language_loss": 0.9134798, "learning_rate": 3.339380018117441e-06, "loss": 0.95122743, "num_input_tokens_seen": 51376765, "step": 2399, "time_per_iteration": 2.633810520172119 }, { "auxiliary_loss_clip": 0.01250424, "auxiliary_loss_mlp": 0.0102952, "balance_loss_clip": 1.05866611, "balance_loss_mlp": 1.02130055, "epoch": 0.2885829375338183, "flos": 16544657053440.0, "grad_norm": 2.989422020357337, "language_loss": 0.77830386, "learning_rate": 3.3388014204479366e-06, "loss": 0.80110323, "num_input_tokens_seen": 51394570, "step": 2400, "time_per_iteration": 2.682804822921753 }, { "auxiliary_loss_clip": 0.0120944, "auxiliary_loss_mlp": 0.0103598, "balance_loss_clip": 1.06025994, "balance_loss_mlp": 1.02748668, "epoch": 0.2887031804244574, "flos": 24061226958720.0, "grad_norm": 2.183695888338757, "language_loss": 0.91801858, "learning_rate": 3.338222619683321e-06, "loss": 0.94047284, "num_input_tokens_seen": 51414535, "step": 2401, "time_per_iteration": 2.6036267280578613 }, { "auxiliary_loss_clip": 0.01307859, "auxiliary_loss_mlp": 0.01035453, "balance_loss_clip": 1.05959463, "balance_loss_mlp": 1.02647018, "epoch": 0.2888234233150965, "flos": 23330696152320.0, "grad_norm": 2.7442425722314154, "language_loss": 0.73687589, "learning_rate": 3.337643615911398e-06, "loss": 0.76030898, "num_input_tokens_seen": 51434160, "step": 2402, "time_per_iteration": 2.7123374938964844 }, { "auxiliary_loss_clip": 0.01256941, "auxiliary_loss_mlp": 0.01033596, "balance_loss_clip": 1.05711436, "balance_loss_mlp": 1.02516818, "epoch": 0.2889436662057356, "flos": 22272767856000.0, "grad_norm": 2.385954787163811, "language_loss": 0.79055029, "learning_rate": 3.3370644092200026e-06, "loss": 0.8134557, "num_input_tokens_seen": 51451435, "step": 2403, "time_per_iteration": 3.621412754058838 }, { "auxiliary_loss_clip": 0.01343834, "auxiliary_loss_mlp": 0.01031211, "balance_loss_clip": 1.04866958, "balance_loss_mlp": 1.02266419, "epoch": 0.2890639090963747, "flos": 21616931381760.0, "grad_norm": 2.6476164565864417, "language_loss": 0.78391576, "learning_rate": 3.3364849996969985e-06, "loss": 0.80766618, "num_input_tokens_seen": 51471455, "step": 2404, "time_per_iteration": 2.7525649070739746 }, { "auxiliary_loss_clip": 0.01250737, "auxiliary_loss_mlp": 0.01032179, "balance_loss_clip": 1.05620909, "balance_loss_mlp": 1.02400088, "epoch": 0.28918415198701375, "flos": 28585540333440.0, "grad_norm": 2.349834334502594, "language_loss": 0.8526274, "learning_rate": 3.335905387430283e-06, "loss": 0.87545657, "num_input_tokens_seen": 51492890, "step": 2405, "time_per_iteration": 3.736158847808838 }, { "auxiliary_loss_clip": 0.01311459, "auxiliary_loss_mlp": 0.01035642, "balance_loss_clip": 1.05561781, "balance_loss_mlp": 1.0279175, "epoch": 0.28930439487765286, "flos": 21944688007680.0, "grad_norm": 2.5435120340794826, "language_loss": 0.82905412, "learning_rate": 3.335325572507782e-06, "loss": 0.85252517, "num_input_tokens_seen": 51513390, "step": 2406, "time_per_iteration": 2.6724612712860107 }, { "auxiliary_loss_clip": 0.01208026, "auxiliary_loss_mlp": 0.02569586, "balance_loss_clip": 1.06212628, "balance_loss_mlp": 1.00017536, "epoch": 0.28942463776829197, "flos": 19281911955840.0, "grad_norm": 1.6248628898873405, "language_loss": 0.74211591, "learning_rate": 3.3347455550174537e-06, "loss": 0.77989203, "num_input_tokens_seen": 51532730, "step": 2407, "time_per_iteration": 2.616062641143799 }, { "auxiliary_loss_clip": 0.01351545, "auxiliary_loss_mlp": 0.01029258, "balance_loss_clip": 1.05212402, "balance_loss_mlp": 1.02122331, "epoch": 0.289544880658931, "flos": 14645700737280.0, "grad_norm": 2.1518753140418774, "language_loss": 0.6820091, "learning_rate": 3.3341653350472864e-06, "loss": 0.70581716, "num_input_tokens_seen": 51549560, "step": 2408, "time_per_iteration": 2.7221713066101074 }, { "auxiliary_loss_clip": 0.01214993, "auxiliary_loss_mlp": 0.01032799, "balance_loss_clip": 1.06145084, "balance_loss_mlp": 1.0240196, "epoch": 0.28966512354957014, "flos": 28621881918720.0, "grad_norm": 2.4882509980367034, "language_loss": 0.69239891, "learning_rate": 3.333584912685298e-06, "loss": 0.71487689, "num_input_tokens_seen": 51568180, "step": 2409, "time_per_iteration": 3.7170522212982178 }, { "auxiliary_loss_clip": 0.01173071, "auxiliary_loss_mlp": 0.01007986, "balance_loss_clip": 1.03722954, "balance_loss_mlp": 1.00676382, "epoch": 0.28978536644020925, "flos": 64711784511360.0, "grad_norm": 0.9119556327156105, "language_loss": 0.55592555, "learning_rate": 3.3330042880195385e-06, "loss": 0.57773614, "num_input_tokens_seen": 51622530, "step": 2410, "time_per_iteration": 3.3272042274475098 }, { "auxiliary_loss_clip": 0.01306234, "auxiliary_loss_mlp": 0.01028455, "balance_loss_clip": 1.05594039, "balance_loss_mlp": 1.02102244, "epoch": 0.2899056093308483, "flos": 18624638937600.0, "grad_norm": 1.9099424653121524, "language_loss": 0.78462094, "learning_rate": 3.3324234611380888e-06, "loss": 0.8079679, "num_input_tokens_seen": 51641260, "step": 2411, "time_per_iteration": 2.68977952003479 }, { "auxiliary_loss_clip": 0.01351504, "auxiliary_loss_mlp": 0.01031015, "balance_loss_clip": 1.05662751, "balance_loss_mlp": 1.02329063, "epoch": 0.2900258522214874, "flos": 22893735202560.0, "grad_norm": 1.869506455980278, "language_loss": 0.81522155, "learning_rate": 3.3318424321290596e-06, "loss": 0.83904672, "num_input_tokens_seen": 51660975, "step": 2412, "time_per_iteration": 2.876106023788452 }, { "auxiliary_loss_clip": 0.01267726, "auxiliary_loss_mlp": 0.01002062, "balance_loss_clip": 1.03589785, "balance_loss_mlp": 1.00097084, "epoch": 0.2901460951121265, "flos": 71106036013440.0, "grad_norm": 0.8359375878246211, "language_loss": 0.59887516, "learning_rate": 3.3312612010805917e-06, "loss": 0.62157303, "num_input_tokens_seen": 51720550, "step": 2413, "time_per_iteration": 4.2374587059021 }, { "auxiliary_loss_clip": 0.01296039, "auxiliary_loss_mlp": 0.01036621, "balance_loss_clip": 1.05311751, "balance_loss_mlp": 1.02772784, "epoch": 0.2902663380027656, "flos": 32160986081280.0, "grad_norm": 1.63221163937246, "language_loss": 0.6993258, "learning_rate": 3.330679768080858e-06, "loss": 0.72265244, "num_input_tokens_seen": 51744435, "step": 2414, "time_per_iteration": 2.755849838256836 }, { "auxiliary_loss_clip": 0.01254559, "auxiliary_loss_mlp": 0.01031678, "balance_loss_clip": 1.05999231, "balance_loss_mlp": 1.02385187, "epoch": 0.2903865808934047, "flos": 29351658539520.0, "grad_norm": 2.163055314071408, "language_loss": 0.83366537, "learning_rate": 3.3300981332180627e-06, "loss": 0.85652775, "num_input_tokens_seen": 51763640, "step": 2415, "time_per_iteration": 2.7158291339874268 }, { "auxiliary_loss_clip": 0.01270838, "auxiliary_loss_mlp": 0.01030098, "balance_loss_clip": 1.05453253, "balance_loss_mlp": 1.02206945, "epoch": 0.29050682378404374, "flos": 17089026647040.0, "grad_norm": 5.231211085944058, "language_loss": 0.80469882, "learning_rate": 3.3295162965804373e-06, "loss": 0.82770818, "num_input_tokens_seen": 51782135, "step": 2416, "time_per_iteration": 2.748145818710327 }, { "auxiliary_loss_clip": 0.01351335, "auxiliary_loss_mlp": 0.01029746, "balance_loss_clip": 1.05564749, "balance_loss_mlp": 1.02238178, "epoch": 0.29062706667468285, "flos": 17858233422720.0, "grad_norm": 2.5712714367722693, "language_loss": 0.78742975, "learning_rate": 3.328934258256247e-06, "loss": 0.81124055, "num_input_tokens_seen": 51800200, "step": 2417, "time_per_iteration": 2.660404682159424 }, { "auxiliary_loss_clip": 0.01253389, "auxiliary_loss_mlp": 0.01029307, "balance_loss_clip": 1.05529714, "balance_loss_mlp": 1.02090263, "epoch": 0.29074730956532197, "flos": 24279815174400.0, "grad_norm": 2.1448073749761094, "language_loss": 0.67352986, "learning_rate": 3.3283520183337856e-06, "loss": 0.69635689, "num_input_tokens_seen": 51819905, "step": 2418, "time_per_iteration": 2.689288854598999 }, { "auxiliary_loss_clip": 0.01300763, "auxiliary_loss_mlp": 0.01033053, "balance_loss_clip": 1.05520678, "balance_loss_mlp": 1.02537632, "epoch": 0.290867552455961, "flos": 22340961826560.0, "grad_norm": 3.273820632962703, "language_loss": 0.69004464, "learning_rate": 3.3277695769013797e-06, "loss": 0.71338272, "num_input_tokens_seen": 51839350, "step": 2419, "time_per_iteration": 2.653068780899048 }, { "auxiliary_loss_clip": 0.01254279, "auxiliary_loss_mlp": 0.01032847, "balance_loss_clip": 1.05846548, "balance_loss_mlp": 1.02435923, "epoch": 0.29098779534660013, "flos": 23186155824000.0, "grad_norm": 2.186299594273936, "language_loss": 0.77552795, "learning_rate": 3.327186934047385e-06, "loss": 0.79839915, "num_input_tokens_seen": 51858045, "step": 2420, "time_per_iteration": 2.6707561016082764 }, { "auxiliary_loss_clip": 0.01294903, "auxiliary_loss_mlp": 0.01027991, "balance_loss_clip": 1.04833269, "balance_loss_mlp": 1.02010274, "epoch": 0.29110803823723924, "flos": 15304194817920.0, "grad_norm": 2.1981556299533125, "language_loss": 0.65724701, "learning_rate": 3.3266040898601877e-06, "loss": 0.68047595, "num_input_tokens_seen": 51875880, "step": 2421, "time_per_iteration": 2.651920795440674 }, { "auxiliary_loss_clip": 0.01313774, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.05054164, "balance_loss_mlp": 1.02912748, "epoch": 0.2912282811278783, "flos": 22595352923520.0, "grad_norm": 2.20870533271388, "language_loss": 0.78191763, "learning_rate": 3.3260210444282045e-06, "loss": 0.80543244, "num_input_tokens_seen": 51893835, "step": 2422, "time_per_iteration": 2.7745282649993896 }, { "auxiliary_loss_clip": 0.01254838, "auxiliary_loss_mlp": 0.01032051, "balance_loss_clip": 1.05926752, "balance_loss_mlp": 1.0240401, "epoch": 0.2913485240185174, "flos": 24497900599680.0, "grad_norm": 2.4012708744566376, "language_loss": 0.73402047, "learning_rate": 3.325437797839883e-06, "loss": 0.75688934, "num_input_tokens_seen": 51912205, "step": 2423, "time_per_iteration": 2.629438877105713 }, { "auxiliary_loss_clip": 0.0120481, "auxiliary_loss_mlp": 0.01031156, "balance_loss_clip": 1.0579977, "balance_loss_mlp": 1.02333546, "epoch": 0.2914687669091565, "flos": 17931024334080.0, "grad_norm": 2.406913145609172, "language_loss": 0.75208116, "learning_rate": 3.3248543501837015e-06, "loss": 0.77444077, "num_input_tokens_seen": 51929410, "step": 2424, "time_per_iteration": 2.616438388824463 }, { "auxiliary_loss_clip": 0.01321202, "auxiliary_loss_mlp": 0.01031305, "balance_loss_clip": 1.05708981, "balance_loss_mlp": 1.02345562, "epoch": 0.2915890097997956, "flos": 22529313768960.0, "grad_norm": 2.8621430404638675, "language_loss": 0.77269328, "learning_rate": 3.3242707015481684e-06, "loss": 0.7962184, "num_input_tokens_seen": 51949345, "step": 2425, "time_per_iteration": 2.7535741329193115 }, { "auxiliary_loss_clip": 0.0130298, "auxiliary_loss_mlp": 0.01031506, "balance_loss_clip": 1.0521909, "balance_loss_mlp": 1.02364993, "epoch": 0.2917092526904347, "flos": 13845216193920.0, "grad_norm": 3.976127153118563, "language_loss": 0.812105, "learning_rate": 3.323686852021823e-06, "loss": 0.83544993, "num_input_tokens_seen": 51966855, "step": 2426, "time_per_iteration": 2.805311679840088 }, { "auxiliary_loss_clip": 0.01355602, "auxiliary_loss_mlp": 0.01025635, "balance_loss_clip": 1.0496192, "balance_loss_mlp": 1.0181551, "epoch": 0.2918294955810738, "flos": 22674859678080.0, "grad_norm": 2.6977594989689053, "language_loss": 0.80201054, "learning_rate": 3.323102801693235e-06, "loss": 0.82582295, "num_input_tokens_seen": 51985620, "step": 2427, "time_per_iteration": 2.7254743576049805 }, { "auxiliary_loss_clip": 0.01250341, "auxiliary_loss_mlp": 0.01029683, "balance_loss_clip": 1.05422735, "balance_loss_mlp": 1.0210464, "epoch": 0.29194973847171285, "flos": 23438284364160.0, "grad_norm": 11.310680429474695, "language_loss": 0.80773312, "learning_rate": 3.322518550651003e-06, "loss": 0.83053333, "num_input_tokens_seen": 52004930, "step": 2428, "time_per_iteration": 2.684202194213867 }, { "auxiliary_loss_clip": 0.01309655, "auxiliary_loss_mlp": 0.01032411, "balance_loss_clip": 1.05501306, "balance_loss_mlp": 1.02461433, "epoch": 0.29206998136235196, "flos": 21909064694400.0, "grad_norm": 1.7061678209372848, "language_loss": 0.81243908, "learning_rate": 3.3219340989837586e-06, "loss": 0.83585978, "num_input_tokens_seen": 52024920, "step": 2429, "time_per_iteration": 3.579921245574951 }, { "auxiliary_loss_clip": 0.01304012, "auxiliary_loss_mlp": 0.01031041, "balance_loss_clip": 1.05576372, "balance_loss_mlp": 1.02371836, "epoch": 0.292190224252991, "flos": 23215925220480.0, "grad_norm": 1.916619280491183, "language_loss": 0.80449361, "learning_rate": 3.3213494467801625e-06, "loss": 0.82784414, "num_input_tokens_seen": 52044095, "step": 2430, "time_per_iteration": 3.5244944095611572 }, { "auxiliary_loss_clip": 0.01411851, "auxiliary_loss_mlp": 0.01028277, "balance_loss_clip": 1.04508066, "balance_loss_mlp": 1.019557, "epoch": 0.2923104671436301, "flos": 20740818752640.0, "grad_norm": 2.587714043096797, "language_loss": 0.71406895, "learning_rate": 3.3207645941289063e-06, "loss": 0.73847026, "num_input_tokens_seen": 52062440, "step": 2431, "time_per_iteration": 2.9059460163116455 }, { "auxiliary_loss_clip": 0.01252332, "auxiliary_loss_mlp": 0.02568644, "balance_loss_clip": 1.05757523, "balance_loss_mlp": 1.00003886, "epoch": 0.29243071003426924, "flos": 35809114999680.0, "grad_norm": 2.346601556102325, "language_loss": 0.80270302, "learning_rate": 3.320179541118711e-06, "loss": 0.84091276, "num_input_tokens_seen": 52084940, "step": 2432, "time_per_iteration": 2.862241744995117 }, { "auxiliary_loss_clip": 0.01075381, "auxiliary_loss_mlp": 0.01006319, "balance_loss_clip": 1.03682065, "balance_loss_mlp": 1.00507879, "epoch": 0.2925509529249083, "flos": 58081598524800.0, "grad_norm": 1.0194224122888937, "language_loss": 0.60326058, "learning_rate": 3.3195942878383293e-06, "loss": 0.6240775, "num_input_tokens_seen": 52141040, "step": 2433, "time_per_iteration": 3.2145872116088867 }, { "auxiliary_loss_clip": 0.01253266, "auxiliary_loss_mlp": 0.01025052, "balance_loss_clip": 1.05741715, "balance_loss_mlp": 1.01710665, "epoch": 0.2926711958155474, "flos": 21397122103680.0, "grad_norm": 2.1502253986746966, "language_loss": 0.77897894, "learning_rate": 3.319008834376543e-06, "loss": 0.8017621, "num_input_tokens_seen": 52160730, "step": 2434, "time_per_iteration": 3.568903923034668 }, { "auxiliary_loss_clip": 0.01269546, "auxiliary_loss_mlp": 0.01030801, "balance_loss_clip": 1.04984081, "balance_loss_mlp": 1.02230096, "epoch": 0.2927914387061865, "flos": 23185796688000.0, "grad_norm": 3.8761370728434033, "language_loss": 0.88481569, "learning_rate": 3.3184231808221654e-06, "loss": 0.90781927, "num_input_tokens_seen": 52175055, "step": 2435, "time_per_iteration": 2.6711018085479736 }, { "auxiliary_loss_clip": 0.01365458, "auxiliary_loss_mlp": 0.0103472, "balance_loss_clip": 1.06053388, "balance_loss_mlp": 1.02603555, "epoch": 0.29291168159682557, "flos": 22455553190400.0, "grad_norm": 2.15432491440137, "language_loss": 0.63195324, "learning_rate": 3.3178373272640394e-06, "loss": 0.65595508, "num_input_tokens_seen": 52194150, "step": 2436, "time_per_iteration": 2.7754809856414795 }, { "auxiliary_loss_clip": 0.01203674, "auxiliary_loss_mlp": 0.01034619, "balance_loss_clip": 1.05931497, "balance_loss_mlp": 1.02672172, "epoch": 0.2930319244874647, "flos": 21170632896000.0, "grad_norm": 4.017419385215984, "language_loss": 0.85272753, "learning_rate": 3.3172512737910387e-06, "loss": 0.87511051, "num_input_tokens_seen": 52211660, "step": 2437, "time_per_iteration": 2.626894474029541 }, { "auxiliary_loss_clip": 0.01255811, "auxiliary_loss_mlp": 0.01031308, "balance_loss_clip": 1.05628252, "balance_loss_mlp": 1.02359533, "epoch": 0.2931521673781038, "flos": 31357843931520.0, "grad_norm": 2.80152413885797, "language_loss": 0.88492543, "learning_rate": 3.3166650204920674e-06, "loss": 0.90779662, "num_input_tokens_seen": 52232830, "step": 2438, "time_per_iteration": 2.706667184829712 }, { "auxiliary_loss_clip": 0.01255804, "auxiliary_loss_mlp": 0.01033676, "balance_loss_clip": 1.05957317, "balance_loss_mlp": 1.02434242, "epoch": 0.29327241026874284, "flos": 24200990778240.0, "grad_norm": 1.825812275592468, "language_loss": 0.81544787, "learning_rate": 3.316078567456059e-06, "loss": 0.83834267, "num_input_tokens_seen": 52250670, "step": 2439, "time_per_iteration": 3.8635995388031006 }, { "auxiliary_loss_clip": 0.01366805, "auxiliary_loss_mlp": 0.01034128, "balance_loss_clip": 1.05289316, "balance_loss_mlp": 1.02569962, "epoch": 0.29339265315938196, "flos": 24242611662720.0, "grad_norm": 1.8296922965760747, "language_loss": 0.75683904, "learning_rate": 3.3154919147719786e-06, "loss": 0.78084838, "num_input_tokens_seen": 52271685, "step": 2440, "time_per_iteration": 2.792072057723999 }, { "auxiliary_loss_clip": 0.01254509, "auxiliary_loss_mlp": 0.01031767, "balance_loss_clip": 1.05673003, "balance_loss_mlp": 1.02305245, "epoch": 0.29351289605002107, "flos": 16946641134720.0, "grad_norm": 2.088935258625219, "language_loss": 0.86834347, "learning_rate": 3.31490506252882e-06, "loss": 0.8912062, "num_input_tokens_seen": 52291065, "step": 2441, "time_per_iteration": 2.6605446338653564 }, { "auxiliary_loss_clip": 0.0134605, "auxiliary_loss_mlp": 0.01026281, "balance_loss_clip": 1.04967177, "balance_loss_mlp": 1.0188272, "epoch": 0.2936331389406601, "flos": 19829082810240.0, "grad_norm": 1.8811715965764475, "language_loss": 0.84255892, "learning_rate": 3.31431801081561e-06, "loss": 0.86628222, "num_input_tokens_seen": 52310000, "step": 2442, "time_per_iteration": 2.7363483905792236 }, { "auxiliary_loss_clip": 0.01211598, "auxiliary_loss_mlp": 0.01000193, "balance_loss_clip": 1.03096986, "balance_loss_mlp": 0.9989236, "epoch": 0.29375338183129923, "flos": 71416844398080.0, "grad_norm": 0.9154966025159244, "language_loss": 0.678949, "learning_rate": 3.313730759721402e-06, "loss": 0.70106685, "num_input_tokens_seen": 52372930, "step": 2443, "time_per_iteration": 3.3419809341430664 }, { "auxiliary_loss_clip": 0.0129955, "auxiliary_loss_mlp": 0.01034667, "balance_loss_clip": 1.05783296, "balance_loss_mlp": 1.02667403, "epoch": 0.29387362472193834, "flos": 22054502862720.0, "grad_norm": 2.219466330196458, "language_loss": 0.8668437, "learning_rate": 3.313143309335282e-06, "loss": 0.89018583, "num_input_tokens_seen": 52391420, "step": 2444, "time_per_iteration": 2.7035715579986572 }, { "auxiliary_loss_clip": 0.01261146, "auxiliary_loss_mlp": 0.01029797, "balance_loss_clip": 1.05520105, "balance_loss_mlp": 1.02183414, "epoch": 0.2939938676125774, "flos": 22966418373120.0, "grad_norm": 3.765547075673411, "language_loss": 0.8502965, "learning_rate": 3.3125556597463665e-06, "loss": 0.8732059, "num_input_tokens_seen": 52410725, "step": 2445, "time_per_iteration": 2.7154808044433594 }, { "auxiliary_loss_clip": 0.01252574, "auxiliary_loss_mlp": 0.01031486, "balance_loss_clip": 1.05861521, "balance_loss_mlp": 1.02359748, "epoch": 0.2941141105032165, "flos": 31358705857920.0, "grad_norm": 3.8981235628687614, "language_loss": 0.65745568, "learning_rate": 3.311967811043801e-06, "loss": 0.6802963, "num_input_tokens_seen": 52432645, "step": 2446, "time_per_iteration": 2.760721445083618 }, { "auxiliary_loss_clip": 0.01249699, "auxiliary_loss_mlp": 0.0103217, "balance_loss_clip": 1.05751657, "balance_loss_mlp": 1.02467155, "epoch": 0.29423435339385556, "flos": 23222138273280.0, "grad_norm": 10.187299110923338, "language_loss": 0.81920373, "learning_rate": 3.3113797633167617e-06, "loss": 0.84202242, "num_input_tokens_seen": 52450940, "step": 2447, "time_per_iteration": 2.653059720993042 }, { "auxiliary_loss_clip": 0.01201703, "auxiliary_loss_mlp": 0.01028978, "balance_loss_clip": 1.05721068, "balance_loss_mlp": 1.02100253, "epoch": 0.2943545962844947, "flos": 26864054138880.0, "grad_norm": 3.486715775575015, "language_loss": 0.69353735, "learning_rate": 3.310791516654455e-06, "loss": 0.71584421, "num_input_tokens_seen": 52468000, "step": 2448, "time_per_iteration": 2.660759687423706 }, { "auxiliary_loss_clip": 0.01357948, "auxiliary_loss_mlp": 0.01030317, "balance_loss_clip": 1.05452061, "balance_loss_mlp": 1.02196598, "epoch": 0.2944748391751338, "flos": 20231677422720.0, "grad_norm": 2.714349753141441, "language_loss": 0.79711479, "learning_rate": 3.3102030711461177e-06, "loss": 0.82099742, "num_input_tokens_seen": 52487575, "step": 2449, "time_per_iteration": 2.709632158279419 }, { "auxiliary_loss_clip": 0.01353611, "auxiliary_loss_mlp": 0.01026865, "balance_loss_clip": 1.05403042, "balance_loss_mlp": 1.01890779, "epoch": 0.29459508206577284, "flos": 15960965045760.0, "grad_norm": 2.0562641476112575, "language_loss": 0.68445849, "learning_rate": 3.3096144268810156e-06, "loss": 0.70826328, "num_input_tokens_seen": 52506335, "step": 2450, "time_per_iteration": 2.7327301502227783 }, { "auxiliary_loss_clip": 0.01248858, "auxiliary_loss_mlp": 0.01028231, "balance_loss_clip": 1.05485666, "balance_loss_mlp": 1.0195049, "epoch": 0.29471532495641195, "flos": 20412882558720.0, "grad_norm": 1.9264981714129963, "language_loss": 0.72845453, "learning_rate": 3.3090255839484462e-06, "loss": 0.75122541, "num_input_tokens_seen": 52524330, "step": 2451, "time_per_iteration": 2.6395602226257324 }, { "auxiliary_loss_clip": 0.01301071, "auxiliary_loss_mlp": 0.01029952, "balance_loss_clip": 1.05174387, "balance_loss_mlp": 1.02175641, "epoch": 0.29483556784705106, "flos": 20376576887040.0, "grad_norm": 1.8184192361154947, "language_loss": 0.85588551, "learning_rate": 3.3084365424377366e-06, "loss": 0.87919575, "num_input_tokens_seen": 52543095, "step": 2452, "time_per_iteration": 2.705089569091797 }, { "auxiliary_loss_clip": 0.01340948, "auxiliary_loss_mlp": 0.01003451, "balance_loss_clip": 1.04611838, "balance_loss_mlp": 1.00206232, "epoch": 0.2949558107376901, "flos": 68555660595840.0, "grad_norm": 0.7220568289583451, "language_loss": 0.55939472, "learning_rate": 3.307847302438245e-06, "loss": 0.58283877, "num_input_tokens_seen": 52597075, "step": 2453, "time_per_iteration": 3.182859420776367 }, { "auxiliary_loss_clip": 0.01305241, "auxiliary_loss_mlp": 0.01034611, "balance_loss_clip": 1.04686463, "balance_loss_mlp": 1.02613497, "epoch": 0.2950760536283292, "flos": 16107085572480.0, "grad_norm": 2.3658732403103726, "language_loss": 0.78011262, "learning_rate": 3.3072578640393562e-06, "loss": 0.80351114, "num_input_tokens_seen": 52614410, "step": 2454, "time_per_iteration": 2.715099573135376 }, { "auxiliary_loss_clip": 0.01304298, "auxiliary_loss_mlp": 0.01036315, "balance_loss_clip": 1.05603266, "balance_loss_mlp": 1.02828074, "epoch": 0.29519629651896834, "flos": 20483626394880.0, "grad_norm": 1.804049434426277, "language_loss": 0.79673111, "learning_rate": 3.3066682273304886e-06, "loss": 0.8201372, "num_input_tokens_seen": 52632055, "step": 2455, "time_per_iteration": 3.8757946491241455 }, { "auxiliary_loss_clip": 0.01259958, "auxiliary_loss_mlp": 0.02575341, "balance_loss_clip": 1.0588727, "balance_loss_mlp": 1.00002587, "epoch": 0.2953165394096074, "flos": 18916484941440.0, "grad_norm": 3.386523748038248, "language_loss": 0.79164529, "learning_rate": 3.3060783924010904e-06, "loss": 0.82999825, "num_input_tokens_seen": 52649980, "step": 2456, "time_per_iteration": 3.559276819229126 }, { "auxiliary_loss_clip": 0.01349447, "auxiliary_loss_mlp": 0.01030962, "balance_loss_clip": 1.05418038, "balance_loss_mlp": 1.02219999, "epoch": 0.2954367823002465, "flos": 20624467622400.0, "grad_norm": 3.13126806580031, "language_loss": 0.85169208, "learning_rate": 3.3054883593406387e-06, "loss": 0.87549615, "num_input_tokens_seen": 52664730, "step": 2457, "time_per_iteration": 2.7350945472717285 }, { "auxiliary_loss_clip": 0.01303937, "auxiliary_loss_mlp": 0.01030029, "balance_loss_clip": 1.05377865, "balance_loss_mlp": 1.02146363, "epoch": 0.2955570251908856, "flos": 31175525473920.0, "grad_norm": 2.106240031928633, "language_loss": 0.65391326, "learning_rate": 3.3048981282386404e-06, "loss": 0.67725295, "num_input_tokens_seen": 52686040, "step": 2458, "time_per_iteration": 2.776604652404785 }, { "auxiliary_loss_clip": 0.01340723, "auxiliary_loss_mlp": 0.01034718, "balance_loss_clip": 1.0522213, "balance_loss_mlp": 1.02711272, "epoch": 0.29567726808152467, "flos": 21650328051840.0, "grad_norm": 2.4547814874609255, "language_loss": 0.82926625, "learning_rate": 3.304307699184634e-06, "loss": 0.85302067, "num_input_tokens_seen": 52704630, "step": 2459, "time_per_iteration": 2.680156707763672 }, { "auxiliary_loss_clip": 0.01215441, "auxiliary_loss_mlp": 0.01033895, "balance_loss_clip": 1.05735648, "balance_loss_mlp": 1.02642632, "epoch": 0.2957975109721638, "flos": 24243868638720.0, "grad_norm": 1.7290338264729899, "language_loss": 0.78723174, "learning_rate": 3.3037170722681866e-06, "loss": 0.80972505, "num_input_tokens_seen": 52725465, "step": 2460, "time_per_iteration": 3.629185676574707 }, { "auxiliary_loss_clip": 0.01343995, "auxiliary_loss_mlp": 0.01027899, "balance_loss_clip": 1.05328965, "balance_loss_mlp": 1.01958394, "epoch": 0.29591775386280283, "flos": 13479717352320.0, "grad_norm": 1.8536665677194815, "language_loss": 0.6813581, "learning_rate": 3.3031262475788956e-06, "loss": 0.70507711, "num_input_tokens_seen": 52742405, "step": 2461, "time_per_iteration": 2.6990041732788086 }, { "auxiliary_loss_clip": 0.01297691, "auxiliary_loss_mlp": 0.01035096, "balance_loss_clip": 1.05298018, "balance_loss_mlp": 1.02662587, "epoch": 0.29603799675344195, "flos": 17749783284480.0, "grad_norm": 2.0856619155517677, "language_loss": 0.73358285, "learning_rate": 3.3025352252063897e-06, "loss": 0.75691074, "num_input_tokens_seen": 52761100, "step": 2462, "time_per_iteration": 2.6421267986297607 }, { "auxiliary_loss_clip": 0.01256727, "auxiliary_loss_mlp": 0.01039984, "balance_loss_clip": 1.06235647, "balance_loss_mlp": 1.0314188, "epoch": 0.29615823964408106, "flos": 22783920347520.0, "grad_norm": 1.7974645321249672, "language_loss": 0.75161755, "learning_rate": 3.3019440052403252e-06, "loss": 0.77458465, "num_input_tokens_seen": 52780965, "step": 2463, "time_per_iteration": 2.6743111610412598 }, { "auxiliary_loss_clip": 0.01304447, "auxiliary_loss_mlp": 0.01034634, "balance_loss_clip": 1.05686355, "balance_loss_mlp": 1.02711225, "epoch": 0.2962784825347201, "flos": 23514199758720.0, "grad_norm": 2.4199596772604988, "language_loss": 0.71068013, "learning_rate": 3.30135258777039e-06, "loss": 0.73407096, "num_input_tokens_seen": 52800335, "step": 2464, "time_per_iteration": 2.6731889247894287 }, { "auxiliary_loss_clip": 0.01256014, "auxiliary_loss_mlp": 0.02571602, "balance_loss_clip": 1.05464983, "balance_loss_mlp": 0.99999762, "epoch": 0.2963987254253592, "flos": 16362769559040.0, "grad_norm": 2.083521418647369, "language_loss": 0.70998043, "learning_rate": 3.3007609728863024e-06, "loss": 0.74825656, "num_input_tokens_seen": 52818425, "step": 2465, "time_per_iteration": 3.527679920196533 }, { "auxiliary_loss_clip": 0.01438717, "auxiliary_loss_mlp": 0.01025238, "balance_loss_clip": 1.05315316, "balance_loss_mlp": 1.01722074, "epoch": 0.29651896831599833, "flos": 33472263980160.0, "grad_norm": 1.8865575624881366, "language_loss": 0.73082364, "learning_rate": 3.300169160677809e-06, "loss": 0.75546312, "num_input_tokens_seen": 52842340, "step": 2466, "time_per_iteration": 2.9511027336120605 }, { "auxiliary_loss_clip": 0.01362479, "auxiliary_loss_mlp": 0.01027533, "balance_loss_clip": 1.05762804, "balance_loss_mlp": 1.01855063, "epoch": 0.2966392112066374, "flos": 23805363404160.0, "grad_norm": 2.3211277102515133, "language_loss": 0.78067714, "learning_rate": 3.2995771512346878e-06, "loss": 0.80457723, "num_input_tokens_seen": 52860690, "step": 2467, "time_per_iteration": 2.862144708633423 }, { "auxiliary_loss_clip": 0.01208443, "auxiliary_loss_mlp": 0.02571869, "balance_loss_clip": 1.06017685, "balance_loss_mlp": 1.00010633, "epoch": 0.2967594540972765, "flos": 19938466702080.0, "grad_norm": 2.051564601187901, "language_loss": 0.7325623, "learning_rate": 3.298984944646746e-06, "loss": 0.77036548, "num_input_tokens_seen": 52879370, "step": 2468, "time_per_iteration": 2.654362201690674 }, { "auxiliary_loss_clip": 0.01261453, "auxiliary_loss_mlp": 0.0257048, "balance_loss_clip": 1.06211972, "balance_loss_mlp": 1.00002682, "epoch": 0.2968796969879156, "flos": 23732823888000.0, "grad_norm": 2.0329103936784425, "language_loss": 0.81709933, "learning_rate": 3.298392541003822e-06, "loss": 0.85541862, "num_input_tokens_seen": 52898775, "step": 2469, "time_per_iteration": 2.65533709526062 }, { "auxiliary_loss_clip": 0.01303707, "auxiliary_loss_mlp": 0.01032427, "balance_loss_clip": 1.05725658, "balance_loss_mlp": 1.02460694, "epoch": 0.29699993987855466, "flos": 22893699288960.0, "grad_norm": 1.9553620473417779, "language_loss": 0.89524221, "learning_rate": 3.2977999403957806e-06, "loss": 0.91860354, "num_input_tokens_seen": 52917535, "step": 2470, "time_per_iteration": 2.7032053470611572 }, { "auxiliary_loss_clip": 0.01208218, "auxiliary_loss_mlp": 0.01034554, "balance_loss_clip": 1.06241369, "balance_loss_mlp": 1.02577972, "epoch": 0.2971201827691938, "flos": 33832555349760.0, "grad_norm": 2.0950553785318635, "language_loss": 0.67345089, "learning_rate": 3.2972071429125207e-06, "loss": 0.69587862, "num_input_tokens_seen": 52938755, "step": 2471, "time_per_iteration": 2.6803512573242188 }, { "auxiliary_loss_clip": 0.01347581, "auxiliary_loss_mlp": 0.01028727, "balance_loss_clip": 1.05556238, "balance_loss_mlp": 1.02034688, "epoch": 0.2972404256598329, "flos": 22054359208320.0, "grad_norm": 4.897434976667871, "language_loss": 0.88538748, "learning_rate": 3.2966141486439682e-06, "loss": 0.9091506, "num_input_tokens_seen": 52957945, "step": 2472, "time_per_iteration": 2.7468693256378174 }, { "auxiliary_loss_clip": 0.01453438, "auxiliary_loss_mlp": 0.01039338, "balance_loss_clip": 1.04729176, "balance_loss_mlp": 1.02949166, "epoch": 0.29736066855047194, "flos": 31978595796480.0, "grad_norm": 2.2395417854149335, "language_loss": 0.64162189, "learning_rate": 3.29602095768008e-06, "loss": 0.66654968, "num_input_tokens_seen": 52978460, "step": 2473, "time_per_iteration": 2.8340892791748047 }, { "auxiliary_loss_clip": 0.01298893, "auxiliary_loss_mlp": 0.0102693, "balance_loss_clip": 1.05829072, "balance_loss_mlp": 1.01893675, "epoch": 0.29748091144111105, "flos": 33510401245440.0, "grad_norm": 2.116260541091447, "language_loss": 0.64084899, "learning_rate": 3.2954275701108437e-06, "loss": 0.66410726, "num_input_tokens_seen": 52999640, "step": 2474, "time_per_iteration": 2.7778310775756836 }, { "auxiliary_loss_clip": 0.01398495, "auxiliary_loss_mlp": 0.01031097, "balance_loss_clip": 1.05008888, "balance_loss_mlp": 1.02278805, "epoch": 0.29760115433175016, "flos": 41283373409280.0, "grad_norm": 5.502083954575838, "language_loss": 0.68843186, "learning_rate": 3.294833986026275e-06, "loss": 0.71272779, "num_input_tokens_seen": 53022880, "step": 2475, "time_per_iteration": 2.8946938514709473 }, { "auxiliary_loss_clip": 0.01347512, "auxiliary_loss_mlp": 0.01037675, "balance_loss_clip": 1.05435324, "balance_loss_mlp": 1.02934265, "epoch": 0.2977213972223892, "flos": 24493339572480.0, "grad_norm": 1.9460918721614684, "language_loss": 0.85320818, "learning_rate": 3.29424020551642e-06, "loss": 0.87706012, "num_input_tokens_seen": 53041515, "step": 2476, "time_per_iteration": 2.7560973167419434 }, { "auxiliary_loss_clip": 0.01210844, "auxiliary_loss_mlp": 0.0103831, "balance_loss_clip": 1.0616672, "balance_loss_mlp": 1.02939892, "epoch": 0.2978416401130283, "flos": 21285116519040.0, "grad_norm": 4.477142794746547, "language_loss": 0.72506094, "learning_rate": 3.2936462286713546e-06, "loss": 0.74755245, "num_input_tokens_seen": 53059865, "step": 2477, "time_per_iteration": 2.6023271083831787 }, { "auxiliary_loss_clip": 0.01256731, "auxiliary_loss_mlp": 0.0103181, "balance_loss_clip": 1.05957937, "balance_loss_mlp": 1.02342987, "epoch": 0.2979618830036674, "flos": 25772154554880.0, "grad_norm": 2.2341980462468207, "language_loss": 0.77776021, "learning_rate": 3.2930520555811846e-06, "loss": 0.80064559, "num_input_tokens_seen": 53079490, "step": 2478, "time_per_iteration": 2.628906488418579 }, { "auxiliary_loss_clip": 0.01488495, "auxiliary_loss_mlp": 0.0258021, "balance_loss_clip": 1.04853261, "balance_loss_mlp": 0.99995112, "epoch": 0.2980821258943065, "flos": 23476996247040.0, "grad_norm": 2.5242023200673245, "language_loss": 0.80195558, "learning_rate": 3.292457686336046e-06, "loss": 0.84264266, "num_input_tokens_seen": 53098810, "step": 2479, "time_per_iteration": 2.859530210494995 }, { "auxiliary_loss_clip": 0.01217471, "auxiliary_loss_mlp": 0.01000185, "balance_loss_clip": 1.03311467, "balance_loss_mlp": 0.99883759, "epoch": 0.2982023687849456, "flos": 69752314195200.0, "grad_norm": 0.881713457947621, "language_loss": 0.61207497, "learning_rate": 3.291863121026105e-06, "loss": 0.63425159, "num_input_tokens_seen": 53162590, "step": 2480, "time_per_iteration": 3.286374092102051 }, { "auxiliary_loss_clip": 0.0125904, "auxiliary_loss_mlp": 0.01029346, "balance_loss_clip": 1.06095779, "balance_loss_mlp": 1.02157688, "epoch": 0.29832261167558466, "flos": 29825930741760.0, "grad_norm": 2.268814445818535, "language_loss": 0.76849103, "learning_rate": 3.2912683597415547e-06, "loss": 0.79137492, "num_input_tokens_seen": 53186675, "step": 2481, "time_per_iteration": 3.9402389526367188 }, { "auxiliary_loss_clip": 0.01357073, "auxiliary_loss_mlp": 0.01032733, "balance_loss_clip": 1.05665433, "balance_loss_mlp": 1.02431059, "epoch": 0.29844285456622377, "flos": 33910158683520.0, "grad_norm": 2.1179375329462973, "language_loss": 0.78100395, "learning_rate": 3.2906734025726213e-06, "loss": 0.80490202, "num_input_tokens_seen": 53205940, "step": 2482, "time_per_iteration": 3.753671884536743 }, { "auxiliary_loss_clip": 0.01261961, "auxiliary_loss_mlp": 0.0103332, "balance_loss_clip": 1.06038713, "balance_loss_mlp": 1.02450407, "epoch": 0.2985630974568629, "flos": 23876933253120.0, "grad_norm": 4.4167740202633485, "language_loss": 0.87793529, "learning_rate": 3.290078249609559e-06, "loss": 0.90088809, "num_input_tokens_seen": 53225360, "step": 2483, "time_per_iteration": 2.6557047367095947 }, { "auxiliary_loss_clip": 0.01254998, "auxiliary_loss_mlp": 0.01034621, "balance_loss_clip": 1.06177342, "balance_loss_mlp": 1.02628851, "epoch": 0.29868334034750194, "flos": 21799106184960.0, "grad_norm": 2.2316522580127383, "language_loss": 0.87978947, "learning_rate": 3.2894829009426514e-06, "loss": 0.90268564, "num_input_tokens_seen": 53243195, "step": 2484, "time_per_iteration": 2.6276180744171143 }, { "auxiliary_loss_clip": 0.01254783, "auxiliary_loss_mlp": 0.010402, "balance_loss_clip": 1.06084168, "balance_loss_mlp": 1.0318675, "epoch": 0.29880358323814105, "flos": 25666649331840.0, "grad_norm": 2.8031807555751262, "language_loss": 0.7790221, "learning_rate": 3.288887356662213e-06, "loss": 0.80197191, "num_input_tokens_seen": 53264530, "step": 2485, "time_per_iteration": 2.6706464290618896 }, { "auxiliary_loss_clip": 0.01152771, "auxiliary_loss_mlp": 0.01001962, "balance_loss_clip": 1.03014588, "balance_loss_mlp": 1.00082958, "epoch": 0.29892382612878016, "flos": 71005846003200.0, "grad_norm": 0.7685493840554839, "language_loss": 0.5964849, "learning_rate": 3.288291616858588e-06, "loss": 0.61803222, "num_input_tokens_seen": 53319920, "step": 2486, "time_per_iteration": 4.017135143280029 }, { "auxiliary_loss_clip": 0.01396067, "auxiliary_loss_mlp": 0.01032833, "balance_loss_clip": 1.05553865, "balance_loss_mlp": 1.02408886, "epoch": 0.2990440690194192, "flos": 25481134563840.0, "grad_norm": 1.8919952901795025, "language_loss": 0.77090538, "learning_rate": 3.287695681622149e-06, "loss": 0.79519439, "num_input_tokens_seen": 53339270, "step": 2487, "time_per_iteration": 2.791639566421509 }, { "auxiliary_loss_clip": 0.01308975, "auxiliary_loss_mlp": 0.01031219, "balance_loss_clip": 1.05706346, "balance_loss_mlp": 1.02361941, "epoch": 0.2991643119100583, "flos": 23732357011200.0, "grad_norm": 2.4551575997281345, "language_loss": 0.81034553, "learning_rate": 3.2870995510432982e-06, "loss": 0.83374751, "num_input_tokens_seen": 53357750, "step": 2488, "time_per_iteration": 2.690185070037842 }, { "auxiliary_loss_clip": 0.01247138, "auxiliary_loss_mlp": 0.01028304, "balance_loss_clip": 1.05607939, "balance_loss_mlp": 1.020257, "epoch": 0.29928455480069743, "flos": 27417545786880.0, "grad_norm": 2.904165233449176, "language_loss": 0.77247298, "learning_rate": 3.2865032252124697e-06, "loss": 0.79522741, "num_input_tokens_seen": 53378265, "step": 2489, "time_per_iteration": 2.6864163875579834 }, { "auxiliary_loss_clip": 0.01303876, "auxiliary_loss_mlp": 0.01035246, "balance_loss_clip": 1.05391526, "balance_loss_mlp": 1.02705657, "epoch": 0.2994047976913365, "flos": 33692935184640.0, "grad_norm": 1.5118183032976868, "language_loss": 0.77688301, "learning_rate": 3.2859067042201243e-06, "loss": 0.80027425, "num_input_tokens_seen": 53400305, "step": 2490, "time_per_iteration": 2.797985553741455 }, { "auxiliary_loss_clip": 0.01435099, "auxiliary_loss_mlp": 0.01030913, "balance_loss_clip": 1.04864085, "balance_loss_mlp": 1.02300358, "epoch": 0.2995250405819756, "flos": 16763963541120.0, "grad_norm": 2.3209571078770823, "language_loss": 0.78406274, "learning_rate": 3.2853099881567544e-06, "loss": 0.80872285, "num_input_tokens_seen": 53418705, "step": 2491, "time_per_iteration": 3.682551860809326 }, { "auxiliary_loss_clip": 0.01199319, "auxiliary_loss_mlp": 0.01039006, "balance_loss_clip": 1.05780339, "balance_loss_mlp": 1.03145373, "epoch": 0.29964528347261465, "flos": 22963976248320.0, "grad_norm": 2.2236695296931135, "language_loss": 0.79516304, "learning_rate": 3.284713077112881e-06, "loss": 0.81754631, "num_input_tokens_seen": 53438135, "step": 2492, "time_per_iteration": 2.659133195877075 }, { "auxiliary_loss_clip": 0.01269293, "auxiliary_loss_mlp": 0.0103359, "balance_loss_clip": 1.05770266, "balance_loss_mlp": 1.02464616, "epoch": 0.29976552636325376, "flos": 16938021870720.0, "grad_norm": 2.8579652341768003, "language_loss": 0.86505914, "learning_rate": 3.284115971179056e-06, "loss": 0.88808799, "num_input_tokens_seen": 53452165, "step": 2493, "time_per_iteration": 2.689410924911499 }, { "auxiliary_loss_clip": 0.01369761, "auxiliary_loss_mlp": 0.01033308, "balance_loss_clip": 1.05650926, "balance_loss_mlp": 1.02487409, "epoch": 0.2998857692538929, "flos": 17056455989760.0, "grad_norm": 1.8087710971189521, "language_loss": 0.78676724, "learning_rate": 3.283518670445859e-06, "loss": 0.81079799, "num_input_tokens_seen": 53470075, "step": 2494, "time_per_iteration": 2.850482940673828 }, { "auxiliary_loss_clip": 0.01197069, "auxiliary_loss_mlp": 0.02521291, "balance_loss_clip": 1.02768946, "balance_loss_mlp": 0.99991953, "epoch": 0.30000601214453193, "flos": 68831528025600.0, "grad_norm": 0.6946226507398222, "language_loss": 0.54346359, "learning_rate": 3.2829211750038995e-06, "loss": 0.58064717, "num_input_tokens_seen": 53538705, "step": 2495, "time_per_iteration": 3.340308666229248 }, { "auxiliary_loss_clip": 0.01347263, "auxiliary_loss_mlp": 0.01035103, "balance_loss_clip": 1.05280006, "balance_loss_mlp": 1.02680624, "epoch": 0.30012625503517104, "flos": 17603267708160.0, "grad_norm": 1.911317884690068, "language_loss": 0.89259964, "learning_rate": 3.2823234849438183e-06, "loss": 0.91642332, "num_input_tokens_seen": 53556740, "step": 2496, "time_per_iteration": 2.6934638023376465 }, { "auxiliary_loss_clip": 0.01216341, "auxiliary_loss_mlp": 0.0103408, "balance_loss_clip": 1.05638075, "balance_loss_mlp": 1.02612281, "epoch": 0.30024649792581015, "flos": 21252581775360.0, "grad_norm": 3.4069635290799916, "language_loss": 0.75624901, "learning_rate": 3.2817256003562836e-06, "loss": 0.77875328, "num_input_tokens_seen": 53577115, "step": 2497, "time_per_iteration": 2.74157452583313 }, { "auxiliary_loss_clip": 0.01452031, "auxiliary_loss_mlp": 0.01030845, "balance_loss_clip": 1.05287492, "balance_loss_mlp": 1.0227741, "epoch": 0.3003667408164492, "flos": 23003262748800.0, "grad_norm": 1.9557652572541206, "language_loss": 0.66164654, "learning_rate": 3.281127521331995e-06, "loss": 0.68647534, "num_input_tokens_seen": 53598295, "step": 2498, "time_per_iteration": 2.881704330444336 }, { "auxiliary_loss_clip": 0.01103138, "auxiliary_loss_mlp": 0.00999436, "balance_loss_clip": 1.02936113, "balance_loss_mlp": 0.99827921, "epoch": 0.3004869837070883, "flos": 64232340750720.0, "grad_norm": 0.8881729053248144, "language_loss": 0.60671192, "learning_rate": 3.2805292479616798e-06, "loss": 0.62773764, "num_input_tokens_seen": 53657160, "step": 2499, "time_per_iteration": 3.351998805999756 }, { "auxiliary_loss_clip": 0.0130446, "auxiliary_loss_mlp": 0.01031314, "balance_loss_clip": 1.05677617, "balance_loss_mlp": 1.02336848, "epoch": 0.30060722659772743, "flos": 26248653400320.0, "grad_norm": 2.511704882822431, "language_loss": 0.91458797, "learning_rate": 3.2799307803360955e-06, "loss": 0.93794572, "num_input_tokens_seen": 53673090, "step": 2500, "time_per_iteration": 2.7682089805603027 }, { "auxiliary_loss_clip": 0.01200783, "auxiliary_loss_mlp": 0.01027148, "balance_loss_clip": 1.05844462, "balance_loss_mlp": 1.01936972, "epoch": 0.3007274694883665, "flos": 24970879912320.0, "grad_norm": 1.4755093028622863, "language_loss": 0.81418228, "learning_rate": 3.27933211854603e-06, "loss": 0.83646166, "num_input_tokens_seen": 53692145, "step": 2501, "time_per_iteration": 2.669107675552368 }, { "auxiliary_loss_clip": 0.01307636, "auxiliary_loss_mlp": 0.0102934, "balance_loss_clip": 1.0588845, "balance_loss_mlp": 1.0210433, "epoch": 0.3008477123790056, "flos": 17055845458560.0, "grad_norm": 1.6281712331530631, "language_loss": 0.87167209, "learning_rate": 3.278733262682299e-06, "loss": 0.89504188, "num_input_tokens_seen": 53710000, "step": 2502, "time_per_iteration": 2.7508397102355957 }, { "auxiliary_loss_clip": 0.01203281, "auxiliary_loss_mlp": 0.01030033, "balance_loss_clip": 1.05846834, "balance_loss_mlp": 1.02175426, "epoch": 0.3009679552696447, "flos": 21506398254720.0, "grad_norm": 2.2394848529085176, "language_loss": 0.82618964, "learning_rate": 3.2781342128357484e-06, "loss": 0.84852278, "num_input_tokens_seen": 53729355, "step": 2503, "time_per_iteration": 2.667433023452759 }, { "auxiliary_loss_clip": 0.01352447, "auxiliary_loss_mlp": 0.01029606, "balance_loss_clip": 1.05354846, "balance_loss_mlp": 1.02241778, "epoch": 0.30108819816028376, "flos": 21134004001920.0, "grad_norm": 3.772289450812382, "language_loss": 0.8050971, "learning_rate": 3.2775349690972547e-06, "loss": 0.82891762, "num_input_tokens_seen": 53743505, "step": 2504, "time_per_iteration": 2.788083076477051 }, { "auxiliary_loss_clip": 0.01146112, "auxiliary_loss_mlp": 0.00997916, "balance_loss_clip": 1.02470422, "balance_loss_mlp": 0.99671239, "epoch": 0.30120844105092287, "flos": 71126434938240.0, "grad_norm": 0.7649986665243198, "language_loss": 0.5181092, "learning_rate": 3.276935531557722e-06, "loss": 0.53954947, "num_input_tokens_seen": 53808725, "step": 2505, "time_per_iteration": 3.3167076110839844 }, { "auxiliary_loss_clip": 0.01406507, "auxiliary_loss_mlp": 0.01028151, "balance_loss_clip": 1.05279267, "balance_loss_mlp": 1.0191927, "epoch": 0.301328683941562, "flos": 20264571302400.0, "grad_norm": 2.3968177886661217, "language_loss": 0.80119503, "learning_rate": 3.2763359003080837e-06, "loss": 0.82554162, "num_input_tokens_seen": 53825680, "step": 2506, "time_per_iteration": 2.738403081893921 }, { "auxiliary_loss_clip": 0.01207133, "auxiliary_loss_mlp": 0.01001026, "balance_loss_clip": 1.02871192, "balance_loss_mlp": 0.99989951, "epoch": 0.30144892683220104, "flos": 70648212240000.0, "grad_norm": 0.8511917690897763, "language_loss": 0.62428123, "learning_rate": 3.2757360754393047e-06, "loss": 0.64636278, "num_input_tokens_seen": 53889750, "step": 2507, "time_per_iteration": 4.1711344718933105 }, { "auxiliary_loss_clip": 0.01250951, "auxiliary_loss_mlp": 0.01026075, "balance_loss_clip": 1.05664456, "balance_loss_mlp": 1.01781368, "epoch": 0.30156916972284015, "flos": 22820549241600.0, "grad_norm": 2.666689012782986, "language_loss": 0.64373064, "learning_rate": 3.2751360570423767e-06, "loss": 0.66650093, "num_input_tokens_seen": 53908135, "step": 2508, "time_per_iteration": 3.6207292079925537 }, { "auxiliary_loss_clip": 0.01301315, "auxiliary_loss_mlp": 0.01029921, "balance_loss_clip": 1.05598044, "balance_loss_mlp": 1.0214541, "epoch": 0.3016894126134792, "flos": 29899188529920.0, "grad_norm": 2.206503614236403, "language_loss": 0.75951266, "learning_rate": 3.2745358452083236e-06, "loss": 0.78282499, "num_input_tokens_seen": 53931035, "step": 2509, "time_per_iteration": 2.7514190673828125 }, { "auxiliary_loss_clip": 0.01251166, "auxiliary_loss_mlp": 0.01036707, "balance_loss_clip": 1.05731988, "balance_loss_mlp": 1.02875543, "epoch": 0.3018096555041183, "flos": 21546331200000.0, "grad_norm": 9.707340904701688, "language_loss": 0.82509643, "learning_rate": 3.2739354400281955e-06, "loss": 0.84797513, "num_input_tokens_seen": 53952255, "step": 2510, "time_per_iteration": 2.6871273517608643 }, { "auxiliary_loss_clip": 0.01260205, "auxiliary_loss_mlp": 0.02518282, "balance_loss_clip": 1.02779889, "balance_loss_mlp": 0.99988461, "epoch": 0.3019298983947574, "flos": 59136294597120.0, "grad_norm": 0.8617400421587595, "language_loss": 0.63636732, "learning_rate": 3.2733348415930744e-06, "loss": 0.6741522, "num_input_tokens_seen": 54014125, "step": 2511, "time_per_iteration": 3.2972218990325928 }, { "auxiliary_loss_clip": 0.01349903, "auxiliary_loss_mlp": 0.01031814, "balance_loss_clip": 1.05601168, "balance_loss_mlp": 1.0235287, "epoch": 0.3020501412853965, "flos": 34423070941440.0, "grad_norm": 2.0978051565477887, "language_loss": 0.80559886, "learning_rate": 3.27273404999407e-06, "loss": 0.82941604, "num_input_tokens_seen": 54036345, "step": 2512, "time_per_iteration": 3.6827056407928467 }, { "auxiliary_loss_clip": 0.01212008, "auxiliary_loss_mlp": 0.01003841, "balance_loss_clip": 1.02590609, "balance_loss_mlp": 1.0027802, "epoch": 0.3021703841760356, "flos": 71008288128000.0, "grad_norm": 0.8059738882529697, "language_loss": 0.60482377, "learning_rate": 3.272133065322322e-06, "loss": 0.62698221, "num_input_tokens_seen": 54094615, "step": 2513, "time_per_iteration": 3.2287724018096924 }, { "auxiliary_loss_clip": 0.01196908, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 1.05489612, "balance_loss_mlp": 1.01777554, "epoch": 0.3022906270666747, "flos": 21510528318720.0, "grad_norm": 1.9067913819456626, "language_loss": 0.79661226, "learning_rate": 3.271531887669e-06, "loss": 0.81883931, "num_input_tokens_seen": 54114675, "step": 2514, "time_per_iteration": 2.6113736629486084 }, { "auxiliary_loss_clip": 0.01402489, "auxiliary_loss_mlp": 0.01028027, "balance_loss_clip": 1.05030942, "balance_loss_mlp": 1.01959896, "epoch": 0.30241086995731375, "flos": 31132001168640.0, "grad_norm": 2.4120813343208782, "language_loss": 0.63635933, "learning_rate": 3.2709305171253015e-06, "loss": 0.66066444, "num_input_tokens_seen": 54134795, "step": 2515, "time_per_iteration": 2.816473960876465 }, { "auxiliary_loss_clip": 0.01253352, "auxiliary_loss_mlp": 0.01030459, "balance_loss_clip": 1.0596801, "balance_loss_mlp": 1.02235842, "epoch": 0.30253111284795287, "flos": 23511542152320.0, "grad_norm": 1.9291426341541447, "language_loss": 0.78230619, "learning_rate": 3.2703289537824536e-06, "loss": 0.80514431, "num_input_tokens_seen": 54154595, "step": 2516, "time_per_iteration": 2.659670829772949 }, { "auxiliary_loss_clip": 0.01405767, "auxiliary_loss_mlp": 0.01028469, "balance_loss_clip": 1.05189574, "balance_loss_mlp": 1.01975524, "epoch": 0.302651355738592, "flos": 18725367651840.0, "grad_norm": 2.581097721729028, "language_loss": 0.78297418, "learning_rate": 3.269727197731714e-06, "loss": 0.80731654, "num_input_tokens_seen": 54167360, "step": 2517, "time_per_iteration": 3.618957757949829 }, { "auxiliary_loss_clip": 0.01388501, "auxiliary_loss_mlp": 0.01025371, "balance_loss_clip": 1.05275977, "balance_loss_mlp": 1.01732492, "epoch": 0.30277159862923103, "flos": 22418888382720.0, "grad_norm": 1.7379422836747218, "language_loss": 0.78293955, "learning_rate": 3.269125249064367e-06, "loss": 0.80707824, "num_input_tokens_seen": 54187055, "step": 2518, "time_per_iteration": 2.740097999572754 }, { "auxiliary_loss_clip": 0.01204889, "auxiliary_loss_mlp": 0.01032093, "balance_loss_clip": 1.05744731, "balance_loss_mlp": 1.02411234, "epoch": 0.30289184151987014, "flos": 22273126992000.0, "grad_norm": 2.5117126580021503, "language_loss": 0.83436584, "learning_rate": 3.2685231078717297e-06, "loss": 0.85673571, "num_input_tokens_seen": 54207245, "step": 2519, "time_per_iteration": 2.6215996742248535 }, { "auxiliary_loss_clip": 0.01345159, "auxiliary_loss_mlp": 0.02570921, "balance_loss_clip": 1.05313766, "balance_loss_mlp": 0.9999668, "epoch": 0.30301208441050925, "flos": 25225594231680.0, "grad_norm": 2.065102545242766, "language_loss": 0.75668621, "learning_rate": 3.267920774245145e-06, "loss": 0.795847, "num_input_tokens_seen": 54226650, "step": 2520, "time_per_iteration": 2.760965585708618 }, { "auxiliary_loss_clip": 0.0125347, "auxiliary_loss_mlp": 0.01035081, "balance_loss_clip": 1.06022155, "balance_loss_mlp": 1.02682567, "epoch": 0.3031323273011483, "flos": 23039245198080.0, "grad_norm": 2.0336153876492133, "language_loss": 0.8483541, "learning_rate": 3.2673182482759876e-06, "loss": 0.87123954, "num_input_tokens_seen": 54245765, "step": 2521, "time_per_iteration": 2.7909414768218994 }, { "auxiliary_loss_clip": 0.01250476, "auxiliary_loss_mlp": 0.01029358, "balance_loss_clip": 1.05764914, "balance_loss_mlp": 1.0215435, "epoch": 0.3032525701917874, "flos": 18876695650560.0, "grad_norm": 2.1333381636288866, "language_loss": 0.65975022, "learning_rate": 3.266715530055659e-06, "loss": 0.68254858, "num_input_tokens_seen": 54263915, "step": 2522, "time_per_iteration": 2.645923137664795 }, { "auxiliary_loss_clip": 0.01246726, "auxiliary_loss_mlp": 0.01029353, "balance_loss_clip": 1.05373263, "balance_loss_mlp": 1.020895, "epoch": 0.30337281308242653, "flos": 17782641250560.0, "grad_norm": 2.1150068665353086, "language_loss": 0.80418289, "learning_rate": 3.2661126196755927e-06, "loss": 0.82694364, "num_input_tokens_seen": 54283025, "step": 2523, "time_per_iteration": 2.6276869773864746 }, { "auxiliary_loss_clip": 0.01100797, "auxiliary_loss_mlp": 0.0100363, "balance_loss_clip": 1.02777541, "balance_loss_mlp": 1.00255764, "epoch": 0.3034930559730656, "flos": 57824298426240.0, "grad_norm": 0.8447652536836038, "language_loss": 0.56015849, "learning_rate": 3.265509517227248e-06, "loss": 0.58120275, "num_input_tokens_seen": 54339840, "step": 2524, "time_per_iteration": 3.137282609939575 }, { "auxiliary_loss_clip": 0.01296308, "auxiliary_loss_mlp": 0.01027188, "balance_loss_clip": 1.0512681, "balance_loss_mlp": 1.01977885, "epoch": 0.3036132988637047, "flos": 14755587419520.0, "grad_norm": 3.0133564204999392, "language_loss": 0.80889857, "learning_rate": 3.264906222802115e-06, "loss": 0.83213353, "num_input_tokens_seen": 54357690, "step": 2525, "time_per_iteration": 2.704467535018921 }, { "auxiliary_loss_clip": 0.01202548, "auxiliary_loss_mlp": 0.01029548, "balance_loss_clip": 1.05699027, "balance_loss_mlp": 1.02195466, "epoch": 0.30373354175434375, "flos": 21033203460480.0, "grad_norm": 3.8277236661528726, "language_loss": 0.78340721, "learning_rate": 3.264302736491715e-06, "loss": 0.80572814, "num_input_tokens_seen": 54377810, "step": 2526, "time_per_iteration": 2.6257448196411133 }, { "auxiliary_loss_clip": 0.01256009, "auxiliary_loss_mlp": 0.01034762, "balance_loss_clip": 1.06261122, "balance_loss_mlp": 1.02698958, "epoch": 0.30385378464498286, "flos": 21143233797120.0, "grad_norm": 3.672758508548966, "language_loss": 0.87621808, "learning_rate": 3.263699058387594e-06, "loss": 0.89912575, "num_input_tokens_seen": 54395245, "step": 2527, "time_per_iteration": 2.624135732650757 }, { "auxiliary_loss_clip": 0.01348188, "auxiliary_loss_mlp": 0.01026366, "balance_loss_clip": 1.0503186, "balance_loss_mlp": 1.01864171, "epoch": 0.30397402753562197, "flos": 20629244131200.0, "grad_norm": 2.602370140889034, "language_loss": 0.90630329, "learning_rate": 3.2630951885813315e-06, "loss": 0.93004882, "num_input_tokens_seen": 54412640, "step": 2528, "time_per_iteration": 2.7357165813446045 }, { "auxiliary_loss_clip": 0.01301159, "auxiliary_loss_mlp": 0.01023443, "balance_loss_clip": 1.05262542, "balance_loss_mlp": 1.01564097, "epoch": 0.304094270426261, "flos": 15085678429440.0, "grad_norm": 1.9663557165831282, "language_loss": 0.78358603, "learning_rate": 3.262491127164533e-06, "loss": 0.80683208, "num_input_tokens_seen": 54431455, "step": 2529, "time_per_iteration": 2.65051007270813 }, { "auxiliary_loss_clip": 0.01216331, "auxiliary_loss_mlp": 0.02570241, "balance_loss_clip": 1.05569923, "balance_loss_mlp": 1.0000906, "epoch": 0.30421451331690014, "flos": 13845216193920.0, "grad_norm": 2.2238041062136458, "language_loss": 0.80256325, "learning_rate": 3.2618868742288337e-06, "loss": 0.84042907, "num_input_tokens_seen": 54448380, "step": 2530, "time_per_iteration": 2.6889004707336426 }, { "auxiliary_loss_clip": 0.01250853, "auxiliary_loss_mlp": 0.01027193, "balance_loss_clip": 1.05871177, "balance_loss_mlp": 1.01942682, "epoch": 0.30433475620753925, "flos": 17384212615680.0, "grad_norm": 2.0666093877969867, "language_loss": 0.7263577, "learning_rate": 3.261282429865899e-06, "loss": 0.74913824, "num_input_tokens_seen": 54466385, "step": 2531, "time_per_iteration": 2.6811068058013916 }, { "auxiliary_loss_clip": 0.01308394, "auxiliary_loss_mlp": 0.02563249, "balance_loss_clip": 1.05719841, "balance_loss_mlp": 1.00000238, "epoch": 0.3044549990981783, "flos": 18916951818240.0, "grad_norm": 1.7200598045907363, "language_loss": 0.72689867, "learning_rate": 3.2606777941674225e-06, "loss": 0.76561511, "num_input_tokens_seen": 54485040, "step": 2532, "time_per_iteration": 2.67688250541687 }, { "auxiliary_loss_clip": 0.01385305, "auxiliary_loss_mlp": 0.01023247, "balance_loss_clip": 1.0494833, "balance_loss_mlp": 1.01508701, "epoch": 0.3045752419888174, "flos": 21068431724160.0, "grad_norm": 2.484047027518094, "language_loss": 0.84481812, "learning_rate": 3.2600729672251276e-06, "loss": 0.8689037, "num_input_tokens_seen": 54502755, "step": 2533, "time_per_iteration": 3.6778342723846436 }, { "auxiliary_loss_clip": 0.01202035, "auxiliary_loss_mlp": 0.02566142, "balance_loss_clip": 1.05827415, "balance_loss_mlp": 0.9999752, "epoch": 0.3046954848794565, "flos": 29096405516160.0, "grad_norm": 2.152050552038604, "language_loss": 0.65264976, "learning_rate": 3.259467949130765e-06, "loss": 0.69033158, "num_input_tokens_seen": 54524165, "step": 2534, "time_per_iteration": 3.5239288806915283 }, { "auxiliary_loss_clip": 0.01302494, "auxiliary_loss_mlp": 0.01030493, "balance_loss_clip": 1.05788243, "balance_loss_mlp": 1.02205324, "epoch": 0.3048157277700956, "flos": 20295346279680.0, "grad_norm": 16.648093819325613, "language_loss": 0.82615572, "learning_rate": 3.2588627399761164e-06, "loss": 0.84948552, "num_input_tokens_seen": 54540160, "step": 2535, "time_per_iteration": 2.676400899887085 }, { "auxiliary_loss_clip": 0.01299615, "auxiliary_loss_mlp": 0.01029067, "balance_loss_clip": 1.05609655, "balance_loss_mlp": 1.02147353, "epoch": 0.3049359706607347, "flos": 22739929165440.0, "grad_norm": 1.8250331740192618, "language_loss": 0.70759428, "learning_rate": 3.2582573398529903e-06, "loss": 0.73088109, "num_input_tokens_seen": 54557515, "step": 2536, "time_per_iteration": 2.6638200283050537 }, { "auxiliary_loss_clip": 0.01358672, "auxiliary_loss_mlp": 0.01026531, "balance_loss_clip": 1.05446672, "balance_loss_mlp": 1.01828456, "epoch": 0.3050562135513738, "flos": 18434634969600.0, "grad_norm": 2.041445112567677, "language_loss": 0.73794508, "learning_rate": 3.2576517488532265e-06, "loss": 0.76179707, "num_input_tokens_seen": 54573865, "step": 2537, "time_per_iteration": 2.767411708831787 }, { "auxiliary_loss_clip": 0.01250117, "auxiliary_loss_mlp": 0.01024435, "balance_loss_clip": 1.0551393, "balance_loss_mlp": 1.01688349, "epoch": 0.30517645644201286, "flos": 20370327920640.0, "grad_norm": 1.752339072959316, "language_loss": 0.8767013, "learning_rate": 3.257045967068692e-06, "loss": 0.89944679, "num_input_tokens_seen": 54593120, "step": 2538, "time_per_iteration": 2.638176918029785 }, { "auxiliary_loss_clip": 0.01203395, "auxiliary_loss_mlp": 0.01032292, "balance_loss_clip": 1.05896235, "balance_loss_mlp": 1.02376306, "epoch": 0.30529669933265197, "flos": 21945118970880.0, "grad_norm": 1.7513904494084631, "language_loss": 0.81915009, "learning_rate": 3.2564399945912848e-06, "loss": 0.84150696, "num_input_tokens_seen": 54612910, "step": 2539, "time_per_iteration": 3.564380645751953 }, { "auxiliary_loss_clip": 0.01316573, "auxiliary_loss_mlp": 0.01028897, "balance_loss_clip": 1.05197012, "balance_loss_mlp": 1.02145243, "epoch": 0.305416942223291, "flos": 21835411856640.0, "grad_norm": 2.4562166509168666, "language_loss": 0.81860638, "learning_rate": 3.2558338315129287e-06, "loss": 0.84206104, "num_input_tokens_seen": 54631055, "step": 2540, "time_per_iteration": 2.7506237030029297 }, { "auxiliary_loss_clip": 0.01250637, "auxiliary_loss_mlp": 0.01034086, "balance_loss_clip": 1.05724478, "balance_loss_mlp": 1.02629602, "epoch": 0.30553718511393013, "flos": 33911810709120.0, "grad_norm": 4.197721539307683, "language_loss": 0.76268744, "learning_rate": 3.2552274779255785e-06, "loss": 0.78553474, "num_input_tokens_seen": 54651985, "step": 2541, "time_per_iteration": 2.712716817855835 }, { "auxiliary_loss_clip": 0.01248028, "auxiliary_loss_mlp": 0.01026255, "balance_loss_clip": 1.05657816, "balance_loss_mlp": 1.01810062, "epoch": 0.30565742800456924, "flos": 22268530051200.0, "grad_norm": 3.2018299524224565, "language_loss": 0.77009475, "learning_rate": 3.2546209339212184e-06, "loss": 0.79283762, "num_input_tokens_seen": 54671005, "step": 2542, "time_per_iteration": 2.600860834121704 }, { "auxiliary_loss_clip": 0.01301301, "auxiliary_loss_mlp": 0.01030602, "balance_loss_clip": 1.05394197, "balance_loss_mlp": 1.02267504, "epoch": 0.3057776708952083, "flos": 22565044823040.0, "grad_norm": 1.4375091055414382, "language_loss": 0.77807581, "learning_rate": 3.25401419959186e-06, "loss": 0.80139488, "num_input_tokens_seen": 54691615, "step": 2543, "time_per_iteration": 3.639772653579712 }, { "auxiliary_loss_clip": 0.01314342, "auxiliary_loss_mlp": 0.01029523, "balance_loss_clip": 1.06044197, "balance_loss_mlp": 1.02172685, "epoch": 0.3058979137858474, "flos": 21799213925760.0, "grad_norm": 1.789122574919612, "language_loss": 0.76102608, "learning_rate": 3.253407275029545e-06, "loss": 0.78446478, "num_input_tokens_seen": 54710520, "step": 2544, "time_per_iteration": 2.664422035217285 }, { "auxiliary_loss_clip": 0.01351116, "auxiliary_loss_mlp": 0.01029434, "balance_loss_clip": 1.05606711, "balance_loss_mlp": 1.02111363, "epoch": 0.3060181566764865, "flos": 26979435601920.0, "grad_norm": 1.8486770256519318, "language_loss": 0.80281311, "learning_rate": 3.2528001603263425e-06, "loss": 0.82661855, "num_input_tokens_seen": 54732590, "step": 2545, "time_per_iteration": 2.7870750427246094 }, { "auxiliary_loss_clip": 0.01259666, "auxiliary_loss_mlp": 0.01034686, "balance_loss_clip": 1.06380367, "balance_loss_mlp": 1.02705646, "epoch": 0.3061383995671256, "flos": 19865101173120.0, "grad_norm": 2.1314360701016057, "language_loss": 0.81658125, "learning_rate": 3.2521928555743514e-06, "loss": 0.83952475, "num_input_tokens_seen": 54749935, "step": 2546, "time_per_iteration": 2.590965747833252 }, { "auxiliary_loss_clip": 0.01295212, "auxiliary_loss_mlp": 0.02567539, "balance_loss_clip": 1.05312061, "balance_loss_mlp": 1.000072, "epoch": 0.3062586424577647, "flos": 22127509255680.0, "grad_norm": 2.0753982220814646, "language_loss": 0.67206252, "learning_rate": 3.2515853608657e-06, "loss": 0.71069008, "num_input_tokens_seen": 54767935, "step": 2547, "time_per_iteration": 2.8088858127593994 }, { "auxiliary_loss_clip": 0.01252896, "auxiliary_loss_mlp": 0.01032458, "balance_loss_clip": 1.05722165, "balance_loss_mlp": 1.02410746, "epoch": 0.3063788853484038, "flos": 20845497962880.0, "grad_norm": 3.703993399830302, "language_loss": 0.75077653, "learning_rate": 3.250977676292545e-06, "loss": 0.77363002, "num_input_tokens_seen": 54786175, "step": 2548, "time_per_iteration": 2.616637706756592 }, { "auxiliary_loss_clip": 0.01301348, "auxiliary_loss_mlp": 0.01031788, "balance_loss_clip": 1.05409527, "balance_loss_mlp": 1.02348495, "epoch": 0.30649912823904285, "flos": 16209717707520.0, "grad_norm": 2.675162577092554, "language_loss": 0.79185051, "learning_rate": 3.2503698019470712e-06, "loss": 0.81518185, "num_input_tokens_seen": 54801945, "step": 2549, "time_per_iteration": 2.638261318206787 }, { "auxiliary_loss_clip": 0.01251484, "auxiliary_loss_mlp": 0.01027868, "balance_loss_clip": 1.05490565, "balance_loss_mlp": 1.01926076, "epoch": 0.30661937112968196, "flos": 18617815353600.0, "grad_norm": 2.34030730150707, "language_loss": 0.77909076, "learning_rate": 3.249761737921492e-06, "loss": 0.80188429, "num_input_tokens_seen": 54818475, "step": 2550, "time_per_iteration": 2.5986745357513428 }, { "auxiliary_loss_clip": 0.01294499, "auxiliary_loss_mlp": 0.01031192, "balance_loss_clip": 1.05551744, "balance_loss_mlp": 1.02324939, "epoch": 0.30673961402032107, "flos": 31390809638400.0, "grad_norm": 2.0790126382708047, "language_loss": 0.74316883, "learning_rate": 3.249153484308051e-06, "loss": 0.76642579, "num_input_tokens_seen": 54837090, "step": 2551, "time_per_iteration": 2.6573047637939453 }, { "auxiliary_loss_clip": 0.01390645, "auxiliary_loss_mlp": 0.01034372, "balance_loss_clip": 1.05044532, "balance_loss_mlp": 1.02584219, "epoch": 0.3068598569109601, "flos": 20229809915520.0, "grad_norm": 9.392472395951206, "language_loss": 0.78135347, "learning_rate": 3.2485450411990194e-06, "loss": 0.80560368, "num_input_tokens_seen": 54856445, "step": 2552, "time_per_iteration": 2.6613078117370605 }, { "auxiliary_loss_clip": 0.01202853, "auxiliary_loss_mlp": 0.01032689, "balance_loss_clip": 1.05706644, "balance_loss_mlp": 1.02401066, "epoch": 0.30698009980159924, "flos": 29601991399680.0, "grad_norm": 1.8091694738391721, "language_loss": 0.82667589, "learning_rate": 3.2479364086866983e-06, "loss": 0.84903133, "num_input_tokens_seen": 54876700, "step": 2553, "time_per_iteration": 2.672306776046753 }, { "auxiliary_loss_clip": 0.01308885, "auxiliary_loss_mlp": 0.02571101, "balance_loss_clip": 1.06052113, "balance_loss_mlp": 1.00004208, "epoch": 0.30710034269223835, "flos": 23842423261440.0, "grad_norm": 1.743075755029424, "language_loss": 0.81106478, "learning_rate": 3.247327586863416e-06, "loss": 0.84986466, "num_input_tokens_seen": 54897580, "step": 2554, "time_per_iteration": 2.717473268508911 }, { "auxiliary_loss_clip": 0.01355851, "auxiliary_loss_mlp": 0.01024643, "balance_loss_clip": 1.05311322, "balance_loss_mlp": 1.01635754, "epoch": 0.3072205855828774, "flos": 25884986152320.0, "grad_norm": 2.633923560465508, "language_loss": 0.7726649, "learning_rate": 3.2467185758215304e-06, "loss": 0.79646981, "num_input_tokens_seen": 54917320, "step": 2555, "time_per_iteration": 2.832066297531128 }, { "auxiliary_loss_clip": 0.01270114, "auxiliary_loss_mlp": 0.02568833, "balance_loss_clip": 1.05841279, "balance_loss_mlp": 1.00013447, "epoch": 0.3073408284735165, "flos": 22236390357120.0, "grad_norm": 9.682977915624967, "language_loss": 0.85518312, "learning_rate": 3.246109375653428e-06, "loss": 0.89357257, "num_input_tokens_seen": 54934085, "step": 2556, "time_per_iteration": 2.7564468383789062 }, { "auxiliary_loss_clip": 0.01201086, "auxiliary_loss_mlp": 0.01028759, "balance_loss_clip": 1.05741608, "balance_loss_mlp": 1.02064717, "epoch": 0.30746107136415557, "flos": 19500284689920.0, "grad_norm": 1.7988040101209102, "language_loss": 0.78632057, "learning_rate": 3.2454999864515243e-06, "loss": 0.80861896, "num_input_tokens_seen": 54953460, "step": 2557, "time_per_iteration": 2.569892406463623 }, { "auxiliary_loss_clip": 0.01295941, "auxiliary_loss_mlp": 0.02569396, "balance_loss_clip": 1.05587137, "balance_loss_mlp": 1.00008631, "epoch": 0.3075813142547947, "flos": 21724806902400.0, "grad_norm": 2.4858783993755433, "language_loss": 0.69388038, "learning_rate": 3.244890408308263e-06, "loss": 0.73253375, "num_input_tokens_seen": 54974165, "step": 2558, "time_per_iteration": 2.7188351154327393 }, { "auxiliary_loss_clip": 0.01396855, "auxiliary_loss_mlp": 0.01031517, "balance_loss_clip": 1.05011022, "balance_loss_mlp": 1.02358925, "epoch": 0.3077015571454338, "flos": 24097963593600.0, "grad_norm": 2.917766641519629, "language_loss": 0.61270016, "learning_rate": 3.2442806413161165e-06, "loss": 0.63698387, "num_input_tokens_seen": 54993810, "step": 2559, "time_per_iteration": 3.6266186237335205 }, { "auxiliary_loss_clip": 0.01315991, "auxiliary_loss_mlp": 0.01031084, "balance_loss_clip": 1.05416286, "balance_loss_mlp": 1.02263236, "epoch": 0.30782180003607285, "flos": 18405476104320.0, "grad_norm": 2.1564983286728068, "language_loss": 0.75827092, "learning_rate": 3.243670685567586e-06, "loss": 0.78174162, "num_input_tokens_seen": 55011210, "step": 2560, "time_per_iteration": 3.6124324798583984 }, { "auxiliary_loss_clip": 0.0129947, "auxiliary_loss_mlp": 0.02568029, "balance_loss_clip": 1.05536747, "balance_loss_mlp": 1.00007463, "epoch": 0.30794204292671196, "flos": 23878549365120.0, "grad_norm": 2.4450806792421003, "language_loss": 0.80385941, "learning_rate": 3.2430605411552012e-06, "loss": 0.84253442, "num_input_tokens_seen": 55031325, "step": 2561, "time_per_iteration": 2.7021894454956055 }, { "auxiliary_loss_clip": 0.01270938, "auxiliary_loss_mlp": 0.01001302, "balance_loss_clip": 1.03291464, "balance_loss_mlp": 1.00005674, "epoch": 0.30806228581735107, "flos": 67927800816000.0, "grad_norm": 0.8965452254521274, "language_loss": 0.705531, "learning_rate": 3.2424502081715205e-06, "loss": 0.72825348, "num_input_tokens_seen": 55094440, "step": 2562, "time_per_iteration": 3.329286813735962 }, { "auxiliary_loss_clip": 0.01214356, "auxiliary_loss_mlp": 0.01033463, "balance_loss_clip": 1.05652273, "balance_loss_mlp": 1.02510071, "epoch": 0.3081825287079901, "flos": 23843213360640.0, "grad_norm": 1.983980680246325, "language_loss": 0.78371704, "learning_rate": 3.241839686709132e-06, "loss": 0.80619526, "num_input_tokens_seen": 55115375, "step": 2563, "time_per_iteration": 2.752105712890625 }, { "auxiliary_loss_clip": 0.01252223, "auxiliary_loss_mlp": 0.01026791, "balance_loss_clip": 1.05455589, "balance_loss_mlp": 1.01837468, "epoch": 0.30830277159862923, "flos": 16209969102720.0, "grad_norm": 4.015592856918152, "language_loss": 0.82638413, "learning_rate": 3.2412289768606495e-06, "loss": 0.84917426, "num_input_tokens_seen": 55131945, "step": 2564, "time_per_iteration": 2.6094212532043457 }, { "auxiliary_loss_clip": 0.01254934, "auxiliary_loss_mlp": 0.01035525, "balance_loss_clip": 1.05752993, "balance_loss_mlp": 1.02763319, "epoch": 0.30842301448926834, "flos": 29349503723520.0, "grad_norm": 1.9822157545160815, "language_loss": 0.82378417, "learning_rate": 3.240618078718718e-06, "loss": 0.84668875, "num_input_tokens_seen": 55153405, "step": 2565, "time_per_iteration": 3.6713907718658447 }, { "auxiliary_loss_clip": 0.01354066, "auxiliary_loss_mlp": 0.01039894, "balance_loss_clip": 1.05279922, "balance_loss_mlp": 1.03078032, "epoch": 0.3085432573799074, "flos": 21945190798080.0, "grad_norm": 1.9788295634654907, "language_loss": 0.73930508, "learning_rate": 3.240006992376011e-06, "loss": 0.76324475, "num_input_tokens_seen": 55173030, "step": 2566, "time_per_iteration": 2.740485906600952 }, { "auxiliary_loss_clip": 0.01309011, "auxiliary_loss_mlp": 0.01035593, "balance_loss_clip": 1.05854392, "balance_loss_mlp": 1.0272243, "epoch": 0.3086635002705465, "flos": 22054718344320.0, "grad_norm": 3.353193239022271, "language_loss": 0.76071918, "learning_rate": 3.2393957179252284e-06, "loss": 0.78416514, "num_input_tokens_seen": 55189565, "step": 2567, "time_per_iteration": 2.693664312362671 }, { "auxiliary_loss_clip": 0.01204481, "auxiliary_loss_mlp": 0.0103002, "balance_loss_clip": 1.05976403, "balance_loss_mlp": 1.02151406, "epoch": 0.3087837431611856, "flos": 32665925520000.0, "grad_norm": 2.224481049892084, "language_loss": 0.80255616, "learning_rate": 3.2387842554591016e-06, "loss": 0.82490122, "num_input_tokens_seen": 55210380, "step": 2568, "time_per_iteration": 3.6758198738098145 }, { "auxiliary_loss_clip": 0.01206593, "auxiliary_loss_mlp": 0.01035904, "balance_loss_clip": 1.06116414, "balance_loss_mlp": 1.02757168, "epoch": 0.3089039860518247, "flos": 17599245384960.0, "grad_norm": 2.7677424475190295, "language_loss": 0.88152587, "learning_rate": 3.238172605070388e-06, "loss": 0.90395081, "num_input_tokens_seen": 55225795, "step": 2569, "time_per_iteration": 2.585561990737915 }, { "auxiliary_loss_clip": 0.01252035, "auxiliary_loss_mlp": 0.0257224, "balance_loss_clip": 1.0574913, "balance_loss_mlp": 1.00007486, "epoch": 0.3090242289424638, "flos": 14383839611520.0, "grad_norm": 3.0914221368211208, "language_loss": 0.78696358, "learning_rate": 3.2375607668518745e-06, "loss": 0.82520628, "num_input_tokens_seen": 55238830, "step": 2570, "time_per_iteration": 2.6023786067962646 }, { "auxiliary_loss_clip": 0.01297431, "auxiliary_loss_mlp": 0.01032491, "balance_loss_clip": 1.0551827, "balance_loss_mlp": 1.02382493, "epoch": 0.30914447183310284, "flos": 16068625084800.0, "grad_norm": 2.4990489493216352, "language_loss": 0.90136087, "learning_rate": 3.236948740896377e-06, "loss": 0.92466009, "num_input_tokens_seen": 55253630, "step": 2571, "time_per_iteration": 2.7142250537872314 }, { "auxiliary_loss_clip": 0.01257607, "auxiliary_loss_mlp": 0.01033299, "balance_loss_clip": 1.06099617, "balance_loss_mlp": 1.02485895, "epoch": 0.30926471472374195, "flos": 32230221546240.0, "grad_norm": 1.567303887259897, "language_loss": 0.84342611, "learning_rate": 3.2363365272967384e-06, "loss": 0.86633515, "num_input_tokens_seen": 55276200, "step": 2572, "time_per_iteration": 2.727769374847412 }, { "auxiliary_loss_clip": 0.01262043, "auxiliary_loss_mlp": 0.01033711, "balance_loss_clip": 1.06499553, "balance_loss_mlp": 1.02450168, "epoch": 0.30938495761438106, "flos": 20370722970240.0, "grad_norm": 2.0399627037753842, "language_loss": 0.81581736, "learning_rate": 3.235724126145832e-06, "loss": 0.83877492, "num_input_tokens_seen": 55292235, "step": 2573, "time_per_iteration": 2.6365392208099365 }, { "auxiliary_loss_clip": 0.01245663, "auxiliary_loss_mlp": 0.0103035, "balance_loss_clip": 1.05361378, "balance_loss_mlp": 1.02160573, "epoch": 0.3095052005050201, "flos": 24061155131520.0, "grad_norm": 1.6509954668390137, "language_loss": 0.77672851, "learning_rate": 3.235111537536558e-06, "loss": 0.7994886, "num_input_tokens_seen": 55313050, "step": 2574, "time_per_iteration": 2.6493844985961914 }, { "auxiliary_loss_clip": 0.01254801, "auxiliary_loss_mlp": 0.01031738, "balance_loss_clip": 1.05844808, "balance_loss_mlp": 1.02398968, "epoch": 0.30962544339565923, "flos": 23401547729280.0, "grad_norm": 2.560953187789091, "language_loss": 0.83241045, "learning_rate": 3.2344987615618456e-06, "loss": 0.85527581, "num_input_tokens_seen": 55332885, "step": 2575, "time_per_iteration": 2.734856128692627 }, { "auxiliary_loss_clip": 0.01358466, "auxiliary_loss_mlp": 0.01032452, "balance_loss_clip": 1.05991209, "balance_loss_mlp": 1.02445948, "epoch": 0.30974568628629834, "flos": 33799984692480.0, "grad_norm": 1.8082393891677595, "language_loss": 0.7883271, "learning_rate": 3.2338857983146533e-06, "loss": 0.81223631, "num_input_tokens_seen": 55354385, "step": 2576, "time_per_iteration": 2.8065104484558105 }, { "auxiliary_loss_clip": 0.01296711, "auxiliary_loss_mlp": 0.01034541, "balance_loss_clip": 1.05661082, "balance_loss_mlp": 1.02561784, "epoch": 0.3098659291769374, "flos": 20229594433920.0, "grad_norm": 1.9312607628618408, "language_loss": 0.76604176, "learning_rate": 3.233272647887966e-06, "loss": 0.78935432, "num_input_tokens_seen": 55373275, "step": 2577, "time_per_iteration": 2.7169435024261475 }, { "auxiliary_loss_clip": 0.01207463, "auxiliary_loss_mlp": 0.01036724, "balance_loss_clip": 1.06110811, "balance_loss_mlp": 1.0279386, "epoch": 0.3099861720675765, "flos": 24748556682240.0, "grad_norm": 5.819643060748056, "language_loss": 0.90072691, "learning_rate": 3.2326593103747985e-06, "loss": 0.92316878, "num_input_tokens_seen": 55392290, "step": 2578, "time_per_iteration": 2.6125731468200684 }, { "auxiliary_loss_clip": 0.01254327, "auxiliary_loss_mlp": 0.01032307, "balance_loss_clip": 1.06139898, "balance_loss_mlp": 1.02413511, "epoch": 0.3101064149582156, "flos": 11765485704960.0, "grad_norm": 2.2000721076735457, "language_loss": 0.85142398, "learning_rate": 3.2320457858681936e-06, "loss": 0.87429029, "num_input_tokens_seen": 55410680, "step": 2579, "time_per_iteration": 2.653794050216675 }, { "auxiliary_loss_clip": 0.01300898, "auxiliary_loss_mlp": 0.0103566, "balance_loss_clip": 1.05523396, "balance_loss_mlp": 1.02783382, "epoch": 0.31022665784885467, "flos": 23033247626880.0, "grad_norm": 2.6754616918353036, "language_loss": 0.85365307, "learning_rate": 3.2314320744612228e-06, "loss": 0.87701869, "num_input_tokens_seen": 55425980, "step": 2580, "time_per_iteration": 2.685758113861084 }, { "auxiliary_loss_clip": 0.01248189, "auxiliary_loss_mlp": 0.01036298, "balance_loss_clip": 1.05706644, "balance_loss_mlp": 1.02825129, "epoch": 0.3103469007394938, "flos": 16289188548480.0, "grad_norm": 3.136061493129395, "language_loss": 0.76703787, "learning_rate": 3.2308181762469854e-06, "loss": 0.78988278, "num_input_tokens_seen": 55443925, "step": 2581, "time_per_iteration": 2.6801700592041016 }, { "auxiliary_loss_clip": 0.01209423, "auxiliary_loss_mlp": 0.01029757, "balance_loss_clip": 1.06164432, "balance_loss_mlp": 1.02154326, "epoch": 0.3104671436301329, "flos": 30515271626880.0, "grad_norm": 2.272375049921895, "language_loss": 0.78607845, "learning_rate": 3.230204091318609e-06, "loss": 0.80847025, "num_input_tokens_seen": 55464465, "step": 2582, "time_per_iteration": 2.690950393676758 }, { "auxiliary_loss_clip": 0.01203913, "auxiliary_loss_mlp": 0.02566156, "balance_loss_clip": 1.05988336, "balance_loss_mlp": 1.00005782, "epoch": 0.31058738652077195, "flos": 20047240062720.0, "grad_norm": 1.8734187265432434, "language_loss": 0.84478903, "learning_rate": 3.2295898197692503e-06, "loss": 0.88248968, "num_input_tokens_seen": 55483425, "step": 2583, "time_per_iteration": 2.6488585472106934 }, { "auxiliary_loss_clip": 0.01203201, "auxiliary_loss_mlp": 0.01029858, "balance_loss_clip": 1.0593245, "balance_loss_mlp": 1.02148962, "epoch": 0.31070762941141106, "flos": 28074639237120.0, "grad_norm": 3.0773850247500896, "language_loss": 0.79427254, "learning_rate": 3.228975361692094e-06, "loss": 0.81660312, "num_input_tokens_seen": 55504445, "step": 2584, "time_per_iteration": 3.688056230545044 }, { "auxiliary_loss_clip": 0.01257344, "auxiliary_loss_mlp": 0.02572953, "balance_loss_clip": 1.05572379, "balance_loss_mlp": 1.00009215, "epoch": 0.31082787230205017, "flos": 20521907314560.0, "grad_norm": 2.280309071984574, "language_loss": 0.80451769, "learning_rate": 3.228360717180352e-06, "loss": 0.84282064, "num_input_tokens_seen": 55521970, "step": 2585, "time_per_iteration": 2.6695268154144287 }, { "auxiliary_loss_clip": 0.01107311, "auxiliary_loss_mlp": 0.02517025, "balance_loss_clip": 1.0330776, "balance_loss_mlp": 1.00003421, "epoch": 0.3109481151926892, "flos": 62445928723200.0, "grad_norm": 0.8104947979672548, "language_loss": 0.59378445, "learning_rate": 3.227745886327266e-06, "loss": 0.63002783, "num_input_tokens_seen": 55580665, "step": 2586, "time_per_iteration": 3.9291703701019287 }, { "auxiliary_loss_clip": 0.01107244, "auxiliary_loss_mlp": 0.01004126, "balance_loss_clip": 1.03292561, "balance_loss_mlp": 1.00304151, "epoch": 0.31106835808332833, "flos": 44746744723200.0, "grad_norm": 0.8323980271178039, "language_loss": 0.55842137, "learning_rate": 3.227130869226105e-06, "loss": 0.57953513, "num_input_tokens_seen": 55637825, "step": 2587, "time_per_iteration": 3.197234869003296 }, { "auxiliary_loss_clip": 0.01256879, "auxiliary_loss_mlp": 0.01029961, "balance_loss_clip": 1.05958319, "balance_loss_mlp": 1.02170539, "epoch": 0.3111886009739674, "flos": 23403056100480.0, "grad_norm": 2.6996845583101314, "language_loss": 0.82914019, "learning_rate": 3.226515665970167e-06, "loss": 0.85200846, "num_input_tokens_seen": 55655365, "step": 2588, "time_per_iteration": 2.6235191822052 }, { "auxiliary_loss_clip": 0.01253638, "auxiliary_loss_mlp": 0.01028366, "balance_loss_clip": 1.05981708, "balance_loss_mlp": 1.01983023, "epoch": 0.3113088438646065, "flos": 17530728192000.0, "grad_norm": 2.3039205507084506, "language_loss": 0.86523569, "learning_rate": 3.225900276652777e-06, "loss": 0.88805568, "num_input_tokens_seen": 55672140, "step": 2589, "time_per_iteration": 2.6641392707824707 }, { "auxiliary_loss_clip": 0.01309584, "auxiliary_loss_mlp": 0.01035138, "balance_loss_clip": 1.05664945, "balance_loss_mlp": 1.02785993, "epoch": 0.3114290867552456, "flos": 28365802882560.0, "grad_norm": 1.6719137740631647, "language_loss": 0.75417501, "learning_rate": 3.2252847013672906e-06, "loss": 0.77762222, "num_input_tokens_seen": 55694800, "step": 2590, "time_per_iteration": 2.733449935913086 }, { "auxiliary_loss_clip": 0.01349142, "auxiliary_loss_mlp": 0.01038151, "balance_loss_clip": 1.0538888, "balance_loss_mlp": 1.02962804, "epoch": 0.31154932964588467, "flos": 27379157126400.0, "grad_norm": 2.0349243057469755, "language_loss": 0.75895315, "learning_rate": 3.224668940207089e-06, "loss": 0.78282607, "num_input_tokens_seen": 55713785, "step": 2591, "time_per_iteration": 3.760448455810547 }, { "auxiliary_loss_clip": 0.01398454, "auxiliary_loss_mlp": 0.01030171, "balance_loss_clip": 1.04899609, "balance_loss_mlp": 1.02186847, "epoch": 0.3116695725365238, "flos": 26541864120960.0, "grad_norm": 2.212640223224331, "language_loss": 0.87108296, "learning_rate": 3.2240529932655828e-06, "loss": 0.89536929, "num_input_tokens_seen": 55733050, "step": 2592, "time_per_iteration": 2.8786935806274414 }, { "auxiliary_loss_clip": 0.0130561, "auxiliary_loss_mlp": 0.01029506, "balance_loss_clip": 1.06143069, "balance_loss_mlp": 1.02124488, "epoch": 0.3117898154271629, "flos": 21177600134400.0, "grad_norm": 2.6297873591271523, "language_loss": 0.88620102, "learning_rate": 3.223436860636211e-06, "loss": 0.90955216, "num_input_tokens_seen": 55748685, "step": 2593, "time_per_iteration": 2.753444194793701 }, { "auxiliary_loss_clip": 0.01205636, "auxiliary_loss_mlp": 0.01028164, "balance_loss_clip": 1.06051803, "balance_loss_mlp": 1.02057612, "epoch": 0.31191005831780194, "flos": 27272430840960.0, "grad_norm": 1.7254291564189985, "language_loss": 0.73916161, "learning_rate": 3.2228205424124403e-06, "loss": 0.76149964, "num_input_tokens_seen": 55771840, "step": 2594, "time_per_iteration": 3.726764440536499 }, { "auxiliary_loss_clip": 0.01293893, "auxiliary_loss_mlp": 0.01035372, "balance_loss_clip": 1.05482888, "balance_loss_mlp": 1.02752244, "epoch": 0.31203030120844105, "flos": 12963501043200.0, "grad_norm": 2.4478722514610975, "language_loss": 0.74964136, "learning_rate": 3.222204038687765e-06, "loss": 0.77293396, "num_input_tokens_seen": 55784975, "step": 2595, "time_per_iteration": 2.620241641998291 }, { "auxiliary_loss_clip": 0.01254651, "auxiliary_loss_mlp": 0.0102947, "balance_loss_clip": 1.06047249, "balance_loss_mlp": 1.02171564, "epoch": 0.31215054409908016, "flos": 27562014288000.0, "grad_norm": 1.8520011119193083, "language_loss": 0.88272351, "learning_rate": 3.221587349555709e-06, "loss": 0.90556479, "num_input_tokens_seen": 55805235, "step": 2596, "time_per_iteration": 2.724022388458252 }, { "auxiliary_loss_clip": 0.01310072, "auxiliary_loss_mlp": 0.01028608, "balance_loss_clip": 1.05961442, "balance_loss_mlp": 1.0208478, "epoch": 0.3122707869897192, "flos": 21506326427520.0, "grad_norm": 1.667087980491596, "language_loss": 0.69487178, "learning_rate": 3.2209704751098236e-06, "loss": 0.7182585, "num_input_tokens_seen": 55824265, "step": 2597, "time_per_iteration": 2.6754043102264404 }, { "auxiliary_loss_clip": 0.01308717, "auxiliary_loss_mlp": 0.0103597, "balance_loss_clip": 1.06111884, "balance_loss_mlp": 1.02802432, "epoch": 0.31239102988035833, "flos": 15187017674880.0, "grad_norm": 2.1665709191409817, "language_loss": 0.8284201, "learning_rate": 3.2203534154436875e-06, "loss": 0.85186696, "num_input_tokens_seen": 55838620, "step": 2598, "time_per_iteration": 2.647808313369751 }, { "auxiliary_loss_clip": 0.01456195, "auxiliary_loss_mlp": 0.01036203, "balance_loss_clip": 1.05263519, "balance_loss_mlp": 1.02793562, "epoch": 0.31251127277099744, "flos": 22053712763520.0, "grad_norm": 3.228391160571132, "language_loss": 0.7574358, "learning_rate": 3.2197361706509084e-06, "loss": 0.78235978, "num_input_tokens_seen": 55859375, "step": 2599, "time_per_iteration": 2.9194610118865967 }, { "auxiliary_loss_clip": 0.01208846, "auxiliary_loss_mlp": 0.01032172, "balance_loss_clip": 1.06102347, "balance_loss_mlp": 1.02255738, "epoch": 0.3126315156616365, "flos": 15193984913280.0, "grad_norm": 2.568158748483563, "language_loss": 0.83345377, "learning_rate": 3.2191187408251228e-06, "loss": 0.85586399, "num_input_tokens_seen": 55876535, "step": 2600, "time_per_iteration": 2.7066690921783447 }, { "auxiliary_loss_clip": 0.01172101, "auxiliary_loss_mlp": 0.01033967, "balance_loss_clip": 1.05897069, "balance_loss_mlp": 1.02449012, "epoch": 0.3127517585522756, "flos": 18145338831360.0, "grad_norm": 2.2744671706200794, "language_loss": 0.79495949, "learning_rate": 3.218501126059993e-06, "loss": 0.81702012, "num_input_tokens_seen": 55891930, "step": 2601, "time_per_iteration": 2.6325268745422363 }, { "auxiliary_loss_clip": 0.01256971, "auxiliary_loss_mlp": 0.01030007, "balance_loss_clip": 1.05627525, "balance_loss_mlp": 1.02150726, "epoch": 0.31287200144291466, "flos": 21908633731200.0, "grad_norm": 2.3161342847400683, "language_loss": 0.8171888, "learning_rate": 3.2178833264492116e-06, "loss": 0.84005857, "num_input_tokens_seen": 55910635, "step": 2602, "time_per_iteration": 2.641519784927368 }, { "auxiliary_loss_clip": 0.01263344, "auxiliary_loss_mlp": 0.01031188, "balance_loss_clip": 1.06111741, "balance_loss_mlp": 1.02302837, "epoch": 0.31299224433355377, "flos": 29896997800320.0, "grad_norm": 2.189328587643372, "language_loss": 0.76024026, "learning_rate": 3.217265342086498e-06, "loss": 0.7831856, "num_input_tokens_seen": 55931125, "step": 2603, "time_per_iteration": 2.8644754886627197 }, { "auxiliary_loss_clip": 0.01367165, "auxiliary_loss_mlp": 0.02576441, "balance_loss_clip": 1.06262064, "balance_loss_mlp": 1.00011706, "epoch": 0.3131124872241929, "flos": 11655886331520.0, "grad_norm": 2.3854601730618055, "language_loss": 0.73357391, "learning_rate": 3.216647173065599e-06, "loss": 0.77301002, "num_input_tokens_seen": 55946590, "step": 2604, "time_per_iteration": 2.7110767364501953 }, { "auxiliary_loss_clip": 0.0130672, "auxiliary_loss_mlp": 0.01033072, "balance_loss_clip": 1.06281352, "balance_loss_mlp": 1.02442908, "epoch": 0.31323273011483194, "flos": 49848785470080.0, "grad_norm": 2.0629840639839996, "language_loss": 0.7400552, "learning_rate": 3.216028819480292e-06, "loss": 0.76345313, "num_input_tokens_seen": 55967930, "step": 2605, "time_per_iteration": 2.97039532661438 }, { "auxiliary_loss_clip": 0.01300112, "auxiliary_loss_mlp": 0.01025335, "balance_loss_clip": 1.05938387, "balance_loss_mlp": 1.01755977, "epoch": 0.31335297300547105, "flos": 22601278667520.0, "grad_norm": 1.8865230500203014, "language_loss": 0.75654948, "learning_rate": 3.2154102814243793e-06, "loss": 0.77980399, "num_input_tokens_seen": 55987070, "step": 2606, "time_per_iteration": 2.712132453918457 }, { "auxiliary_loss_clip": 0.01359426, "auxiliary_loss_mlp": 0.01032345, "balance_loss_clip": 1.05772161, "balance_loss_mlp": 1.02432859, "epoch": 0.31347321589611016, "flos": 34710858708480.0, "grad_norm": 1.936714142345138, "language_loss": 0.67032695, "learning_rate": 3.2147915589916937e-06, "loss": 0.69424462, "num_input_tokens_seen": 56008630, "step": 2607, "time_per_iteration": 2.7964301109313965 }, { "auxiliary_loss_clip": 0.01301666, "auxiliary_loss_mlp": 0.01033256, "balance_loss_clip": 1.05456531, "balance_loss_mlp": 1.02519774, "epoch": 0.3135934587867492, "flos": 19755789108480.0, "grad_norm": 1.977524694433244, "language_loss": 0.83059335, "learning_rate": 3.2141726522760938e-06, "loss": 0.85394251, "num_input_tokens_seen": 56026690, "step": 2608, "time_per_iteration": 2.657794237136841 }, { "auxiliary_loss_clip": 0.01153889, "auxiliary_loss_mlp": 0.01006151, "balance_loss_clip": 1.03073955, "balance_loss_mlp": 1.00495899, "epoch": 0.3137137016773883, "flos": 65815535583360.0, "grad_norm": 0.7050518329998078, "language_loss": 0.52671015, "learning_rate": 3.213553561371469e-06, "loss": 0.54831058, "num_input_tokens_seen": 56090425, "step": 2609, "time_per_iteration": 3.2950243949890137 }, { "auxiliary_loss_clip": 0.0140312, "auxiliary_loss_mlp": 0.01031283, "balance_loss_clip": 1.05880332, "balance_loss_mlp": 1.02329648, "epoch": 0.31383394456802743, "flos": 16252739222400.0, "grad_norm": 2.4308381432049107, "language_loss": 0.95758235, "learning_rate": 3.212934286371733e-06, "loss": 0.98192644, "num_input_tokens_seen": 56107135, "step": 2610, "time_per_iteration": 3.6874406337738037 }, { "auxiliary_loss_clip": 0.01265827, "auxiliary_loss_mlp": 0.01040609, "balance_loss_clip": 1.06736541, "balance_loss_mlp": 1.03182912, "epoch": 0.3139541874586665, "flos": 38795517613440.0, "grad_norm": 2.2437586119147745, "language_loss": 0.83287954, "learning_rate": 3.2123148273708304e-06, "loss": 0.85594392, "num_input_tokens_seen": 56127325, "step": 2611, "time_per_iteration": 2.901592493057251 }, { "auxiliary_loss_clip": 0.01206533, "auxiliary_loss_mlp": 0.01029842, "balance_loss_clip": 1.06112742, "balance_loss_mlp": 1.02199793, "epoch": 0.3140744303493056, "flos": 25046328430080.0, "grad_norm": 1.9885285550459908, "language_loss": 0.76903784, "learning_rate": 3.211695184462733e-06, "loss": 0.79140151, "num_input_tokens_seen": 56148500, "step": 2612, "time_per_iteration": 3.5375256538391113 }, { "auxiliary_loss_clip": 0.0116732, "auxiliary_loss_mlp": 0.01002378, "balance_loss_clip": 1.02844024, "balance_loss_mlp": 1.0011797, "epoch": 0.3141946732399447, "flos": 72504254782080.0, "grad_norm": 0.8951932624017819, "language_loss": 0.60478055, "learning_rate": 3.2110753577414383e-06, "loss": 0.6264776, "num_input_tokens_seen": 56210080, "step": 2613, "time_per_iteration": 3.2703185081481934 }, { "auxiliary_loss_clip": 0.01304312, "auxiliary_loss_mlp": 0.01034386, "balance_loss_clip": 1.05470657, "balance_loss_mlp": 1.02642918, "epoch": 0.31431491613058377, "flos": 19239788280960.0, "grad_norm": 1.8547883805677348, "language_loss": 0.79282504, "learning_rate": 3.2104553473009757e-06, "loss": 0.81621206, "num_input_tokens_seen": 56228200, "step": 2614, "time_per_iteration": 2.6808419227600098 }, { "auxiliary_loss_clip": 0.01397867, "auxiliary_loss_mlp": 0.01030678, "balance_loss_clip": 1.05474305, "balance_loss_mlp": 1.02238166, "epoch": 0.3144351590212229, "flos": 36210596290560.0, "grad_norm": 3.2184628539957, "language_loss": 0.67738903, "learning_rate": 3.209835153235399e-06, "loss": 0.70167446, "num_input_tokens_seen": 56249755, "step": 2615, "time_per_iteration": 2.8863797187805176 }, { "auxiliary_loss_clip": 0.01258476, "auxiliary_loss_mlp": 0.01035241, "balance_loss_clip": 1.05202198, "balance_loss_mlp": 1.02742672, "epoch": 0.314555401911862, "flos": 18551740285440.0, "grad_norm": 1.7007504015998862, "language_loss": 0.67697024, "learning_rate": 3.2092147756387916e-06, "loss": 0.69990736, "num_input_tokens_seen": 56270080, "step": 2616, "time_per_iteration": 2.790973424911499 }, { "auxiliary_loss_clip": 0.01299107, "auxiliary_loss_mlp": 0.01028711, "balance_loss_clip": 1.05315101, "balance_loss_mlp": 1.0202111, "epoch": 0.31467564480250104, "flos": 16362877299840.0, "grad_norm": 1.8453704888393034, "language_loss": 0.83759099, "learning_rate": 3.208594214605264e-06, "loss": 0.86086917, "num_input_tokens_seen": 56288625, "step": 2617, "time_per_iteration": 3.732304334640503 }, { "auxiliary_loss_clip": 0.01294631, "auxiliary_loss_mlp": 0.0102312, "balance_loss_clip": 1.05522943, "balance_loss_mlp": 1.01566029, "epoch": 0.31479588769314015, "flos": 21652375127040.0, "grad_norm": 2.1848674092657827, "language_loss": 0.77311689, "learning_rate": 3.2079734702289553e-06, "loss": 0.79629439, "num_input_tokens_seen": 56307520, "step": 2618, "time_per_iteration": 2.718733072280884 }, { "auxiliary_loss_clip": 0.01146531, "auxiliary_loss_mlp": 0.02516265, "balance_loss_clip": 1.02582407, "balance_loss_mlp": 0.99998778, "epoch": 0.3149161305837792, "flos": 66051072040320.0, "grad_norm": 0.8017900479752503, "language_loss": 0.60366929, "learning_rate": 3.207352542604031e-06, "loss": 0.64029723, "num_input_tokens_seen": 56369855, "step": 2619, "time_per_iteration": 3.3148999214172363 }, { "auxiliary_loss_clip": 0.01342324, "auxiliary_loss_mlp": 0.01032421, "balance_loss_clip": 1.05376124, "balance_loss_mlp": 1.02497911, "epoch": 0.3150363734744183, "flos": 28987201192320.0, "grad_norm": 2.390757715221807, "language_loss": 0.78314531, "learning_rate": 3.2067314318246864e-06, "loss": 0.80689275, "num_input_tokens_seen": 56390570, "step": 2620, "time_per_iteration": 3.7029073238372803 }, { "auxiliary_loss_clip": 0.0135603, "auxiliary_loss_mlp": 0.0102822, "balance_loss_clip": 1.05998504, "balance_loss_mlp": 1.01958954, "epoch": 0.31515661636505743, "flos": 27636600879360.0, "grad_norm": 4.402638896751761, "language_loss": 0.77667427, "learning_rate": 3.206110137985143e-06, "loss": 0.80051684, "num_input_tokens_seen": 56410775, "step": 2621, "time_per_iteration": 2.825878620147705 }, { "auxiliary_loss_clip": 0.01345006, "auxiliary_loss_mlp": 0.01028894, "balance_loss_clip": 1.05534995, "balance_loss_mlp": 1.02106762, "epoch": 0.3152768592556965, "flos": 24605632465920.0, "grad_norm": 2.0470862330913158, "language_loss": 0.92449665, "learning_rate": 3.2054886611796505e-06, "loss": 0.94823557, "num_input_tokens_seen": 56429770, "step": 2622, "time_per_iteration": 2.74845552444458 }, { "auxiliary_loss_clip": 0.01100579, "auxiliary_loss_mlp": 0.01000162, "balance_loss_clip": 1.02807856, "balance_loss_mlp": 0.99902326, "epoch": 0.3153971021463356, "flos": 68476908026880.0, "grad_norm": 0.8852333712054345, "language_loss": 0.63478971, "learning_rate": 3.204867001502487e-06, "loss": 0.65579712, "num_input_tokens_seen": 56488425, "step": 2623, "time_per_iteration": 3.215188980102539 }, { "auxiliary_loss_clip": 0.0120637, "auxiliary_loss_mlp": 0.0103167, "balance_loss_clip": 1.06150568, "balance_loss_mlp": 1.02317023, "epoch": 0.3155173450369747, "flos": 25593714766080.0, "grad_norm": 3.127188905721485, "language_loss": 0.81127769, "learning_rate": 3.2042451590479567e-06, "loss": 0.8336581, "num_input_tokens_seen": 56508940, "step": 2624, "time_per_iteration": 2.687311887741089 }, { "auxiliary_loss_clip": 0.0120062, "auxiliary_loss_mlp": 0.01028356, "balance_loss_clip": 1.05941081, "balance_loss_mlp": 1.02081609, "epoch": 0.31563758792761376, "flos": 24309333175680.0, "grad_norm": 2.0302451979005802, "language_loss": 0.87010062, "learning_rate": 3.203623133910394e-06, "loss": 0.89239037, "num_input_tokens_seen": 56527245, "step": 2625, "time_per_iteration": 2.6549482345581055 }, { "auxiliary_loss_clip": 0.01358466, "auxiliary_loss_mlp": 0.01029363, "balance_loss_clip": 1.0503552, "balance_loss_mlp": 1.02105403, "epoch": 0.31575783081825287, "flos": 31903865550720.0, "grad_norm": 2.3108245918482395, "language_loss": 0.77736545, "learning_rate": 3.203000926184158e-06, "loss": 0.80124372, "num_input_tokens_seen": 56546170, "step": 2626, "time_per_iteration": 2.8824033737182617 }, { "auxiliary_loss_clip": 0.0120297, "auxiliary_loss_mlp": 0.0102895, "balance_loss_clip": 1.06040978, "balance_loss_mlp": 1.02139211, "epoch": 0.315878073708892, "flos": 30810960385920.0, "grad_norm": 1.9777753666211824, "language_loss": 0.77584028, "learning_rate": 3.202378535963639e-06, "loss": 0.79815948, "num_input_tokens_seen": 56567085, "step": 2627, "time_per_iteration": 2.6417059898376465 }, { "auxiliary_loss_clip": 0.01294119, "auxiliary_loss_mlp": 0.02574313, "balance_loss_clip": 1.05333233, "balance_loss_mlp": 1.00019026, "epoch": 0.31599831659953104, "flos": 22200264253440.0, "grad_norm": 1.7332815697587876, "language_loss": 0.83744431, "learning_rate": 3.2017559633432516e-06, "loss": 0.87612861, "num_input_tokens_seen": 56586715, "step": 2628, "time_per_iteration": 2.7136871814727783 }, { "auxiliary_loss_clip": 0.01220465, "auxiliary_loss_mlp": 0.01031985, "balance_loss_clip": 1.05743194, "balance_loss_mlp": 1.02423668, "epoch": 0.31611855949017015, "flos": 25593463370880.0, "grad_norm": 2.1370897876276964, "language_loss": 0.66224897, "learning_rate": 3.2011332084174398e-06, "loss": 0.68477345, "num_input_tokens_seen": 56607585, "step": 2629, "time_per_iteration": 2.7201805114746094 }, { "auxiliary_loss_clip": 0.01251433, "auxiliary_loss_mlp": 0.0103244, "balance_loss_clip": 1.05958056, "balance_loss_mlp": 1.02392292, "epoch": 0.31623880238080926, "flos": 20594087694720.0, "grad_norm": 1.8572268910444019, "language_loss": 0.89153993, "learning_rate": 3.2005102712806756e-06, "loss": 0.91437864, "num_input_tokens_seen": 56626415, "step": 2630, "time_per_iteration": 2.699655771255493 }, { "auxiliary_loss_clip": 0.01258004, "auxiliary_loss_mlp": 0.01040211, "balance_loss_clip": 1.05971277, "balance_loss_mlp": 1.03190827, "epoch": 0.3163590452714483, "flos": 12784917600000.0, "grad_norm": 2.371543973308338, "language_loss": 0.73298109, "learning_rate": 3.1998871520274575e-06, "loss": 0.75596333, "num_input_tokens_seen": 56641750, "step": 2631, "time_per_iteration": 2.5854902267456055 }, { "auxiliary_loss_clip": 0.01305815, "auxiliary_loss_mlp": 0.01034943, "balance_loss_clip": 1.05491173, "balance_loss_mlp": 1.02657402, "epoch": 0.3164792881620874, "flos": 23041292273280.0, "grad_norm": 1.978370651424368, "language_loss": 0.85087037, "learning_rate": 3.199263850752312e-06, "loss": 0.87427801, "num_input_tokens_seen": 56662585, "step": 2632, "time_per_iteration": 2.6568939685821533 }, { "auxiliary_loss_clip": 0.01254642, "auxiliary_loss_mlp": 0.01028002, "balance_loss_clip": 1.05916095, "balance_loss_mlp": 1.01972342, "epoch": 0.31659953105272653, "flos": 18296271780480.0, "grad_norm": 2.193516818986262, "language_loss": 0.86128891, "learning_rate": 3.198640367549795e-06, "loss": 0.8841154, "num_input_tokens_seen": 56681480, "step": 2633, "time_per_iteration": 2.5120790004730225 }, { "auxiliary_loss_clip": 0.01251963, "auxiliary_loss_mlp": 0.02566585, "balance_loss_clip": 1.05615687, "balance_loss_mlp": 1.00010931, "epoch": 0.3167197739433656, "flos": 25703421880320.0, "grad_norm": 1.8682370984114252, "language_loss": 0.85918224, "learning_rate": 3.198016702514487e-06, "loss": 0.89736766, "num_input_tokens_seen": 56701760, "step": 2634, "time_per_iteration": 2.58004093170166 }, { "auxiliary_loss_clip": 0.01202193, "auxiliary_loss_mlp": 0.01033024, "balance_loss_clip": 1.05894196, "balance_loss_mlp": 1.02525783, "epoch": 0.3168400168340047, "flos": 23546016230400.0, "grad_norm": 1.8367441601476888, "language_loss": 0.84710979, "learning_rate": 3.1973928557409972e-06, "loss": 0.86946195, "num_input_tokens_seen": 56719800, "step": 2635, "time_per_iteration": 2.5049662590026855 }, { "auxiliary_loss_clip": 0.01199098, "auxiliary_loss_mlp": 0.01030581, "balance_loss_clip": 1.05780101, "balance_loss_mlp": 1.02304077, "epoch": 0.31696025972464376, "flos": 28366449327360.0, "grad_norm": 1.811254197334729, "language_loss": 0.71188992, "learning_rate": 3.1967688273239636e-06, "loss": 0.73418677, "num_input_tokens_seen": 56739605, "step": 2636, "time_per_iteration": 3.4778122901916504 }, { "auxiliary_loss_clip": 0.01256921, "auxiliary_loss_mlp": 0.01031362, "balance_loss_clip": 1.0547595, "balance_loss_mlp": 1.02283263, "epoch": 0.31708050261528287, "flos": 16399111144320.0, "grad_norm": 1.8445643560214395, "language_loss": 0.82240534, "learning_rate": 3.1961446173580503e-06, "loss": 0.84528816, "num_input_tokens_seen": 56756545, "step": 2637, "time_per_iteration": 2.648057222366333 }, { "auxiliary_loss_clip": 0.01298445, "auxiliary_loss_mlp": 0.01040947, "balance_loss_clip": 1.05530131, "balance_loss_mlp": 1.0331502, "epoch": 0.317200745505922, "flos": 26212347728640.0, "grad_norm": 1.9843795439982141, "language_loss": 0.7735458, "learning_rate": 3.1955202259379502e-06, "loss": 0.79693967, "num_input_tokens_seen": 56778275, "step": 2638, "time_per_iteration": 3.7504994869232178 }, { "auxiliary_loss_clip": 0.01248224, "auxiliary_loss_mlp": 0.01032965, "balance_loss_clip": 1.05383098, "balance_loss_mlp": 1.02465045, "epoch": 0.31732098839656103, "flos": 31350876693120.0, "grad_norm": 1.8478426675070594, "language_loss": 0.82787496, "learning_rate": 3.194895653158381e-06, "loss": 0.85068685, "num_input_tokens_seen": 56797215, "step": 2639, "time_per_iteration": 2.719618558883667 }, { "auxiliary_loss_clip": 0.01099874, "auxiliary_loss_mlp": 0.01000709, "balance_loss_clip": 1.02831197, "balance_loss_mlp": 0.99962467, "epoch": 0.31744123128720014, "flos": 58989024835200.0, "grad_norm": 0.7699304627334224, "language_loss": 0.55478692, "learning_rate": 3.194270899114093e-06, "loss": 0.57579279, "num_input_tokens_seen": 56863010, "step": 2640, "time_per_iteration": 3.2789809703826904 }, { "auxiliary_loss_clip": 0.01261305, "auxiliary_loss_mlp": 0.01031117, "balance_loss_clip": 1.05980361, "balance_loss_mlp": 1.02248597, "epoch": 0.31756147417783925, "flos": 17417573372160.0, "grad_norm": 1.6886904271210978, "language_loss": 0.82046837, "learning_rate": 3.193645963899858e-06, "loss": 0.84339261, "num_input_tokens_seen": 56880625, "step": 2641, "time_per_iteration": 2.581281900405884 }, { "auxiliary_loss_clip": 0.0129699, "auxiliary_loss_mlp": 0.01025089, "balance_loss_clip": 1.05566645, "balance_loss_mlp": 1.01725078, "epoch": 0.3176817170684783, "flos": 25481673267840.0, "grad_norm": 1.8294497974816792, "language_loss": 0.83934969, "learning_rate": 3.193020847610479e-06, "loss": 0.86257052, "num_input_tokens_seen": 56900945, "step": 2642, "time_per_iteration": 2.7501516342163086 }, { "auxiliary_loss_clip": 0.01299596, "auxiliary_loss_mlp": 0.01029933, "balance_loss_clip": 1.05723834, "balance_loss_mlp": 1.02171373, "epoch": 0.3178019599591174, "flos": 24972603765120.0, "grad_norm": 2.51364756056584, "language_loss": 0.71619183, "learning_rate": 3.192395550340787e-06, "loss": 0.73948711, "num_input_tokens_seen": 56918895, "step": 2643, "time_per_iteration": 2.7007617950439453 }, { "auxiliary_loss_clip": 0.0125213, "auxiliary_loss_mlp": 0.01027827, "balance_loss_clip": 1.06042695, "balance_loss_mlp": 1.01972675, "epoch": 0.31792220284975653, "flos": 12422220019200.0, "grad_norm": 2.612765236694129, "language_loss": 0.76524568, "learning_rate": 3.191770072185638e-06, "loss": 0.78804529, "num_input_tokens_seen": 56935890, "step": 2644, "time_per_iteration": 3.5311789512634277 }, { "auxiliary_loss_clip": 0.01249309, "auxiliary_loss_mlp": 0.01034126, "balance_loss_clip": 1.05841804, "balance_loss_mlp": 1.02616262, "epoch": 0.3180424457403956, "flos": 15485759089920.0, "grad_norm": 3.0292223950007764, "language_loss": 0.7276386, "learning_rate": 3.191144413239916e-06, "loss": 0.7504729, "num_input_tokens_seen": 56952460, "step": 2645, "time_per_iteration": 2.6155741214752197 }, { "auxiliary_loss_clip": 0.01303033, "auxiliary_loss_mlp": 0.01035237, "balance_loss_clip": 1.05682254, "balance_loss_mlp": 1.0263617, "epoch": 0.3181626886310347, "flos": 26174964648960.0, "grad_norm": 2.291325710728498, "language_loss": 0.88244081, "learning_rate": 3.190518573598534e-06, "loss": 0.90582347, "num_input_tokens_seen": 56969065, "step": 2646, "time_per_iteration": 3.6767804622650146 }, { "auxiliary_loss_clip": 0.01268043, "auxiliary_loss_mlp": 0.01033095, "balance_loss_clip": 1.05399013, "balance_loss_mlp": 1.0249238, "epoch": 0.3182829315216738, "flos": 25483109811840.0, "grad_norm": 2.0747474300194906, "language_loss": 0.77235401, "learning_rate": 3.1898925533564308e-06, "loss": 0.79536545, "num_input_tokens_seen": 56990535, "step": 2647, "time_per_iteration": 2.7842373847961426 }, { "auxiliary_loss_clip": 0.01348471, "auxiliary_loss_mlp": 0.01029112, "balance_loss_clip": 1.05457294, "balance_loss_mlp": 1.02098155, "epoch": 0.31840317441231286, "flos": 18113701927680.0, "grad_norm": 3.6533042660697745, "language_loss": 0.64172709, "learning_rate": 3.1892663526085733e-06, "loss": 0.66550291, "num_input_tokens_seen": 57008910, "step": 2648, "time_per_iteration": 2.6983659267425537 }, { "auxiliary_loss_clip": 0.01097223, "auxiliary_loss_mlp": 0.0100254, "balance_loss_clip": 1.02583635, "balance_loss_mlp": 1.00140762, "epoch": 0.31852341730295197, "flos": 64741948957440.0, "grad_norm": 0.7556654036943657, "language_loss": 0.56970489, "learning_rate": 3.188639971449956e-06, "loss": 0.59070253, "num_input_tokens_seen": 57074960, "step": 2649, "time_per_iteration": 3.1917014122009277 }, { "auxiliary_loss_clip": 0.01208223, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.0625751, "balance_loss_mlp": 1.02716517, "epoch": 0.318643660193591, "flos": 20668135582080.0, "grad_norm": 2.0433909391935483, "language_loss": 0.72406197, "learning_rate": 3.1880134099756e-06, "loss": 0.74650168, "num_input_tokens_seen": 57094595, "step": 2650, "time_per_iteration": 2.7830114364624023 }, { "auxiliary_loss_clip": 0.01251075, "auxiliary_loss_mlp": 0.01029821, "balance_loss_clip": 1.05611086, "balance_loss_mlp": 1.02210212, "epoch": 0.31876390308423014, "flos": 26943345411840.0, "grad_norm": 2.676414912668214, "language_loss": 0.69560033, "learning_rate": 3.1873866682805535e-06, "loss": 0.7184093, "num_input_tokens_seen": 57115290, "step": 2651, "time_per_iteration": 2.7680039405822754 }, { "auxiliary_loss_clip": 0.01212968, "auxiliary_loss_mlp": 0.01032902, "balance_loss_clip": 1.05739355, "balance_loss_mlp": 1.02449739, "epoch": 0.31888414597486925, "flos": 18041916597120.0, "grad_norm": 2.5013457451435017, "language_loss": 0.88686419, "learning_rate": 3.186759746459894e-06, "loss": 0.90932286, "num_input_tokens_seen": 57134400, "step": 2652, "time_per_iteration": 2.674445867538452 }, { "auxiliary_loss_clip": 0.01306501, "auxiliary_loss_mlp": 0.0103367, "balance_loss_clip": 1.05873799, "balance_loss_mlp": 1.02599287, "epoch": 0.3190043888655083, "flos": 25149319701120.0, "grad_norm": 2.2054625124492713, "language_loss": 0.79478741, "learning_rate": 3.1861326446087246e-06, "loss": 0.81818908, "num_input_tokens_seen": 57153140, "step": 2653, "time_per_iteration": 2.6881103515625 }, { "auxiliary_loss_clip": 0.01257402, "auxiliary_loss_mlp": 0.01031666, "balance_loss_clip": 1.05914283, "balance_loss_mlp": 1.0237149, "epoch": 0.3191246317561474, "flos": 22053892331520.0, "grad_norm": 2.0193220124165023, "language_loss": 0.71967405, "learning_rate": 3.1855053628221763e-06, "loss": 0.74256474, "num_input_tokens_seen": 57172395, "step": 2654, "time_per_iteration": 2.6911089420318604 }, { "auxiliary_loss_clip": 0.01345185, "auxiliary_loss_mlp": 0.0103353, "balance_loss_clip": 1.05012465, "balance_loss_mlp": 1.02523935, "epoch": 0.3192448746467865, "flos": 14901815687040.0, "grad_norm": 3.6443780678865227, "language_loss": 0.90298426, "learning_rate": 3.184877901195407e-06, "loss": 0.92677146, "num_input_tokens_seen": 57189090, "step": 2655, "time_per_iteration": 2.7021780014038086 }, { "auxiliary_loss_clip": 0.01223602, "auxiliary_loss_mlp": 0.01021292, "balance_loss_clip": 1.0370307, "balance_loss_mlp": 1.01974225, "epoch": 0.3193651175374256, "flos": 67234832657280.0, "grad_norm": 0.807462854414125, "language_loss": 0.62830091, "learning_rate": 3.184250259823602e-06, "loss": 0.6507498, "num_input_tokens_seen": 57251620, "step": 2656, "time_per_iteration": 3.3388729095458984 }, { "auxiliary_loss_clip": 0.01351347, "auxiliary_loss_mlp": 0.01036257, "balance_loss_clip": 1.05535078, "balance_loss_mlp": 1.0276804, "epoch": 0.3194853604280647, "flos": 12233077977600.0, "grad_norm": 2.2257390597809823, "language_loss": 0.81077009, "learning_rate": 3.183622438801974e-06, "loss": 0.83464617, "num_input_tokens_seen": 57266910, "step": 2657, "time_per_iteration": 2.7454466819763184 }, { "auxiliary_loss_clip": 0.01205387, "auxiliary_loss_mlp": 0.01030154, "balance_loss_clip": 1.06143773, "balance_loss_mlp": 1.02164245, "epoch": 0.3196056033187038, "flos": 14939917038720.0, "grad_norm": 2.0960073255799125, "language_loss": 0.75652945, "learning_rate": 3.1829944382257637e-06, "loss": 0.77888483, "num_input_tokens_seen": 57285040, "step": 2658, "time_per_iteration": 2.6610891819000244 }, { "auxiliary_loss_clip": 0.0124764, "auxiliary_loss_mlp": 0.01027519, "balance_loss_clip": 1.05850732, "balance_loss_mlp": 1.01935923, "epoch": 0.31972584620934286, "flos": 23768878164480.0, "grad_norm": 2.3456357389572813, "language_loss": 0.81617439, "learning_rate": 3.1823662581902373e-06, "loss": 0.83892596, "num_input_tokens_seen": 57302725, "step": 2659, "time_per_iteration": 2.6747899055480957 }, { "auxiliary_loss_clip": 0.0134123, "auxiliary_loss_mlp": 0.01033146, "balance_loss_clip": 1.04483783, "balance_loss_mlp": 1.02509332, "epoch": 0.31984608909998197, "flos": 21251540280960.0, "grad_norm": 2.533359844005296, "language_loss": 0.74684155, "learning_rate": 3.1817378987906896e-06, "loss": 0.7705853, "num_input_tokens_seen": 57322230, "step": 2660, "time_per_iteration": 2.7658181190490723 }, { "auxiliary_loss_clip": 0.0139937, "auxiliary_loss_mlp": 0.01038627, "balance_loss_clip": 1.05697954, "balance_loss_mlp": 1.0303185, "epoch": 0.3199663319906211, "flos": 18296235866880.0, "grad_norm": 2.197911236475607, "language_loss": 0.79761493, "learning_rate": 3.181109360122442e-06, "loss": 0.8219949, "num_input_tokens_seen": 57339820, "step": 2661, "time_per_iteration": 2.746469259262085 }, { "auxiliary_loss_clip": 0.01355573, "auxiliary_loss_mlp": 0.01034607, "balance_loss_clip": 1.05725932, "balance_loss_mlp": 1.02682257, "epoch": 0.32008657488126013, "flos": 18733627779840.0, "grad_norm": 2.4000800667214053, "language_loss": 0.78407705, "learning_rate": 3.1804806422808445e-06, "loss": 0.80797887, "num_input_tokens_seen": 57356955, "step": 2662, "time_per_iteration": 3.675178289413452 }, { "auxiliary_loss_clip": 0.01294723, "auxiliary_loss_mlp": 0.01029817, "balance_loss_clip": 1.05417264, "balance_loss_mlp": 1.02197611, "epoch": 0.32020681777189924, "flos": 20595344670720.0, "grad_norm": 1.6917754394645284, "language_loss": 0.72940731, "learning_rate": 3.1798517453612714e-06, "loss": 0.75265265, "num_input_tokens_seen": 57376760, "step": 2663, "time_per_iteration": 2.781967878341675 }, { "auxiliary_loss_clip": 0.0125556, "auxiliary_loss_mlp": 0.01031481, "balance_loss_clip": 1.06351423, "balance_loss_mlp": 1.02320182, "epoch": 0.32032706066253835, "flos": 35261692750080.0, "grad_norm": 1.888160123523702, "language_loss": 0.75253433, "learning_rate": 3.1792226694591265e-06, "loss": 0.77540469, "num_input_tokens_seen": 57398145, "step": 2664, "time_per_iteration": 3.6914520263671875 }, { "auxiliary_loss_clip": 0.01349098, "auxiliary_loss_mlp": 0.01032489, "balance_loss_clip": 1.05612767, "balance_loss_mlp": 1.02452612, "epoch": 0.3204473035531774, "flos": 15304230731520.0, "grad_norm": 2.320748410826593, "language_loss": 0.80468017, "learning_rate": 3.178593414669841e-06, "loss": 0.82849604, "num_input_tokens_seen": 57416730, "step": 2665, "time_per_iteration": 2.7475204467773438 }, { "auxiliary_loss_clip": 0.0126098, "auxiliary_loss_mlp": 0.01030925, "balance_loss_clip": 1.06090569, "balance_loss_mlp": 1.02199054, "epoch": 0.3205675464438165, "flos": 24462564595200.0, "grad_norm": 2.210410556760773, "language_loss": 0.70701647, "learning_rate": 3.1779639810888707e-06, "loss": 0.72993553, "num_input_tokens_seen": 57436325, "step": 2666, "time_per_iteration": 2.7112438678741455 }, { "auxiliary_loss_clip": 0.01249318, "auxiliary_loss_mlp": 0.01029794, "balance_loss_clip": 1.05921412, "balance_loss_mlp": 1.0215385, "epoch": 0.3206877893344556, "flos": 22456235548800.0, "grad_norm": 1.807298733297278, "language_loss": 0.761729, "learning_rate": 3.1773343688117013e-06, "loss": 0.78452009, "num_input_tokens_seen": 57457235, "step": 2667, "time_per_iteration": 2.6557557582855225 }, { "auxiliary_loss_clip": 0.01303419, "auxiliary_loss_mlp": 0.02569291, "balance_loss_clip": 1.05209088, "balance_loss_mlp": 1.00013876, "epoch": 0.3208080322250947, "flos": 20412236113920.0, "grad_norm": 2.169867215241575, "language_loss": 0.84351921, "learning_rate": 3.1767045779338445e-06, "loss": 0.88224632, "num_input_tokens_seen": 57474895, "step": 2668, "time_per_iteration": 2.7146055698394775 }, { "auxiliary_loss_clip": 0.01248969, "auxiliary_loss_mlp": 0.01029789, "balance_loss_clip": 1.05317998, "balance_loss_mlp": 1.02205276, "epoch": 0.3209282751157338, "flos": 21762118154880.0, "grad_norm": 2.02096144481064, "language_loss": 0.91592193, "learning_rate": 3.176074608550839e-06, "loss": 0.9387095, "num_input_tokens_seen": 57490715, "step": 2669, "time_per_iteration": 2.6530983448028564 }, { "auxiliary_loss_clip": 0.01447817, "auxiliary_loss_mlp": 0.01032737, "balance_loss_clip": 1.05010104, "balance_loss_mlp": 1.02421355, "epoch": 0.32104851800637285, "flos": 22055041566720.0, "grad_norm": 3.041283556750985, "language_loss": 0.82518566, "learning_rate": 3.17544446075825e-06, "loss": 0.8499912, "num_input_tokens_seen": 57509880, "step": 2670, "time_per_iteration": 3.676870822906494 }, { "auxiliary_loss_clip": 0.01307251, "auxiliary_loss_mlp": 0.01027586, "balance_loss_clip": 1.0570029, "balance_loss_mlp": 1.0198853, "epoch": 0.32116876089701196, "flos": 37012301896320.0, "grad_norm": 1.6975284600865148, "language_loss": 0.71061444, "learning_rate": 3.174814134651671e-06, "loss": 0.73396277, "num_input_tokens_seen": 57532430, "step": 2671, "time_per_iteration": 2.827807664871216 }, { "auxiliary_loss_clip": 0.01199361, "auxiliary_loss_mlp": 0.01032235, "balance_loss_clip": 1.05888462, "balance_loss_mlp": 1.02488589, "epoch": 0.3212890037876511, "flos": 21979233912960.0, "grad_norm": 2.055195090173691, "language_loss": 0.80585253, "learning_rate": 3.1741836303267215e-06, "loss": 0.82816851, "num_input_tokens_seen": 57551965, "step": 2672, "time_per_iteration": 3.5749707221984863 }, { "auxiliary_loss_clip": 0.01197167, "auxiliary_loss_mlp": 0.01031301, "balance_loss_clip": 1.05715752, "balance_loss_mlp": 1.02288449, "epoch": 0.32140924667829013, "flos": 10342345875840.0, "grad_norm": 2.0947846991571515, "language_loss": 0.75601339, "learning_rate": 3.1735529478790496e-06, "loss": 0.77829814, "num_input_tokens_seen": 57569955, "step": 2673, "time_per_iteration": 2.7049672603607178 }, { "auxiliary_loss_clip": 0.01249699, "auxiliary_loss_mlp": 0.01027326, "balance_loss_clip": 1.05672681, "balance_loss_mlp": 1.01767027, "epoch": 0.32152948956892924, "flos": 50798910072960.0, "grad_norm": 1.8860422552764113, "language_loss": 0.79278916, "learning_rate": 3.172922087404328e-06, "loss": 0.81555939, "num_input_tokens_seen": 57592215, "step": 2674, "time_per_iteration": 2.9103660583496094 }, { "auxiliary_loss_clip": 0.01096692, "auxiliary_loss_mlp": 0.01006167, "balance_loss_clip": 1.02585328, "balance_loss_mlp": 1.00502861, "epoch": 0.32164973245956835, "flos": 63863250549120.0, "grad_norm": 0.9004829041994183, "language_loss": 0.5522089, "learning_rate": 3.1722910489982586e-06, "loss": 0.57323754, "num_input_tokens_seen": 57652575, "step": 2675, "time_per_iteration": 3.2663371562957764 }, { "auxiliary_loss_clip": 0.01300003, "auxiliary_loss_mlp": 0.01031482, "balance_loss_clip": 1.0542351, "balance_loss_mlp": 1.02257133, "epoch": 0.3217699753502074, "flos": 23513948363520.0, "grad_norm": 1.6462135902519155, "language_loss": 0.79856074, "learning_rate": 3.1716598327565694e-06, "loss": 0.82187551, "num_input_tokens_seen": 57672215, "step": 2676, "time_per_iteration": 2.708505630493164 }, { "auxiliary_loss_clip": 0.01200327, "auxiliary_loss_mlp": 0.01030654, "balance_loss_clip": 1.05839396, "balance_loss_mlp": 1.02274477, "epoch": 0.3218902182408465, "flos": 19062533640960.0, "grad_norm": 1.6521142600906313, "language_loss": 0.84207749, "learning_rate": 3.171028438775015e-06, "loss": 0.86438733, "num_input_tokens_seen": 57691410, "step": 2677, "time_per_iteration": 2.6117968559265137 }, { "auxiliary_loss_clip": 0.01199121, "auxiliary_loss_mlp": 0.01035322, "balance_loss_clip": 1.05804706, "balance_loss_mlp": 1.02738857, "epoch": 0.3220104611314856, "flos": 20375571306240.0, "grad_norm": 2.533823683933743, "language_loss": 0.84315681, "learning_rate": 3.170396867149377e-06, "loss": 0.86550122, "num_input_tokens_seen": 57709415, "step": 2678, "time_per_iteration": 2.599835157394409 }, { "auxiliary_loss_clip": 0.01399348, "auxiliary_loss_mlp": 0.01036028, "balance_loss_clip": 1.05526435, "balance_loss_mlp": 1.02689099, "epoch": 0.3221307040221247, "flos": 20117014231680.0, "grad_norm": 1.928019297885561, "language_loss": 0.86072004, "learning_rate": 3.1697651179754653e-06, "loss": 0.88507378, "num_input_tokens_seen": 57728075, "step": 2679, "time_per_iteration": 2.762666940689087 }, { "auxiliary_loss_clip": 0.01356509, "auxiliary_loss_mlp": 0.01035434, "balance_loss_clip": 1.06065023, "balance_loss_mlp": 1.02793241, "epoch": 0.3222509469127638, "flos": 23987789602560.0, "grad_norm": 1.8598915589200222, "language_loss": 0.72788143, "learning_rate": 3.1691331913491153e-06, "loss": 0.75180089, "num_input_tokens_seen": 57750645, "step": 2680, "time_per_iteration": 2.8999311923980713 }, { "auxiliary_loss_clip": 0.01201075, "auxiliary_loss_mlp": 0.01027191, "balance_loss_clip": 1.05786967, "balance_loss_mlp": 1.01919782, "epoch": 0.32237118980340285, "flos": 17675735397120.0, "grad_norm": 11.200623426292253, "language_loss": 0.85053432, "learning_rate": 3.1685010873661898e-06, "loss": 0.87281698, "num_input_tokens_seen": 57769820, "step": 2681, "time_per_iteration": 2.606022357940674 }, { "auxiliary_loss_clip": 0.01247704, "auxiliary_loss_mlp": 0.01028926, "balance_loss_clip": 1.0533576, "balance_loss_mlp": 1.01999104, "epoch": 0.32249143269404196, "flos": 23147982645120.0, "grad_norm": 2.0127288814647577, "language_loss": 0.79814029, "learning_rate": 3.167868806122578e-06, "loss": 0.82090664, "num_input_tokens_seen": 57788870, "step": 2682, "time_per_iteration": 2.6618988513946533 }, { "auxiliary_loss_clip": 0.01301077, "auxiliary_loss_mlp": 0.0103391, "balance_loss_clip": 1.05601668, "balance_loss_mlp": 1.02511191, "epoch": 0.32261167558468107, "flos": 24422308427520.0, "grad_norm": 2.7006781681756604, "language_loss": 0.66573679, "learning_rate": 3.1672363477141968e-06, "loss": 0.68908668, "num_input_tokens_seen": 57808165, "step": 2683, "time_per_iteration": 2.839826822280884 }, { "auxiliary_loss_clip": 0.01213098, "auxiliary_loss_mlp": 0.01035442, "balance_loss_clip": 1.05555391, "balance_loss_mlp": 1.02612031, "epoch": 0.3227319184753201, "flos": 30367175852160.0, "grad_norm": 3.659116136755836, "language_loss": 0.85059512, "learning_rate": 3.1666037122369903e-06, "loss": 0.87308049, "num_input_tokens_seen": 57828825, "step": 2684, "time_per_iteration": 2.79789400100708 }, { "auxiliary_loss_clip": 0.01245079, "auxiliary_loss_mlp": 0.0103479, "balance_loss_clip": 1.05150712, "balance_loss_mlp": 1.02622509, "epoch": 0.32285216136595923, "flos": 16946174257920.0, "grad_norm": 2.7499796876889984, "language_loss": 0.87176812, "learning_rate": 3.165970899786928e-06, "loss": 0.89456677, "num_input_tokens_seen": 57846740, "step": 2685, "time_per_iteration": 2.673337697982788 }, { "auxiliary_loss_clip": 0.01361558, "auxiliary_loss_mlp": 0.01037689, "balance_loss_clip": 1.0579257, "balance_loss_mlp": 1.02918923, "epoch": 0.32297240425659834, "flos": 21981532383360.0, "grad_norm": 1.8335723795070467, "language_loss": 0.75211835, "learning_rate": 3.1653379104600067e-06, "loss": 0.77611077, "num_input_tokens_seen": 57866885, "step": 2686, "time_per_iteration": 2.743190288543701 }, { "auxiliary_loss_clip": 0.01251255, "auxiliary_loss_mlp": 0.0104586, "balance_loss_clip": 1.05506945, "balance_loss_mlp": 1.03787541, "epoch": 0.3230926471472374, "flos": 22748045639040.0, "grad_norm": 1.8390135993318508, "language_loss": 0.69537568, "learning_rate": 3.164704744352251e-06, "loss": 0.71834677, "num_input_tokens_seen": 57887690, "step": 2687, "time_per_iteration": 2.9319770336151123 }, { "auxiliary_loss_clip": 0.01246461, "auxiliary_loss_mlp": 0.01031302, "balance_loss_clip": 1.05719376, "balance_loss_mlp": 1.02328491, "epoch": 0.3232128900378765, "flos": 16942977947520.0, "grad_norm": 1.6668472030021257, "language_loss": 0.81029308, "learning_rate": 3.164071401559713e-06, "loss": 0.8330707, "num_input_tokens_seen": 57905090, "step": 2688, "time_per_iteration": 3.5935428142547607 }, { "auxiliary_loss_clip": 0.01300622, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.05454493, "balance_loss_mlp": 1.02652621, "epoch": 0.3233331329285156, "flos": 24023736138240.0, "grad_norm": 1.726403617664475, "language_loss": 0.70973527, "learning_rate": 3.1634378821784674e-06, "loss": 0.73308754, "num_input_tokens_seen": 57925305, "step": 2689, "time_per_iteration": 2.769500970840454 }, { "auxiliary_loss_clip": 0.01267223, "auxiliary_loss_mlp": 0.01035785, "balance_loss_clip": 1.06019187, "balance_loss_mlp": 1.02743423, "epoch": 0.3234533758191547, "flos": 18113845582080.0, "grad_norm": 3.741592053632237, "language_loss": 0.7437495, "learning_rate": 3.1628041863046208e-06, "loss": 0.7667796, "num_input_tokens_seen": 57942720, "step": 2690, "time_per_iteration": 3.652128219604492 }, { "auxiliary_loss_clip": 0.01205832, "auxiliary_loss_mlp": 0.01035016, "balance_loss_clip": 1.05795693, "balance_loss_mlp": 1.02627826, "epoch": 0.3235736187097938, "flos": 16946138344320.0, "grad_norm": 2.2730975824057853, "language_loss": 0.91826642, "learning_rate": 3.162170314034304e-06, "loss": 0.94067496, "num_input_tokens_seen": 57960135, "step": 2691, "time_per_iteration": 2.555877923965454 }, { "auxiliary_loss_clip": 0.01206944, "auxiliary_loss_mlp": 0.01033296, "balance_loss_clip": 1.06004953, "balance_loss_mlp": 1.02464187, "epoch": 0.3236938616004329, "flos": 22127150119680.0, "grad_norm": 1.728162886965683, "language_loss": 0.81147045, "learning_rate": 3.1615362654636738e-06, "loss": 0.8338728, "num_input_tokens_seen": 57980875, "step": 2692, "time_per_iteration": 2.6387429237365723 }, { "auxiliary_loss_clip": 0.01347572, "auxiliary_loss_mlp": 0.01028808, "balance_loss_clip": 1.05959427, "balance_loss_mlp": 1.02078557, "epoch": 0.32381410449107195, "flos": 17164618819200.0, "grad_norm": 1.804482876654117, "language_loss": 0.8752107, "learning_rate": 3.1609020406889163e-06, "loss": 0.89897454, "num_input_tokens_seen": 57998310, "step": 2693, "time_per_iteration": 2.648016929626465 }, { "auxiliary_loss_clip": 0.01302934, "auxiliary_loss_mlp": 0.01031595, "balance_loss_clip": 1.05276978, "balance_loss_mlp": 1.02321434, "epoch": 0.32393434738171106, "flos": 16578125550720.0, "grad_norm": 1.825475608215507, "language_loss": 0.85039198, "learning_rate": 3.1602676398062416e-06, "loss": 0.87373728, "num_input_tokens_seen": 58017220, "step": 2694, "time_per_iteration": 2.720710277557373 }, { "auxiliary_loss_clip": 0.01249433, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.05765831, "balance_loss_mlp": 1.02099144, "epoch": 0.3240545902723502, "flos": 25483612602240.0, "grad_norm": 3.255465086246207, "language_loss": 0.61510026, "learning_rate": 3.1596330629118886e-06, "loss": 0.6378879, "num_input_tokens_seen": 58037190, "step": 2695, "time_per_iteration": 2.662487268447876 }, { "auxiliary_loss_clip": 0.01395656, "auxiliary_loss_mlp": 0.01042929, "balance_loss_clip": 1.05285788, "balance_loss_mlp": 1.03470945, "epoch": 0.32417483316298923, "flos": 35845851634560.0, "grad_norm": 2.0221531881941455, "language_loss": 0.73507369, "learning_rate": 3.1589983101021223e-06, "loss": 0.75945956, "num_input_tokens_seen": 58055820, "step": 2696, "time_per_iteration": 3.7396678924560547 }, { "auxiliary_loss_clip": 0.0120652, "auxiliary_loss_mlp": 0.01030172, "balance_loss_clip": 1.05406713, "balance_loss_mlp": 1.02245903, "epoch": 0.32429507605362834, "flos": 30080501406720.0, "grad_norm": 2.1726652987806068, "language_loss": 0.84681112, "learning_rate": 3.1583633814732337e-06, "loss": 0.86917794, "num_input_tokens_seen": 58075340, "step": 2697, "time_per_iteration": 2.752586603164673 }, { "auxiliary_loss_clip": 0.0120156, "auxiliary_loss_mlp": 0.01035749, "balance_loss_clip": 1.05761719, "balance_loss_mlp": 1.02849209, "epoch": 0.3244153189442674, "flos": 18223265387520.0, "grad_norm": 2.4564126277819938, "language_loss": 0.72262216, "learning_rate": 3.157728277121541e-06, "loss": 0.74499524, "num_input_tokens_seen": 58093515, "step": 2698, "time_per_iteration": 3.486793279647827 }, { "auxiliary_loss_clip": 0.0119903, "auxiliary_loss_mlp": 0.01031247, "balance_loss_clip": 1.05582964, "balance_loss_mlp": 1.02266407, "epoch": 0.3245355618349065, "flos": 17710317216000.0, "grad_norm": 2.6760876769634763, "language_loss": 0.78489769, "learning_rate": 3.1570929971433897e-06, "loss": 0.80720043, "num_input_tokens_seen": 58109300, "step": 2699, "time_per_iteration": 2.624271869659424 }, { "auxiliary_loss_clip": 0.01250472, "auxiliary_loss_mlp": 0.0103615, "balance_loss_clip": 1.06001472, "balance_loss_mlp": 1.02862203, "epoch": 0.3246558047255456, "flos": 23440798316160.0, "grad_norm": 2.090111841393296, "language_loss": 0.83882022, "learning_rate": 3.1564575416351504e-06, "loss": 0.86168635, "num_input_tokens_seen": 58128000, "step": 2700, "time_per_iteration": 2.640345573425293 }, { "auxiliary_loss_clip": 0.01201705, "auxiliary_loss_mlp": 0.01033833, "balance_loss_clip": 1.05930185, "balance_loss_mlp": 1.02515411, "epoch": 0.32477604761618467, "flos": 21760861178880.0, "grad_norm": 2.528403943366054, "language_loss": 0.74045771, "learning_rate": 3.155821910693221e-06, "loss": 0.76281303, "num_input_tokens_seen": 58147415, "step": 2701, "time_per_iteration": 2.666980743408203 }, { "auxiliary_loss_clip": 0.01297906, "auxiliary_loss_mlp": 0.01027461, "balance_loss_clip": 1.05345881, "balance_loss_mlp": 1.01946783, "epoch": 0.3248962905068238, "flos": 19828328624640.0, "grad_norm": 1.753813393330224, "language_loss": 0.85898125, "learning_rate": 3.1551861044140275e-06, "loss": 0.88223493, "num_input_tokens_seen": 58167050, "step": 2702, "time_per_iteration": 2.6877663135528564 }, { "auxiliary_loss_clip": 0.01394427, "auxiliary_loss_mlp": 0.01032623, "balance_loss_clip": 1.05441284, "balance_loss_mlp": 1.0250504, "epoch": 0.3250165333974629, "flos": 23948215793280.0, "grad_norm": 1.9341699088037936, "language_loss": 0.77689505, "learning_rate": 3.15455012289402e-06, "loss": 0.80116558, "num_input_tokens_seen": 58186695, "step": 2703, "time_per_iteration": 2.7777981758117676 }, { "auxiliary_loss_clip": 0.01253299, "auxiliary_loss_mlp": 0.01023574, "balance_loss_clip": 1.06148016, "balance_loss_mlp": 1.01553345, "epoch": 0.32513677628810195, "flos": 23989333887360.0, "grad_norm": 2.1913580478920602, "language_loss": 0.84535629, "learning_rate": 3.153913966229677e-06, "loss": 0.86812502, "num_input_tokens_seen": 58205815, "step": 2704, "time_per_iteration": 2.6978988647460938 }, { "auxiliary_loss_clip": 0.01151295, "auxiliary_loss_mlp": 0.01002636, "balance_loss_clip": 1.02740276, "balance_loss_mlp": 1.00143778, "epoch": 0.32525701917874106, "flos": 70655790009600.0, "grad_norm": 0.6492005053943973, "language_loss": 0.50248885, "learning_rate": 3.1532776345175027e-06, "loss": 0.52402818, "num_input_tokens_seen": 58270960, "step": 2705, "time_per_iteration": 3.224557399749756 }, { "auxiliary_loss_clip": 0.01199718, "auxiliary_loss_mlp": 0.01028102, "balance_loss_clip": 1.05813551, "balance_loss_mlp": 1.01998663, "epoch": 0.32537726206938017, "flos": 19682639061120.0, "grad_norm": 2.036504734461978, "language_loss": 0.78612828, "learning_rate": 3.1526411278540285e-06, "loss": 0.80840647, "num_input_tokens_seen": 58289390, "step": 2706, "time_per_iteration": 2.6768252849578857 }, { "auxiliary_loss_clip": 0.01303388, "auxiliary_loss_mlp": 0.01025141, "balance_loss_clip": 1.05163574, "balance_loss_mlp": 1.01721406, "epoch": 0.3254975049600192, "flos": 28760999293440.0, "grad_norm": 2.64655289777107, "language_loss": 0.81937814, "learning_rate": 3.1520044463358116e-06, "loss": 0.84266341, "num_input_tokens_seen": 58306120, "step": 2707, "time_per_iteration": 2.8092055320739746 }, { "auxiliary_loss_clip": 0.01248236, "auxiliary_loss_mlp": 0.01027983, "balance_loss_clip": 1.05809903, "balance_loss_mlp": 1.02028537, "epoch": 0.32561774785065833, "flos": 18877378008960.0, "grad_norm": 1.7526615119352826, "language_loss": 0.80299735, "learning_rate": 3.151367590059436e-06, "loss": 0.82575953, "num_input_tokens_seen": 58324545, "step": 2708, "time_per_iteration": 2.5971014499664307 }, { "auxiliary_loss_clip": 0.01201646, "auxiliary_loss_mlp": 0.02570519, "balance_loss_clip": 1.05873036, "balance_loss_mlp": 1.00020051, "epoch": 0.32573799074129745, "flos": 23112107936640.0, "grad_norm": 2.120228681362872, "language_loss": 0.86753309, "learning_rate": 3.1507305591215117e-06, "loss": 0.90525472, "num_input_tokens_seen": 58342455, "step": 2709, "time_per_iteration": 2.6127495765686035 }, { "auxiliary_loss_clip": 0.01150317, "auxiliary_loss_mlp": 0.01002556, "balance_loss_clip": 1.02636123, "balance_loss_mlp": 1.00135231, "epoch": 0.3258582336319365, "flos": 71237650423680.0, "grad_norm": 0.6693941908245172, "language_loss": 0.55737078, "learning_rate": 3.150093353618677e-06, "loss": 0.5788995, "num_input_tokens_seen": 58407185, "step": 2710, "time_per_iteration": 3.2743947505950928 }, { "auxiliary_loss_clip": 0.01258712, "auxiliary_loss_mlp": 0.01033001, "balance_loss_clip": 1.05826187, "balance_loss_mlp": 1.02491879, "epoch": 0.3259784765225756, "flos": 22456020067200.0, "grad_norm": 2.3486047909484538, "language_loss": 0.88244325, "learning_rate": 3.149455973647596e-06, "loss": 0.90536034, "num_input_tokens_seen": 58425245, "step": 2711, "time_per_iteration": 2.6772823333740234 }, { "auxiliary_loss_clip": 0.01343333, "auxiliary_loss_mlp": 0.01035501, "balance_loss_clip": 1.04941642, "balance_loss_mlp": 1.02713227, "epoch": 0.32609871941321467, "flos": 20484811543680.0, "grad_norm": 2.2887707533302994, "language_loss": 0.77255648, "learning_rate": 3.1488184193049563e-06, "loss": 0.79634482, "num_input_tokens_seen": 58444780, "step": 2712, "time_per_iteration": 2.7534680366516113 }, { "auxiliary_loss_clip": 0.0120118, "auxiliary_loss_mlp": 0.01030221, "balance_loss_clip": 1.06007075, "balance_loss_mlp": 1.02250218, "epoch": 0.3262189623038538, "flos": 22416805393920.0, "grad_norm": 1.8789473669912855, "language_loss": 0.7227661, "learning_rate": 3.1481806906874767e-06, "loss": 0.74508011, "num_input_tokens_seen": 58466090, "step": 2713, "time_per_iteration": 2.6472201347351074 }, { "auxiliary_loss_clip": 0.0120073, "auxiliary_loss_mlp": 0.01035955, "balance_loss_clip": 1.05796003, "balance_loss_mlp": 1.02846289, "epoch": 0.3263392051944929, "flos": 20923496346240.0, "grad_norm": 1.5646970557830702, "language_loss": 0.87960982, "learning_rate": 3.147542787891899e-06, "loss": 0.9019767, "num_input_tokens_seen": 58485435, "step": 2714, "time_per_iteration": 3.5429935455322266 }, { "auxiliary_loss_clip": 0.01216298, "auxiliary_loss_mlp": 0.01030018, "balance_loss_clip": 1.06032062, "balance_loss_mlp": 1.02203119, "epoch": 0.32645944808513194, "flos": 24025172682240.0, "grad_norm": 2.2223682904386557, "language_loss": 0.75253254, "learning_rate": 3.1469047110149926e-06, "loss": 0.77499568, "num_input_tokens_seen": 58504175, "step": 2715, "time_per_iteration": 2.7069613933563232 }, { "auxiliary_loss_clip": 0.01388833, "auxiliary_loss_mlp": 0.01033122, "balance_loss_clip": 1.05148554, "balance_loss_mlp": 1.02455711, "epoch": 0.32657969097577105, "flos": 21032413361280.0, "grad_norm": 1.9303086890530632, "language_loss": 0.84997845, "learning_rate": 3.146266460153554e-06, "loss": 0.87419796, "num_input_tokens_seen": 58523885, "step": 2716, "time_per_iteration": 3.6838464736938477 }, { "auxiliary_loss_clip": 0.01296485, "auxiliary_loss_mlp": 0.02572833, "balance_loss_clip": 1.05482388, "balance_loss_mlp": 1.00006914, "epoch": 0.32669993386641016, "flos": 22710267509760.0, "grad_norm": 1.72140840737589, "language_loss": 0.80046725, "learning_rate": 3.145628035404404e-06, "loss": 0.83916044, "num_input_tokens_seen": 58543085, "step": 2717, "time_per_iteration": 2.6835856437683105 }, { "auxiliary_loss_clip": 0.01145539, "auxiliary_loss_mlp": 0.00999502, "balance_loss_clip": 1.02383518, "balance_loss_mlp": 0.99838156, "epoch": 0.3268201767570492, "flos": 72105718406400.0, "grad_norm": 0.8839941635247627, "language_loss": 0.57369941, "learning_rate": 3.1449894368643922e-06, "loss": 0.59514987, "num_input_tokens_seen": 58605400, "step": 2718, "time_per_iteration": 3.3026413917541504 }, { "auxiliary_loss_clip": 0.01349342, "auxiliary_loss_mlp": 0.0103125, "balance_loss_clip": 1.05751276, "balance_loss_mlp": 1.02365041, "epoch": 0.32694041964768833, "flos": 24535175938560.0, "grad_norm": 1.47649415651115, "language_loss": 0.7143749, "learning_rate": 3.1443506646303934e-06, "loss": 0.73818076, "num_input_tokens_seen": 58626700, "step": 2719, "time_per_iteration": 2.751838445663452 }, { "auxiliary_loss_clip": 0.01252996, "auxiliary_loss_mlp": 0.01029673, "balance_loss_clip": 1.05713952, "balance_loss_mlp": 1.02142406, "epoch": 0.32706066253832744, "flos": 33183003755520.0, "grad_norm": 1.8550433059288485, "language_loss": 0.66699737, "learning_rate": 3.1437117187993086e-06, "loss": 0.68982404, "num_input_tokens_seen": 58649020, "step": 2720, "time_per_iteration": 2.7447216510772705 }, { "auxiliary_loss_clip": 0.01344656, "auxiliary_loss_mlp": 0.01034246, "balance_loss_clip": 1.05015028, "balance_loss_mlp": 1.02615213, "epoch": 0.3271809054289665, "flos": 24061622008320.0, "grad_norm": 1.839564392190203, "language_loss": 0.80000794, "learning_rate": 3.143072599468065e-06, "loss": 0.82379699, "num_input_tokens_seen": 58668845, "step": 2721, "time_per_iteration": 2.731677293777466 }, { "auxiliary_loss_clip": 0.01300137, "auxiliary_loss_mlp": 0.01032463, "balance_loss_clip": 1.05931759, "balance_loss_mlp": 1.02476811, "epoch": 0.3273011483196056, "flos": 38253769712640.0, "grad_norm": 1.5821147848385706, "language_loss": 0.75952768, "learning_rate": 3.1424333067336174e-06, "loss": 0.78285366, "num_input_tokens_seen": 58691610, "step": 2722, "time_per_iteration": 3.6843934059143066 }, { "auxiliary_loss_clip": 0.01253598, "auxiliary_loss_mlp": 0.01032249, "balance_loss_clip": 1.05627835, "balance_loss_mlp": 1.02404761, "epoch": 0.3274213912102447, "flos": 29054389582080.0, "grad_norm": 1.8030696304466916, "language_loss": 0.78501964, "learning_rate": 3.141793840692945e-06, "loss": 0.80787814, "num_input_tokens_seen": 58712360, "step": 2723, "time_per_iteration": 2.728302478790283 }, { "auxiliary_loss_clip": 0.01294341, "auxiliary_loss_mlp": 0.01030307, "balance_loss_clip": 1.05418205, "balance_loss_mlp": 1.02168798, "epoch": 0.32754163410088377, "flos": 29133249891840.0, "grad_norm": 2.318011180572133, "language_loss": 0.62029028, "learning_rate": 3.1411542014430553e-06, "loss": 0.64353669, "num_input_tokens_seen": 58733440, "step": 2724, "time_per_iteration": 3.6193501949310303 }, { "auxiliary_loss_clip": 0.01348491, "auxiliary_loss_mlp": 0.01032972, "balance_loss_clip": 1.05036318, "balance_loss_mlp": 1.02588248, "epoch": 0.3276618769915229, "flos": 20631075724800.0, "grad_norm": 2.016418904797021, "language_loss": 0.8142038, "learning_rate": 3.1405143890809804e-06, "loss": 0.83801848, "num_input_tokens_seen": 58752735, "step": 2725, "time_per_iteration": 2.737915515899658 }, { "auxiliary_loss_clip": 0.0129545, "auxiliary_loss_mlp": 0.01027927, "balance_loss_clip": 1.05452204, "balance_loss_mlp": 1.02057219, "epoch": 0.327782119882162, "flos": 18657425076480.0, "grad_norm": 1.9383053045172085, "language_loss": 0.70025009, "learning_rate": 3.1398744037037796e-06, "loss": 0.72348392, "num_input_tokens_seen": 58772070, "step": 2726, "time_per_iteration": 2.769911050796509 }, { "auxiliary_loss_clip": 0.0129645, "auxiliary_loss_mlp": 0.01031796, "balance_loss_clip": 1.05591059, "balance_loss_mlp": 1.02369583, "epoch": 0.32790236277280105, "flos": 21795802133760.0, "grad_norm": 2.1700932790289866, "language_loss": 0.84379101, "learning_rate": 3.139234245408538e-06, "loss": 0.86707342, "num_input_tokens_seen": 58790950, "step": 2727, "time_per_iteration": 2.6687350273132324 }, { "auxiliary_loss_clip": 0.0134569, "auxiliary_loss_mlp": 0.0257051, "balance_loss_clip": 1.05394328, "balance_loss_mlp": 1.00012708, "epoch": 0.32802260566344016, "flos": 23331414424320.0, "grad_norm": 1.9607053788323388, "language_loss": 0.76161921, "learning_rate": 3.1385939142923666e-06, "loss": 0.80078125, "num_input_tokens_seen": 58813340, "step": 2728, "time_per_iteration": 2.8452231884002686 }, { "auxiliary_loss_clip": 0.01298003, "auxiliary_loss_mlp": 0.01035327, "balance_loss_clip": 1.05303669, "balance_loss_mlp": 1.02651691, "epoch": 0.3281428485540792, "flos": 24206988349440.0, "grad_norm": 2.464837611701616, "language_loss": 0.78399169, "learning_rate": 3.137953410452405e-06, "loss": 0.80732495, "num_input_tokens_seen": 58833610, "step": 2729, "time_per_iteration": 2.687645673751831 }, { "auxiliary_loss_clip": 0.012965, "auxiliary_loss_mlp": 0.0103235, "balance_loss_clip": 1.05385065, "balance_loss_mlp": 1.02448785, "epoch": 0.3282630914447183, "flos": 34128962380800.0, "grad_norm": 1.8075910602192178, "language_loss": 0.74741364, "learning_rate": 3.1373127339858146e-06, "loss": 0.77070206, "num_input_tokens_seen": 58856210, "step": 2730, "time_per_iteration": 2.7853593826293945 }, { "auxiliary_loss_clip": 0.01345884, "auxiliary_loss_mlp": 0.01034032, "balance_loss_clip": 1.05258679, "balance_loss_mlp": 1.02596176, "epoch": 0.32838333433535744, "flos": 27600726170880.0, "grad_norm": 1.9594304689114797, "language_loss": 0.7499603, "learning_rate": 3.136671884989787e-06, "loss": 0.77375948, "num_input_tokens_seen": 58876120, "step": 2731, "time_per_iteration": 2.769310235977173 }, { "auxiliary_loss_clip": 0.01351986, "auxiliary_loss_mlp": 0.0103396, "balance_loss_clip": 1.04738784, "balance_loss_mlp": 1.02468538, "epoch": 0.3285035772259965, "flos": 12349500935040.0, "grad_norm": 2.2160594775492095, "language_loss": 0.86412191, "learning_rate": 3.1360308635615383e-06, "loss": 0.88798136, "num_input_tokens_seen": 58894660, "step": 2732, "time_per_iteration": 2.769449234008789 }, { "auxiliary_loss_clip": 0.013077, "auxiliary_loss_mlp": 0.01035985, "balance_loss_clip": 1.0573976, "balance_loss_mlp": 1.02685928, "epoch": 0.3286238201166356, "flos": 24316084932480.0, "grad_norm": 2.0447630394741596, "language_loss": 0.7873773, "learning_rate": 3.135389669798311e-06, "loss": 0.81081414, "num_input_tokens_seen": 58912720, "step": 2733, "time_per_iteration": 2.738898515701294 }, { "auxiliary_loss_clip": 0.01250879, "auxiliary_loss_mlp": 0.02570728, "balance_loss_clip": 1.05641246, "balance_loss_mlp": 1.00008988, "epoch": 0.3287440630072747, "flos": 21392812471680.0, "grad_norm": 1.8727325527428638, "language_loss": 0.79877555, "learning_rate": 3.134748303797373e-06, "loss": 0.83699167, "num_input_tokens_seen": 58930090, "step": 2734, "time_per_iteration": 2.659327507019043 }, { "auxiliary_loss_clip": 0.01307379, "auxiliary_loss_mlp": 0.01034852, "balance_loss_clip": 1.05179, "balance_loss_mlp": 1.0255661, "epoch": 0.32886430589791377, "flos": 23732536579200.0, "grad_norm": 2.1531305285056654, "language_loss": 0.80702233, "learning_rate": 3.1341067656560203e-06, "loss": 0.83044469, "num_input_tokens_seen": 58947935, "step": 2735, "time_per_iteration": 2.7330806255340576 }, { "auxiliary_loss_clip": 0.01310011, "auxiliary_loss_mlp": 0.01035025, "balance_loss_clip": 1.05593848, "balance_loss_mlp": 1.02612603, "epoch": 0.3289845487885529, "flos": 22418708814720.0, "grad_norm": 1.8315908627645825, "language_loss": 0.86562455, "learning_rate": 3.133465055471572e-06, "loss": 0.88907492, "num_input_tokens_seen": 58967720, "step": 2736, "time_per_iteration": 2.7080581188201904 }, { "auxiliary_loss_clip": 0.01342602, "auxiliary_loss_mlp": 0.01037163, "balance_loss_clip": 1.05312371, "balance_loss_mlp": 1.02912831, "epoch": 0.329104791679192, "flos": 19682603147520.0, "grad_norm": 2.3480426532730667, "language_loss": 0.6631425, "learning_rate": 3.1328231733413767e-06, "loss": 0.68694013, "num_input_tokens_seen": 58984360, "step": 2737, "time_per_iteration": 2.690516948699951 }, { "auxiliary_loss_clip": 0.01251128, "auxiliary_loss_mlp": 0.01032275, "balance_loss_clip": 1.05760932, "balance_loss_mlp": 1.02418637, "epoch": 0.32922503456983104, "flos": 15997234803840.0, "grad_norm": 2.1347583773438994, "language_loss": 0.90849668, "learning_rate": 3.1321811193628067e-06, "loss": 0.93133068, "num_input_tokens_seen": 59002505, "step": 2738, "time_per_iteration": 2.59450626373291 }, { "auxiliary_loss_clip": 0.01251459, "auxiliary_loss_mlp": 0.02575666, "balance_loss_clip": 1.05952215, "balance_loss_mlp": 1.00012183, "epoch": 0.32934527746047015, "flos": 26834069260800.0, "grad_norm": 2.6309933939341663, "language_loss": 0.70081508, "learning_rate": 3.131538893633261e-06, "loss": 0.73908633, "num_input_tokens_seen": 59022065, "step": 2739, "time_per_iteration": 2.695283889770508 }, { "auxiliary_loss_clip": 0.01201483, "auxiliary_loss_mlp": 0.01037338, "balance_loss_clip": 1.0597024, "balance_loss_mlp": 1.02913618, "epoch": 0.32946552035110926, "flos": 23403774372480.0, "grad_norm": 2.036177148804156, "language_loss": 0.77839041, "learning_rate": 3.130896496250165e-06, "loss": 0.80077857, "num_input_tokens_seen": 59041890, "step": 2740, "time_per_iteration": 3.492018461227417 }, { "auxiliary_loss_clip": 0.01200263, "auxiliary_loss_mlp": 0.01031387, "balance_loss_clip": 1.05745006, "balance_loss_mlp": 1.02289343, "epoch": 0.3295857632417483, "flos": 14172470029440.0, "grad_norm": 2.046314384363834, "language_loss": 0.86986434, "learning_rate": 3.1302539273109693e-06, "loss": 0.89218092, "num_input_tokens_seen": 59058715, "step": 2741, "time_per_iteration": 3.417073965072632 }, { "auxiliary_loss_clip": 0.0129533, "auxiliary_loss_mlp": 0.01038514, "balance_loss_clip": 1.05558896, "balance_loss_mlp": 1.02956176, "epoch": 0.32970600613238743, "flos": 22196708807040.0, "grad_norm": 1.7229827257563408, "language_loss": 0.80462146, "learning_rate": 3.1296111869131513e-06, "loss": 0.8279599, "num_input_tokens_seen": 59076140, "step": 2742, "time_per_iteration": 2.6722474098205566 }, { "auxiliary_loss_clip": 0.01200283, "auxiliary_loss_mlp": 0.01031737, "balance_loss_clip": 1.05807734, "balance_loss_mlp": 1.02386308, "epoch": 0.32982624902302654, "flos": 22053784590720.0, "grad_norm": 1.899533047807885, "language_loss": 0.85800767, "learning_rate": 3.1289682751542153e-06, "loss": 0.88032788, "num_input_tokens_seen": 59095700, "step": 2743, "time_per_iteration": 2.6564061641693115 }, { "auxiliary_loss_clip": 0.01250834, "auxiliary_loss_mlp": 0.01030285, "balance_loss_clip": 1.05633307, "balance_loss_mlp": 1.02208388, "epoch": 0.3299464919136656, "flos": 18661626967680.0, "grad_norm": 2.125558754619266, "language_loss": 0.71352637, "learning_rate": 3.1283251921316883e-06, "loss": 0.73633754, "num_input_tokens_seen": 59113445, "step": 2744, "time_per_iteration": 2.683009147644043 }, { "auxiliary_loss_clip": 0.01396257, "auxiliary_loss_mlp": 0.01034135, "balance_loss_clip": 1.05543947, "balance_loss_mlp": 1.02586174, "epoch": 0.3300667348043047, "flos": 13407357404160.0, "grad_norm": 2.063073767723093, "language_loss": 0.80892402, "learning_rate": 3.1276819379431277e-06, "loss": 0.83322787, "num_input_tokens_seen": 59131535, "step": 2745, "time_per_iteration": 2.818302631378174 }, { "auxiliary_loss_clip": 0.01217891, "auxiliary_loss_mlp": 0.0257545, "balance_loss_clip": 1.05799115, "balance_loss_mlp": 1.00004363, "epoch": 0.33018697769494376, "flos": 15742556398080.0, "grad_norm": 2.071200183266099, "language_loss": 0.74970502, "learning_rate": 3.1270385126861134e-06, "loss": 0.78763843, "num_input_tokens_seen": 59149520, "step": 2746, "time_per_iteration": 2.669538974761963 }, { "auxiliary_loss_clip": 0.01203758, "auxiliary_loss_mlp": 0.01025743, "balance_loss_clip": 1.05990231, "balance_loss_mlp": 1.01673043, "epoch": 0.3303072205855829, "flos": 18258601392000.0, "grad_norm": 2.0256429720276397, "language_loss": 0.82112813, "learning_rate": 3.1263949164582533e-06, "loss": 0.84342313, "num_input_tokens_seen": 59169170, "step": 2747, "time_per_iteration": 2.609295129776001 }, { "auxiliary_loss_clip": 0.01201697, "auxiliary_loss_mlp": 0.01032528, "balance_loss_clip": 1.05671811, "balance_loss_mlp": 1.02392077, "epoch": 0.330427463476222, "flos": 17749424148480.0, "grad_norm": 2.2024766073859494, "language_loss": 0.78701353, "learning_rate": 3.1257511493571797e-06, "loss": 0.80935585, "num_input_tokens_seen": 59187675, "step": 2748, "time_per_iteration": 3.483334541320801 }, { "auxiliary_loss_clip": 0.01346632, "auxiliary_loss_mlp": 0.01028716, "balance_loss_clip": 1.05289924, "balance_loss_mlp": 1.02080631, "epoch": 0.33054770636686104, "flos": 27162580072320.0, "grad_norm": 2.7192951768915212, "language_loss": 0.78951049, "learning_rate": 3.125107211480552e-06, "loss": 0.81326395, "num_input_tokens_seen": 59207610, "step": 2749, "time_per_iteration": 2.739971160888672 }, { "auxiliary_loss_clip": 0.01355638, "auxiliary_loss_mlp": 0.01029245, "balance_loss_clip": 1.050264, "balance_loss_mlp": 1.02110314, "epoch": 0.33066794925750015, "flos": 20117193799680.0, "grad_norm": 1.6422971777030027, "language_loss": 0.8042841, "learning_rate": 3.124463102926054e-06, "loss": 0.82813293, "num_input_tokens_seen": 59226945, "step": 2750, "time_per_iteration": 3.708165168762207 }, { "auxiliary_loss_clip": 0.01137722, "auxiliary_loss_mlp": 0.01007603, "balance_loss_clip": 1.01864874, "balance_loss_mlp": 1.00641656, "epoch": 0.33078819214813926, "flos": 70642609718400.0, "grad_norm": 0.847733545070313, "language_loss": 0.61625183, "learning_rate": 3.1238188237913984e-06, "loss": 0.63770509, "num_input_tokens_seen": 59291485, "step": 2751, "time_per_iteration": 3.347686290740967 }, { "auxiliary_loss_clip": 0.01206867, "auxiliary_loss_mlp": 0.01032653, "balance_loss_clip": 1.06046271, "balance_loss_mlp": 1.02372432, "epoch": 0.3309084350387783, "flos": 21141940907520.0, "grad_norm": 2.5170871832964057, "language_loss": 0.76035041, "learning_rate": 3.1231743741743202e-06, "loss": 0.7827456, "num_input_tokens_seen": 59310990, "step": 2752, "time_per_iteration": 2.6039533615112305 }, { "auxiliary_loss_clip": 0.01244747, "auxiliary_loss_mlp": 0.01028585, "balance_loss_clip": 1.05110145, "balance_loss_mlp": 1.02018058, "epoch": 0.3310286779294174, "flos": 14209350318720.0, "grad_norm": 2.1894620456584453, "language_loss": 0.83552361, "learning_rate": 3.122529754172582e-06, "loss": 0.85825694, "num_input_tokens_seen": 59327875, "step": 2753, "time_per_iteration": 2.645589590072632 }, { "auxiliary_loss_clip": 0.01247636, "auxiliary_loss_mlp": 0.01036174, "balance_loss_clip": 1.05691874, "balance_loss_mlp": 1.02783513, "epoch": 0.33114892082005654, "flos": 20778130005120.0, "grad_norm": 8.858543105136327, "language_loss": 0.72013545, "learning_rate": 3.1218849638839736e-06, "loss": 0.74297357, "num_input_tokens_seen": 59347135, "step": 2754, "time_per_iteration": 2.656952381134033 }, { "auxiliary_loss_clip": 0.01339291, "auxiliary_loss_mlp": 0.0102793, "balance_loss_clip": 1.04602444, "balance_loss_mlp": 1.01866174, "epoch": 0.3312691637106956, "flos": 17090750499840.0, "grad_norm": 4.682141996399791, "language_loss": 0.7841866, "learning_rate": 3.121240003406307e-06, "loss": 0.80785882, "num_input_tokens_seen": 59365985, "step": 2755, "time_per_iteration": 2.746295928955078 }, { "auxiliary_loss_clip": 0.01354288, "auxiliary_loss_mlp": 0.01032618, "balance_loss_clip": 1.0560329, "balance_loss_mlp": 1.02363014, "epoch": 0.3313894066013347, "flos": 29456230008960.0, "grad_norm": 2.169601903256514, "language_loss": 0.72642744, "learning_rate": 3.120594872837425e-06, "loss": 0.75029647, "num_input_tokens_seen": 59384655, "step": 2756, "time_per_iteration": 2.78584361076355 }, { "auxiliary_loss_clip": 0.01140993, "auxiliary_loss_mlp": 0.0251621, "balance_loss_clip": 1.02070618, "balance_loss_mlp": 0.99983913, "epoch": 0.3315096494919738, "flos": 61419242280960.0, "grad_norm": 0.8286886989355703, "language_loss": 0.62330782, "learning_rate": 3.1199495722751906e-06, "loss": 0.6598798, "num_input_tokens_seen": 59444185, "step": 2757, "time_per_iteration": 3.2382733821868896 }, { "auxiliary_loss_clip": 0.01397284, "auxiliary_loss_mlp": 0.01032363, "balance_loss_clip": 1.0496093, "balance_loss_mlp": 1.02335644, "epoch": 0.33162989238261287, "flos": 21653057485440.0, "grad_norm": 2.2873842194143257, "language_loss": 0.84079999, "learning_rate": 3.1193041018174972e-06, "loss": 0.86509645, "num_input_tokens_seen": 59464900, "step": 2758, "time_per_iteration": 2.8033053874969482 }, { "auxiliary_loss_clip": 0.01253868, "auxiliary_loss_mlp": 0.0103681, "balance_loss_clip": 1.05713415, "balance_loss_mlp": 1.02797675, "epoch": 0.331750135273252, "flos": 22674787850880.0, "grad_norm": 2.0783511928603713, "language_loss": 0.94733453, "learning_rate": 3.118658461562261e-06, "loss": 0.97024131, "num_input_tokens_seen": 59481000, "step": 2759, "time_per_iteration": 2.6488711833953857 }, { "auxiliary_loss_clip": 0.01302518, "auxiliary_loss_mlp": 0.01033567, "balance_loss_clip": 1.05972767, "balance_loss_mlp": 1.02461481, "epoch": 0.33187037816389103, "flos": 22746896403840.0, "grad_norm": 1.6187275731717328, "language_loss": 0.85175478, "learning_rate": 3.118012651607426e-06, "loss": 0.87511563, "num_input_tokens_seen": 59502605, "step": 2760, "time_per_iteration": 2.731895923614502 }, { "auxiliary_loss_clip": 0.01201385, "auxiliary_loss_mlp": 0.01036488, "balance_loss_clip": 1.05918026, "balance_loss_mlp": 1.02788711, "epoch": 0.33199062105453014, "flos": 19203769918080.0, "grad_norm": 1.943628110729952, "language_loss": 0.83164418, "learning_rate": 3.1173666720509603e-06, "loss": 0.85402286, "num_input_tokens_seen": 59519540, "step": 2761, "time_per_iteration": 2.594526529312134 }, { "auxiliary_loss_clip": 0.01302974, "auxiliary_loss_mlp": 0.01032939, "balance_loss_clip": 1.05466187, "balance_loss_mlp": 1.02474892, "epoch": 0.33211086394516925, "flos": 31577006764800.0, "grad_norm": 1.7998656683018657, "language_loss": 0.6842587, "learning_rate": 3.116720522990859e-06, "loss": 0.70761776, "num_input_tokens_seen": 59540415, "step": 2762, "time_per_iteration": 2.7679738998413086 }, { "auxiliary_loss_clip": 0.01448017, "auxiliary_loss_mlp": 0.01032694, "balance_loss_clip": 1.05100858, "balance_loss_mlp": 1.02448022, "epoch": 0.3322311068358083, "flos": 17932496791680.0, "grad_norm": 2.1012007059063604, "language_loss": 0.62210685, "learning_rate": 3.116074204525142e-06, "loss": 0.64691401, "num_input_tokens_seen": 59558590, "step": 2763, "time_per_iteration": 2.7465224266052246 }, { "auxiliary_loss_clip": 0.01245958, "auxiliary_loss_mlp": 0.01034675, "balance_loss_clip": 1.0556283, "balance_loss_mlp": 1.02615178, "epoch": 0.3323513497264474, "flos": 32269831269120.0, "grad_norm": 1.8524393975877405, "language_loss": 0.83665496, "learning_rate": 3.1154277167518553e-06, "loss": 0.85946125, "num_input_tokens_seen": 59580205, "step": 2764, "time_per_iteration": 2.7482433319091797 }, { "auxiliary_loss_clip": 0.01185874, "auxiliary_loss_mlp": 0.01004212, "balance_loss_clip": 1.01715136, "balance_loss_mlp": 1.00279951, "epoch": 0.33247159261708653, "flos": 52668674588160.0, "grad_norm": 0.7764877702506137, "language_loss": 0.59524548, "learning_rate": 3.114781059769072e-06, "loss": 0.61714637, "num_input_tokens_seen": 59631530, "step": 2765, "time_per_iteration": 3.1152126789093018 }, { "auxiliary_loss_clip": 0.0129781, "auxiliary_loss_mlp": 0.01028205, "balance_loss_clip": 1.05627608, "balance_loss_mlp": 1.01908565, "epoch": 0.3325918355077256, "flos": 27125232906240.0, "grad_norm": 5.271492523949398, "language_loss": 0.67877269, "learning_rate": 3.1141342336748874e-06, "loss": 0.7020328, "num_input_tokens_seen": 59651090, "step": 2766, "time_per_iteration": 3.6468052864074707 }, { "auxiliary_loss_clip": 0.01241552, "auxiliary_loss_mlp": 0.01034364, "balance_loss_clip": 1.05538034, "balance_loss_mlp": 1.02646613, "epoch": 0.3327120783983647, "flos": 23664414435840.0, "grad_norm": 1.5464679333151938, "language_loss": 0.82202059, "learning_rate": 3.1134872385674253e-06, "loss": 0.84477973, "num_input_tokens_seen": 59675245, "step": 2767, "time_per_iteration": 3.6118650436401367 }, { "auxiliary_loss_clip": 0.01300825, "auxiliary_loss_mlp": 0.01034747, "balance_loss_clip": 1.05155706, "balance_loss_mlp": 1.02605689, "epoch": 0.3328323212890038, "flos": 19171378828800.0, "grad_norm": 1.7434703282554898, "language_loss": 0.86048424, "learning_rate": 3.1128400745448353e-06, "loss": 0.88383996, "num_input_tokens_seen": 59694625, "step": 2768, "time_per_iteration": 2.7182064056396484 }, { "auxiliary_loss_clip": 0.0125155, "auxiliary_loss_mlp": 0.01034054, "balance_loss_clip": 1.05755782, "balance_loss_mlp": 1.02488041, "epoch": 0.33295256417964286, "flos": 37706347463040.0, "grad_norm": 2.019712212760729, "language_loss": 0.62909806, "learning_rate": 3.11219274170529e-06, "loss": 0.65195405, "num_input_tokens_seen": 59716435, "step": 2769, "time_per_iteration": 2.7237935066223145 }, { "auxiliary_loss_clip": 0.01291717, "auxiliary_loss_mlp": 0.01032802, "balance_loss_clip": 1.05194306, "balance_loss_mlp": 1.02487421, "epoch": 0.333072807070282, "flos": 26505989412480.0, "grad_norm": 1.7154468268285585, "language_loss": 0.81531727, "learning_rate": 3.1115452401469903e-06, "loss": 0.83856243, "num_input_tokens_seen": 59736835, "step": 2770, "time_per_iteration": 2.7129788398742676 }, { "auxiliary_loss_clip": 0.01393764, "auxiliary_loss_mlp": 0.01034169, "balance_loss_clip": 1.04916573, "balance_loss_mlp": 1.02608633, "epoch": 0.3331930499609211, "flos": 21430913823360.0, "grad_norm": 1.978102616197759, "language_loss": 0.86409819, "learning_rate": 3.1108975699681613e-06, "loss": 0.88837743, "num_input_tokens_seen": 59754230, "step": 2771, "time_per_iteration": 2.744023561477661 }, { "auxiliary_loss_clip": 0.01347182, "auxiliary_loss_mlp": 0.01034362, "balance_loss_clip": 1.05319035, "balance_loss_mlp": 1.02649474, "epoch": 0.33331329285156014, "flos": 20659947281280.0, "grad_norm": 1.7370548161557668, "language_loss": 0.71527684, "learning_rate": 3.1102497312670542e-06, "loss": 0.73909223, "num_input_tokens_seen": 59772235, "step": 2772, "time_per_iteration": 2.740532159805298 }, { "auxiliary_loss_clip": 0.01286842, "auxiliary_loss_mlp": 0.01029762, "balance_loss_clip": 1.04918337, "balance_loss_mlp": 1.02181709, "epoch": 0.33343353574219925, "flos": 28001596930560.0, "grad_norm": 2.015949721084627, "language_loss": 0.80705106, "learning_rate": 3.109601724141946e-06, "loss": 0.83021712, "num_input_tokens_seen": 59791230, "step": 2773, "time_per_iteration": 2.7140095233917236 }, { "auxiliary_loss_clip": 0.01204902, "auxiliary_loss_mlp": 0.0103746, "balance_loss_clip": 1.05242705, "balance_loss_mlp": 1.02877581, "epoch": 0.33355377863283836, "flos": 23764963582080.0, "grad_norm": 2.6874592582794268, "language_loss": 0.68311036, "learning_rate": 3.108953548691138e-06, "loss": 0.70553398, "num_input_tokens_seen": 59811315, "step": 2774, "time_per_iteration": 3.5832653045654297 }, { "auxiliary_loss_clip": 0.01200962, "auxiliary_loss_mlp": 0.01030833, "balance_loss_clip": 1.05795896, "balance_loss_mlp": 1.02257228, "epoch": 0.3336740215234774, "flos": 37779677078400.0, "grad_norm": 2.663432802853727, "language_loss": 0.72415268, "learning_rate": 3.108305205012959e-06, "loss": 0.74647069, "num_input_tokens_seen": 59832010, "step": 2775, "time_per_iteration": 3.607879877090454 }, { "auxiliary_loss_clip": 0.01298734, "auxiliary_loss_mlp": 0.01029506, "balance_loss_clip": 1.05512369, "balance_loss_mlp": 1.02222824, "epoch": 0.3337942644141165, "flos": 25519056347520.0, "grad_norm": 2.0458596462054026, "language_loss": 0.87452739, "learning_rate": 3.107656693205761e-06, "loss": 0.8978098, "num_input_tokens_seen": 59851450, "step": 2776, "time_per_iteration": 2.6674318313598633 }, { "auxiliary_loss_clip": 0.01206726, "auxiliary_loss_mlp": 0.010357, "balance_loss_clip": 1.05939186, "balance_loss_mlp": 1.02637792, "epoch": 0.3339145073047556, "flos": 25989844930560.0, "grad_norm": 2.4603400159097664, "language_loss": 0.70731485, "learning_rate": 3.107008013367924e-06, "loss": 0.72973907, "num_input_tokens_seen": 59870245, "step": 2777, "time_per_iteration": 2.624476671218872 }, { "auxiliary_loss_clip": 0.0134245, "auxiliary_loss_mlp": 0.01027669, "balance_loss_clip": 1.0503819, "balance_loss_mlp": 1.01944315, "epoch": 0.3340347501953947, "flos": 19062569554560.0, "grad_norm": 2.2425945038124597, "language_loss": 0.86661565, "learning_rate": 3.1063591655978507e-06, "loss": 0.89031684, "num_input_tokens_seen": 59886195, "step": 2778, "time_per_iteration": 2.7107934951782227 }, { "auxiliary_loss_clip": 0.01295523, "auxiliary_loss_mlp": 0.01032023, "balance_loss_clip": 1.04507422, "balance_loss_mlp": 1.02296925, "epoch": 0.3341549930860338, "flos": 18109715518080.0, "grad_norm": 4.58374583264801, "language_loss": 0.79991364, "learning_rate": 3.105710149993972e-06, "loss": 0.82318914, "num_input_tokens_seen": 59905525, "step": 2779, "time_per_iteration": 2.7472386360168457 }, { "auxiliary_loss_clip": 0.01201781, "auxiliary_loss_mlp": 0.01035747, "balance_loss_clip": 1.05801034, "balance_loss_mlp": 1.02773571, "epoch": 0.33427523597667286, "flos": 22674967418880.0, "grad_norm": 2.5387504433761943, "language_loss": 0.85855955, "learning_rate": 3.1050609666547427e-06, "loss": 0.88093483, "num_input_tokens_seen": 59925085, "step": 2780, "time_per_iteration": 2.6382105350494385 }, { "auxiliary_loss_clip": 0.01352055, "auxiliary_loss_mlp": 0.01033893, "balance_loss_clip": 1.05358946, "balance_loss_mlp": 1.02544117, "epoch": 0.33439547886731197, "flos": 22638338524800.0, "grad_norm": 1.8736338027229247, "language_loss": 0.77823442, "learning_rate": 3.104411615678644e-06, "loss": 0.80209392, "num_input_tokens_seen": 59943935, "step": 2781, "time_per_iteration": 2.734347105026245 }, { "auxiliary_loss_clip": 0.01304948, "auxiliary_loss_mlp": 0.0104344, "balance_loss_clip": 1.0578028, "balance_loss_mlp": 1.03448164, "epoch": 0.3345157217579511, "flos": 24096383395200.0, "grad_norm": 3.4621394168369406, "language_loss": 0.7357372, "learning_rate": 3.1037620971641803e-06, "loss": 0.75922108, "num_input_tokens_seen": 59963725, "step": 2782, "time_per_iteration": 2.8004729747772217 }, { "auxiliary_loss_clip": 0.01201522, "auxiliary_loss_mlp": 0.01030479, "balance_loss_clip": 1.0580771, "balance_loss_mlp": 1.0215379, "epoch": 0.33463596464859013, "flos": 18989491334400.0, "grad_norm": 2.619045902386856, "language_loss": 0.65488327, "learning_rate": 3.1031124112098844e-06, "loss": 0.6772033, "num_input_tokens_seen": 59981935, "step": 2783, "time_per_iteration": 2.5992393493652344 }, { "auxiliary_loss_clip": 0.01300142, "auxiliary_loss_mlp": 0.01037325, "balance_loss_clip": 1.05449033, "balance_loss_mlp": 1.02853894, "epoch": 0.33475620753922924, "flos": 20375607219840.0, "grad_norm": 2.018403684885105, "language_loss": 0.72516191, "learning_rate": 3.1024625579143127e-06, "loss": 0.74853659, "num_input_tokens_seen": 59999455, "step": 2784, "time_per_iteration": 2.6751391887664795 }, { "auxiliary_loss_clip": 0.01200145, "auxiliary_loss_mlp": 0.01039066, "balance_loss_clip": 1.05736971, "balance_loss_mlp": 1.03028297, "epoch": 0.33487645042986836, "flos": 18182578256640.0, "grad_norm": 5.625520016850965, "language_loss": 0.7325536, "learning_rate": 3.101812537376048e-06, "loss": 0.7549457, "num_input_tokens_seen": 60018475, "step": 2785, "time_per_iteration": 2.6273956298828125 }, { "auxiliary_loss_clip": 0.01290044, "auxiliary_loss_mlp": 0.02570789, "balance_loss_clip": 1.05139744, "balance_loss_mlp": 1.0000273, "epoch": 0.3349966933205074, "flos": 25848824135040.0, "grad_norm": 1.955391173297287, "language_loss": 0.84733456, "learning_rate": 3.1011623496936973e-06, "loss": 0.88594294, "num_input_tokens_seen": 60036770, "step": 2786, "time_per_iteration": 2.7413854598999023 }, { "auxiliary_loss_clip": 0.01195992, "auxiliary_loss_mlp": 0.0103395, "balance_loss_clip": 1.05591655, "balance_loss_mlp": 1.02568316, "epoch": 0.3351169362111465, "flos": 28111447699200.0, "grad_norm": 1.9557450623242916, "language_loss": 0.69575524, "learning_rate": 3.100511994965893e-06, "loss": 0.71805465, "num_input_tokens_seen": 60056725, "step": 2787, "time_per_iteration": 2.623093843460083 }, { "auxiliary_loss_clip": 0.01244021, "auxiliary_loss_mlp": 0.01037898, "balance_loss_clip": 1.05621099, "balance_loss_mlp": 1.03007245, "epoch": 0.33523717910178563, "flos": 22673315393280.0, "grad_norm": 1.7538028981508496, "language_loss": 0.84761012, "learning_rate": 3.0998614732912947e-06, "loss": 0.8704294, "num_input_tokens_seen": 60076100, "step": 2788, "time_per_iteration": 2.693640947341919 }, { "auxiliary_loss_clip": 0.01248804, "auxiliary_loss_mlp": 0.01032555, "balance_loss_clip": 1.05791688, "balance_loss_mlp": 1.02378082, "epoch": 0.3353574219924247, "flos": 15669801400320.0, "grad_norm": 2.1441176688974637, "language_loss": 0.68325448, "learning_rate": 3.0992107847685855e-06, "loss": 0.70606804, "num_input_tokens_seen": 60093815, "step": 2789, "time_per_iteration": 2.6067380905151367 }, { "auxiliary_loss_clip": 0.01298339, "auxiliary_loss_mlp": 0.01039334, "balance_loss_clip": 1.05856013, "balance_loss_mlp": 1.02983356, "epoch": 0.3354776648830638, "flos": 24790644443520.0, "grad_norm": 1.8194510368540773, "language_loss": 0.79316336, "learning_rate": 3.0985599294964736e-06, "loss": 0.81654012, "num_input_tokens_seen": 60113370, "step": 2790, "time_per_iteration": 2.6973791122436523 }, { "auxiliary_loss_clip": 0.01271499, "auxiliary_loss_mlp": 0.01039355, "balance_loss_clip": 1.0556314, "balance_loss_mlp": 1.03031325, "epoch": 0.33559790777370285, "flos": 28694852398080.0, "grad_norm": 1.8876852404589808, "language_loss": 0.70185524, "learning_rate": 3.097908907573695e-06, "loss": 0.72496372, "num_input_tokens_seen": 60131350, "step": 2791, "time_per_iteration": 2.7446839809417725 }, { "auxiliary_loss_clip": 0.0144526, "auxiliary_loss_mlp": 0.01030872, "balance_loss_clip": 1.05347109, "balance_loss_mlp": 1.02296877, "epoch": 0.33571815066434196, "flos": 22235779825920.0, "grad_norm": 2.0192186812961954, "language_loss": 0.89545643, "learning_rate": 3.0972577190990067e-06, "loss": 0.92021775, "num_input_tokens_seen": 60149830, "step": 2792, "time_per_iteration": 3.706181049346924 }, { "auxiliary_loss_clip": 0.0135201, "auxiliary_loss_mlp": 0.01036673, "balance_loss_clip": 1.0535835, "balance_loss_mlp": 1.02830482, "epoch": 0.3358383935549811, "flos": 23842279607040.0, "grad_norm": 1.8553399103126984, "language_loss": 0.80052805, "learning_rate": 3.096606364171196e-06, "loss": 0.82441485, "num_input_tokens_seen": 60169620, "step": 2793, "time_per_iteration": 3.6682770252227783 }, { "auxiliary_loss_clip": 0.01341204, "auxiliary_loss_mlp": 0.01032857, "balance_loss_clip": 1.05255628, "balance_loss_mlp": 1.02449441, "epoch": 0.33595863644562013, "flos": 22267308988800.0, "grad_norm": 1.920542284244653, "language_loss": 0.85274148, "learning_rate": 3.0959548428890703e-06, "loss": 0.87648207, "num_input_tokens_seen": 60188490, "step": 2794, "time_per_iteration": 2.688107967376709 }, { "auxiliary_loss_clip": 0.01247473, "auxiliary_loss_mlp": 0.01030394, "balance_loss_clip": 1.05815566, "balance_loss_mlp": 1.02224016, "epoch": 0.33607887933625924, "flos": 20119779578880.0, "grad_norm": 2.2162673016810976, "language_loss": 0.84005433, "learning_rate": 3.095303155351468e-06, "loss": 0.86283302, "num_input_tokens_seen": 60208695, "step": 2795, "time_per_iteration": 2.669416904449463 }, { "auxiliary_loss_clip": 0.01391975, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.05039001, "balance_loss_mlp": 1.0282836, "epoch": 0.33619912222689835, "flos": 19318109886720.0, "grad_norm": 2.6231057846577297, "language_loss": 0.80003983, "learning_rate": 3.0946513016572464e-06, "loss": 0.82432085, "num_input_tokens_seen": 60227600, "step": 2796, "time_per_iteration": 2.7578086853027344 }, { "auxiliary_loss_clip": 0.01256084, "auxiliary_loss_mlp": 0.01031671, "balance_loss_clip": 1.05824137, "balance_loss_mlp": 1.02236044, "epoch": 0.3363193651175374, "flos": 16800664262400.0, "grad_norm": 2.2957509561373026, "language_loss": 0.76776075, "learning_rate": 3.0939992819052938e-06, "loss": 0.79063833, "num_input_tokens_seen": 60245110, "step": 2797, "time_per_iteration": 2.5690016746520996 }, { "auxiliary_loss_clip": 0.01298414, "auxiliary_loss_mlp": 0.01035703, "balance_loss_clip": 1.05548191, "balance_loss_mlp": 1.02758503, "epoch": 0.3364396080081765, "flos": 23550289948800.0, "grad_norm": 2.1555898322291354, "language_loss": 0.81108975, "learning_rate": 3.0933470961945193e-06, "loss": 0.83443093, "num_input_tokens_seen": 60263405, "step": 2798, "time_per_iteration": 2.7319107055664062 }, { "auxiliary_loss_clip": 0.01293056, "auxiliary_loss_mlp": 0.01027229, "balance_loss_clip": 1.05501771, "balance_loss_mlp": 1.0194149, "epoch": 0.3365598508988156, "flos": 28037902602240.0, "grad_norm": 2.556095907357704, "language_loss": 0.68244994, "learning_rate": 3.0926947446238597e-06, "loss": 0.70565283, "num_input_tokens_seen": 60282975, "step": 2799, "time_per_iteration": 2.7062857151031494 }, { "auxiliary_loss_clip": 0.012558, "auxiliary_loss_mlp": 0.01034257, "balance_loss_clip": 1.05601263, "balance_loss_mlp": 1.02554917, "epoch": 0.3366800937894547, "flos": 16982767238400.0, "grad_norm": 2.418136690683408, "language_loss": 0.82298017, "learning_rate": 3.092042227292276e-06, "loss": 0.84588075, "num_input_tokens_seen": 60299810, "step": 2800, "time_per_iteration": 3.579735517501831 }, { "auxiliary_loss_clip": 0.01200587, "auxiliary_loss_mlp": 0.01030937, "balance_loss_clip": 1.05957985, "balance_loss_mlp": 1.02318811, "epoch": 0.3368003366800938, "flos": 23915321913600.0, "grad_norm": 1.6584475596709194, "language_loss": 0.88412666, "learning_rate": 3.0913895442987557e-06, "loss": 0.90644193, "num_input_tokens_seen": 60320775, "step": 2801, "time_per_iteration": 3.609792709350586 }, { "auxiliary_loss_clip": 0.01344038, "auxiliary_loss_mlp": 0.02574172, "balance_loss_clip": 1.05435681, "balance_loss_mlp": 0.99999923, "epoch": 0.3369205795707329, "flos": 24791219061120.0, "grad_norm": 1.549887470672772, "language_loss": 0.8596772, "learning_rate": 3.090736695742308e-06, "loss": 0.89885926, "num_input_tokens_seen": 60341905, "step": 2802, "time_per_iteration": 2.992711067199707 }, { "auxiliary_loss_clip": 0.01393148, "auxiliary_loss_mlp": 0.01028257, "balance_loss_clip": 1.05032766, "balance_loss_mlp": 1.02048469, "epoch": 0.33704082246137196, "flos": 17931096161280.0, "grad_norm": 2.1945931637305005, "language_loss": 0.52512771, "learning_rate": 3.0900836817219713e-06, "loss": 0.54934174, "num_input_tokens_seen": 60358335, "step": 2803, "time_per_iteration": 2.7522132396698 }, { "auxiliary_loss_clip": 0.01200145, "auxiliary_loss_mlp": 0.01027212, "balance_loss_clip": 1.0582372, "balance_loss_mlp": 1.01946294, "epoch": 0.33716106535201107, "flos": 21286517149440.0, "grad_norm": 2.1331154628984668, "language_loss": 0.83926344, "learning_rate": 3.089430502336807e-06, "loss": 0.86153704, "num_input_tokens_seen": 60378305, "step": 2804, "time_per_iteration": 2.6406409740448 }, { "auxiliary_loss_clip": 0.0125389, "auxiliary_loss_mlp": 0.01027484, "balance_loss_clip": 1.05743003, "balance_loss_mlp": 1.01864433, "epoch": 0.3372813082426502, "flos": 18402962152320.0, "grad_norm": 2.879870179361866, "language_loss": 0.9048714, "learning_rate": 3.088777157685902e-06, "loss": 0.92768514, "num_input_tokens_seen": 60393895, "step": 2805, "time_per_iteration": 2.640377998352051 }, { "auxiliary_loss_clip": 0.01295234, "auxiliary_loss_mlp": 0.01030919, "balance_loss_clip": 1.05652869, "balance_loss_mlp": 1.02276516, "epoch": 0.33740155113328923, "flos": 17201391367680.0, "grad_norm": 1.775621810347205, "language_loss": 0.85765034, "learning_rate": 3.088123647868367e-06, "loss": 0.88091189, "num_input_tokens_seen": 60410445, "step": 2806, "time_per_iteration": 2.586916446685791 }, { "auxiliary_loss_clip": 0.01256519, "auxiliary_loss_mlp": 0.01033005, "balance_loss_clip": 1.05726624, "balance_loss_mlp": 1.02441597, "epoch": 0.33752179402392835, "flos": 29058950609280.0, "grad_norm": 4.708248592912433, "language_loss": 0.81795371, "learning_rate": 3.0874699729833405e-06, "loss": 0.84084886, "num_input_tokens_seen": 60431815, "step": 2807, "time_per_iteration": 2.713743209838867 }, { "auxiliary_loss_clip": 0.01294874, "auxiliary_loss_mlp": 0.01026229, "balance_loss_clip": 1.05557394, "balance_loss_mlp": 1.01837897, "epoch": 0.3376420369145674, "flos": 25080730680960.0, "grad_norm": 1.5892024279808348, "language_loss": 0.7992177, "learning_rate": 3.086816133129983e-06, "loss": 0.82242864, "num_input_tokens_seen": 60452075, "step": 2808, "time_per_iteration": 2.7191998958587646 }, { "auxiliary_loss_clip": 0.01206637, "auxiliary_loss_mlp": 0.01036902, "balance_loss_clip": 1.06467474, "balance_loss_mlp": 1.0287776, "epoch": 0.3377622798052065, "flos": 27490624007040.0, "grad_norm": 3.157989242402198, "language_loss": 0.76340902, "learning_rate": 3.0861621284074826e-06, "loss": 0.78584445, "num_input_tokens_seen": 60472600, "step": 2809, "time_per_iteration": 2.6715030670166016 }, { "auxiliary_loss_clip": 0.01216108, "auxiliary_loss_mlp": 0.01031855, "balance_loss_clip": 1.05881977, "balance_loss_mlp": 1.02385557, "epoch": 0.3378825226958456, "flos": 21975211589760.0, "grad_norm": 1.593514767918034, "language_loss": 0.72983742, "learning_rate": 3.085507958915051e-06, "loss": 0.75231707, "num_input_tokens_seen": 60491030, "step": 2810, "time_per_iteration": 2.662234306335449 }, { "auxiliary_loss_clip": 0.01296243, "auxiliary_loss_mlp": 0.01025704, "balance_loss_clip": 1.05559945, "balance_loss_mlp": 1.01782143, "epoch": 0.3380027655864847, "flos": 42523189200000.0, "grad_norm": 2.5701637048061117, "language_loss": 0.71652627, "learning_rate": 3.084853624751925e-06, "loss": 0.73974574, "num_input_tokens_seen": 60512615, "step": 2811, "time_per_iteration": 2.9217023849487305 }, { "auxiliary_loss_clip": 0.01353239, "auxiliary_loss_mlp": 0.01032007, "balance_loss_clip": 1.05834317, "balance_loss_mlp": 1.02399063, "epoch": 0.3381230084771238, "flos": 26725080418560.0, "grad_norm": 1.9002776208774446, "language_loss": 0.8558706, "learning_rate": 3.0841991260173668e-06, "loss": 0.87972307, "num_input_tokens_seen": 60532520, "step": 2812, "time_per_iteration": 2.7624831199645996 }, { "auxiliary_loss_clip": 0.01204173, "auxiliary_loss_mlp": 0.01032448, "balance_loss_clip": 1.06061471, "balance_loss_mlp": 1.02441335, "epoch": 0.3382432513677629, "flos": 22710375250560.0, "grad_norm": 1.9026400019455083, "language_loss": 0.80556357, "learning_rate": 3.0835444628106634e-06, "loss": 0.82792974, "num_input_tokens_seen": 60551500, "step": 2813, "time_per_iteration": 2.6516847610473633 }, { "auxiliary_loss_clip": 0.01199441, "auxiliary_loss_mlp": 0.02572286, "balance_loss_clip": 1.05791736, "balance_loss_mlp": 0.99997997, "epoch": 0.33836349425840195, "flos": 22122409524480.0, "grad_norm": 1.8450959070441355, "language_loss": 0.8335017, "learning_rate": 3.082889635231126e-06, "loss": 0.87121892, "num_input_tokens_seen": 60570160, "step": 2814, "time_per_iteration": 2.6424148082733154 }, { "auxiliary_loss_clip": 0.01302915, "auxiliary_loss_mlp": 0.01028198, "balance_loss_clip": 1.05555987, "balance_loss_mlp": 1.01978791, "epoch": 0.33848373714904106, "flos": 27308090067840.0, "grad_norm": 2.709681895240856, "language_loss": 0.77549541, "learning_rate": 3.0822346433780925e-06, "loss": 0.79880655, "num_input_tokens_seen": 60590885, "step": 2815, "time_per_iteration": 2.746055841445923 }, { "auxiliary_loss_clip": 0.0125323, "auxiliary_loss_mlp": 0.01032669, "balance_loss_clip": 1.05635333, "balance_loss_mlp": 1.02447343, "epoch": 0.3386039800396802, "flos": 25848716394240.0, "grad_norm": 2.751785574958954, "language_loss": 0.87506437, "learning_rate": 3.0815794873509237e-06, "loss": 0.89792341, "num_input_tokens_seen": 60609170, "step": 2816, "time_per_iteration": 2.728621482849121 }, { "auxiliary_loss_clip": 0.01201671, "auxiliary_loss_mlp": 0.01031387, "balance_loss_clip": 1.05895519, "balance_loss_mlp": 1.02279782, "epoch": 0.33872422293031923, "flos": 18880646146560.0, "grad_norm": 2.2851441945425397, "language_loss": 0.72633129, "learning_rate": 3.0809241672490066e-06, "loss": 0.74866188, "num_input_tokens_seen": 60627340, "step": 2817, "time_per_iteration": 2.5971593856811523 }, { "auxiliary_loss_clip": 0.01298089, "auxiliary_loss_mlp": 0.01035597, "balance_loss_clip": 1.05523443, "balance_loss_mlp": 1.02749085, "epoch": 0.33884446582095834, "flos": 23146977064320.0, "grad_norm": 1.6565066312209926, "language_loss": 0.84890008, "learning_rate": 3.080268683171753e-06, "loss": 0.87223697, "num_input_tokens_seen": 60647630, "step": 2818, "time_per_iteration": 3.653149366378784 }, { "auxiliary_loss_clip": 0.01250069, "auxiliary_loss_mlp": 0.0102994, "balance_loss_clip": 1.05475175, "balance_loss_mlp": 1.02245355, "epoch": 0.33896470871159745, "flos": 15997342544640.0, "grad_norm": 1.9957402452401236, "language_loss": 0.89259076, "learning_rate": 3.0796130352185985e-06, "loss": 0.91539085, "num_input_tokens_seen": 60664485, "step": 2819, "time_per_iteration": 3.490203619003296 }, { "auxiliary_loss_clip": 0.01290964, "auxiliary_loss_mlp": 0.02574613, "balance_loss_clip": 1.04815495, "balance_loss_mlp": 1.00000346, "epoch": 0.3390849516022365, "flos": 34495754112000.0, "grad_norm": 4.566977908397807, "language_loss": 0.66949272, "learning_rate": 3.0789572234890057e-06, "loss": 0.70814848, "num_input_tokens_seen": 60686125, "step": 2820, "time_per_iteration": 2.8559441566467285 }, { "auxiliary_loss_clip": 0.013021, "auxiliary_loss_mlp": 0.0103376, "balance_loss_clip": 1.06087303, "balance_loss_mlp": 1.02517116, "epoch": 0.3392051944928756, "flos": 16180307447040.0, "grad_norm": 1.8873803002495546, "language_loss": 0.77251768, "learning_rate": 3.0783012480824596e-06, "loss": 0.79587626, "num_input_tokens_seen": 60705270, "step": 2821, "time_per_iteration": 2.7080438137054443 }, { "auxiliary_loss_clip": 0.01198167, "auxiliary_loss_mlp": 0.01038844, "balance_loss_clip": 1.05635285, "balance_loss_mlp": 1.03013599, "epoch": 0.33932543738351467, "flos": 17086656349440.0, "grad_norm": 4.811029889216106, "language_loss": 0.74348807, "learning_rate": 3.077645109098471e-06, "loss": 0.76585817, "num_input_tokens_seen": 60721540, "step": 2822, "time_per_iteration": 2.6358115673065186 }, { "auxiliary_loss_clip": 0.01340399, "auxiliary_loss_mlp": 0.01032482, "balance_loss_clip": 1.05388868, "balance_loss_mlp": 1.02432179, "epoch": 0.3394456802741538, "flos": 22126970551680.0, "grad_norm": 1.660770272088191, "language_loss": 0.7236433, "learning_rate": 3.076988806636577e-06, "loss": 0.74737215, "num_input_tokens_seen": 60739300, "step": 2823, "time_per_iteration": 2.7425107955932617 }, { "auxiliary_loss_clip": 0.01303298, "auxiliary_loss_mlp": 0.02576644, "balance_loss_clip": 1.05895877, "balance_loss_mlp": 0.99998045, "epoch": 0.3395659231647929, "flos": 25226887121280.0, "grad_norm": 2.0491568020610758, "language_loss": 0.88920105, "learning_rate": 3.0763323407963377e-06, "loss": 0.92800045, "num_input_tokens_seen": 60758910, "step": 2824, "time_per_iteration": 2.6998302936553955 }, { "auxiliary_loss_clip": 0.0124951, "auxiliary_loss_mlp": 0.01029751, "balance_loss_clip": 1.05601192, "balance_loss_mlp": 1.0215261, "epoch": 0.33968616605543195, "flos": 29096477343360.0, "grad_norm": 1.8844807428221013, "language_loss": 0.80271935, "learning_rate": 3.075675711677337e-06, "loss": 0.82551205, "num_input_tokens_seen": 60779005, "step": 2825, "time_per_iteration": 2.7269704341888428 }, { "auxiliary_loss_clip": 0.01293676, "auxiliary_loss_mlp": 0.01031432, "balance_loss_clip": 1.05855143, "balance_loss_mlp": 1.02360034, "epoch": 0.33980640894607106, "flos": 21433966479360.0, "grad_norm": 3.4446712259318475, "language_loss": 0.78205991, "learning_rate": 3.0750189193791865e-06, "loss": 0.80531096, "num_input_tokens_seen": 60798590, "step": 2826, "time_per_iteration": 3.5523312091827393 }, { "auxiliary_loss_clip": 0.01248826, "auxiliary_loss_mlp": 0.01031671, "balance_loss_clip": 1.05568397, "balance_loss_mlp": 1.02295053, "epoch": 0.33992665183671017, "flos": 32490035596800.0, "grad_norm": 2.090682257376321, "language_loss": 0.70584714, "learning_rate": 3.0743619640015203e-06, "loss": 0.72865212, "num_input_tokens_seen": 60818840, "step": 2827, "time_per_iteration": 3.6334452629089355 }, { "auxiliary_loss_clip": 0.01303341, "auxiliary_loss_mlp": 0.01032582, "balance_loss_clip": 1.05250978, "balance_loss_mlp": 1.02445757, "epoch": 0.3400468947273492, "flos": 17055414495360.0, "grad_norm": 1.8422748635004635, "language_loss": 0.92346609, "learning_rate": 3.073704845643999e-06, "loss": 0.94682539, "num_input_tokens_seen": 60835965, "step": 2828, "time_per_iteration": 2.660728693008423 }, { "auxiliary_loss_clip": 0.01253035, "auxiliary_loss_mlp": 0.01033998, "balance_loss_clip": 1.05548024, "balance_loss_mlp": 1.0251472, "epoch": 0.34016713761798834, "flos": 16872988296960.0, "grad_norm": 2.7938925364671094, "language_loss": 0.7860617, "learning_rate": 3.0730475644063063e-06, "loss": 0.80893207, "num_input_tokens_seen": 60851065, "step": 2829, "time_per_iteration": 2.6082005500793457 }, { "auxiliary_loss_clip": 0.0120249, "auxiliary_loss_mlp": 0.02569882, "balance_loss_clip": 1.05074823, "balance_loss_mlp": 0.99997818, "epoch": 0.34028738050862745, "flos": 21907161273600.0, "grad_norm": 1.8633370378373313, "language_loss": 0.64551264, "learning_rate": 3.072390120388151e-06, "loss": 0.68323636, "num_input_tokens_seen": 60869390, "step": 2830, "time_per_iteration": 2.699586868286133 }, { "auxiliary_loss_clip": 0.01247545, "auxiliary_loss_mlp": 0.01036073, "balance_loss_clip": 1.0557965, "balance_loss_mlp": 1.02677464, "epoch": 0.3404076233992665, "flos": 22746034477440.0, "grad_norm": 2.0490007837624358, "language_loss": 0.71676713, "learning_rate": 3.071732513689267e-06, "loss": 0.73960334, "num_input_tokens_seen": 60887925, "step": 2831, "time_per_iteration": 2.6384952068328857 }, { "auxiliary_loss_clip": 0.01258113, "auxiliary_loss_mlp": 0.01028639, "balance_loss_clip": 1.06201804, "balance_loss_mlp": 1.0199312, "epoch": 0.3405278662899056, "flos": 17052361839360.0, "grad_norm": 2.325105304614131, "language_loss": 0.67657918, "learning_rate": 3.0710747444094134e-06, "loss": 0.69944668, "num_input_tokens_seen": 60905955, "step": 2832, "time_per_iteration": 2.61584210395813 }, { "auxiliary_loss_clip": 0.01299803, "auxiliary_loss_mlp": 0.01032187, "balance_loss_clip": 1.0580169, "balance_loss_mlp": 1.02380681, "epoch": 0.3406481091805447, "flos": 42813131783040.0, "grad_norm": 1.9671626811567173, "language_loss": 0.65230381, "learning_rate": 3.070416812648372e-06, "loss": 0.67562366, "num_input_tokens_seen": 60929405, "step": 2833, "time_per_iteration": 2.8860719203948975 }, { "auxiliary_loss_clip": 0.01347546, "auxiliary_loss_mlp": 0.01030713, "balance_loss_clip": 1.049505, "balance_loss_mlp": 1.02189779, "epoch": 0.3407683520711838, "flos": 26761457917440.0, "grad_norm": 2.7036207958834506, "language_loss": 0.64696276, "learning_rate": 3.069758718505951e-06, "loss": 0.67074531, "num_input_tokens_seen": 60951145, "step": 2834, "time_per_iteration": 2.7567684650421143 }, { "auxiliary_loss_clip": 0.01202627, "auxiliary_loss_mlp": 0.01028605, "balance_loss_clip": 1.06062388, "balance_loss_mlp": 1.02048635, "epoch": 0.3408885949618229, "flos": 28767643309440.0, "grad_norm": 1.9529698311175585, "language_loss": 0.80214089, "learning_rate": 3.0691004620819836e-06, "loss": 0.82445318, "num_input_tokens_seen": 60971275, "step": 2835, "time_per_iteration": 2.6620278358459473 }, { "auxiliary_loss_clip": 0.01302684, "auxiliary_loss_mlp": 0.01008036, "balance_loss_clip": 1.02286744, "balance_loss_mlp": 1.0065341, "epoch": 0.341008837852462, "flos": 63576252881280.0, "grad_norm": 0.8083668566183136, "language_loss": 0.60224771, "learning_rate": 3.0684420434763254e-06, "loss": 0.62535489, "num_input_tokens_seen": 61037460, "step": 2836, "time_per_iteration": 3.36918568611145 }, { "auxiliary_loss_clip": 0.01342219, "auxiliary_loss_mlp": 0.01029184, "balance_loss_clip": 1.05455899, "balance_loss_mlp": 1.02140307, "epoch": 0.34112908074310105, "flos": 20812173120000.0, "grad_norm": 1.9520546707809927, "language_loss": 0.76898718, "learning_rate": 3.06778346278886e-06, "loss": 0.79270118, "num_input_tokens_seen": 61056295, "step": 2837, "time_per_iteration": 2.8621397018432617 }, { "auxiliary_loss_clip": 0.01202164, "auxiliary_loss_mlp": 0.01031828, "balance_loss_clip": 1.06100523, "balance_loss_mlp": 1.02337003, "epoch": 0.34124932363374016, "flos": 24976446520320.0, "grad_norm": 1.6385479522757052, "language_loss": 0.78992945, "learning_rate": 3.0671247201194906e-06, "loss": 0.81226933, "num_input_tokens_seen": 61078430, "step": 2838, "time_per_iteration": 2.655972957611084 }, { "auxiliary_loss_clip": 0.01349351, "auxiliary_loss_mlp": 0.01024821, "balance_loss_clip": 1.05456686, "balance_loss_mlp": 1.01666141, "epoch": 0.3413695665243792, "flos": 28402970480640.0, "grad_norm": 1.746291028028605, "language_loss": 0.75365794, "learning_rate": 3.066465815568151e-06, "loss": 0.77739966, "num_input_tokens_seen": 61099260, "step": 2839, "time_per_iteration": 2.775007963180542 }, { "auxiliary_loss_clip": 0.01249987, "auxiliary_loss_mlp": 0.01026512, "balance_loss_clip": 1.05468464, "balance_loss_mlp": 1.01823926, "epoch": 0.34148980941501833, "flos": 25302012416640.0, "grad_norm": 2.371282391575246, "language_loss": 0.68747717, "learning_rate": 3.0658067492347947e-06, "loss": 0.71024215, "num_input_tokens_seen": 61121900, "step": 2840, "time_per_iteration": 2.6964564323425293 }, { "auxiliary_loss_clip": 0.01488878, "auxiliary_loss_mlp": 0.01030524, "balance_loss_clip": 1.0485158, "balance_loss_mlp": 1.02223873, "epoch": 0.34161005230565744, "flos": 17530081747200.0, "grad_norm": 2.3658192162742435, "language_loss": 0.67217636, "learning_rate": 3.065147521219402e-06, "loss": 0.69737035, "num_input_tokens_seen": 61141155, "step": 2841, "time_per_iteration": 2.7631824016571045 }, { "auxiliary_loss_clip": 0.01294178, "auxiliary_loss_mlp": 0.01028806, "balance_loss_clip": 1.05532622, "balance_loss_mlp": 1.0211854, "epoch": 0.3417302951962965, "flos": 43650101566080.0, "grad_norm": 1.9523049982021068, "language_loss": 0.74436152, "learning_rate": 3.064488131621977e-06, "loss": 0.76759136, "num_input_tokens_seen": 61164480, "step": 2842, "time_per_iteration": 2.8839781284332275 }, { "auxiliary_loss_clip": 0.01242044, "auxiliary_loss_mlp": 0.01031183, "balance_loss_clip": 1.05292928, "balance_loss_mlp": 1.02276039, "epoch": 0.3418505380869356, "flos": 30882207012480.0, "grad_norm": 1.7370054330186282, "language_loss": 0.73704726, "learning_rate": 3.063828580542549e-06, "loss": 0.75977945, "num_input_tokens_seen": 61185675, "step": 2843, "time_per_iteration": 2.6840124130249023 }, { "auxiliary_loss_clip": 0.01296013, "auxiliary_loss_mlp": 0.01030074, "balance_loss_clip": 1.05410957, "balance_loss_mlp": 1.02252758, "epoch": 0.3419707809775747, "flos": 19463871277440.0, "grad_norm": 2.2215956593803714, "language_loss": 0.73463941, "learning_rate": 3.0631688680811706e-06, "loss": 0.75790024, "num_input_tokens_seen": 61205300, "step": 2844, "time_per_iteration": 3.632139205932617 }, { "auxiliary_loss_clip": 0.01199621, "auxiliary_loss_mlp": 0.01029978, "balance_loss_clip": 1.05813038, "balance_loss_mlp": 1.02138352, "epoch": 0.3420910238682138, "flos": 28727818104960.0, "grad_norm": 2.407393847541024, "language_loss": 0.75839919, "learning_rate": 3.062508994337921e-06, "loss": 0.7806952, "num_input_tokens_seen": 61224905, "step": 2845, "time_per_iteration": 2.6265995502471924 }, { "auxiliary_loss_clip": 0.01246338, "auxiliary_loss_mlp": 0.01028562, "balance_loss_clip": 1.0550642, "balance_loss_mlp": 1.02016997, "epoch": 0.3422112667588529, "flos": 21397265758080.0, "grad_norm": 2.1942321686988753, "language_loss": 0.79553413, "learning_rate": 3.0618489594129013e-06, "loss": 0.81828308, "num_input_tokens_seen": 61243045, "step": 2846, "time_per_iteration": 3.5991854667663574 }, { "auxiliary_loss_clip": 0.01355145, "auxiliary_loss_mlp": 0.01030289, "balance_loss_clip": 1.05739653, "balance_loss_mlp": 1.02204561, "epoch": 0.342331509649492, "flos": 13881450038400.0, "grad_norm": 6.168506178877179, "language_loss": 0.71447265, "learning_rate": 3.061188763406239e-06, "loss": 0.73832703, "num_input_tokens_seen": 61259190, "step": 2847, "time_per_iteration": 2.7855207920074463 }, { "auxiliary_loss_clip": 0.01300141, "auxiliary_loss_mlp": 0.01030504, "balance_loss_clip": 1.05668342, "balance_loss_mlp": 1.02229023, "epoch": 0.34245175254013105, "flos": 28621450955520.0, "grad_norm": 2.8795020495272, "language_loss": 0.82370239, "learning_rate": 3.060528406418085e-06, "loss": 0.84700882, "num_input_tokens_seen": 61279040, "step": 2848, "time_per_iteration": 2.7231194972991943 }, { "auxiliary_loss_clip": 0.01297538, "auxiliary_loss_mlp": 0.01034378, "balance_loss_clip": 1.0565294, "balance_loss_mlp": 1.02690339, "epoch": 0.34257199543077016, "flos": 34127058960000.0, "grad_norm": 1.718029328777169, "language_loss": 0.61779535, "learning_rate": 3.0598678885486145e-06, "loss": 0.64111447, "num_input_tokens_seen": 61301580, "step": 2849, "time_per_iteration": 2.8023905754089355 }, { "auxiliary_loss_clip": 0.01256942, "auxiliary_loss_mlp": 0.02568102, "balance_loss_clip": 1.0489614, "balance_loss_mlp": 1.00002313, "epoch": 0.34269223832140927, "flos": 19974018188160.0, "grad_norm": 1.7060818850665407, "language_loss": 0.74576873, "learning_rate": 3.0592072098980282e-06, "loss": 0.78401917, "num_input_tokens_seen": 61321240, "step": 2850, "time_per_iteration": 2.791290760040283 }, { "auxiliary_loss_clip": 0.01300389, "auxiliary_loss_mlp": 0.0104062, "balance_loss_clip": 1.05659628, "balance_loss_mlp": 1.03215027, "epoch": 0.3428124812120483, "flos": 27235658292480.0, "grad_norm": 3.517453095316934, "language_loss": 0.73346376, "learning_rate": 3.0585463705665514e-06, "loss": 0.75687385, "num_input_tokens_seen": 61341615, "step": 2851, "time_per_iteration": 2.74783992767334 }, { "auxiliary_loss_clip": 0.0134765, "auxiliary_loss_mlp": 0.01027351, "balance_loss_clip": 1.05221713, "balance_loss_mlp": 1.01920271, "epoch": 0.34293272410268744, "flos": 24570871079040.0, "grad_norm": 2.1911681322027463, "language_loss": 0.71083188, "learning_rate": 3.0578853706544304e-06, "loss": 0.73458183, "num_input_tokens_seen": 61359005, "step": 2852, "time_per_iteration": 3.66605806350708 }, { "auxiliary_loss_clip": 0.01355384, "auxiliary_loss_mlp": 0.02569147, "balance_loss_clip": 1.05518365, "balance_loss_mlp": 1.00014806, "epoch": 0.34305296699332655, "flos": 21506865131520.0, "grad_norm": 2.5346313080248914, "language_loss": 0.65659481, "learning_rate": 3.0572242102619404e-06, "loss": 0.69584012, "num_input_tokens_seen": 61376160, "step": 2853, "time_per_iteration": 2.7188103199005127 }, { "auxiliary_loss_clip": 0.01297008, "auxiliary_loss_mlp": 0.01030132, "balance_loss_clip": 1.05779171, "balance_loss_mlp": 1.02271152, "epoch": 0.3431732098839656, "flos": 24056665931520.0, "grad_norm": 1.880083563564818, "language_loss": 0.8067044, "learning_rate": 3.0565628894893784e-06, "loss": 0.82997584, "num_input_tokens_seen": 61396795, "step": 2854, "time_per_iteration": 3.6470959186553955 }, { "auxiliary_loss_clip": 0.01245735, "auxiliary_loss_mlp": 0.01032411, "balance_loss_clip": 1.05618906, "balance_loss_mlp": 1.0247097, "epoch": 0.3432934527746047, "flos": 16800879744000.0, "grad_norm": 1.7589289009453013, "language_loss": 0.7493031, "learning_rate": 3.0559014084370655e-06, "loss": 0.77208459, "num_input_tokens_seen": 61415320, "step": 2855, "time_per_iteration": 2.6362407207489014 }, { "auxiliary_loss_clip": 0.01305029, "auxiliary_loss_mlp": 0.01033282, "balance_loss_clip": 1.05777144, "balance_loss_mlp": 1.02461553, "epoch": 0.34341369566524377, "flos": 23439720908160.0, "grad_norm": 1.817655543699375, "language_loss": 0.78906167, "learning_rate": 3.055239767205349e-06, "loss": 0.81244481, "num_input_tokens_seen": 61437070, "step": 2856, "time_per_iteration": 2.6955840587615967 }, { "auxiliary_loss_clip": 0.01249436, "auxiliary_loss_mlp": 0.01030523, "balance_loss_clip": 1.06122398, "balance_loss_mlp": 1.02222037, "epoch": 0.3435339385558829, "flos": 17267466435840.0, "grad_norm": 1.7596597196699444, "language_loss": 0.78241777, "learning_rate": 3.054577965894599e-06, "loss": 0.80521739, "num_input_tokens_seen": 61453215, "step": 2857, "time_per_iteration": 2.627566337585449 }, { "auxiliary_loss_clip": 0.01215179, "auxiliary_loss_mlp": 0.0103201, "balance_loss_clip": 1.06020951, "balance_loss_mlp": 1.02393627, "epoch": 0.343654181446522, "flos": 22199366413440.0, "grad_norm": 2.432306442392321, "language_loss": 0.70391345, "learning_rate": 3.0539160046052094e-06, "loss": 0.72638535, "num_input_tokens_seen": 61472915, "step": 2858, "time_per_iteration": 2.67649245262146 }, { "auxiliary_loss_clip": 0.01296361, "auxiliary_loss_mlp": 0.01021229, "balance_loss_clip": 1.05562878, "balance_loss_mlp": 1.01306343, "epoch": 0.34377442433716104, "flos": 19901801894400.0, "grad_norm": 2.269103280764902, "language_loss": 0.70520192, "learning_rate": 3.0532538834376003e-06, "loss": 0.72837782, "num_input_tokens_seen": 61492475, "step": 2859, "time_per_iteration": 2.71311092376709 }, { "auxiliary_loss_clip": 0.01253983, "auxiliary_loss_mlp": 0.01032498, "balance_loss_clip": 1.05755031, "balance_loss_mlp": 1.02429628, "epoch": 0.34389466722780015, "flos": 22197678474240.0, "grad_norm": 1.7507060213573395, "language_loss": 0.78002769, "learning_rate": 3.0525916024922143e-06, "loss": 0.80289251, "num_input_tokens_seen": 61511660, "step": 2860, "time_per_iteration": 2.6404237747192383 }, { "auxiliary_loss_clip": 0.01299193, "auxiliary_loss_mlp": 0.01030679, "balance_loss_clip": 1.05523217, "balance_loss_mlp": 1.02315092, "epoch": 0.34401491011843927, "flos": 18624567110400.0, "grad_norm": 2.7706428263664553, "language_loss": 0.83950269, "learning_rate": 3.0519291618695193e-06, "loss": 0.86280143, "num_input_tokens_seen": 61529060, "step": 2861, "time_per_iteration": 2.705951452255249 }, { "auxiliary_loss_clip": 0.01343416, "auxiliary_loss_mlp": 0.01031021, "balance_loss_clip": 1.05178261, "balance_loss_mlp": 1.0230782, "epoch": 0.3441351530090783, "flos": 17858197509120.0, "grad_norm": 1.5241441360545758, "language_loss": 0.76151222, "learning_rate": 3.0512665616700065e-06, "loss": 0.78525662, "num_input_tokens_seen": 61548125, "step": 2862, "time_per_iteration": 2.7042758464813232 }, { "auxiliary_loss_clip": 0.01392442, "auxiliary_loss_mlp": 0.01029598, "balance_loss_clip": 1.04903865, "balance_loss_mlp": 1.02204311, "epoch": 0.34425539589971743, "flos": 23112754381440.0, "grad_norm": 1.8767471386832204, "language_loss": 0.89043045, "learning_rate": 3.0506038019941933e-06, "loss": 0.91465086, "num_input_tokens_seen": 61568135, "step": 2863, "time_per_iteration": 2.850890874862671 }, { "auxiliary_loss_clip": 0.01344466, "auxiliary_loss_mlp": 0.01031129, "balance_loss_clip": 1.05469656, "balance_loss_mlp": 1.02281404, "epoch": 0.34437563879035654, "flos": 21907699977600.0, "grad_norm": 2.8541793409845844, "language_loss": 0.68242025, "learning_rate": 3.049940882942617e-06, "loss": 0.70617622, "num_input_tokens_seen": 61586920, "step": 2864, "time_per_iteration": 2.7351346015930176 }, { "auxiliary_loss_clip": 0.01197173, "auxiliary_loss_mlp": 0.01029353, "balance_loss_clip": 1.0564661, "balance_loss_mlp": 1.02171206, "epoch": 0.3444958816809956, "flos": 23076915586560.0, "grad_norm": 1.7515300265296787, "language_loss": 0.80567169, "learning_rate": 3.0492778046158448e-06, "loss": 0.82793695, "num_input_tokens_seen": 61608340, "step": 2865, "time_per_iteration": 2.660358190536499 }, { "auxiliary_loss_clip": 0.0124273, "auxiliary_loss_mlp": 0.01027736, "balance_loss_clip": 1.05847704, "balance_loss_mlp": 1.02042818, "epoch": 0.3446161245716347, "flos": 21908633731200.0, "grad_norm": 2.283570285180506, "language_loss": 0.76906985, "learning_rate": 3.0486145671144633e-06, "loss": 0.79177451, "num_input_tokens_seen": 61628130, "step": 2866, "time_per_iteration": 2.692950487136841 }, { "auxiliary_loss_clip": 0.01442197, "auxiliary_loss_mlp": 0.01032638, "balance_loss_clip": 1.05166888, "balance_loss_mlp": 1.02469313, "epoch": 0.3447363674622738, "flos": 25112834461440.0, "grad_norm": 2.5124677619776783, "language_loss": 0.76629072, "learning_rate": 3.047951170539086e-06, "loss": 0.79103911, "num_input_tokens_seen": 61647755, "step": 2867, "time_per_iteration": 2.8293051719665527 }, { "auxiliary_loss_clip": 0.0135478, "auxiliary_loss_mlp": 0.01025822, "balance_loss_clip": 1.06390595, "balance_loss_mlp": 1.01877367, "epoch": 0.3448566103529129, "flos": 11984684451840.0, "grad_norm": 1.856365461727799, "language_loss": 0.84168553, "learning_rate": 3.047287614990349e-06, "loss": 0.86549151, "num_input_tokens_seen": 61665675, "step": 2868, "time_per_iteration": 2.68662691116333 }, { "auxiliary_loss_clip": 0.01295198, "auxiliary_loss_mlp": 0.01033003, "balance_loss_clip": 1.05649853, "balance_loss_mlp": 1.02455127, "epoch": 0.344976853243552, "flos": 40187882465280.0, "grad_norm": 2.300300943102262, "language_loss": 0.62307304, "learning_rate": 3.046623900568914e-06, "loss": 0.64635503, "num_input_tokens_seen": 61688240, "step": 2869, "time_per_iteration": 2.7912259101867676 }, { "auxiliary_loss_clip": 0.01304642, "auxiliary_loss_mlp": 0.01027074, "balance_loss_clip": 1.05873775, "balance_loss_mlp": 1.01861, "epoch": 0.34509709613419104, "flos": 28723652127360.0, "grad_norm": 3.2359562199309555, "language_loss": 0.69886613, "learning_rate": 3.045960027375465e-06, "loss": 0.72218323, "num_input_tokens_seen": 61706075, "step": 2870, "time_per_iteration": 3.9531455039978027 }, { "auxiliary_loss_clip": 0.0125205, "auxiliary_loss_mlp": 0.01032673, "balance_loss_clip": 1.05641341, "balance_loss_mlp": 1.02435255, "epoch": 0.34521733902483015, "flos": 29967597982080.0, "grad_norm": 3.158265182135427, "language_loss": 0.82837856, "learning_rate": 3.045295995510711e-06, "loss": 0.85122579, "num_input_tokens_seen": 61723045, "step": 2871, "time_per_iteration": 2.7042510509490967 }, { "auxiliary_loss_clip": 0.01304449, "auxiliary_loss_mlp": 0.01035667, "balance_loss_clip": 1.0610218, "balance_loss_mlp": 1.02863717, "epoch": 0.34533758191546926, "flos": 27923059843200.0, "grad_norm": 2.502218940855827, "language_loss": 0.7349174, "learning_rate": 3.0446318050753865e-06, "loss": 0.75831854, "num_input_tokens_seen": 61743525, "step": 2872, "time_per_iteration": 3.68770432472229 }, { "auxiliary_loss_clip": 0.01245306, "auxiliary_loss_mlp": 0.01025328, "balance_loss_clip": 1.05523014, "balance_loss_mlp": 1.01739478, "epoch": 0.3454578248061083, "flos": 27125879351040.0, "grad_norm": 2.001408784185906, "language_loss": 0.77418417, "learning_rate": 3.0439674561702474e-06, "loss": 0.79689056, "num_input_tokens_seen": 61763025, "step": 2873, "time_per_iteration": 2.7185468673706055 }, { "auxiliary_loss_clip": 0.01242911, "auxiliary_loss_mlp": 0.010304, "balance_loss_clip": 1.05783534, "balance_loss_mlp": 1.02299142, "epoch": 0.3455780676967474, "flos": 19024899166080.0, "grad_norm": 2.183170458335052, "language_loss": 0.88232958, "learning_rate": 3.043302948896076e-06, "loss": 0.90506274, "num_input_tokens_seen": 61781630, "step": 2874, "time_per_iteration": 2.6122987270355225 }, { "auxiliary_loss_clip": 0.01393698, "auxiliary_loss_mlp": 0.010284, "balance_loss_clip": 1.05439997, "balance_loss_mlp": 1.0206573, "epoch": 0.34569831058738654, "flos": 34496005507200.0, "grad_norm": 1.806954175569307, "language_loss": 0.60424829, "learning_rate": 3.0426382833536756e-06, "loss": 0.62846929, "num_input_tokens_seen": 61804985, "step": 2875, "time_per_iteration": 2.9287376403808594 }, { "auxiliary_loss_clip": 0.01344807, "auxiliary_loss_mlp": 0.01037179, "balance_loss_clip": 1.05255508, "balance_loss_mlp": 1.02866125, "epoch": 0.3458185534780256, "flos": 31138681098240.0, "grad_norm": 3.6052410968422124, "language_loss": 0.77794123, "learning_rate": 3.041973459643877e-06, "loss": 0.80176115, "num_input_tokens_seen": 61824440, "step": 2876, "time_per_iteration": 2.772815465927124 }, { "auxiliary_loss_clip": 0.0139636, "auxiliary_loss_mlp": 0.0102963, "balance_loss_clip": 1.05062151, "balance_loss_mlp": 1.02150869, "epoch": 0.3459387963686647, "flos": 32452508862720.0, "grad_norm": 2.063082479889827, "language_loss": 0.6724571, "learning_rate": 3.0413084778675334e-06, "loss": 0.69671702, "num_input_tokens_seen": 61845690, "step": 2877, "time_per_iteration": 2.874943494796753 }, { "auxiliary_loss_clip": 0.01294589, "auxiliary_loss_mlp": 0.0256529, "balance_loss_clip": 1.05235481, "balance_loss_mlp": 1.00005674, "epoch": 0.3460590392593038, "flos": 24675658030080.0, "grad_norm": 3.232096635245014, "language_loss": 0.84104788, "learning_rate": 3.0406433381255214e-06, "loss": 0.87964666, "num_input_tokens_seen": 61863725, "step": 2878, "time_per_iteration": 3.541861057281494 }, { "auxiliary_loss_clip": 0.01247441, "auxiliary_loss_mlp": 0.01031549, "balance_loss_clip": 1.06137574, "balance_loss_mlp": 1.02358246, "epoch": 0.34617928214994287, "flos": 18807316531200.0, "grad_norm": 3.578625584412755, "language_loss": 0.82272971, "learning_rate": 3.0399780405187425e-06, "loss": 0.84551954, "num_input_tokens_seen": 61882720, "step": 2879, "time_per_iteration": 3.5735225677490234 }, { "auxiliary_loss_clip": 0.0124609, "auxiliary_loss_mlp": 0.01028198, "balance_loss_clip": 1.0562067, "balance_loss_mlp": 1.02059233, "epoch": 0.346299525040582, "flos": 24857653265280.0, "grad_norm": 1.787639165991668, "language_loss": 0.78503442, "learning_rate": 3.0393125851481216e-06, "loss": 0.80777729, "num_input_tokens_seen": 61902595, "step": 2880, "time_per_iteration": 2.6958537101745605 }, { "auxiliary_loss_clip": 0.01344586, "auxiliary_loss_mlp": 0.01027705, "balance_loss_clip": 1.05385017, "balance_loss_mlp": 1.02018893, "epoch": 0.3464197679312211, "flos": 16434914025600.0, "grad_norm": 2.3212336242194986, "language_loss": 0.86662054, "learning_rate": 3.038646972114608e-06, "loss": 0.89034349, "num_input_tokens_seen": 61918920, "step": 2881, "time_per_iteration": 2.6876232624053955 }, { "auxiliary_loss_clip": 0.01342013, "auxiliary_loss_mlp": 0.01029353, "balance_loss_clip": 1.05608046, "balance_loss_mlp": 1.02152658, "epoch": 0.34654001082186014, "flos": 22382474970240.0, "grad_norm": 1.5964690922880143, "language_loss": 0.67680919, "learning_rate": 3.037981201519174e-06, "loss": 0.7005229, "num_input_tokens_seen": 61939520, "step": 2882, "time_per_iteration": 2.733980178833008 }, { "auxiliary_loss_clip": 0.01250268, "auxiliary_loss_mlp": 0.01031755, "balance_loss_clip": 1.06183374, "balance_loss_mlp": 1.023947, "epoch": 0.34666025371249926, "flos": 19573901614080.0, "grad_norm": 1.8904672104716467, "language_loss": 0.70919919, "learning_rate": 3.0373152734628175e-06, "loss": 0.73201942, "num_input_tokens_seen": 61957800, "step": 2883, "time_per_iteration": 2.613339424133301 }, { "auxiliary_loss_clip": 0.01241704, "auxiliary_loss_mlp": 0.01031207, "balance_loss_clip": 1.05365396, "balance_loss_mlp": 1.02411723, "epoch": 0.34678049660313837, "flos": 15267637751040.0, "grad_norm": 1.975680279799148, "language_loss": 0.76342285, "learning_rate": 3.0366491880465584e-06, "loss": 0.78615189, "num_input_tokens_seen": 61975820, "step": 2884, "time_per_iteration": 2.612025022506714 }, { "auxiliary_loss_clip": 0.01204697, "auxiliary_loss_mlp": 0.01024879, "balance_loss_clip": 1.06276393, "balance_loss_mlp": 1.01614118, "epoch": 0.3469007394937774, "flos": 21181550630400.0, "grad_norm": 1.6325619020318005, "language_loss": 0.82266486, "learning_rate": 3.035982945371443e-06, "loss": 0.84496069, "num_input_tokens_seen": 61997515, "step": 2885, "time_per_iteration": 2.700660467147827 }, { "auxiliary_loss_clip": 0.01308684, "auxiliary_loss_mlp": 0.01027681, "balance_loss_clip": 1.0584439, "balance_loss_mlp": 1.01965535, "epoch": 0.34702098238441653, "flos": 22375471818240.0, "grad_norm": 2.4815026347840563, "language_loss": 0.85827732, "learning_rate": 3.035316545538537e-06, "loss": 0.88164103, "num_input_tokens_seen": 62016310, "step": 2886, "time_per_iteration": 2.6735472679138184 }, { "auxiliary_loss_clip": 0.01298341, "auxiliary_loss_mlp": 0.01035879, "balance_loss_clip": 1.06119943, "balance_loss_mlp": 1.02814829, "epoch": 0.3471412252750556, "flos": 22929430343040.0, "grad_norm": 2.0153273931868716, "language_loss": 0.80154729, "learning_rate": 3.034649988648935e-06, "loss": 0.82488954, "num_input_tokens_seen": 62036075, "step": 2887, "time_per_iteration": 2.7298243045806885 }, { "auxiliary_loss_clip": 0.01298858, "auxiliary_loss_mlp": 0.01025001, "balance_loss_clip": 1.05598998, "balance_loss_mlp": 1.01666856, "epoch": 0.3472614681656947, "flos": 21324259365120.0, "grad_norm": 1.9382147174370683, "language_loss": 0.80804759, "learning_rate": 3.033983274803752e-06, "loss": 0.83128619, "num_input_tokens_seen": 62055865, "step": 2888, "time_per_iteration": 2.717029571533203 }, { "auxiliary_loss_clip": 0.01298808, "auxiliary_loss_mlp": 0.01029226, "balance_loss_clip": 1.05499673, "balance_loss_mlp": 1.02111959, "epoch": 0.3473817110563338, "flos": 23475739271040.0, "grad_norm": 3.141783589045261, "language_loss": 0.72760749, "learning_rate": 3.0333164041041283e-06, "loss": 0.75088787, "num_input_tokens_seen": 62072180, "step": 2889, "time_per_iteration": 2.7648487091064453 }, { "auxiliary_loss_clip": 0.01452178, "auxiliary_loss_mlp": 0.01033539, "balance_loss_clip": 1.05425119, "balance_loss_mlp": 1.02588856, "epoch": 0.34750195394697286, "flos": 22346025644160.0, "grad_norm": 1.7387916483392476, "language_loss": 0.72238022, "learning_rate": 3.032649376651228e-06, "loss": 0.74723732, "num_input_tokens_seen": 62091600, "step": 2890, "time_per_iteration": 2.8098206520080566 }, { "auxiliary_loss_clip": 0.01349666, "auxiliary_loss_mlp": 0.01031479, "balance_loss_clip": 1.05611324, "balance_loss_mlp": 1.02340269, "epoch": 0.347622196837612, "flos": 29095004885760.0, "grad_norm": 1.630708424129234, "language_loss": 0.76302791, "learning_rate": 3.031982192546238e-06, "loss": 0.78683937, "num_input_tokens_seen": 62114695, "step": 2891, "time_per_iteration": 2.811267375946045 }, { "auxiliary_loss_clip": 0.01250344, "auxiliary_loss_mlp": 0.01028581, "balance_loss_clip": 1.05764449, "balance_loss_mlp": 1.02099967, "epoch": 0.3477424397282511, "flos": 22455732758400.0, "grad_norm": 2.1834125088658154, "language_loss": 0.95394766, "learning_rate": 3.0313148518903696e-06, "loss": 0.9767369, "num_input_tokens_seen": 62134520, "step": 2892, "time_per_iteration": 2.675196409225464 }, { "auxiliary_loss_clip": 0.0130872, "auxiliary_loss_mlp": 0.01031222, "balance_loss_clip": 1.06150341, "balance_loss_mlp": 1.02349138, "epoch": 0.34786268261889014, "flos": 15778790242560.0, "grad_norm": 3.5922252334808937, "language_loss": 0.80942374, "learning_rate": 3.030647354784859e-06, "loss": 0.83282316, "num_input_tokens_seen": 62151560, "step": 2893, "time_per_iteration": 2.679007053375244 }, { "auxiliary_loss_clip": 0.01340159, "auxiliary_loss_mlp": 0.01026233, "balance_loss_clip": 1.05248487, "balance_loss_mlp": 1.01864588, "epoch": 0.34798292550952925, "flos": 20777627214720.0, "grad_norm": 2.0219135173934824, "language_loss": 0.77761233, "learning_rate": 3.029979701330964e-06, "loss": 0.80127627, "num_input_tokens_seen": 62170985, "step": 2894, "time_per_iteration": 2.77066969871521 }, { "auxiliary_loss_clip": 0.01303838, "auxiliary_loss_mlp": 0.01029298, "balance_loss_clip": 1.05746698, "balance_loss_mlp": 1.02122784, "epoch": 0.34810316840016836, "flos": 19937820257280.0, "grad_norm": 2.0941642795206508, "language_loss": 0.80150127, "learning_rate": 3.029311891629966e-06, "loss": 0.82483256, "num_input_tokens_seen": 62189440, "step": 2895, "time_per_iteration": 2.6482183933258057 }, { "auxiliary_loss_clip": 0.01297254, "auxiliary_loss_mlp": 0.0102529, "balance_loss_clip": 1.05899191, "balance_loss_mlp": 1.01720142, "epoch": 0.3482234112908074, "flos": 23623296341760.0, "grad_norm": 2.144632102169098, "language_loss": 0.74256647, "learning_rate": 3.0286439257831744e-06, "loss": 0.76579189, "num_input_tokens_seen": 62208910, "step": 2896, "time_per_iteration": 2.7423503398895264 }, { "auxiliary_loss_clip": 0.01203669, "auxiliary_loss_mlp": 0.01036787, "balance_loss_clip": 1.06010556, "balance_loss_mlp": 1.02841878, "epoch": 0.3483436541814465, "flos": 23986712194560.0, "grad_norm": 2.5929317651061594, "language_loss": 0.7171495, "learning_rate": 3.0279758038919156e-06, "loss": 0.73955411, "num_input_tokens_seen": 62227135, "step": 2897, "time_per_iteration": 3.6103627681732178 }, { "auxiliary_loss_clip": 0.01251128, "auxiliary_loss_mlp": 0.01027516, "balance_loss_clip": 1.06073165, "balance_loss_mlp": 1.01942182, "epoch": 0.34846389707208564, "flos": 22638338524800.0, "grad_norm": 1.9585543513408568, "language_loss": 0.78764915, "learning_rate": 3.0273075260575455e-06, "loss": 0.81043565, "num_input_tokens_seen": 62246035, "step": 2898, "time_per_iteration": 3.5402910709381104 }, { "auxiliary_loss_clip": 0.01304913, "auxiliary_loss_mlp": 0.01026388, "balance_loss_clip": 1.05710649, "balance_loss_mlp": 1.01770711, "epoch": 0.3485841399627247, "flos": 21792857218560.0, "grad_norm": 1.9284678994522868, "language_loss": 0.81073809, "learning_rate": 3.0266390923814396e-06, "loss": 0.83405107, "num_input_tokens_seen": 62264095, "step": 2899, "time_per_iteration": 2.7446138858795166 }, { "auxiliary_loss_clip": 0.01312145, "auxiliary_loss_mlp": 0.01028438, "balance_loss_clip": 1.06516707, "balance_loss_mlp": 1.02009296, "epoch": 0.3487043828533638, "flos": 17019036996480.0, "grad_norm": 1.8776095090856384, "language_loss": 0.81982636, "learning_rate": 3.0259705029650008e-06, "loss": 0.84323215, "num_input_tokens_seen": 62282025, "step": 2900, "time_per_iteration": 2.6701841354370117 }, { "auxiliary_loss_clip": 0.01254496, "auxiliary_loss_mlp": 0.01027235, "balance_loss_clip": 1.0598371, "balance_loss_mlp": 1.01977229, "epoch": 0.34882462574400286, "flos": 22601135013120.0, "grad_norm": 1.6488953251531697, "language_loss": 0.72806543, "learning_rate": 3.025301757909652e-06, "loss": 0.75088274, "num_input_tokens_seen": 62302220, "step": 2901, "time_per_iteration": 2.7365505695343018 }, { "auxiliary_loss_clip": 0.01345338, "auxiliary_loss_mlp": 0.02575989, "balance_loss_clip": 1.05547428, "balance_loss_mlp": 1.00009525, "epoch": 0.34894486863464197, "flos": 29861518141440.0, "grad_norm": 1.6366392487946995, "language_loss": 0.80826539, "learning_rate": 3.024632857316842e-06, "loss": 0.84747863, "num_input_tokens_seen": 62323535, "step": 2902, "time_per_iteration": 2.774203062057495 }, { "auxiliary_loss_clip": 0.01250702, "auxiliary_loss_mlp": 0.0102935, "balance_loss_clip": 1.05986404, "balance_loss_mlp": 1.02059364, "epoch": 0.3490651115252811, "flos": 22122265870080.0, "grad_norm": 1.7389674195137272, "language_loss": 0.77799773, "learning_rate": 3.0239638012880412e-06, "loss": 0.8007983, "num_input_tokens_seen": 62343430, "step": 2903, "time_per_iteration": 2.7017650604248047 }, { "auxiliary_loss_clip": 0.01391587, "auxiliary_loss_mlp": 0.0103023, "balance_loss_clip": 1.05099428, "balance_loss_mlp": 1.02164733, "epoch": 0.34918535441592014, "flos": 12676682943360.0, "grad_norm": 2.922504252998511, "language_loss": 0.81347024, "learning_rate": 3.0232945899247466e-06, "loss": 0.83768845, "num_input_tokens_seen": 62360365, "step": 2904, "time_per_iteration": 3.660767078399658 }, { "auxiliary_loss_clip": 0.01255085, "auxiliary_loss_mlp": 0.01029683, "balance_loss_clip": 1.05915058, "balance_loss_mlp": 1.02162468, "epoch": 0.34930559730655925, "flos": 23185617120000.0, "grad_norm": 2.2548856985989953, "language_loss": 0.77468705, "learning_rate": 3.022625223328476e-06, "loss": 0.79753476, "num_input_tokens_seen": 62382105, "step": 2905, "time_per_iteration": 2.7253222465515137 }, { "auxiliary_loss_clip": 0.01169996, "auxiliary_loss_mlp": 0.01029358, "balance_loss_clip": 1.06373179, "balance_loss_mlp": 1.02045918, "epoch": 0.34942584019719836, "flos": 22855023319680.0, "grad_norm": 1.37707164140349, "language_loss": 0.69126767, "learning_rate": 3.0219557016007723e-06, "loss": 0.71326125, "num_input_tokens_seen": 62402235, "step": 2906, "time_per_iteration": 3.599968194961548 }, { "auxiliary_loss_clip": 0.01254463, "auxiliary_loss_mlp": 0.01029837, "balance_loss_clip": 1.0623858, "balance_loss_mlp": 1.02144432, "epoch": 0.3495460830878374, "flos": 24426043441920.0, "grad_norm": 2.210288308125198, "language_loss": 0.7000367, "learning_rate": 3.021286024843202e-06, "loss": 0.72287971, "num_input_tokens_seen": 62420430, "step": 2907, "time_per_iteration": 2.6889326572418213 }, { "auxiliary_loss_clip": 0.01098174, "auxiliary_loss_mlp": 0.0100536, "balance_loss_clip": 1.02917361, "balance_loss_mlp": 1.00400662, "epoch": 0.3496663259784765, "flos": 70008749389440.0, "grad_norm": 1.0864220020905129, "language_loss": 0.64809883, "learning_rate": 3.0206161931573526e-06, "loss": 0.66913414, "num_input_tokens_seen": 62472980, "step": 2908, "time_per_iteration": 3.127211809158325 }, { "auxiliary_loss_clip": 0.01296771, "auxiliary_loss_mlp": 0.01029911, "balance_loss_clip": 1.05397987, "balance_loss_mlp": 1.02147746, "epoch": 0.34978656886911563, "flos": 28692805322880.0, "grad_norm": 1.8995439473427054, "language_loss": 0.93223166, "learning_rate": 3.0199462066448388e-06, "loss": 0.95549846, "num_input_tokens_seen": 62495175, "step": 2909, "time_per_iteration": 2.747288942337036 }, { "auxiliary_loss_clip": 0.0125543, "auxiliary_loss_mlp": 0.01034507, "balance_loss_clip": 1.06257486, "balance_loss_mlp": 1.02619171, "epoch": 0.3499068117597547, "flos": 21142156389120.0, "grad_norm": 2.264444772207905, "language_loss": 0.68924975, "learning_rate": 3.019276065407296e-06, "loss": 0.71214914, "num_input_tokens_seen": 62514295, "step": 2910, "time_per_iteration": 2.674916982650757 }, { "auxiliary_loss_clip": 0.01395562, "auxiliary_loss_mlp": 0.0103483, "balance_loss_clip": 1.05146444, "balance_loss_mlp": 1.02618098, "epoch": 0.3500270546503938, "flos": 22782699285120.0, "grad_norm": 1.7542974990852325, "language_loss": 0.80792344, "learning_rate": 3.018605769546385e-06, "loss": 0.83222735, "num_input_tokens_seen": 62534850, "step": 2911, "time_per_iteration": 2.838106155395508 }, { "auxiliary_loss_clip": 0.0125108, "auxiliary_loss_mlp": 0.0103038, "balance_loss_clip": 1.05665517, "balance_loss_mlp": 1.02198124, "epoch": 0.3501472975410329, "flos": 22894058424960.0, "grad_norm": 2.221595207219945, "language_loss": 0.79188776, "learning_rate": 3.017935319163788e-06, "loss": 0.81470239, "num_input_tokens_seen": 62553810, "step": 2912, "time_per_iteration": 2.6216273307800293 }, { "auxiliary_loss_clip": 0.01254419, "auxiliary_loss_mlp": 0.01031761, "balance_loss_clip": 1.0605607, "balance_loss_mlp": 1.02278483, "epoch": 0.35026754043167196, "flos": 25446588658560.0, "grad_norm": 1.8832301096803592, "language_loss": 0.70397651, "learning_rate": 3.017264714361213e-06, "loss": 0.72683835, "num_input_tokens_seen": 62573460, "step": 2913, "time_per_iteration": 2.8792366981506348 }, { "auxiliary_loss_clip": 0.0129837, "auxiliary_loss_mlp": 0.02575759, "balance_loss_clip": 1.05639052, "balance_loss_mlp": 1.00016546, "epoch": 0.3503877833223111, "flos": 19573757959680.0, "grad_norm": 2.199930549883388, "language_loss": 0.82045829, "learning_rate": 3.016593955240389e-06, "loss": 0.85919958, "num_input_tokens_seen": 62592150, "step": 2914, "time_per_iteration": 2.713514566421509 }, { "auxiliary_loss_clip": 0.01145843, "auxiliary_loss_mlp": 0.00997123, "balance_loss_clip": 1.02746487, "balance_loss_mlp": 0.99580592, "epoch": 0.3505080262129502, "flos": 65072075880960.0, "grad_norm": 0.8217401868922278, "language_loss": 0.63616377, "learning_rate": 3.015923041903071e-06, "loss": 0.65759349, "num_input_tokens_seen": 62658275, "step": 2915, "time_per_iteration": 3.2971999645233154 }, { "auxiliary_loss_clip": 0.01251588, "auxiliary_loss_mlp": 0.01030242, "balance_loss_clip": 1.06070924, "balance_loss_mlp": 1.02246416, "epoch": 0.35062826910358924, "flos": 29314562768640.0, "grad_norm": 2.2313131008273985, "language_loss": 0.83377379, "learning_rate": 3.0152519744510347e-06, "loss": 0.85659212, "num_input_tokens_seen": 62678075, "step": 2916, "time_per_iteration": 2.7556324005126953 }, { "auxiliary_loss_clip": 0.01348038, "auxiliary_loss_mlp": 0.01034165, "balance_loss_clip": 1.05224216, "balance_loss_mlp": 1.02595711, "epoch": 0.35074851199422835, "flos": 23987717775360.0, "grad_norm": 3.9064656588883357, "language_loss": 0.82569265, "learning_rate": 3.014580752986081e-06, "loss": 0.84951472, "num_input_tokens_seen": 62696950, "step": 2917, "time_per_iteration": 2.7637572288513184 }, { "auxiliary_loss_clip": 0.0139969, "auxiliary_loss_mlp": 0.0104078, "balance_loss_clip": 1.05637908, "balance_loss_mlp": 1.03222096, "epoch": 0.3508687548848674, "flos": 15224436668160.0, "grad_norm": 2.363804104669686, "language_loss": 0.78995574, "learning_rate": 3.0139093776100345e-06, "loss": 0.81436038, "num_input_tokens_seen": 62713540, "step": 2918, "time_per_iteration": 2.792764186859131 }, { "auxiliary_loss_clip": 0.01198148, "auxiliary_loss_mlp": 0.0103157, "balance_loss_clip": 1.05866432, "balance_loss_mlp": 1.02332699, "epoch": 0.3509889977755065, "flos": 21361750185600.0, "grad_norm": 1.6567816407604703, "language_loss": 0.75324911, "learning_rate": 3.013237848424741e-06, "loss": 0.77554625, "num_input_tokens_seen": 62732925, "step": 2919, "time_per_iteration": 2.667980432510376 }, { "auxiliary_loss_clip": 0.01304289, "auxiliary_loss_mlp": 0.01032801, "balance_loss_clip": 1.0607357, "balance_loss_mlp": 1.02467704, "epoch": 0.35110924066614563, "flos": 19135360465920.0, "grad_norm": 2.3453746823652937, "language_loss": 0.75217903, "learning_rate": 3.012566165532072e-06, "loss": 0.77554989, "num_input_tokens_seen": 62751715, "step": 2920, "time_per_iteration": 2.7625412940979004 }, { "auxiliary_loss_clip": 0.01360942, "auxiliary_loss_mlp": 0.01034964, "balance_loss_clip": 1.05101812, "balance_loss_mlp": 1.02657723, "epoch": 0.3512294835567847, "flos": 21980885938560.0, "grad_norm": 2.3020050930763025, "language_loss": 0.76573831, "learning_rate": 3.0118943290339207e-06, "loss": 0.78969741, "num_input_tokens_seen": 62771925, "step": 2921, "time_per_iteration": 2.8298134803771973 }, { "auxiliary_loss_clip": 0.01339494, "auxiliary_loss_mlp": 0.01030906, "balance_loss_clip": 1.04976106, "balance_loss_mlp": 1.02159619, "epoch": 0.3513497264474238, "flos": 17817294896640.0, "grad_norm": 2.137480331227174, "language_loss": 0.68013978, "learning_rate": 3.011222339032204e-06, "loss": 0.70384377, "num_input_tokens_seen": 62790075, "step": 2922, "time_per_iteration": 3.5904178619384766 }, { "auxiliary_loss_clip": 0.01200829, "auxiliary_loss_mlp": 0.01032088, "balance_loss_clip": 1.06027532, "balance_loss_mlp": 1.02383888, "epoch": 0.3514699693380629, "flos": 26943417239040.0, "grad_norm": 1.9085507078779673, "language_loss": 0.69318336, "learning_rate": 3.0105501956288626e-06, "loss": 0.71551251, "num_input_tokens_seen": 62810545, "step": 2923, "time_per_iteration": 2.6774535179138184 }, { "auxiliary_loss_clip": 0.01255439, "auxiliary_loss_mlp": 0.01032625, "balance_loss_clip": 1.05775332, "balance_loss_mlp": 1.02380943, "epoch": 0.35159021222870196, "flos": 15267565923840.0, "grad_norm": 2.0400598138346617, "language_loss": 0.72812152, "learning_rate": 3.0098778989258602e-06, "loss": 0.75100219, "num_input_tokens_seen": 62829155, "step": 2924, "time_per_iteration": 3.623885154724121 }, { "auxiliary_loss_clip": 0.01350631, "auxiliary_loss_mlp": 0.01035948, "balance_loss_clip": 1.05675507, "balance_loss_mlp": 1.02763939, "epoch": 0.35171045511934107, "flos": 13984154000640.0, "grad_norm": 2.8782503288087855, "language_loss": 0.88061786, "learning_rate": 3.009205449025183e-06, "loss": 0.90448362, "num_input_tokens_seen": 62845350, "step": 2925, "time_per_iteration": 2.7012813091278076 }, { "auxiliary_loss_clip": 0.01255185, "auxiliary_loss_mlp": 0.01032709, "balance_loss_clip": 1.05134571, "balance_loss_mlp": 1.02482939, "epoch": 0.3518306980099802, "flos": 14283434119680.0, "grad_norm": 1.9603742810008622, "language_loss": 0.63394737, "learning_rate": 3.008532846028842e-06, "loss": 0.65682626, "num_input_tokens_seen": 62862110, "step": 2926, "time_per_iteration": 2.679356098175049 }, { "auxiliary_loss_clip": 0.01201636, "auxiliary_loss_mlp": 0.01032599, "balance_loss_clip": 1.05974245, "balance_loss_mlp": 1.02376509, "epoch": 0.35195094090061924, "flos": 27052872958080.0, "grad_norm": 2.519872987984773, "language_loss": 0.71885729, "learning_rate": 3.0078600900388694e-06, "loss": 0.74119961, "num_input_tokens_seen": 62882415, "step": 2927, "time_per_iteration": 2.7495343685150146 }, { "auxiliary_loss_clip": 0.0133947, "auxiliary_loss_mlp": 0.01038234, "balance_loss_clip": 1.04775882, "balance_loss_mlp": 1.02905464, "epoch": 0.35207118379125835, "flos": 25629266252160.0, "grad_norm": 2.3858668801231513, "language_loss": 0.7390005, "learning_rate": 3.007187181157323e-06, "loss": 0.76277751, "num_input_tokens_seen": 62902425, "step": 2928, "time_per_iteration": 2.7449352741241455 }, { "auxiliary_loss_clip": 0.01431698, "auxiliary_loss_mlp": 0.01036354, "balance_loss_clip": 1.04583287, "balance_loss_mlp": 1.02801561, "epoch": 0.35219142668189746, "flos": 18004713085440.0, "grad_norm": 2.4188780185297794, "language_loss": 0.67821825, "learning_rate": 3.006514119486282e-06, "loss": 0.70289874, "num_input_tokens_seen": 62919255, "step": 2929, "time_per_iteration": 3.7007052898406982 }, { "auxiliary_loss_clip": 0.01341614, "auxiliary_loss_mlp": 0.0103434, "balance_loss_clip": 1.05168009, "balance_loss_mlp": 1.02597773, "epoch": 0.3523116695725365, "flos": 14028109269120.0, "grad_norm": 1.806421691674182, "language_loss": 0.69705331, "learning_rate": 3.005840905127849e-06, "loss": 0.7208128, "num_input_tokens_seen": 62936160, "step": 2930, "time_per_iteration": 2.749537229537964 }, { "auxiliary_loss_clip": 0.01201961, "auxiliary_loss_mlp": 0.01037396, "balance_loss_clip": 1.0615716, "balance_loss_mlp": 1.02931905, "epoch": 0.3524319124631756, "flos": 21433966479360.0, "grad_norm": 2.26353440054525, "language_loss": 0.87086499, "learning_rate": 3.0051675381841516e-06, "loss": 0.89325857, "num_input_tokens_seen": 62953470, "step": 2931, "time_per_iteration": 2.639583110809326 }, { "auxiliary_loss_clip": 0.01403831, "auxiliary_loss_mlp": 0.02573991, "balance_loss_clip": 1.04534459, "balance_loss_mlp": 1.00021839, "epoch": 0.3525521553538147, "flos": 26322773114880.0, "grad_norm": 1.7621734849814825, "language_loss": 0.76974648, "learning_rate": 3.0044940187573363e-06, "loss": 0.80952466, "num_input_tokens_seen": 62974480, "step": 2932, "time_per_iteration": 3.8181819915771484 }, { "auxiliary_loss_clip": 0.01249345, "auxiliary_loss_mlp": 0.01033008, "balance_loss_clip": 1.05483556, "balance_loss_mlp": 1.02474129, "epoch": 0.3526723982444538, "flos": 21543314457600.0, "grad_norm": 2.3978450971591454, "language_loss": 0.64934766, "learning_rate": 3.003820346949578e-06, "loss": 0.67217118, "num_input_tokens_seen": 62992560, "step": 2933, "time_per_iteration": 2.772486448287964 }, { "auxiliary_loss_clip": 0.011986, "auxiliary_loss_mlp": 0.01034771, "balance_loss_clip": 1.05647361, "balance_loss_mlp": 1.02656937, "epoch": 0.3527926411350929, "flos": 23733649900800.0, "grad_norm": 2.214218708568085, "language_loss": 0.79693305, "learning_rate": 3.003146522863071e-06, "loss": 0.8192668, "num_input_tokens_seen": 63013445, "step": 2934, "time_per_iteration": 2.5668749809265137 }, { "auxiliary_loss_clip": 0.01298729, "auxiliary_loss_mlp": 0.01033571, "balance_loss_clip": 1.05848837, "balance_loss_mlp": 1.02569723, "epoch": 0.35291288402573195, "flos": 30445461544320.0, "grad_norm": 2.244986959263357, "language_loss": 0.8617748, "learning_rate": 3.0024725466000345e-06, "loss": 0.88509774, "num_input_tokens_seen": 63033400, "step": 2935, "time_per_iteration": 2.7575132846832275 }, { "auxiliary_loss_clip": 0.01249105, "auxiliary_loss_mlp": 0.01026411, "balance_loss_clip": 1.05937684, "balance_loss_mlp": 1.01819754, "epoch": 0.35303312691637107, "flos": 23112179763840.0, "grad_norm": 1.7099310374129788, "language_loss": 0.7855444, "learning_rate": 3.0017984182627087e-06, "loss": 0.8082996, "num_input_tokens_seen": 63052725, "step": 2936, "time_per_iteration": 2.625941514968872 }, { "auxiliary_loss_clip": 0.01349204, "auxiliary_loss_mlp": 0.02574059, "balance_loss_clip": 1.05251586, "balance_loss_mlp": 1.00025392, "epoch": 0.3531533698070102, "flos": 21835699165440.0, "grad_norm": 1.9818999003388524, "language_loss": 0.82523179, "learning_rate": 3.00112413795336e-06, "loss": 0.86446428, "num_input_tokens_seen": 63072560, "step": 2937, "time_per_iteration": 2.746152400970459 }, { "auxiliary_loss_clip": 0.01296868, "auxiliary_loss_mlp": 0.01028187, "balance_loss_clip": 1.05103421, "balance_loss_mlp": 1.01951408, "epoch": 0.35327361269764923, "flos": 15778969810560.0, "grad_norm": 2.1066074923463294, "language_loss": 0.8044014, "learning_rate": 3.000449705774275e-06, "loss": 0.82765198, "num_input_tokens_seen": 63090800, "step": 2938, "time_per_iteration": 2.6534218788146973 }, { "auxiliary_loss_clip": 0.01249986, "auxiliary_loss_mlp": 0.01026797, "balance_loss_clip": 1.058918, "balance_loss_mlp": 1.01864946, "epoch": 0.35339385558828834, "flos": 22090413484800.0, "grad_norm": 1.9000180673906124, "language_loss": 0.71594846, "learning_rate": 2.9997751218277654e-06, "loss": 0.73871624, "num_input_tokens_seen": 63108955, "step": 2939, "time_per_iteration": 2.681589365005493 }, { "auxiliary_loss_clip": 0.01201356, "auxiliary_loss_mlp": 0.01027755, "balance_loss_clip": 1.06019497, "balance_loss_mlp": 1.01930285, "epoch": 0.35351409847892745, "flos": 24165008328960.0, "grad_norm": 2.1233293421578634, "language_loss": 0.78059334, "learning_rate": 2.999100386216166e-06, "loss": 0.80288446, "num_input_tokens_seen": 63127895, "step": 2940, "time_per_iteration": 2.6055612564086914 }, { "auxiliary_loss_clip": 0.0129766, "auxiliary_loss_mlp": 0.01029761, "balance_loss_clip": 1.05607712, "balance_loss_mlp": 1.0215292, "epoch": 0.3536343413695665, "flos": 27052298340480.0, "grad_norm": 1.731475548266655, "language_loss": 0.74133748, "learning_rate": 2.998425499041831e-06, "loss": 0.76461172, "num_input_tokens_seen": 63148410, "step": 2941, "time_per_iteration": 2.7447891235351562 }, { "auxiliary_loss_clip": 0.01140722, "auxiliary_loss_mlp": 0.01010327, "balance_loss_clip": 1.02226949, "balance_loss_mlp": 1.00913513, "epoch": 0.3537545842602056, "flos": 65991066370560.0, "grad_norm": 1.2735930314002386, "language_loss": 0.64562643, "learning_rate": 2.997750460407142e-06, "loss": 0.66713691, "num_input_tokens_seen": 63209765, "step": 2942, "time_per_iteration": 3.2823379039764404 }, { "auxiliary_loss_clip": 0.01350039, "auxiliary_loss_mlp": 0.01026124, "balance_loss_clip": 1.05077016, "balance_loss_mlp": 1.0169096, "epoch": 0.35387482715084473, "flos": 18436897526400.0, "grad_norm": 2.339887514320975, "language_loss": 0.70114267, "learning_rate": 2.997075270414501e-06, "loss": 0.7249043, "num_input_tokens_seen": 63226980, "step": 2943, "time_per_iteration": 2.8193156719207764 }, { "auxiliary_loss_clip": 0.01195145, "auxiliary_loss_mlp": 0.01008533, "balance_loss_clip": 1.02402544, "balance_loss_mlp": 1.00731683, "epoch": 0.3539950700414838, "flos": 65588579498880.0, "grad_norm": 0.7053579189208251, "language_loss": 0.57726681, "learning_rate": 2.9963999291663347e-06, "loss": 0.5993036, "num_input_tokens_seen": 63292760, "step": 2944, "time_per_iteration": 3.260542869567871 }, { "auxiliary_loss_clip": 0.01390379, "auxiliary_loss_mlp": 0.01032222, "balance_loss_clip": 1.05393124, "balance_loss_mlp": 1.02450335, "epoch": 0.3541153129321229, "flos": 20521655919360.0, "grad_norm": 2.361853224914991, "language_loss": 0.73921335, "learning_rate": 2.9957244367650915e-06, "loss": 0.7634393, "num_input_tokens_seen": 63309005, "step": 2945, "time_per_iteration": 2.759427785873413 }, { "auxiliary_loss_clip": 0.0140134, "auxiliary_loss_mlp": 0.0103167, "balance_loss_clip": 1.05495906, "balance_loss_mlp": 1.02385044, "epoch": 0.354235555822762, "flos": 19573578391680.0, "grad_norm": 1.7235788882184087, "language_loss": 0.8387177, "learning_rate": 2.9950487933132425e-06, "loss": 0.86304784, "num_input_tokens_seen": 63326420, "step": 2946, "time_per_iteration": 2.987109899520874 }, { "auxiliary_loss_clip": 0.01256034, "auxiliary_loss_mlp": 0.01033431, "balance_loss_clip": 1.05944991, "balance_loss_mlp": 1.02521491, "epoch": 0.35435579871340106, "flos": 20777268078720.0, "grad_norm": 2.4431819094064062, "language_loss": 0.7193771, "learning_rate": 2.994372998913283e-06, "loss": 0.74227178, "num_input_tokens_seen": 63344925, "step": 2947, "time_per_iteration": 2.682549238204956 }, { "auxiliary_loss_clip": 0.01297321, "auxiliary_loss_mlp": 0.01026116, "balance_loss_clip": 1.05748618, "balance_loss_mlp": 1.01834393, "epoch": 0.35447604160404017, "flos": 23951807153280.0, "grad_norm": 3.4698281544662, "language_loss": 0.62770754, "learning_rate": 2.99369705366773e-06, "loss": 0.65094197, "num_input_tokens_seen": 63365170, "step": 2948, "time_per_iteration": 3.67044997215271 }, { "auxiliary_loss_clip": 0.01292924, "auxiliary_loss_mlp": 0.01029159, "balance_loss_clip": 1.05479431, "balance_loss_mlp": 1.0215174, "epoch": 0.3545962844946792, "flos": 23435662671360.0, "grad_norm": 2.272329350863742, "language_loss": 0.82378089, "learning_rate": 2.9930209576791244e-06, "loss": 0.84700167, "num_input_tokens_seen": 63383645, "step": 2949, "time_per_iteration": 2.7504618167877197 }, { "auxiliary_loss_clip": 0.01242156, "auxiliary_loss_mlp": 0.01028602, "balance_loss_clip": 1.05500674, "balance_loss_mlp": 1.02072251, "epoch": 0.35471652738531834, "flos": 22085134185600.0, "grad_norm": 2.0012296292381095, "language_loss": 0.63810778, "learning_rate": 2.9923447110500285e-06, "loss": 0.66081536, "num_input_tokens_seen": 63402390, "step": 2950, "time_per_iteration": 3.55464243888855 }, { "auxiliary_loss_clip": 0.01241412, "auxiliary_loss_mlp": 0.01027069, "balance_loss_clip": 1.05502033, "balance_loss_mlp": 1.01890945, "epoch": 0.35483677027595745, "flos": 27341881787520.0, "grad_norm": 1.4177018290837673, "language_loss": 0.75303149, "learning_rate": 2.9916683138830295e-06, "loss": 0.7757163, "num_input_tokens_seen": 63423055, "step": 2951, "time_per_iteration": 2.7042813301086426 }, { "auxiliary_loss_clip": 0.0129459, "auxiliary_loss_mlp": 0.01030945, "balance_loss_clip": 1.05647469, "balance_loss_mlp": 1.022892, "epoch": 0.3549570131665965, "flos": 13516166678400.0, "grad_norm": 2.037661827915762, "language_loss": 0.81105608, "learning_rate": 2.9909917662807353e-06, "loss": 0.83431137, "num_input_tokens_seen": 63440855, "step": 2952, "time_per_iteration": 2.695880174636841 }, { "auxiliary_loss_clip": 0.01244415, "auxiliary_loss_mlp": 0.01029196, "balance_loss_clip": 1.05634296, "balance_loss_mlp": 1.02092004, "epoch": 0.3550772560572356, "flos": 20887549810560.0, "grad_norm": 2.3558241636632413, "language_loss": 0.69033897, "learning_rate": 2.9903150683457783e-06, "loss": 0.7130751, "num_input_tokens_seen": 63459400, "step": 2953, "time_per_iteration": 2.694754123687744 }, { "auxiliary_loss_clip": 0.01204998, "auxiliary_loss_mlp": 0.01035912, "balance_loss_clip": 1.05476391, "balance_loss_mlp": 1.02804434, "epoch": 0.3551974989478747, "flos": 20194042947840.0, "grad_norm": 2.0910234547330413, "language_loss": 0.65692914, "learning_rate": 2.9896382201808126e-06, "loss": 0.67933822, "num_input_tokens_seen": 63476800, "step": 2954, "time_per_iteration": 2.7765626907348633 }, { "auxiliary_loss_clip": 0.01199604, "auxiliary_loss_mlp": 0.01034808, "balance_loss_clip": 1.05809021, "balance_loss_mlp": 1.02623725, "epoch": 0.3553177418385138, "flos": 19828831415040.0, "grad_norm": 2.4342032491161096, "language_loss": 0.81209171, "learning_rate": 2.988961221888516e-06, "loss": 0.83443582, "num_input_tokens_seen": 63493475, "step": 2955, "time_per_iteration": 3.6689159870147705 }, { "auxiliary_loss_clip": 0.01338126, "auxiliary_loss_mlp": 0.01029265, "balance_loss_clip": 1.05035496, "balance_loss_mlp": 1.02126598, "epoch": 0.3554379847291529, "flos": 14829132516480.0, "grad_norm": 2.4899091048557205, "language_loss": 0.79314733, "learning_rate": 2.988284073571589e-06, "loss": 0.81682128, "num_input_tokens_seen": 63509560, "step": 2956, "time_per_iteration": 2.7968881130218506 }, { "auxiliary_loss_clip": 0.01251838, "auxiliary_loss_mlp": 0.02567856, "balance_loss_clip": 1.0592767, "balance_loss_mlp": 1.0001651, "epoch": 0.355558227619792, "flos": 20485350247680.0, "grad_norm": 2.344623167266212, "language_loss": 0.73605549, "learning_rate": 2.9876067753327528e-06, "loss": 0.77425253, "num_input_tokens_seen": 63527290, "step": 2957, "time_per_iteration": 2.6749179363250732 }, { "auxiliary_loss_clip": 0.01158613, "auxiliary_loss_mlp": 0.01027882, "balance_loss_clip": 1.05586815, "balance_loss_mlp": 1.019853, "epoch": 0.35567847051043106, "flos": 37663613256960.0, "grad_norm": 2.3004940671535348, "language_loss": 0.80557799, "learning_rate": 2.986929327274754e-06, "loss": 0.827443, "num_input_tokens_seen": 63547870, "step": 2958, "time_per_iteration": 3.6901650428771973 }, { "auxiliary_loss_clip": 0.01245049, "auxiliary_loss_mlp": 0.01031015, "balance_loss_clip": 1.05833793, "balance_loss_mlp": 1.0230577, "epoch": 0.35579871340107017, "flos": 26943058103040.0, "grad_norm": 1.6961911792437312, "language_loss": 0.79221648, "learning_rate": 2.9862517295003617e-06, "loss": 0.81497711, "num_input_tokens_seen": 63568285, "step": 2959, "time_per_iteration": 2.7463302612304688 }, { "auxiliary_loss_clip": 0.01346372, "auxiliary_loss_mlp": 0.01028545, "balance_loss_clip": 1.0520159, "balance_loss_mlp": 1.02043259, "epoch": 0.3559189562917093, "flos": 28293335193600.0, "grad_norm": 1.5558816365458525, "language_loss": 0.72387207, "learning_rate": 2.9855739821123654e-06, "loss": 0.74762118, "num_input_tokens_seen": 63589865, "step": 2960, "time_per_iteration": 2.7461211681365967 }, { "auxiliary_loss_clip": 0.01242475, "auxiliary_loss_mlp": 0.01030188, "balance_loss_clip": 1.05701363, "balance_loss_mlp": 1.02238607, "epoch": 0.35603919918234833, "flos": 25664063552640.0, "grad_norm": 1.9020468192867084, "language_loss": 0.82389641, "learning_rate": 2.98489608521358e-06, "loss": 0.846623, "num_input_tokens_seen": 63609805, "step": 2961, "time_per_iteration": 2.759153366088867 }, { "auxiliary_loss_clip": 0.012548, "auxiliary_loss_mlp": 0.02566659, "balance_loss_clip": 1.05948472, "balance_loss_mlp": 1.00024128, "epoch": 0.35615944207298744, "flos": 23000856537600.0, "grad_norm": 2.1627255666364675, "language_loss": 0.79832017, "learning_rate": 2.9842180389068425e-06, "loss": 0.83653474, "num_input_tokens_seen": 63627115, "step": 2962, "time_per_iteration": 2.6509788036346436 }, { "auxiliary_loss_clip": 0.01270821, "auxiliary_loss_mlp": 0.01004156, "balance_loss_clip": 1.04057419, "balance_loss_mlp": 1.00299346, "epoch": 0.35627968496362655, "flos": 68251283723520.0, "grad_norm": 0.8089236796170745, "language_loss": 0.59168649, "learning_rate": 2.98353984329501e-06, "loss": 0.61443621, "num_input_tokens_seen": 63691460, "step": 2963, "time_per_iteration": 3.30861234664917 }, { "auxiliary_loss_clip": 0.01294795, "auxiliary_loss_mlp": 0.01026723, "balance_loss_clip": 1.05708587, "balance_loss_mlp": 1.01875365, "epoch": 0.3563999278542656, "flos": 22641714403200.0, "grad_norm": 1.5421768292027147, "language_loss": 0.70437837, "learning_rate": 2.982861498480965e-06, "loss": 0.72759348, "num_input_tokens_seen": 63713840, "step": 2964, "time_per_iteration": 2.7424862384796143 }, { "auxiliary_loss_clip": 0.01341114, "auxiliary_loss_mlp": 0.01029739, "balance_loss_clip": 1.04865766, "balance_loss_mlp": 1.02194858, "epoch": 0.3565201707449047, "flos": 25952533678080.0, "grad_norm": 1.747494597500948, "language_loss": 0.83035445, "learning_rate": 2.9821830045676122e-06, "loss": 0.85406297, "num_input_tokens_seen": 63733540, "step": 2965, "time_per_iteration": 2.790914297103882 }, { "auxiliary_loss_clip": 0.01200699, "auxiliary_loss_mlp": 0.01030727, "balance_loss_clip": 1.06033838, "balance_loss_mlp": 1.02281785, "epoch": 0.3566404136355438, "flos": 28475725478400.0, "grad_norm": 1.810577063942442, "language_loss": 0.73600501, "learning_rate": 2.9815043616578793e-06, "loss": 0.75831926, "num_input_tokens_seen": 63754335, "step": 2966, "time_per_iteration": 2.713864326477051 }, { "auxiliary_loss_clip": 0.01345197, "auxiliary_loss_mlp": 0.01031221, "balance_loss_clip": 1.05172026, "balance_loss_mlp": 1.02256083, "epoch": 0.3567606565261829, "flos": 38363117690880.0, "grad_norm": 2.7182058955177824, "language_loss": 0.77022707, "learning_rate": 2.9808255698547145e-06, "loss": 0.79399133, "num_input_tokens_seen": 63777135, "step": 2967, "time_per_iteration": 2.8962297439575195 }, { "auxiliary_loss_clip": 0.01245552, "auxiliary_loss_mlp": 0.01026951, "balance_loss_clip": 1.05819809, "balance_loss_mlp": 1.01866555, "epoch": 0.356880899416822, "flos": 21981029592960.0, "grad_norm": 2.726898990487692, "language_loss": 0.795735, "learning_rate": 2.9801466292610913e-06, "loss": 0.81845999, "num_input_tokens_seen": 63797020, "step": 2968, "time_per_iteration": 2.7073915004730225 }, { "auxiliary_loss_clip": 0.01247444, "auxiliary_loss_mlp": 0.0103213, "balance_loss_clip": 1.0573951, "balance_loss_mlp": 1.02459598, "epoch": 0.35700114230746105, "flos": 18989132198400.0, "grad_norm": 2.1641190504190333, "language_loss": 0.81038547, "learning_rate": 2.979467539980003e-06, "loss": 0.83318126, "num_input_tokens_seen": 63813810, "step": 2969, "time_per_iteration": 2.6415023803710938 }, { "auxiliary_loss_clip": 0.01251995, "auxiliary_loss_mlp": 0.01029977, "balance_loss_clip": 1.05907428, "balance_loss_mlp": 1.02185285, "epoch": 0.35712138519810016, "flos": 19756112330880.0, "grad_norm": 1.8308594687598716, "language_loss": 0.76941317, "learning_rate": 2.978788302114468e-06, "loss": 0.79223287, "num_input_tokens_seen": 63830925, "step": 2970, "time_per_iteration": 2.7208497524261475 }, { "auxiliary_loss_clip": 0.01247784, "auxiliary_loss_mlp": 0.01032296, "balance_loss_clip": 1.05902362, "balance_loss_mlp": 1.02466679, "epoch": 0.35724162808873927, "flos": 35183012008320.0, "grad_norm": 2.687672071060123, "language_loss": 0.83277583, "learning_rate": 2.9781089157675255e-06, "loss": 0.85557663, "num_input_tokens_seen": 63849385, "step": 2971, "time_per_iteration": 2.8177216053009033 }, { "auxiliary_loss_clip": 0.01250867, "auxiliary_loss_mlp": 0.01026756, "balance_loss_clip": 1.06147063, "balance_loss_mlp": 1.01881111, "epoch": 0.3573618709793783, "flos": 25556726736000.0, "grad_norm": 3.8547058395599714, "language_loss": 0.88453019, "learning_rate": 2.977429381042238e-06, "loss": 0.90730637, "num_input_tokens_seen": 63870060, "step": 2972, "time_per_iteration": 2.7452831268310547 }, { "auxiliary_loss_clip": 0.01296845, "auxiliary_loss_mlp": 0.01031253, "balance_loss_clip": 1.05538273, "balance_loss_mlp": 1.0241363, "epoch": 0.35748211387001744, "flos": 29132352051840.0, "grad_norm": 2.6855890153259603, "language_loss": 0.88908601, "learning_rate": 2.9767496980416913e-06, "loss": 0.91236699, "num_input_tokens_seen": 63889355, "step": 2973, "time_per_iteration": 2.7648606300354004 }, { "auxiliary_loss_clip": 0.01291007, "auxiliary_loss_mlp": 0.01025684, "balance_loss_clip": 1.05235195, "balance_loss_mlp": 1.01772058, "epoch": 0.35760235676065655, "flos": 13954169122560.0, "grad_norm": 2.5106483207038033, "language_loss": 0.81181151, "learning_rate": 2.9760698668689914e-06, "loss": 0.83497834, "num_input_tokens_seen": 63905580, "step": 2974, "time_per_iteration": 3.7123777866363525 }, { "auxiliary_loss_clip": 0.01247694, "auxiliary_loss_mlp": 0.01024825, "balance_loss_clip": 1.05664515, "balance_loss_mlp": 1.01757145, "epoch": 0.3577225996512956, "flos": 44018688977280.0, "grad_norm": 1.9355895552522022, "language_loss": 0.71492374, "learning_rate": 2.975389887627269e-06, "loss": 0.73764896, "num_input_tokens_seen": 63928180, "step": 2975, "time_per_iteration": 2.894158363342285 }, { "auxiliary_loss_clip": 0.01351207, "auxiliary_loss_mlp": 0.01026476, "balance_loss_clip": 1.05627918, "balance_loss_mlp": 1.01879311, "epoch": 0.3578428425419347, "flos": 17055199013760.0, "grad_norm": 2.0979508680797223, "language_loss": 0.89808655, "learning_rate": 2.9747097604196764e-06, "loss": 0.92186332, "num_input_tokens_seen": 63944825, "step": 2976, "time_per_iteration": 3.742046356201172 }, { "auxiliary_loss_clip": 0.01301215, "auxiliary_loss_mlp": 0.01002212, "balance_loss_clip": 1.019889, "balance_loss_mlp": 1.00096059, "epoch": 0.3579630854325738, "flos": 71676550707840.0, "grad_norm": 0.6754156572029947, "language_loss": 0.56680131, "learning_rate": 2.9740294853493875e-06, "loss": 0.58983552, "num_input_tokens_seen": 64016385, "step": 2977, "time_per_iteration": 3.5512185096740723 }, { "auxiliary_loss_clip": 0.01398151, "auxiliary_loss_mlp": 0.01029634, "balance_loss_clip": 1.04983664, "balance_loss_mlp": 1.02216506, "epoch": 0.3580833283232129, "flos": 25046651652480.0, "grad_norm": 2.4463176358511047, "language_loss": 0.67477667, "learning_rate": 2.9733490625196008e-06, "loss": 0.69905448, "num_input_tokens_seen": 64036245, "step": 2978, "time_per_iteration": 2.822939395904541 }, { "auxiliary_loss_clip": 0.01342111, "auxiliary_loss_mlp": 0.01029363, "balance_loss_clip": 1.05362391, "balance_loss_mlp": 1.02194798, "epoch": 0.358203571213852, "flos": 13953127628160.0, "grad_norm": 3.08481580988248, "language_loss": 0.75779885, "learning_rate": 2.9726684920335353e-06, "loss": 0.78151357, "num_input_tokens_seen": 64054110, "step": 2979, "time_per_iteration": 2.700693368911743 }, { "auxiliary_loss_clip": 0.0120122, "auxiliary_loss_mlp": 0.02569941, "balance_loss_clip": 1.05955529, "balance_loss_mlp": 1.00014174, "epoch": 0.35832381410449105, "flos": 20302457172480.0, "grad_norm": 2.3539410676824986, "language_loss": 0.8217833, "learning_rate": 2.971987773994432e-06, "loss": 0.85949498, "num_input_tokens_seen": 64070295, "step": 2980, "time_per_iteration": 2.676730155944824 }, { "auxiliary_loss_clip": 0.01240335, "auxiliary_loss_mlp": 0.01032782, "balance_loss_clip": 1.05266678, "balance_loss_mlp": 1.02496147, "epoch": 0.35844405699513016, "flos": 16983234115200.0, "grad_norm": 2.0557398391177424, "language_loss": 0.82761037, "learning_rate": 2.9713069085055566e-06, "loss": 0.85034156, "num_input_tokens_seen": 64088605, "step": 2981, "time_per_iteration": 3.6268632411956787 }, { "auxiliary_loss_clip": 0.01344327, "auxiliary_loss_mlp": 0.01028505, "balance_loss_clip": 1.05288041, "balance_loss_mlp": 1.0206728, "epoch": 0.35856429988576927, "flos": 23216858974080.0, "grad_norm": 2.103435737586411, "language_loss": 0.79076028, "learning_rate": 2.9706258956701958e-06, "loss": 0.81448865, "num_input_tokens_seen": 64108595, "step": 2982, "time_per_iteration": 2.74326229095459 }, { "auxiliary_loss_clip": 0.01250913, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.05923295, "balance_loss_mlp": 1.02319956, "epoch": 0.3586845427764083, "flos": 23034576430080.0, "grad_norm": 2.624781458655542, "language_loss": 0.77545166, "learning_rate": 2.9699447355916575e-06, "loss": 0.79827535, "num_input_tokens_seen": 64127405, "step": 2983, "time_per_iteration": 2.7463860511779785 }, { "auxiliary_loss_clip": 0.0119661, "auxiliary_loss_mlp": 0.02564055, "balance_loss_clip": 1.05766177, "balance_loss_mlp": 1.00020957, "epoch": 0.35880478566704743, "flos": 20010682995840.0, "grad_norm": 2.2063978988529644, "language_loss": 0.74214208, "learning_rate": 2.969263428373275e-06, "loss": 0.7797488, "num_input_tokens_seen": 64145755, "step": 2984, "time_per_iteration": 3.598775863647461 }, { "auxiliary_loss_clip": 0.01297587, "auxiliary_loss_mlp": 0.01034413, "balance_loss_clip": 1.05490136, "balance_loss_mlp": 1.02615809, "epoch": 0.35892502855768654, "flos": 13699095667200.0, "grad_norm": 2.0234310595002194, "language_loss": 0.79588473, "learning_rate": 2.9685819741184007e-06, "loss": 0.81920475, "num_input_tokens_seen": 64164195, "step": 2985, "time_per_iteration": 2.924746036529541 }, { "auxiliary_loss_clip": 0.01337907, "auxiliary_loss_mlp": 0.01027385, "balance_loss_clip": 1.05148768, "balance_loss_mlp": 1.02008927, "epoch": 0.3590452714483256, "flos": 18114096977280.0, "grad_norm": 4.476017933994888, "language_loss": 0.68820989, "learning_rate": 2.967900372930411e-06, "loss": 0.71186274, "num_input_tokens_seen": 64182705, "step": 2986, "time_per_iteration": 2.74324631690979 }, { "auxiliary_loss_clip": 0.01293657, "auxiliary_loss_mlp": 0.01028032, "balance_loss_clip": 1.05351353, "balance_loss_mlp": 1.02005661, "epoch": 0.3591655143389647, "flos": 17749352321280.0, "grad_norm": 2.4639385955964275, "language_loss": 0.79286218, "learning_rate": 2.9672186249127046e-06, "loss": 0.81607908, "num_input_tokens_seen": 64202170, "step": 2987, "time_per_iteration": 2.683039665222168 }, { "auxiliary_loss_clip": 0.01296202, "auxiliary_loss_mlp": 0.01035248, "balance_loss_clip": 1.05632865, "balance_loss_mlp": 1.02795208, "epoch": 0.3592857572296038, "flos": 25224409082880.0, "grad_norm": 2.0683908266243907, "language_loss": 0.79263461, "learning_rate": 2.9665367301687014e-06, "loss": 0.81594908, "num_input_tokens_seen": 64220415, "step": 2988, "time_per_iteration": 2.8014633655548096 }, { "auxiliary_loss_clip": 0.01291307, "auxiliary_loss_mlp": 0.01028489, "balance_loss_clip": 1.05179775, "balance_loss_mlp": 1.02064478, "epoch": 0.3594060001202429, "flos": 29384408764800.0, "grad_norm": 1.9197467979577256, "language_loss": 0.76690525, "learning_rate": 2.965854688801845e-06, "loss": 0.7901032, "num_input_tokens_seen": 64242475, "step": 2989, "time_per_iteration": 2.770176649093628 }, { "auxiliary_loss_clip": 0.01243168, "auxiliary_loss_mlp": 0.01029926, "balance_loss_clip": 1.05149853, "balance_loss_mlp": 1.02237141, "epoch": 0.359526243010882, "flos": 17052900543360.0, "grad_norm": 1.8335903440015937, "language_loss": 0.76345295, "learning_rate": 2.9651725009156005e-06, "loss": 0.78618395, "num_input_tokens_seen": 64260220, "step": 2990, "time_per_iteration": 2.700124740600586 }, { "auxiliary_loss_clip": 0.01290241, "auxiliary_loss_mlp": 0.0103285, "balance_loss_clip": 1.05185652, "balance_loss_mlp": 1.0247556, "epoch": 0.3596464859015211, "flos": 22965089569920.0, "grad_norm": 1.6935712865448291, "language_loss": 0.74228418, "learning_rate": 2.964490166613454e-06, "loss": 0.76551509, "num_input_tokens_seen": 64280145, "step": 2991, "time_per_iteration": 2.680142402648926 }, { "auxiliary_loss_clip": 0.01088439, "auxiliary_loss_mlp": 0.01012515, "balance_loss_clip": 1.01912415, "balance_loss_mlp": 1.01123381, "epoch": 0.35976672879216015, "flos": 54739462590720.0, "grad_norm": 0.7570451811322899, "language_loss": 0.57678711, "learning_rate": 2.963807685998917e-06, "loss": 0.59779668, "num_input_tokens_seen": 64336010, "step": 2992, "time_per_iteration": 3.007030725479126 }, { "auxiliary_loss_clip": 0.01393766, "auxiliary_loss_mlp": 0.01029288, "balance_loss_clip": 1.0493921, "balance_loss_mlp": 1.02196229, "epoch": 0.35988697168279926, "flos": 43139020901760.0, "grad_norm": 1.5001568071570641, "language_loss": 0.7840237, "learning_rate": 2.9631250591755196e-06, "loss": 0.80825424, "num_input_tokens_seen": 64358725, "step": 2993, "time_per_iteration": 2.88545560836792 }, { "auxiliary_loss_clip": 0.01289979, "auxiliary_loss_mlp": 0.01026275, "balance_loss_clip": 1.05542612, "balance_loss_mlp": 1.01838934, "epoch": 0.36000721457343837, "flos": 35845600239360.0, "grad_norm": 2.0138246480398387, "language_loss": 0.58059299, "learning_rate": 2.962442286246817e-06, "loss": 0.60375553, "num_input_tokens_seen": 64381555, "step": 2994, "time_per_iteration": 2.8754172325134277 }, { "auxiliary_loss_clip": 0.01205264, "auxiliary_loss_mlp": 0.01024407, "balance_loss_clip": 1.0549475, "balance_loss_mlp": 1.01631916, "epoch": 0.3601274574640774, "flos": 18291100222080.0, "grad_norm": 2.27123205485388, "language_loss": 0.69320744, "learning_rate": 2.9617593673163853e-06, "loss": 0.71550417, "num_input_tokens_seen": 64400375, "step": 2995, "time_per_iteration": 2.6835734844207764 }, { "auxiliary_loss_clip": 0.01296024, "auxiliary_loss_mlp": 0.01021611, "balance_loss_clip": 1.04997492, "balance_loss_mlp": 1.01450658, "epoch": 0.36024770035471654, "flos": 13333955961600.0, "grad_norm": 2.198435730454935, "language_loss": 0.77367771, "learning_rate": 2.9610763024878216e-06, "loss": 0.79685402, "num_input_tokens_seen": 64415880, "step": 2996, "time_per_iteration": 2.642463207244873 }, { "auxiliary_loss_clip": 0.01294343, "auxiliary_loss_mlp": 0.01026839, "balance_loss_clip": 1.05295825, "balance_loss_mlp": 1.01968658, "epoch": 0.3603679432453556, "flos": 20267013427200.0, "grad_norm": 2.1090013596956085, "language_loss": 0.9168551, "learning_rate": 2.960393091864747e-06, "loss": 0.94006687, "num_input_tokens_seen": 64434260, "step": 2997, "time_per_iteration": 2.7425074577331543 }, { "auxiliary_loss_clip": 0.01290701, "auxiliary_loss_mlp": 0.01028904, "balance_loss_clip": 1.05153012, "balance_loss_mlp": 1.02125943, "epoch": 0.3604881861359947, "flos": 22451135817600.0, "grad_norm": 1.793550116242748, "language_loss": 0.75340343, "learning_rate": 2.959709735550804e-06, "loss": 0.77659941, "num_input_tokens_seen": 64453855, "step": 2998, "time_per_iteration": 2.7236011028289795 }, { "auxiliary_loss_clip": 0.0139803, "auxiliary_loss_mlp": 0.01028926, "balance_loss_clip": 1.05282235, "balance_loss_mlp": 1.02115917, "epoch": 0.3606084290266338, "flos": 22054251467520.0, "grad_norm": 2.725967434014118, "language_loss": 0.7605828, "learning_rate": 2.9590262336496575e-06, "loss": 0.78485233, "num_input_tokens_seen": 64473585, "step": 2999, "time_per_iteration": 2.8498809337615967 }, { "auxiliary_loss_clip": 0.01352342, "auxiliary_loss_mlp": 0.0103678, "balance_loss_clip": 1.05842185, "balance_loss_mlp": 1.02756488, "epoch": 0.36072867191727287, "flos": 15632921111040.0, "grad_norm": 2.0539637534491426, "language_loss": 0.85623461, "learning_rate": 2.9583425862649936e-06, "loss": 0.88012576, "num_input_tokens_seen": 64491720, "step": 3000, "time_per_iteration": 3.739971399307251 }, { "auxiliary_loss_clip": 0.01201165, "auxiliary_loss_mlp": 0.01027691, "balance_loss_clip": 1.05922151, "balance_loss_mlp": 1.01888728, "epoch": 0.360848914807912, "flos": 19677000625920.0, "grad_norm": 2.840433286976049, "language_loss": 0.73826516, "learning_rate": 2.9576587935005215e-06, "loss": 0.76055372, "num_input_tokens_seen": 64509800, "step": 3001, "time_per_iteration": 2.7730135917663574 }, { "auxiliary_loss_clip": 0.01249806, "auxiliary_loss_mlp": 0.01033606, "balance_loss_clip": 1.05686629, "balance_loss_mlp": 1.025249, "epoch": 0.3609691576985511, "flos": 18877808972160.0, "grad_norm": 2.2565212171712563, "language_loss": 0.72006863, "learning_rate": 2.9569748554599713e-06, "loss": 0.74290276, "num_input_tokens_seen": 64525410, "step": 3002, "time_per_iteration": 3.5440452098846436 }, { "auxiliary_loss_clip": 0.01293903, "auxiliary_loss_mlp": 0.01023571, "balance_loss_clip": 1.0547936, "balance_loss_mlp": 1.01609349, "epoch": 0.36108940058919015, "flos": 42224088648960.0, "grad_norm": 2.089164691606238, "language_loss": 0.73638737, "learning_rate": 2.956290772247097e-06, "loss": 0.75956202, "num_input_tokens_seen": 64544085, "step": 3003, "time_per_iteration": 2.912515878677368 }, { "auxiliary_loss_clip": 0.01385755, "auxiliary_loss_mlp": 0.01028971, "balance_loss_clip": 1.05164647, "balance_loss_mlp": 1.02089429, "epoch": 0.36120964347982926, "flos": 23185150243200.0, "grad_norm": 1.9388513617752037, "language_loss": 0.73510969, "learning_rate": 2.9556065439656724e-06, "loss": 0.75925696, "num_input_tokens_seen": 64563135, "step": 3004, "time_per_iteration": 2.7971277236938477 }, { "auxiliary_loss_clip": 0.01436204, "auxiliary_loss_mlp": 0.01029684, "balance_loss_clip": 1.04540372, "balance_loss_mlp": 1.02212274, "epoch": 0.36132988637046837, "flos": 18113055482880.0, "grad_norm": 2.011154059835425, "language_loss": 0.8158623, "learning_rate": 2.9549221707194952e-06, "loss": 0.84052122, "num_input_tokens_seen": 64581985, "step": 3005, "time_per_iteration": 2.822597026824951 }, { "auxiliary_loss_clip": 0.01249265, "auxiliary_loss_mlp": 0.01030467, "balance_loss_clip": 1.05724609, "balance_loss_mlp": 1.02185392, "epoch": 0.3614501292611074, "flos": 27813101333760.0, "grad_norm": 1.8795961937630459, "language_loss": 0.72690231, "learning_rate": 2.954237652612384e-06, "loss": 0.74969959, "num_input_tokens_seen": 64601035, "step": 3006, "time_per_iteration": 2.7321994304656982 }, { "auxiliary_loss_clip": 0.01292665, "auxiliary_loss_mlp": 0.01031637, "balance_loss_clip": 1.05515373, "balance_loss_mlp": 1.0243113, "epoch": 0.36157037215174653, "flos": 22634926732800.0, "grad_norm": 2.177821564539628, "language_loss": 0.84912735, "learning_rate": 2.9535529897481796e-06, "loss": 0.87237036, "num_input_tokens_seen": 64618580, "step": 3007, "time_per_iteration": 3.5835440158843994 }, { "auxiliary_loss_clip": 0.01197952, "auxiliary_loss_mlp": 0.01031777, "balance_loss_clip": 1.05743086, "balance_loss_mlp": 1.02313423, "epoch": 0.36169061504238564, "flos": 12600839376000.0, "grad_norm": 2.4831276487998934, "language_loss": 0.76854181, "learning_rate": 2.9528681822307446e-06, "loss": 0.79083914, "num_input_tokens_seen": 64635430, "step": 3008, "time_per_iteration": 2.6247260570526123 }, { "auxiliary_loss_clip": 0.0124294, "auxiliary_loss_mlp": 0.02566019, "balance_loss_clip": 1.05599511, "balance_loss_mlp": 1.00010741, "epoch": 0.3618108579330247, "flos": 26684644682880.0, "grad_norm": 2.79137794792398, "language_loss": 0.82201338, "learning_rate": 2.952183230163964e-06, "loss": 0.86010301, "num_input_tokens_seen": 64655005, "step": 3009, "time_per_iteration": 3.676154375076294 }, { "auxiliary_loss_clip": 0.01332445, "auxiliary_loss_mlp": 0.01026581, "balance_loss_clip": 1.04766309, "balance_loss_mlp": 1.01871371, "epoch": 0.3619311008236638, "flos": 22817029708800.0, "grad_norm": 2.035801867893786, "language_loss": 0.72918665, "learning_rate": 2.9514981336517448e-06, "loss": 0.75277692, "num_input_tokens_seen": 64674775, "step": 3010, "time_per_iteration": 2.7914249897003174 }, { "auxiliary_loss_clip": 0.01244905, "auxiliary_loss_mlp": 0.01027784, "balance_loss_clip": 1.05776656, "balance_loss_mlp": 1.02007127, "epoch": 0.36205134371430286, "flos": 25919603884800.0, "grad_norm": 1.9423730687581724, "language_loss": 0.81206191, "learning_rate": 2.950812892798015e-06, "loss": 0.83478886, "num_input_tokens_seen": 64695670, "step": 3011, "time_per_iteration": 2.6941518783569336 }, { "auxiliary_loss_clip": 0.01385287, "auxiliary_loss_mlp": 0.02566233, "balance_loss_clip": 1.0524652, "balance_loss_mlp": 1.00021386, "epoch": 0.362171586604942, "flos": 26139592730880.0, "grad_norm": 2.4026555458428867, "language_loss": 0.87345302, "learning_rate": 2.9501275077067256e-06, "loss": 0.91296816, "num_input_tokens_seen": 64716290, "step": 3012, "time_per_iteration": 2.881466865539551 }, { "auxiliary_loss_clip": 0.01432478, "auxiliary_loss_mlp": 0.01028791, "balance_loss_clip": 1.04581702, "balance_loss_mlp": 1.02129281, "epoch": 0.3622918294955811, "flos": 28074208273920.0, "grad_norm": 1.5374911280372703, "language_loss": 0.88589048, "learning_rate": 2.949441978481848e-06, "loss": 0.91050321, "num_input_tokens_seen": 64737190, "step": 3013, "time_per_iteration": 2.896239757537842 }, { "auxiliary_loss_clip": 0.01256776, "auxiliary_loss_mlp": 0.01032063, "balance_loss_clip": 1.05307305, "balance_loss_mlp": 1.02373624, "epoch": 0.36241207238622014, "flos": 19828005402240.0, "grad_norm": 2.2143687702046755, "language_loss": 0.79846817, "learning_rate": 2.9487563052273778e-06, "loss": 0.82135653, "num_input_tokens_seen": 64753950, "step": 3014, "time_per_iteration": 2.766615867614746 }, { "auxiliary_loss_clip": 0.01253971, "auxiliary_loss_mlp": 0.01028816, "balance_loss_clip": 1.06404591, "balance_loss_mlp": 1.02070427, "epoch": 0.36253231527685925, "flos": 21397158017280.0, "grad_norm": 4.296978598433576, "language_loss": 0.85878909, "learning_rate": 2.94807048804733e-06, "loss": 0.88161695, "num_input_tokens_seen": 64773570, "step": 3015, "time_per_iteration": 2.696566104888916 }, { "auxiliary_loss_clip": 0.01258739, "auxiliary_loss_mlp": 0.01034341, "balance_loss_clip": 1.05248213, "balance_loss_mlp": 1.0267117, "epoch": 0.36265255816749836, "flos": 18362885552640.0, "grad_norm": 1.7954972925220913, "language_loss": 0.90143383, "learning_rate": 2.9473845270457434e-06, "loss": 0.92436469, "num_input_tokens_seen": 64790385, "step": 3016, "time_per_iteration": 2.7397186756134033 }, { "auxiliary_loss_clip": 0.01292738, "auxiliary_loss_mlp": 0.01034533, "balance_loss_clip": 1.05378234, "balance_loss_mlp": 1.02680266, "epoch": 0.3627728010581374, "flos": 18660046769280.0, "grad_norm": 2.212279746964265, "language_loss": 0.70544267, "learning_rate": 2.946698422326677e-06, "loss": 0.7287153, "num_input_tokens_seen": 64807845, "step": 3017, "time_per_iteration": 2.6429286003112793 }, { "auxiliary_loss_clip": 0.0139314, "auxiliary_loss_mlp": 0.01030659, "balance_loss_clip": 1.04951453, "balance_loss_mlp": 1.02242208, "epoch": 0.36289304394877653, "flos": 27524272072320.0, "grad_norm": 3.0565742289723374, "language_loss": 0.79979849, "learning_rate": 2.946012173994213e-06, "loss": 0.82403648, "num_input_tokens_seen": 64827630, "step": 3018, "time_per_iteration": 2.7804157733917236 }, { "auxiliary_loss_clip": 0.0124135, "auxiliary_loss_mlp": 0.01027163, "balance_loss_clip": 1.05876505, "balance_loss_mlp": 1.01959968, "epoch": 0.36301328683941564, "flos": 34533244932480.0, "grad_norm": 9.732205862403287, "language_loss": 0.67635572, "learning_rate": 2.945325782152454e-06, "loss": 0.69904089, "num_input_tokens_seen": 64850665, "step": 3019, "time_per_iteration": 2.8001904487609863 }, { "auxiliary_loss_clip": 0.01296219, "auxiliary_loss_mlp": 0.01025262, "balance_loss_clip": 1.05083871, "balance_loss_mlp": 1.01815748, "epoch": 0.3631335297300547, "flos": 19025976574080.0, "grad_norm": 2.079715427287812, "language_loss": 0.79204625, "learning_rate": 2.9446392469055257e-06, "loss": 0.81526107, "num_input_tokens_seen": 64868700, "step": 3020, "time_per_iteration": 2.6733763217926025 }, { "auxiliary_loss_clip": 0.01337574, "auxiliary_loss_mlp": 0.01032349, "balance_loss_clip": 1.05477262, "balance_loss_mlp": 1.02462411, "epoch": 0.3632537726206938, "flos": 19536769929600.0, "grad_norm": 6.627986391157917, "language_loss": 0.79758775, "learning_rate": 2.9439525683575745e-06, "loss": 0.82128692, "num_input_tokens_seen": 64887620, "step": 3021, "time_per_iteration": 2.735973596572876 }, { "auxiliary_loss_clip": 0.01203697, "auxiliary_loss_mlp": 0.01030334, "balance_loss_clip": 1.06089973, "balance_loss_mlp": 1.02194142, "epoch": 0.3633740155113329, "flos": 21068611292160.0, "grad_norm": 1.976809300905705, "language_loss": 0.75215805, "learning_rate": 2.9432657466127694e-06, "loss": 0.77449834, "num_input_tokens_seen": 64907190, "step": 3022, "time_per_iteration": 2.6092965602874756 }, { "auxiliary_loss_clip": 0.01393143, "auxiliary_loss_mlp": 0.01029024, "balance_loss_clip": 1.05638289, "balance_loss_mlp": 1.02169251, "epoch": 0.36349425840197197, "flos": 20298722158080.0, "grad_norm": 5.820645062830646, "language_loss": 0.76772952, "learning_rate": 2.9425787817753007e-06, "loss": 0.79195118, "num_input_tokens_seen": 64925850, "step": 3023, "time_per_iteration": 2.7539424896240234 }, { "auxiliary_loss_clip": 0.01254937, "auxiliary_loss_mlp": 0.01029326, "balance_loss_clip": 1.05444574, "balance_loss_mlp": 1.02140427, "epoch": 0.3636145012926111, "flos": 29716762331520.0, "grad_norm": 1.5958796630460161, "language_loss": 0.71572816, "learning_rate": 2.94189167394938e-06, "loss": 0.73857081, "num_input_tokens_seen": 64948285, "step": 3024, "time_per_iteration": 2.780282974243164 }, { "auxiliary_loss_clip": 0.01196945, "auxiliary_loss_mlp": 0.01031373, "balance_loss_clip": 1.05810452, "balance_loss_mlp": 1.02320075, "epoch": 0.3637347441832502, "flos": 21431847576960.0, "grad_norm": 2.0205102231882015, "language_loss": 0.81170332, "learning_rate": 2.941204423239241e-06, "loss": 0.83398652, "num_input_tokens_seen": 64967160, "step": 3025, "time_per_iteration": 3.6828622817993164 }, { "auxiliary_loss_clip": 0.01242781, "auxiliary_loss_mlp": 0.01027421, "balance_loss_clip": 1.05635715, "balance_loss_mlp": 1.01929665, "epoch": 0.36385498707388925, "flos": 29533941083520.0, "grad_norm": 1.9342106898355373, "language_loss": 0.75992274, "learning_rate": 2.9405170297491395e-06, "loss": 0.78262478, "num_input_tokens_seen": 64987155, "step": 3026, "time_per_iteration": 2.75007963180542 }, { "auxiliary_loss_clip": 0.01438617, "auxiliary_loss_mlp": 0.02570457, "balance_loss_clip": 1.05392468, "balance_loss_mlp": 1.00012934, "epoch": 0.36397522996452836, "flos": 22236569925120.0, "grad_norm": 2.3615952420775694, "language_loss": 0.80584311, "learning_rate": 2.939829493583353e-06, "loss": 0.84593385, "num_input_tokens_seen": 65003800, "step": 3027, "time_per_iteration": 2.7817671298980713 }, { "auxiliary_loss_clip": 0.01340147, "auxiliary_loss_mlp": 0.01029516, "balance_loss_clip": 1.04751039, "balance_loss_mlp": 1.02114129, "epoch": 0.3640954728551674, "flos": 21506505995520.0, "grad_norm": 2.8841537203748047, "language_loss": 0.83205372, "learning_rate": 2.939141814846179e-06, "loss": 0.85575032, "num_input_tokens_seen": 65021215, "step": 3028, "time_per_iteration": 3.6783313751220703 }, { "auxiliary_loss_clip": 0.01295916, "auxiliary_loss_mlp": 0.01030747, "balance_loss_clip": 1.05296624, "balance_loss_mlp": 1.02289724, "epoch": 0.3642157157458065, "flos": 17712867081600.0, "grad_norm": 1.9009310647209374, "language_loss": 0.8265059, "learning_rate": 2.938453993641938e-06, "loss": 0.84977257, "num_input_tokens_seen": 65039590, "step": 3029, "time_per_iteration": 2.6706743240356445 }, { "auxiliary_loss_clip": 0.01294817, "auxiliary_loss_mlp": 0.01027196, "balance_loss_clip": 1.05951428, "balance_loss_mlp": 1.0188756, "epoch": 0.36433595863644563, "flos": 17639537466240.0, "grad_norm": 2.737036002465674, "language_loss": 0.70034713, "learning_rate": 2.937766030074973e-06, "loss": 0.72356725, "num_input_tokens_seen": 65056845, "step": 3030, "time_per_iteration": 2.7120187282562256 }, { "auxiliary_loss_clip": 0.0125531, "auxiliary_loss_mlp": 0.01034789, "balance_loss_clip": 1.0537914, "balance_loss_mlp": 1.02600312, "epoch": 0.3644562015270847, "flos": 26833279161600.0, "grad_norm": 1.8636186924783458, "language_loss": 0.82761604, "learning_rate": 2.937077924249646e-06, "loss": 0.85051703, "num_input_tokens_seen": 65079435, "step": 3031, "time_per_iteration": 2.715573787689209 }, { "auxiliary_loss_clip": 0.01209179, "auxiliary_loss_mlp": 0.01029439, "balance_loss_clip": 1.05471826, "balance_loss_mlp": 1.02073073, "epoch": 0.3645764444177238, "flos": 14282715847680.0, "grad_norm": 2.531898145195225, "language_loss": 0.76066971, "learning_rate": 2.9363896762703443e-06, "loss": 0.7830559, "num_input_tokens_seen": 65096500, "step": 3032, "time_per_iteration": 2.6427853107452393 }, { "auxiliary_loss_clip": 0.01198173, "auxiliary_loss_mlp": 0.0102739, "balance_loss_clip": 1.05793881, "balance_loss_mlp": 1.01898587, "epoch": 0.3646966873083629, "flos": 20667489137280.0, "grad_norm": 1.637874701430685, "language_loss": 0.84770143, "learning_rate": 2.9357012862414725e-06, "loss": 0.86995709, "num_input_tokens_seen": 65115860, "step": 3033, "time_per_iteration": 3.5681889057159424 }, { "auxiliary_loss_clip": 0.0124855, "auxiliary_loss_mlp": 0.01026165, "balance_loss_clip": 1.05659258, "balance_loss_mlp": 1.01800561, "epoch": 0.36481693019900197, "flos": 27782613665280.0, "grad_norm": 2.207340432909591, "language_loss": 0.71892262, "learning_rate": 2.9350127542674593e-06, "loss": 0.74166977, "num_input_tokens_seen": 65138070, "step": 3034, "time_per_iteration": 2.6805710792541504 }, { "auxiliary_loss_clip": 0.01209551, "auxiliary_loss_mlp": 0.01027644, "balance_loss_clip": 1.05845678, "balance_loss_mlp": 1.01932955, "epoch": 0.3649371730896411, "flos": 19712588025600.0, "grad_norm": 2.0283670211791045, "language_loss": 0.76670092, "learning_rate": 2.934324080452755e-06, "loss": 0.78907287, "num_input_tokens_seen": 65155860, "step": 3035, "time_per_iteration": 2.752976894378662 }, { "auxiliary_loss_clip": 0.01341348, "auxiliary_loss_mlp": 0.02573531, "balance_loss_clip": 1.0506984, "balance_loss_mlp": 1.00013387, "epoch": 0.3650574159802802, "flos": 24750496016640.0, "grad_norm": 1.6730990115275195, "language_loss": 0.77959442, "learning_rate": 2.9336352649018307e-06, "loss": 0.81874323, "num_input_tokens_seen": 65175930, "step": 3036, "time_per_iteration": 3.9581828117370605 }, { "auxiliary_loss_clip": 0.01298966, "auxiliary_loss_mlp": 0.01035019, "balance_loss_clip": 1.05544519, "balance_loss_mlp": 1.02720535, "epoch": 0.36517765887091924, "flos": 32853487363200.0, "grad_norm": 1.758778565828236, "language_loss": 0.7019155, "learning_rate": 2.9329463077191783e-06, "loss": 0.72525537, "num_input_tokens_seen": 65199305, "step": 3037, "time_per_iteration": 2.76631498336792 }, { "auxiliary_loss_clip": 0.01401117, "auxiliary_loss_mlp": 0.01025215, "balance_loss_clip": 1.05359769, "balance_loss_mlp": 1.0173353, "epoch": 0.36529790176155835, "flos": 20120318282880.0, "grad_norm": 39.03295672251877, "language_loss": 0.64477181, "learning_rate": 2.9322572090093135e-06, "loss": 0.66903508, "num_input_tokens_seen": 65218010, "step": 3038, "time_per_iteration": 2.816297769546509 }, { "auxiliary_loss_clip": 0.01393553, "auxiliary_loss_mlp": 0.01030231, "balance_loss_clip": 1.0492053, "balance_loss_mlp": 1.02192807, "epoch": 0.36541814465219746, "flos": 17639573379840.0, "grad_norm": 2.4898874646944074, "language_loss": 0.76299214, "learning_rate": 2.9315679688767713e-06, "loss": 0.78722996, "num_input_tokens_seen": 65236020, "step": 3039, "time_per_iteration": 2.7503767013549805 }, { "auxiliary_loss_clip": 0.01292956, "auxiliary_loss_mlp": 0.01031152, "balance_loss_clip": 1.04951358, "balance_loss_mlp": 1.02314138, "epoch": 0.3655383875428365, "flos": 22674356887680.0, "grad_norm": 1.562543436886404, "language_loss": 0.66647226, "learning_rate": 2.9308785874261085e-06, "loss": 0.68971336, "num_input_tokens_seen": 65256210, "step": 3040, "time_per_iteration": 2.893484592437744 }, { "auxiliary_loss_clip": 0.0119974, "auxiliary_loss_mlp": 0.01030797, "balance_loss_clip": 1.0600481, "balance_loss_mlp": 1.0228219, "epoch": 0.36565863043347563, "flos": 21981173247360.0, "grad_norm": 1.6414104794618707, "language_loss": 0.82039905, "learning_rate": 2.9301890647619045e-06, "loss": 0.84270442, "num_input_tokens_seen": 65275505, "step": 3041, "time_per_iteration": 2.62060284614563 }, { "auxiliary_loss_clip": 0.01206564, "auxiliary_loss_mlp": 0.01031766, "balance_loss_clip": 1.05517852, "balance_loss_mlp": 1.02312303, "epoch": 0.36577887332411474, "flos": 24827632473600.0, "grad_norm": 2.7764720708186448, "language_loss": 0.80386138, "learning_rate": 2.929499400988759e-06, "loss": 0.82624471, "num_input_tokens_seen": 65296665, "step": 3042, "time_per_iteration": 2.7128212451934814 }, { "auxiliary_loss_clip": 0.01245379, "auxiliary_loss_mlp": 0.01030202, "balance_loss_clip": 1.05594516, "balance_loss_mlp": 1.02214932, "epoch": 0.3658991162147538, "flos": 28293191539200.0, "grad_norm": 2.523138678289529, "language_loss": 0.65139186, "learning_rate": 2.9288095962112927e-06, "loss": 0.67414773, "num_input_tokens_seen": 65317370, "step": 3043, "time_per_iteration": 2.7137391567230225 }, { "auxiliary_loss_clip": 0.01198174, "auxiliary_loss_mlp": 0.01026955, "balance_loss_clip": 1.05851018, "balance_loss_mlp": 1.01901019, "epoch": 0.3660193591053929, "flos": 17785550252160.0, "grad_norm": 2.034722929416078, "language_loss": 0.85390007, "learning_rate": 2.9281196505341503e-06, "loss": 0.87615144, "num_input_tokens_seen": 65334540, "step": 3044, "time_per_iteration": 2.7045884132385254 }, { "auxiliary_loss_clip": 0.01387306, "auxiliary_loss_mlp": 0.02567233, "balance_loss_clip": 1.05031824, "balance_loss_mlp": 1.00003302, "epoch": 0.36613960199603196, "flos": 10342776839040.0, "grad_norm": 2.271805425234311, "language_loss": 0.78598428, "learning_rate": 2.9274295640619946e-06, "loss": 0.82552963, "num_input_tokens_seen": 65351670, "step": 3045, "time_per_iteration": 2.7838637828826904 }, { "auxiliary_loss_clip": 0.01341936, "auxiliary_loss_mlp": 0.01036378, "balance_loss_clip": 1.04975843, "balance_loss_mlp": 1.0291661, "epoch": 0.36625984488667107, "flos": 19755609540480.0, "grad_norm": 1.7336001731898243, "language_loss": 0.7860083, "learning_rate": 2.9267393368995103e-06, "loss": 0.80979145, "num_input_tokens_seen": 65370900, "step": 3046, "time_per_iteration": 2.765315294265747 }, { "auxiliary_loss_clip": 0.01197971, "auxiliary_loss_mlp": 0.01031565, "balance_loss_clip": 1.05817926, "balance_loss_mlp": 1.0235064, "epoch": 0.3663800877773102, "flos": 17674262939520.0, "grad_norm": 3.0780618004346003, "language_loss": 0.74052173, "learning_rate": 2.926048969151407e-06, "loss": 0.76281714, "num_input_tokens_seen": 65388185, "step": 3047, "time_per_iteration": 2.631822109222412 }, { "auxiliary_loss_clip": 0.01379891, "auxiliary_loss_mlp": 0.01025544, "balance_loss_clip": 1.05054808, "balance_loss_mlp": 1.01734781, "epoch": 0.36650033066794924, "flos": 20303606407680.0, "grad_norm": 1.9790132881766362, "language_loss": 0.69057417, "learning_rate": 2.92535846092241e-06, "loss": 0.71462858, "num_input_tokens_seen": 65407200, "step": 3048, "time_per_iteration": 2.7616312503814697 }, { "auxiliary_loss_clip": 0.01298824, "auxiliary_loss_mlp": 0.01030591, "balance_loss_clip": 1.05707884, "balance_loss_mlp": 1.02254772, "epoch": 0.36662057355858835, "flos": 24716237420160.0, "grad_norm": 2.2027057997687702, "language_loss": 0.82592416, "learning_rate": 2.9246678123172704e-06, "loss": 0.84921831, "num_input_tokens_seen": 65427290, "step": 3049, "time_per_iteration": 2.739614725112915 }, { "auxiliary_loss_clip": 0.01200155, "auxiliary_loss_mlp": 0.01033247, "balance_loss_clip": 1.05880237, "balance_loss_mlp": 1.02510238, "epoch": 0.36674081644922746, "flos": 12385267902720.0, "grad_norm": 3.278875414342902, "language_loss": 0.74769223, "learning_rate": 2.9239770234407596e-06, "loss": 0.77002627, "num_input_tokens_seen": 65445595, "step": 3050, "time_per_iteration": 3.5758256912231445 }, { "auxiliary_loss_clip": 0.01247672, "auxiliary_loss_mlp": 0.01031023, "balance_loss_clip": 1.05585444, "balance_loss_mlp": 1.02288735, "epoch": 0.3668610593398665, "flos": 21105922544640.0, "grad_norm": 1.7091569984452317, "language_loss": 0.68896931, "learning_rate": 2.9232860943976686e-06, "loss": 0.71175623, "num_input_tokens_seen": 65466330, "step": 3051, "time_per_iteration": 2.7167863845825195 }, { "auxiliary_loss_clip": 0.01292641, "auxiliary_loss_mlp": 0.01032164, "balance_loss_clip": 1.05263901, "balance_loss_mlp": 1.02454066, "epoch": 0.3669813022305056, "flos": 26758082039040.0, "grad_norm": 1.66390243976988, "language_loss": 0.84421492, "learning_rate": 2.9225950252928115e-06, "loss": 0.86746287, "num_input_tokens_seen": 65487180, "step": 3052, "time_per_iteration": 2.8123667240142822 }, { "auxiliary_loss_clip": 0.01247125, "auxiliary_loss_mlp": 0.01029775, "balance_loss_clip": 1.05763769, "balance_loss_mlp": 1.02168727, "epoch": 0.36710154512114473, "flos": 19099521671040.0, "grad_norm": 2.108705891564849, "language_loss": 0.82142246, "learning_rate": 2.9219038162310217e-06, "loss": 0.84419143, "num_input_tokens_seen": 65505380, "step": 3053, "time_per_iteration": 2.690699815750122 }, { "auxiliary_loss_clip": 0.01405562, "auxiliary_loss_mlp": 0.02575282, "balance_loss_clip": 1.0501492, "balance_loss_mlp": 1.00006533, "epoch": 0.3672217880117838, "flos": 20812029465600.0, "grad_norm": 2.2419317124070752, "language_loss": 0.82721257, "learning_rate": 2.921212467317157e-06, "loss": 0.86702108, "num_input_tokens_seen": 65524825, "step": 3054, "time_per_iteration": 3.759967088699341 }, { "auxiliary_loss_clip": 0.01288984, "auxiliary_loss_mlp": 0.01028051, "balance_loss_clip": 1.05005956, "balance_loss_mlp": 1.01940823, "epoch": 0.3673420309024229, "flos": 13590394133760.0, "grad_norm": 2.0158375581342014, "language_loss": 0.80451119, "learning_rate": 2.920520978656093e-06, "loss": 0.82768148, "num_input_tokens_seen": 65541790, "step": 3055, "time_per_iteration": 3.0118439197540283 }, { "auxiliary_loss_clip": 0.01193541, "auxiliary_loss_mlp": 0.02569826, "balance_loss_clip": 1.05577803, "balance_loss_mlp": 1.00007081, "epoch": 0.367462273793062, "flos": 28986877969920.0, "grad_norm": 2.1605640506405734, "language_loss": 0.77369666, "learning_rate": 2.919829350352729e-06, "loss": 0.81133038, "num_input_tokens_seen": 65563395, "step": 3056, "time_per_iteration": 2.6673061847686768 }, { "auxiliary_loss_clip": 0.0108272, "auxiliary_loss_mlp": 0.01015843, "balance_loss_clip": 1.01388383, "balance_loss_mlp": 1.01455522, "epoch": 0.36758251668370107, "flos": 62643148346880.0, "grad_norm": 0.7536303211647591, "language_loss": 0.59992224, "learning_rate": 2.919137582511983e-06, "loss": 0.62090784, "num_input_tokens_seen": 65619835, "step": 3057, "time_per_iteration": 3.1133365631103516 }, { "auxiliary_loss_clip": 0.01359282, "auxiliary_loss_mlp": 0.01032925, "balance_loss_clip": 1.05938172, "balance_loss_mlp": 1.02491438, "epoch": 0.3677027595743402, "flos": 12713886455040.0, "grad_norm": 3.8242454158830395, "language_loss": 0.63853329, "learning_rate": 2.918445675238797e-06, "loss": 0.66245538, "num_input_tokens_seen": 65636760, "step": 3058, "time_per_iteration": 2.705484628677368 }, { "auxiliary_loss_clip": 0.0119626, "auxiliary_loss_mlp": 0.01028132, "balance_loss_clip": 1.0554142, "balance_loss_mlp": 1.01978087, "epoch": 0.36782300246497923, "flos": 25046579825280.0, "grad_norm": 2.7857906051980263, "language_loss": 0.69795287, "learning_rate": 2.917753628638132e-06, "loss": 0.72019678, "num_input_tokens_seen": 65657065, "step": 3059, "time_per_iteration": 3.5469605922698975 }, { "auxiliary_loss_clip": 0.01298389, "auxiliary_loss_mlp": 0.01026053, "balance_loss_clip": 1.05617785, "balance_loss_mlp": 1.01755309, "epoch": 0.36794324535561834, "flos": 17419512706560.0, "grad_norm": 9.800223366761692, "language_loss": 0.70373672, "learning_rate": 2.9170614428149716e-06, "loss": 0.72698116, "num_input_tokens_seen": 65675400, "step": 3060, "time_per_iteration": 2.6845271587371826 }, { "auxiliary_loss_clip": 0.01340386, "auxiliary_loss_mlp": 0.01030532, "balance_loss_clip": 1.05316305, "balance_loss_mlp": 1.02183032, "epoch": 0.36806348824625745, "flos": 24089128848000.0, "grad_norm": 2.91328981138716, "language_loss": 0.86779332, "learning_rate": 2.9163691178743195e-06, "loss": 0.8915025, "num_input_tokens_seen": 65694050, "step": 3061, "time_per_iteration": 3.804323434829712 }, { "auxiliary_loss_clip": 0.01241967, "auxiliary_loss_mlp": 0.01028366, "balance_loss_clip": 1.05533087, "balance_loss_mlp": 1.01997387, "epoch": 0.3681837311368965, "flos": 20521871400960.0, "grad_norm": 1.7151423420158292, "language_loss": 0.77362055, "learning_rate": 2.9156766539212006e-06, "loss": 0.79632384, "num_input_tokens_seen": 65711695, "step": 3062, "time_per_iteration": 2.6938748359680176 }, { "auxiliary_loss_clip": 0.01249372, "auxiliary_loss_mlp": 0.01029886, "balance_loss_clip": 1.05544806, "balance_loss_mlp": 1.02217901, "epoch": 0.3683039740275356, "flos": 21466644877440.0, "grad_norm": 2.921041672318301, "language_loss": 0.72203708, "learning_rate": 2.9149840510606614e-06, "loss": 0.74482965, "num_input_tokens_seen": 65730350, "step": 3063, "time_per_iteration": 2.6715407371520996 }, { "auxiliary_loss_clip": 0.01134381, "auxiliary_loss_mlp": 0.02512883, "balance_loss_clip": 1.01579177, "balance_loss_mlp": 1.00017798, "epoch": 0.36842421691817473, "flos": 70380999987840.0, "grad_norm": 1.0222100209291298, "language_loss": 0.64245057, "learning_rate": 2.914291309397769e-06, "loss": 0.67892325, "num_input_tokens_seen": 65787820, "step": 3064, "time_per_iteration": 3.366791009902954 }, { "auxiliary_loss_clip": 0.01430584, "auxiliary_loss_mlp": 0.01031779, "balance_loss_clip": 1.04579973, "balance_loss_mlp": 1.02386951, "epoch": 0.3685444598088138, "flos": 23331378510720.0, "grad_norm": 2.5000872486711376, "language_loss": 0.78608561, "learning_rate": 2.9135984290376117e-06, "loss": 0.81070924, "num_input_tokens_seen": 65806685, "step": 3065, "time_per_iteration": 2.85245418548584 }, { "auxiliary_loss_clip": 0.01439957, "auxiliary_loss_mlp": 0.01036502, "balance_loss_clip": 1.04713106, "balance_loss_mlp": 1.02874804, "epoch": 0.3686647026994529, "flos": 23070271570560.0, "grad_norm": 1.7968244864558323, "language_loss": 0.82601631, "learning_rate": 2.9129054100853e-06, "loss": 0.8507809, "num_input_tokens_seen": 65825525, "step": 3066, "time_per_iteration": 2.839069366455078 }, { "auxiliary_loss_clip": 0.01297948, "auxiliary_loss_mlp": 0.01035383, "balance_loss_clip": 1.05537331, "balance_loss_mlp": 1.02746153, "epoch": 0.368784945590092, "flos": 25119909440640.0, "grad_norm": 1.7394668448490995, "language_loss": 0.76281953, "learning_rate": 2.912212252645963e-06, "loss": 0.78615284, "num_input_tokens_seen": 65848110, "step": 3067, "time_per_iteration": 2.7729578018188477 }, { "auxiliary_loss_clip": 0.01251311, "auxiliary_loss_mlp": 0.01037839, "balance_loss_clip": 1.05575967, "balance_loss_mlp": 1.02904749, "epoch": 0.36890518848073106, "flos": 18442284566400.0, "grad_norm": 2.5180062048131346, "language_loss": 0.76257062, "learning_rate": 2.9115189568247523e-06, "loss": 0.78546208, "num_input_tokens_seen": 65865670, "step": 3068, "time_per_iteration": 2.6509242057800293 }, { "auxiliary_loss_clip": 0.01387978, "auxiliary_loss_mlp": 0.0102777, "balance_loss_clip": 1.05521011, "balance_loss_mlp": 1.02024519, "epoch": 0.36902543137137017, "flos": 16362446336640.0, "grad_norm": 2.181451859723835, "language_loss": 0.92351818, "learning_rate": 2.910825522726841e-06, "loss": 0.94767565, "num_input_tokens_seen": 65883195, "step": 3069, "time_per_iteration": 2.740921974182129 }, { "auxiliary_loss_clip": 0.01386298, "auxiliary_loss_mlp": 0.01027848, "balance_loss_clip": 1.04816818, "balance_loss_mlp": 1.02051365, "epoch": 0.3691456742620093, "flos": 12275596702080.0, "grad_norm": 2.0935600130105674, "language_loss": 0.77569628, "learning_rate": 2.9101319504574215e-06, "loss": 0.79983771, "num_input_tokens_seen": 65899635, "step": 3070, "time_per_iteration": 2.700392723083496 }, { "auxiliary_loss_clip": 0.01240022, "auxiliary_loss_mlp": 0.0102865, "balance_loss_clip": 1.05137086, "balance_loss_mlp": 1.02007902, "epoch": 0.36926591715264834, "flos": 17786412178560.0, "grad_norm": 1.6367629664366687, "language_loss": 0.76207066, "learning_rate": 2.909438240121709e-06, "loss": 0.78475744, "num_input_tokens_seen": 65919910, "step": 3071, "time_per_iteration": 2.713472604751587 }, { "auxiliary_loss_clip": 0.01284994, "auxiliary_loss_mlp": 0.01028611, "balance_loss_clip": 1.05349553, "balance_loss_mlp": 1.0211246, "epoch": 0.36938616004328745, "flos": 28948309741440.0, "grad_norm": 1.8988052153801198, "language_loss": 0.70493412, "learning_rate": 2.908744391824939e-06, "loss": 0.72807014, "num_input_tokens_seen": 65940930, "step": 3072, "time_per_iteration": 2.7147817611694336 }, { "auxiliary_loss_clip": 0.01450716, "auxiliary_loss_mlp": 0.01027652, "balance_loss_clip": 1.0487411, "balance_loss_mlp": 1.01944435, "epoch": 0.36950640293392656, "flos": 29205394358400.0, "grad_norm": 2.0488632360745003, "language_loss": 0.79068029, "learning_rate": 2.908050405672367e-06, "loss": 0.8154639, "num_input_tokens_seen": 65960475, "step": 3073, "time_per_iteration": 2.866995334625244 }, { "auxiliary_loss_clip": 0.01303242, "auxiliary_loss_mlp": 0.01027044, "balance_loss_clip": 1.05066967, "balance_loss_mlp": 1.01908076, "epoch": 0.3696266458245656, "flos": 24827776128000.0, "grad_norm": 2.219067358074968, "language_loss": 0.79371095, "learning_rate": 2.9073562817692703e-06, "loss": 0.81701374, "num_input_tokens_seen": 65979160, "step": 3074, "time_per_iteration": 2.6937832832336426 }, { "auxiliary_loss_clip": 0.01302528, "auxiliary_loss_mlp": 0.00999781, "balance_loss_clip": 1.01711917, "balance_loss_mlp": 0.9984991, "epoch": 0.3697468887152047, "flos": 59887257264000.0, "grad_norm": 0.7231319293404593, "language_loss": 0.56512511, "learning_rate": 2.9066620202209468e-06, "loss": 0.58814812, "num_input_tokens_seen": 66041650, "step": 3075, "time_per_iteration": 3.2697668075561523 }, { "auxiliary_loss_clip": 0.01337181, "auxiliary_loss_mlp": 0.01025533, "balance_loss_clip": 1.05146945, "balance_loss_mlp": 1.0176177, "epoch": 0.3698671316058438, "flos": 26137581569280.0, "grad_norm": 1.9787390718445712, "language_loss": 0.77660656, "learning_rate": 2.905967621132716e-06, "loss": 0.80023366, "num_input_tokens_seen": 66059260, "step": 3076, "time_per_iteration": 3.8384017944335938 }, { "auxiliary_loss_clip": 0.01300107, "auxiliary_loss_mlp": 0.01035169, "balance_loss_clip": 1.055089, "balance_loss_mlp": 1.02608526, "epoch": 0.3699873744964829, "flos": 24607464059520.0, "grad_norm": 1.8518153889656865, "language_loss": 0.75465935, "learning_rate": 2.9052730846099172e-06, "loss": 0.77801204, "num_input_tokens_seen": 66080605, "step": 3077, "time_per_iteration": 2.7341861724853516 }, { "auxiliary_loss_clip": 0.01186528, "auxiliary_loss_mlp": 0.01003414, "balance_loss_clip": 1.01479363, "balance_loss_mlp": 1.00222778, "epoch": 0.370107617387122, "flos": 64885340050560.0, "grad_norm": 0.8743616178666769, "language_loss": 0.60926485, "learning_rate": 2.9045784107579123e-06, "loss": 0.63116431, "num_input_tokens_seen": 66140710, "step": 3078, "time_per_iteration": 3.28112530708313 }, { "auxiliary_loss_clip": 0.01198032, "auxiliary_loss_mlp": 0.01032385, "balance_loss_clip": 1.05879128, "balance_loss_mlp": 1.02427256, "epoch": 0.37022786027776106, "flos": 15961683317760.0, "grad_norm": 1.9241490880572505, "language_loss": 0.67293692, "learning_rate": 2.9038835996820807e-06, "loss": 0.69524109, "num_input_tokens_seen": 66158320, "step": 3079, "time_per_iteration": 2.6218249797821045 }, { "auxiliary_loss_clip": 0.0134991, "auxiliary_loss_mlp": 0.01028376, "balance_loss_clip": 1.04938388, "balance_loss_mlp": 1.020872, "epoch": 0.37034810316840017, "flos": 18546927863040.0, "grad_norm": 4.480648710542528, "language_loss": 0.7947945, "learning_rate": 2.903188651487826e-06, "loss": 0.81857729, "num_input_tokens_seen": 66176875, "step": 3080, "time_per_iteration": 3.6267547607421875 }, { "auxiliary_loss_clip": 0.01155194, "auxiliary_loss_mlp": 0.01025831, "balance_loss_clip": 1.05592275, "balance_loss_mlp": 1.01799285, "epoch": 0.3704683460590393, "flos": 17821927751040.0, "grad_norm": 2.3008562053552364, "language_loss": 0.86778522, "learning_rate": 2.902493566280571e-06, "loss": 0.88959551, "num_input_tokens_seen": 66194980, "step": 3081, "time_per_iteration": 2.651834726333618 }, { "auxiliary_loss_clip": 0.01292429, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.05344951, "balance_loss_mlp": 1.02742839, "epoch": 0.37058858894967833, "flos": 14134081368960.0, "grad_norm": 5.899975652218037, "language_loss": 0.81485987, "learning_rate": 2.9017983441657595e-06, "loss": 0.83813941, "num_input_tokens_seen": 66212310, "step": 3082, "time_per_iteration": 2.6492788791656494 }, { "auxiliary_loss_clip": 0.0139798, "auxiliary_loss_mlp": 0.01027407, "balance_loss_clip": 1.04978466, "balance_loss_mlp": 1.01965809, "epoch": 0.37070883184031744, "flos": 13954492344960.0, "grad_norm": 2.144956956260684, "language_loss": 0.75568271, "learning_rate": 2.9011029852488564e-06, "loss": 0.77993667, "num_input_tokens_seen": 66229545, "step": 3083, "time_per_iteration": 2.7589683532714844 }, { "auxiliary_loss_clip": 0.01082258, "auxiliary_loss_mlp": 0.01004482, "balance_loss_clip": 1.01352, "balance_loss_mlp": 1.00324774, "epoch": 0.37082907473095655, "flos": 52315419306240.0, "grad_norm": 1.195926501188446, "language_loss": 0.6247679, "learning_rate": 2.9004074896353465e-06, "loss": 0.64563537, "num_input_tokens_seen": 66283545, "step": 3084, "time_per_iteration": 3.069413423538208 }, { "auxiliary_loss_clip": 0.01198956, "auxiliary_loss_mlp": 0.01026187, "balance_loss_clip": 1.06056952, "balance_loss_mlp": 1.01868856, "epoch": 0.3709493176215956, "flos": 15998096730240.0, "grad_norm": 3.7836233234078427, "language_loss": 0.81544781, "learning_rate": 2.8997118574307362e-06, "loss": 0.83769923, "num_input_tokens_seen": 66300500, "step": 3085, "time_per_iteration": 3.4709956645965576 }, { "auxiliary_loss_clip": 0.01259322, "auxiliary_loss_mlp": 0.01030927, "balance_loss_clip": 1.05531073, "balance_loss_mlp": 1.02293968, "epoch": 0.3710695605122347, "flos": 20959837931520.0, "grad_norm": 2.526471752998361, "language_loss": 0.74850541, "learning_rate": 2.899016088740553e-06, "loss": 0.77140784, "num_input_tokens_seen": 66318610, "step": 3086, "time_per_iteration": 2.747607707977295 }, { "auxiliary_loss_clip": 0.01390198, "auxiliary_loss_mlp": 0.01030989, "balance_loss_clip": 1.05104637, "balance_loss_mlp": 1.02297783, "epoch": 0.37118980340287383, "flos": 14355578586240.0, "grad_norm": 2.6973625649504323, "language_loss": 0.79330689, "learning_rate": 2.898320183670344e-06, "loss": 0.81751877, "num_input_tokens_seen": 66336025, "step": 3087, "time_per_iteration": 3.682981491088867 }, { "auxiliary_loss_clip": 0.01398417, "auxiliary_loss_mlp": 0.01026823, "balance_loss_clip": 1.05601668, "balance_loss_mlp": 1.01848996, "epoch": 0.3713100462935129, "flos": 25885381201920.0, "grad_norm": 1.7992158890471055, "language_loss": 0.89219445, "learning_rate": 2.8976241423256767e-06, "loss": 0.91644686, "num_input_tokens_seen": 66356120, "step": 3088, "time_per_iteration": 2.821410655975342 }, { "auxiliary_loss_clip": 0.01296955, "auxiliary_loss_mlp": 0.0103079, "balance_loss_clip": 1.05380249, "balance_loss_mlp": 1.02381957, "epoch": 0.371430289184152, "flos": 30518934814080.0, "grad_norm": 2.289706271751039, "language_loss": 0.68592548, "learning_rate": 2.896927964812142e-06, "loss": 0.70920295, "num_input_tokens_seen": 66376685, "step": 3089, "time_per_iteration": 2.7800779342651367 }, { "auxiliary_loss_clip": 0.01304061, "auxiliary_loss_mlp": 0.01039065, "balance_loss_clip": 1.06167912, "balance_loss_mlp": 1.03088689, "epoch": 0.37155053207479105, "flos": 15742233175680.0, "grad_norm": 2.9090977564580873, "language_loss": 0.75212574, "learning_rate": 2.8962316512353465e-06, "loss": 0.77555704, "num_input_tokens_seen": 66394230, "step": 3090, "time_per_iteration": 2.6838791370391846 }, { "auxiliary_loss_clip": 0.01438928, "auxiliary_loss_mlp": 0.0102249, "balance_loss_clip": 1.04701364, "balance_loss_mlp": 1.01462853, "epoch": 0.37167077496543016, "flos": 23404061681280.0, "grad_norm": 1.7279539702096047, "language_loss": 0.74792802, "learning_rate": 2.8955352017009233e-06, "loss": 0.77254224, "num_input_tokens_seen": 66413475, "step": 3091, "time_per_iteration": 2.8148629665374756 }, { "auxiliary_loss_clip": 0.01294858, "auxiliary_loss_mlp": 0.01025031, "balance_loss_clip": 1.05614817, "balance_loss_mlp": 1.01682627, "epoch": 0.3717910178560693, "flos": 22088653718400.0, "grad_norm": 1.9092540274145933, "language_loss": 0.77356136, "learning_rate": 2.8948386163145212e-06, "loss": 0.7967602, "num_input_tokens_seen": 66432685, "step": 3092, "time_per_iteration": 2.6916961669921875 }, { "auxiliary_loss_clip": 0.01156802, "auxiliary_loss_mlp": 0.01029503, "balance_loss_clip": 1.05648839, "balance_loss_mlp": 1.02173638, "epoch": 0.3719112607467083, "flos": 26939969533440.0, "grad_norm": 1.8090157883618678, "language_loss": 0.79772913, "learning_rate": 2.8941418951818135e-06, "loss": 0.81959224, "num_input_tokens_seen": 66452245, "step": 3093, "time_per_iteration": 2.666903495788574 }, { "auxiliary_loss_clip": 0.01346208, "auxiliary_loss_mlp": 0.01026458, "balance_loss_clip": 1.05155182, "balance_loss_mlp": 1.0184834, "epoch": 0.37203150363734744, "flos": 12166500119040.0, "grad_norm": 2.1477107825996744, "language_loss": 0.71682936, "learning_rate": 2.8934450384084903e-06, "loss": 0.740556, "num_input_tokens_seen": 66469760, "step": 3094, "time_per_iteration": 2.699648857116699 }, { "auxiliary_loss_clip": 0.01290577, "auxiliary_loss_mlp": 0.01030069, "balance_loss_clip": 1.0495975, "balance_loss_mlp": 1.02132463, "epoch": 0.37215174652798655, "flos": 23697595624320.0, "grad_norm": 2.0367096308028727, "language_loss": 0.70473707, "learning_rate": 2.8927480461002653e-06, "loss": 0.72794348, "num_input_tokens_seen": 66489730, "step": 3095, "time_per_iteration": 2.7301511764526367 }, { "auxiliary_loss_clip": 0.01298781, "auxiliary_loss_mlp": 0.01029303, "balance_loss_clip": 1.0519942, "balance_loss_mlp": 1.01987362, "epoch": 0.3722719894186256, "flos": 17887751424000.0, "grad_norm": 2.5379591238931, "language_loss": 0.8609944, "learning_rate": 2.892050918362872e-06, "loss": 0.8842752, "num_input_tokens_seen": 66504785, "step": 3096, "time_per_iteration": 2.6489176750183105 }, { "auxiliary_loss_clip": 0.01396371, "auxiliary_loss_mlp": 0.0100313, "balance_loss_clip": 1.01463532, "balance_loss_mlp": 1.00176549, "epoch": 0.3723922323092647, "flos": 62419891363200.0, "grad_norm": 0.8528029580203966, "language_loss": 0.55895519, "learning_rate": 2.8913536553020626e-06, "loss": 0.58295017, "num_input_tokens_seen": 66558840, "step": 3097, "time_per_iteration": 3.5380163192749023 }, { "auxiliary_loss_clip": 0.01388713, "auxiliary_loss_mlp": 0.01030082, "balance_loss_clip": 1.04728186, "balance_loss_mlp": 1.02188659, "epoch": 0.3725124751999038, "flos": 23039747988480.0, "grad_norm": 1.914132469785722, "language_loss": 0.84810507, "learning_rate": 2.8906562570236137e-06, "loss": 0.87229311, "num_input_tokens_seen": 66576750, "step": 3098, "time_per_iteration": 3.1141517162323 }, { "auxiliary_loss_clip": 0.01439876, "auxiliary_loss_mlp": 0.01034449, "balance_loss_clip": 1.04362273, "balance_loss_mlp": 1.0265038, "epoch": 0.3726327180905429, "flos": 20920551431040.0, "grad_norm": 4.7718567569279715, "language_loss": 0.76615345, "learning_rate": 2.889958723633318e-06, "loss": 0.79089671, "num_input_tokens_seen": 66595690, "step": 3099, "time_per_iteration": 2.755072832107544 }, { "auxiliary_loss_clip": 0.01341618, "auxiliary_loss_mlp": 0.01033716, "balance_loss_clip": 1.05243802, "balance_loss_mlp": 1.02605736, "epoch": 0.372752960981182, "flos": 30592156688640.0, "grad_norm": 1.7699201148811547, "language_loss": 0.73807693, "learning_rate": 2.889261055236992e-06, "loss": 0.76183033, "num_input_tokens_seen": 66617905, "step": 3100, "time_per_iteration": 2.795851469039917 }, { "auxiliary_loss_clip": 0.01291952, "auxiliary_loss_mlp": 0.01031177, "balance_loss_clip": 1.05464101, "balance_loss_mlp": 1.02289832, "epoch": 0.3728732038718211, "flos": 25116749043840.0, "grad_norm": 1.7734284478473232, "language_loss": 0.82616132, "learning_rate": 2.8885632519404704e-06, "loss": 0.84939259, "num_input_tokens_seen": 66638175, "step": 3101, "time_per_iteration": 2.679410457611084 }, { "auxiliary_loss_clip": 0.01289231, "auxiliary_loss_mlp": 0.01031533, "balance_loss_clip": 1.05328918, "balance_loss_mlp": 1.02306318, "epoch": 0.37299344676246016, "flos": 25302048330240.0, "grad_norm": 2.2528523885958163, "language_loss": 0.76134902, "learning_rate": 2.8878653138496107e-06, "loss": 0.78455669, "num_input_tokens_seen": 66658670, "step": 3102, "time_per_iteration": 3.60315203666687 }, { "auxiliary_loss_clip": 0.01349964, "auxiliary_loss_mlp": 0.01029419, "balance_loss_clip": 1.04523492, "balance_loss_mlp": 1.02113676, "epoch": 0.37311368965309927, "flos": 23842531002240.0, "grad_norm": 2.7499521402208984, "language_loss": 0.76064837, "learning_rate": 2.8871672410702878e-06, "loss": 0.78444219, "num_input_tokens_seen": 66676030, "step": 3103, "time_per_iteration": 2.844857931137085 }, { "auxiliary_loss_clip": 0.01352721, "auxiliary_loss_mlp": 0.01030336, "balance_loss_clip": 1.05215454, "balance_loss_mlp": 1.02156234, "epoch": 0.3732339325437384, "flos": 25811943845760.0, "grad_norm": 1.6937147857160744, "language_loss": 0.81905711, "learning_rate": 2.8864690337084008e-06, "loss": 0.8428877, "num_input_tokens_seen": 66695305, "step": 3104, "time_per_iteration": 2.762549638748169 }, { "auxiliary_loss_clip": 0.01244234, "auxiliary_loss_mlp": 0.01029069, "balance_loss_clip": 1.05246103, "balance_loss_mlp": 1.0205965, "epoch": 0.37335417543437743, "flos": 26208433146240.0, "grad_norm": 1.7719854952291259, "language_loss": 0.78188145, "learning_rate": 2.885770691869866e-06, "loss": 0.80461454, "num_input_tokens_seen": 66716185, "step": 3105, "time_per_iteration": 2.6865594387054443 }, { "auxiliary_loss_clip": 0.0124102, "auxiliary_loss_mlp": 0.01030409, "balance_loss_clip": 1.05482113, "balance_loss_mlp": 1.02247226, "epoch": 0.37347441832501654, "flos": 24023879792640.0, "grad_norm": 2.3916393848429314, "language_loss": 0.75099933, "learning_rate": 2.8850722156606207e-06, "loss": 0.77371359, "num_input_tokens_seen": 66734575, "step": 3106, "time_per_iteration": 3.9893603324890137 }, { "auxiliary_loss_clip": 0.01238584, "auxiliary_loss_mlp": 0.01030229, "balance_loss_clip": 1.05217052, "balance_loss_mlp": 1.02233779, "epoch": 0.3735946612156556, "flos": 19714922409600.0, "grad_norm": 1.67742768756399, "language_loss": 0.67408931, "learning_rate": 2.8843736051866252e-06, "loss": 0.69677746, "num_input_tokens_seen": 66753500, "step": 3107, "time_per_iteration": 2.6693367958068848 }, { "auxiliary_loss_clip": 0.01389417, "auxiliary_loss_mlp": 0.02566978, "balance_loss_clip": 1.04869378, "balance_loss_mlp": 0.99999666, "epoch": 0.3737149041062947, "flos": 23039604334080.0, "grad_norm": 1.7957007859532768, "language_loss": 0.69501257, "learning_rate": 2.8836748605538557e-06, "loss": 0.73457646, "num_input_tokens_seen": 66775140, "step": 3108, "time_per_iteration": 2.9054551124572754 }, { "auxiliary_loss_clip": 0.01296032, "auxiliary_loss_mlp": 0.01029348, "balance_loss_clip": 1.05068398, "balance_loss_mlp": 1.02095544, "epoch": 0.3738351469969338, "flos": 34678108483200.0, "grad_norm": 3.5132068858955714, "language_loss": 0.63786203, "learning_rate": 2.882975981868313e-06, "loss": 0.66111577, "num_input_tokens_seen": 66795525, "step": 3109, "time_per_iteration": 2.8151938915252686 }, { "auxiliary_loss_clip": 0.01245718, "auxiliary_loss_mlp": 0.01033621, "balance_loss_clip": 1.05658841, "balance_loss_mlp": 1.02536607, "epoch": 0.3739553898875729, "flos": 43507967448960.0, "grad_norm": 2.3392718075339065, "language_loss": 0.68746674, "learning_rate": 2.882276969236016e-06, "loss": 0.71026015, "num_input_tokens_seen": 66816885, "step": 3110, "time_per_iteration": 2.8545894622802734 }, { "auxiliary_loss_clip": 0.0129079, "auxiliary_loss_mlp": 0.01035183, "balance_loss_clip": 1.05236673, "balance_loss_mlp": 1.02660584, "epoch": 0.374075632778212, "flos": 12856487448960.0, "grad_norm": 2.077524333029217, "language_loss": 0.76532888, "learning_rate": 2.881577822763005e-06, "loss": 0.78858864, "num_input_tokens_seen": 66834835, "step": 3111, "time_per_iteration": 3.7685089111328125 }, { "auxiliary_loss_clip": 0.01243097, "auxiliary_loss_mlp": 0.01025701, "balance_loss_clip": 1.05419338, "balance_loss_mlp": 1.01819086, "epoch": 0.3741958756688511, "flos": 26024031699840.0, "grad_norm": 2.040081417978842, "language_loss": 0.87721491, "learning_rate": 2.880878542555338e-06, "loss": 0.89990294, "num_input_tokens_seen": 66852600, "step": 3112, "time_per_iteration": 3.6105687618255615 }, { "auxiliary_loss_clip": 0.01197089, "auxiliary_loss_mlp": 0.01035102, "balance_loss_clip": 1.05690122, "balance_loss_mlp": 1.02688193, "epoch": 0.37431611855949015, "flos": 21433894652160.0, "grad_norm": 1.992005447087093, "language_loss": 0.79920161, "learning_rate": 2.8801791287190976e-06, "loss": 0.82152355, "num_input_tokens_seen": 66870595, "step": 3113, "time_per_iteration": 2.6409547328948975 }, { "auxiliary_loss_clip": 0.01247387, "auxiliary_loss_mlp": 0.01030107, "balance_loss_clip": 1.05537987, "balance_loss_mlp": 1.02176237, "epoch": 0.37443636145012926, "flos": 24207096090240.0, "grad_norm": 2.62560756454543, "language_loss": 0.8585099, "learning_rate": 2.8794795813603817e-06, "loss": 0.88128489, "num_input_tokens_seen": 66886060, "step": 3114, "time_per_iteration": 2.659003734588623 }, { "auxiliary_loss_clip": 0.01154738, "auxiliary_loss_mlp": 0.01029889, "balance_loss_clip": 1.05449486, "balance_loss_mlp": 1.02178299, "epoch": 0.3745566043407684, "flos": 15378601841280.0, "grad_norm": 2.7349768325367254, "language_loss": 0.81646311, "learning_rate": 2.878779900585314e-06, "loss": 0.83830941, "num_input_tokens_seen": 66903900, "step": 3115, "time_per_iteration": 2.6319336891174316 }, { "auxiliary_loss_clip": 0.01303197, "auxiliary_loss_mlp": 0.01028438, "balance_loss_clip": 1.05706215, "balance_loss_mlp": 1.02076054, "epoch": 0.37467684723140743, "flos": 24608218245120.0, "grad_norm": 1.495998963800741, "language_loss": 0.75345874, "learning_rate": 2.8780800865000336e-06, "loss": 0.77677506, "num_input_tokens_seen": 66925210, "step": 3116, "time_per_iteration": 2.6904497146606445 }, { "auxiliary_loss_clip": 0.01138138, "auxiliary_loss_mlp": 0.01014216, "balance_loss_clip": 1.01648426, "balance_loss_mlp": 1.0128926, "epoch": 0.37479709012204654, "flos": 64377491610240.0, "grad_norm": 0.9824208484623803, "language_loss": 0.59165329, "learning_rate": 2.877380139210702e-06, "loss": 0.61317682, "num_input_tokens_seen": 66983880, "step": 3117, "time_per_iteration": 3.1853835582733154 }, { "auxiliary_loss_clip": 0.01251228, "auxiliary_loss_mlp": 0.01026337, "balance_loss_clip": 1.05288625, "balance_loss_mlp": 1.01813555, "epoch": 0.37491733301268565, "flos": 23803962773760.0, "grad_norm": 1.8029654920250584, "language_loss": 0.76583827, "learning_rate": 2.876680058823501e-06, "loss": 0.78861392, "num_input_tokens_seen": 67004280, "step": 3118, "time_per_iteration": 2.790881633758545 }, { "auxiliary_loss_clip": 0.01286962, "auxiliary_loss_mlp": 0.01026448, "balance_loss_clip": 1.05060422, "balance_loss_mlp": 1.01822233, "epoch": 0.3750375759033247, "flos": 32160950167680.0, "grad_norm": 2.0939301429068666, "language_loss": 0.6567471, "learning_rate": 2.8759798454446314e-06, "loss": 0.6798811, "num_input_tokens_seen": 67027445, "step": 3119, "time_per_iteration": 2.7498703002929688 }, { "auxiliary_loss_clip": 0.01246445, "auxiliary_loss_mlp": 0.01027214, "balance_loss_clip": 1.05435133, "balance_loss_mlp": 1.02020168, "epoch": 0.3751578187939638, "flos": 23367791923200.0, "grad_norm": 2.2192393752751434, "language_loss": 0.81658167, "learning_rate": 2.8752794991803173e-06, "loss": 0.83931828, "num_input_tokens_seen": 67045130, "step": 3120, "time_per_iteration": 2.7061307430267334 }, { "auxiliary_loss_clip": 0.01292204, "auxiliary_loss_mlp": 0.01024919, "balance_loss_clip": 1.05344963, "balance_loss_mlp": 1.01771259, "epoch": 0.37527806168460287, "flos": 14605731878400.0, "grad_norm": 2.0237174505027493, "language_loss": 0.75258696, "learning_rate": 2.8745790201367976e-06, "loss": 0.77575815, "num_input_tokens_seen": 67060885, "step": 3121, "time_per_iteration": 2.6293084621429443 }, { "auxiliary_loss_clip": 0.01196582, "auxiliary_loss_mlp": 0.01033841, "balance_loss_clip": 1.05515063, "balance_loss_mlp": 1.02533531, "epoch": 0.375398304575242, "flos": 26390823431040.0, "grad_norm": 2.718271538856479, "language_loss": 0.84327686, "learning_rate": 2.8738784084203373e-06, "loss": 0.8655811, "num_input_tokens_seen": 67080960, "step": 3122, "time_per_iteration": 2.660172462463379 }, { "auxiliary_loss_clip": 0.01286623, "auxiliary_loss_mlp": 0.01024802, "balance_loss_clip": 1.04661965, "balance_loss_mlp": 1.0174942, "epoch": 0.3755185474658811, "flos": 22236605838720.0, "grad_norm": 1.8275815639332953, "language_loss": 0.78874111, "learning_rate": 2.873177664137216e-06, "loss": 0.81185538, "num_input_tokens_seen": 67101890, "step": 3123, "time_per_iteration": 2.6609041690826416 }, { "auxiliary_loss_clip": 0.01335757, "auxiliary_loss_mlp": 0.01027148, "balance_loss_clip": 1.05184495, "balance_loss_mlp": 1.01953065, "epoch": 0.37563879035652015, "flos": 30812935633920.0, "grad_norm": 2.1523622686703865, "language_loss": 0.6914109, "learning_rate": 2.8724767873937384e-06, "loss": 0.71503991, "num_input_tokens_seen": 67126010, "step": 3124, "time_per_iteration": 2.814842462539673 }, { "auxiliary_loss_clip": 0.01198816, "auxiliary_loss_mlp": 0.01022119, "balance_loss_clip": 1.0533185, "balance_loss_mlp": 1.01488328, "epoch": 0.37575903324715926, "flos": 20773533064320.0, "grad_norm": 2.1283314274975895, "language_loss": 0.8745296, "learning_rate": 2.871775778296225e-06, "loss": 0.89673901, "num_input_tokens_seen": 67143100, "step": 3125, "time_per_iteration": 2.6530778408050537 }, { "auxiliary_loss_clip": 0.01252174, "auxiliary_loss_mlp": 0.01035372, "balance_loss_clip": 1.06002307, "balance_loss_mlp": 1.02680135, "epoch": 0.37587927613779837, "flos": 18697681244160.0, "grad_norm": 2.919054888179233, "language_loss": 0.78600943, "learning_rate": 2.8710746369510196e-06, "loss": 0.80888498, "num_input_tokens_seen": 67161085, "step": 3126, "time_per_iteration": 2.670923948287964 }, { "auxiliary_loss_clip": 0.01293505, "auxiliary_loss_mlp": 0.0102755, "balance_loss_clip": 1.05259824, "balance_loss_mlp": 1.01932502, "epoch": 0.3759995190284374, "flos": 13624796384640.0, "grad_norm": 2.5161274112388274, "language_loss": 0.83141422, "learning_rate": 2.8703733634644846e-06, "loss": 0.85462475, "num_input_tokens_seen": 67175840, "step": 3127, "time_per_iteration": 2.622692108154297 }, { "auxiliary_loss_clip": 0.01194211, "auxiliary_loss_mlp": 0.01027829, "balance_loss_clip": 1.05673993, "balance_loss_mlp": 1.02056909, "epoch": 0.37611976191907653, "flos": 20484847457280.0, "grad_norm": 1.6684415983593122, "language_loss": 0.7956351, "learning_rate": 2.869671957943002e-06, "loss": 0.81785548, "num_input_tokens_seen": 67194995, "step": 3128, "time_per_iteration": 3.59295654296875 }, { "auxiliary_loss_clip": 0.01296719, "auxiliary_loss_mlp": 0.01028124, "balance_loss_clip": 1.06042218, "balance_loss_mlp": 1.02008343, "epoch": 0.37624000480971564, "flos": 21141797253120.0, "grad_norm": 3.2419200957551055, "language_loss": 0.74479777, "learning_rate": 2.8689704204929747e-06, "loss": 0.7680462, "num_input_tokens_seen": 67214175, "step": 3129, "time_per_iteration": 2.682708740234375 }, { "auxiliary_loss_clip": 0.01195503, "auxiliary_loss_mlp": 0.0102328, "balance_loss_clip": 1.05721784, "balance_loss_mlp": 1.01538837, "epoch": 0.3763602477003547, "flos": 22564470205440.0, "grad_norm": 2.4302835105834197, "language_loss": 0.81308985, "learning_rate": 2.8682687512208253e-06, "loss": 0.83527762, "num_input_tokens_seen": 67233185, "step": 3130, "time_per_iteration": 2.611602306365967 }, { "auxiliary_loss_clip": 0.01249164, "auxiliary_loss_mlp": 0.01027808, "balance_loss_clip": 1.05345941, "balance_loss_mlp": 1.02016711, "epoch": 0.3764804905909938, "flos": 27526857851520.0, "grad_norm": 1.9192060572584764, "language_loss": 0.80653882, "learning_rate": 2.8675669502329972e-06, "loss": 0.82930851, "num_input_tokens_seen": 67254715, "step": 3131, "time_per_iteration": 2.7191195487976074 }, { "auxiliary_loss_clip": 0.01240999, "auxiliary_loss_mlp": 0.0256667, "balance_loss_clip": 1.05388892, "balance_loss_mlp": 1.00002372, "epoch": 0.3766007334816329, "flos": 22528092706560.0, "grad_norm": 2.3338040436484913, "language_loss": 0.85942608, "learning_rate": 2.866865017635952e-06, "loss": 0.89750278, "num_input_tokens_seen": 67272535, "step": 3132, "time_per_iteration": 2.643761157989502 }, { "auxiliary_loss_clip": 0.01347154, "auxiliary_loss_mlp": 0.01023275, "balance_loss_clip": 1.05744028, "balance_loss_mlp": 1.01581895, "epoch": 0.376720976372272, "flos": 25957166532480.0, "grad_norm": 1.891223283197314, "language_loss": 0.7935999, "learning_rate": 2.866162953536174e-06, "loss": 0.81730419, "num_input_tokens_seen": 67293505, "step": 3133, "time_per_iteration": 3.6602272987365723 }, { "auxiliary_loss_clip": 0.01294435, "auxiliary_loss_mlp": 0.02566697, "balance_loss_clip": 1.05204868, "balance_loss_mlp": 0.99994934, "epoch": 0.3768412192629111, "flos": 18041162411520.0, "grad_norm": 1.7917135653399359, "language_loss": 0.75303805, "learning_rate": 2.8654607580401634e-06, "loss": 0.79164934, "num_input_tokens_seen": 67313240, "step": 3134, "time_per_iteration": 2.7213101387023926 }, { "auxiliary_loss_clip": 0.01136867, "auxiliary_loss_mlp": 0.01005014, "balance_loss_clip": 1.01990485, "balance_loss_mlp": 1.0037322, "epoch": 0.3769614621535502, "flos": 62989472304000.0, "grad_norm": 0.882991127663658, "language_loss": 0.65172124, "learning_rate": 2.8647584312544446e-06, "loss": 0.67314005, "num_input_tokens_seen": 67378445, "step": 3135, "time_per_iteration": 3.2497687339782715 }, { "auxiliary_loss_clip": 0.01245077, "auxiliary_loss_mlp": 0.02566613, "balance_loss_clip": 1.05020785, "balance_loss_mlp": 1.00002694, "epoch": 0.37708170504418925, "flos": 23661685002240.0, "grad_norm": 1.7114961906062856, "language_loss": 0.8569665, "learning_rate": 2.864055973285559e-06, "loss": 0.89508343, "num_input_tokens_seen": 67400445, "step": 3136, "time_per_iteration": 2.8739869594573975 }, { "auxiliary_loss_clip": 0.01284257, "auxiliary_loss_mlp": 0.01024324, "balance_loss_clip": 1.05091476, "balance_loss_mlp": 1.01713896, "epoch": 0.37720194793482836, "flos": 24423170353920.0, "grad_norm": 1.9322845059485376, "language_loss": 0.86773467, "learning_rate": 2.8633533842400698e-06, "loss": 0.8908205, "num_input_tokens_seen": 67420645, "step": 3137, "time_per_iteration": 3.632336139678955 }, { "auxiliary_loss_clip": 0.01247968, "auxiliary_loss_mlp": 0.02573441, "balance_loss_clip": 1.05722773, "balance_loss_mlp": 1.00001621, "epoch": 0.3773221908254674, "flos": 20996502739200.0, "grad_norm": 1.8070396840476057, "language_loss": 0.77437145, "learning_rate": 2.862650664224558e-06, "loss": 0.81258559, "num_input_tokens_seen": 67439495, "step": 3138, "time_per_iteration": 3.566664457321167 }, { "auxiliary_loss_clip": 0.0124159, "auxiliary_loss_mlp": 0.01030329, "balance_loss_clip": 1.05780482, "balance_loss_mlp": 1.02297688, "epoch": 0.37744243371610653, "flos": 37631724958080.0, "grad_norm": 1.5839133227275146, "language_loss": 0.70062917, "learning_rate": 2.861947813345627e-06, "loss": 0.72334838, "num_input_tokens_seen": 67462195, "step": 3139, "time_per_iteration": 2.8184397220611572 }, { "auxiliary_loss_clip": 0.01196272, "auxiliary_loss_mlp": 0.02569363, "balance_loss_clip": 1.05697012, "balance_loss_mlp": 0.9999373, "epoch": 0.37756267660674564, "flos": 26140526484480.0, "grad_norm": 1.8436772044479168, "language_loss": 0.72758913, "learning_rate": 2.8612448317098974e-06, "loss": 0.7652455, "num_input_tokens_seen": 67482530, "step": 3140, "time_per_iteration": 2.655693292617798 }, { "auxiliary_loss_clip": 0.01340797, "auxiliary_loss_mlp": 0.0257037, "balance_loss_clip": 1.05040908, "balance_loss_mlp": 1.00009668, "epoch": 0.3776829194973847, "flos": 19427888828160.0, "grad_norm": 2.1037717597282524, "language_loss": 0.82816637, "learning_rate": 2.8605417194240114e-06, "loss": 0.86727804, "num_input_tokens_seen": 67500890, "step": 3141, "time_per_iteration": 2.8008902072906494 }, { "auxiliary_loss_clip": 0.01237236, "auxiliary_loss_mlp": 0.01029636, "balance_loss_clip": 1.05374956, "balance_loss_mlp": 1.0227282, "epoch": 0.3778031623880238, "flos": 17382309194880.0, "grad_norm": 2.030626725126595, "language_loss": 0.791673, "learning_rate": 2.8598384765946315e-06, "loss": 0.81434178, "num_input_tokens_seen": 67519545, "step": 3142, "time_per_iteration": 2.5639336109161377 }, { "auxiliary_loss_clip": 0.0119288, "auxiliary_loss_mlp": 0.01022158, "balance_loss_clip": 1.05391479, "balance_loss_mlp": 1.01538372, "epoch": 0.3779234052786629, "flos": 27125843437440.0, "grad_norm": 2.0577253569409284, "language_loss": 0.71877658, "learning_rate": 2.8591351033284377e-06, "loss": 0.74092698, "num_input_tokens_seen": 67539275, "step": 3143, "time_per_iteration": 2.647655487060547 }, { "auxiliary_loss_clip": 0.01150138, "auxiliary_loss_mlp": 0.01030346, "balance_loss_clip": 1.05262876, "balance_loss_mlp": 1.02271914, "epoch": 0.37804364816930197, "flos": 19682639061120.0, "grad_norm": 2.3815263613026723, "language_loss": 0.83949804, "learning_rate": 2.8584315997321325e-06, "loss": 0.86130291, "num_input_tokens_seen": 67558280, "step": 3144, "time_per_iteration": 2.629415273666382 }, { "auxiliary_loss_clip": 0.01194894, "auxiliary_loss_mlp": 0.01026905, "balance_loss_clip": 1.05504751, "balance_loss_mlp": 1.01924539, "epoch": 0.3781638910599411, "flos": 22702905221760.0, "grad_norm": 2.3906731748634713, "language_loss": 0.78226721, "learning_rate": 2.8577279659124356e-06, "loss": 0.8044852, "num_input_tokens_seen": 67575955, "step": 3145, "time_per_iteration": 2.585972547531128 }, { "auxiliary_loss_clip": 0.01241322, "auxiliary_loss_mlp": 0.01027107, "balance_loss_clip": 1.05213594, "balance_loss_mlp": 1.0201211, "epoch": 0.3782841339505802, "flos": 14647604158080.0, "grad_norm": 2.0346928515105276, "language_loss": 0.83364546, "learning_rate": 2.857024201976089e-06, "loss": 0.85632974, "num_input_tokens_seen": 67593515, "step": 3146, "time_per_iteration": 2.6354966163635254 }, { "auxiliary_loss_clip": 0.01290461, "auxiliary_loss_mlp": 0.01035953, "balance_loss_clip": 1.05247402, "balance_loss_mlp": 1.02793574, "epoch": 0.37840437684121925, "flos": 32818223185920.0, "grad_norm": 1.9085778929231576, "language_loss": 0.73371357, "learning_rate": 2.8563203080298516e-06, "loss": 0.75697768, "num_input_tokens_seen": 67614290, "step": 3147, "time_per_iteration": 2.7575082778930664 }, { "auxiliary_loss_clip": 0.01288771, "auxiliary_loss_mlp": 0.02567581, "balance_loss_clip": 1.05383587, "balance_loss_mlp": 1.00003767, "epoch": 0.37852461973185836, "flos": 18369206346240.0, "grad_norm": 2.156857685442955, "language_loss": 0.89474595, "learning_rate": 2.855616284180505e-06, "loss": 0.93330956, "num_input_tokens_seen": 67631340, "step": 3148, "time_per_iteration": 2.652336359024048 }, { "auxiliary_loss_clip": 0.01140252, "auxiliary_loss_mlp": 0.01002529, "balance_loss_clip": 1.01747847, "balance_loss_mlp": 1.0013907, "epoch": 0.37864486262249747, "flos": 59500680117120.0, "grad_norm": 0.8713301673635935, "language_loss": 0.6606859, "learning_rate": 2.8549121305348477e-06, "loss": 0.68211365, "num_input_tokens_seen": 67691125, "step": 3149, "time_per_iteration": 3.1922800540924072 }, { "auxiliary_loss_clip": 0.01245059, "auxiliary_loss_mlp": 0.01027958, "balance_loss_clip": 1.05466402, "balance_loss_mlp": 1.0207603, "epoch": 0.3787651055131365, "flos": 23363015414400.0, "grad_norm": 2.846549855291197, "language_loss": 0.83688384, "learning_rate": 2.8542078471997006e-06, "loss": 0.85961401, "num_input_tokens_seen": 67708740, "step": 3150, "time_per_iteration": 2.6453568935394287 }, { "auxiliary_loss_clip": 0.01242432, "auxiliary_loss_mlp": 0.01029806, "balance_loss_clip": 1.0529964, "balance_loss_mlp": 1.02259052, "epoch": 0.37888534840377563, "flos": 24601394661120.0, "grad_norm": 2.389372158834539, "language_loss": 0.7569443, "learning_rate": 2.8535034342819013e-06, "loss": 0.77966666, "num_input_tokens_seen": 67726150, "step": 3151, "time_per_iteration": 2.667666435241699 }, { "auxiliary_loss_clip": 0.01190215, "auxiliary_loss_mlp": 0.01029907, "balance_loss_clip": 1.05421507, "balance_loss_mlp": 1.02199745, "epoch": 0.37900559129441475, "flos": 23986891762560.0, "grad_norm": 1.5571050421978772, "language_loss": 0.72537053, "learning_rate": 2.85279889188831e-06, "loss": 0.74757171, "num_input_tokens_seen": 67746525, "step": 3152, "time_per_iteration": 2.6578457355499268 }, { "auxiliary_loss_clip": 0.01343089, "auxiliary_loss_mlp": 0.01026891, "balance_loss_clip": 1.0480082, "balance_loss_mlp": 1.01942253, "epoch": 0.3791258341850538, "flos": 24644667571200.0, "grad_norm": 2.155583056941778, "language_loss": 0.81233299, "learning_rate": 2.852094220125805e-06, "loss": 0.83603269, "num_input_tokens_seen": 67766035, "step": 3153, "time_per_iteration": 2.7316648960113525 }, { "auxiliary_loss_clip": 0.01241093, "auxiliary_loss_mlp": 0.01027548, "balance_loss_clip": 1.05580151, "balance_loss_mlp": 1.02024066, "epoch": 0.3792460770756929, "flos": 17420841509760.0, "grad_norm": 2.6476659752461997, "language_loss": 0.71359175, "learning_rate": 2.8513894191012846e-06, "loss": 0.73627818, "num_input_tokens_seen": 67785015, "step": 3154, "time_per_iteration": 3.544900894165039 }, { "auxiliary_loss_clip": 0.01194793, "auxiliary_loss_mlp": 0.01020546, "balance_loss_clip": 1.05603731, "balance_loss_mlp": 1.0133338, "epoch": 0.37936631996633197, "flos": 24206557386240.0, "grad_norm": 1.5302160615405733, "language_loss": 0.78800189, "learning_rate": 2.8506844889216664e-06, "loss": 0.81015533, "num_input_tokens_seen": 67804400, "step": 3155, "time_per_iteration": 2.6266186237335205 }, { "auxiliary_loss_clip": 0.01139036, "auxiliary_loss_mlp": 0.00999699, "balance_loss_clip": 1.02291369, "balance_loss_mlp": 0.99854296, "epoch": 0.3794865628569711, "flos": 70297114752000.0, "grad_norm": 0.8891434934520204, "language_loss": 0.62769395, "learning_rate": 2.849979429693887e-06, "loss": 0.64908135, "num_input_tokens_seen": 67865385, "step": 3156, "time_per_iteration": 3.250614881515503 }, { "auxiliary_loss_clip": 0.01191101, "auxiliary_loss_mlp": 0.01033087, "balance_loss_clip": 1.05354822, "balance_loss_mlp": 1.02493334, "epoch": 0.3796068057476102, "flos": 15779364860160.0, "grad_norm": 1.906592097760744, "language_loss": 0.74356413, "learning_rate": 2.8492742415249042e-06, "loss": 0.76580596, "num_input_tokens_seen": 67883030, "step": 3157, "time_per_iteration": 2.593364953994751 }, { "auxiliary_loss_clip": 0.01189002, "auxiliary_loss_mlp": 0.01025919, "balance_loss_clip": 1.05279267, "balance_loss_mlp": 1.01873112, "epoch": 0.37972704863824924, "flos": 25191694771200.0, "grad_norm": 1.8750817318375423, "language_loss": 0.76647824, "learning_rate": 2.848568924521694e-06, "loss": 0.78862745, "num_input_tokens_seen": 67903810, "step": 3158, "time_per_iteration": 3.5111873149871826 }, { "auxiliary_loss_clip": 0.01235449, "auxiliary_loss_mlp": 0.01028221, "balance_loss_clip": 1.04941595, "balance_loss_mlp": 1.02066946, "epoch": 0.37984729152888835, "flos": 26210372480640.0, "grad_norm": 1.8801025857005682, "language_loss": 0.73495674, "learning_rate": 2.8478634787912526e-06, "loss": 0.75759345, "num_input_tokens_seen": 67921865, "step": 3159, "time_per_iteration": 2.657728910446167 }, { "auxiliary_loss_clip": 0.01246022, "auxiliary_loss_mlp": 0.01031429, "balance_loss_clip": 1.05455279, "balance_loss_mlp": 1.02362061, "epoch": 0.37996753441952746, "flos": 25629302165760.0, "grad_norm": 2.1007527549646112, "language_loss": 0.76299155, "learning_rate": 2.847157904440596e-06, "loss": 0.78576601, "num_input_tokens_seen": 67941595, "step": 3160, "time_per_iteration": 2.695172071456909 }, { "auxiliary_loss_clip": 0.01237163, "auxiliary_loss_mlp": 0.01024381, "balance_loss_clip": 1.05264187, "balance_loss_mlp": 1.01703167, "epoch": 0.3800877773101665, "flos": 20118414862080.0, "grad_norm": 1.6507716018862728, "language_loss": 0.73661309, "learning_rate": 2.846452201576759e-06, "loss": 0.75922853, "num_input_tokens_seen": 67960970, "step": 3161, "time_per_iteration": 2.5962162017822266 }, { "auxiliary_loss_clip": 0.01192515, "auxiliary_loss_mlp": 0.01005517, "balance_loss_clip": 1.01657391, "balance_loss_mlp": 1.00434291, "epoch": 0.38020802020080563, "flos": 63053608037760.0, "grad_norm": 0.9253613530839787, "language_loss": 0.62794882, "learning_rate": 2.845746370306795e-06, "loss": 0.64992917, "num_input_tokens_seen": 68026160, "step": 3162, "time_per_iteration": 3.3010714054107666 }, { "auxiliary_loss_clip": 0.01246312, "auxiliary_loss_mlp": 0.0102956, "balance_loss_clip": 1.05513322, "balance_loss_mlp": 1.02210641, "epoch": 0.38032826309144474, "flos": 21288420570240.0, "grad_norm": 3.0571342481077943, "language_loss": 0.78638422, "learning_rate": 2.84504041073778e-06, "loss": 0.80914295, "num_input_tokens_seen": 68044575, "step": 3163, "time_per_iteration": 3.5671546459198 }, { "auxiliary_loss_clip": 0.01286925, "auxiliary_loss_mlp": 0.01029437, "balance_loss_clip": 1.05010498, "balance_loss_mlp": 1.02189398, "epoch": 0.3804485059820838, "flos": 18954119416320.0, "grad_norm": 1.9625682482062767, "language_loss": 0.79253131, "learning_rate": 2.844334322976806e-06, "loss": 0.81569493, "num_input_tokens_seen": 68064790, "step": 3164, "time_per_iteration": 2.680466413497925 }, { "auxiliary_loss_clip": 0.0139497, "auxiliary_loss_mlp": 0.01022351, "balance_loss_clip": 1.04862821, "balance_loss_mlp": 1.01416183, "epoch": 0.3805687488727229, "flos": 21833759831040.0, "grad_norm": 1.7688213979720362, "language_loss": 0.83296978, "learning_rate": 2.8436281071309866e-06, "loss": 0.85714293, "num_input_tokens_seen": 68083330, "step": 3165, "time_per_iteration": 3.620453357696533 }, { "auxiliary_loss_clip": 0.01305444, "auxiliary_loss_mlp": 0.01000433, "balance_loss_clip": 1.01402974, "balance_loss_mlp": 0.99925917, "epoch": 0.380688991763362, "flos": 58546209968640.0, "grad_norm": 0.7280143145629807, "language_loss": 0.52977526, "learning_rate": 2.842921763307455e-06, "loss": 0.55283403, "num_input_tokens_seen": 68146140, "step": 3166, "time_per_iteration": 3.2630279064178467 }, { "auxiliary_loss_clip": 0.01289525, "auxiliary_loss_mlp": 0.01030154, "balance_loss_clip": 1.04938269, "balance_loss_mlp": 1.02252471, "epoch": 0.38080923465400107, "flos": 23799509487360.0, "grad_norm": 2.181580046708282, "language_loss": 0.8268044, "learning_rate": 2.842215291613361e-06, "loss": 0.85000122, "num_input_tokens_seen": 68164520, "step": 3167, "time_per_iteration": 2.700155019760132 }, { "auxiliary_loss_clip": 0.01392354, "auxiliary_loss_mlp": 0.01016399, "balance_loss_clip": 1.01641619, "balance_loss_mlp": 1.01520085, "epoch": 0.3809294775446402, "flos": 54969866380800.0, "grad_norm": 0.7948504083475938, "language_loss": 0.59228337, "learning_rate": 2.8415086921558774e-06, "loss": 0.61637086, "num_input_tokens_seen": 68227945, "step": 3168, "time_per_iteration": 3.562152624130249 }, { "auxiliary_loss_clip": 0.01282717, "auxiliary_loss_mlp": 0.01025213, "balance_loss_clip": 1.04609036, "balance_loss_mlp": 1.01791692, "epoch": 0.38104972043527924, "flos": 24643697904000.0, "grad_norm": 1.5653632446888952, "language_loss": 0.78756958, "learning_rate": 2.840801965042194e-06, "loss": 0.81064892, "num_input_tokens_seen": 68247405, "step": 3169, "time_per_iteration": 2.8943099975585938 }, { "auxiliary_loss_clip": 0.01286161, "auxiliary_loss_mlp": 0.01026806, "balance_loss_clip": 1.04767799, "balance_loss_mlp": 1.0193491, "epoch": 0.38116996332591835, "flos": 22856783086080.0, "grad_norm": 1.8089321043139683, "language_loss": 0.83736598, "learning_rate": 2.840095110379521e-06, "loss": 0.86049563, "num_input_tokens_seen": 68266925, "step": 3170, "time_per_iteration": 2.7105891704559326 }, { "auxiliary_loss_clip": 0.01282124, "auxiliary_loss_mlp": 0.01001914, "balance_loss_clip": 1.01257181, "balance_loss_mlp": 1.00088882, "epoch": 0.38129020621655746, "flos": 60836160804480.0, "grad_norm": 0.7369701736921025, "language_loss": 0.53813159, "learning_rate": 2.8393881282750884e-06, "loss": 0.56097198, "num_input_tokens_seen": 68329755, "step": 3171, "time_per_iteration": 3.202564239501953 }, { "auxiliary_loss_clip": 0.01289374, "auxiliary_loss_mlp": 0.01027571, "balance_loss_clip": 1.05140042, "balance_loss_mlp": 1.01919639, "epoch": 0.3814104491071965, "flos": 21648101408640.0, "grad_norm": 2.435873721968084, "language_loss": 0.78688079, "learning_rate": 2.838681018836144e-06, "loss": 0.81005019, "num_input_tokens_seen": 68347075, "step": 3172, "time_per_iteration": 2.70814847946167 }, { "auxiliary_loss_clip": 0.01342616, "auxiliary_loss_mlp": 0.0256389, "balance_loss_clip": 1.04894412, "balance_loss_mlp": 1.000072, "epoch": 0.3815306919978356, "flos": 19099090707840.0, "grad_norm": 2.387154467329958, "language_loss": 0.78927106, "learning_rate": 2.837973782169955e-06, "loss": 0.82833612, "num_input_tokens_seen": 68365450, "step": 3173, "time_per_iteration": 2.700079917907715 }, { "auxiliary_loss_clip": 0.01080288, "auxiliary_loss_mlp": 0.01004168, "balance_loss_clip": 1.01343918, "balance_loss_mlp": 1.00310135, "epoch": 0.38165093488847474, "flos": 67067918156160.0, "grad_norm": 0.8076850725381319, "language_loss": 0.59133506, "learning_rate": 2.8372664183838096e-06, "loss": 0.61217964, "num_input_tokens_seen": 68428470, "step": 3174, "time_per_iteration": 3.2572011947631836 }, { "auxiliary_loss_clip": 0.01188643, "auxiliary_loss_mlp": 0.01031497, "balance_loss_clip": 1.053684, "balance_loss_mlp": 1.02419591, "epoch": 0.3817711777791138, "flos": 22341105480960.0, "grad_norm": 2.3499340145079692, "language_loss": 0.69040054, "learning_rate": 2.836558927585015e-06, "loss": 0.7126019, "num_input_tokens_seen": 68445440, "step": 3175, "time_per_iteration": 2.6211469173431396 }, { "auxiliary_loss_clip": 0.01243052, "auxiliary_loss_mlp": 0.0103268, "balance_loss_clip": 1.05240822, "balance_loss_mlp": 1.02500236, "epoch": 0.3818914206697529, "flos": 22820621068800.0, "grad_norm": 2.017119681113495, "language_loss": 0.82648933, "learning_rate": 2.8358513098808957e-06, "loss": 0.84924656, "num_input_tokens_seen": 68465755, "step": 3176, "time_per_iteration": 2.6547610759735107 }, { "auxiliary_loss_clip": 0.01375568, "auxiliary_loss_mlp": 0.01036461, "balance_loss_clip": 1.045856, "balance_loss_mlp": 1.02876043, "epoch": 0.382011663560392, "flos": 24386074583040.0, "grad_norm": 2.0056659887467285, "language_loss": 0.77119225, "learning_rate": 2.835143565378798e-06, "loss": 0.79531252, "num_input_tokens_seen": 68486220, "step": 3177, "time_per_iteration": 2.7676126956939697 }, { "auxiliary_loss_clip": 0.0142722, "auxiliary_loss_mlp": 0.01025163, "balance_loss_clip": 1.04347134, "balance_loss_mlp": 1.01691389, "epoch": 0.38213190645103107, "flos": 21981568296960.0, "grad_norm": 3.2109017802435416, "language_loss": 0.7795769, "learning_rate": 2.8344356941860847e-06, "loss": 0.80410075, "num_input_tokens_seen": 68505850, "step": 3178, "time_per_iteration": 2.815690517425537 }, { "auxiliary_loss_clip": 0.01339383, "auxiliary_loss_mlp": 0.01026191, "balance_loss_clip": 1.04915679, "balance_loss_mlp": 1.0187968, "epoch": 0.3822521493416702, "flos": 35516945773440.0, "grad_norm": 2.3163389089422575, "language_loss": 0.66367561, "learning_rate": 2.8337276964101403e-06, "loss": 0.68733138, "num_input_tokens_seen": 68526290, "step": 3179, "time_per_iteration": 4.167910575866699 }, { "auxiliary_loss_clip": 0.01243175, "auxiliary_loss_mlp": 0.01024548, "balance_loss_clip": 1.05286872, "balance_loss_mlp": 1.01700199, "epoch": 0.3823723922323093, "flos": 21069904181760.0, "grad_norm": 2.218069588101566, "language_loss": 0.76351023, "learning_rate": 2.833019572158367e-06, "loss": 0.78618753, "num_input_tokens_seen": 68544725, "step": 3180, "time_per_iteration": 2.72501540184021 }, { "auxiliary_loss_clip": 0.012872, "auxiliary_loss_mlp": 0.0102965, "balance_loss_clip": 1.05295956, "balance_loss_mlp": 1.02226543, "epoch": 0.38249263512294834, "flos": 19789149864960.0, "grad_norm": 1.8676258820512213, "language_loss": 0.80120307, "learning_rate": 2.8323113215381872e-06, "loss": 0.82437158, "num_input_tokens_seen": 68563070, "step": 3181, "time_per_iteration": 2.6754629611968994 }, { "auxiliary_loss_clip": 0.01336588, "auxiliary_loss_mlp": 0.0102881, "balance_loss_clip": 1.04925621, "balance_loss_mlp": 1.02079916, "epoch": 0.38261287801358745, "flos": 21433930565760.0, "grad_norm": 2.1046325976184925, "language_loss": 0.76229143, "learning_rate": 2.831602944657042e-06, "loss": 0.78594542, "num_input_tokens_seen": 68581150, "step": 3182, "time_per_iteration": 2.750411033630371 }, { "auxiliary_loss_clip": 0.01199357, "auxiliary_loss_mlp": 0.01025543, "balance_loss_clip": 1.04874194, "balance_loss_mlp": 1.01834893, "epoch": 0.38273312090422656, "flos": 21981568296960.0, "grad_norm": 2.444612505503567, "language_loss": 0.74689561, "learning_rate": 2.830894441622391e-06, "loss": 0.76914465, "num_input_tokens_seen": 68597800, "step": 3183, "time_per_iteration": 2.670667886734009 }, { "auxiliary_loss_clip": 0.01342723, "auxiliary_loss_mlp": 0.02566361, "balance_loss_clip": 1.04719985, "balance_loss_mlp": 1.00003874, "epoch": 0.3828533637948656, "flos": 24790895838720.0, "grad_norm": 2.1048293625481858, "language_loss": 0.79946673, "learning_rate": 2.8301858125417134e-06, "loss": 0.83855754, "num_input_tokens_seen": 68617640, "step": 3184, "time_per_iteration": 3.7010602951049805 }, { "auxiliary_loss_clip": 0.01296825, "auxiliary_loss_mlp": 0.01030239, "balance_loss_clip": 1.05452895, "balance_loss_mlp": 1.02311897, "epoch": 0.38297360668550473, "flos": 22455445449600.0, "grad_norm": 1.8896965525497378, "language_loss": 0.73990285, "learning_rate": 2.8294770575225082e-06, "loss": 0.76317346, "num_input_tokens_seen": 68637770, "step": 3185, "time_per_iteration": 2.729907989501953 }, { "auxiliary_loss_clip": 0.01240217, "auxiliary_loss_mlp": 0.01026129, "balance_loss_clip": 1.05448651, "balance_loss_mlp": 1.01877367, "epoch": 0.3830938495761438, "flos": 24896903852160.0, "grad_norm": 1.9077463357339228, "language_loss": 0.84123486, "learning_rate": 2.828768176672293e-06, "loss": 0.86389834, "num_input_tokens_seen": 68656885, "step": 3186, "time_per_iteration": 2.658665657043457 }, { "auxiliary_loss_clip": 0.01340684, "auxiliary_loss_mlp": 0.01025426, "balance_loss_clip": 1.04673958, "balance_loss_mlp": 1.01709306, "epoch": 0.3832140924667829, "flos": 33036236784000.0, "grad_norm": 2.3699117653438653, "language_loss": 0.71721745, "learning_rate": 2.8280591700986044e-06, "loss": 0.74087858, "num_input_tokens_seen": 68678750, "step": 3187, "time_per_iteration": 2.838984966278076 }, { "auxiliary_loss_clip": 0.01294083, "auxiliary_loss_mlp": 0.01022482, "balance_loss_clip": 1.05011737, "balance_loss_mlp": 1.01508546, "epoch": 0.383334335357422, "flos": 31903721896320.0, "grad_norm": 2.007625815735119, "language_loss": 0.75242102, "learning_rate": 2.827350037908999e-06, "loss": 0.77558672, "num_input_tokens_seen": 68698190, "step": 3188, "time_per_iteration": 2.7596468925476074 }, { "auxiliary_loss_clip": 0.01345365, "auxiliary_loss_mlp": 0.01026995, "balance_loss_clip": 1.04950881, "balance_loss_mlp": 1.01879978, "epoch": 0.38345457824806106, "flos": 19791915212160.0, "grad_norm": 2.2665438358439736, "language_loss": 0.79246759, "learning_rate": 2.8266407802110496e-06, "loss": 0.81619114, "num_input_tokens_seen": 68716445, "step": 3189, "time_per_iteration": 3.555495262145996 }, { "auxiliary_loss_clip": 0.01493872, "auxiliary_loss_mlp": 0.0102888, "balance_loss_clip": 1.04388404, "balance_loss_mlp": 1.02104747, "epoch": 0.3835748211387002, "flos": 22419391173120.0, "grad_norm": 2.087356685251463, "language_loss": 0.76125646, "learning_rate": 2.8259313971123515e-06, "loss": 0.78648394, "num_input_tokens_seen": 68737565, "step": 3190, "time_per_iteration": 2.9187867641448975 }, { "auxiliary_loss_clip": 0.01236427, "auxiliary_loss_mlp": 0.01027855, "balance_loss_clip": 1.05364084, "balance_loss_mlp": 1.02070856, "epoch": 0.3836950640293393, "flos": 25118436983040.0, "grad_norm": 1.5921029842738508, "language_loss": 0.78417611, "learning_rate": 2.8252218887205166e-06, "loss": 0.8068189, "num_input_tokens_seen": 68758255, "step": 3191, "time_per_iteration": 3.6375017166137695 }, { "auxiliary_loss_clip": 0.01439828, "auxiliary_loss_mlp": 0.01028221, "balance_loss_clip": 1.04692531, "balance_loss_mlp": 1.02045441, "epoch": 0.38381530691997834, "flos": 21799213925760.0, "grad_norm": 1.7134412647851787, "language_loss": 0.80389321, "learning_rate": 2.824512255143178e-06, "loss": 0.82857364, "num_input_tokens_seen": 68777490, "step": 3192, "time_per_iteration": 2.841970205307007 }, { "auxiliary_loss_clip": 0.01343962, "auxiliary_loss_mlp": 0.01030918, "balance_loss_clip": 1.0502336, "balance_loss_mlp": 1.02369988, "epoch": 0.38393554981061745, "flos": 21252689516160.0, "grad_norm": 1.7669549508463924, "language_loss": 0.79345584, "learning_rate": 2.8238024964879855e-06, "loss": 0.81720459, "num_input_tokens_seen": 68798385, "step": 3193, "time_per_iteration": 2.725078821182251 }, { "auxiliary_loss_clip": 0.01196757, "auxiliary_loss_mlp": 0.0103506, "balance_loss_clip": 1.05695057, "balance_loss_mlp": 1.02680147, "epoch": 0.38405579270125656, "flos": 17019360218880.0, "grad_norm": 2.2659316431188934, "language_loss": 0.77323675, "learning_rate": 2.8230926128626095e-06, "loss": 0.795555, "num_input_tokens_seen": 68816880, "step": 3194, "time_per_iteration": 2.6614410877227783 }, { "auxiliary_loss_clip": 0.01284694, "auxiliary_loss_mlp": 0.01031034, "balance_loss_clip": 1.04812205, "balance_loss_mlp": 1.02357101, "epoch": 0.3841760355918956, "flos": 21835375943040.0, "grad_norm": 1.885627670617992, "language_loss": 0.79964149, "learning_rate": 2.822382604374738e-06, "loss": 0.82279867, "num_input_tokens_seen": 68835805, "step": 3195, "time_per_iteration": 2.7607107162475586 }, { "auxiliary_loss_clip": 0.01290601, "auxiliary_loss_mlp": 0.01031694, "balance_loss_clip": 1.05413294, "balance_loss_mlp": 1.02466619, "epoch": 0.3842962784825347, "flos": 25915114684800.0, "grad_norm": 2.0179969391048136, "language_loss": 0.66190314, "learning_rate": 2.8216724711320793e-06, "loss": 0.68512613, "num_input_tokens_seen": 68854930, "step": 3196, "time_per_iteration": 2.758011817932129 }, { "auxiliary_loss_clip": 0.01187945, "auxiliary_loss_mlp": 0.02560195, "balance_loss_clip": 1.05247188, "balance_loss_mlp": 1.0001148, "epoch": 0.38441652137317384, "flos": 25337492075520.0, "grad_norm": 1.5366082694822143, "language_loss": 0.79592776, "learning_rate": 2.820962213242361e-06, "loss": 0.83340919, "num_input_tokens_seen": 68874260, "step": 3197, "time_per_iteration": 2.697155714035034 }, { "auxiliary_loss_clip": 0.01246076, "auxiliary_loss_mlp": 0.01032285, "balance_loss_clip": 1.0582273, "balance_loss_mlp": 1.02448273, "epoch": 0.3845367642638129, "flos": 18113486446080.0, "grad_norm": 2.2253312947397816, "language_loss": 0.83891201, "learning_rate": 2.8202518308133264e-06, "loss": 0.86169565, "num_input_tokens_seen": 68891535, "step": 3198, "time_per_iteration": 2.6350111961364746 }, { "auxiliary_loss_clip": 0.01194055, "auxiliary_loss_mlp": 0.01028879, "balance_loss_clip": 1.05469728, "balance_loss_mlp": 1.02119029, "epoch": 0.384657007154452, "flos": 25228395492480.0, "grad_norm": 2.0348568967475336, "language_loss": 0.7346198, "learning_rate": 2.8195413239527426e-06, "loss": 0.75684911, "num_input_tokens_seen": 68911275, "step": 3199, "time_per_iteration": 2.6504735946655273 }, { "auxiliary_loss_clip": 0.01238759, "auxiliary_loss_mlp": 0.0103213, "balance_loss_clip": 1.05097151, "balance_loss_mlp": 1.02464974, "epoch": 0.38477725004509106, "flos": 19865855358720.0, "grad_norm": 1.9502649758273667, "language_loss": 0.80824465, "learning_rate": 2.8188306927683906e-06, "loss": 0.83095354, "num_input_tokens_seen": 68930745, "step": 3200, "time_per_iteration": 2.681459665298462 }, { "auxiliary_loss_clip": 0.01292597, "auxiliary_loss_mlp": 0.01027288, "balance_loss_clip": 1.05309165, "balance_loss_mlp": 1.02041233, "epoch": 0.38489749293573017, "flos": 18259391491200.0, "grad_norm": 2.333515940643704, "language_loss": 0.74375331, "learning_rate": 2.818119937368074e-06, "loss": 0.76695216, "num_input_tokens_seen": 68949380, "step": 3201, "time_per_iteration": 2.7107772827148438 }, { "auxiliary_loss_clip": 0.01246799, "auxiliary_loss_mlp": 0.01028707, "balance_loss_clip": 1.05289292, "balance_loss_mlp": 1.02066064, "epoch": 0.3850177358263693, "flos": 24389163152640.0, "grad_norm": 3.35376078311206, "language_loss": 0.65412688, "learning_rate": 2.817409057859613e-06, "loss": 0.67688197, "num_input_tokens_seen": 68968370, "step": 3202, "time_per_iteration": 2.6618170738220215 }, { "auxiliary_loss_clip": 0.01381393, "auxiliary_loss_mlp": 0.01025443, "balance_loss_clip": 1.04793668, "balance_loss_mlp": 1.01803458, "epoch": 0.38513797871700833, "flos": 17671533505920.0, "grad_norm": 2.8123818358850743, "language_loss": 0.79368377, "learning_rate": 2.8166980543508482e-06, "loss": 0.81775212, "num_input_tokens_seen": 68984260, "step": 3203, "time_per_iteration": 2.7937402725219727 }, { "auxiliary_loss_clip": 0.0119563, "auxiliary_loss_mlp": 0.01025068, "balance_loss_clip": 1.05710769, "balance_loss_mlp": 1.01743841, "epoch": 0.38525822160764744, "flos": 25739583897600.0, "grad_norm": 2.6610465774360588, "language_loss": 0.79906356, "learning_rate": 2.815986926949638e-06, "loss": 0.82127047, "num_input_tokens_seen": 69002760, "step": 3204, "time_per_iteration": 2.6518466472625732 }, { "auxiliary_loss_clip": 0.01235144, "auxiliary_loss_mlp": 0.01026684, "balance_loss_clip": 1.05290103, "balance_loss_mlp": 1.01945055, "epoch": 0.38537846449828655, "flos": 20193647898240.0, "grad_norm": 1.9014123710653381, "language_loss": 0.80580133, "learning_rate": 2.8152756757638597e-06, "loss": 0.82841963, "num_input_tokens_seen": 69021260, "step": 3205, "time_per_iteration": 3.7779510021209717 }, { "auxiliary_loss_clip": 0.01238323, "auxiliary_loss_mlp": 0.01027797, "balance_loss_clip": 1.05438137, "balance_loss_mlp": 1.02071261, "epoch": 0.3854987073889256, "flos": 23039352938880.0, "grad_norm": 2.2425325722910605, "language_loss": 0.84511995, "learning_rate": 2.8145643009014093e-06, "loss": 0.8677811, "num_input_tokens_seen": 69039755, "step": 3206, "time_per_iteration": 2.7388932704925537 }, { "auxiliary_loss_clip": 0.01241187, "auxiliary_loss_mlp": 0.01025975, "balance_loss_clip": 1.05197215, "balance_loss_mlp": 1.01931143, "epoch": 0.3856189502795647, "flos": 20190631155840.0, "grad_norm": 1.9277421805842203, "language_loss": 0.78926289, "learning_rate": 2.813852802470202e-06, "loss": 0.81193453, "num_input_tokens_seen": 69057650, "step": 3207, "time_per_iteration": 2.6069629192352295 }, { "auxiliary_loss_clip": 0.01288228, "auxiliary_loss_mlp": 0.01028695, "balance_loss_clip": 1.05199194, "balance_loss_mlp": 1.02111292, "epoch": 0.38573919317020383, "flos": 25702631781120.0, "grad_norm": 2.368439226755666, "language_loss": 0.72794861, "learning_rate": 2.8131411805781717e-06, "loss": 0.75111783, "num_input_tokens_seen": 69077775, "step": 3208, "time_per_iteration": 2.7361230850219727 }, { "auxiliary_loss_clip": 0.01292226, "auxiliary_loss_mlp": 0.01028807, "balance_loss_clip": 1.05504155, "balance_loss_mlp": 1.02155352, "epoch": 0.3858594360608429, "flos": 29821405628160.0, "grad_norm": 3.066041251493373, "language_loss": 0.6449573, "learning_rate": 2.8124294353332707e-06, "loss": 0.66816759, "num_input_tokens_seen": 69096450, "step": 3209, "time_per_iteration": 2.7139575481414795 }, { "auxiliary_loss_clip": 0.0134421, "auxiliary_loss_mlp": 0.01023845, "balance_loss_clip": 1.05145121, "balance_loss_mlp": 1.01647162, "epoch": 0.385979678951482, "flos": 24790428961920.0, "grad_norm": 2.0186482145999753, "language_loss": 0.77494013, "learning_rate": 2.8117175668434713e-06, "loss": 0.7986207, "num_input_tokens_seen": 69116110, "step": 3210, "time_per_iteration": 3.651909351348877 }, { "auxiliary_loss_clip": 0.01194626, "auxiliary_loss_mlp": 0.01026257, "balance_loss_clip": 1.05617881, "balance_loss_mlp": 1.0187645, "epoch": 0.3860999218421211, "flos": 21287881866240.0, "grad_norm": 2.661388110076216, "language_loss": 0.70292091, "learning_rate": 2.811005575216762e-06, "loss": 0.72512978, "num_input_tokens_seen": 69134825, "step": 3211, "time_per_iteration": 2.639404535293579 }, { "auxiliary_loss_clip": 0.01343484, "auxiliary_loss_mlp": 0.01029298, "balance_loss_clip": 1.05228531, "balance_loss_mlp": 1.02223229, "epoch": 0.38622016473276016, "flos": 24536720223360.0, "grad_norm": 1.514788989628463, "language_loss": 0.79061633, "learning_rate": 2.8102934605611513e-06, "loss": 0.81434417, "num_input_tokens_seen": 69156460, "step": 3212, "time_per_iteration": 2.7515461444854736 }, { "auxiliary_loss_clip": 0.01295561, "auxiliary_loss_mlp": 0.0102938, "balance_loss_clip": 1.05495358, "balance_loss_mlp": 1.02256763, "epoch": 0.3863404076233993, "flos": 20558212986240.0, "grad_norm": 2.382460382579082, "language_loss": 0.67431402, "learning_rate": 2.8095812229846665e-06, "loss": 0.69756347, "num_input_tokens_seen": 69176420, "step": 3213, "time_per_iteration": 2.7034125328063965 }, { "auxiliary_loss_clip": 0.01291638, "auxiliary_loss_mlp": 0.01027512, "balance_loss_clip": 1.05217886, "balance_loss_mlp": 1.01962638, "epoch": 0.3864606505140384, "flos": 22346277039360.0, "grad_norm": 2.8929859322960447, "language_loss": 0.691365, "learning_rate": 2.808868862595355e-06, "loss": 0.71455657, "num_input_tokens_seen": 69196665, "step": 3214, "time_per_iteration": 2.7086334228515625 }, { "auxiliary_loss_clip": 0.01244961, "auxiliary_loss_mlp": 0.01029703, "balance_loss_clip": 1.05416954, "balance_loss_mlp": 1.02228808, "epoch": 0.38658089340467744, "flos": 25703601448320.0, "grad_norm": 2.0045113991927286, "language_loss": 0.79764915, "learning_rate": 2.8081563795012795e-06, "loss": 0.82039571, "num_input_tokens_seen": 69216290, "step": 3215, "time_per_iteration": 3.599130153656006 }, { "auxiliary_loss_clip": 0.01299274, "auxiliary_loss_mlp": 0.01024729, "balance_loss_clip": 1.05186558, "balance_loss_mlp": 1.01747477, "epoch": 0.38670113629531655, "flos": 33802534558080.0, "grad_norm": 1.9700562060465656, "language_loss": 0.73885095, "learning_rate": 2.807443773810524e-06, "loss": 0.76209098, "num_input_tokens_seen": 69237550, "step": 3216, "time_per_iteration": 3.763050079345703 }, { "auxiliary_loss_clip": 0.01342339, "auxiliary_loss_mlp": 0.01030195, "balance_loss_clip": 1.05544972, "balance_loss_mlp": 1.0224551, "epoch": 0.3868213791859556, "flos": 23331522165120.0, "grad_norm": 1.890494527481991, "language_loss": 0.89468515, "learning_rate": 2.80673104563119e-06, "loss": 0.91841054, "num_input_tokens_seen": 69258175, "step": 3217, "time_per_iteration": 2.7416720390319824 }, { "auxiliary_loss_clip": 0.01236831, "auxiliary_loss_mlp": 0.01028427, "balance_loss_clip": 1.05370641, "balance_loss_mlp": 1.02168012, "epoch": 0.3869416220765947, "flos": 18441530380800.0, "grad_norm": 2.219466106222124, "language_loss": 0.79071593, "learning_rate": 2.8060181950713976e-06, "loss": 0.81336856, "num_input_tokens_seen": 69274965, "step": 3218, "time_per_iteration": 2.666572332382202 }, { "auxiliary_loss_clip": 0.01343013, "auxiliary_loss_mlp": 0.01031907, "balance_loss_clip": 1.0496999, "balance_loss_mlp": 1.02387881, "epoch": 0.3870618649672338, "flos": 15632992938240.0, "grad_norm": 1.959692364203572, "language_loss": 0.80981839, "learning_rate": 2.805305222239286e-06, "loss": 0.83356762, "num_input_tokens_seen": 69292220, "step": 3219, "time_per_iteration": 2.6673171520233154 }, { "auxiliary_loss_clip": 0.01288732, "auxiliary_loss_mlp": 0.01036498, "balance_loss_clip": 1.05323219, "balance_loss_mlp": 1.02911329, "epoch": 0.3871821078578729, "flos": 23513804709120.0, "grad_norm": 1.7857305138758812, "language_loss": 0.73715317, "learning_rate": 2.8045921272430118e-06, "loss": 0.76040554, "num_input_tokens_seen": 69311900, "step": 3220, "time_per_iteration": 2.724794387817383 }, { "auxiliary_loss_clip": 0.0124987, "auxiliary_loss_mlp": 0.01028425, "balance_loss_clip": 1.05542636, "balance_loss_mlp": 1.02035427, "epoch": 0.387302350748512, "flos": 17778259791360.0, "grad_norm": 2.258988464885024, "language_loss": 0.76675713, "learning_rate": 2.803878910190753e-06, "loss": 0.78954005, "num_input_tokens_seen": 69328820, "step": 3221, "time_per_iteration": 2.6279170513153076 }, { "auxiliary_loss_clip": 0.01245741, "auxiliary_loss_mlp": 0.01030459, "balance_loss_clip": 1.05298054, "balance_loss_mlp": 1.02315712, "epoch": 0.3874225936391511, "flos": 11503409097600.0, "grad_norm": 3.4873468683967017, "language_loss": 0.82270277, "learning_rate": 2.8031655711907017e-06, "loss": 0.84546477, "num_input_tokens_seen": 69342525, "step": 3222, "time_per_iteration": 2.6165103912353516 }, { "auxiliary_loss_clip": 0.01241672, "auxiliary_loss_mlp": 0.01027482, "balance_loss_clip": 1.0554235, "balance_loss_mlp": 1.02042747, "epoch": 0.38754283652979016, "flos": 21945154884480.0, "grad_norm": 2.0727043793815376, "language_loss": 0.80855596, "learning_rate": 2.8024521103510723e-06, "loss": 0.83124751, "num_input_tokens_seen": 69359295, "step": 3223, "time_per_iteration": 2.614391565322876 }, { "auxiliary_loss_clip": 0.01240246, "auxiliary_loss_mlp": 0.0102612, "balance_loss_clip": 1.05012906, "balance_loss_mlp": 1.01841283, "epoch": 0.38766307942042927, "flos": 21175984022400.0, "grad_norm": 1.7969597932684584, "language_loss": 0.75279438, "learning_rate": 2.8017385277800952e-06, "loss": 0.7754581, "num_input_tokens_seen": 69377650, "step": 3224, "time_per_iteration": 2.6493000984191895 }, { "auxiliary_loss_clip": 0.01344113, "auxiliary_loss_mlp": 0.01029551, "balance_loss_clip": 1.05318201, "balance_loss_mlp": 1.02177238, "epoch": 0.3877833223110684, "flos": 27417294391680.0, "grad_norm": 1.8299109062145495, "language_loss": 0.74913079, "learning_rate": 2.8010248235860213e-06, "loss": 0.77286744, "num_input_tokens_seen": 69397765, "step": 3225, "time_per_iteration": 2.7388575077056885 }, { "auxiliary_loss_clip": 0.01195545, "auxiliary_loss_mlp": 0.02508441, "balance_loss_clip": 1.01666796, "balance_loss_mlp": 1.00019217, "epoch": 0.38790356520170743, "flos": 64500019879680.0, "grad_norm": 0.8262551152289839, "language_loss": 0.62800068, "learning_rate": 2.8003109978771192e-06, "loss": 0.66504049, "num_input_tokens_seen": 69458930, "step": 3226, "time_per_iteration": 3.363452911376953 }, { "auxiliary_loss_clip": 0.01332464, "auxiliary_loss_mlp": 0.01023534, "balance_loss_clip": 1.04586148, "balance_loss_mlp": 1.01580334, "epoch": 0.38802380809234654, "flos": 22345415112960.0, "grad_norm": 2.3450719125577115, "language_loss": 0.7876904, "learning_rate": 2.799597050761674e-06, "loss": 0.81125039, "num_input_tokens_seen": 69475135, "step": 3227, "time_per_iteration": 2.770724058151245 }, { "auxiliary_loss_clip": 0.01195355, "auxiliary_loss_mlp": 0.01024126, "balance_loss_clip": 1.05622387, "balance_loss_mlp": 1.01680601, "epoch": 0.38814405098298566, "flos": 25261361199360.0, "grad_norm": 2.182797083423832, "language_loss": 0.79204786, "learning_rate": 2.7988829823479924e-06, "loss": 0.81424266, "num_input_tokens_seen": 69493525, "step": 3228, "time_per_iteration": 2.685530185699463 }, { "auxiliary_loss_clip": 0.0128642, "auxiliary_loss_mlp": 0.01024112, "balance_loss_clip": 1.05045068, "balance_loss_mlp": 1.0166111, "epoch": 0.3882642938736247, "flos": 18841180078080.0, "grad_norm": 2.0392095307584537, "language_loss": 0.64158261, "learning_rate": 2.7981687927443976e-06, "loss": 0.66468793, "num_input_tokens_seen": 69510325, "step": 3229, "time_per_iteration": 2.659245252609253 }, { "auxiliary_loss_clip": 0.01243529, "auxiliary_loss_mlp": 0.01032346, "balance_loss_clip": 1.05129993, "balance_loss_mlp": 1.02545834, "epoch": 0.3883845367642638, "flos": 21652806090240.0, "grad_norm": 2.6123069843899978, "language_loss": 0.85512352, "learning_rate": 2.797454482059231e-06, "loss": 0.87788236, "num_input_tokens_seen": 69530480, "step": 3230, "time_per_iteration": 2.627046585083008 }, { "auxiliary_loss_clip": 0.01197288, "auxiliary_loss_mlp": 0.01032262, "balance_loss_clip": 1.05824518, "balance_loss_mlp": 1.02448344, "epoch": 0.3885047796549029, "flos": 20557530627840.0, "grad_norm": 1.6440536467845568, "language_loss": 0.84521759, "learning_rate": 2.7967400504008537e-06, "loss": 0.86751312, "num_input_tokens_seen": 69549780, "step": 3231, "time_per_iteration": 3.52471923828125 }, { "auxiliary_loss_clip": 0.01295635, "auxiliary_loss_mlp": 0.01010229, "balance_loss_clip": 1.01599503, "balance_loss_mlp": 1.00901306, "epoch": 0.388625022545542, "flos": 64325491695360.0, "grad_norm": 0.7883924369784058, "language_loss": 0.57427609, "learning_rate": 2.7960254978776456e-06, "loss": 0.5973348, "num_input_tokens_seen": 69611870, "step": 3232, "time_per_iteration": 3.3015406131744385 }, { "auxiliary_loss_clip": 0.01197189, "auxiliary_loss_mlp": 0.01037807, "balance_loss_clip": 1.05738091, "balance_loss_mlp": 1.03017211, "epoch": 0.3887452654361811, "flos": 18113881495680.0, "grad_norm": 6.132917576270898, "language_loss": 0.81145716, "learning_rate": 2.7953108245980006e-06, "loss": 0.83380705, "num_input_tokens_seen": 69630385, "step": 3233, "time_per_iteration": 2.5764636993408203 }, { "auxiliary_loss_clip": 0.01288793, "auxiliary_loss_mlp": 0.0103201, "balance_loss_clip": 1.05521965, "balance_loss_mlp": 1.0246073, "epoch": 0.38886550832682015, "flos": 24975261371520.0, "grad_norm": 1.820929193586301, "language_loss": 0.73662913, "learning_rate": 2.7945960306703365e-06, "loss": 0.75983715, "num_input_tokens_seen": 69653370, "step": 3234, "time_per_iteration": 2.7453856468200684 }, { "auxiliary_loss_clip": 0.01244806, "auxiliary_loss_mlp": 0.01036278, "balance_loss_clip": 1.05298483, "balance_loss_mlp": 1.02861249, "epoch": 0.38898575121745926, "flos": 27199496275200.0, "grad_norm": 2.496728839163769, "language_loss": 0.65918499, "learning_rate": 2.7938811162030865e-06, "loss": 0.68199581, "num_input_tokens_seen": 69673635, "step": 3235, "time_per_iteration": 2.6843411922454834 }, { "auxiliary_loss_clip": 0.01237584, "auxiliary_loss_mlp": 0.01030193, "balance_loss_clip": 1.05346656, "balance_loss_mlp": 1.02338052, "epoch": 0.3891059941080984, "flos": 28763728727040.0, "grad_norm": 2.0002836117780696, "language_loss": 0.82325041, "learning_rate": 2.793166081304702e-06, "loss": 0.84592819, "num_input_tokens_seen": 69694130, "step": 3236, "time_per_iteration": 3.685251235961914 }, { "auxiliary_loss_clip": 0.0134442, "auxiliary_loss_mlp": 0.01025385, "balance_loss_clip": 1.04986787, "balance_loss_mlp": 1.01713014, "epoch": 0.38922623699873743, "flos": 22893447893760.0, "grad_norm": 1.7917907903666261, "language_loss": 0.82167423, "learning_rate": 2.7924509260836543e-06, "loss": 0.84537232, "num_input_tokens_seen": 69713255, "step": 3237, "time_per_iteration": 2.710801839828491 }, { "auxiliary_loss_clip": 0.01337431, "auxiliary_loss_mlp": 0.01027428, "balance_loss_clip": 1.04999185, "balance_loss_mlp": 1.02017438, "epoch": 0.38934647988937654, "flos": 19792418002560.0, "grad_norm": 1.6132480515168308, "language_loss": 0.68574744, "learning_rate": 2.791735650648431e-06, "loss": 0.70939606, "num_input_tokens_seen": 69732375, "step": 3238, "time_per_iteration": 2.7006843090057373 }, { "auxiliary_loss_clip": 0.01287716, "auxiliary_loss_mlp": 0.01031253, "balance_loss_clip": 1.05230761, "balance_loss_mlp": 1.02370143, "epoch": 0.38946672278001565, "flos": 19202081978880.0, "grad_norm": 7.616729826500581, "language_loss": 0.74479604, "learning_rate": 2.791020255107538e-06, "loss": 0.7679857, "num_input_tokens_seen": 69749745, "step": 3239, "time_per_iteration": 2.718636989593506 }, { "auxiliary_loss_clip": 0.0133828, "auxiliary_loss_mlp": 0.01025583, "balance_loss_clip": 1.047364, "balance_loss_mlp": 1.01800466, "epoch": 0.3895869656706547, "flos": 24936477661440.0, "grad_norm": 1.6151597629561283, "language_loss": 0.80834603, "learning_rate": 2.7903047395695023e-06, "loss": 0.83198464, "num_input_tokens_seen": 69769645, "step": 3240, "time_per_iteration": 2.769615888595581 }, { "auxiliary_loss_clip": 0.01240215, "auxiliary_loss_mlp": 0.02565014, "balance_loss_clip": 1.05616164, "balance_loss_mlp": 0.9999997, "epoch": 0.3897072085612938, "flos": 24133622820480.0, "grad_norm": 2.3811642799204447, "language_loss": 0.90666485, "learning_rate": 2.789589104142865e-06, "loss": 0.94471717, "num_input_tokens_seen": 69787270, "step": 3241, "time_per_iteration": 4.541808843612671 }, { "auxiliary_loss_clip": 0.0134909, "auxiliary_loss_mlp": 0.01030808, "balance_loss_clip": 1.05340004, "balance_loss_mlp": 1.02362561, "epoch": 0.3898274514519329, "flos": 17166342672000.0, "grad_norm": 6.37984128761456, "language_loss": 0.76654387, "learning_rate": 2.7888733489361895e-06, "loss": 0.79034281, "num_input_tokens_seen": 69805685, "step": 3242, "time_per_iteration": 2.8276987075805664 }, { "auxiliary_loss_clip": 0.01081052, "auxiliary_loss_mlp": 0.010031, "balance_loss_clip": 1.01441097, "balance_loss_mlp": 1.00215864, "epoch": 0.389947694342572, "flos": 66074807952000.0, "grad_norm": 0.7270605568843346, "language_loss": 0.58661026, "learning_rate": 2.788157474058054e-06, "loss": 0.6074518, "num_input_tokens_seen": 69867960, "step": 3243, "time_per_iteration": 3.394801378250122 }, { "auxiliary_loss_clip": 0.01189755, "auxiliary_loss_mlp": 0.01020571, "balance_loss_clip": 1.05400586, "balance_loss_mlp": 1.01364493, "epoch": 0.3900679372332111, "flos": 25740912700800.0, "grad_norm": 1.4868352608196058, "language_loss": 0.69914275, "learning_rate": 2.7874414796170555e-06, "loss": 0.721246, "num_input_tokens_seen": 69889450, "step": 3244, "time_per_iteration": 2.694288730621338 }, { "auxiliary_loss_clip": 0.01241961, "auxiliary_loss_mlp": 0.01025049, "balance_loss_clip": 1.05308616, "balance_loss_mlp": 1.01715112, "epoch": 0.3901881801238502, "flos": 11801611808640.0, "grad_norm": 3.3135797931948385, "language_loss": 0.83994007, "learning_rate": 2.7867253657218113e-06, "loss": 0.86261022, "num_input_tokens_seen": 69903340, "step": 3245, "time_per_iteration": 2.602369785308838 }, { "auxiliary_loss_clip": 0.01291273, "auxiliary_loss_mlp": 0.02566271, "balance_loss_clip": 1.05107164, "balance_loss_mlp": 1.00003672, "epoch": 0.39030842301448926, "flos": 27308951994240.0, "grad_norm": 1.8192435983661714, "language_loss": 0.73010677, "learning_rate": 2.7860091324809544e-06, "loss": 0.76868224, "num_input_tokens_seen": 69924400, "step": 3246, "time_per_iteration": 2.7641847133636475 }, { "auxiliary_loss_clip": 0.01237941, "auxiliary_loss_mlp": 0.01022254, "balance_loss_clip": 1.05500281, "balance_loss_mlp": 1.01494658, "epoch": 0.39042866590512837, "flos": 27163334257920.0, "grad_norm": 2.082126861995118, "language_loss": 0.81072557, "learning_rate": 2.7852927800031377e-06, "loss": 0.83332753, "num_input_tokens_seen": 69944565, "step": 3247, "time_per_iteration": 2.677333116531372 }, { "auxiliary_loss_clip": 0.01290247, "auxiliary_loss_mlp": 0.01027478, "balance_loss_clip": 1.05221939, "balance_loss_mlp": 1.02022123, "epoch": 0.3905489087957674, "flos": 29716115886720.0, "grad_norm": 1.8970322703627414, "language_loss": 0.83323359, "learning_rate": 2.7845763083970298e-06, "loss": 0.85641092, "num_input_tokens_seen": 69964965, "step": 3248, "time_per_iteration": 2.748837947845459 }, { "auxiliary_loss_clip": 0.01237732, "auxiliary_loss_mlp": 0.01028134, "balance_loss_clip": 1.05175149, "balance_loss_mlp": 1.02041554, "epoch": 0.39066915168640653, "flos": 24498618871680.0, "grad_norm": 1.9789146678909455, "language_loss": 0.82298541, "learning_rate": 2.7838597177713205e-06, "loss": 0.84564412, "num_input_tokens_seen": 69986055, "step": 3249, "time_per_iteration": 2.6625468730926514 }, { "auxiliary_loss_clip": 0.01441665, "auxiliary_loss_mlp": 0.01030491, "balance_loss_clip": 1.05465901, "balance_loss_mlp": 1.0231241, "epoch": 0.39078939457704565, "flos": 20558572122240.0, "grad_norm": 18.839958374051903, "language_loss": 0.73455989, "learning_rate": 2.7831430082347143e-06, "loss": 0.7592814, "num_input_tokens_seen": 70005260, "step": 3250, "time_per_iteration": 2.7900984287261963 }, { "auxiliary_loss_clip": 0.01242125, "auxiliary_loss_mlp": 0.0256167, "balance_loss_clip": 1.05367684, "balance_loss_mlp": 0.99998468, "epoch": 0.3909096374676847, "flos": 22783417557120.0, "grad_norm": 2.3685143466154495, "language_loss": 0.82234645, "learning_rate": 2.7824261798959373e-06, "loss": 0.8603844, "num_input_tokens_seen": 70023440, "step": 3251, "time_per_iteration": 2.7184128761291504 }, { "auxiliary_loss_clip": 0.01295221, "auxiliary_loss_mlp": 0.01034065, "balance_loss_clip": 1.04947734, "balance_loss_mlp": 1.02669168, "epoch": 0.3910298803583238, "flos": 23003119094400.0, "grad_norm": 1.960404090479542, "language_loss": 0.79358715, "learning_rate": 2.78170923286373e-06, "loss": 0.81687999, "num_input_tokens_seen": 70043040, "step": 3252, "time_per_iteration": 2.672183036804199 }, { "auxiliary_loss_clip": 0.01493884, "auxiliary_loss_mlp": 0.01030794, "balance_loss_clip": 1.05182815, "balance_loss_mlp": 1.02308106, "epoch": 0.3911501232489629, "flos": 24316264500480.0, "grad_norm": 2.186389661726091, "language_loss": 0.84016603, "learning_rate": 2.780992167246854e-06, "loss": 0.86541271, "num_input_tokens_seen": 70060565, "step": 3253, "time_per_iteration": 2.940718412399292 }, { "auxiliary_loss_clip": 0.01190805, "auxiliary_loss_mlp": 0.01001541, "balance_loss_clip": 1.01396894, "balance_loss_mlp": 1.00067043, "epoch": 0.391270366139602, "flos": 60869054684160.0, "grad_norm": 0.9709122347991077, "language_loss": 0.72097194, "learning_rate": 2.7802749831540883e-06, "loss": 0.74289536, "num_input_tokens_seen": 70119465, "step": 3254, "time_per_iteration": 3.3678650856018066 }, { "auxiliary_loss_clip": 0.01300972, "auxiliary_loss_mlp": 0.01029409, "balance_loss_clip": 1.05169451, "balance_loss_mlp": 1.02283418, "epoch": 0.3913906090302411, "flos": 21543494025600.0, "grad_norm": 2.0420650401118325, "language_loss": 0.82135695, "learning_rate": 2.7795576806942268e-06, "loss": 0.84466076, "num_input_tokens_seen": 70138270, "step": 3255, "time_per_iteration": 2.7661383152008057 }, { "auxiliary_loss_clip": 0.01204503, "auxiliary_loss_mlp": 0.01008693, "balance_loss_clip": 1.0334785, "balance_loss_mlp": 1.00750065, "epoch": 0.3915108519208802, "flos": 49839953702400.0, "grad_norm": 0.764785976622441, "language_loss": 0.54875255, "learning_rate": 2.778840259976085e-06, "loss": 0.57088447, "num_input_tokens_seen": 70193500, "step": 3256, "time_per_iteration": 4.109271764755249 }, { "auxiliary_loss_clip": 0.01242206, "auxiliary_loss_mlp": 0.01035236, "balance_loss_clip": 1.054268, "balance_loss_mlp": 1.0272969, "epoch": 0.39163109481151925, "flos": 16506447960960.0, "grad_norm": 3.6955667299468873, "language_loss": 0.76653987, "learning_rate": 2.778122721108495e-06, "loss": 0.78931427, "num_input_tokens_seen": 70211730, "step": 3257, "time_per_iteration": 2.645414113998413 }, { "auxiliary_loss_clip": 0.0124141, "auxiliary_loss_mlp": 0.01026124, "balance_loss_clip": 1.05464017, "balance_loss_mlp": 1.01838756, "epoch": 0.39175133770215836, "flos": 26067484177920.0, "grad_norm": 2.155989091014263, "language_loss": 0.88188154, "learning_rate": 2.7774050642003076e-06, "loss": 0.90455687, "num_input_tokens_seen": 70232540, "step": 3258, "time_per_iteration": 2.7484490871429443 }, { "auxiliary_loss_clip": 0.01196007, "auxiliary_loss_mlp": 0.01034211, "balance_loss_clip": 1.05627954, "balance_loss_mlp": 1.02617073, "epoch": 0.3918715805927975, "flos": 21872076664320.0, "grad_norm": 1.9706893219130057, "language_loss": 0.93434799, "learning_rate": 2.7766872893603896e-06, "loss": 0.95665014, "num_input_tokens_seen": 70252515, "step": 3259, "time_per_iteration": 2.6565232276916504 }, { "auxiliary_loss_clip": 0.01243895, "auxiliary_loss_mlp": 0.01030399, "balance_loss_clip": 1.05423534, "balance_loss_mlp": 1.02330053, "epoch": 0.39199182348343653, "flos": 20376181837440.0, "grad_norm": 1.7464051114237589, "language_loss": 0.72926539, "learning_rate": 2.7759693966976275e-06, "loss": 0.75200832, "num_input_tokens_seen": 70271020, "step": 3260, "time_per_iteration": 2.6310641765594482 }, { "auxiliary_loss_clip": 0.01337985, "auxiliary_loss_mlp": 0.01036766, "balance_loss_clip": 1.04877615, "balance_loss_mlp": 1.02863026, "epoch": 0.39211206637407564, "flos": 21683545153920.0, "grad_norm": 2.047792780952871, "language_loss": 0.85259795, "learning_rate": 2.7752513863209242e-06, "loss": 0.8763454, "num_input_tokens_seen": 70289600, "step": 3261, "time_per_iteration": 2.7698426246643066 }, { "auxiliary_loss_clip": 0.01285602, "auxiliary_loss_mlp": 0.02561809, "balance_loss_clip": 1.05588913, "balance_loss_mlp": 1.00002038, "epoch": 0.39223230926471475, "flos": 21066276908160.0, "grad_norm": 1.7862208627292369, "language_loss": 0.84573597, "learning_rate": 2.774533258339203e-06, "loss": 0.88421011, "num_input_tokens_seen": 70307060, "step": 3262, "time_per_iteration": 2.7084884643554688 }, { "auxiliary_loss_clip": 0.01303517, "auxiliary_loss_mlp": 0.01028677, "balance_loss_clip": 1.04819632, "balance_loss_mlp": 1.02036202, "epoch": 0.3923525521553538, "flos": 17603016312960.0, "grad_norm": 3.112495679070274, "language_loss": 0.80334008, "learning_rate": 2.7738150128614014e-06, "loss": 0.82666206, "num_input_tokens_seen": 70324465, "step": 3263, "time_per_iteration": 3.616485357284546 }, { "auxiliary_loss_clip": 0.01335217, "auxiliary_loss_mlp": 0.01033808, "balance_loss_clip": 1.05119491, "balance_loss_mlp": 1.02638757, "epoch": 0.3924727950459929, "flos": 20558284813440.0, "grad_norm": 2.0206633196874244, "language_loss": 0.89773262, "learning_rate": 2.7730966499964777e-06, "loss": 0.92142284, "num_input_tokens_seen": 70341415, "step": 3264, "time_per_iteration": 2.678500175476074 }, { "auxiliary_loss_clip": 0.01193061, "auxiliary_loss_mlp": 0.01028885, "balance_loss_clip": 1.05463481, "balance_loss_mlp": 1.02047443, "epoch": 0.39259303793663197, "flos": 16216110328320.0, "grad_norm": 3.109206614820065, "language_loss": 0.80980968, "learning_rate": 2.772378169853408e-06, "loss": 0.83202916, "num_input_tokens_seen": 70358985, "step": 3265, "time_per_iteration": 2.664966344833374 }, { "auxiliary_loss_clip": 0.01338433, "auxiliary_loss_mlp": 0.01033042, "balance_loss_clip": 1.05239952, "balance_loss_mlp": 1.02553523, "epoch": 0.3927132808272711, "flos": 16797001075200.0, "grad_norm": 2.1571245791492584, "language_loss": 0.73979998, "learning_rate": 2.771659572541183e-06, "loss": 0.76351476, "num_input_tokens_seen": 70376915, "step": 3266, "time_per_iteration": 2.6380715370178223 }, { "auxiliary_loss_clip": 0.01247782, "auxiliary_loss_mlp": 0.01026689, "balance_loss_clip": 1.05721474, "balance_loss_mlp": 1.01943457, "epoch": 0.3928335237179102, "flos": 20267228908800.0, "grad_norm": 2.1545356422842903, "language_loss": 0.86889136, "learning_rate": 2.7709408581688143e-06, "loss": 0.89163613, "num_input_tokens_seen": 70396900, "step": 3267, "time_per_iteration": 4.510973691940308 }, { "auxiliary_loss_clip": 0.0125283, "auxiliary_loss_mlp": 0.01030914, "balance_loss_clip": 1.05335593, "balance_loss_mlp": 1.02360082, "epoch": 0.39295376660854925, "flos": 24973250209920.0, "grad_norm": 1.639623453251414, "language_loss": 0.88102746, "learning_rate": 2.7702220268453307e-06, "loss": 0.90386498, "num_input_tokens_seen": 70417260, "step": 3268, "time_per_iteration": 2.806018829345703 }, { "auxiliary_loss_clip": 0.01294633, "auxiliary_loss_mlp": 0.01030995, "balance_loss_clip": 1.0546509, "balance_loss_mlp": 1.02283525, "epoch": 0.39307400949918836, "flos": 18697788984960.0, "grad_norm": 1.9915248051040308, "language_loss": 0.84905672, "learning_rate": 2.7695030786797785e-06, "loss": 0.87231302, "num_input_tokens_seen": 70433155, "step": 3269, "time_per_iteration": 2.81109881401062 }, { "auxiliary_loss_clip": 0.01389542, "auxiliary_loss_mlp": 0.01029531, "balance_loss_clip": 1.04889536, "balance_loss_mlp": 1.02126384, "epoch": 0.39319425238982747, "flos": 22415476590720.0, "grad_norm": 3.384549475696944, "language_loss": 0.7450344, "learning_rate": 2.7687840137812206e-06, "loss": 0.76922512, "num_input_tokens_seen": 70451240, "step": 3270, "time_per_iteration": 2.8113348484039307 }, { "auxiliary_loss_clip": 0.01127355, "auxiliary_loss_mlp": 0.01012359, "balance_loss_clip": 1.01374555, "balance_loss_mlp": 1.01150632, "epoch": 0.3933144952804665, "flos": 66192954762240.0, "grad_norm": 0.797242445133077, "language_loss": 0.62038964, "learning_rate": 2.7680648322587395e-06, "loss": 0.64178681, "num_input_tokens_seen": 70516115, "step": 3271, "time_per_iteration": 3.3045899868011475 }, { "auxiliary_loss_clip": 0.01190673, "auxiliary_loss_mlp": 0.01026108, "balance_loss_clip": 1.05452871, "balance_loss_mlp": 1.01812065, "epoch": 0.39343473817110564, "flos": 15487159720320.0, "grad_norm": 4.900614238782518, "language_loss": 0.80788267, "learning_rate": 2.7673455342214334e-06, "loss": 0.83005047, "num_input_tokens_seen": 70533105, "step": 3272, "time_per_iteration": 2.7263402938842773 }, { "auxiliary_loss_clip": 0.01241852, "auxiliary_loss_mlp": 0.01027843, "balance_loss_clip": 1.0547415, "balance_loss_mlp": 1.01980233, "epoch": 0.39355498106174475, "flos": 21324905809920.0, "grad_norm": 6.923640798923827, "language_loss": 0.7631588, "learning_rate": 2.7666261197784198e-06, "loss": 0.78585577, "num_input_tokens_seen": 70551920, "step": 3273, "time_per_iteration": 2.6707022190093994 }, { "auxiliary_loss_clip": 0.01289705, "auxiliary_loss_mlp": 0.0102917, "balance_loss_clip": 1.05685616, "balance_loss_mlp": 1.02167201, "epoch": 0.3936752239523838, "flos": 13296357400320.0, "grad_norm": 2.0474826876181136, "language_loss": 0.76558286, "learning_rate": 2.7659065890388336e-06, "loss": 0.78877163, "num_input_tokens_seen": 70567920, "step": 3274, "time_per_iteration": 2.6100752353668213 }, { "auxiliary_loss_clip": 0.01291491, "auxiliary_loss_mlp": 0.01027672, "balance_loss_clip": 1.05062079, "balance_loss_mlp": 1.01993513, "epoch": 0.3937954668430229, "flos": 16800161472000.0, "grad_norm": 1.9178533308482817, "language_loss": 0.85251069, "learning_rate": 2.7651869421118266e-06, "loss": 0.87570238, "num_input_tokens_seen": 70584530, "step": 3275, "time_per_iteration": 2.700011730194092 }, { "auxiliary_loss_clip": 0.01153894, "auxiliary_loss_mlp": 0.01026228, "balance_loss_clip": 1.05792212, "balance_loss_mlp": 1.01857531, "epoch": 0.393915709733662, "flos": 21064229832960.0, "grad_norm": 2.121178767510199, "language_loss": 0.83139509, "learning_rate": 2.76446717910657e-06, "loss": 0.85319626, "num_input_tokens_seen": 70605235, "step": 3276, "time_per_iteration": 2.685286521911621 }, { "auxiliary_loss_clip": 0.01237856, "auxiliary_loss_mlp": 0.01031313, "balance_loss_clip": 1.05418444, "balance_loss_mlp": 1.02391064, "epoch": 0.3940359526243011, "flos": 17165265264000.0, "grad_norm": 2.659093262561644, "language_loss": 0.77011096, "learning_rate": 2.763747300132249e-06, "loss": 0.79280269, "num_input_tokens_seen": 70622675, "step": 3277, "time_per_iteration": 2.660062551498413 }, { "auxiliary_loss_clip": 0.01194832, "auxiliary_loss_mlp": 0.01027421, "balance_loss_clip": 1.05718136, "balance_loss_mlp": 1.01950002, "epoch": 0.3941561955149402, "flos": 20995856294400.0, "grad_norm": 1.7477201903478525, "language_loss": 0.86770666, "learning_rate": 2.7630273052980704e-06, "loss": 0.88992918, "num_input_tokens_seen": 70643265, "step": 3278, "time_per_iteration": 2.6150243282318115 }, { "auxiliary_loss_clip": 0.01286425, "auxiliary_loss_mlp": 0.01030844, "balance_loss_clip": 1.05197024, "balance_loss_mlp": 1.02359009, "epoch": 0.39427643840557924, "flos": 18843406721280.0, "grad_norm": 2.9884349369459478, "language_loss": 0.66784751, "learning_rate": 2.7623071947132554e-06, "loss": 0.69102025, "num_input_tokens_seen": 70660295, "step": 3279, "time_per_iteration": 2.702446699142456 }, { "auxiliary_loss_clip": 0.0129933, "auxiliary_loss_mlp": 0.01027558, "balance_loss_clip": 1.05270195, "balance_loss_mlp": 1.02017283, "epoch": 0.39439668129621835, "flos": 23258659426560.0, "grad_norm": 2.812149234159523, "language_loss": 0.78947353, "learning_rate": 2.7615869684870458e-06, "loss": 0.81274247, "num_input_tokens_seen": 70679605, "step": 3280, "time_per_iteration": 2.7098050117492676 }, { "auxiliary_loss_clip": 0.0124255, "auxiliary_loss_mlp": 0.01028007, "balance_loss_clip": 1.05634022, "balance_loss_mlp": 1.02063966, "epoch": 0.39451692418685746, "flos": 26652289507200.0, "grad_norm": 1.7765846246657737, "language_loss": 0.8424561, "learning_rate": 2.7608666267286986e-06, "loss": 0.86516166, "num_input_tokens_seen": 70699835, "step": 3281, "time_per_iteration": 2.7536628246307373 }, { "auxiliary_loss_clip": 0.01431081, "auxiliary_loss_mlp": 0.01025459, "balance_loss_clip": 1.04351711, "balance_loss_mlp": 1.01702189, "epoch": 0.3946371670774965, "flos": 18258709132800.0, "grad_norm": 2.0709055788254673, "language_loss": 0.86614782, "learning_rate": 2.760146169547489e-06, "loss": 0.89071321, "num_input_tokens_seen": 70716600, "step": 3282, "time_per_iteration": 2.823744773864746 }, { "auxiliary_loss_clip": 0.01294725, "auxiliary_loss_mlp": 0.01031983, "balance_loss_clip": 1.05772877, "balance_loss_mlp": 1.0245595, "epoch": 0.39475740996813563, "flos": 24206126423040.0, "grad_norm": 1.4928448021832808, "language_loss": 0.76665246, "learning_rate": 2.75942559705271e-06, "loss": 0.78991961, "num_input_tokens_seen": 70736335, "step": 3283, "time_per_iteration": 3.6473917961120605 }, { "auxiliary_loss_clip": 0.01240648, "auxiliary_loss_mlp": 0.01028734, "balance_loss_clip": 1.05413222, "balance_loss_mlp": 1.02124727, "epoch": 0.39487765285877474, "flos": 19317858491520.0, "grad_norm": 1.8505414587863103, "language_loss": 0.89268589, "learning_rate": 2.7587049093536713e-06, "loss": 0.91537964, "num_input_tokens_seen": 70752665, "step": 3284, "time_per_iteration": 2.6279995441436768 }, { "auxiliary_loss_clip": 0.01152365, "auxiliary_loss_mlp": 0.01026878, "balance_loss_clip": 1.0547688, "balance_loss_mlp": 1.0194397, "epoch": 0.3949978957494138, "flos": 17311744926720.0, "grad_norm": 3.038054704846765, "language_loss": 0.80735308, "learning_rate": 2.757984106559701e-06, "loss": 0.82914555, "num_input_tokens_seen": 70771650, "step": 3285, "time_per_iteration": 2.5860536098480225 }, { "auxiliary_loss_clip": 0.01286712, "auxiliary_loss_mlp": 0.01029478, "balance_loss_clip": 1.0531292, "balance_loss_mlp": 1.02177453, "epoch": 0.3951181386400529, "flos": 36317861280000.0, "grad_norm": 5.200109994164949, "language_loss": 0.7171396, "learning_rate": 2.7572631887801446e-06, "loss": 0.74030149, "num_input_tokens_seen": 70793275, "step": 3286, "time_per_iteration": 2.8399264812469482 }, { "auxiliary_loss_clip": 0.0124379, "auxiliary_loss_mlp": 0.01035301, "balance_loss_clip": 1.05547214, "balance_loss_mlp": 1.02749324, "epoch": 0.395238381530692, "flos": 23110348170240.0, "grad_norm": 1.8940907944578396, "language_loss": 0.77038461, "learning_rate": 2.7565421561243654e-06, "loss": 0.79317552, "num_input_tokens_seen": 70811440, "step": 3287, "time_per_iteration": 2.6005754470825195 }, { "auxiliary_loss_clip": 0.01337567, "auxiliary_loss_mlp": 0.01023191, "balance_loss_clip": 1.0489862, "balance_loss_mlp": 1.01615167, "epoch": 0.3953586244213311, "flos": 24347614095360.0, "grad_norm": 2.14467114322596, "language_loss": 0.82117373, "learning_rate": 2.7558210087017413e-06, "loss": 0.8447814, "num_input_tokens_seen": 70831375, "step": 3288, "time_per_iteration": 3.5893282890319824 }, { "auxiliary_loss_clip": 0.01343547, "auxiliary_loss_mlp": 0.01036953, "balance_loss_clip": 1.05726743, "balance_loss_mlp": 1.02910352, "epoch": 0.3954788673119702, "flos": 23440080044160.0, "grad_norm": 3.700661605450855, "language_loss": 0.73529458, "learning_rate": 2.7550997466216724e-06, "loss": 0.75909948, "num_input_tokens_seen": 70849170, "step": 3289, "time_per_iteration": 2.620497703552246 }, { "auxiliary_loss_clip": 0.01296668, "auxiliary_loss_mlp": 0.01031391, "balance_loss_clip": 1.06039512, "balance_loss_mlp": 1.02348113, "epoch": 0.3955991102026093, "flos": 17494063384320.0, "grad_norm": 2.5382870670290485, "language_loss": 0.81261182, "learning_rate": 2.7543783699935714e-06, "loss": 0.83589244, "num_input_tokens_seen": 70867200, "step": 3290, "time_per_iteration": 2.559481620788574 }, { "auxiliary_loss_clip": 0.0124439, "auxiliary_loss_mlp": 0.01027771, "balance_loss_clip": 1.05920482, "balance_loss_mlp": 1.01968241, "epoch": 0.39571935309324835, "flos": 18221326053120.0, "grad_norm": 2.41417311045862, "language_loss": 0.86179733, "learning_rate": 2.753656878926872e-06, "loss": 0.88451892, "num_input_tokens_seen": 70883080, "step": 3291, "time_per_iteration": 2.5224215984344482 }, { "auxiliary_loss_clip": 0.01285027, "auxiliary_loss_mlp": 0.01024072, "balance_loss_clip": 1.05221963, "balance_loss_mlp": 1.01704454, "epoch": 0.39583959598388746, "flos": 17748813617280.0, "grad_norm": 1.8663559904964206, "language_loss": 0.74312085, "learning_rate": 2.752935273531023e-06, "loss": 0.76621187, "num_input_tokens_seen": 70901230, "step": 3292, "time_per_iteration": 2.571831464767456 }, { "auxiliary_loss_clip": 0.0124853, "auxiliary_loss_mlp": 0.01023351, "balance_loss_clip": 1.05939722, "balance_loss_mlp": 1.01537287, "epoch": 0.39595983887452657, "flos": 19352368483200.0, "grad_norm": 2.7353929437720215, "language_loss": 0.78323334, "learning_rate": 2.752213553915492e-06, "loss": 0.80595219, "num_input_tokens_seen": 70919585, "step": 3293, "time_per_iteration": 3.50278902053833 }, { "auxiliary_loss_clip": 0.01182349, "auxiliary_loss_mlp": 0.01009323, "balance_loss_clip": 1.01585054, "balance_loss_mlp": 1.00832164, "epoch": 0.3960800817651656, "flos": 60682282940160.0, "grad_norm": 0.8169458486840196, "language_loss": 0.66059256, "learning_rate": 2.751491720189762e-06, "loss": 0.6825093, "num_input_tokens_seen": 70977695, "step": 3294, "time_per_iteration": 4.016615390777588 }, { "auxiliary_loss_clip": 0.01292256, "auxiliary_loss_mlp": 0.02566887, "balance_loss_clip": 1.05558741, "balance_loss_mlp": 1.00011516, "epoch": 0.39620032465580474, "flos": 16836718538880.0, "grad_norm": 2.980766852725907, "language_loss": 0.9179132, "learning_rate": 2.7507697724633364e-06, "loss": 0.95650464, "num_input_tokens_seen": 70994455, "step": 3295, "time_per_iteration": 2.7509605884552 }, { "auxiliary_loss_clip": 0.01250553, "auxiliary_loss_mlp": 0.01004187, "balance_loss_clip": 1.0248785, "balance_loss_mlp": 1.00304878, "epoch": 0.3963205675464438, "flos": 69071445941760.0, "grad_norm": 0.7751601435089982, "language_loss": 0.54652143, "learning_rate": 2.7500477108457327e-06, "loss": 0.56906891, "num_input_tokens_seen": 71046465, "step": 3296, "time_per_iteration": 3.0880935192108154 }, { "auxiliary_loss_clip": 0.01242113, "auxiliary_loss_mlp": 0.01027426, "balance_loss_clip": 1.05393052, "balance_loss_mlp": 1.01984715, "epoch": 0.3964408104370829, "flos": 25667439431040.0, "grad_norm": 2.5022442228868713, "language_loss": 0.81166101, "learning_rate": 2.7493255354464877e-06, "loss": 0.83435631, "num_input_tokens_seen": 71064275, "step": 3297, "time_per_iteration": 2.6488380432128906 }, { "auxiliary_loss_clip": 0.01578364, "auxiliary_loss_mlp": 0.01029128, "balance_loss_clip": 1.03987646, "balance_loss_mlp": 1.0220176, "epoch": 0.396561053327722, "flos": 24277480790400.0, "grad_norm": 1.9623778353569905, "language_loss": 0.76508796, "learning_rate": 2.748603246375156e-06, "loss": 0.79116285, "num_input_tokens_seen": 71082290, "step": 3298, "time_per_iteration": 3.0068135261535645 }, { "auxiliary_loss_clip": 0.01193887, "auxiliary_loss_mlp": 0.01026152, "balance_loss_clip": 1.05782127, "balance_loss_mlp": 1.01807547, "epoch": 0.39668129621836107, "flos": 20522302364160.0, "grad_norm": 2.33582031516074, "language_loss": 0.69684386, "learning_rate": 2.7478808437413055e-06, "loss": 0.71904421, "num_input_tokens_seen": 71101700, "step": 3299, "time_per_iteration": 3.4688849449157715 }, { "auxiliary_loss_clip": 0.01397135, "auxiliary_loss_mlp": 0.01031163, "balance_loss_clip": 1.0578351, "balance_loss_mlp": 1.02346218, "epoch": 0.3968015391090002, "flos": 27052585649280.0, "grad_norm": 1.6802374410180145, "language_loss": 0.66052914, "learning_rate": 2.7471583276545263e-06, "loss": 0.68481207, "num_input_tokens_seen": 71122360, "step": 3300, "time_per_iteration": 2.8056371212005615 }, { "auxiliary_loss_clip": 0.01294092, "auxiliary_loss_mlp": 0.01027548, "balance_loss_clip": 1.05432653, "balance_loss_mlp": 1.01979351, "epoch": 0.3969217819996393, "flos": 12531819392640.0, "grad_norm": 1.866462683453211, "language_loss": 0.7108041, "learning_rate": 2.7464356982244224e-06, "loss": 0.73402047, "num_input_tokens_seen": 71140360, "step": 3301, "time_per_iteration": 2.670027017593384 }, { "auxiliary_loss_clip": 0.01152369, "auxiliary_loss_mlp": 0.01001869, "balance_loss_clip": 1.03078127, "balance_loss_mlp": 1.00060582, "epoch": 0.39704202489027834, "flos": 66241399230720.0, "grad_norm": 0.7774977409858631, "language_loss": 0.61660099, "learning_rate": 2.745712955560617e-06, "loss": 0.63814342, "num_input_tokens_seen": 71196565, "step": 3302, "time_per_iteration": 3.158172607421875 }, { "auxiliary_loss_clip": 0.01440585, "auxiliary_loss_mlp": 0.01032553, "balance_loss_clip": 1.04796684, "balance_loss_mlp": 1.02445245, "epoch": 0.39716226778091746, "flos": 16982982720000.0, "grad_norm": 2.381136700060021, "language_loss": 0.76963788, "learning_rate": 2.7449900997727496e-06, "loss": 0.79436922, "num_input_tokens_seen": 71214675, "step": 3303, "time_per_iteration": 2.759643316268921 }, { "auxiliary_loss_clip": 0.01297463, "auxiliary_loss_mlp": 0.01028572, "balance_loss_clip": 1.05946565, "balance_loss_mlp": 1.0217886, "epoch": 0.39728251067155657, "flos": 23477139901440.0, "grad_norm": 1.7039873590567383, "language_loss": 0.84027076, "learning_rate": 2.744267130970476e-06, "loss": 0.86353111, "num_input_tokens_seen": 71234400, "step": 3304, "time_per_iteration": 2.6617252826690674 }, { "auxiliary_loss_clip": 0.01287243, "auxiliary_loss_mlp": 0.01027761, "balance_loss_clip": 1.05351675, "balance_loss_mlp": 1.01995289, "epoch": 0.3974027535621956, "flos": 20704441253760.0, "grad_norm": 1.8572114685057362, "language_loss": 0.77188563, "learning_rate": 2.7435440492634697e-06, "loss": 0.79503572, "num_input_tokens_seen": 71253725, "step": 3305, "time_per_iteration": 2.7156810760498047 }, { "auxiliary_loss_clip": 0.01293839, "auxiliary_loss_mlp": 0.01028962, "balance_loss_clip": 1.05407023, "balance_loss_mlp": 1.0200628, "epoch": 0.39752299645283473, "flos": 21543278544000.0, "grad_norm": 2.1471967114302473, "language_loss": 0.67093199, "learning_rate": 2.7428208547614228e-06, "loss": 0.69415998, "num_input_tokens_seen": 71273220, "step": 3306, "time_per_iteration": 2.720916986465454 }, { "auxiliary_loss_clip": 0.01241571, "auxiliary_loss_mlp": 0.01029682, "balance_loss_clip": 1.05591679, "balance_loss_mlp": 1.02085161, "epoch": 0.39764323934347384, "flos": 19208295031680.0, "grad_norm": 2.762325696126657, "language_loss": 0.77447516, "learning_rate": 2.742097547574043e-06, "loss": 0.79718769, "num_input_tokens_seen": 71291445, "step": 3307, "time_per_iteration": 2.609881639480591 }, { "auxiliary_loss_clip": 0.01300534, "auxiliary_loss_mlp": 0.02569987, "balance_loss_clip": 1.05481827, "balance_loss_mlp": 1.00009942, "epoch": 0.3977634822341129, "flos": 20850202644480.0, "grad_norm": 2.171103436077037, "language_loss": 0.78094876, "learning_rate": 2.7413741278110544e-06, "loss": 0.81965399, "num_input_tokens_seen": 71310135, "step": 3308, "time_per_iteration": 3.714444875717163 }, { "auxiliary_loss_clip": 0.01300129, "auxiliary_loss_mlp": 0.01027888, "balance_loss_clip": 1.05639458, "balance_loss_mlp": 1.0197227, "epoch": 0.397883725124752, "flos": 39786042038400.0, "grad_norm": 2.5496839536406064, "language_loss": 0.69372523, "learning_rate": 2.7406505955822016e-06, "loss": 0.71700537, "num_input_tokens_seen": 71331160, "step": 3309, "time_per_iteration": 2.843712329864502 }, { "auxiliary_loss_clip": 0.01295521, "auxiliary_loss_mlp": 0.01025275, "balance_loss_clip": 1.05176926, "balance_loss_mlp": 1.01779497, "epoch": 0.39800396801539106, "flos": 17379507934080.0, "grad_norm": 2.188972429276266, "language_loss": 0.66598129, "learning_rate": 2.7399269509972415e-06, "loss": 0.68918926, "num_input_tokens_seen": 71345315, "step": 3310, "time_per_iteration": 2.663363456726074 }, { "auxiliary_loss_clip": 0.01291598, "auxiliary_loss_mlp": 0.01034364, "balance_loss_clip": 1.05004406, "balance_loss_mlp": 1.02539313, "epoch": 0.3981242109060302, "flos": 19202764337280.0, "grad_norm": 2.6169275175173516, "language_loss": 0.84624302, "learning_rate": 2.7392031941659514e-06, "loss": 0.86950266, "num_input_tokens_seen": 71363160, "step": 3311, "time_per_iteration": 2.6275110244750977 }, { "auxiliary_loss_clip": 0.01294518, "auxiliary_loss_mlp": 0.01034957, "balance_loss_clip": 1.06009364, "balance_loss_mlp": 1.02717829, "epoch": 0.3982444537966693, "flos": 24565124903040.0, "grad_norm": 1.7576248625471849, "language_loss": 0.86000884, "learning_rate": 2.7384793251981244e-06, "loss": 0.88330352, "num_input_tokens_seen": 71382145, "step": 3312, "time_per_iteration": 2.7231054306030273 }, { "auxiliary_loss_clip": 0.01248262, "auxiliary_loss_mlp": 0.01028664, "balance_loss_clip": 1.0545156, "balance_loss_mlp": 1.02088571, "epoch": 0.39836469668730834, "flos": 26213856099840.0, "grad_norm": 3.455614947568465, "language_loss": 0.80935663, "learning_rate": 2.737755344203571e-06, "loss": 0.8321259, "num_input_tokens_seen": 71402095, "step": 3313, "time_per_iteration": 2.709806203842163 }, { "auxiliary_loss_clip": 0.01244761, "auxiliary_loss_mlp": 0.01030349, "balance_loss_clip": 1.05656004, "balance_loss_mlp": 1.02279401, "epoch": 0.39848493957794745, "flos": 27636134002560.0, "grad_norm": 1.888267103019629, "language_loss": 0.79903817, "learning_rate": 2.7370312512921186e-06, "loss": 0.82178926, "num_input_tokens_seen": 71423875, "step": 3314, "time_per_iteration": 3.5513343811035156 }, { "auxiliary_loss_clip": 0.01292919, "auxiliary_loss_mlp": 0.01029424, "balance_loss_clip": 1.05162716, "balance_loss_mlp": 1.021276, "epoch": 0.39860518246858656, "flos": 12239326944000.0, "grad_norm": 2.5364508638443377, "language_loss": 0.7703253, "learning_rate": 2.736307046573611e-06, "loss": 0.7935487, "num_input_tokens_seen": 71439745, "step": 3315, "time_per_iteration": 2.6591134071350098 }, { "auxiliary_loss_clip": 0.01191677, "auxiliary_loss_mlp": 0.01028263, "balance_loss_clip": 1.05461586, "balance_loss_mlp": 1.0212028, "epoch": 0.3987254253592256, "flos": 22379135005440.0, "grad_norm": 1.6131799018559925, "language_loss": 0.82000637, "learning_rate": 2.73558273015791e-06, "loss": 0.84220576, "num_input_tokens_seen": 71459575, "step": 3316, "time_per_iteration": 2.6924021244049072 }, { "auxiliary_loss_clip": 0.01196008, "auxiliary_loss_mlp": 0.01034136, "balance_loss_clip": 1.05547428, "balance_loss_mlp": 1.0254513, "epoch": 0.3988456682498647, "flos": 23514020190720.0, "grad_norm": 2.1492199888616015, "language_loss": 0.70531988, "learning_rate": 2.734858302154894e-06, "loss": 0.72762132, "num_input_tokens_seen": 71481075, "step": 3317, "time_per_iteration": 2.6544349193573 }, { "auxiliary_loss_clip": 0.01287206, "auxiliary_loss_mlp": 0.01028715, "balance_loss_clip": 1.05366027, "balance_loss_mlp": 1.02037609, "epoch": 0.39896591114050384, "flos": 19208761908480.0, "grad_norm": 2.0817342952221716, "language_loss": 0.7675792, "learning_rate": 2.734133762674457e-06, "loss": 0.79073834, "num_input_tokens_seen": 71500665, "step": 3318, "time_per_iteration": 3.6260833740234375 }, { "auxiliary_loss_clip": 0.01291196, "auxiliary_loss_mlp": 0.01026032, "balance_loss_clip": 1.0535295, "balance_loss_mlp": 1.01763928, "epoch": 0.3990861540311429, "flos": 28401031146240.0, "grad_norm": 1.9149781791039362, "language_loss": 0.70507121, "learning_rate": 2.7334091118265124e-06, "loss": 0.72824347, "num_input_tokens_seen": 71522560, "step": 3319, "time_per_iteration": 2.704987049102783 }, { "auxiliary_loss_clip": 0.01139555, "auxiliary_loss_mlp": 0.01004624, "balance_loss_clip": 1.01956773, "balance_loss_mlp": 1.00352716, "epoch": 0.399206396921782, "flos": 61758563086080.0, "grad_norm": 0.6701135211042666, "language_loss": 0.57765323, "learning_rate": 2.732684349720989e-06, "loss": 0.59909505, "num_input_tokens_seen": 71590520, "step": 3320, "time_per_iteration": 4.116127252578735 }, { "auxiliary_loss_clip": 0.01257973, "auxiliary_loss_mlp": 0.01031908, "balance_loss_clip": 1.05572653, "balance_loss_mlp": 1.02263927, "epoch": 0.3993266398124211, "flos": 28074567409920.0, "grad_norm": 1.560645316632205, "language_loss": 0.75292623, "learning_rate": 2.7319594764678318e-06, "loss": 0.77582508, "num_input_tokens_seen": 71612620, "step": 3321, "time_per_iteration": 2.779829502105713 }, { "auxiliary_loss_clip": 0.0129509, "auxiliary_loss_mlp": 0.01034601, "balance_loss_clip": 1.05303991, "balance_loss_mlp": 1.02625048, "epoch": 0.39944688270306017, "flos": 23225083188480.0, "grad_norm": 1.7755111190030972, "language_loss": 0.83402681, "learning_rate": 2.7312344921770044e-06, "loss": 0.85732365, "num_input_tokens_seen": 71634320, "step": 3322, "time_per_iteration": 2.7767059803009033 }, { "auxiliary_loss_clip": 0.01293893, "auxiliary_loss_mlp": 0.01033883, "balance_loss_clip": 1.05098557, "balance_loss_mlp": 1.02590823, "epoch": 0.3995671255936993, "flos": 19390433921280.0, "grad_norm": 1.930147914256232, "language_loss": 0.7852931, "learning_rate": 2.7305093969584857e-06, "loss": 0.80857086, "num_input_tokens_seen": 71653145, "step": 3323, "time_per_iteration": 2.69221830368042 }, { "auxiliary_loss_clip": 0.01239571, "auxiliary_loss_mlp": 0.01029662, "balance_loss_clip": 1.0536797, "balance_loss_mlp": 1.02123976, "epoch": 0.3996873684843384, "flos": 23842638743040.0, "grad_norm": 2.2185610159361615, "language_loss": 0.79949725, "learning_rate": 2.729784190922272e-06, "loss": 0.82218957, "num_input_tokens_seen": 71674580, "step": 3324, "time_per_iteration": 2.6421306133270264 }, { "auxiliary_loss_clip": 0.01089623, "auxiliary_loss_mlp": 0.01005672, "balance_loss_clip": 1.0167737, "balance_loss_mlp": 1.00468874, "epoch": 0.39980761137497745, "flos": 66576877280640.0, "grad_norm": 0.9554523356177217, "language_loss": 0.57175618, "learning_rate": 2.729058874178378e-06, "loss": 0.59270912, "num_input_tokens_seen": 71745260, "step": 3325, "time_per_iteration": 3.3300294876098633 }, { "auxiliary_loss_clip": 0.01297965, "auxiliary_loss_mlp": 0.01030029, "balance_loss_clip": 1.05582643, "balance_loss_mlp": 1.0215354, "epoch": 0.39992785426561656, "flos": 28549162834560.0, "grad_norm": 2.236547358510773, "language_loss": 0.69094986, "learning_rate": 2.7283334468368315e-06, "loss": 0.71422982, "num_input_tokens_seen": 71766540, "step": 3326, "time_per_iteration": 2.8591043949127197 }, { "auxiliary_loss_clip": 0.01445364, "auxiliary_loss_mlp": 0.01034996, "balance_loss_clip": 1.03933978, "balance_loss_mlp": 1.02632332, "epoch": 0.4000480971562556, "flos": 15049408671360.0, "grad_norm": 10.342615399466746, "language_loss": 0.72934306, "learning_rate": 2.72760790900768e-06, "loss": 0.7541467, "num_input_tokens_seen": 71783125, "step": 3327, "time_per_iteration": 3.047367572784424 }, { "auxiliary_loss_clip": 0.01199276, "auxiliary_loss_mlp": 0.01035924, "balance_loss_clip": 1.05997562, "balance_loss_mlp": 1.02759671, "epoch": 0.4001683400468947, "flos": 23915609222400.0, "grad_norm": 1.7776318647815952, "language_loss": 0.78804386, "learning_rate": 2.7268822608009875e-06, "loss": 0.81039584, "num_input_tokens_seen": 71802500, "step": 3328, "time_per_iteration": 2.8163461685180664 }, { "auxiliary_loss_clip": 0.0134888, "auxiliary_loss_mlp": 0.010338, "balance_loss_clip": 1.05400598, "balance_loss_mlp": 1.02510941, "epoch": 0.40028858293753383, "flos": 24352677912960.0, "grad_norm": 1.8246560634670606, "language_loss": 0.78543776, "learning_rate": 2.726156502326834e-06, "loss": 0.80926454, "num_input_tokens_seen": 71823800, "step": 3329, "time_per_iteration": 2.7694475650787354 }, { "auxiliary_loss_clip": 0.01271435, "auxiliary_loss_mlp": 0.01003421, "balance_loss_clip": 1.02671182, "balance_loss_mlp": 1.00221086, "epoch": 0.4004088258281729, "flos": 66787025800320.0, "grad_norm": 0.7082745624370083, "language_loss": 0.60276765, "learning_rate": 2.725430633695316e-06, "loss": 0.62551618, "num_input_tokens_seen": 71886880, "step": 3330, "time_per_iteration": 3.4696404933929443 }, { "auxiliary_loss_clip": 0.01080256, "auxiliary_loss_mlp": 0.01002513, "balance_loss_clip": 1.01435566, "balance_loss_mlp": 1.00150001, "epoch": 0.400529068718812, "flos": 58598386473600.0, "grad_norm": 0.8779240877470704, "language_loss": 0.57930148, "learning_rate": 2.7247046550165485e-06, "loss": 0.60012925, "num_input_tokens_seen": 71939005, "step": 3331, "time_per_iteration": 3.7733991146087646 }, { "auxiliary_loss_clip": 0.01195906, "auxiliary_loss_mlp": 0.01031367, "balance_loss_clip": 1.05769598, "balance_loss_mlp": 1.02321851, "epoch": 0.4006493116094511, "flos": 25377460934400.0, "grad_norm": 1.7075775088813077, "language_loss": 0.75934601, "learning_rate": 2.7239785664006606e-06, "loss": 0.78161871, "num_input_tokens_seen": 71962545, "step": 3332, "time_per_iteration": 2.7241997718811035 }, { "auxiliary_loss_clip": 0.01134083, "auxiliary_loss_mlp": 0.01003338, "balance_loss_clip": 1.01340628, "balance_loss_mlp": 1.00232506, "epoch": 0.40076955450009016, "flos": 60280729822080.0, "grad_norm": 0.8807791959000575, "language_loss": 0.61749184, "learning_rate": 2.7232523679578002e-06, "loss": 0.63886607, "num_input_tokens_seen": 72025625, "step": 3333, "time_per_iteration": 3.3043599128723145 }, { "auxiliary_loss_clip": 0.01241597, "auxiliary_loss_mlp": 0.01033463, "balance_loss_clip": 1.0550611, "balance_loss_mlp": 1.02570248, "epoch": 0.4008897973907293, "flos": 16617268396800.0, "grad_norm": 2.2836800894736977, "language_loss": 0.79478645, "learning_rate": 2.7225260597981295e-06, "loss": 0.81753707, "num_input_tokens_seen": 72043330, "step": 3334, "time_per_iteration": 3.8916537761688232 }, { "auxiliary_loss_clip": 0.01348393, "auxiliary_loss_mlp": 0.02577439, "balance_loss_clip": 1.05716002, "balance_loss_mlp": 1.00016308, "epoch": 0.4010100402813684, "flos": 15377344865280.0, "grad_norm": 2.4312013847600005, "language_loss": 0.78860152, "learning_rate": 2.721799642031831e-06, "loss": 0.82785982, "num_input_tokens_seen": 72059500, "step": 3335, "time_per_iteration": 2.8428988456726074 }, { "auxiliary_loss_clip": 0.01298223, "auxiliary_loss_mlp": 0.01027148, "balance_loss_clip": 1.05105066, "balance_loss_mlp": 1.01895249, "epoch": 0.40113028317200744, "flos": 13298835438720.0, "grad_norm": 2.17940531073706, "language_loss": 0.77303654, "learning_rate": 2.721073114769101e-06, "loss": 0.79629028, "num_input_tokens_seen": 72077175, "step": 3336, "time_per_iteration": 2.647428035736084 }, { "auxiliary_loss_clip": 0.01334206, "auxiliary_loss_mlp": 0.01028069, "balance_loss_clip": 1.05082154, "balance_loss_mlp": 1.02039146, "epoch": 0.40125052606264655, "flos": 20668027841280.0, "grad_norm": 2.03578206710025, "language_loss": 0.75359064, "learning_rate": 2.7203464781201523e-06, "loss": 0.77721339, "num_input_tokens_seen": 72096490, "step": 3337, "time_per_iteration": 2.7662100791931152 }, { "auxiliary_loss_clip": 0.01195564, "auxiliary_loss_mlp": 0.01026148, "balance_loss_clip": 1.05802536, "balance_loss_mlp": 1.01788092, "epoch": 0.40137076895328566, "flos": 24607679541120.0, "grad_norm": 2.3598880145077437, "language_loss": 0.78219706, "learning_rate": 2.719619732195215e-06, "loss": 0.80441421, "num_input_tokens_seen": 72118130, "step": 3338, "time_per_iteration": 2.6748733520507812 }, { "auxiliary_loss_clip": 0.01340151, "auxiliary_loss_mlp": 0.01029111, "balance_loss_clip": 1.05125356, "balance_loss_mlp": 1.02091503, "epoch": 0.4014910118439247, "flos": 24206593299840.0, "grad_norm": 1.4607547529717289, "language_loss": 0.72577107, "learning_rate": 2.7188928771045377e-06, "loss": 0.74946374, "num_input_tokens_seen": 72139450, "step": 3339, "time_per_iteration": 3.6455564498901367 }, { "auxiliary_loss_clip": 0.01334814, "auxiliary_loss_mlp": 0.01028427, "balance_loss_clip": 1.04884338, "balance_loss_mlp": 1.02020121, "epoch": 0.4016112547345638, "flos": 26725080418560.0, "grad_norm": 1.9052680503651094, "language_loss": 0.80047989, "learning_rate": 2.7181659129583815e-06, "loss": 0.8241123, "num_input_tokens_seen": 72159040, "step": 3340, "time_per_iteration": 2.754040241241455 }, { "auxiliary_loss_clip": 0.01286295, "auxiliary_loss_mlp": 0.01030464, "balance_loss_clip": 1.04707241, "balance_loss_mlp": 1.02145195, "epoch": 0.4017314976252029, "flos": 21288025520640.0, "grad_norm": 1.7510604692317788, "language_loss": 0.7581104, "learning_rate": 2.7174388398670276e-06, "loss": 0.78127801, "num_input_tokens_seen": 72178220, "step": 3341, "time_per_iteration": 2.7242462635040283 }, { "auxiliary_loss_clip": 0.01196002, "auxiliary_loss_mlp": 0.01028087, "balance_loss_clip": 1.05425644, "balance_loss_mlp": 1.02008176, "epoch": 0.401851740515842, "flos": 25484690010240.0, "grad_norm": 2.1431139884821224, "language_loss": 0.92168176, "learning_rate": 2.716711657940773e-06, "loss": 0.94392264, "num_input_tokens_seen": 72199230, "step": 3342, "time_per_iteration": 2.6808359622955322 }, { "auxiliary_loss_clip": 0.01143051, "auxiliary_loss_mlp": 0.01002512, "balance_loss_clip": 1.01526237, "balance_loss_mlp": 1.00154614, "epoch": 0.4019719834064811, "flos": 55395334978560.0, "grad_norm": 0.8169332649592138, "language_loss": 0.564776, "learning_rate": 2.7159843672899284e-06, "loss": 0.58623165, "num_input_tokens_seen": 72263430, "step": 3343, "time_per_iteration": 4.272258520126343 }, { "auxiliary_loss_clip": 0.01243858, "auxiliary_loss_mlp": 0.01028179, "balance_loss_clip": 1.05728781, "balance_loss_mlp": 1.01932812, "epoch": 0.40209222629712016, "flos": 18180100218240.0, "grad_norm": 2.2458966713511175, "language_loss": 0.81015992, "learning_rate": 2.715256968024825e-06, "loss": 0.83288026, "num_input_tokens_seen": 72280505, "step": 3344, "time_per_iteration": 2.7222230434417725 }, { "auxiliary_loss_clip": 0.01204498, "auxiliary_loss_mlp": 0.01026283, "balance_loss_clip": 1.053087, "balance_loss_mlp": 1.01764596, "epoch": 0.40221246918775927, "flos": 25961009287680.0, "grad_norm": 1.7987043981480022, "language_loss": 0.81985545, "learning_rate": 2.7145294602558083e-06, "loss": 0.84216332, "num_input_tokens_seen": 72301215, "step": 3345, "time_per_iteration": 3.687080144882202 }, { "auxiliary_loss_clip": 0.01244522, "auxiliary_loss_mlp": 0.01032441, "balance_loss_clip": 1.05507612, "balance_loss_mlp": 1.02412581, "epoch": 0.4023327120783984, "flos": 33838912056960.0, "grad_norm": 1.7836952798666454, "language_loss": 0.7108376, "learning_rate": 2.713801844093241e-06, "loss": 0.73360723, "num_input_tokens_seen": 72322365, "step": 3346, "time_per_iteration": 2.801084518432617 }, { "auxiliary_loss_clip": 0.0124411, "auxiliary_loss_mlp": 0.01028516, "balance_loss_clip": 1.0543611, "balance_loss_mlp": 1.02104163, "epoch": 0.40245295496903744, "flos": 26900252069760.0, "grad_norm": 2.4872325920099847, "language_loss": 0.88656473, "learning_rate": 2.7130741196475014e-06, "loss": 0.90929091, "num_input_tokens_seen": 72340495, "step": 3347, "time_per_iteration": 2.803431510925293 }, { "auxiliary_loss_clip": 0.01300824, "auxiliary_loss_mlp": 0.01032158, "balance_loss_clip": 1.0576458, "balance_loss_mlp": 1.02310944, "epoch": 0.40257319785967655, "flos": 36902738436480.0, "grad_norm": 1.8593251194811748, "language_loss": 0.78827405, "learning_rate": 2.7123462870289848e-06, "loss": 0.81160378, "num_input_tokens_seen": 72360545, "step": 3348, "time_per_iteration": 2.900256395339966 }, { "auxiliary_loss_clip": 0.01292639, "auxiliary_loss_mlp": 0.01031286, "balance_loss_clip": 1.05042195, "balance_loss_mlp": 1.02320611, "epoch": 0.40269344075031566, "flos": 24353180703360.0, "grad_norm": 1.584575970016342, "language_loss": 0.81224638, "learning_rate": 2.711618346348102e-06, "loss": 0.83548564, "num_input_tokens_seen": 72381070, "step": 3349, "time_per_iteration": 2.821484327316284 }, { "auxiliary_loss_clip": 0.01289168, "auxiliary_loss_mlp": 0.01029131, "balance_loss_clip": 1.05208254, "balance_loss_mlp": 1.02139688, "epoch": 0.4028136836409547, "flos": 14389657614720.0, "grad_norm": 1.568844792833661, "language_loss": 0.63279533, "learning_rate": 2.7108902977152825e-06, "loss": 0.65597832, "num_input_tokens_seen": 72398970, "step": 3350, "time_per_iteration": 2.7866334915161133 }, { "auxiliary_loss_clip": 0.01240947, "auxiliary_loss_mlp": 0.01031113, "balance_loss_clip": 1.05274796, "balance_loss_mlp": 1.02250028, "epoch": 0.4029339265315938, "flos": 26136037284480.0, "grad_norm": 2.2900958353060044, "language_loss": 0.74994123, "learning_rate": 2.7101621412409704e-06, "loss": 0.77266181, "num_input_tokens_seen": 72418455, "step": 3351, "time_per_iteration": 2.7320616245269775 }, { "auxiliary_loss_clip": 0.01194902, "auxiliary_loss_mlp": 0.01026902, "balance_loss_clip": 1.0557189, "balance_loss_mlp": 1.01894426, "epoch": 0.40305416942223293, "flos": 23256325042560.0, "grad_norm": 2.864373787516934, "language_loss": 0.85661185, "learning_rate": 2.7094338770356256e-06, "loss": 0.87882984, "num_input_tokens_seen": 72437540, "step": 3352, "time_per_iteration": 2.7118382453918457 }, { "auxiliary_loss_clip": 0.01289137, "auxiliary_loss_mlp": 0.01029916, "balance_loss_clip": 1.05408549, "balance_loss_mlp": 1.02193749, "epoch": 0.403174412312872, "flos": 27089645506560.0, "grad_norm": 2.2344342030670004, "language_loss": 0.64371872, "learning_rate": 2.708705505209726e-06, "loss": 0.66690934, "num_input_tokens_seen": 72458315, "step": 3353, "time_per_iteration": 2.759544610977173 }, { "auxiliary_loss_clip": 0.01385509, "auxiliary_loss_mlp": 0.0103162, "balance_loss_clip": 1.04511189, "balance_loss_mlp": 1.02434278, "epoch": 0.4032946552035111, "flos": 21756336065280.0, "grad_norm": 2.1644730636035856, "language_loss": 0.91535425, "learning_rate": 2.7079770258737646e-06, "loss": 0.9395256, "num_input_tokens_seen": 72476225, "step": 3354, "time_per_iteration": 2.7793240547180176 }, { "auxiliary_loss_clip": 0.01337383, "auxiliary_loss_mlp": 0.01031125, "balance_loss_clip": 1.04708195, "balance_loss_mlp": 1.02180314, "epoch": 0.4034148980941502, "flos": 17343956448000.0, "grad_norm": 5.901012551227482, "language_loss": 0.74688482, "learning_rate": 2.707248439138251e-06, "loss": 0.77056992, "num_input_tokens_seen": 72492460, "step": 3355, "time_per_iteration": 2.684919595718384 }, { "auxiliary_loss_clip": 0.01286214, "auxiliary_loss_mlp": 0.01031774, "balance_loss_clip": 1.05443406, "balance_loss_mlp": 1.02365553, "epoch": 0.40353514098478926, "flos": 22017838055040.0, "grad_norm": 3.4025053253156963, "language_loss": 0.65404505, "learning_rate": 2.7065197451137114e-06, "loss": 0.67722493, "num_input_tokens_seen": 72513840, "step": 3356, "time_per_iteration": 2.720374345779419 }, { "auxiliary_loss_clip": 0.0128769, "auxiliary_loss_mlp": 0.01025919, "balance_loss_clip": 1.05275595, "balance_loss_mlp": 1.01802182, "epoch": 0.4036553838754284, "flos": 14246446089600.0, "grad_norm": 2.6490853710077613, "language_loss": 0.67650473, "learning_rate": 2.7057909439106894e-06, "loss": 0.69964087, "num_input_tokens_seen": 72531695, "step": 3357, "time_per_iteration": 2.656491994857788 }, { "auxiliary_loss_clip": 0.01237552, "auxiliary_loss_mlp": 0.02568078, "balance_loss_clip": 1.05116451, "balance_loss_mlp": 1.00006366, "epoch": 0.40377562676606743, "flos": 24790644443520.0, "grad_norm": 1.9637958988505428, "language_loss": 0.78431571, "learning_rate": 2.7050620356397417e-06, "loss": 0.82237196, "num_input_tokens_seen": 72550645, "step": 3358, "time_per_iteration": 2.728999137878418 }, { "auxiliary_loss_clip": 0.01189933, "auxiliary_loss_mlp": 0.01027071, "balance_loss_clip": 1.05630922, "balance_loss_mlp": 1.02004099, "epoch": 0.40389586965670654, "flos": 24061226958720.0, "grad_norm": 1.942299173066065, "language_loss": 0.72054285, "learning_rate": 2.7043330204114437e-06, "loss": 0.74271286, "num_input_tokens_seen": 72569355, "step": 3359, "time_per_iteration": 2.5937905311584473 }, { "auxiliary_loss_clip": 0.01187928, "auxiliary_loss_mlp": 0.01030927, "balance_loss_clip": 1.05330944, "balance_loss_mlp": 1.02386403, "epoch": 0.40401611254734565, "flos": 16399613934720.0, "grad_norm": 2.1834926539590582, "language_loss": 0.85486925, "learning_rate": 2.7036038983363862e-06, "loss": 0.87705779, "num_input_tokens_seen": 72585960, "step": 3360, "time_per_iteration": 3.4920690059661865 }, { "auxiliary_loss_clip": 0.01236791, "auxiliary_loss_mlp": 0.01025244, "balance_loss_clip": 1.05353761, "balance_loss_mlp": 1.01803756, "epoch": 0.4041363554379847, "flos": 23988220565760.0, "grad_norm": 1.7737106868961687, "language_loss": 0.84050667, "learning_rate": 2.702874669525177e-06, "loss": 0.86312699, "num_input_tokens_seen": 72604440, "step": 3361, "time_per_iteration": 2.7895946502685547 }, { "auxiliary_loss_clip": 0.01348294, "auxiliary_loss_mlp": 0.01034028, "balance_loss_clip": 1.05732775, "balance_loss_mlp": 1.02693534, "epoch": 0.4042565983286238, "flos": 28401964899840.0, "grad_norm": 2.087625241364062, "language_loss": 0.69874513, "learning_rate": 2.7021453340884394e-06, "loss": 0.72256839, "num_input_tokens_seen": 72622165, "step": 3362, "time_per_iteration": 2.74658465385437 }, { "auxiliary_loss_clip": 0.01280426, "auxiliary_loss_mlp": 0.02566409, "balance_loss_clip": 1.05014813, "balance_loss_mlp": 1.00007153, "epoch": 0.40437684121926293, "flos": 17710963660800.0, "grad_norm": 3.881042117670996, "language_loss": 0.72844255, "learning_rate": 2.7014158921368125e-06, "loss": 0.76691091, "num_input_tokens_seen": 72640490, "step": 3363, "time_per_iteration": 2.6948964595794678 }, { "auxiliary_loss_clip": 0.01193615, "auxiliary_loss_mlp": 0.01027109, "balance_loss_clip": 1.05677974, "balance_loss_mlp": 1.01954508, "epoch": 0.404497084109902, "flos": 24018959629440.0, "grad_norm": 2.4329109692580717, "language_loss": 0.85222018, "learning_rate": 2.700686343780953e-06, "loss": 0.87442738, "num_input_tokens_seen": 72660360, "step": 3364, "time_per_iteration": 2.6615543365478516 }, { "auxiliary_loss_clip": 0.01289258, "auxiliary_loss_mlp": 0.01028943, "balance_loss_clip": 1.05116153, "balance_loss_mlp": 1.02137649, "epoch": 0.4046173270005411, "flos": 22929861306240.0, "grad_norm": 1.8318350245442518, "language_loss": 0.88415122, "learning_rate": 2.699956689131532e-06, "loss": 0.9073332, "num_input_tokens_seen": 72680345, "step": 3365, "time_per_iteration": 3.7049500942230225 }, { "auxiliary_loss_clip": 0.01195282, "auxiliary_loss_mlp": 0.01031059, "balance_loss_clip": 1.05032146, "balance_loss_mlp": 1.02331662, "epoch": 0.4047375698911802, "flos": 20668135582080.0, "grad_norm": 2.43030883603922, "language_loss": 0.85196763, "learning_rate": 2.699226928299238e-06, "loss": 0.87423098, "num_input_tokens_seen": 72698365, "step": 3366, "time_per_iteration": 2.687328815460205 }, { "auxiliary_loss_clip": 0.01238754, "auxiliary_loss_mlp": 0.01027642, "balance_loss_clip": 1.05116892, "balance_loss_mlp": 1.01983345, "epoch": 0.40485781278181926, "flos": 28912865996160.0, "grad_norm": 2.9699882536677147, "language_loss": 0.7895304, "learning_rate": 2.698497061394774e-06, "loss": 0.81219435, "num_input_tokens_seen": 72716850, "step": 3367, "time_per_iteration": 2.6771328449249268 }, { "auxiliary_loss_clip": 0.01341761, "auxiliary_loss_mlp": 0.02565577, "balance_loss_clip": 1.05213165, "balance_loss_mlp": 1.00015962, "epoch": 0.40497805567245837, "flos": 23148377694720.0, "grad_norm": 1.774272911080005, "language_loss": 0.80549943, "learning_rate": 2.6977670885288627e-06, "loss": 0.84457278, "num_input_tokens_seen": 72738250, "step": 3368, "time_per_iteration": 2.8051929473876953 }, { "auxiliary_loss_clip": 0.01284338, "auxiliary_loss_mlp": 0.01028284, "balance_loss_clip": 1.05151141, "balance_loss_mlp": 1.02073526, "epoch": 0.4050982985630975, "flos": 16289404030080.0, "grad_norm": 2.017252792274504, "language_loss": 0.7510587, "learning_rate": 2.6970370098122378e-06, "loss": 0.77418488, "num_input_tokens_seen": 72755235, "step": 3369, "time_per_iteration": 3.639983892440796 }, { "auxiliary_loss_clip": 0.01191303, "auxiliary_loss_mlp": 0.01033945, "balance_loss_clip": 1.05339086, "balance_loss_mlp": 1.02642274, "epoch": 0.40521854145373654, "flos": 34459484353920.0, "grad_norm": 1.7398060869934628, "language_loss": 0.86501467, "learning_rate": 2.6963068253556535e-06, "loss": 0.88726711, "num_input_tokens_seen": 72776620, "step": 3370, "time_per_iteration": 2.755265951156616 }, { "auxiliary_loss_clip": 0.01151742, "auxiliary_loss_mlp": 0.01031216, "balance_loss_clip": 1.05219102, "balance_loss_mlp": 1.02316904, "epoch": 0.40533878434437565, "flos": 25331099454720.0, "grad_norm": 1.9053074714127805, "language_loss": 0.85824722, "learning_rate": 2.6955765352698763e-06, "loss": 0.88007683, "num_input_tokens_seen": 72796765, "step": 3371, "time_per_iteration": 3.6500113010406494 }, { "auxiliary_loss_clip": 0.01190865, "auxiliary_loss_mlp": 0.01031575, "balance_loss_clip": 1.05133247, "balance_loss_mlp": 1.02365375, "epoch": 0.40545902723501476, "flos": 15012061505280.0, "grad_norm": 1.9856770466404368, "language_loss": 0.732126, "learning_rate": 2.6948461396656923e-06, "loss": 0.75435042, "num_input_tokens_seen": 72814175, "step": 3372, "time_per_iteration": 2.544132709503174 }, { "auxiliary_loss_clip": 0.01148994, "auxiliary_loss_mlp": 0.01027077, "balance_loss_clip": 1.05358052, "balance_loss_mlp": 1.01938248, "epoch": 0.4055792701256538, "flos": 25521103422720.0, "grad_norm": 2.2043164021009742, "language_loss": 0.74543768, "learning_rate": 2.6941156386539013e-06, "loss": 0.76719844, "num_input_tokens_seen": 72834125, "step": 3373, "time_per_iteration": 2.699934720993042 }, { "auxiliary_loss_clip": 0.01285229, "auxiliary_loss_mlp": 0.01026746, "balance_loss_clip": 1.05369568, "balance_loss_mlp": 1.01928687, "epoch": 0.4056995130162929, "flos": 19574583972480.0, "grad_norm": 2.586944821179904, "language_loss": 0.81043416, "learning_rate": 2.6933850323453203e-06, "loss": 0.83355391, "num_input_tokens_seen": 72852570, "step": 3374, "time_per_iteration": 2.699056386947632 }, { "auxiliary_loss_clip": 0.01192111, "auxiliary_loss_mlp": 0.01027072, "balance_loss_clip": 1.05659914, "balance_loss_mlp": 1.01939797, "epoch": 0.405819755906932, "flos": 15413794191360.0, "grad_norm": 2.29812651259609, "language_loss": 0.75108677, "learning_rate": 2.6926543208507806e-06, "loss": 0.77327859, "num_input_tokens_seen": 72871250, "step": 3375, "time_per_iteration": 2.611058235168457 }, { "auxiliary_loss_clip": 0.01239724, "auxiliary_loss_mlp": 0.01031476, "balance_loss_clip": 1.05340123, "balance_loss_mlp": 1.02376318, "epoch": 0.4059399987975711, "flos": 21433930565760.0, "grad_norm": 5.665297264710379, "language_loss": 0.79952765, "learning_rate": 2.6919235042811316e-06, "loss": 0.82223964, "num_input_tokens_seen": 72890035, "step": 3376, "time_per_iteration": 2.6198229789733887 }, { "auxiliary_loss_clip": 0.01339252, "auxiliary_loss_mlp": 0.01032999, "balance_loss_clip": 1.0509578, "balance_loss_mlp": 1.02535725, "epoch": 0.4060602416882102, "flos": 25556942217600.0, "grad_norm": 2.4522781729053857, "language_loss": 0.76213086, "learning_rate": 2.691192582747237e-06, "loss": 0.78585333, "num_input_tokens_seen": 72909665, "step": 3377, "time_per_iteration": 2.802574634552002 }, { "auxiliary_loss_clip": 0.01191214, "auxiliary_loss_mlp": 0.01030672, "balance_loss_clip": 1.0553112, "balance_loss_mlp": 1.02288759, "epoch": 0.40618048457884925, "flos": 23766759262080.0, "grad_norm": 1.8393495861858566, "language_loss": 0.73829019, "learning_rate": 2.6904615563599765e-06, "loss": 0.76050907, "num_input_tokens_seen": 72929465, "step": 3378, "time_per_iteration": 2.6538519859313965 }, { "auxiliary_loss_clip": 0.01334183, "auxiliary_loss_mlp": 0.01027656, "balance_loss_clip": 1.05053568, "balance_loss_mlp": 1.0200628, "epoch": 0.40630072746948837, "flos": 17639681120640.0, "grad_norm": 2.1454294761298596, "language_loss": 0.83649242, "learning_rate": 2.6897304252302477e-06, "loss": 0.86011076, "num_input_tokens_seen": 72946785, "step": 3379, "time_per_iteration": 2.778083324432373 }, { "auxiliary_loss_clip": 0.01243756, "auxiliary_loss_mlp": 0.00999922, "balance_loss_clip": 1.02318835, "balance_loss_mlp": 0.99879593, "epoch": 0.4064209703601275, "flos": 60836053063680.0, "grad_norm": 0.7885054982236578, "language_loss": 0.54816544, "learning_rate": 2.688999189468962e-06, "loss": 0.5706023, "num_input_tokens_seen": 73003215, "step": 3380, "time_per_iteration": 3.159789562225342 }, { "auxiliary_loss_clip": 0.01245686, "auxiliary_loss_mlp": 0.01027492, "balance_loss_clip": 1.05834889, "balance_loss_mlp": 1.02022302, "epoch": 0.40654121325076653, "flos": 24024346669440.0, "grad_norm": 2.4309364226370267, "language_loss": 0.76403844, "learning_rate": 2.6882678491870464e-06, "loss": 0.78677022, "num_input_tokens_seen": 73023650, "step": 3381, "time_per_iteration": 2.6280388832092285 }, { "auxiliary_loss_clip": 0.01244315, "auxiliary_loss_mlp": 0.01022029, "balance_loss_clip": 1.05476868, "balance_loss_mlp": 1.01446486, "epoch": 0.40666145614140564, "flos": 27344252085120.0, "grad_norm": 2.46550835983336, "language_loss": 0.71469152, "learning_rate": 2.6875364044954453e-06, "loss": 0.73735499, "num_input_tokens_seen": 73043880, "step": 3382, "time_per_iteration": 2.734229326248169 }, { "auxiliary_loss_clip": 0.01287938, "auxiliary_loss_mlp": 0.01028428, "balance_loss_clip": 1.04781783, "balance_loss_mlp": 1.0209893, "epoch": 0.40678169903204475, "flos": 26176724415360.0, "grad_norm": 1.6450951256662354, "language_loss": 0.82492268, "learning_rate": 2.6868048555051185e-06, "loss": 0.8480863, "num_input_tokens_seen": 73065410, "step": 3383, "time_per_iteration": 2.678741455078125 }, { "auxiliary_loss_clip": 0.01297468, "auxiliary_loss_mlp": 0.01032404, "balance_loss_clip": 1.05079639, "balance_loss_mlp": 1.02437234, "epoch": 0.4069019419226838, "flos": 28622420622720.0, "grad_norm": 3.3359341886798597, "language_loss": 0.85958177, "learning_rate": 2.686073202327041e-06, "loss": 0.88288051, "num_input_tokens_seen": 73084410, "step": 3384, "time_per_iteration": 2.7544875144958496 }, { "auxiliary_loss_clip": 0.01283012, "auxiliary_loss_mlp": 0.01024419, "balance_loss_clip": 1.0473032, "balance_loss_mlp": 1.0171113, "epoch": 0.4070221848133229, "flos": 25229006023680.0, "grad_norm": 2.4043769439025886, "language_loss": 0.73507953, "learning_rate": 2.6853414450722043e-06, "loss": 0.75815386, "num_input_tokens_seen": 73104075, "step": 3385, "time_per_iteration": 2.723113775253296 }, { "auxiliary_loss_clip": 0.01236028, "auxiliary_loss_mlp": 0.01023336, "balance_loss_clip": 1.05233157, "balance_loss_mlp": 1.01594543, "epoch": 0.40714242770396203, "flos": 18405224709120.0, "grad_norm": 2.2481775859278024, "language_loss": 0.85182691, "learning_rate": 2.684609583851616e-06, "loss": 0.87442052, "num_input_tokens_seen": 73122250, "step": 3386, "time_per_iteration": 3.5689849853515625 }, { "auxiliary_loss_clip": 0.01383572, "auxiliary_loss_mlp": 0.01025945, "balance_loss_clip": 1.04825401, "balance_loss_mlp": 1.01845253, "epoch": 0.4072626705946011, "flos": 30228920403840.0, "grad_norm": 2.3011722095520666, "language_loss": 0.80828285, "learning_rate": 2.683877618776297e-06, "loss": 0.83237803, "num_input_tokens_seen": 73144505, "step": 3387, "time_per_iteration": 2.855247974395752 }, { "auxiliary_loss_clip": 0.01291084, "auxiliary_loss_mlp": 0.01029219, "balance_loss_clip": 1.04971421, "balance_loss_mlp": 1.02151501, "epoch": 0.4073829134852402, "flos": 21834549930240.0, "grad_norm": 2.7817630467354624, "language_loss": 0.73974144, "learning_rate": 2.6831455499572876e-06, "loss": 0.7629444, "num_input_tokens_seen": 73162440, "step": 3388, "time_per_iteration": 2.761523723602295 }, { "auxiliary_loss_clip": 0.01191434, "auxiliary_loss_mlp": 0.01028519, "balance_loss_clip": 1.05478692, "balance_loss_mlp": 1.02098441, "epoch": 0.40750315637587925, "flos": 25260211964160.0, "grad_norm": 2.931507565750685, "language_loss": 0.77997255, "learning_rate": 2.682413377505641e-06, "loss": 0.80217206, "num_input_tokens_seen": 73181245, "step": 3389, "time_per_iteration": 2.6485581398010254 }, { "auxiliary_loss_clip": 0.01237629, "auxiliary_loss_mlp": 0.01027588, "balance_loss_clip": 1.05131698, "balance_loss_mlp": 1.0207572, "epoch": 0.40762339926651836, "flos": 19712767593600.0, "grad_norm": 2.106073116027979, "language_loss": 0.76870501, "learning_rate": 2.6816811015324284e-06, "loss": 0.79135716, "num_input_tokens_seen": 73199295, "step": 3390, "time_per_iteration": 2.6342320442199707 }, { "auxiliary_loss_clip": 0.0108283, "auxiliary_loss_mlp": 0.01001768, "balance_loss_clip": 1.0183692, "balance_loss_mlp": 1.00067711, "epoch": 0.40774364215715747, "flos": 71449307314560.0, "grad_norm": 0.7232778722611614, "language_loss": 0.56665003, "learning_rate": 2.6809487221487343e-06, "loss": 0.58749604, "num_input_tokens_seen": 73258780, "step": 3391, "time_per_iteration": 4.140889406204224 }, { "auxiliary_loss_clip": 0.01236449, "auxiliary_loss_mlp": 0.01027684, "balance_loss_clip": 1.0509789, "balance_loss_mlp": 1.02010846, "epoch": 0.4078638850477965, "flos": 15084134144640.0, "grad_norm": 2.8674100224474133, "language_loss": 0.81620264, "learning_rate": 2.6802162394656605e-06, "loss": 0.83884406, "num_input_tokens_seen": 73275490, "step": 3392, "time_per_iteration": 2.679086446762085 }, { "auxiliary_loss_clip": 0.01283614, "auxiliary_loss_mlp": 0.01025484, "balance_loss_clip": 1.04862738, "balance_loss_mlp": 1.01823568, "epoch": 0.40798412793843564, "flos": 23842890138240.0, "grad_norm": 1.8174719761581786, "language_loss": 0.720411, "learning_rate": 2.679483653594324e-06, "loss": 0.74350202, "num_input_tokens_seen": 73297260, "step": 3393, "time_per_iteration": 2.670738458633423 }, { "auxiliary_loss_clip": 0.012394, "auxiliary_loss_mlp": 0.01022608, "balance_loss_clip": 1.05057955, "balance_loss_mlp": 1.01555634, "epoch": 0.40810437082907475, "flos": 21065774117760.0, "grad_norm": 2.5515420024287825, "language_loss": 0.76714998, "learning_rate": 2.678750964645857e-06, "loss": 0.78977007, "num_input_tokens_seen": 73316340, "step": 3394, "time_per_iteration": 2.6913084983825684 }, { "auxiliary_loss_clip": 0.01247226, "auxiliary_loss_mlp": 0.01031985, "balance_loss_clip": 1.05836761, "balance_loss_mlp": 1.02427208, "epoch": 0.4082246137197138, "flos": 11321377948800.0, "grad_norm": 2.746921485815622, "language_loss": 0.83854157, "learning_rate": 2.6780181727314094e-06, "loss": 0.86133367, "num_input_tokens_seen": 73331245, "step": 3395, "time_per_iteration": 2.5898985862731934 }, { "auxiliary_loss_clip": 0.01339933, "auxiliary_loss_mlp": 0.02564638, "balance_loss_clip": 1.04974008, "balance_loss_mlp": 1.00016177, "epoch": 0.4083448566103529, "flos": 19062569554560.0, "grad_norm": 1.935232502812014, "language_loss": 0.7800256, "learning_rate": 2.6772852779621435e-06, "loss": 0.81907129, "num_input_tokens_seen": 73349105, "step": 3396, "time_per_iteration": 3.7508294582366943 }, { "auxiliary_loss_clip": 0.01245087, "auxiliary_loss_mlp": 0.02562749, "balance_loss_clip": 1.05972219, "balance_loss_mlp": 1.00016069, "epoch": 0.408465099500992, "flos": 23550254035200.0, "grad_norm": 1.9788830862283877, "language_loss": 0.86872286, "learning_rate": 2.676552280449239e-06, "loss": 0.90680122, "num_input_tokens_seen": 73368990, "step": 3397, "time_per_iteration": 3.6710002422332764 }, { "auxiliary_loss_clip": 0.01233857, "auxiliary_loss_mlp": 0.01028657, "balance_loss_clip": 1.05018198, "balance_loss_mlp": 1.02114654, "epoch": 0.4085853423916311, "flos": 12750012558720.0, "grad_norm": 2.6786452360537254, "language_loss": 0.76243079, "learning_rate": 2.6758191803038917e-06, "loss": 0.78505599, "num_input_tokens_seen": 73387485, "step": 3398, "time_per_iteration": 2.663933515548706 }, { "auxiliary_loss_clip": 0.01431709, "auxiliary_loss_mlp": 0.01031739, "balance_loss_clip": 1.04695463, "balance_loss_mlp": 1.02371645, "epoch": 0.4087055852822702, "flos": 24353072962560.0, "grad_norm": 1.8276939095655227, "language_loss": 0.82641292, "learning_rate": 2.6750859776373125e-06, "loss": 0.8510474, "num_input_tokens_seen": 73406940, "step": 3399, "time_per_iteration": 2.811918258666992 }, { "auxiliary_loss_clip": 0.0135641, "auxiliary_loss_mlp": 0.01002041, "balance_loss_clip": 1.01945746, "balance_loss_mlp": 1.00090206, "epoch": 0.4088258281729093, "flos": 66387950720640.0, "grad_norm": 0.7703250316028225, "language_loss": 0.60370618, "learning_rate": 2.674352672560727e-06, "loss": 0.62729067, "num_input_tokens_seen": 73468385, "step": 3400, "time_per_iteration": 3.43412709236145 }, { "auxiliary_loss_clip": 0.01242519, "auxiliary_loss_mlp": 0.01031876, "balance_loss_clip": 1.0482899, "balance_loss_mlp": 1.02457428, "epoch": 0.40894607106354836, "flos": 20449260057600.0, "grad_norm": 1.7174132667274329, "language_loss": 0.77247441, "learning_rate": 2.673619265185377e-06, "loss": 0.79521847, "num_input_tokens_seen": 73488225, "step": 3401, "time_per_iteration": 2.9197418689727783 }, { "auxiliary_loss_clip": 0.01242157, "auxiliary_loss_mlp": 0.01025486, "balance_loss_clip": 1.05114114, "balance_loss_mlp": 1.01817584, "epoch": 0.40906631395418747, "flos": 27053627143680.0, "grad_norm": 2.6396740919602584, "language_loss": 0.78435791, "learning_rate": 2.672885755622521e-06, "loss": 0.80703425, "num_input_tokens_seen": 73510640, "step": 3402, "time_per_iteration": 2.7261698246002197 }, { "auxiliary_loss_clip": 0.01379716, "auxiliary_loss_mlp": 0.01022121, "balance_loss_clip": 1.04465413, "balance_loss_mlp": 1.0150485, "epoch": 0.4091865568448266, "flos": 25484151306240.0, "grad_norm": 8.592940016407335, "language_loss": 0.70203084, "learning_rate": 2.67215214398343e-06, "loss": 0.72604918, "num_input_tokens_seen": 73530655, "step": 3403, "time_per_iteration": 2.765868663787842 }, { "auxiliary_loss_clip": 0.01387296, "auxiliary_loss_mlp": 0.01030014, "balance_loss_clip": 1.04481435, "balance_loss_mlp": 1.02243185, "epoch": 0.40930679973546563, "flos": 28657864368000.0, "grad_norm": 2.666016552108034, "language_loss": 0.7809478, "learning_rate": 2.671418430379393e-06, "loss": 0.80512089, "num_input_tokens_seen": 73549340, "step": 3404, "time_per_iteration": 2.8611412048339844 }, { "auxiliary_loss_clip": 0.0118844, "auxiliary_loss_mlp": 0.0102266, "balance_loss_clip": 1.05174673, "balance_loss_mlp": 1.01515007, "epoch": 0.40942704262610474, "flos": 20886292834560.0, "grad_norm": 2.914430393149479, "language_loss": 0.83528447, "learning_rate": 2.670684614921715e-06, "loss": 0.85739541, "num_input_tokens_seen": 73568315, "step": 3405, "time_per_iteration": 2.612776517868042 }, { "auxiliary_loss_clip": 0.01292322, "auxiliary_loss_mlp": 0.01026303, "balance_loss_clip": 1.04967308, "balance_loss_mlp": 1.01864374, "epoch": 0.4095472855167438, "flos": 21618080616960.0, "grad_norm": 2.312974371386244, "language_loss": 0.69203871, "learning_rate": 2.6699506977217128e-06, "loss": 0.71522498, "num_input_tokens_seen": 73588490, "step": 3406, "time_per_iteration": 2.77194881439209 }, { "auxiliary_loss_clip": 0.01236599, "auxiliary_loss_mlp": 0.0102612, "balance_loss_clip": 1.05367649, "balance_loss_mlp": 1.01918817, "epoch": 0.4096675284073829, "flos": 27926112499200.0, "grad_norm": 3.107727204260342, "language_loss": 0.70348537, "learning_rate": 2.6692166788907233e-06, "loss": 0.72611254, "num_input_tokens_seen": 73608685, "step": 3407, "time_per_iteration": 3.036036252975464 }, { "auxiliary_loss_clip": 0.01291122, "auxiliary_loss_mlp": 0.01035465, "balance_loss_clip": 1.05040908, "balance_loss_mlp": 1.02787733, "epoch": 0.409787771298022, "flos": 19206607092480.0, "grad_norm": 2.0828762609037326, "language_loss": 0.77089024, "learning_rate": 2.6684825585400957e-06, "loss": 0.79415607, "num_input_tokens_seen": 73627630, "step": 3408, "time_per_iteration": 2.7924749851226807 }, { "auxiliary_loss_clip": 0.01183495, "auxiliary_loss_mlp": 0.01002401, "balance_loss_clip": 1.01866484, "balance_loss_mlp": 1.00131595, "epoch": 0.4099080141886611, "flos": 59269234832640.0, "grad_norm": 0.8056190039259631, "language_loss": 0.65086895, "learning_rate": 2.6677483367811947e-06, "loss": 0.672728, "num_input_tokens_seen": 73687670, "step": 3409, "time_per_iteration": 3.4256017208099365 }, { "auxiliary_loss_clip": 0.01241701, "auxiliary_loss_mlp": 0.01024396, "balance_loss_clip": 1.05159616, "balance_loss_mlp": 1.01668668, "epoch": 0.4100282570793002, "flos": 21906443001600.0, "grad_norm": 1.8344471732317216, "language_loss": 0.75822699, "learning_rate": 2.6670140137254028e-06, "loss": 0.78088796, "num_input_tokens_seen": 73707145, "step": 3410, "time_per_iteration": 2.741098403930664 }, { "auxiliary_loss_clip": 0.01383336, "auxiliary_loss_mlp": 0.01030093, "balance_loss_clip": 1.04530144, "balance_loss_mlp": 1.02259493, "epoch": 0.4101484999699393, "flos": 18551596631040.0, "grad_norm": 3.2703778335006413, "language_loss": 0.898633, "learning_rate": 2.666279589484115e-06, "loss": 0.92276734, "num_input_tokens_seen": 73725045, "step": 3411, "time_per_iteration": 2.812255620956421 }, { "auxiliary_loss_clip": 0.01389256, "auxiliary_loss_mlp": 0.01026674, "balance_loss_clip": 1.04498994, "balance_loss_mlp": 1.01936018, "epoch": 0.41026874286057835, "flos": 19094529680640.0, "grad_norm": 1.9749993411546891, "language_loss": 0.81185961, "learning_rate": 2.6655450641687435e-06, "loss": 0.83601886, "num_input_tokens_seen": 73742610, "step": 3412, "time_per_iteration": 2.8383944034576416 }, { "auxiliary_loss_clip": 0.01188366, "auxiliary_loss_mlp": 0.01024924, "balance_loss_clip": 1.05402684, "balance_loss_mlp": 1.01760447, "epoch": 0.41038898575121746, "flos": 31209568588800.0, "grad_norm": 1.9658199344511, "language_loss": 0.69017488, "learning_rate": 2.664810437890715e-06, "loss": 0.71230775, "num_input_tokens_seen": 73764280, "step": 3413, "time_per_iteration": 4.144192218780518 }, { "auxiliary_loss_clip": 0.01423754, "auxiliary_loss_mlp": 0.01025421, "balance_loss_clip": 1.05111098, "balance_loss_mlp": 1.01808977, "epoch": 0.41050922864185657, "flos": 14355865895040.0, "grad_norm": 1.9018670832741602, "language_loss": 0.79471815, "learning_rate": 2.6640757107614714e-06, "loss": 0.81920993, "num_input_tokens_seen": 73782375, "step": 3414, "time_per_iteration": 2.881629228591919 }, { "auxiliary_loss_clip": 0.01342642, "auxiliary_loss_mlp": 0.01029634, "balance_loss_clip": 1.05431986, "balance_loss_mlp": 1.02189386, "epoch": 0.4106294715324956, "flos": 30956290813440.0, "grad_norm": 2.7070817046834827, "language_loss": 0.69281614, "learning_rate": 2.6633408828924697e-06, "loss": 0.71653885, "num_input_tokens_seen": 73801240, "step": 3415, "time_per_iteration": 2.853459358215332 }, { "auxiliary_loss_clip": 0.01344783, "auxiliary_loss_mlp": 0.01026667, "balance_loss_clip": 1.05038309, "balance_loss_mlp": 1.01972282, "epoch": 0.41074971442313474, "flos": 24457321209600.0, "grad_norm": 1.579221958658815, "language_loss": 0.69871843, "learning_rate": 2.662605954395185e-06, "loss": 0.72243285, "num_input_tokens_seen": 73821200, "step": 3416, "time_per_iteration": 2.899362325668335 }, { "auxiliary_loss_clip": 0.01240445, "auxiliary_loss_mlp": 0.01027903, "balance_loss_clip": 1.04999065, "balance_loss_mlp": 1.02108157, "epoch": 0.41086995731377385, "flos": 21542991235200.0, "grad_norm": 2.0480351944001582, "language_loss": 0.83772099, "learning_rate": 2.6618709253811027e-06, "loss": 0.86040449, "num_input_tokens_seen": 73840655, "step": 3417, "time_per_iteration": 3.673060894012451 }, { "auxiliary_loss_clip": 0.01184151, "auxiliary_loss_mlp": 0.01024392, "balance_loss_clip": 1.05265832, "balance_loss_mlp": 1.01805282, "epoch": 0.4109902002044129, "flos": 20702753314560.0, "grad_norm": 2.0385630439023705, "language_loss": 0.87677896, "learning_rate": 2.6611357959617277e-06, "loss": 0.89886439, "num_input_tokens_seen": 73860275, "step": 3418, "time_per_iteration": 2.703909397125244 }, { "auxiliary_loss_clip": 0.01329736, "auxiliary_loss_mlp": 0.01022778, "balance_loss_clip": 1.04660356, "balance_loss_mlp": 1.01548862, "epoch": 0.411110443095052, "flos": 18179992477440.0, "grad_norm": 2.174090862382698, "language_loss": 0.91230798, "learning_rate": 2.660400566248578e-06, "loss": 0.93583322, "num_input_tokens_seen": 73878400, "step": 3419, "time_per_iteration": 2.7969768047332764 }, { "auxiliary_loss_clip": 0.01337312, "auxiliary_loss_mlp": 0.01022051, "balance_loss_clip": 1.04858851, "balance_loss_mlp": 1.01427567, "epoch": 0.41123068598569107, "flos": 14575244209920.0, "grad_norm": 2.910423674675818, "language_loss": 0.67426181, "learning_rate": 2.6596652363531876e-06, "loss": 0.69785535, "num_input_tokens_seen": 73894275, "step": 3420, "time_per_iteration": 2.7155120372772217 }, { "auxiliary_loss_clip": 0.01187601, "auxiliary_loss_mlp": 0.01035496, "balance_loss_clip": 1.0543251, "balance_loss_mlp": 1.02829576, "epoch": 0.4113509288763302, "flos": 21177995184000.0, "grad_norm": 1.8916494772563757, "language_loss": 0.78286296, "learning_rate": 2.6589298063871055e-06, "loss": 0.80509394, "num_input_tokens_seen": 73914450, "step": 3421, "time_per_iteration": 2.6984000205993652 }, { "auxiliary_loss_clip": 0.01189152, "auxiliary_loss_mlp": 0.01030726, "balance_loss_clip": 1.05382705, "balance_loss_mlp": 1.02287602, "epoch": 0.4114711717669693, "flos": 18442212739200.0, "grad_norm": 2.0503536982815715, "language_loss": 0.7028572, "learning_rate": 2.658194276461895e-06, "loss": 0.72505593, "num_input_tokens_seen": 73932375, "step": 3422, "time_per_iteration": 3.558162212371826 }, { "auxiliary_loss_clip": 0.01291955, "auxiliary_loss_mlp": 0.01027787, "balance_loss_clip": 1.04692912, "balance_loss_mlp": 1.01949644, "epoch": 0.41159141465760835, "flos": 27233395735680.0, "grad_norm": 2.04710064849765, "language_loss": 0.66770971, "learning_rate": 2.6574586466891368e-06, "loss": 0.69090718, "num_input_tokens_seen": 73952850, "step": 3423, "time_per_iteration": 3.7355690002441406 }, { "auxiliary_loss_clip": 0.01288194, "auxiliary_loss_mlp": 0.02562237, "balance_loss_clip": 1.05085421, "balance_loss_mlp": 1.00008619, "epoch": 0.41171165754824746, "flos": 20006876154240.0, "grad_norm": 3.5174989301280957, "language_loss": 0.64636385, "learning_rate": 2.6567229171804247e-06, "loss": 0.68486822, "num_input_tokens_seen": 73970735, "step": 3424, "time_per_iteration": 2.793581247329712 }, { "auxiliary_loss_clip": 0.01344347, "auxiliary_loss_mlp": 0.01028255, "balance_loss_clip": 1.04875171, "balance_loss_mlp": 1.02030945, "epoch": 0.41183190043888657, "flos": 18004318035840.0, "grad_norm": 3.907731571582285, "language_loss": 0.87689269, "learning_rate": 2.655987088047368e-06, "loss": 0.90061867, "num_input_tokens_seen": 73989080, "step": 3425, "time_per_iteration": 2.7631752490997314 }, { "auxiliary_loss_clip": 0.01289289, "auxiliary_loss_mlp": 0.01030542, "balance_loss_clip": 1.04984164, "balance_loss_mlp": 1.02273393, "epoch": 0.4119521433295256, "flos": 27163370171520.0, "grad_norm": 8.407624754882306, "language_loss": 0.78635609, "learning_rate": 2.6552511594015912e-06, "loss": 0.80955434, "num_input_tokens_seen": 74009470, "step": 3426, "time_per_iteration": 2.792881727218628 }, { "auxiliary_loss_clip": 0.01289618, "auxiliary_loss_mlp": 0.01030034, "balance_loss_clip": 1.04701424, "balance_loss_mlp": 1.02196991, "epoch": 0.41207238622016473, "flos": 15122020014720.0, "grad_norm": 4.187623977959082, "language_loss": 0.84724671, "learning_rate": 2.654515131354735e-06, "loss": 0.87044322, "num_input_tokens_seen": 74027735, "step": 3427, "time_per_iteration": 2.701958656311035 }, { "auxiliary_loss_clip": 0.01337083, "auxiliary_loss_mlp": 0.01025392, "balance_loss_clip": 1.05029607, "balance_loss_mlp": 1.01819181, "epoch": 0.41219262911080384, "flos": 27052872958080.0, "grad_norm": 2.2264868434221703, "language_loss": 0.8512516, "learning_rate": 2.653779004018453e-06, "loss": 0.87487632, "num_input_tokens_seen": 74048300, "step": 3428, "time_per_iteration": 2.770838499069214 }, { "auxiliary_loss_clip": 0.01285869, "auxiliary_loss_mlp": 0.0102458, "balance_loss_clip": 1.04987931, "balance_loss_mlp": 1.01748729, "epoch": 0.4123128720014429, "flos": 24686360282880.0, "grad_norm": 2.056065244570253, "language_loss": 0.82363629, "learning_rate": 2.653042777504417e-06, "loss": 0.84674078, "num_input_tokens_seen": 74070890, "step": 3429, "time_per_iteration": 2.8045990467071533 }, { "auxiliary_loss_clip": 0.01198208, "auxiliary_loss_mlp": 0.01023749, "balance_loss_clip": 1.05017483, "balance_loss_mlp": 1.01559508, "epoch": 0.412433114892082, "flos": 26244774731520.0, "grad_norm": 2.0731037432394404, "language_loss": 0.79862177, "learning_rate": 2.6523064519243105e-06, "loss": 0.82084137, "num_input_tokens_seen": 74090460, "step": 3430, "time_per_iteration": 2.760378360748291 }, { "auxiliary_loss_clip": 0.01241199, "auxiliary_loss_mlp": 0.01029383, "balance_loss_clip": 1.05485082, "balance_loss_mlp": 1.02154779, "epoch": 0.4125533577827211, "flos": 21361031913600.0, "grad_norm": 2.6877310455088943, "language_loss": 0.78998733, "learning_rate": 2.6515700273898333e-06, "loss": 0.81269312, "num_input_tokens_seen": 74108335, "step": 3431, "time_per_iteration": 2.740703821182251 }, { "auxiliary_loss_clip": 0.01290396, "auxiliary_loss_mlp": 0.01031893, "balance_loss_clip": 1.05524659, "balance_loss_mlp": 1.02404356, "epoch": 0.4126736006733602, "flos": 26067556005120.0, "grad_norm": 3.1487213183405847, "language_loss": 0.69150746, "learning_rate": 2.6508335040127018e-06, "loss": 0.71473038, "num_input_tokens_seen": 74128030, "step": 3432, "time_per_iteration": 2.7816851139068604 }, { "auxiliary_loss_clip": 0.01244467, "auxiliary_loss_mlp": 0.01025317, "balance_loss_clip": 1.05397391, "balance_loss_mlp": 1.01857901, "epoch": 0.4127938435639993, "flos": 25666146541440.0, "grad_norm": 1.522179845436325, "language_loss": 0.76766682, "learning_rate": 2.6500968819046446e-06, "loss": 0.79036474, "num_input_tokens_seen": 74148330, "step": 3433, "time_per_iteration": 2.8021676540374756 }, { "auxiliary_loss_clip": 0.01329626, "auxiliary_loss_mlp": 0.01026428, "balance_loss_clip": 1.0444839, "balance_loss_mlp": 1.01916218, "epoch": 0.4129140864546384, "flos": 17995914253440.0, "grad_norm": 2.429788153725156, "language_loss": 0.59093642, "learning_rate": 2.649360161177408e-06, "loss": 0.61449695, "num_input_tokens_seen": 74163390, "step": 3434, "time_per_iteration": 2.7132630348205566 }, { "auxiliary_loss_clip": 0.01246623, "auxiliary_loss_mlp": 0.01026567, "balance_loss_clip": 1.0535388, "balance_loss_mlp": 1.01837087, "epoch": 0.41303432934527745, "flos": 23732895715200.0, "grad_norm": 2.666224697727589, "language_loss": 0.73453641, "learning_rate": 2.6486233419427504e-06, "loss": 0.75726825, "num_input_tokens_seen": 74183205, "step": 3435, "time_per_iteration": 2.7287209033966064 }, { "auxiliary_loss_clip": 0.01334895, "auxiliary_loss_mlp": 0.01027155, "balance_loss_clip": 1.05178642, "balance_loss_mlp": 1.01918268, "epoch": 0.41315457223591656, "flos": 19755286318080.0, "grad_norm": 2.5994008615321924, "language_loss": 0.75279236, "learning_rate": 2.6478864243124484e-06, "loss": 0.77641284, "num_input_tokens_seen": 74202870, "step": 3436, "time_per_iteration": 2.726447582244873 }, { "auxiliary_loss_clip": 0.01240117, "auxiliary_loss_mlp": 0.01022721, "balance_loss_clip": 1.05079842, "balance_loss_mlp": 1.0145607, "epoch": 0.4132748151265556, "flos": 20923316778240.0, "grad_norm": 2.106694929150256, "language_loss": 0.8519429, "learning_rate": 2.6471494083982903e-06, "loss": 0.87457126, "num_input_tokens_seen": 74222255, "step": 3437, "time_per_iteration": 2.696732759475708 }, { "auxiliary_loss_clip": 0.01340516, "auxiliary_loss_mlp": 0.01027242, "balance_loss_clip": 1.04787827, "balance_loss_mlp": 1.01936817, "epoch": 0.4133950580171947, "flos": 32232520016640.0, "grad_norm": 2.3383256361539324, "language_loss": 0.74661726, "learning_rate": 2.6464122943120818e-06, "loss": 0.77029479, "num_input_tokens_seen": 74242480, "step": 3438, "time_per_iteration": 3.9563517570495605 }, { "auxiliary_loss_clip": 0.01335185, "auxiliary_loss_mlp": 0.01028722, "balance_loss_clip": 1.05068946, "balance_loss_mlp": 1.0211103, "epoch": 0.41351530090783384, "flos": 23292487059840.0, "grad_norm": 3.954108249018504, "language_loss": 0.81703019, "learning_rate": 2.645675082165642e-06, "loss": 0.84066927, "num_input_tokens_seen": 74258690, "step": 3439, "time_per_iteration": 2.7422218322753906 }, { "auxiliary_loss_clip": 0.01288689, "auxiliary_loss_mlp": 0.01029532, "balance_loss_clip": 1.05482066, "balance_loss_mlp": 1.02163696, "epoch": 0.4136355437984729, "flos": 25593571111680.0, "grad_norm": 2.525371250857849, "language_loss": 0.75165063, "learning_rate": 2.644937772070806e-06, "loss": 0.7748329, "num_input_tokens_seen": 74277135, "step": 3440, "time_per_iteration": 2.77412748336792 }, { "auxiliary_loss_clip": 0.01191515, "auxiliary_loss_mlp": 0.01027287, "balance_loss_clip": 1.0548749, "balance_loss_mlp": 1.02002692, "epoch": 0.413755786689112, "flos": 19828615933440.0, "grad_norm": 4.227914518325167, "language_loss": 0.83190191, "learning_rate": 2.6442003641394225e-06, "loss": 0.85408992, "num_input_tokens_seen": 74294730, "step": 3441, "time_per_iteration": 2.604081869125366 }, { "auxiliary_loss_clip": 0.01288316, "auxiliary_loss_mlp": 0.01024531, "balance_loss_clip": 1.04704285, "balance_loss_mlp": 1.01710463, "epoch": 0.4138760295797511, "flos": 26870446759680.0, "grad_norm": 1.735743921684924, "language_loss": 0.84071338, "learning_rate": 2.643462858483356e-06, "loss": 0.86384189, "num_input_tokens_seen": 74315015, "step": 3442, "time_per_iteration": 2.7406692504882812 }, { "auxiliary_loss_clip": 0.01394011, "auxiliary_loss_mlp": 0.01026911, "balance_loss_clip": 1.05126333, "balance_loss_mlp": 1.01931751, "epoch": 0.41399627247039017, "flos": 16399254798720.0, "grad_norm": 1.9783840137364428, "language_loss": 0.7296164, "learning_rate": 2.6427252552144856e-06, "loss": 0.7538256, "num_input_tokens_seen": 74333665, "step": 3443, "time_per_iteration": 3.7323341369628906 }, { "auxiliary_loss_clip": 0.0118824, "auxiliary_loss_mlp": 0.01028534, "balance_loss_clip": 1.05247414, "balance_loss_mlp": 1.02098775, "epoch": 0.4141165153610293, "flos": 22930220442240.0, "grad_norm": 4.019175047902539, "language_loss": 0.75255948, "learning_rate": 2.6419875544447044e-06, "loss": 0.77472723, "num_input_tokens_seen": 74355065, "step": 3444, "time_per_iteration": 2.677738904953003 }, { "auxiliary_loss_clip": 0.01190873, "auxiliary_loss_mlp": 0.01027539, "balance_loss_clip": 1.05345976, "balance_loss_mlp": 1.01962376, "epoch": 0.4142367582516684, "flos": 25192556697600.0, "grad_norm": 4.548010104554467, "language_loss": 0.71635532, "learning_rate": 2.6412497562859218e-06, "loss": 0.73853946, "num_input_tokens_seen": 74376345, "step": 3445, "time_per_iteration": 2.71471905708313 }, { "auxiliary_loss_clip": 0.01246065, "auxiliary_loss_mlp": 0.01025024, "balance_loss_clip": 1.05312979, "balance_loss_mlp": 1.01751363, "epoch": 0.41435700114230745, "flos": 21690476478720.0, "grad_norm": 2.4943536854922197, "language_loss": 0.76508611, "learning_rate": 2.6405118608500617e-06, "loss": 0.78779703, "num_input_tokens_seen": 74395170, "step": 3446, "time_per_iteration": 2.6716742515563965 }, { "auxiliary_loss_clip": 0.01329587, "auxiliary_loss_mlp": 0.0102971, "balance_loss_clip": 1.05298138, "balance_loss_mlp": 1.02241993, "epoch": 0.41447724403294656, "flos": 25995160143360.0, "grad_norm": 2.106277937127679, "language_loss": 0.81415582, "learning_rate": 2.6397738682490613e-06, "loss": 0.83774871, "num_input_tokens_seen": 74416070, "step": 3447, "time_per_iteration": 3.639651298522949 }, { "auxiliary_loss_clip": 0.01191194, "auxiliary_loss_mlp": 0.01027677, "balance_loss_clip": 1.05397439, "balance_loss_mlp": 1.01974988, "epoch": 0.41459748692358567, "flos": 18259678800000.0, "grad_norm": 1.822489201483627, "language_loss": 0.75232446, "learning_rate": 2.6390357785948734e-06, "loss": 0.77451319, "num_input_tokens_seen": 74433185, "step": 3448, "time_per_iteration": 2.6237621307373047 }, { "auxiliary_loss_clip": 0.0123899, "auxiliary_loss_mlp": 0.01024683, "balance_loss_clip": 1.05391073, "balance_loss_mlp": 1.01671338, "epoch": 0.4147177298142247, "flos": 24168456034560.0, "grad_norm": 1.8716311464410627, "language_loss": 0.80341017, "learning_rate": 2.6382975919994667e-06, "loss": 0.82604688, "num_input_tokens_seen": 74453760, "step": 3449, "time_per_iteration": 3.6483283042907715 }, { "auxiliary_loss_clip": 0.01290623, "auxiliary_loss_mlp": 0.01028896, "balance_loss_clip": 1.05079269, "balance_loss_mlp": 1.02166557, "epoch": 0.41483797270486383, "flos": 20084659056000.0, "grad_norm": 2.025649773561189, "language_loss": 0.7311272, "learning_rate": 2.637559308574822e-06, "loss": 0.75432241, "num_input_tokens_seen": 74473505, "step": 3450, "time_per_iteration": 2.7601804733276367 }, { "auxiliary_loss_clip": 0.0119078, "auxiliary_loss_mlp": 0.01027261, "balance_loss_clip": 1.05383062, "balance_loss_mlp": 1.01978016, "epoch": 0.4149582155955029, "flos": 30081040110720.0, "grad_norm": 2.291885187387876, "language_loss": 0.7115587, "learning_rate": 2.6368209284329376e-06, "loss": 0.73373902, "num_input_tokens_seen": 74494135, "step": 3451, "time_per_iteration": 2.7165627479553223 }, { "auxiliary_loss_clip": 0.01240799, "auxiliary_loss_mlp": 0.01035212, "balance_loss_clip": 1.05159295, "balance_loss_mlp": 1.02721274, "epoch": 0.415078458486142, "flos": 16764394504320.0, "grad_norm": 2.159340070245855, "language_loss": 0.75457895, "learning_rate": 2.636082451685825e-06, "loss": 0.77733904, "num_input_tokens_seen": 74512335, "step": 3452, "time_per_iteration": 2.714768886566162 }, { "auxiliary_loss_clip": 0.0129165, "auxiliary_loss_mlp": 0.01037597, "balance_loss_clip": 1.05421591, "balance_loss_mlp": 1.02953267, "epoch": 0.4151987013767811, "flos": 26033692458240.0, "grad_norm": 4.418626119274522, "language_loss": 0.86444253, "learning_rate": 2.6353438784455094e-06, "loss": 0.88773501, "num_input_tokens_seen": 74535620, "step": 3453, "time_per_iteration": 2.750631332397461 }, { "auxiliary_loss_clip": 0.01288077, "auxiliary_loss_mlp": 0.01032798, "balance_loss_clip": 1.05043745, "balance_loss_mlp": 1.02506101, "epoch": 0.41531894426742016, "flos": 24608002763520.0, "grad_norm": 3.0326179560812716, "language_loss": 0.71790469, "learning_rate": 2.6346052088240326e-06, "loss": 0.74111342, "num_input_tokens_seen": 74555140, "step": 3454, "time_per_iteration": 2.7449517250061035 }, { "auxiliary_loss_clip": 0.01290428, "auxiliary_loss_mlp": 0.01026279, "balance_loss_clip": 1.05258536, "balance_loss_mlp": 1.0186317, "epoch": 0.4154391871580593, "flos": 14975791747200.0, "grad_norm": 2.1229418480673026, "language_loss": 0.77351755, "learning_rate": 2.63386644293345e-06, "loss": 0.79668462, "num_input_tokens_seen": 74571485, "step": 3455, "time_per_iteration": 2.736422061920166 }, { "auxiliary_loss_clip": 0.01340046, "auxiliary_loss_mlp": 0.01024508, "balance_loss_clip": 1.04702592, "balance_loss_mlp": 1.0170579, "epoch": 0.4155594300486984, "flos": 14647173194880.0, "grad_norm": 2.2451186699814785, "language_loss": 0.83104342, "learning_rate": 2.633127580885833e-06, "loss": 0.854689, "num_input_tokens_seen": 74585985, "step": 3456, "time_per_iteration": 2.6913840770721436 }, { "auxiliary_loss_clip": 0.01189626, "auxiliary_loss_mlp": 0.01026873, "balance_loss_clip": 1.05533004, "balance_loss_mlp": 1.01968729, "epoch": 0.41567967293933744, "flos": 29497276275840.0, "grad_norm": 8.270469014984116, "language_loss": 0.64535695, "learning_rate": 2.632388622793265e-06, "loss": 0.66752195, "num_input_tokens_seen": 74605140, "step": 3457, "time_per_iteration": 2.7149815559387207 }, { "auxiliary_loss_clip": 0.01242714, "auxiliary_loss_mlp": 0.01037383, "balance_loss_clip": 1.05354428, "balance_loss_mlp": 1.02997684, "epoch": 0.41579991582997655, "flos": 19238387650560.0, "grad_norm": 1.7708306661256419, "language_loss": 0.67720616, "learning_rate": 2.6316495687678457e-06, "loss": 0.7000072, "num_input_tokens_seen": 74623790, "step": 3458, "time_per_iteration": 2.654083013534546 }, { "auxiliary_loss_clip": 0.0138611, "auxiliary_loss_mlp": 0.01022203, "balance_loss_clip": 1.04687047, "balance_loss_mlp": 1.01471066, "epoch": 0.41592015872061566, "flos": 24462061804800.0, "grad_norm": 3.462554846085933, "language_loss": 0.76662314, "learning_rate": 2.6309104189216887e-06, "loss": 0.79070628, "num_input_tokens_seen": 74641355, "step": 3459, "time_per_iteration": 2.9080283641815186 }, { "auxiliary_loss_clip": 0.01329838, "auxiliary_loss_mlp": 0.02569634, "balance_loss_clip": 1.04704988, "balance_loss_mlp": 1.00015163, "epoch": 0.4160404016112547, "flos": 20775651966720.0, "grad_norm": 2.2308392140511857, "language_loss": 0.7474463, "learning_rate": 2.630171173366923e-06, "loss": 0.78644097, "num_input_tokens_seen": 74657155, "step": 3460, "time_per_iteration": 2.806871175765991 }, { "auxiliary_loss_clip": 0.01389093, "auxiliary_loss_mlp": 0.01027018, "balance_loss_clip": 1.04782808, "balance_loss_mlp": 1.01934052, "epoch": 0.41616064450189383, "flos": 13916462820480.0, "grad_norm": 3.7243125509293673, "language_loss": 0.73957539, "learning_rate": 2.629431832215691e-06, "loss": 0.76373649, "num_input_tokens_seen": 74671960, "step": 3461, "time_per_iteration": 2.834475517272949 }, { "auxiliary_loss_clip": 0.01284005, "auxiliary_loss_mlp": 0.01027782, "balance_loss_clip": 1.05241072, "balance_loss_mlp": 1.02055526, "epoch": 0.41628088739253294, "flos": 20010826650240.0, "grad_norm": 1.7626758313345239, "language_loss": 0.87154514, "learning_rate": 2.628692395580151e-06, "loss": 0.89466298, "num_input_tokens_seen": 74692050, "step": 3462, "time_per_iteration": 2.815366506576538 }, { "auxiliary_loss_clip": 0.01426457, "auxiliary_loss_mlp": 0.01032805, "balance_loss_clip": 1.04092336, "balance_loss_mlp": 1.02491903, "epoch": 0.416401130283172, "flos": 29168801377920.0, "grad_norm": 2.3203596239577577, "language_loss": 0.79634529, "learning_rate": 2.6279528635724747e-06, "loss": 0.82093793, "num_input_tokens_seen": 74712205, "step": 3463, "time_per_iteration": 2.888267755508423 }, { "auxiliary_loss_clip": 0.01240644, "auxiliary_loss_mlp": 0.0103276, "balance_loss_clip": 1.05039001, "balance_loss_mlp": 1.02428985, "epoch": 0.4165213731738111, "flos": 16246813478400.0, "grad_norm": 3.731732230332921, "language_loss": 0.78120089, "learning_rate": 2.627213236304848e-06, "loss": 0.80393487, "num_input_tokens_seen": 74729005, "step": 3464, "time_per_iteration": 3.7810938358306885 }, { "auxiliary_loss_clip": 0.01242335, "auxiliary_loss_mlp": 0.01025373, "balance_loss_clip": 1.05150509, "balance_loss_mlp": 1.01752901, "epoch": 0.4166416160644502, "flos": 33765438787200.0, "grad_norm": 1.9433567396067204, "language_loss": 0.70655692, "learning_rate": 2.626473513889472e-06, "loss": 0.72923398, "num_input_tokens_seen": 74751385, "step": 3465, "time_per_iteration": 2.698026180267334 }, { "auxiliary_loss_clip": 0.01236263, "auxiliary_loss_mlp": 0.01027241, "balance_loss_clip": 1.05233169, "balance_loss_mlp": 1.01983261, "epoch": 0.41676185895508927, "flos": 20917498775040.0, "grad_norm": 2.197467739205825, "language_loss": 0.8246485, "learning_rate": 2.625733696438562e-06, "loss": 0.84728354, "num_input_tokens_seen": 74768890, "step": 3466, "time_per_iteration": 2.690098285675049 }, { "auxiliary_loss_clip": 0.01288211, "auxiliary_loss_mlp": 0.01033754, "balance_loss_clip": 1.05062962, "balance_loss_mlp": 1.02613711, "epoch": 0.4168821018457284, "flos": 18406122549120.0, "grad_norm": 1.6923923185990672, "language_loss": 0.75099432, "learning_rate": 2.6249937840643476e-06, "loss": 0.77421391, "num_input_tokens_seen": 74787195, "step": 3467, "time_per_iteration": 2.658403158187866 }, { "auxiliary_loss_clip": 0.01195116, "auxiliary_loss_mlp": 0.02567838, "balance_loss_clip": 1.05813634, "balance_loss_mlp": 1.00021863, "epoch": 0.41700234473636744, "flos": 18698399516160.0, "grad_norm": 1.8901763269783447, "language_loss": 0.67046702, "learning_rate": 2.6242537768790733e-06, "loss": 0.7080965, "num_input_tokens_seen": 74806350, "step": 3468, "time_per_iteration": 3.5315098762512207 }, { "auxiliary_loss_clip": 0.01243615, "auxiliary_loss_mlp": 0.01032404, "balance_loss_clip": 1.05612171, "balance_loss_mlp": 1.02474809, "epoch": 0.41712258762700655, "flos": 31033283616000.0, "grad_norm": 5.097838478021265, "language_loss": 0.68654919, "learning_rate": 2.6235136749949975e-06, "loss": 0.70930934, "num_input_tokens_seen": 74829800, "step": 3469, "time_per_iteration": 2.7521822452545166 }, { "auxiliary_loss_clip": 0.01190097, "auxiliary_loss_mlp": 0.010241, "balance_loss_clip": 1.05454552, "balance_loss_mlp": 1.01634264, "epoch": 0.41724283051764566, "flos": 35914763877120.0, "grad_norm": 2.333605120141151, "language_loss": 0.6106025, "learning_rate": 2.6227734785243924e-06, "loss": 0.63274449, "num_input_tokens_seen": 74849760, "step": 3470, "time_per_iteration": 2.7114744186401367 }, { "auxiliary_loss_clip": 0.01432293, "auxiliary_loss_mlp": 0.01026463, "balance_loss_clip": 1.0445298, "balance_loss_mlp": 1.01909554, "epoch": 0.4173630734082847, "flos": 25333649320320.0, "grad_norm": 1.9118279290737445, "language_loss": 0.79138803, "learning_rate": 2.6220331875795466e-06, "loss": 0.81597561, "num_input_tokens_seen": 74869110, "step": 3471, "time_per_iteration": 2.814940929412842 }, { "auxiliary_loss_clip": 0.01238541, "auxiliary_loss_mlp": 0.01028401, "balance_loss_clip": 1.05196273, "balance_loss_mlp": 1.02097118, "epoch": 0.4174833162989238, "flos": 26685398868480.0, "grad_norm": 1.7464027061665646, "language_loss": 0.74936521, "learning_rate": 2.62129280227276e-06, "loss": 0.7720347, "num_input_tokens_seen": 74889110, "step": 3472, "time_per_iteration": 2.6665444374084473 }, { "auxiliary_loss_clip": 0.01247451, "auxiliary_loss_mlp": 0.01027899, "balance_loss_clip": 1.05562901, "balance_loss_mlp": 1.02070808, "epoch": 0.41760355918956293, "flos": 74739584010240.0, "grad_norm": 2.4194097598613276, "language_loss": 0.68604165, "learning_rate": 2.62055232271635e-06, "loss": 0.70879519, "num_input_tokens_seen": 74916260, "step": 3473, "time_per_iteration": 4.023009300231934 }, { "auxiliary_loss_clip": 0.0133433, "auxiliary_loss_mlp": 0.01030292, "balance_loss_clip": 1.04724002, "balance_loss_mlp": 1.02285063, "epoch": 0.417723802080202, "flos": 14317513148160.0, "grad_norm": 2.5186017570183346, "language_loss": 0.87773311, "learning_rate": 2.619811749022646e-06, "loss": 0.90137935, "num_input_tokens_seen": 74931570, "step": 3474, "time_per_iteration": 2.8331046104431152 }, { "auxiliary_loss_clip": 0.01245047, "auxiliary_loss_mlp": 0.01028495, "balance_loss_clip": 1.05641317, "balance_loss_mlp": 1.020437, "epoch": 0.4178440449708411, "flos": 14643797316480.0, "grad_norm": 2.4597500506819867, "language_loss": 0.70997238, "learning_rate": 2.6190710813039917e-06, "loss": 0.7327078, "num_input_tokens_seen": 74944695, "step": 3475, "time_per_iteration": 2.594266176223755 }, { "auxiliary_loss_clip": 0.01387783, "auxiliary_loss_mlp": 0.02571702, "balance_loss_clip": 1.04560041, "balance_loss_mlp": 1.00018656, "epoch": 0.4179642878614802, "flos": 21507296094720.0, "grad_norm": 3.467024486296244, "language_loss": 0.84057593, "learning_rate": 2.618330319672747e-06, "loss": 0.88017076, "num_input_tokens_seen": 74964115, "step": 3476, "time_per_iteration": 3.7282044887542725 }, { "auxiliary_loss_clip": 0.01191928, "auxiliary_loss_mlp": 0.01029632, "balance_loss_clip": 1.05496597, "balance_loss_mlp": 1.02168107, "epoch": 0.41808453075211927, "flos": 18441997257600.0, "grad_norm": 2.651459237973374, "language_loss": 0.91905099, "learning_rate": 2.617589464241284e-06, "loss": 0.94126654, "num_input_tokens_seen": 74978515, "step": 3477, "time_per_iteration": 2.7227745056152344 }, { "auxiliary_loss_clip": 0.01248438, "auxiliary_loss_mlp": 0.01028441, "balance_loss_clip": 1.05239415, "balance_loss_mlp": 1.0211935, "epoch": 0.4182047736427584, "flos": 20301020628480.0, "grad_norm": 2.06703165967826, "language_loss": 0.74273658, "learning_rate": 2.6168485151219914e-06, "loss": 0.76550537, "num_input_tokens_seen": 74998135, "step": 3478, "time_per_iteration": 2.8631722927093506 }, { "auxiliary_loss_clip": 0.01240661, "auxiliary_loss_mlp": 0.01027181, "balance_loss_clip": 1.05475175, "balance_loss_mlp": 1.019418, "epoch": 0.4183250165333975, "flos": 18876623823360.0, "grad_norm": 2.4356102059242524, "language_loss": 0.71638489, "learning_rate": 2.616107472427269e-06, "loss": 0.73906332, "num_input_tokens_seen": 75012830, "step": 3479, "time_per_iteration": 2.649883508682251 }, { "auxiliary_loss_clip": 0.01248157, "auxiliary_loss_mlp": 0.01027184, "balance_loss_clip": 1.05478501, "balance_loss_mlp": 1.01985264, "epoch": 0.41844525942403654, "flos": 17740050698880.0, "grad_norm": 2.590023919821982, "language_loss": 0.76269937, "learning_rate": 2.615366336269533e-06, "loss": 0.78545284, "num_input_tokens_seen": 75026495, "step": 3480, "time_per_iteration": 2.6002180576324463 }, { "auxiliary_loss_clip": 0.01194133, "auxiliary_loss_mlp": 0.01027464, "balance_loss_clip": 1.0544951, "balance_loss_mlp": 1.01946557, "epoch": 0.41856550231467565, "flos": 18361377181440.0, "grad_norm": 2.4651797773111346, "language_loss": 0.80277061, "learning_rate": 2.6146251067612126e-06, "loss": 0.82498658, "num_input_tokens_seen": 75041970, "step": 3481, "time_per_iteration": 2.656378984451294 }, { "auxiliary_loss_clip": 0.0125083, "auxiliary_loss_mlp": 0.01024734, "balance_loss_clip": 1.06088769, "balance_loss_mlp": 1.0173583, "epoch": 0.41868574520531476, "flos": 22781801445120.0, "grad_norm": 2.4231134727503805, "language_loss": 0.82494718, "learning_rate": 2.6138837840147525e-06, "loss": 0.84770286, "num_input_tokens_seen": 75061005, "step": 3482, "time_per_iteration": 2.6421399116516113 }, { "auxiliary_loss_clip": 0.01336995, "auxiliary_loss_mlp": 0.01028532, "balance_loss_clip": 1.04931748, "balance_loss_mlp": 1.02108741, "epoch": 0.4188059880959538, "flos": 13699167494400.0, "grad_norm": 2.644039456723297, "language_loss": 0.75954604, "learning_rate": 2.6131423681426103e-06, "loss": 0.78320134, "num_input_tokens_seen": 75076920, "step": 3483, "time_per_iteration": 2.713409900665283 }, { "auxiliary_loss_clip": 0.0119062, "auxiliary_loss_mlp": 0.01030631, "balance_loss_clip": 1.05510783, "balance_loss_mlp": 1.02302265, "epoch": 0.41892623098659293, "flos": 37818281220480.0, "grad_norm": 1.7009372524814963, "language_loss": 0.72774798, "learning_rate": 2.6124008592572587e-06, "loss": 0.74996042, "num_input_tokens_seen": 75100905, "step": 3484, "time_per_iteration": 2.742433786392212 }, { "auxiliary_loss_clip": 0.01195454, "auxiliary_loss_mlp": 0.01027951, "balance_loss_clip": 1.05525708, "balance_loss_mlp": 1.02051854, "epoch": 0.419046473877232, "flos": 23258874908160.0, "grad_norm": 3.165018618754427, "language_loss": 0.81570005, "learning_rate": 2.6116592574711835e-06, "loss": 0.83793408, "num_input_tokens_seen": 75119205, "step": 3485, "time_per_iteration": 2.6686594486236572 }, { "auxiliary_loss_clip": 0.01197809, "auxiliary_loss_mlp": 0.01034556, "balance_loss_clip": 1.05726624, "balance_loss_mlp": 1.02650917, "epoch": 0.4191667167678711, "flos": 20741034234240.0, "grad_norm": 3.730774587817408, "language_loss": 0.84339887, "learning_rate": 2.6109175628968853e-06, "loss": 0.86572248, "num_input_tokens_seen": 75138970, "step": 3486, "time_per_iteration": 2.6197946071624756 }, { "auxiliary_loss_clip": 0.0123374, "auxiliary_loss_mlp": 0.01031101, "balance_loss_clip": 1.0501647, "balance_loss_mlp": 1.02399647, "epoch": 0.4192869596585102, "flos": 23586416052480.0, "grad_norm": 1.9574662678005559, "language_loss": 0.82956469, "learning_rate": 2.610175775646878e-06, "loss": 0.85221314, "num_input_tokens_seen": 75157550, "step": 3487, "time_per_iteration": 2.727952003479004 }, { "auxiliary_loss_clip": 0.01285731, "auxiliary_loss_mlp": 0.01026204, "balance_loss_clip": 1.04791939, "balance_loss_mlp": 1.01822257, "epoch": 0.41940720254914926, "flos": 25081269384960.0, "grad_norm": 2.30933263204728, "language_loss": 0.73413873, "learning_rate": 2.6094338958336907e-06, "loss": 0.75725812, "num_input_tokens_seen": 75176220, "step": 3488, "time_per_iteration": 2.7601747512817383 }, { "auxiliary_loss_clip": 0.01291296, "auxiliary_loss_mlp": 0.01031392, "balance_loss_clip": 1.05446768, "balance_loss_mlp": 1.02427876, "epoch": 0.41952744543978837, "flos": 15554132628480.0, "grad_norm": 2.437947556974515, "language_loss": 0.82288826, "learning_rate": 2.608691923569867e-06, "loss": 0.84611511, "num_input_tokens_seen": 75193095, "step": 3489, "time_per_iteration": 2.795093059539795 }, { "auxiliary_loss_clip": 0.01244702, "auxiliary_loss_mlp": 0.01034777, "balance_loss_clip": 1.05605948, "balance_loss_mlp": 1.02725506, "epoch": 0.4196476883304275, "flos": 24644775312000.0, "grad_norm": 1.7835820277788854, "language_loss": 0.758035, "learning_rate": 2.6079498589679616e-06, "loss": 0.78082979, "num_input_tokens_seen": 75214185, "step": 3490, "time_per_iteration": 2.8356852531433105 }, { "auxiliary_loss_clip": 0.01439934, "auxiliary_loss_mlp": 0.01030998, "balance_loss_clip": 1.04547167, "balance_loss_mlp": 1.021873, "epoch": 0.41976793122106654, "flos": 24531333183360.0, "grad_norm": 1.8673736910226, "language_loss": 0.76319587, "learning_rate": 2.6072077021405465e-06, "loss": 0.78790516, "num_input_tokens_seen": 75233020, "step": 3491, "time_per_iteration": 3.7237002849578857 }, { "auxiliary_loss_clip": 0.01345931, "auxiliary_loss_mlp": 0.01027387, "balance_loss_clip": 1.04845047, "balance_loss_mlp": 1.01988256, "epoch": 0.41988817411170565, "flos": 21175301664000.0, "grad_norm": 1.7737324433810486, "language_loss": 0.69377434, "learning_rate": 2.6064654532002054e-06, "loss": 0.71750754, "num_input_tokens_seen": 75252030, "step": 3492, "time_per_iteration": 2.7856907844543457 }, { "auxiliary_loss_clip": 0.01194255, "auxiliary_loss_mlp": 0.01029041, "balance_loss_clip": 1.05702901, "balance_loss_mlp": 1.0211072, "epoch": 0.42000841700234476, "flos": 31649402626560.0, "grad_norm": 2.697961330916967, "language_loss": 0.75965565, "learning_rate": 2.6057231122595375e-06, "loss": 0.7818886, "num_input_tokens_seen": 75273340, "step": 3493, "time_per_iteration": 2.6755478382110596 }, { "auxiliary_loss_clip": 0.01288445, "auxiliary_loss_mlp": 0.01036159, "balance_loss_clip": 1.04874575, "balance_loss_mlp": 1.02823138, "epoch": 0.4201286598929838, "flos": 21281525159040.0, "grad_norm": 1.7801986371993321, "language_loss": 0.72842264, "learning_rate": 2.604980679431154e-06, "loss": 0.75166863, "num_input_tokens_seen": 75291580, "step": 3494, "time_per_iteration": 3.660038471221924 }, { "auxiliary_loss_clip": 0.01242639, "auxiliary_loss_mlp": 0.01026519, "balance_loss_clip": 1.05201375, "balance_loss_mlp": 1.01917529, "epoch": 0.4202489027836229, "flos": 18546532813440.0, "grad_norm": 1.9277142987086304, "language_loss": 0.744946, "learning_rate": 2.604238154827684e-06, "loss": 0.76763755, "num_input_tokens_seen": 75308205, "step": 3495, "time_per_iteration": 2.688359260559082 }, { "auxiliary_loss_clip": 0.01242999, "auxiliary_loss_mlp": 0.01024976, "balance_loss_clip": 1.05353546, "balance_loss_mlp": 1.01716769, "epoch": 0.42036914567426203, "flos": 19317643009920.0, "grad_norm": 2.3149940429365965, "language_loss": 0.72489792, "learning_rate": 2.6034955385617656e-06, "loss": 0.74757767, "num_input_tokens_seen": 75326535, "step": 3496, "time_per_iteration": 2.6302592754364014 }, { "auxiliary_loss_clip": 0.01250018, "auxiliary_loss_mlp": 0.00999584, "balance_loss_clip": 1.02473354, "balance_loss_mlp": 0.99845713, "epoch": 0.4204893885649011, "flos": 67842942935040.0, "grad_norm": 0.7166707974741341, "language_loss": 0.61576146, "learning_rate": 2.6027528307460544e-06, "loss": 0.63825744, "num_input_tokens_seen": 75390540, "step": 3497, "time_per_iteration": 3.358452558517456 }, { "auxiliary_loss_clip": 0.01193236, "auxiliary_loss_mlp": 0.01036281, "balance_loss_clip": 1.05498719, "balance_loss_mlp": 1.0287168, "epoch": 0.4206096314555402, "flos": 21908777385600.0, "grad_norm": 1.680685894449732, "language_loss": 0.86466801, "learning_rate": 2.602010031493217e-06, "loss": 0.88696325, "num_input_tokens_seen": 75408770, "step": 3498, "time_per_iteration": 2.657012462615967 }, { "auxiliary_loss_clip": 0.01344378, "auxiliary_loss_mlp": 0.01023714, "balance_loss_clip": 1.05530214, "balance_loss_mlp": 1.01627564, "epoch": 0.42072987434617926, "flos": 29278185269760.0, "grad_norm": 2.536653802429296, "language_loss": 0.86763251, "learning_rate": 2.6012671409159367e-06, "loss": 0.89131343, "num_input_tokens_seen": 75430105, "step": 3499, "time_per_iteration": 2.7806475162506104 }, { "auxiliary_loss_clip": 0.01285965, "auxiliary_loss_mlp": 0.01029114, "balance_loss_clip": 1.05105913, "balance_loss_mlp": 1.02158272, "epoch": 0.42085011723681837, "flos": 27600726170880.0, "grad_norm": 1.7557637558502752, "language_loss": 0.81484747, "learning_rate": 2.6005241591269097e-06, "loss": 0.83799827, "num_input_tokens_seen": 75449475, "step": 3500, "time_per_iteration": 3.6815176010131836 }, { "auxiliary_loss_clip": 0.01334414, "auxiliary_loss_mlp": 0.01027675, "balance_loss_clip": 1.05321789, "balance_loss_mlp": 1.0208441, "epoch": 0.4209703601274575, "flos": 27818632028160.0, "grad_norm": 1.6509348905637071, "language_loss": 0.80016756, "learning_rate": 2.5997810862388454e-06, "loss": 0.8237884, "num_input_tokens_seen": 75469315, "step": 3501, "time_per_iteration": 3.645993232727051 }, { "auxiliary_loss_clip": 0.01287249, "auxiliary_loss_mlp": 0.0102792, "balance_loss_clip": 1.04890513, "balance_loss_mlp": 1.02051735, "epoch": 0.42109060301809653, "flos": 27525529048320.0, "grad_norm": 2.39139876279663, "language_loss": 0.7574538, "learning_rate": 2.599037922364467e-06, "loss": 0.78060555, "num_input_tokens_seen": 75488215, "step": 3502, "time_per_iteration": 2.761430501937866 }, { "auxiliary_loss_clip": 0.01339757, "auxiliary_loss_mlp": 0.01030712, "balance_loss_clip": 1.05566716, "balance_loss_mlp": 1.02276063, "epoch": 0.42121084590873564, "flos": 29314275459840.0, "grad_norm": 2.559251718837426, "language_loss": 0.75700676, "learning_rate": 2.5982946676165112e-06, "loss": 0.78071141, "num_input_tokens_seen": 75507985, "step": 3503, "time_per_iteration": 2.7895472049713135 }, { "auxiliary_loss_clip": 0.01262329, "auxiliary_loss_mlp": 0.01001469, "balance_loss_clip": 1.03492129, "balance_loss_mlp": 1.0004437, "epoch": 0.42133108879937475, "flos": 67398835178880.0, "grad_norm": 0.7287752425719596, "language_loss": 0.57558417, "learning_rate": 2.5975513221077313e-06, "loss": 0.59822226, "num_input_tokens_seen": 75571955, "step": 3504, "time_per_iteration": 3.3751442432403564 }, { "auxiliary_loss_clip": 0.01283874, "auxiliary_loss_mlp": 0.01027483, "balance_loss_clip": 1.04909086, "balance_loss_mlp": 1.02027118, "epoch": 0.4214513316900138, "flos": 23106038538240.0, "grad_norm": 2.3404125564368625, "language_loss": 0.88575697, "learning_rate": 2.5968078859508897e-06, "loss": 0.90887058, "num_input_tokens_seen": 75589155, "step": 3505, "time_per_iteration": 2.732548713684082 }, { "auxiliary_loss_clip": 0.01237519, "auxiliary_loss_mlp": 0.01028227, "balance_loss_clip": 1.05269003, "balance_loss_mlp": 1.02075839, "epoch": 0.4215715745806529, "flos": 15336190857600.0, "grad_norm": 2.309272240206507, "language_loss": 0.80328012, "learning_rate": 2.5960643592587673e-06, "loss": 0.82593763, "num_input_tokens_seen": 75606565, "step": 3506, "time_per_iteration": 2.6634674072265625 }, { "auxiliary_loss_clip": 0.01335402, "auxiliary_loss_mlp": 0.01032307, "balance_loss_clip": 1.04752696, "balance_loss_mlp": 1.02492166, "epoch": 0.42169181747129203, "flos": 22127257860480.0, "grad_norm": 1.8946584905134976, "language_loss": 0.81443924, "learning_rate": 2.5953207421441553e-06, "loss": 0.83811635, "num_input_tokens_seen": 75625165, "step": 3507, "time_per_iteration": 2.8137407302856445 }, { "auxiliary_loss_clip": 0.01353082, "auxiliary_loss_mlp": 0.01037899, "balance_loss_clip": 1.05727375, "balance_loss_mlp": 1.03073192, "epoch": 0.4218120603619311, "flos": 22630724841600.0, "grad_norm": 2.7956781925204393, "language_loss": 0.75119758, "learning_rate": 2.5945770347198603e-06, "loss": 0.77510738, "num_input_tokens_seen": 75643320, "step": 3508, "time_per_iteration": 2.8254239559173584 }, { "auxiliary_loss_clip": 0.01193069, "auxiliary_loss_mlp": 0.01029225, "balance_loss_clip": 1.04927611, "balance_loss_mlp": 1.02203369, "epoch": 0.4219323032525702, "flos": 19682818629120.0, "grad_norm": 2.0022972274092914, "language_loss": 0.82005894, "learning_rate": 2.593833237098701e-06, "loss": 0.84228188, "num_input_tokens_seen": 75660920, "step": 3509, "time_per_iteration": 2.8157765865325928 }, { "auxiliary_loss_clip": 0.01240527, "auxiliary_loss_mlp": 0.01026393, "balance_loss_clip": 1.05050433, "balance_loss_mlp": 1.01870131, "epoch": 0.4220525461432093, "flos": 30190747224960.0, "grad_norm": 2.0750702509307026, "language_loss": 0.62558603, "learning_rate": 2.593089349393512e-06, "loss": 0.64825523, "num_input_tokens_seen": 75681410, "step": 3510, "time_per_iteration": 2.743659496307373 }, { "auxiliary_loss_clip": 0.01248207, "auxiliary_loss_mlp": 0.01028625, "balance_loss_clip": 1.05893517, "balance_loss_mlp": 1.02047729, "epoch": 0.42217278903384836, "flos": 24315941278080.0, "grad_norm": 2.172459671529353, "language_loss": 0.83858824, "learning_rate": 2.592345371717141e-06, "loss": 0.86135662, "num_input_tokens_seen": 75700940, "step": 3511, "time_per_iteration": 2.6558923721313477 }, { "auxiliary_loss_clip": 0.01244635, "auxiliary_loss_mlp": 0.01030623, "balance_loss_clip": 1.05870664, "balance_loss_mlp": 1.02308273, "epoch": 0.42229303192448747, "flos": 17092474352640.0, "grad_norm": 2.195073841806471, "language_loss": 0.71847153, "learning_rate": 2.591601304182448e-06, "loss": 0.74122411, "num_input_tokens_seen": 75718910, "step": 3512, "time_per_iteration": 2.663313388824463 }, { "auxiliary_loss_clip": 0.01290839, "auxiliary_loss_mlp": 0.01025918, "balance_loss_clip": 1.05605173, "balance_loss_mlp": 1.01904035, "epoch": 0.4224132748151266, "flos": 22784530878720.0, "grad_norm": 1.7726631750954647, "language_loss": 0.79306513, "learning_rate": 2.5908571469023067e-06, "loss": 0.81623274, "num_input_tokens_seen": 75738395, "step": 3513, "time_per_iteration": 2.679189443588257 }, { "auxiliary_loss_clip": 0.01191506, "auxiliary_loss_mlp": 0.0102597, "balance_loss_clip": 1.05522776, "balance_loss_mlp": 1.01875818, "epoch": 0.42253351770576564, "flos": 17819090576640.0, "grad_norm": 2.762360493613974, "language_loss": 0.75418633, "learning_rate": 2.5901128999896067e-06, "loss": 0.77636111, "num_input_tokens_seen": 75753825, "step": 3514, "time_per_iteration": 2.5917623043060303 }, { "auxiliary_loss_clip": 0.01240815, "auxiliary_loss_mlp": 0.01024775, "balance_loss_clip": 1.05527091, "balance_loss_mlp": 1.01707959, "epoch": 0.42265376059640475, "flos": 28512390286080.0, "grad_norm": 3.532590861987271, "language_loss": 0.68374795, "learning_rate": 2.5893685635572487e-06, "loss": 0.70640385, "num_input_tokens_seen": 75774675, "step": 3515, "time_per_iteration": 2.8240742683410645 }, { "auxiliary_loss_clip": 0.01290365, "auxiliary_loss_mlp": 0.01031291, "balance_loss_clip": 1.05556083, "balance_loss_mlp": 1.02345014, "epoch": 0.4227740034870438, "flos": 16253349753600.0, "grad_norm": 2.1362133545207143, "language_loss": 0.69550979, "learning_rate": 2.5886241377181483e-06, "loss": 0.7187264, "num_input_tokens_seen": 75793545, "step": 3516, "time_per_iteration": 2.7421350479125977 }, { "auxiliary_loss_clip": 0.01240934, "auxiliary_loss_mlp": 0.01027598, "balance_loss_clip": 1.0548811, "balance_loss_mlp": 1.01970053, "epoch": 0.4228942463776829, "flos": 25295691623040.0, "grad_norm": 1.8695213007405325, "language_loss": 0.81148392, "learning_rate": 2.587879622585234e-06, "loss": 0.83416933, "num_input_tokens_seen": 75812145, "step": 3517, "time_per_iteration": 3.6194565296173096 }, { "auxiliary_loss_clip": 0.01243433, "auxiliary_loss_mlp": 0.0102351, "balance_loss_clip": 1.05889523, "balance_loss_mlp": 1.0158658, "epoch": 0.423014489268322, "flos": 26395779507840.0, "grad_norm": 2.5785791369569226, "language_loss": 0.75266647, "learning_rate": 2.5871350182714486e-06, "loss": 0.77533597, "num_input_tokens_seen": 75833025, "step": 3518, "time_per_iteration": 2.6951751708984375 }, { "auxiliary_loss_clip": 0.01191745, "auxiliary_loss_mlp": 0.01028828, "balance_loss_clip": 1.05594182, "balance_loss_mlp": 1.02137172, "epoch": 0.4231347321589611, "flos": 17274002711040.0, "grad_norm": 2.288602604767143, "language_loss": 0.80711192, "learning_rate": 2.586390324889748e-06, "loss": 0.82931763, "num_input_tokens_seen": 75848925, "step": 3519, "time_per_iteration": 2.532463312149048 }, { "auxiliary_loss_clip": 0.01242615, "auxiliary_loss_mlp": 0.01025955, "balance_loss_clip": 1.05659795, "balance_loss_mlp": 1.01857853, "epoch": 0.4232549750496002, "flos": 22999635475200.0, "grad_norm": 1.924565262351091, "language_loss": 0.67689395, "learning_rate": 2.5856455425531003e-06, "loss": 0.6995796, "num_input_tokens_seen": 75870400, "step": 3520, "time_per_iteration": 3.6152708530426025 }, { "auxiliary_loss_clip": 0.01236732, "auxiliary_loss_mlp": 0.01024535, "balance_loss_clip": 1.05536962, "balance_loss_mlp": 1.01678681, "epoch": 0.4233752179402393, "flos": 21248343970560.0, "grad_norm": 2.0270736140241645, "language_loss": 0.80835187, "learning_rate": 2.5849006713744902e-06, "loss": 0.83096457, "num_input_tokens_seen": 75889195, "step": 3521, "time_per_iteration": 2.7182681560516357 }, { "auxiliary_loss_clip": 0.01293415, "auxiliary_loss_mlp": 0.01025247, "balance_loss_clip": 1.05491924, "balance_loss_mlp": 1.01773977, "epoch": 0.42349546083087836, "flos": 20704297599360.0, "grad_norm": 2.007678195692539, "language_loss": 0.73223752, "learning_rate": 2.5841557114669135e-06, "loss": 0.75542414, "num_input_tokens_seen": 75906055, "step": 3522, "time_per_iteration": 2.65470027923584 }, { "auxiliary_loss_clip": 0.01197817, "auxiliary_loss_mlp": 0.01032439, "balance_loss_clip": 1.05752158, "balance_loss_mlp": 1.02420735, "epoch": 0.42361570372151747, "flos": 18585065128320.0, "grad_norm": 4.5315287505972295, "language_loss": 0.67348766, "learning_rate": 2.58341066294338e-06, "loss": 0.69579017, "num_input_tokens_seen": 75922720, "step": 3523, "time_per_iteration": 2.595306158065796 }, { "auxiliary_loss_clip": 0.01393603, "auxiliary_loss_mlp": 0.02568763, "balance_loss_clip": 1.05111432, "balance_loss_mlp": 1.00006914, "epoch": 0.4237359466121566, "flos": 20959478795520.0, "grad_norm": 2.1974688104485027, "language_loss": 0.85728723, "learning_rate": 2.5826655259169124e-06, "loss": 0.89691091, "num_input_tokens_seen": 75941375, "step": 3524, "time_per_iteration": 2.7252016067504883 }, { "auxiliary_loss_clip": 0.01195535, "auxiliary_loss_mlp": 0.01031708, "balance_loss_clip": 1.05896306, "balance_loss_mlp": 1.02424836, "epoch": 0.42385618950279563, "flos": 18038181582720.0, "grad_norm": 2.236034283812978, "language_loss": 0.90505362, "learning_rate": 2.5819203005005475e-06, "loss": 0.92732596, "num_input_tokens_seen": 75958710, "step": 3525, "time_per_iteration": 3.5041046142578125 }, { "auxiliary_loss_clip": 0.01284529, "auxiliary_loss_mlp": 0.01023084, "balance_loss_clip": 1.05291319, "balance_loss_mlp": 1.01540673, "epoch": 0.42397643239343474, "flos": 23769129559680.0, "grad_norm": 1.7779120112742763, "language_loss": 0.78672218, "learning_rate": 2.581174986807336e-06, "loss": 0.8097983, "num_input_tokens_seen": 75978945, "step": 3526, "time_per_iteration": 2.706537961959839 }, { "auxiliary_loss_clip": 0.01238185, "auxiliary_loss_mlp": 0.02566968, "balance_loss_clip": 1.0537256, "balance_loss_mlp": 1.00009394, "epoch": 0.42409667528407385, "flos": 16545088016640.0, "grad_norm": 3.676865209884386, "language_loss": 0.91295719, "learning_rate": 2.580429584950341e-06, "loss": 0.95100874, "num_input_tokens_seen": 75994695, "step": 3527, "time_per_iteration": 3.511000871658325 }, { "auxiliary_loss_clip": 0.01264212, "auxiliary_loss_mlp": 0.01029149, "balance_loss_clip": 1.06081188, "balance_loss_mlp": 1.02115631, "epoch": 0.4242169181747129, "flos": 16034186920320.0, "grad_norm": 2.1064937604000256, "language_loss": 0.66694582, "learning_rate": 2.5796840950426397e-06, "loss": 0.68987948, "num_input_tokens_seen": 76011780, "step": 3528, "time_per_iteration": 2.763439893722534 }, { "auxiliary_loss_clip": 0.01239245, "auxiliary_loss_mlp": 0.01025938, "balance_loss_clip": 1.05467749, "balance_loss_mlp": 1.0185504, "epoch": 0.424337161065352, "flos": 20084012611200.0, "grad_norm": 1.7263638505378958, "language_loss": 0.66059816, "learning_rate": 2.578938517197322e-06, "loss": 0.68324995, "num_input_tokens_seen": 76029875, "step": 3529, "time_per_iteration": 2.7407844066619873 }, { "auxiliary_loss_clip": 0.01287547, "auxiliary_loss_mlp": 0.01034665, "balance_loss_clip": 1.05277312, "balance_loss_mlp": 1.02670765, "epoch": 0.4244574039559911, "flos": 23878369797120.0, "grad_norm": 2.610715029345118, "language_loss": 0.62515181, "learning_rate": 2.5781928515274916e-06, "loss": 0.64837396, "num_input_tokens_seen": 76048595, "step": 3530, "time_per_iteration": 2.7033684253692627 }, { "auxiliary_loss_clip": 0.01247299, "auxiliary_loss_mlp": 0.01027434, "balance_loss_clip": 1.05833101, "balance_loss_mlp": 1.01998913, "epoch": 0.4245776468466302, "flos": 17565920542080.0, "grad_norm": 2.2528227067783058, "language_loss": 0.68186784, "learning_rate": 2.577447098146265e-06, "loss": 0.70461524, "num_input_tokens_seen": 76065770, "step": 3531, "time_per_iteration": 2.627251625061035 }, { "auxiliary_loss_clip": 0.01346442, "auxiliary_loss_mlp": 0.01025031, "balance_loss_clip": 1.05580533, "balance_loss_mlp": 1.01691329, "epoch": 0.4246978897372693, "flos": 27776256958080.0, "grad_norm": 1.5664870670784932, "language_loss": 0.78998137, "learning_rate": 2.5767012571667724e-06, "loss": 0.81369615, "num_input_tokens_seen": 76085250, "step": 3532, "time_per_iteration": 2.7716145515441895 }, { "auxiliary_loss_clip": 0.01246512, "auxiliary_loss_mlp": 0.0102769, "balance_loss_clip": 1.05437255, "balance_loss_mlp": 1.01951849, "epoch": 0.42481813262790835, "flos": 15596615439360.0, "grad_norm": 1.7232185349614153, "language_loss": 0.68571705, "learning_rate": 2.5759553287021587e-06, "loss": 0.70845908, "num_input_tokens_seen": 76103580, "step": 3533, "time_per_iteration": 2.645460367202759 }, { "auxiliary_loss_clip": 0.0129921, "auxiliary_loss_mlp": 0.01027542, "balance_loss_clip": 1.05862033, "balance_loss_mlp": 1.01982963, "epoch": 0.42493837551854746, "flos": 23951088881280.0, "grad_norm": 1.9034215753723833, "language_loss": 0.78174138, "learning_rate": 2.5752093128655786e-06, "loss": 0.80500883, "num_input_tokens_seen": 76121825, "step": 3534, "time_per_iteration": 2.6956915855407715 }, { "auxiliary_loss_clip": 0.01284537, "auxiliary_loss_mlp": 0.01028682, "balance_loss_clip": 1.05015421, "balance_loss_mlp": 1.02119601, "epoch": 0.4250586184091866, "flos": 20813466009600.0, "grad_norm": 2.0592459710215674, "language_loss": 0.74063557, "learning_rate": 2.574463209770204e-06, "loss": 0.76376772, "num_input_tokens_seen": 76141140, "step": 3535, "time_per_iteration": 2.707911491394043 }, { "auxiliary_loss_clip": 0.01344018, "auxiliary_loss_mlp": 0.01026019, "balance_loss_clip": 1.0500977, "balance_loss_mlp": 1.01824653, "epoch": 0.42517886129982563, "flos": 30371018607360.0, "grad_norm": 4.808814599795064, "language_loss": 0.79674786, "learning_rate": 2.5737170195292165e-06, "loss": 0.82044828, "num_input_tokens_seen": 76164475, "step": 3536, "time_per_iteration": 2.7150142192840576 }, { "auxiliary_loss_clip": 0.01342341, "auxiliary_loss_mlp": 0.01027364, "balance_loss_clip": 1.05123389, "balance_loss_mlp": 1.01960397, "epoch": 0.42529910419046474, "flos": 20080636732800.0, "grad_norm": 1.8592996368004149, "language_loss": 0.78512931, "learning_rate": 2.572970742255814e-06, "loss": 0.80882633, "num_input_tokens_seen": 76182965, "step": 3537, "time_per_iteration": 2.8216636180877686 }, { "auxiliary_loss_clip": 0.01241737, "auxiliary_loss_mlp": 0.01027467, "balance_loss_clip": 1.05636072, "balance_loss_mlp": 1.02031469, "epoch": 0.42541934708110385, "flos": 22632448694400.0, "grad_norm": 1.9981243260482835, "language_loss": 0.81669378, "learning_rate": 2.5722243780632046e-06, "loss": 0.83938587, "num_input_tokens_seen": 76201230, "step": 3538, "time_per_iteration": 2.6477410793304443 }, { "auxiliary_loss_clip": 0.0130493, "auxiliary_loss_mlp": 0.0100691, "balance_loss_clip": 1.02494049, "balance_loss_mlp": 1.00572419, "epoch": 0.4255395899717429, "flos": 66200676186240.0, "grad_norm": 0.7531108739722716, "language_loss": 0.60396814, "learning_rate": 2.5714779270646125e-06, "loss": 0.62708652, "num_input_tokens_seen": 76262000, "step": 3539, "time_per_iteration": 3.4196176528930664 }, { "auxiliary_loss_clip": 0.01292804, "auxiliary_loss_mlp": 0.02571622, "balance_loss_clip": 1.05537045, "balance_loss_mlp": 1.00000596, "epoch": 0.425659832862382, "flos": 17931814433280.0, "grad_norm": 2.5360214313249685, "language_loss": 0.77856195, "learning_rate": 2.5707313893732735e-06, "loss": 0.8172062, "num_input_tokens_seen": 76280540, "step": 3540, "time_per_iteration": 2.672560453414917 }, { "auxiliary_loss_clip": 0.01485771, "auxiliary_loss_mlp": 0.01025651, "balance_loss_clip": 1.04229975, "balance_loss_mlp": 1.01756287, "epoch": 0.4257800757530211, "flos": 24022550989440.0, "grad_norm": 1.7469175447855103, "language_loss": 0.77361822, "learning_rate": 2.5699847651024364e-06, "loss": 0.79873252, "num_input_tokens_seen": 76301180, "step": 3541, "time_per_iteration": 2.835223913192749 }, { "auxiliary_loss_clip": 0.01238979, "auxiliary_loss_mlp": 0.01030724, "balance_loss_clip": 1.05731893, "balance_loss_mlp": 1.02310967, "epoch": 0.4259003186436602, "flos": 23696015425920.0, "grad_norm": 2.6879814013380248, "language_loss": 0.76816285, "learning_rate": 2.5692380543653627e-06, "loss": 0.79085982, "num_input_tokens_seen": 76319335, "step": 3542, "time_per_iteration": 2.7077231407165527 }, { "auxiliary_loss_clip": 0.01247944, "auxiliary_loss_mlp": 0.02570472, "balance_loss_clip": 1.05578637, "balance_loss_mlp": 1.0000155, "epoch": 0.4260205615342993, "flos": 15259772672640.0, "grad_norm": 2.22753368745173, "language_loss": 0.69766974, "learning_rate": 2.5684912572753293e-06, "loss": 0.73585391, "num_input_tokens_seen": 76335010, "step": 3543, "time_per_iteration": 3.5436928272247314 }, { "auxiliary_loss_clip": 0.01190955, "auxiliary_loss_mlp": 0.01028041, "balance_loss_clip": 1.05663276, "balance_loss_mlp": 1.02052498, "epoch": 0.4261408044249384, "flos": 30665306736000.0, "grad_norm": 2.0257777959589793, "language_loss": 0.84058696, "learning_rate": 2.5677443739456245e-06, "loss": 0.86277694, "num_input_tokens_seen": 76356670, "step": 3544, "time_per_iteration": 2.7449755668640137 }, { "auxiliary_loss_clip": 0.01303327, "auxiliary_loss_mlp": 0.01027143, "balance_loss_clip": 1.06181717, "balance_loss_mlp": 1.0194422, "epoch": 0.42626104731557746, "flos": 23257905240960.0, "grad_norm": 2.909293083878819, "language_loss": 0.79077256, "learning_rate": 2.5669974044895495e-06, "loss": 0.81407726, "num_input_tokens_seen": 76373065, "step": 3545, "time_per_iteration": 2.6300253868103027 }, { "auxiliary_loss_clip": 0.01348719, "auxiliary_loss_mlp": 0.01027657, "balance_loss_clip": 1.05229509, "balance_loss_mlp": 1.02042139, "epoch": 0.42638129020621657, "flos": 25884770670720.0, "grad_norm": 2.9824126445702817, "language_loss": 0.79919058, "learning_rate": 2.5662503490204187e-06, "loss": 0.8229543, "num_input_tokens_seen": 76393230, "step": 3546, "time_per_iteration": 3.4174816608428955 }, { "auxiliary_loss_clip": 0.01288971, "auxiliary_loss_mlp": 0.01030351, "balance_loss_clip": 1.05053997, "balance_loss_mlp": 1.02286768, "epoch": 0.4265015330968556, "flos": 26502362138880.0, "grad_norm": 2.2356793285365613, "language_loss": 0.75934434, "learning_rate": 2.5655032076515603e-06, "loss": 0.78253758, "num_input_tokens_seen": 76412555, "step": 3547, "time_per_iteration": 2.7116644382476807 }, { "auxiliary_loss_clip": 0.01197079, "auxiliary_loss_mlp": 0.01030391, "balance_loss_clip": 1.05496716, "balance_loss_mlp": 1.02274954, "epoch": 0.42662177598749473, "flos": 24389522288640.0, "grad_norm": 2.1308708996788868, "language_loss": 0.82761085, "learning_rate": 2.5647559804963155e-06, "loss": 0.84988546, "num_input_tokens_seen": 76432485, "step": 3548, "time_per_iteration": 2.671523332595825 }, { "auxiliary_loss_clip": 0.0139785, "auxiliary_loss_mlp": 0.01033024, "balance_loss_clip": 1.05335832, "balance_loss_mlp": 1.02537036, "epoch": 0.42674201887813384, "flos": 23148629089920.0, "grad_norm": 2.7195347735023625, "language_loss": 0.78757823, "learning_rate": 2.5640086676680364e-06, "loss": 0.81188697, "num_input_tokens_seen": 76453980, "step": 3549, "time_per_iteration": 2.8430466651916504 }, { "auxiliary_loss_clip": 0.01243135, "auxiliary_loss_mlp": 0.0103212, "balance_loss_clip": 1.05592179, "balance_loss_mlp": 1.0237813, "epoch": 0.4268622617687729, "flos": 21689614552320.0, "grad_norm": 2.99722508486976, "language_loss": 0.80737597, "learning_rate": 2.5632612692800923e-06, "loss": 0.83012849, "num_input_tokens_seen": 76473045, "step": 3550, "time_per_iteration": 2.71234130859375 }, { "auxiliary_loss_clip": 0.01341115, "auxiliary_loss_mlp": 0.01031169, "balance_loss_clip": 1.05361176, "balance_loss_mlp": 1.02284825, "epoch": 0.426982504659412, "flos": 23440151871360.0, "grad_norm": 2.325844986068865, "language_loss": 0.75789869, "learning_rate": 2.5625137854458603e-06, "loss": 0.78162152, "num_input_tokens_seen": 76492060, "step": 3551, "time_per_iteration": 3.8944945335388184 }, { "auxiliary_loss_clip": 0.01294618, "auxiliary_loss_mlp": 0.01023542, "balance_loss_clip": 1.05379248, "balance_loss_mlp": 1.01624608, "epoch": 0.4271027475500511, "flos": 18916556768640.0, "grad_norm": 1.9674791835302412, "language_loss": 0.80052876, "learning_rate": 2.561766216278735e-06, "loss": 0.82371032, "num_input_tokens_seen": 76509655, "step": 3552, "time_per_iteration": 2.780121326446533 }, { "auxiliary_loss_clip": 0.01396706, "auxiliary_loss_mlp": 0.01026419, "balance_loss_clip": 1.05531931, "balance_loss_mlp": 1.01825285, "epoch": 0.4272229904406902, "flos": 26870554500480.0, "grad_norm": 2.324379885559823, "language_loss": 0.81247318, "learning_rate": 2.561018561892121e-06, "loss": 0.83670449, "num_input_tokens_seen": 76528795, "step": 3553, "time_per_iteration": 3.659587860107422 }, { "auxiliary_loss_clip": 0.01292031, "auxiliary_loss_mlp": 0.01034632, "balance_loss_clip": 1.0537591, "balance_loss_mlp": 1.02694893, "epoch": 0.4273432333313293, "flos": 23951376190080.0, "grad_norm": 1.9947263611112562, "language_loss": 0.76795536, "learning_rate": 2.5602708223994363e-06, "loss": 0.79122204, "num_input_tokens_seen": 76550660, "step": 3554, "time_per_iteration": 2.8169944286346436 }, { "auxiliary_loss_clip": 0.01344315, "auxiliary_loss_mlp": 0.01025721, "balance_loss_clip": 1.04896462, "balance_loss_mlp": 1.01838386, "epoch": 0.4274634762219684, "flos": 29570354496000.0, "grad_norm": 2.266426271754617, "language_loss": 0.67705476, "learning_rate": 2.559522997914115e-06, "loss": 0.70075512, "num_input_tokens_seen": 76570240, "step": 3555, "time_per_iteration": 2.7629234790802 }, { "auxiliary_loss_clip": 0.01194764, "auxiliary_loss_mlp": 0.01026932, "balance_loss_clip": 1.06098175, "balance_loss_mlp": 1.0199163, "epoch": 0.42758371911260745, "flos": 21434146047360.0, "grad_norm": 2.0701687610964714, "language_loss": 0.84992635, "learning_rate": 2.558775088549599e-06, "loss": 0.87214333, "num_input_tokens_seen": 76589820, "step": 3556, "time_per_iteration": 2.6729769706726074 }, { "auxiliary_loss_clip": 0.01154757, "auxiliary_loss_mlp": 0.01032571, "balance_loss_clip": 1.0561868, "balance_loss_mlp": 1.0241611, "epoch": 0.42770396200324656, "flos": 14752822072320.0, "grad_norm": 2.3433573477305027, "language_loss": 0.66219997, "learning_rate": 2.5580270944193467e-06, "loss": 0.68407321, "num_input_tokens_seen": 76606640, "step": 3557, "time_per_iteration": 2.6235625743865967 }, { "auxiliary_loss_clip": 0.01090595, "auxiliary_loss_mlp": 0.01000382, "balance_loss_clip": 1.02603984, "balance_loss_mlp": 0.99940491, "epoch": 0.4278242048938857, "flos": 70654712601600.0, "grad_norm": 0.7473540014464652, "language_loss": 0.55488926, "learning_rate": 2.557279015636827e-06, "loss": 0.57579911, "num_input_tokens_seen": 76667050, "step": 3558, "time_per_iteration": 3.2072298526763916 }, { "auxiliary_loss_clip": 0.01139343, "auxiliary_loss_mlp": 0.01002525, "balance_loss_clip": 1.02489305, "balance_loss_mlp": 1.00150585, "epoch": 0.42794444778452473, "flos": 69366165033600.0, "grad_norm": 0.768860048668422, "language_loss": 0.61214817, "learning_rate": 2.5565308523155245e-06, "loss": 0.63356686, "num_input_tokens_seen": 76726650, "step": 3559, "time_per_iteration": 3.180443286895752 }, { "auxiliary_loss_clip": 0.01382405, "auxiliary_loss_mlp": 0.01030204, "balance_loss_clip": 1.05183423, "balance_loss_mlp": 1.02234864, "epoch": 0.42806469067516384, "flos": 18215328481920.0, "grad_norm": 3.033121383110696, "language_loss": 0.82201719, "learning_rate": 2.5557826045689336e-06, "loss": 0.84614325, "num_input_tokens_seen": 76742890, "step": 3560, "time_per_iteration": 2.705798625946045 }, { "auxiliary_loss_clip": 0.01272862, "auxiliary_loss_mlp": 0.01004213, "balance_loss_clip": 1.04388881, "balance_loss_mlp": 1.00318229, "epoch": 0.4281849335658029, "flos": 54535814432640.0, "grad_norm": 0.8308411567922598, "language_loss": 0.58800876, "learning_rate": 2.5550342725105643e-06, "loss": 0.61077952, "num_input_tokens_seen": 76801055, "step": 3561, "time_per_iteration": 3.204542398452759 }, { "auxiliary_loss_clip": 0.01246304, "auxiliary_loss_mlp": 0.01024074, "balance_loss_clip": 1.06065798, "balance_loss_mlp": 1.01617098, "epoch": 0.428305176456442, "flos": 17274828723840.0, "grad_norm": 1.7546188931214184, "language_loss": 0.80925834, "learning_rate": 2.554285856253937e-06, "loss": 0.83196217, "num_input_tokens_seen": 76819890, "step": 3562, "time_per_iteration": 2.6638576984405518 }, { "auxiliary_loss_clip": 0.0128681, "auxiliary_loss_mlp": 0.01029719, "balance_loss_clip": 1.0550617, "balance_loss_mlp": 1.02189898, "epoch": 0.4284254193470811, "flos": 26359509749760.0, "grad_norm": 1.9230491824097171, "language_loss": 0.77853775, "learning_rate": 2.5535373559125855e-06, "loss": 0.8017031, "num_input_tokens_seen": 76840255, "step": 3563, "time_per_iteration": 2.777161121368408 }, { "auxiliary_loss_clip": 0.01495372, "auxiliary_loss_mlp": 0.01029191, "balance_loss_clip": 1.04932904, "balance_loss_mlp": 1.02054238, "epoch": 0.42854566223772017, "flos": 29714248379520.0, "grad_norm": 1.6257580476609694, "language_loss": 0.82303089, "learning_rate": 2.552788771600057e-06, "loss": 0.8482765, "num_input_tokens_seen": 76860565, "step": 3564, "time_per_iteration": 3.0004918575286865 }, { "auxiliary_loss_clip": 0.01345335, "auxiliary_loss_mlp": 0.01039259, "balance_loss_clip": 1.05518508, "balance_loss_mlp": 1.03132606, "epoch": 0.4286659051283593, "flos": 22018161277440.0, "grad_norm": 2.315677009081112, "language_loss": 0.82332397, "learning_rate": 2.5520401034299118e-06, "loss": 0.84716994, "num_input_tokens_seen": 76878325, "step": 3565, "time_per_iteration": 2.8085553646087646 }, { "auxiliary_loss_clip": 0.01246376, "auxiliary_loss_mlp": 0.01030316, "balance_loss_clip": 1.05730081, "balance_loss_mlp": 1.02145243, "epoch": 0.4287861480189984, "flos": 13334422838400.0, "grad_norm": 2.1996771887952025, "language_loss": 0.87950635, "learning_rate": 2.551291351515722e-06, "loss": 0.9022733, "num_input_tokens_seen": 76895340, "step": 3566, "time_per_iteration": 2.5949506759643555 }, { "auxiliary_loss_clip": 0.01340485, "auxiliary_loss_mlp": 0.02570751, "balance_loss_clip": 1.05006886, "balance_loss_mlp": 1.00000691, "epoch": 0.42890639090963745, "flos": 26651535321600.0, "grad_norm": 1.8784400070445777, "language_loss": 0.85652399, "learning_rate": 2.5505425159710726e-06, "loss": 0.89563638, "num_input_tokens_seen": 76915150, "step": 3567, "time_per_iteration": 2.7702322006225586 }, { "auxiliary_loss_clip": 0.01299989, "auxiliary_loss_mlp": 0.02570409, "balance_loss_clip": 1.05268931, "balance_loss_mlp": 1.00000262, "epoch": 0.42902663380027656, "flos": 24055768091520.0, "grad_norm": 3.127848622508519, "language_loss": 0.82758331, "learning_rate": 2.549793596909561e-06, "loss": 0.86628729, "num_input_tokens_seen": 76933770, "step": 3568, "time_per_iteration": 3.7199530601501465 }, { "auxiliary_loss_clip": 0.01295814, "auxiliary_loss_mlp": 0.01028183, "balance_loss_clip": 1.05844581, "balance_loss_mlp": 1.02053237, "epoch": 0.42914687669091567, "flos": 15632561975040.0, "grad_norm": 3.4916275386461795, "language_loss": 0.66671491, "learning_rate": 2.5490445944447976e-06, "loss": 0.68995488, "num_input_tokens_seen": 76952265, "step": 3569, "time_per_iteration": 2.708069086074829 }, { "auxiliary_loss_clip": 0.01241267, "auxiliary_loss_mlp": 0.0103119, "balance_loss_clip": 1.05464792, "balance_loss_mlp": 1.0232451, "epoch": 0.4292671195815547, "flos": 31467802440960.0, "grad_norm": 2.9548041056716023, "language_loss": 0.65143079, "learning_rate": 2.548295508690406e-06, "loss": 0.67415535, "num_input_tokens_seen": 76973560, "step": 3570, "time_per_iteration": 2.70430850982666 }, { "auxiliary_loss_clip": 0.01248164, "auxiliary_loss_mlp": 0.01035875, "balance_loss_clip": 1.05547071, "balance_loss_mlp": 1.02717876, "epoch": 0.42938736247219383, "flos": 30257756046720.0, "grad_norm": 1.912112274365101, "language_loss": 0.76339102, "learning_rate": 2.5475463397600217e-06, "loss": 0.78623146, "num_input_tokens_seen": 76993640, "step": 3571, "time_per_iteration": 2.771775245666504 }, { "auxiliary_loss_clip": 0.01195107, "auxiliary_loss_mlp": 0.0103172, "balance_loss_clip": 1.05662882, "balance_loss_mlp": 1.02364016, "epoch": 0.42950760536283294, "flos": 29349683291520.0, "grad_norm": 2.03358795252261, "language_loss": 0.77524245, "learning_rate": 2.546797087767293e-06, "loss": 0.79751074, "num_input_tokens_seen": 77013765, "step": 3572, "time_per_iteration": 3.571986675262451 }, { "auxiliary_loss_clip": 0.0138744, "auxiliary_loss_mlp": 0.01027328, "balance_loss_clip": 1.05182791, "balance_loss_mlp": 1.02007961, "epoch": 0.429627848253472, "flos": 26869943969280.0, "grad_norm": 1.780893920072732, "language_loss": 0.87017858, "learning_rate": 2.546047752825881e-06, "loss": 0.89432621, "num_input_tokens_seen": 77034370, "step": 3573, "time_per_iteration": 2.744964361190796 }, { "auxiliary_loss_clip": 0.01392909, "auxiliary_loss_mlp": 0.01027765, "balance_loss_clip": 1.04789031, "balance_loss_mlp": 1.02050853, "epoch": 0.4297480911441111, "flos": 13881270470400.0, "grad_norm": 2.1006968125619294, "language_loss": 0.93210018, "learning_rate": 2.5452983350494595e-06, "loss": 0.95630693, "num_input_tokens_seen": 77049925, "step": 3574, "time_per_iteration": 2.7038044929504395 }, { "auxiliary_loss_clip": 0.01242646, "auxiliary_loss_mlp": 0.02569194, "balance_loss_clip": 1.05448759, "balance_loss_mlp": 0.99999237, "epoch": 0.4298683340347502, "flos": 20741141975040.0, "grad_norm": 2.460125088345148, "language_loss": 0.64920831, "learning_rate": 2.544548834551713e-06, "loss": 0.68732673, "num_input_tokens_seen": 77068930, "step": 3575, "time_per_iteration": 2.6535279750823975 }, { "auxiliary_loss_clip": 0.01338062, "auxiliary_loss_mlp": 0.02570295, "balance_loss_clip": 1.051669, "balance_loss_mlp": 1.00001121, "epoch": 0.4299885769253893, "flos": 20882126856960.0, "grad_norm": 3.5564630932949743, "language_loss": 0.94711471, "learning_rate": 2.5437992514463424e-06, "loss": 0.98619831, "num_input_tokens_seen": 77082255, "step": 3576, "time_per_iteration": 2.75175404548645 }, { "auxiliary_loss_clip": 0.01241935, "auxiliary_loss_mlp": 0.01030796, "balance_loss_clip": 1.0549624, "balance_loss_mlp": 1.02276683, "epoch": 0.4301088198160284, "flos": 25484618183040.0, "grad_norm": 1.7744226391641464, "language_loss": 0.88117385, "learning_rate": 2.5430495858470565e-06, "loss": 0.90390116, "num_input_tokens_seen": 77101725, "step": 3577, "time_per_iteration": 3.530515432357788 }, { "auxiliary_loss_clip": 0.01239811, "auxiliary_loss_mlp": 0.0102917, "balance_loss_clip": 1.05519962, "balance_loss_mlp": 1.02144527, "epoch": 0.43022906270666744, "flos": 18259427404800.0, "grad_norm": 3.5693149182415365, "language_loss": 0.77725315, "learning_rate": 2.54229983786758e-06, "loss": 0.79994297, "num_input_tokens_seen": 77119670, "step": 3578, "time_per_iteration": 2.635326623916626 }, { "auxiliary_loss_clip": 0.01295603, "auxiliary_loss_mlp": 0.01033119, "balance_loss_clip": 1.05277228, "balance_loss_mlp": 1.02541184, "epoch": 0.43034930559730655, "flos": 23399536567680.0, "grad_norm": 2.3768172865900765, "language_loss": 0.85085183, "learning_rate": 2.541550007621651e-06, "loss": 0.87413907, "num_input_tokens_seen": 77138160, "step": 3579, "time_per_iteration": 3.6216514110565186 }, { "auxiliary_loss_clip": 0.01240587, "auxiliary_loss_mlp": 0.01031794, "balance_loss_clip": 1.05714202, "balance_loss_mlp": 1.0236814, "epoch": 0.43046954848794566, "flos": 28184382264960.0, "grad_norm": 1.7799199470541633, "language_loss": 0.79562271, "learning_rate": 2.5408000952230156e-06, "loss": 0.81834656, "num_input_tokens_seen": 77156950, "step": 3580, "time_per_iteration": 2.6858022212982178 }, { "auxiliary_loss_clip": 0.01257112, "auxiliary_loss_mlp": 0.01033124, "balance_loss_clip": 1.05297065, "balance_loss_mlp": 1.0245471, "epoch": 0.4305897913785847, "flos": 28580476515840.0, "grad_norm": 2.661135266159236, "language_loss": 0.90802622, "learning_rate": 2.5400501007854357e-06, "loss": 0.93092853, "num_input_tokens_seen": 77176395, "step": 3581, "time_per_iteration": 2.792750120162964 }, { "auxiliary_loss_clip": 0.01390454, "auxiliary_loss_mlp": 0.01030005, "balance_loss_clip": 1.04789948, "balance_loss_mlp": 1.02279305, "epoch": 0.43071003426922383, "flos": 20448721353600.0, "grad_norm": 2.265690662587956, "language_loss": 0.75883347, "learning_rate": 2.539300024422685e-06, "loss": 0.78303808, "num_input_tokens_seen": 77194340, "step": 3582, "time_per_iteration": 2.710742473602295 }, { "auxiliary_loss_clip": 0.01142882, "auxiliary_loss_mlp": 0.0100747, "balance_loss_clip": 1.01827943, "balance_loss_mlp": 1.00647438, "epoch": 0.43083027715986294, "flos": 51997969883520.0, "grad_norm": 0.7900334355245108, "language_loss": 0.60900342, "learning_rate": 2.538549866248549e-06, "loss": 0.63050699, "num_input_tokens_seen": 77249320, "step": 3583, "time_per_iteration": 3.137239694595337 }, { "auxiliary_loss_clip": 0.0124247, "auxiliary_loss_mlp": 0.01024725, "balance_loss_clip": 1.05470502, "balance_loss_mlp": 1.01649904, "epoch": 0.430950520050502, "flos": 16690885320960.0, "grad_norm": 1.8993112083641563, "language_loss": 0.81144607, "learning_rate": 2.5377996263768274e-06, "loss": 0.83411801, "num_input_tokens_seen": 77267400, "step": 3584, "time_per_iteration": 2.583667278289795 }, { "auxiliary_loss_clip": 0.01240363, "auxiliary_loss_mlp": 0.01031262, "balance_loss_clip": 1.05289495, "balance_loss_mlp": 1.02335858, "epoch": 0.4310707629411411, "flos": 24608433726720.0, "grad_norm": 1.8824157123991863, "language_loss": 0.68417001, "learning_rate": 2.5370493049213293e-06, "loss": 0.70688629, "num_input_tokens_seen": 77287045, "step": 3585, "time_per_iteration": 2.7586658000946045 }, { "auxiliary_loss_clip": 0.01529355, "auxiliary_loss_mlp": 0.01025333, "balance_loss_clip": 1.0461638, "balance_loss_mlp": 1.01732206, "epoch": 0.4311910058317802, "flos": 26432983019520.0, "grad_norm": 2.1720308980229377, "language_loss": 0.79835868, "learning_rate": 2.536298901995878e-06, "loss": 0.82390547, "num_input_tokens_seen": 77306255, "step": 3586, "time_per_iteration": 3.0233190059661865 }, { "auxiliary_loss_clip": 0.01293489, "auxiliary_loss_mlp": 0.01030726, "balance_loss_clip": 1.05288196, "balance_loss_mlp": 1.02257216, "epoch": 0.43131124872241927, "flos": 25155891889920.0, "grad_norm": 1.7635418619708798, "language_loss": 0.80211282, "learning_rate": 2.535548417714311e-06, "loss": 0.82535499, "num_input_tokens_seen": 77325555, "step": 3587, "time_per_iteration": 2.9549143314361572 }, { "auxiliary_loss_clip": 0.01149977, "auxiliary_loss_mlp": 0.01026013, "balance_loss_clip": 1.05243182, "balance_loss_mlp": 1.01881313, "epoch": 0.4314314916130584, "flos": 21614812479360.0, "grad_norm": 1.6330469918827237, "language_loss": 0.87451494, "learning_rate": 2.534797852190474e-06, "loss": 0.89627481, "num_input_tokens_seen": 77345735, "step": 3588, "time_per_iteration": 2.6934356689453125 }, { "auxiliary_loss_clip": 0.0124257, "auxiliary_loss_mlp": 0.01033316, "balance_loss_clip": 1.05428982, "balance_loss_mlp": 1.02579379, "epoch": 0.4315517345036975, "flos": 19275016544640.0, "grad_norm": 1.850993931918705, "language_loss": 0.81556547, "learning_rate": 2.5340472055382283e-06, "loss": 0.83832431, "num_input_tokens_seen": 77361765, "step": 3589, "time_per_iteration": 2.755305528640747 }, { "auxiliary_loss_clip": 0.01343848, "auxiliary_loss_mlp": 0.01026337, "balance_loss_clip": 1.04861331, "balance_loss_mlp": 1.01864839, "epoch": 0.43167197739433655, "flos": 24273853516800.0, "grad_norm": 2.1415975395444273, "language_loss": 0.81018138, "learning_rate": 2.5332964778714468e-06, "loss": 0.83388323, "num_input_tokens_seen": 77378950, "step": 3590, "time_per_iteration": 2.811648368835449 }, { "auxiliary_loss_clip": 0.01337526, "auxiliary_loss_mlp": 0.01029241, "balance_loss_clip": 1.05323923, "balance_loss_mlp": 1.02155209, "epoch": 0.43179222028497566, "flos": 16867816738560.0, "grad_norm": 1.7102871796397119, "language_loss": 0.66320336, "learning_rate": 2.5325456693040123e-06, "loss": 0.68687099, "num_input_tokens_seen": 77396145, "step": 3591, "time_per_iteration": 2.6743550300598145 }, { "auxiliary_loss_clip": 0.01154895, "auxiliary_loss_mlp": 0.01034173, "balance_loss_clip": 1.05504107, "balance_loss_mlp": 1.02541077, "epoch": 0.43191246317561477, "flos": 17639214243840.0, "grad_norm": 9.69028263982647, "language_loss": 0.74740815, "learning_rate": 2.531794779949824e-06, "loss": 0.76929879, "num_input_tokens_seen": 77414045, "step": 3592, "time_per_iteration": 2.6792287826538086 }, { "auxiliary_loss_clip": 0.01333214, "auxiliary_loss_mlp": 0.01029084, "balance_loss_clip": 1.04881895, "balance_loss_mlp": 1.02191377, "epoch": 0.4320327060662538, "flos": 23878800760320.0, "grad_norm": 1.8523865754337328, "language_loss": 0.87996209, "learning_rate": 2.5310438099227903e-06, "loss": 0.90358508, "num_input_tokens_seen": 77431310, "step": 3593, "time_per_iteration": 2.860227584838867 }, { "auxiliary_loss_clip": 0.01133563, "auxiliary_loss_mlp": 0.01001135, "balance_loss_clip": 1.01718211, "balance_loss_mlp": 1.00014555, "epoch": 0.43215294895689293, "flos": 66394917959040.0, "grad_norm": 0.8290640411962298, "language_loss": 0.5334813, "learning_rate": 2.530292759336833e-06, "loss": 0.55482829, "num_input_tokens_seen": 77492045, "step": 3594, "time_per_iteration": 3.3653109073638916 }, { "auxiliary_loss_clip": 0.01293295, "auxiliary_loss_mlp": 0.01034321, "balance_loss_clip": 1.05447459, "balance_loss_mlp": 1.02607131, "epoch": 0.432273191847532, "flos": 20594267262720.0, "grad_norm": 14.952921790676909, "language_loss": 0.69639206, "learning_rate": 2.5295416283058855e-06, "loss": 0.71966827, "num_input_tokens_seen": 77510910, "step": 3595, "time_per_iteration": 4.447504281997681 }, { "auxiliary_loss_clip": 0.01290613, "auxiliary_loss_mlp": 0.02568062, "balance_loss_clip": 1.05196655, "balance_loss_mlp": 1.00007296, "epoch": 0.4323934347381711, "flos": 19282127437440.0, "grad_norm": 1.5641422205372977, "language_loss": 0.66200602, "learning_rate": 2.5287904169438943e-06, "loss": 0.7005927, "num_input_tokens_seen": 77530115, "step": 3596, "time_per_iteration": 2.6793129444122314 }, { "auxiliary_loss_clip": 0.01494587, "auxiliary_loss_mlp": 0.01029799, "balance_loss_clip": 1.04786921, "balance_loss_mlp": 1.02169824, "epoch": 0.4325136776288102, "flos": 21726315273600.0, "grad_norm": 2.9997293001812775, "language_loss": 0.6419096, "learning_rate": 2.528039125364817e-06, "loss": 0.66715348, "num_input_tokens_seen": 77548920, "step": 3597, "time_per_iteration": 3.8477768898010254 }, { "auxiliary_loss_clip": 0.01340983, "auxiliary_loss_mlp": 0.01028044, "balance_loss_clip": 1.05060482, "balance_loss_mlp": 1.01968741, "epoch": 0.43263392051944927, "flos": 22340746344960.0, "grad_norm": 2.246251639864913, "language_loss": 0.75869066, "learning_rate": 2.5272877536826246e-06, "loss": 0.78238094, "num_input_tokens_seen": 77567715, "step": 3598, "time_per_iteration": 2.7765753269195557 }, { "auxiliary_loss_clip": 0.01389218, "auxiliary_loss_mlp": 0.01029379, "balance_loss_clip": 1.04577994, "balance_loss_mlp": 1.02172601, "epoch": 0.4327541634100884, "flos": 29168406328320.0, "grad_norm": 2.3784720656511813, "language_loss": 0.70493144, "learning_rate": 2.5265363020112986e-06, "loss": 0.72911739, "num_input_tokens_seen": 77588035, "step": 3599, "time_per_iteration": 2.893726110458374 }, { "auxiliary_loss_clip": 0.01241188, "auxiliary_loss_mlp": 0.0102955, "balance_loss_clip": 1.05521214, "balance_loss_mlp": 1.02145553, "epoch": 0.4328744063007275, "flos": 26067448264320.0, "grad_norm": 1.9873171639659757, "language_loss": 0.8401683, "learning_rate": 2.5257847704648344e-06, "loss": 0.8628757, "num_input_tokens_seen": 77609265, "step": 3600, "time_per_iteration": 2.739973545074463 }, { "auxiliary_loss_clip": 0.01190616, "auxiliary_loss_mlp": 0.01028425, "balance_loss_clip": 1.05433631, "balance_loss_mlp": 1.02054524, "epoch": 0.43299464919136654, "flos": 16581357774720.0, "grad_norm": 3.43082209638467, "language_loss": 0.75661325, "learning_rate": 2.525033159157239e-06, "loss": 0.77880371, "num_input_tokens_seen": 77625580, "step": 3601, "time_per_iteration": 2.5428669452667236 }, { "auxiliary_loss_clip": 0.01241774, "auxiliary_loss_mlp": 0.01036734, "balance_loss_clip": 1.05187821, "balance_loss_mlp": 1.02874708, "epoch": 0.43311489208200565, "flos": 16107265140480.0, "grad_norm": 1.874280818922286, "language_loss": 0.76558959, "learning_rate": 2.52428146820253e-06, "loss": 0.78837466, "num_input_tokens_seen": 77643835, "step": 3602, "time_per_iteration": 3.617720365524292 }, { "auxiliary_loss_clip": 0.01338947, "auxiliary_loss_mlp": 0.01030212, "balance_loss_clip": 1.05124879, "balance_loss_mlp": 1.02197504, "epoch": 0.43323513497264476, "flos": 22930220442240.0, "grad_norm": 1.8196948739087184, "language_loss": 0.82072461, "learning_rate": 2.52352969771474e-06, "loss": 0.8444162, "num_input_tokens_seen": 77663060, "step": 3603, "time_per_iteration": 2.7816038131713867 }, { "auxiliary_loss_clip": 0.01292657, "auxiliary_loss_mlp": 0.01029788, "balance_loss_clip": 1.05231667, "balance_loss_mlp": 1.02239108, "epoch": 0.4333553778632838, "flos": 25299031587840.0, "grad_norm": 2.6960873155031555, "language_loss": 0.88483429, "learning_rate": 2.5227778478079106e-06, "loss": 0.90805876, "num_input_tokens_seen": 77682470, "step": 3604, "time_per_iteration": 2.6740357875823975 }, { "auxiliary_loss_clip": 0.01238165, "auxiliary_loss_mlp": 0.01026263, "balance_loss_clip": 1.05163169, "balance_loss_mlp": 1.01874971, "epoch": 0.43347562075392293, "flos": 19387165783680.0, "grad_norm": 1.6665002503545883, "language_loss": 0.76613605, "learning_rate": 2.522025918596098e-06, "loss": 0.78878033, "num_input_tokens_seen": 77700770, "step": 3605, "time_per_iteration": 3.720994234085083 }, { "auxiliary_loss_clip": 0.01147625, "auxiliary_loss_mlp": 0.01032137, "balance_loss_clip": 1.05471706, "balance_loss_mlp": 1.02474654, "epoch": 0.43359586364456204, "flos": 26325969425280.0, "grad_norm": 1.5881193457736333, "language_loss": 0.65508318, "learning_rate": 2.521273910193368e-06, "loss": 0.67688084, "num_input_tokens_seen": 77723950, "step": 3606, "time_per_iteration": 2.7395517826080322 }, { "auxiliary_loss_clip": 0.01248377, "auxiliary_loss_mlp": 0.01036082, "balance_loss_clip": 1.05558491, "balance_loss_mlp": 1.02721262, "epoch": 0.4337161065352011, "flos": 15989261984640.0, "grad_norm": 2.0415932757078474, "language_loss": 0.8719849, "learning_rate": 2.5205218227138006e-06, "loss": 0.89482945, "num_input_tokens_seen": 77736905, "step": 3607, "time_per_iteration": 2.6291465759277344 }, { "auxiliary_loss_clip": 0.01190595, "auxiliary_loss_mlp": 0.01023918, "balance_loss_clip": 1.05354416, "balance_loss_mlp": 1.01556742, "epoch": 0.4338363494258402, "flos": 20224710184320.0, "grad_norm": 2.0502764172025065, "language_loss": 0.79223502, "learning_rate": 2.519769656271486e-06, "loss": 0.81438017, "num_input_tokens_seen": 77754325, "step": 3608, "time_per_iteration": 2.6317641735076904 }, { "auxiliary_loss_clip": 0.01381459, "auxiliary_loss_mlp": 0.0102927, "balance_loss_clip": 1.04880881, "balance_loss_mlp": 1.02126837, "epoch": 0.43395659231647926, "flos": 20083904870400.0, "grad_norm": 3.1248161950213347, "language_loss": 0.68024945, "learning_rate": 2.5190174109805285e-06, "loss": 0.70435667, "num_input_tokens_seen": 77774150, "step": 3609, "time_per_iteration": 2.6972553730010986 }, { "auxiliary_loss_clip": 0.01292437, "auxiliary_loss_mlp": 0.0103369, "balance_loss_clip": 1.05473685, "balance_loss_mlp": 1.02542245, "epoch": 0.43407683520711837, "flos": 19901801894400.0, "grad_norm": 3.124676244333142, "language_loss": 0.63684702, "learning_rate": 2.518265086955042e-06, "loss": 0.66010833, "num_input_tokens_seen": 77791870, "step": 3610, "time_per_iteration": 2.7032532691955566 }, { "auxiliary_loss_clip": 0.01192685, "auxiliary_loss_mlp": 0.01033582, "balance_loss_clip": 1.0544138, "balance_loss_mlp": 1.02590466, "epoch": 0.4341970780977575, "flos": 23108732058240.0, "grad_norm": 1.8638113803137257, "language_loss": 0.83531141, "learning_rate": 2.5175126843091534e-06, "loss": 0.85757411, "num_input_tokens_seen": 77811240, "step": 3611, "time_per_iteration": 2.632643699645996 }, { "auxiliary_loss_clip": 0.0119684, "auxiliary_loss_mlp": 0.01026302, "balance_loss_clip": 1.05409539, "balance_loss_mlp": 1.01867819, "epoch": 0.43431732098839654, "flos": 37408288406400.0, "grad_norm": 2.21269220547321, "language_loss": 0.75520551, "learning_rate": 2.5167602031570034e-06, "loss": 0.77743697, "num_input_tokens_seen": 77831425, "step": 3612, "time_per_iteration": 2.798128366470337 }, { "auxiliary_loss_clip": 0.01190805, "auxiliary_loss_mlp": 0.01026974, "balance_loss_clip": 1.05357969, "balance_loss_mlp": 1.01930857, "epoch": 0.43443756387903565, "flos": 31868206323840.0, "grad_norm": 1.676458072038454, "language_loss": 0.73517108, "learning_rate": 2.51600764361274e-06, "loss": 0.7573489, "num_input_tokens_seen": 77852950, "step": 3613, "time_per_iteration": 2.6677186489105225 }, { "auxiliary_loss_clip": 0.01194664, "auxiliary_loss_mlp": 0.01031829, "balance_loss_clip": 1.05534923, "balance_loss_mlp": 1.02378845, "epoch": 0.43455780676967476, "flos": 23477139901440.0, "grad_norm": 2.8860735270827207, "language_loss": 0.78691703, "learning_rate": 2.5152550057905283e-06, "loss": 0.80918193, "num_input_tokens_seen": 77872840, "step": 3614, "time_per_iteration": 2.6827006340026855 }, { "auxiliary_loss_clip": 0.01244882, "auxiliary_loss_mlp": 0.025741, "balance_loss_clip": 1.05687308, "balance_loss_mlp": 1.00007641, "epoch": 0.4346780496603138, "flos": 24207060176640.0, "grad_norm": 2.7138049623920337, "language_loss": 0.77619267, "learning_rate": 2.5145022898045415e-06, "loss": 0.81438249, "num_input_tokens_seen": 77892025, "step": 3615, "time_per_iteration": 2.636960744857788 }, { "auxiliary_loss_clip": 0.01200049, "auxiliary_loss_mlp": 0.01028371, "balance_loss_clip": 1.05025852, "balance_loss_mlp": 1.02036881, "epoch": 0.4347982925509529, "flos": 17092366611840.0, "grad_norm": 2.4450813421506936, "language_loss": 0.89908433, "learning_rate": 2.5137494957689664e-06, "loss": 0.92136854, "num_input_tokens_seen": 77907635, "step": 3616, "time_per_iteration": 2.7191097736358643 }, { "auxiliary_loss_clip": 0.01191167, "auxiliary_loss_mlp": 0.00999812, "balance_loss_clip": 1.02324796, "balance_loss_mlp": 0.99863786, "epoch": 0.43491853544159204, "flos": 60945544696320.0, "grad_norm": 0.7650114423034884, "language_loss": 0.57298207, "learning_rate": 2.5129966237980016e-06, "loss": 0.59489191, "num_input_tokens_seen": 77970630, "step": 3617, "time_per_iteration": 3.2720253467559814 }, { "auxiliary_loss_clip": 0.01340087, "auxiliary_loss_mlp": 0.01034748, "balance_loss_clip": 1.04953003, "balance_loss_mlp": 1.02682042, "epoch": 0.4350387783322311, "flos": 21944652094080.0, "grad_norm": 1.8513623679821196, "language_loss": 0.78021407, "learning_rate": 2.512243674005857e-06, "loss": 0.80396241, "num_input_tokens_seen": 77989995, "step": 3618, "time_per_iteration": 2.7402632236480713 }, { "auxiliary_loss_clip": 0.01438711, "auxiliary_loss_mlp": 0.01029012, "balance_loss_clip": 1.04444861, "balance_loss_mlp": 1.02120042, "epoch": 0.4351590212228702, "flos": 25082705928960.0, "grad_norm": 1.9874637486426991, "language_loss": 0.86295724, "learning_rate": 2.5114906465067537e-06, "loss": 0.8876344, "num_input_tokens_seen": 78010980, "step": 3619, "time_per_iteration": 2.8907909393310547 }, { "auxiliary_loss_clip": 0.01241426, "auxiliary_loss_mlp": 0.01026959, "balance_loss_clip": 1.05144572, "balance_loss_mlp": 1.01917422, "epoch": 0.4352792641135093, "flos": 21506541909120.0, "grad_norm": 7.144919884638316, "language_loss": 0.74966246, "learning_rate": 2.5107375414149264e-06, "loss": 0.77234626, "num_input_tokens_seen": 78030225, "step": 3620, "time_per_iteration": 2.663081645965576 }, { "auxiliary_loss_clip": 0.01381836, "auxiliary_loss_mlp": 0.01027011, "balance_loss_clip": 1.04357719, "balance_loss_mlp": 1.01855886, "epoch": 0.43539950700414837, "flos": 16253457494400.0, "grad_norm": 2.345727007557022, "language_loss": 0.71985662, "learning_rate": 2.5099843588446197e-06, "loss": 0.74394506, "num_input_tokens_seen": 78048545, "step": 3621, "time_per_iteration": 4.2744810581207275 }, { "auxiliary_loss_clip": 0.01294539, "auxiliary_loss_mlp": 0.01029337, "balance_loss_clip": 1.04914558, "balance_loss_mlp": 1.0212549, "epoch": 0.4355197498947875, "flos": 16691819074560.0, "grad_norm": 1.6109583145395872, "language_loss": 0.61530256, "learning_rate": 2.509231098910091e-06, "loss": 0.63854134, "num_input_tokens_seen": 78068415, "step": 3622, "time_per_iteration": 2.827373504638672 }, { "auxiliary_loss_clip": 0.01299672, "auxiliary_loss_mlp": 0.01034771, "balance_loss_clip": 1.06062663, "balance_loss_mlp": 1.02624726, "epoch": 0.4356399927854266, "flos": 16362733645440.0, "grad_norm": 4.922012104101964, "language_loss": 0.74836552, "learning_rate": 2.508477761725611e-06, "loss": 0.77171004, "num_input_tokens_seen": 78086690, "step": 3623, "time_per_iteration": 2.73087215423584 }, { "auxiliary_loss_clip": 0.01247107, "auxiliary_loss_mlp": 0.0102837, "balance_loss_clip": 1.05420721, "balance_loss_mlp": 1.02018619, "epoch": 0.43576023567606564, "flos": 17202037812480.0, "grad_norm": 1.933452134185116, "language_loss": 0.81131309, "learning_rate": 2.507724347405458e-06, "loss": 0.83406782, "num_input_tokens_seen": 78104640, "step": 3624, "time_per_iteration": 3.55426025390625 }, { "auxiliary_loss_clip": 0.01386269, "auxiliary_loss_mlp": 0.01024798, "balance_loss_clip": 1.04603243, "balance_loss_mlp": 1.01703799, "epoch": 0.43588047856670475, "flos": 15917656222080.0, "grad_norm": 2.312905227365521, "language_loss": 0.81924099, "learning_rate": 2.5069708560639243e-06, "loss": 0.84335166, "num_input_tokens_seen": 78122550, "step": 3625, "time_per_iteration": 2.763002634048462 }, { "auxiliary_loss_clip": 0.01342296, "auxiliary_loss_mlp": 0.01029719, "balance_loss_clip": 1.05032814, "balance_loss_mlp": 1.02176738, "epoch": 0.4360007214573438, "flos": 23659566099840.0, "grad_norm": 2.146308433032509, "language_loss": 0.61625993, "learning_rate": 2.5062172878153158e-06, "loss": 0.63998008, "num_input_tokens_seen": 78141825, "step": 3626, "time_per_iteration": 2.8498942852020264 }, { "auxiliary_loss_clip": 0.01349227, "auxiliary_loss_mlp": 0.01030105, "balance_loss_clip": 1.04864907, "balance_loss_mlp": 1.02184927, "epoch": 0.4361209643479829, "flos": 21978767036160.0, "grad_norm": 2.1182757923338453, "language_loss": 0.87684572, "learning_rate": 2.505463642773947e-06, "loss": 0.900639, "num_input_tokens_seen": 78161790, "step": 3627, "time_per_iteration": 2.879458427429199 }, { "auxiliary_loss_clip": 0.01343888, "auxiliary_loss_mlp": 0.02574847, "balance_loss_clip": 1.05242205, "balance_loss_mlp": 1.00013793, "epoch": 0.43624120723862203, "flos": 17420159151360.0, "grad_norm": 3.216625369964374, "language_loss": 0.75334132, "learning_rate": 2.504709921054146e-06, "loss": 0.79252863, "num_input_tokens_seen": 78178605, "step": 3628, "time_per_iteration": 3.5762667655944824 }, { "auxiliary_loss_clip": 0.01339566, "auxiliary_loss_mlp": 0.01033968, "balance_loss_clip": 1.04541063, "balance_loss_mlp": 1.02565956, "epoch": 0.4363614501292611, "flos": 17895293280000.0, "grad_norm": 2.2298991511003967, "language_loss": 0.83816218, "learning_rate": 2.50395612277025e-06, "loss": 0.86189747, "num_input_tokens_seen": 78194460, "step": 3629, "time_per_iteration": 2.814850330352783 }, { "auxiliary_loss_clip": 0.01295758, "auxiliary_loss_mlp": 0.01031986, "balance_loss_clip": 1.05116487, "balance_loss_mlp": 1.02440441, "epoch": 0.4364816930199002, "flos": 20302888135680.0, "grad_norm": 2.307418920225434, "language_loss": 0.73304403, "learning_rate": 2.503202248036612e-06, "loss": 0.75632155, "num_input_tokens_seen": 78213315, "step": 3630, "time_per_iteration": 2.7428195476531982 }, { "auxiliary_loss_clip": 0.01191828, "auxiliary_loss_mlp": 0.01028653, "balance_loss_clip": 1.05563748, "balance_loss_mlp": 1.02077913, "epoch": 0.4366019359105393, "flos": 24061334699520.0, "grad_norm": 1.795330105084189, "language_loss": 0.73391038, "learning_rate": 2.5024482969675927e-06, "loss": 0.7561152, "num_input_tokens_seen": 78233270, "step": 3631, "time_per_iteration": 3.567664861679077 }, { "auxiliary_loss_clip": 0.01297147, "auxiliary_loss_mlp": 0.01025234, "balance_loss_clip": 1.04730368, "balance_loss_mlp": 1.01779246, "epoch": 0.43672217880117836, "flos": 21754109422080.0, "grad_norm": 2.367792406696381, "language_loss": 0.84377515, "learning_rate": 2.501694269677566e-06, "loss": 0.86699903, "num_input_tokens_seen": 78251040, "step": 3632, "time_per_iteration": 2.828904628753662 }, { "auxiliary_loss_clip": 0.01246403, "auxiliary_loss_mlp": 0.01027769, "balance_loss_clip": 1.05303621, "balance_loss_mlp": 1.01963282, "epoch": 0.4368424216918175, "flos": 18035200753920.0, "grad_norm": 2.001875209734955, "language_loss": 0.80396557, "learning_rate": 2.500940166280918e-06, "loss": 0.82670724, "num_input_tokens_seen": 78269470, "step": 3633, "time_per_iteration": 2.642428159713745 }, { "auxiliary_loss_clip": 0.01241045, "auxiliary_loss_mlp": 0.01026841, "balance_loss_clip": 1.05320048, "balance_loss_mlp": 1.01924753, "epoch": 0.4369626645824566, "flos": 25447127362560.0, "grad_norm": 1.9815927877909907, "language_loss": 0.79090714, "learning_rate": 2.500185986892045e-06, "loss": 0.813586, "num_input_tokens_seen": 78288955, "step": 3634, "time_per_iteration": 2.753901243209839 }, { "auxiliary_loss_clip": 0.01240785, "auxiliary_loss_mlp": 0.01030486, "balance_loss_clip": 1.05135584, "balance_loss_mlp": 1.02215934, "epoch": 0.43708290747309564, "flos": 25302694775040.0, "grad_norm": 2.120155099575838, "language_loss": 0.77652025, "learning_rate": 2.499431731625355e-06, "loss": 0.7992329, "num_input_tokens_seen": 78307980, "step": 3635, "time_per_iteration": 2.6922478675842285 }, { "auxiliary_loss_clip": 0.01196396, "auxiliary_loss_mlp": 0.0102873, "balance_loss_clip": 1.05610311, "balance_loss_mlp": 1.02066588, "epoch": 0.43720315036373475, "flos": 31575103344000.0, "grad_norm": 1.8491617082904117, "language_loss": 0.79402196, "learning_rate": 2.4986774005952686e-06, "loss": 0.81627333, "num_input_tokens_seen": 78330355, "step": 3636, "time_per_iteration": 2.7138524055480957 }, { "auxiliary_loss_clip": 0.01246559, "auxiliary_loss_mlp": 0.01035195, "balance_loss_clip": 1.05756688, "balance_loss_mlp": 1.02708864, "epoch": 0.43732339325437386, "flos": 23112000195840.0, "grad_norm": 2.4967405997775884, "language_loss": 0.84628278, "learning_rate": 2.4979229939162166e-06, "loss": 0.86910033, "num_input_tokens_seen": 78349135, "step": 3637, "time_per_iteration": 2.660477876663208 }, { "auxiliary_loss_clip": 0.01238013, "auxiliary_loss_mlp": 0.01036787, "balance_loss_clip": 1.05377555, "balance_loss_mlp": 1.02913988, "epoch": 0.4374436361450129, "flos": 27746272080000.0, "grad_norm": 1.6867878510196712, "language_loss": 0.80646408, "learning_rate": 2.4971685117026433e-06, "loss": 0.82921207, "num_input_tokens_seen": 78368900, "step": 3638, "time_per_iteration": 2.7222275733947754 }, { "auxiliary_loss_clip": 0.01245753, "auxiliary_loss_mlp": 0.01027011, "balance_loss_clip": 1.05547547, "balance_loss_mlp": 1.01898873, "epoch": 0.437563879035652, "flos": 24172370616960.0, "grad_norm": 2.264335382624015, "language_loss": 0.76747549, "learning_rate": 2.4964139540690018e-06, "loss": 0.79020309, "num_input_tokens_seen": 78392235, "step": 3639, "time_per_iteration": 2.700502634048462 }, { "auxiliary_loss_clip": 0.01343106, "auxiliary_loss_mlp": 0.01032995, "balance_loss_clip": 1.05207634, "balance_loss_mlp": 1.02408409, "epoch": 0.4376841219262911, "flos": 23477211728640.0, "grad_norm": 1.9039227832274725, "language_loss": 0.72678655, "learning_rate": 2.495659321129758e-06, "loss": 0.75054753, "num_input_tokens_seen": 78409980, "step": 3640, "time_per_iteration": 2.798823356628418 }, { "auxiliary_loss_clip": 0.01240436, "auxiliary_loss_mlp": 0.01027025, "balance_loss_clip": 1.05127108, "balance_loss_mlp": 1.01938367, "epoch": 0.4378043648169302, "flos": 25447809720960.0, "grad_norm": 3.896935195436091, "language_loss": 0.7554028, "learning_rate": 2.494904612999389e-06, "loss": 0.77807742, "num_input_tokens_seen": 78428690, "step": 3641, "time_per_iteration": 2.7444708347320557 }, { "auxiliary_loss_clip": 0.01130302, "auxiliary_loss_mlp": 0.01001747, "balance_loss_clip": 1.01770616, "balance_loss_mlp": 1.00076318, "epoch": 0.4379246077075693, "flos": 53914056986880.0, "grad_norm": 0.7725913654156559, "language_loss": 0.56476879, "learning_rate": 2.4941498297923843e-06, "loss": 0.58608925, "num_input_tokens_seen": 78489260, "step": 3642, "time_per_iteration": 3.1892731189727783 }, { "auxiliary_loss_clip": 0.01239828, "auxiliary_loss_mlp": 0.01026698, "balance_loss_clip": 1.05492544, "balance_loss_mlp": 1.01912808, "epoch": 0.43804485059820836, "flos": 20588305605120.0, "grad_norm": 1.743196922355191, "language_loss": 0.70010269, "learning_rate": 2.4933949716232424e-06, "loss": 0.72276795, "num_input_tokens_seen": 78506785, "step": 3643, "time_per_iteration": 2.729036569595337 }, { "auxiliary_loss_clip": 0.01249989, "auxiliary_loss_mlp": 0.01034384, "balance_loss_clip": 1.05666208, "balance_loss_mlp": 1.02568185, "epoch": 0.43816509348884747, "flos": 23876214981120.0, "grad_norm": 2.2643169847666216, "language_loss": 0.73676264, "learning_rate": 2.492640038606476e-06, "loss": 0.75960642, "num_input_tokens_seen": 78525150, "step": 3644, "time_per_iteration": 2.835496425628662 }, { "auxiliary_loss_clip": 0.01241819, "auxiliary_loss_mlp": 0.01027351, "balance_loss_clip": 1.05287921, "balance_loss_mlp": 1.01911986, "epoch": 0.4382853363794866, "flos": 14684448533760.0, "grad_norm": 2.384688142173742, "language_loss": 0.79066879, "learning_rate": 2.491885030856608e-06, "loss": 0.81336051, "num_input_tokens_seen": 78543245, "step": 3645, "time_per_iteration": 2.746540069580078 }, { "auxiliary_loss_clip": 0.01295217, "auxiliary_loss_mlp": 0.01028577, "balance_loss_clip": 1.05398142, "balance_loss_mlp": 1.0201968, "epoch": 0.43840557927012563, "flos": 17165301177600.0, "grad_norm": 3.2380095752449267, "language_loss": 0.82589281, "learning_rate": 2.4911299484881713e-06, "loss": 0.84913075, "num_input_tokens_seen": 78560775, "step": 3646, "time_per_iteration": 2.6745572090148926 }, { "auxiliary_loss_clip": 0.01287247, "auxiliary_loss_mlp": 0.01035343, "balance_loss_clip": 1.04986382, "balance_loss_mlp": 1.02773714, "epoch": 0.43852582216076474, "flos": 19390685316480.0, "grad_norm": 1.660037849228677, "language_loss": 0.80948502, "learning_rate": 2.490374791615712e-06, "loss": 0.83271098, "num_input_tokens_seen": 78580800, "step": 3647, "time_per_iteration": 4.103867292404175 }, { "auxiliary_loss_clip": 0.01199812, "auxiliary_loss_mlp": 0.02576741, "balance_loss_clip": 1.05863428, "balance_loss_mlp": 1.00017452, "epoch": 0.43864606505140386, "flos": 18075133699200.0, "grad_norm": 4.081661212294089, "language_loss": 0.77691877, "learning_rate": 2.4896195603537867e-06, "loss": 0.81468427, "num_input_tokens_seen": 78595410, "step": 3648, "time_per_iteration": 2.6325931549072266 }, { "auxiliary_loss_clip": 0.01379605, "auxiliary_loss_mlp": 0.01029356, "balance_loss_clip": 1.05219519, "balance_loss_mlp": 1.02155936, "epoch": 0.4387663079420429, "flos": 19644896845440.0, "grad_norm": 2.3413320540595377, "language_loss": 0.74221408, "learning_rate": 2.488864254816964e-06, "loss": 0.76630366, "num_input_tokens_seen": 78614100, "step": 3649, "time_per_iteration": 2.747596263885498 }, { "auxiliary_loss_clip": 0.01246241, "auxiliary_loss_mlp": 0.01031937, "balance_loss_clip": 1.05719304, "balance_loss_mlp": 1.02312136, "epoch": 0.438886550832682, "flos": 19719339782400.0, "grad_norm": 3.619054590764539, "language_loss": 0.68349457, "learning_rate": 2.4881088751198218e-06, "loss": 0.70627636, "num_input_tokens_seen": 78632260, "step": 3650, "time_per_iteration": 3.5251617431640625 }, { "auxiliary_loss_clip": 0.01293429, "auxiliary_loss_mlp": 0.01027402, "balance_loss_clip": 1.05135286, "balance_loss_mlp": 1.01899457, "epoch": 0.43900679372332113, "flos": 14536675981440.0, "grad_norm": 2.7518245982952125, "language_loss": 0.63593435, "learning_rate": 2.4873534213769517e-06, "loss": 0.65914273, "num_input_tokens_seen": 78647490, "step": 3651, "time_per_iteration": 2.66157865524292 }, { "auxiliary_loss_clip": 0.0133532, "auxiliary_loss_mlp": 0.01025288, "balance_loss_clip": 1.05147636, "balance_loss_mlp": 1.01768267, "epoch": 0.4391270366139602, "flos": 24056234968320.0, "grad_norm": 2.1379320777519215, "language_loss": 0.71607924, "learning_rate": 2.4865978937029547e-06, "loss": 0.7396853, "num_input_tokens_seen": 78666470, "step": 3652, "time_per_iteration": 2.717647075653076 }, { "auxiliary_loss_clip": 0.0138093, "auxiliary_loss_mlp": 0.01026313, "balance_loss_clip": 1.04888868, "balance_loss_mlp": 1.01859474, "epoch": 0.4392472795045993, "flos": 31538510363520.0, "grad_norm": 1.6468010401713709, "language_loss": 0.66342866, "learning_rate": 2.485842292212445e-06, "loss": 0.68750107, "num_input_tokens_seen": 78687685, "step": 3653, "time_per_iteration": 2.845449447631836 }, { "auxiliary_loss_clip": 0.01196153, "auxiliary_loss_mlp": 0.01030954, "balance_loss_clip": 1.05723822, "balance_loss_mlp": 1.02304435, "epoch": 0.4393675223952384, "flos": 14866300114560.0, "grad_norm": 1.8824213127660807, "language_loss": 0.80477983, "learning_rate": 2.485086617020045e-06, "loss": 0.82705092, "num_input_tokens_seen": 78706180, "step": 3654, "time_per_iteration": 3.5354011058807373 }, { "auxiliary_loss_clip": 0.01286508, "auxiliary_loss_mlp": 0.01030828, "balance_loss_clip": 1.04961646, "balance_loss_mlp": 1.02239954, "epoch": 0.43948776528587746, "flos": 14825900292480.0, "grad_norm": 2.2315388401603036, "language_loss": 0.81584716, "learning_rate": 2.4843308682403903e-06, "loss": 0.83902049, "num_input_tokens_seen": 78723095, "step": 3655, "time_per_iteration": 2.6608755588531494 }, { "auxiliary_loss_clip": 0.01191689, "auxiliary_loss_mlp": 0.01024681, "balance_loss_clip": 1.05412161, "balance_loss_mlp": 1.01719487, "epoch": 0.4396080081765166, "flos": 13914523486080.0, "grad_norm": 3.5788394175186817, "language_loss": 0.82979465, "learning_rate": 2.4835750459881294e-06, "loss": 0.85195839, "num_input_tokens_seen": 78739720, "step": 3656, "time_per_iteration": 2.578390121459961 }, { "auxiliary_loss_clip": 0.01287396, "auxiliary_loss_mlp": 0.01026843, "balance_loss_clip": 1.04741979, "balance_loss_mlp": 1.01889753, "epoch": 0.43972825106715563, "flos": 18222978078720.0, "grad_norm": 1.9285165040900338, "language_loss": 0.81532592, "learning_rate": 2.4828191503779177e-06, "loss": 0.83846831, "num_input_tokens_seen": 78757820, "step": 3657, "time_per_iteration": 3.7613468170166016 }, { "auxiliary_loss_clip": 0.01335855, "auxiliary_loss_mlp": 0.01032013, "balance_loss_clip": 1.04758954, "balance_loss_mlp": 1.02329254, "epoch": 0.43984849395779474, "flos": 16873239692160.0, "grad_norm": 1.9463114340833605, "language_loss": 0.89893234, "learning_rate": 2.482063181524425e-06, "loss": 0.92261106, "num_input_tokens_seen": 78773720, "step": 3658, "time_per_iteration": 2.8031158447265625 }, { "auxiliary_loss_clip": 0.0119641, "auxiliary_loss_mlp": 0.01030623, "balance_loss_clip": 1.05652285, "balance_loss_mlp": 1.02168787, "epoch": 0.43996873684843385, "flos": 18691504104960.0, "grad_norm": 2.0044187706947296, "language_loss": 0.81445193, "learning_rate": 2.4813071395423307e-06, "loss": 0.83672225, "num_input_tokens_seen": 78791285, "step": 3659, "time_per_iteration": 2.5990357398986816 }, { "auxiliary_loss_clip": 0.01241302, "auxiliary_loss_mlp": 0.01027201, "balance_loss_clip": 1.05244267, "balance_loss_mlp": 1.01945877, "epoch": 0.4400889797390729, "flos": 23653460787840.0, "grad_norm": 1.9768455460907064, "language_loss": 0.64964122, "learning_rate": 2.4805510245463263e-06, "loss": 0.67232627, "num_input_tokens_seen": 78811440, "step": 3660, "time_per_iteration": 2.690753936767578 }, { "auxiliary_loss_clip": 0.01242316, "auxiliary_loss_mlp": 0.01031707, "balance_loss_clip": 1.05158448, "balance_loss_mlp": 1.02352309, "epoch": 0.440209222629712, "flos": 23149203707520.0, "grad_norm": 2.5244088465145547, "language_loss": 0.60660112, "learning_rate": 2.4797948366511137e-06, "loss": 0.62934136, "num_input_tokens_seen": 78831150, "step": 3661, "time_per_iteration": 2.658458709716797 }, { "auxiliary_loss_clip": 0.01339094, "auxiliary_loss_mlp": 0.01027416, "balance_loss_clip": 1.04716456, "balance_loss_mlp": 1.01997161, "epoch": 0.4403294655203511, "flos": 24823394668800.0, "grad_norm": 2.8768115687304006, "language_loss": 0.76277453, "learning_rate": 2.4790385759714055e-06, "loss": 0.78643966, "num_input_tokens_seen": 78850215, "step": 3662, "time_per_iteration": 2.7350387573242188 }, { "auxiliary_loss_clip": 0.01248108, "auxiliary_loss_mlp": 0.01025229, "balance_loss_clip": 1.05920815, "balance_loss_mlp": 1.01718879, "epoch": 0.4404497084109902, "flos": 22565080736640.0, "grad_norm": 1.9554695720631914, "language_loss": 0.71298957, "learning_rate": 2.478282242621926e-06, "loss": 0.73572296, "num_input_tokens_seen": 78870675, "step": 3663, "time_per_iteration": 2.637824535369873 }, { "auxiliary_loss_clip": 0.01245779, "auxiliary_loss_mlp": 0.00998786, "balance_loss_clip": 1.01877964, "balance_loss_mlp": 0.9975878, "epoch": 0.4405699513016293, "flos": 64967073448320.0, "grad_norm": 0.8502763865470065, "language_loss": 0.59497631, "learning_rate": 2.477525836717411e-06, "loss": 0.61742187, "num_input_tokens_seen": 78938440, "step": 3664, "time_per_iteration": 3.40004563331604 }, { "auxiliary_loss_clip": 0.01239981, "auxiliary_loss_mlp": 0.01039485, "balance_loss_clip": 1.05162477, "balance_loss_mlp": 1.03098536, "epoch": 0.4406901941922684, "flos": 35661952978560.0, "grad_norm": 4.319037692219301, "language_loss": 0.79439437, "learning_rate": 2.476769358372606e-06, "loss": 0.81718904, "num_input_tokens_seen": 78960090, "step": 3665, "time_per_iteration": 2.7879490852355957 }, { "auxiliary_loss_clip": 0.01339794, "auxiliary_loss_mlp": 0.01027083, "balance_loss_clip": 1.05434799, "balance_loss_mlp": 1.01906574, "epoch": 0.44081043708290746, "flos": 18040767361920.0, "grad_norm": 2.0520763278678653, "language_loss": 0.74913609, "learning_rate": 2.4760128077022683e-06, "loss": 0.77280492, "num_input_tokens_seen": 78978225, "step": 3666, "time_per_iteration": 2.7001712322235107 }, { "auxiliary_loss_clip": 0.01287741, "auxiliary_loss_mlp": 0.01027757, "balance_loss_clip": 1.04776144, "balance_loss_mlp": 1.01997244, "epoch": 0.44093067997354657, "flos": 30153507799680.0, "grad_norm": 1.5453901491034103, "language_loss": 0.68307656, "learning_rate": 2.4752561848211672e-06, "loss": 0.70623153, "num_input_tokens_seen": 79000625, "step": 3667, "time_per_iteration": 2.8263654708862305 }, { "auxiliary_loss_clip": 0.01239709, "auxiliary_loss_mlp": 0.0103557, "balance_loss_clip": 1.05674171, "balance_loss_mlp": 1.02846479, "epoch": 0.4410509228641857, "flos": 23255068066560.0, "grad_norm": 3.9084065352777784, "language_loss": 0.71456718, "learning_rate": 2.4744994898440797e-06, "loss": 0.73731995, "num_input_tokens_seen": 79019415, "step": 3668, "time_per_iteration": 2.637880802154541 }, { "auxiliary_loss_clip": 0.01342968, "auxiliary_loss_mlp": 0.0102767, "balance_loss_clip": 1.04875469, "balance_loss_mlp": 1.0196228, "epoch": 0.44117116575482473, "flos": 19500571998720.0, "grad_norm": 1.9982485306790896, "language_loss": 0.83934611, "learning_rate": 2.473742722885797e-06, "loss": 0.86305249, "num_input_tokens_seen": 79038435, "step": 3669, "time_per_iteration": 2.7398924827575684 }, { "auxiliary_loss_clip": 0.01248248, "auxiliary_loss_mlp": 0.02575791, "balance_loss_clip": 1.05866778, "balance_loss_mlp": 1.00019598, "epoch": 0.44129140864546385, "flos": 27053124353280.0, "grad_norm": 7.76065153445084, "language_loss": 0.64965665, "learning_rate": 2.4729858840611197e-06, "loss": 0.68789703, "num_input_tokens_seen": 79057345, "step": 3670, "time_per_iteration": 2.675870418548584 }, { "auxiliary_loss_clip": 0.01191012, "auxiliary_loss_mlp": 0.01033135, "balance_loss_clip": 1.05493975, "balance_loss_mlp": 1.02569604, "epoch": 0.4414116515361029, "flos": 26102101910400.0, "grad_norm": 2.2496651145399844, "language_loss": 0.72850543, "learning_rate": 2.4722289734848605e-06, "loss": 0.75074685, "num_input_tokens_seen": 79077810, "step": 3671, "time_per_iteration": 2.721208333969116 }, { "auxiliary_loss_clip": 0.01345948, "auxiliary_loss_mlp": 0.01028654, "balance_loss_clip": 1.05645096, "balance_loss_mlp": 1.02092671, "epoch": 0.441531894426742, "flos": 21906083865600.0, "grad_norm": 1.9688862116279817, "language_loss": 0.77659667, "learning_rate": 2.471471991271841e-06, "loss": 0.80034274, "num_input_tokens_seen": 79094935, "step": 3672, "time_per_iteration": 2.7162418365478516 }, { "auxiliary_loss_clip": 0.01236974, "auxiliary_loss_mlp": 0.0103358, "balance_loss_clip": 1.05043101, "balance_loss_mlp": 1.02562928, "epoch": 0.4416521373173811, "flos": 23437099215360.0, "grad_norm": 2.041069827738184, "language_loss": 0.79501867, "learning_rate": 2.470714937536896e-06, "loss": 0.81772423, "num_input_tokens_seen": 79113660, "step": 3673, "time_per_iteration": 3.5676069259643555 }, { "auxiliary_loss_clip": 0.01394514, "auxiliary_loss_mlp": 0.01031219, "balance_loss_clip": 1.05287576, "balance_loss_mlp": 1.02236187, "epoch": 0.4417723802080202, "flos": 20334345471360.0, "grad_norm": 1.923843422672611, "language_loss": 0.7073096, "learning_rate": 2.469957812394868e-06, "loss": 0.73156697, "num_input_tokens_seen": 79132470, "step": 3674, "time_per_iteration": 2.804262638092041 }, { "auxiliary_loss_clip": 0.01192687, "auxiliary_loss_mlp": 0.01026836, "balance_loss_clip": 1.05763066, "balance_loss_mlp": 1.0194627, "epoch": 0.4418926230986593, "flos": 18880682060160.0, "grad_norm": 2.010929986598282, "language_loss": 0.76056063, "learning_rate": 2.4692006159606148e-06, "loss": 0.78275585, "num_input_tokens_seen": 79150000, "step": 3675, "time_per_iteration": 3.4874372482299805 }, { "auxiliary_loss_clip": 0.01190138, "auxiliary_loss_mlp": 0.01034086, "balance_loss_clip": 1.05263257, "balance_loss_mlp": 1.02636147, "epoch": 0.4420128659892984, "flos": 19464409981440.0, "grad_norm": 2.226479243552932, "language_loss": 0.78773665, "learning_rate": 2.468443348349e-06, "loss": 0.80997884, "num_input_tokens_seen": 79167875, "step": 3676, "time_per_iteration": 2.6435623168945312 }, { "auxiliary_loss_clip": 0.01385898, "auxiliary_loss_mlp": 0.01031944, "balance_loss_clip": 1.04609835, "balance_loss_mlp": 1.02393949, "epoch": 0.44213310887993745, "flos": 17894359526400.0, "grad_norm": 2.395775891984414, "language_loss": 0.82746494, "learning_rate": 2.467686009674902e-06, "loss": 0.85164338, "num_input_tokens_seen": 79182325, "step": 3677, "time_per_iteration": 2.729168176651001 }, { "auxiliary_loss_clip": 0.01238638, "auxiliary_loss_mlp": 0.01028887, "balance_loss_clip": 1.05047107, "balance_loss_mlp": 1.02087021, "epoch": 0.44225335177057656, "flos": 19204667758080.0, "grad_norm": 2.3097031710249687, "language_loss": 0.85226822, "learning_rate": 2.466928600053209e-06, "loss": 0.87494344, "num_input_tokens_seen": 79197630, "step": 3678, "time_per_iteration": 2.656160593032837 }, { "auxiliary_loss_clip": 0.0129276, "auxiliary_loss_mlp": 0.01029702, "balance_loss_clip": 1.05105937, "balance_loss_mlp": 1.02191806, "epoch": 0.4423735946612157, "flos": 23471321898240.0, "grad_norm": 2.3003399172885777, "language_loss": 0.71247113, "learning_rate": 2.466171119598818e-06, "loss": 0.73569578, "num_input_tokens_seen": 79217600, "step": 3679, "time_per_iteration": 2.726193428039551 }, { "auxiliary_loss_clip": 0.01244131, "auxiliary_loss_mlp": 0.01026163, "balance_loss_clip": 1.05011499, "balance_loss_mlp": 1.01811326, "epoch": 0.44249383755185473, "flos": 26685398868480.0, "grad_norm": 1.8317933104616118, "language_loss": 0.76972616, "learning_rate": 2.465413568426639e-06, "loss": 0.79242909, "num_input_tokens_seen": 79238550, "step": 3680, "time_per_iteration": 2.6508617401123047 }, { "auxiliary_loss_clip": 0.01233668, "auxiliary_loss_mlp": 0.01028235, "balance_loss_clip": 1.05008137, "balance_loss_mlp": 1.02080774, "epoch": 0.44261408044249384, "flos": 23147659422720.0, "grad_norm": 1.713300305501779, "language_loss": 0.81220222, "learning_rate": 2.464655946651591e-06, "loss": 0.83482128, "num_input_tokens_seen": 79257555, "step": 3681, "time_per_iteration": 3.636655330657959 }, { "auxiliary_loss_clip": 0.01240736, "auxiliary_loss_mlp": 0.01031818, "balance_loss_clip": 1.0536195, "balance_loss_mlp": 1.02439702, "epoch": 0.44273432333313295, "flos": 24462564595200.0, "grad_norm": 1.9606686780714202, "language_loss": 0.80882102, "learning_rate": 2.4638982543886065e-06, "loss": 0.8315466, "num_input_tokens_seen": 79277595, "step": 3682, "time_per_iteration": 2.652500629425049 }, { "auxiliary_loss_clip": 0.01242936, "auxiliary_loss_mlp": 0.01040348, "balance_loss_clip": 1.05404699, "balance_loss_mlp": 1.03209245, "epoch": 0.442854566223772, "flos": 17528932512000.0, "grad_norm": 2.285180986394316, "language_loss": 0.8700397, "learning_rate": 2.4631404917526254e-06, "loss": 0.89287251, "num_input_tokens_seen": 79294550, "step": 3683, "time_per_iteration": 3.578205108642578 }, { "auxiliary_loss_clip": 0.01238667, "auxiliary_loss_mlp": 0.01029855, "balance_loss_clip": 1.05125952, "balance_loss_mlp": 1.02269077, "epoch": 0.4429748091144111, "flos": 24896293320960.0, "grad_norm": 1.576433020144998, "language_loss": 0.79254675, "learning_rate": 2.4623826588586e-06, "loss": 0.81523198, "num_input_tokens_seen": 79314820, "step": 3684, "time_per_iteration": 2.6929454803466797 }, { "auxiliary_loss_clip": 0.01288374, "auxiliary_loss_mlp": 0.01032819, "balance_loss_clip": 1.04859853, "balance_loss_mlp": 1.02463531, "epoch": 0.4430950520050502, "flos": 21614704738560.0, "grad_norm": 1.5528572497136857, "language_loss": 0.82668424, "learning_rate": 2.461624755821492e-06, "loss": 0.84989613, "num_input_tokens_seen": 79334300, "step": 3685, "time_per_iteration": 2.6083621978759766 }, { "auxiliary_loss_clip": 0.01243317, "auxiliary_loss_mlp": 0.0102927, "balance_loss_clip": 1.0503583, "balance_loss_mlp": 1.02152169, "epoch": 0.4432152948956893, "flos": 24572271709440.0, "grad_norm": 1.6981724054760885, "language_loss": 0.76274276, "learning_rate": 2.4608667827562763e-06, "loss": 0.7854687, "num_input_tokens_seen": 79353630, "step": 3686, "time_per_iteration": 2.7795984745025635 }, { "auxiliary_loss_clip": 0.01249437, "auxiliary_loss_mlp": 0.01030541, "balance_loss_clip": 1.0567503, "balance_loss_mlp": 1.0218091, "epoch": 0.4433355377863284, "flos": 21762261809280.0, "grad_norm": 2.6305949077882547, "language_loss": 0.90071869, "learning_rate": 2.460108739777936e-06, "loss": 0.92351854, "num_input_tokens_seen": 79372765, "step": 3687, "time_per_iteration": 2.6429452896118164 }, { "auxiliary_loss_clip": 0.01290859, "auxiliary_loss_mlp": 0.0102617, "balance_loss_clip": 1.05358386, "balance_loss_mlp": 1.01849902, "epoch": 0.44345578067696745, "flos": 20084479488000.0, "grad_norm": 1.6729632591677888, "language_loss": 0.76426852, "learning_rate": 2.4593506270014656e-06, "loss": 0.78743881, "num_input_tokens_seen": 79391735, "step": 3688, "time_per_iteration": 2.6688597202301025 }, { "auxiliary_loss_clip": 0.01290612, "auxiliary_loss_mlp": 0.01029935, "balance_loss_clip": 1.04938781, "balance_loss_mlp": 1.02143526, "epoch": 0.44357602356760656, "flos": 24169497528960.0, "grad_norm": 1.829886679436439, "language_loss": 0.81875074, "learning_rate": 2.45859244454187e-06, "loss": 0.84195614, "num_input_tokens_seen": 79411525, "step": 3689, "time_per_iteration": 2.729163646697998 }, { "auxiliary_loss_clip": 0.01235654, "auxiliary_loss_mlp": 0.01029769, "balance_loss_clip": 1.05409551, "balance_loss_mlp": 1.02231216, "epoch": 0.44369626645824567, "flos": 22707717644160.0, "grad_norm": 1.9311850136490099, "language_loss": 0.66700423, "learning_rate": 2.4578341925141655e-06, "loss": 0.68965846, "num_input_tokens_seen": 79430740, "step": 3690, "time_per_iteration": 2.693181276321411 }, { "auxiliary_loss_clip": 0.01247625, "auxiliary_loss_mlp": 0.01025919, "balance_loss_clip": 1.05311584, "balance_loss_mlp": 1.01809883, "epoch": 0.4438165093488847, "flos": 38030225420160.0, "grad_norm": 2.077117068977518, "language_loss": 0.72281384, "learning_rate": 2.457075871033378e-06, "loss": 0.7455492, "num_input_tokens_seen": 79452615, "step": 3691, "time_per_iteration": 2.781879186630249 }, { "auxiliary_loss_clip": 0.013362, "auxiliary_loss_mlp": 0.01028549, "balance_loss_clip": 1.04954839, "balance_loss_mlp": 1.02083623, "epoch": 0.44393675223952384, "flos": 15523213996800.0, "grad_norm": 2.2687762630086437, "language_loss": 0.88627636, "learning_rate": 2.4563174802145445e-06, "loss": 0.90992391, "num_input_tokens_seen": 79469865, "step": 3692, "time_per_iteration": 2.7024874687194824 }, { "auxiliary_loss_clip": 0.01190686, "auxiliary_loss_mlp": 0.009984, "balance_loss_clip": 1.01979434, "balance_loss_mlp": 0.99738044, "epoch": 0.44405699513016295, "flos": 64574893779840.0, "grad_norm": 0.6347478576403353, "language_loss": 0.48635247, "learning_rate": 2.455559020172712e-06, "loss": 0.50824332, "num_input_tokens_seen": 79537220, "step": 3693, "time_per_iteration": 3.3425819873809814 }, { "auxiliary_loss_clip": 0.01390471, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.05336583, "balance_loss_mlp": 1.02603936, "epoch": 0.444177238020802, "flos": 23987394552960.0, "grad_norm": 2.2295318912027806, "language_loss": 0.89800674, "learning_rate": 2.4548004910229385e-06, "loss": 0.92224979, "num_input_tokens_seen": 79554795, "step": 3694, "time_per_iteration": 2.7460951805114746 }, { "auxiliary_loss_clip": 0.01242309, "auxiliary_loss_mlp": 0.02572979, "balance_loss_clip": 1.05286455, "balance_loss_mlp": 1.00028479, "epoch": 0.4442974809114411, "flos": 22563069575040.0, "grad_norm": 16.422846651453995, "language_loss": 0.87031204, "learning_rate": 2.4540418928802913e-06, "loss": 0.90846491, "num_input_tokens_seen": 79573530, "step": 3695, "time_per_iteration": 2.7322394847869873 }, { "auxiliary_loss_clip": 0.01294676, "auxiliary_loss_mlp": 0.01029594, "balance_loss_clip": 1.05296242, "balance_loss_mlp": 1.02142787, "epoch": 0.4444177238020802, "flos": 17675699483520.0, "grad_norm": 2.1858803640084097, "language_loss": 0.6574589, "learning_rate": 2.4532832258598506e-06, "loss": 0.68070161, "num_input_tokens_seen": 79591360, "step": 3696, "time_per_iteration": 2.657613515853882 }, { "auxiliary_loss_clip": 0.01190192, "auxiliary_loss_mlp": 0.01028287, "balance_loss_clip": 1.05512357, "balance_loss_mlp": 1.02065778, "epoch": 0.4445379666927193, "flos": 28621594609920.0, "grad_norm": 1.8067751417135207, "language_loss": 0.80545902, "learning_rate": 2.4525244900767047e-06, "loss": 0.82764387, "num_input_tokens_seen": 79612175, "step": 3697, "time_per_iteration": 2.727884531021118 }, { "auxiliary_loss_clip": 0.01140289, "auxiliary_loss_mlp": 0.01001892, "balance_loss_clip": 1.02708447, "balance_loss_mlp": 1.00083661, "epoch": 0.4446582095833584, "flos": 70487370115200.0, "grad_norm": 0.7800839528244163, "language_loss": 0.60482812, "learning_rate": 2.4517656856459536e-06, "loss": 0.62624991, "num_input_tokens_seen": 79678020, "step": 3698, "time_per_iteration": 3.336541175842285 }, { "auxiliary_loss_clip": 0.01244892, "auxiliary_loss_mlp": 0.0103185, "balance_loss_clip": 1.05599344, "balance_loss_mlp": 1.023785, "epoch": 0.4447784524739975, "flos": 26505199313280.0, "grad_norm": 1.683193250307588, "language_loss": 0.68536592, "learning_rate": 2.4510068126827073e-06, "loss": 0.70813334, "num_input_tokens_seen": 79699020, "step": 3699, "time_per_iteration": 3.6834263801574707 }, { "auxiliary_loss_clip": 0.01290449, "auxiliary_loss_mlp": 0.01025825, "balance_loss_clip": 1.05224621, "balance_loss_mlp": 1.017802, "epoch": 0.44489869536463655, "flos": 11656209553920.0, "grad_norm": 2.3199011470148987, "language_loss": 0.8172797, "learning_rate": 2.450247871302086e-06, "loss": 0.84044242, "num_input_tokens_seen": 79716795, "step": 3700, "time_per_iteration": 2.723374366760254 }, { "auxiliary_loss_clip": 0.01149026, "auxiliary_loss_mlp": 0.01023885, "balance_loss_clip": 1.054497, "balance_loss_mlp": 1.01655674, "epoch": 0.44501893825527566, "flos": 20448469958400.0, "grad_norm": 2.5971370662413453, "language_loss": 0.8426773, "learning_rate": 2.44948886161922e-06, "loss": 0.86440647, "num_input_tokens_seen": 79735810, "step": 3701, "time_per_iteration": 3.511808156967163 }, { "auxiliary_loss_clip": 0.0124418, "auxiliary_loss_mlp": 0.01024847, "balance_loss_clip": 1.05395937, "balance_loss_mlp": 1.01728296, "epoch": 0.4451391811459148, "flos": 18261079430400.0, "grad_norm": 3.5906810708127233, "language_loss": 0.84849453, "learning_rate": 2.4487297837492524e-06, "loss": 0.87118471, "num_input_tokens_seen": 79754975, "step": 3702, "time_per_iteration": 2.637890577316284 }, { "auxiliary_loss_clip": 0.01336759, "auxiliary_loss_mlp": 0.01029919, "balance_loss_clip": 1.04950511, "balance_loss_mlp": 1.02194965, "epoch": 0.44525942403655383, "flos": 16910155895040.0, "grad_norm": 2.3521210355766824, "language_loss": 0.62255698, "learning_rate": 2.4479706378073323e-06, "loss": 0.64622372, "num_input_tokens_seen": 79773515, "step": 3703, "time_per_iteration": 2.702260971069336 }, { "auxiliary_loss_clip": 0.01332781, "auxiliary_loss_mlp": 0.01024545, "balance_loss_clip": 1.04517436, "balance_loss_mlp": 1.01713586, "epoch": 0.44537966692719294, "flos": 23258838994560.0, "grad_norm": 3.6008378175362146, "language_loss": 0.83900249, "learning_rate": 2.447211423908623e-06, "loss": 0.86257577, "num_input_tokens_seen": 79793560, "step": 3704, "time_per_iteration": 2.78127121925354 }, { "auxiliary_loss_clip": 0.0124196, "auxiliary_loss_mlp": 0.01023266, "balance_loss_clip": 1.05199814, "balance_loss_mlp": 1.01573253, "epoch": 0.445499909817832, "flos": 21724160457600.0, "grad_norm": 2.5810667340950846, "language_loss": 0.75063103, "learning_rate": 2.4464521421682966e-06, "loss": 0.7732833, "num_input_tokens_seen": 79811150, "step": 3705, "time_per_iteration": 2.6940622329711914 }, { "auxiliary_loss_clip": 0.01234643, "auxiliary_loss_mlp": 0.01021816, "balance_loss_clip": 1.05329978, "balance_loss_mlp": 1.01487541, "epoch": 0.4456201527084711, "flos": 23987969170560.0, "grad_norm": 1.3760451228982526, "language_loss": 0.87751299, "learning_rate": 2.4456927927015345e-06, "loss": 0.90007758, "num_input_tokens_seen": 79832190, "step": 3706, "time_per_iteration": 3.5958380699157715 }, { "auxiliary_loss_clip": 0.01296013, "auxiliary_loss_mlp": 0.01030456, "balance_loss_clip": 1.05374384, "balance_loss_mlp": 1.02262425, "epoch": 0.4457403955991102, "flos": 18807065136000.0, "grad_norm": 2.097516071621075, "language_loss": 0.76086938, "learning_rate": 2.4449333756235307e-06, "loss": 0.78413409, "num_input_tokens_seen": 79848905, "step": 3707, "time_per_iteration": 2.6564993858337402 }, { "auxiliary_loss_clip": 0.01147771, "auxiliary_loss_mlp": 0.01027125, "balance_loss_clip": 1.05492282, "balance_loss_mlp": 1.0196656, "epoch": 0.4458606384897493, "flos": 19207756327680.0, "grad_norm": 4.385392629449563, "language_loss": 0.79401582, "learning_rate": 2.4441738910494876e-06, "loss": 0.81576478, "num_input_tokens_seen": 79863640, "step": 3708, "time_per_iteration": 3.59679913520813 }, { "auxiliary_loss_clip": 0.01294464, "auxiliary_loss_mlp": 0.01030109, "balance_loss_clip": 1.04873812, "balance_loss_mlp": 1.02202666, "epoch": 0.4459808813803884, "flos": 21361283308800.0, "grad_norm": 1.9019144346155867, "language_loss": 0.82293272, "learning_rate": 2.4434143390946176e-06, "loss": 0.84617841, "num_input_tokens_seen": 79882450, "step": 3709, "time_per_iteration": 2.6975271701812744 }, { "auxiliary_loss_clip": 0.01332791, "auxiliary_loss_mlp": 0.01030503, "balance_loss_clip": 1.04926145, "balance_loss_mlp": 1.02262294, "epoch": 0.4461011242710275, "flos": 23288967527040.0, "grad_norm": 2.4009411949666437, "language_loss": 0.85677922, "learning_rate": 2.4426547198741457e-06, "loss": 0.8804121, "num_input_tokens_seen": 79900655, "step": 3710, "time_per_iteration": 2.7406575679779053 }, { "auxiliary_loss_clip": 0.01388968, "auxiliary_loss_mlp": 0.01028017, "balance_loss_clip": 1.05263972, "balance_loss_mlp": 1.02063751, "epoch": 0.44622136716166655, "flos": 20193001453440.0, "grad_norm": 2.3604504946926452, "language_loss": 0.74464369, "learning_rate": 2.441895033503305e-06, "loss": 0.76881355, "num_input_tokens_seen": 79918575, "step": 3711, "time_per_iteration": 2.691481828689575 }, { "auxiliary_loss_clip": 0.01240086, "auxiliary_loss_mlp": 0.01027682, "balance_loss_clip": 1.05272925, "balance_loss_mlp": 1.01921248, "epoch": 0.44634161005230566, "flos": 21283033530240.0, "grad_norm": 1.7615770687975116, "language_loss": 0.82123739, "learning_rate": 2.4411352800973375e-06, "loss": 0.8439151, "num_input_tokens_seen": 79937010, "step": 3712, "time_per_iteration": 2.676727294921875 }, { "auxiliary_loss_clip": 0.01335839, "auxiliary_loss_mlp": 0.01028496, "balance_loss_clip": 1.0479362, "balance_loss_mlp": 1.02005053, "epoch": 0.44646185294294477, "flos": 22929358515840.0, "grad_norm": 2.459165516222412, "language_loss": 0.75003099, "learning_rate": 2.4403754597715005e-06, "loss": 0.77367431, "num_input_tokens_seen": 79956455, "step": 3713, "time_per_iteration": 2.7106218338012695 }, { "auxiliary_loss_clip": 0.01293393, "auxiliary_loss_mlp": 0.01028578, "balance_loss_clip": 1.04811358, "balance_loss_mlp": 1.02048361, "epoch": 0.4465820958335838, "flos": 22637692080000.0, "grad_norm": 7.602396509808295, "language_loss": 0.93315625, "learning_rate": 2.4396155726410553e-06, "loss": 0.95637596, "num_input_tokens_seen": 79975065, "step": 3714, "time_per_iteration": 2.743741512298584 }, { "auxiliary_loss_clip": 0.01149275, "auxiliary_loss_mlp": 0.01028577, "balance_loss_clip": 1.05221891, "balance_loss_mlp": 1.02038193, "epoch": 0.44670233872422294, "flos": 22672525294080.0, "grad_norm": 2.559781605195445, "language_loss": 0.9080739, "learning_rate": 2.438855618821278e-06, "loss": 0.92985243, "num_input_tokens_seen": 79990865, "step": 3715, "time_per_iteration": 2.645972967147827 }, { "auxiliary_loss_clip": 0.01235582, "auxiliary_loss_mlp": 0.01030247, "balance_loss_clip": 1.04982448, "balance_loss_mlp": 1.02296972, "epoch": 0.44682258161486205, "flos": 23582178247680.0, "grad_norm": 2.1279969550035105, "language_loss": 0.67548454, "learning_rate": 2.4380955984274517e-06, "loss": 0.69814289, "num_input_tokens_seen": 80009520, "step": 3716, "time_per_iteration": 2.6464011669158936 }, { "auxiliary_loss_clip": 0.01242152, "auxiliary_loss_mlp": 0.01030693, "balance_loss_clip": 1.05128515, "balance_loss_mlp": 1.02278948, "epoch": 0.4469428245055011, "flos": 26501356558080.0, "grad_norm": 2.9328385849130942, "language_loss": 0.77010018, "learning_rate": 2.4373355115748716e-06, "loss": 0.79282862, "num_input_tokens_seen": 80030350, "step": 3717, "time_per_iteration": 2.6546247005462646 }, { "auxiliary_loss_clip": 0.01284948, "auxiliary_loss_mlp": 0.01025573, "balance_loss_clip": 1.04979134, "balance_loss_mlp": 1.01831007, "epoch": 0.4470630673961402, "flos": 21504925797120.0, "grad_norm": 1.8423883595885777, "language_loss": 0.72192341, "learning_rate": 2.436575358378842e-06, "loss": 0.74502861, "num_input_tokens_seen": 80049840, "step": 3718, "time_per_iteration": 2.702970027923584 }, { "auxiliary_loss_clip": 0.01294114, "auxiliary_loss_mlp": 0.010314, "balance_loss_clip": 1.05216324, "balance_loss_mlp": 1.02415764, "epoch": 0.44718331028677927, "flos": 16173986653440.0, "grad_norm": 7.562878547212082, "language_loss": 0.82822949, "learning_rate": 2.4358151389546782e-06, "loss": 0.85148466, "num_input_tokens_seen": 80066525, "step": 3719, "time_per_iteration": 2.644186019897461 }, { "auxiliary_loss_clip": 0.01193138, "auxiliary_loss_mlp": 0.01030646, "balance_loss_clip": 1.05595374, "balance_loss_mlp": 1.02249169, "epoch": 0.4473035531774184, "flos": 19681238430720.0, "grad_norm": 3.4924914284154545, "language_loss": 0.76343095, "learning_rate": 2.4350548534177035e-06, "loss": 0.78566885, "num_input_tokens_seen": 80083355, "step": 3720, "time_per_iteration": 2.6119513511657715 }, { "auxiliary_loss_clip": 0.01340161, "auxiliary_loss_mlp": 0.01030552, "balance_loss_clip": 1.04995191, "balance_loss_mlp": 1.02310765, "epoch": 0.4474237960680575, "flos": 41427590515200.0, "grad_norm": 1.7113879772095146, "language_loss": 0.66618979, "learning_rate": 2.434294501883254e-06, "loss": 0.68989694, "num_input_tokens_seen": 80106450, "step": 3721, "time_per_iteration": 2.9217934608459473 }, { "auxiliary_loss_clip": 0.01286668, "auxiliary_loss_mlp": 0.010256, "balance_loss_clip": 1.0491668, "balance_loss_mlp": 1.01843858, "epoch": 0.44754403895869654, "flos": 22891328991360.0, "grad_norm": 1.783134079730421, "language_loss": 0.65695965, "learning_rate": 2.433534084466674e-06, "loss": 0.68008232, "num_input_tokens_seen": 80125670, "step": 3722, "time_per_iteration": 2.6394989490509033 }, { "auxiliary_loss_clip": 0.0118811, "auxiliary_loss_mlp": 0.01024636, "balance_loss_clip": 1.05409729, "balance_loss_mlp": 1.01733482, "epoch": 0.44766428184933565, "flos": 25630271832960.0, "grad_norm": 1.4284424971931367, "language_loss": 0.70918661, "learning_rate": 2.4327736012833178e-06, "loss": 0.73131412, "num_input_tokens_seen": 80147390, "step": 3723, "time_per_iteration": 2.6845009326934814 }, { "auxiliary_loss_clip": 0.01241397, "auxiliary_loss_mlp": 0.01032005, "balance_loss_clip": 1.05439556, "balance_loss_mlp": 1.02439928, "epoch": 0.44778452473997477, "flos": 20448972748800.0, "grad_norm": 2.0754534051987648, "language_loss": 0.76696777, "learning_rate": 2.4320130524485506e-06, "loss": 0.78970176, "num_input_tokens_seen": 80166185, "step": 3724, "time_per_iteration": 2.6409871578216553 }, { "auxiliary_loss_clip": 0.01291971, "auxiliary_loss_mlp": 0.01034345, "balance_loss_clip": 1.05943751, "balance_loss_mlp": 1.02699518, "epoch": 0.4479047676306138, "flos": 21975462984960.0, "grad_norm": 1.7697018722845022, "language_loss": 0.7963953, "learning_rate": 2.431252438077746e-06, "loss": 0.8196584, "num_input_tokens_seen": 80185685, "step": 3725, "time_per_iteration": 3.630763053894043 }, { "auxiliary_loss_clip": 0.01244955, "auxiliary_loss_mlp": 0.02572192, "balance_loss_clip": 1.05243397, "balance_loss_mlp": 1.00029027, "epoch": 0.44802501052125293, "flos": 21467219495040.0, "grad_norm": 2.2355415896932214, "language_loss": 0.77575248, "learning_rate": 2.4304917582862906e-06, "loss": 0.81392395, "num_input_tokens_seen": 80204865, "step": 3726, "time_per_iteration": 2.654783010482788 }, { "auxiliary_loss_clip": 0.01190495, "auxiliary_loss_mlp": 0.01024386, "balance_loss_clip": 1.0543232, "balance_loss_mlp": 1.01698935, "epoch": 0.44814525341189204, "flos": 22126970551680.0, "grad_norm": 2.004255668168773, "language_loss": 0.88026631, "learning_rate": 2.4297310131895774e-06, "loss": 0.9024151, "num_input_tokens_seen": 80223410, "step": 3727, "time_per_iteration": 2.6098766326904297 }, { "auxiliary_loss_clip": 0.01238964, "auxiliary_loss_mlp": 0.01030963, "balance_loss_clip": 1.05191779, "balance_loss_mlp": 1.02343535, "epoch": 0.4482654963025311, "flos": 16653933204480.0, "grad_norm": 2.126159557469395, "language_loss": 0.75374532, "learning_rate": 2.4289702029030113e-06, "loss": 0.77644467, "num_input_tokens_seen": 80240880, "step": 3728, "time_per_iteration": 3.5250134468078613 }, { "auxiliary_loss_clip": 0.01241402, "auxiliary_loss_mlp": 0.01027456, "balance_loss_clip": 1.05515909, "balance_loss_mlp": 1.01923084, "epoch": 0.4483857391931702, "flos": 18841251905280.0, "grad_norm": 2.1522964378988676, "language_loss": 0.82931137, "learning_rate": 2.4282093275420057e-06, "loss": 0.85199994, "num_input_tokens_seen": 80259910, "step": 3729, "time_per_iteration": 2.6070189476013184 }, { "auxiliary_loss_clip": 0.01147564, "auxiliary_loss_mlp": 0.01029006, "balance_loss_clip": 1.05336916, "balance_loss_mlp": 1.02144837, "epoch": 0.4485059820838093, "flos": 20372590477440.0, "grad_norm": 2.0605486830990305, "language_loss": 0.70365649, "learning_rate": 2.4274483872219863e-06, "loss": 0.7254222, "num_input_tokens_seen": 80277270, "step": 3730, "time_per_iteration": 2.6517491340637207 }, { "auxiliary_loss_clip": 0.0123842, "auxiliary_loss_mlp": 0.01032909, "balance_loss_clip": 1.05215263, "balance_loss_mlp": 1.02541363, "epoch": 0.4486262249744484, "flos": 20047742853120.0, "grad_norm": 2.0805774134270507, "language_loss": 0.93463743, "learning_rate": 2.426687382058386e-06, "loss": 0.95735067, "num_input_tokens_seen": 80295550, "step": 3731, "time_per_iteration": 2.6423158645629883 }, { "auxiliary_loss_clip": 0.01134886, "auxiliary_loss_mlp": 0.00999598, "balance_loss_clip": 1.0219419, "balance_loss_mlp": 0.99848986, "epoch": 0.4487464678650875, "flos": 64595684776320.0, "grad_norm": 0.8610757040144524, "language_loss": 0.59829783, "learning_rate": 2.425926312166649e-06, "loss": 0.61964273, "num_input_tokens_seen": 80348425, "step": 3732, "time_per_iteration": 3.946404218673706 }, { "auxiliary_loss_clip": 0.01296165, "auxiliary_loss_mlp": 0.01032419, "balance_loss_clip": 1.05412507, "balance_loss_mlp": 1.02414, "epoch": 0.4488667107557266, "flos": 20769798049920.0, "grad_norm": 3.322130977112531, "language_loss": 0.73114991, "learning_rate": 2.42516517766223e-06, "loss": 0.75443578, "num_input_tokens_seen": 80366505, "step": 3733, "time_per_iteration": 2.8545007705688477 }, { "auxiliary_loss_clip": 0.01190746, "auxiliary_loss_mlp": 0.01032228, "balance_loss_clip": 1.05646288, "balance_loss_mlp": 1.02483988, "epoch": 0.44898695364636565, "flos": 23951735326080.0, "grad_norm": 2.1078443529155386, "language_loss": 0.68077314, "learning_rate": 2.4244039786605907e-06, "loss": 0.70300281, "num_input_tokens_seen": 80387510, "step": 3734, "time_per_iteration": 2.628124952316284 }, { "auxiliary_loss_clip": 0.01395806, "auxiliary_loss_mlp": 0.01027268, "balance_loss_clip": 1.04957414, "balance_loss_mlp": 1.01934314, "epoch": 0.44910719653700476, "flos": 18624351628800.0, "grad_norm": 2.727985663337072, "language_loss": 0.82495773, "learning_rate": 2.4236427152772055e-06, "loss": 0.84918839, "num_input_tokens_seen": 80405915, "step": 3735, "time_per_iteration": 3.6828114986419678 }, { "auxiliary_loss_clip": 0.01229049, "auxiliary_loss_mlp": 0.01005259, "balance_loss_clip": 1.02051556, "balance_loss_mlp": 1.0041678, "epoch": 0.4492274394276438, "flos": 57033435749760.0, "grad_norm": 0.8424922756011921, "language_loss": 0.57366902, "learning_rate": 2.422881387627557e-06, "loss": 0.59601206, "num_input_tokens_seen": 80458365, "step": 3736, "time_per_iteration": 2.948422908782959 }, { "auxiliary_loss_clip": 0.01196525, "auxiliary_loss_mlp": 0.01030285, "balance_loss_clip": 1.05260074, "balance_loss_mlp": 1.02288806, "epoch": 0.4493476823182829, "flos": 23254888498560.0, "grad_norm": 1.6175395854326855, "language_loss": 0.77591336, "learning_rate": 2.422119995827139e-06, "loss": 0.79818141, "num_input_tokens_seen": 80478490, "step": 3737, "time_per_iteration": 2.7915284633636475 }, { "auxiliary_loss_clip": 0.01150937, "auxiliary_loss_mlp": 0.01032159, "balance_loss_clip": 1.05635905, "balance_loss_mlp": 1.02430344, "epoch": 0.44946792520892204, "flos": 15815131827840.0, "grad_norm": 3.1596265851656717, "language_loss": 0.73565793, "learning_rate": 2.4213585399914528e-06, "loss": 0.75748897, "num_input_tokens_seen": 80495695, "step": 3738, "time_per_iteration": 2.652933120727539 }, { "auxiliary_loss_clip": 0.01236015, "auxiliary_loss_mlp": 0.01028034, "balance_loss_clip": 1.05277944, "balance_loss_mlp": 1.02045226, "epoch": 0.4495881680995611, "flos": 19610063631360.0, "grad_norm": 1.8780888002989298, "language_loss": 0.85373259, "learning_rate": 2.4205970202360113e-06, "loss": 0.87637311, "num_input_tokens_seen": 80515260, "step": 3739, "time_per_iteration": 2.625178098678589 }, { "auxiliary_loss_clip": 0.01382655, "auxiliary_loss_mlp": 0.01025631, "balance_loss_clip": 1.04756999, "balance_loss_mlp": 1.01803792, "epoch": 0.4497084109902002, "flos": 26031465815040.0, "grad_norm": 2.4043587564760354, "language_loss": 0.78227782, "learning_rate": 2.4198354366763354e-06, "loss": 0.80636072, "num_input_tokens_seen": 80533900, "step": 3740, "time_per_iteration": 2.770158052444458 }, { "auxiliary_loss_clip": 0.01291345, "auxiliary_loss_mlp": 0.01028851, "balance_loss_clip": 1.05092001, "balance_loss_mlp": 1.02100086, "epoch": 0.4498286538808393, "flos": 14793688771200.0, "grad_norm": 2.214545018096871, "language_loss": 0.78434503, "learning_rate": 2.4190737894279587e-06, "loss": 0.80754697, "num_input_tokens_seen": 80551270, "step": 3741, "time_per_iteration": 2.627232789993286 }, { "auxiliary_loss_clip": 0.01330592, "auxiliary_loss_mlp": 0.01028009, "balance_loss_clip": 1.04283929, "balance_loss_mlp": 1.02076077, "epoch": 0.44994889677147837, "flos": 15450171690240.0, "grad_norm": 3.2609682076343334, "language_loss": 0.80605197, "learning_rate": 2.4183120786064203e-06, "loss": 0.829638, "num_input_tokens_seen": 80568145, "step": 3742, "time_per_iteration": 2.7067580223083496 }, { "auxiliary_loss_clip": 0.01240688, "auxiliary_loss_mlp": 0.02572512, "balance_loss_clip": 1.05456722, "balance_loss_mlp": 1.00026679, "epoch": 0.4500691396621175, "flos": 21798316085760.0, "grad_norm": 2.3274356843168564, "language_loss": 0.85519111, "learning_rate": 2.417550304327273e-06, "loss": 0.89332312, "num_input_tokens_seen": 80586185, "step": 3743, "time_per_iteration": 2.6546127796173096 }, { "auxiliary_loss_clip": 0.01195008, "auxiliary_loss_mlp": 0.01026791, "balance_loss_clip": 1.05634916, "balance_loss_mlp": 1.01809502, "epoch": 0.4501893825527566, "flos": 32382016421760.0, "grad_norm": 3.6757397910318397, "language_loss": 0.75987935, "learning_rate": 2.4167884667060763e-06, "loss": 0.7820974, "num_input_tokens_seen": 80608895, "step": 3744, "time_per_iteration": 2.728443145751953 }, { "auxiliary_loss_clip": 0.01295748, "auxiliary_loss_mlp": 0.01027197, "balance_loss_clip": 1.05213737, "balance_loss_mlp": 1.01819658, "epoch": 0.45030962544339564, "flos": 16544944362240.0, "grad_norm": 2.2534084406764796, "language_loss": 0.87413067, "learning_rate": 2.4160265658584e-06, "loss": 0.89736009, "num_input_tokens_seen": 80623785, "step": 3745, "time_per_iteration": 2.6248879432678223 }, { "auxiliary_loss_clip": 0.01246317, "auxiliary_loss_mlp": 0.01028728, "balance_loss_clip": 1.05477238, "balance_loss_mlp": 1.02109218, "epoch": 0.45042986833403476, "flos": 19573039687680.0, "grad_norm": 2.4001490442870046, "language_loss": 0.68589705, "learning_rate": 2.4152646018998253e-06, "loss": 0.70864749, "num_input_tokens_seen": 80642735, "step": 3746, "time_per_iteration": 2.63167142868042 }, { "auxiliary_loss_clip": 0.01234419, "auxiliary_loss_mlp": 0.01027239, "balance_loss_clip": 1.05173075, "balance_loss_mlp": 1.01883531, "epoch": 0.45055011122467387, "flos": 23112467072640.0, "grad_norm": 1.6317378981776416, "language_loss": 0.71991462, "learning_rate": 2.4145025749459403e-06, "loss": 0.74253118, "num_input_tokens_seen": 80663760, "step": 3747, "time_per_iteration": 2.6444029808044434 }, { "auxiliary_loss_clip": 0.01493731, "auxiliary_loss_mlp": 0.01031818, "balance_loss_clip": 1.05128551, "balance_loss_mlp": 1.02408147, "epoch": 0.4506703541153129, "flos": 19934623946880.0, "grad_norm": 1.843726805120693, "language_loss": 0.70123398, "learning_rate": 2.413740485112344e-06, "loss": 0.72648942, "num_input_tokens_seen": 80682100, "step": 3748, "time_per_iteration": 2.8397233486175537 }, { "auxiliary_loss_clip": 0.01289797, "auxiliary_loss_mlp": 0.01030825, "balance_loss_clip": 1.05613923, "balance_loss_mlp": 1.02320719, "epoch": 0.45079059700595203, "flos": 19499530504320.0, "grad_norm": 2.119312321412857, "language_loss": 0.82376075, "learning_rate": 2.412978332514646e-06, "loss": 0.84696692, "num_input_tokens_seen": 80700880, "step": 3749, "time_per_iteration": 2.8666911125183105 }, { "auxiliary_loss_clip": 0.01294305, "auxiliary_loss_mlp": 0.01025998, "balance_loss_clip": 1.05308676, "balance_loss_mlp": 1.0180583, "epoch": 0.4509108398965911, "flos": 27636313570560.0, "grad_norm": 1.9738335627143062, "language_loss": 0.72519988, "learning_rate": 2.4122161172684623e-06, "loss": 0.74840289, "num_input_tokens_seen": 80721675, "step": 3750, "time_per_iteration": 4.1330931186676025 }, { "auxiliary_loss_clip": 0.01197249, "auxiliary_loss_mlp": 0.01023857, "balance_loss_clip": 1.05304468, "balance_loss_mlp": 1.01598287, "epoch": 0.4510310827872302, "flos": 20995712640000.0, "grad_norm": 11.158845244137302, "language_loss": 0.84432101, "learning_rate": 2.4114538394894216e-06, "loss": 0.86653209, "num_input_tokens_seen": 80739315, "step": 3751, "time_per_iteration": 2.780290126800537 }, { "auxiliary_loss_clip": 0.01287173, "auxiliary_loss_mlp": 0.01025836, "balance_loss_clip": 1.04719615, "balance_loss_mlp": 1.01845694, "epoch": 0.4511513256778693, "flos": 16216684945920.0, "grad_norm": 1.8320710319742433, "language_loss": 0.83035618, "learning_rate": 2.410691499293161e-06, "loss": 0.85348624, "num_input_tokens_seen": 80757470, "step": 3752, "time_per_iteration": 2.6646435260772705 }, { "auxiliary_loss_clip": 0.01240296, "auxiliary_loss_mlp": 0.01021993, "balance_loss_clip": 1.05361485, "balance_loss_mlp": 1.01454282, "epoch": 0.45127156856850836, "flos": 25186702780800.0, "grad_norm": 2.0072652166886678, "language_loss": 0.74380028, "learning_rate": 2.409929096795326e-06, "loss": 0.76642323, "num_input_tokens_seen": 80777840, "step": 3753, "time_per_iteration": 2.7151083946228027 }, { "auxiliary_loss_clip": 0.01238498, "auxiliary_loss_mlp": 0.01034289, "balance_loss_clip": 1.04946828, "balance_loss_mlp": 1.02646899, "epoch": 0.4513918114591475, "flos": 20412523422720.0, "grad_norm": 1.9641472131863893, "language_loss": 0.79249746, "learning_rate": 2.409166632111573e-06, "loss": 0.8152253, "num_input_tokens_seen": 80795975, "step": 3754, "time_per_iteration": 3.5687038898468018 }, { "auxiliary_loss_clip": 0.01247629, "auxiliary_loss_mlp": 0.01027031, "balance_loss_clip": 1.05341995, "balance_loss_mlp": 1.01916325, "epoch": 0.4515120543497866, "flos": 26648482665600.0, "grad_norm": 2.408056542237496, "language_loss": 0.80849463, "learning_rate": 2.4084041053575674e-06, "loss": 0.83124125, "num_input_tokens_seen": 80815395, "step": 3755, "time_per_iteration": 2.6511526107788086 }, { "auxiliary_loss_clip": 0.01200548, "auxiliary_loss_mlp": 0.01022082, "balance_loss_clip": 1.05422103, "balance_loss_mlp": 1.01439571, "epoch": 0.45163229724042564, "flos": 20595093275520.0, "grad_norm": 1.9625880814995473, "language_loss": 0.72051525, "learning_rate": 2.4076415166489834e-06, "loss": 0.74274153, "num_input_tokens_seen": 80834805, "step": 3756, "time_per_iteration": 2.717254638671875 }, { "auxiliary_loss_clip": 0.01299459, "auxiliary_loss_mlp": 0.01026354, "balance_loss_clip": 1.05069959, "balance_loss_mlp": 1.01937485, "epoch": 0.45175254013106475, "flos": 21689004021120.0, "grad_norm": 2.1091342424594206, "language_loss": 0.7922554, "learning_rate": 2.406878866101506e-06, "loss": 0.81551349, "num_input_tokens_seen": 80853770, "step": 3757, "time_per_iteration": 2.745755910873413 }, { "auxiliary_loss_clip": 0.01193139, "auxiliary_loss_mlp": 0.0103019, "balance_loss_clip": 1.05760837, "balance_loss_mlp": 1.0222919, "epoch": 0.45187278302170386, "flos": 18878850466560.0, "grad_norm": 5.228285237625514, "language_loss": 0.78506172, "learning_rate": 2.4061161538308273e-06, "loss": 0.80729508, "num_input_tokens_seen": 80870615, "step": 3758, "time_per_iteration": 3.465075969696045 }, { "auxiliary_loss_clip": 0.01239167, "auxiliary_loss_mlp": 0.01031712, "balance_loss_clip": 1.0539279, "balance_loss_mlp": 1.02395153, "epoch": 0.4519930259123429, "flos": 18582479349120.0, "grad_norm": 1.7994305621921198, "language_loss": 0.89218819, "learning_rate": 2.4053533799526523e-06, "loss": 0.91489697, "num_input_tokens_seen": 80886335, "step": 3759, "time_per_iteration": 2.665757894515991 }, { "auxiliary_loss_clip": 0.01280344, "auxiliary_loss_mlp": 0.01023454, "balance_loss_clip": 1.05118561, "balance_loss_mlp": 1.01603031, "epoch": 0.452113268802982, "flos": 25192377129600.0, "grad_norm": 1.9760134220883396, "language_loss": 0.86507332, "learning_rate": 2.404590544582691e-06, "loss": 0.88811123, "num_input_tokens_seen": 80904570, "step": 3760, "time_per_iteration": 2.710015296936035 }, { "auxiliary_loss_clip": 0.0139368, "auxiliary_loss_mlp": 0.01028552, "balance_loss_clip": 1.04398036, "balance_loss_mlp": 1.02108097, "epoch": 0.45223351169362114, "flos": 39378922312320.0, "grad_norm": 1.6337276316873381, "language_loss": 0.81255722, "learning_rate": 2.403827647836666e-06, "loss": 0.83677959, "num_input_tokens_seen": 80925125, "step": 3761, "time_per_iteration": 3.8277535438537598 }, { "auxiliary_loss_clip": 0.01193956, "auxiliary_loss_mlp": 0.01026179, "balance_loss_clip": 1.05554533, "balance_loss_mlp": 1.01867437, "epoch": 0.4523537545842602, "flos": 21582169994880.0, "grad_norm": 2.8573908355479145, "language_loss": 0.69340968, "learning_rate": 2.4030646898303075e-06, "loss": 0.71561098, "num_input_tokens_seen": 80946615, "step": 3762, "time_per_iteration": 2.5686209201812744 }, { "auxiliary_loss_clip": 0.01295246, "auxiliary_loss_mlp": 0.01029737, "balance_loss_clip": 1.05134761, "balance_loss_mlp": 1.02177942, "epoch": 0.4524739974748993, "flos": 28439527547520.0, "grad_norm": 2.2930931055239907, "language_loss": 0.81911272, "learning_rate": 2.4023016706793566e-06, "loss": 0.84236252, "num_input_tokens_seen": 80966410, "step": 3763, "time_per_iteration": 2.773864269256592 }, { "auxiliary_loss_clip": 0.01247756, "auxiliary_loss_mlp": 0.01002805, "balance_loss_clip": 1.01754284, "balance_loss_mlp": 1.00160146, "epoch": 0.4525942403655384, "flos": 61556492148480.0, "grad_norm": 0.7757053055278016, "language_loss": 0.56829971, "learning_rate": 2.401538590499561e-06, "loss": 0.59080535, "num_input_tokens_seen": 81026865, "step": 3764, "time_per_iteration": 3.32999849319458 }, { "auxiliary_loss_clip": 0.01245086, "auxiliary_loss_mlp": 0.02574962, "balance_loss_clip": 1.05427337, "balance_loss_mlp": 1.00023282, "epoch": 0.45271448325617747, "flos": 27529838680320.0, "grad_norm": 2.6318939199140345, "language_loss": 0.71916378, "learning_rate": 2.400775449406682e-06, "loss": 0.75736427, "num_input_tokens_seen": 81050060, "step": 3765, "time_per_iteration": 2.7250149250030518 }, { "auxiliary_loss_clip": 0.01240035, "auxiliary_loss_mlp": 0.01027003, "balance_loss_clip": 1.05167437, "balance_loss_mlp": 1.01952839, "epoch": 0.4528347261468166, "flos": 22452608275200.0, "grad_norm": 1.7769371975097383, "language_loss": 0.73263663, "learning_rate": 2.400012247516485e-06, "loss": 0.75530702, "num_input_tokens_seen": 81070625, "step": 3766, "time_per_iteration": 2.6692867279052734 }, { "auxiliary_loss_clip": 0.01343721, "auxiliary_loss_mlp": 0.01038278, "balance_loss_clip": 1.05039597, "balance_loss_mlp": 1.0302496, "epoch": 0.45295496903745563, "flos": 21103875469440.0, "grad_norm": 1.696614396341837, "language_loss": 0.90295804, "learning_rate": 2.3992489849447484e-06, "loss": 0.92677808, "num_input_tokens_seen": 81089080, "step": 3767, "time_per_iteration": 2.680875539779663 }, { "auxiliary_loss_clip": 0.01247192, "auxiliary_loss_mlp": 0.01027086, "balance_loss_clip": 1.05208731, "balance_loss_mlp": 1.01947474, "epoch": 0.45307521192809475, "flos": 23221168606080.0, "grad_norm": 1.606540719140495, "language_loss": 0.78940904, "learning_rate": 2.3984856618072584e-06, "loss": 0.81215179, "num_input_tokens_seen": 81109115, "step": 3768, "time_per_iteration": 2.6931560039520264 }, { "auxiliary_loss_clip": 0.0134466, "auxiliary_loss_mlp": 0.01028719, "balance_loss_clip": 1.05013061, "balance_loss_mlp": 1.02110755, "epoch": 0.45319545481873386, "flos": 15560094286080.0, "grad_norm": 1.9719938210876449, "language_loss": 0.74178606, "learning_rate": 2.3977222782198098e-06, "loss": 0.76551986, "num_input_tokens_seen": 81127750, "step": 3769, "time_per_iteration": 2.694934368133545 }, { "auxiliary_loss_clip": 0.01340897, "auxiliary_loss_mlp": 0.01031013, "balance_loss_clip": 1.05171669, "balance_loss_mlp": 1.02297258, "epoch": 0.4533156977093729, "flos": 21944759834880.0, "grad_norm": 2.3363946334963344, "language_loss": 0.75187653, "learning_rate": 2.3969588342982077e-06, "loss": 0.77559566, "num_input_tokens_seen": 81147125, "step": 3770, "time_per_iteration": 2.7590036392211914 }, { "auxiliary_loss_clip": 0.01238487, "auxiliary_loss_mlp": 0.01031703, "balance_loss_clip": 1.05597019, "balance_loss_mlp": 1.02378178, "epoch": 0.453435940600012, "flos": 24242180699520.0, "grad_norm": 2.0435823618122733, "language_loss": 0.73015046, "learning_rate": 2.396195330158267e-06, "loss": 0.75285232, "num_input_tokens_seen": 81167015, "step": 3771, "time_per_iteration": 2.6724274158477783 }, { "auxiliary_loss_clip": 0.01191082, "auxiliary_loss_mlp": 0.01028062, "balance_loss_clip": 1.05421352, "balance_loss_mlp": 1.01983023, "epoch": 0.45355618349065113, "flos": 23440367352960.0, "grad_norm": 1.906045884705502, "language_loss": 0.79778373, "learning_rate": 2.3954317659158094e-06, "loss": 0.81997514, "num_input_tokens_seen": 81187350, "step": 3772, "time_per_iteration": 2.6691017150878906 }, { "auxiliary_loss_clip": 0.01082304, "auxiliary_loss_mlp": 0.00999525, "balance_loss_clip": 1.01901031, "balance_loss_mlp": 0.99845171, "epoch": 0.4536764263812902, "flos": 66903161448960.0, "grad_norm": 0.8862425267990435, "language_loss": 0.56923795, "learning_rate": 2.394668141686667e-06, "loss": 0.59005618, "num_input_tokens_seen": 81249315, "step": 3773, "time_per_iteration": 3.270596742630005 }, { "auxiliary_loss_clip": 0.01239773, "auxiliary_loss_mlp": 0.01034375, "balance_loss_clip": 1.05283093, "balance_loss_mlp": 1.02689517, "epoch": 0.4537966692719293, "flos": 42739766254080.0, "grad_norm": 2.1108735666060894, "language_loss": 0.69795978, "learning_rate": 2.3939044575866813e-06, "loss": 0.72070134, "num_input_tokens_seen": 81272065, "step": 3774, "time_per_iteration": 2.843432664871216 }, { "auxiliary_loss_clip": 0.01289054, "auxiliary_loss_mlp": 0.02571807, "balance_loss_clip": 1.05097246, "balance_loss_mlp": 1.00015533, "epoch": 0.4539169121625684, "flos": 35549480517120.0, "grad_norm": 2.162069015810933, "language_loss": 0.75345063, "learning_rate": 2.3931407137317024e-06, "loss": 0.7920593, "num_input_tokens_seen": 81292220, "step": 3775, "time_per_iteration": 2.8259613513946533 }, { "auxiliary_loss_clip": 0.01241973, "auxiliary_loss_mlp": 0.01029348, "balance_loss_clip": 1.04690516, "balance_loss_mlp": 1.02187347, "epoch": 0.45403715505320746, "flos": 18514716341760.0, "grad_norm": 1.9653931279219832, "language_loss": 0.84858596, "learning_rate": 2.3923769102375907e-06, "loss": 0.87129915, "num_input_tokens_seen": 81311085, "step": 3776, "time_per_iteration": 3.652050495147705 }, { "auxiliary_loss_clip": 0.01346026, "auxiliary_loss_mlp": 0.01028336, "balance_loss_clip": 1.05186093, "balance_loss_mlp": 1.02016425, "epoch": 0.4541573979438466, "flos": 25045825639680.0, "grad_norm": 2.3976704416712358, "language_loss": 0.78881907, "learning_rate": 2.391613047220213e-06, "loss": 0.81256258, "num_input_tokens_seen": 81330985, "step": 3777, "time_per_iteration": 2.806893825531006 }, { "auxiliary_loss_clip": 0.01299572, "auxiliary_loss_mlp": 0.01033196, "balance_loss_clip": 1.04989171, "balance_loss_mlp": 1.02552533, "epoch": 0.4542776408344857, "flos": 18332397884160.0, "grad_norm": 1.82200292235073, "language_loss": 0.79150575, "learning_rate": 2.390849124795447e-06, "loss": 0.8148334, "num_input_tokens_seen": 81346985, "step": 3778, "time_per_iteration": 2.785508871078491 }, { "auxiliary_loss_clip": 0.0119102, "auxiliary_loss_mlp": 0.01028628, "balance_loss_clip": 1.05545187, "balance_loss_mlp": 1.02099824, "epoch": 0.45439788372512474, "flos": 20701173116160.0, "grad_norm": 2.4708814174936817, "language_loss": 0.84064639, "learning_rate": 2.3900851430791804e-06, "loss": 0.86284292, "num_input_tokens_seen": 81365005, "step": 3779, "time_per_iteration": 2.6401987075805664 }, { "auxiliary_loss_clip": 0.01195226, "auxiliary_loss_mlp": 0.0103863, "balance_loss_clip": 1.05603218, "balance_loss_mlp": 1.02990437, "epoch": 0.45451812661576385, "flos": 22309432663680.0, "grad_norm": 2.0466586828471334, "language_loss": 0.84908569, "learning_rate": 2.389321102187307e-06, "loss": 0.87142432, "num_input_tokens_seen": 81383785, "step": 3780, "time_per_iteration": 3.6507809162139893 }, { "auxiliary_loss_clip": 0.01288145, "auxiliary_loss_mlp": 0.02574227, "balance_loss_clip": 1.05324721, "balance_loss_mlp": 1.00027108, "epoch": 0.4546383695064029, "flos": 21763303303680.0, "grad_norm": 1.8438026485290797, "language_loss": 0.81710696, "learning_rate": 2.3885570022357326e-06, "loss": 0.85573071, "num_input_tokens_seen": 81402915, "step": 3781, "time_per_iteration": 2.7101633548736572 }, { "auxiliary_loss_clip": 0.01150222, "auxiliary_loss_mlp": 0.01005678, "balance_loss_clip": 1.01582134, "balance_loss_mlp": 1.00457513, "epoch": 0.454758612397042, "flos": 64242755694720.0, "grad_norm": 0.8204444831447317, "language_loss": 0.6084919, "learning_rate": 2.38779284334037e-06, "loss": 0.6300509, "num_input_tokens_seen": 81467890, "step": 3782, "time_per_iteration": 3.3596444129943848 }, { "auxiliary_loss_clip": 0.01377125, "auxiliary_loss_mlp": 0.01031706, "balance_loss_clip": 1.04406738, "balance_loss_mlp": 1.02365947, "epoch": 0.4548788552876811, "flos": 27304175485440.0, "grad_norm": 2.0444942985253163, "language_loss": 0.7899406, "learning_rate": 2.387028625617141e-06, "loss": 0.81402898, "num_input_tokens_seen": 81487105, "step": 3783, "time_per_iteration": 3.693798780441284 }, { "auxiliary_loss_clip": 0.01278999, "auxiliary_loss_mlp": 0.01027947, "balance_loss_clip": 1.04688954, "balance_loss_mlp": 1.02040982, "epoch": 0.4549990981783202, "flos": 22857142222080.0, "grad_norm": 2.0653990066218424, "language_loss": 0.8466332, "learning_rate": 2.3862643491819766e-06, "loss": 0.8697027, "num_input_tokens_seen": 81505670, "step": 3784, "time_per_iteration": 2.6826469898223877 }, { "auxiliary_loss_clip": 0.01236119, "auxiliary_loss_mlp": 0.01032077, "balance_loss_clip": 1.04859519, "balance_loss_mlp": 1.02412546, "epoch": 0.4551193410689593, "flos": 23258587599360.0, "grad_norm": 2.415012622731512, "language_loss": 0.84360826, "learning_rate": 2.3855000141508186e-06, "loss": 0.86629021, "num_input_tokens_seen": 81525825, "step": 3785, "time_per_iteration": 2.720057725906372 }, { "auxiliary_loss_clip": 0.01302892, "auxiliary_loss_mlp": 0.01027134, "balance_loss_clip": 1.05982351, "balance_loss_mlp": 1.01896811, "epoch": 0.4552395839595984, "flos": 20777519473920.0, "grad_norm": 2.2518827185403874, "language_loss": 0.84140873, "learning_rate": 2.3847356206396143e-06, "loss": 0.86470902, "num_input_tokens_seen": 81543135, "step": 3786, "time_per_iteration": 3.640944242477417 }, { "auxiliary_loss_clip": 0.0119059, "auxiliary_loss_mlp": 0.01032681, "balance_loss_clip": 1.05601668, "balance_loss_mlp": 1.02519441, "epoch": 0.45535982685023746, "flos": 23257510191360.0, "grad_norm": 1.8948190942857028, "language_loss": 0.78547728, "learning_rate": 2.3839711687643227e-06, "loss": 0.80771005, "num_input_tokens_seen": 81564360, "step": 3787, "time_per_iteration": 2.6233291625976562 }, { "auxiliary_loss_clip": 0.01240711, "auxiliary_loss_mlp": 0.01033166, "balance_loss_clip": 1.05318618, "balance_loss_mlp": 1.02493143, "epoch": 0.45548006974087657, "flos": 19646117907840.0, "grad_norm": 2.0201608715337005, "language_loss": 0.73555052, "learning_rate": 2.38320665864091e-06, "loss": 0.75828928, "num_input_tokens_seen": 81583710, "step": 3788, "time_per_iteration": 2.6667556762695312 }, { "auxiliary_loss_clip": 0.01438266, "auxiliary_loss_mlp": 0.01029151, "balance_loss_clip": 1.0442121, "balance_loss_mlp": 1.02117634, "epoch": 0.4556003126315157, "flos": 20047778766720.0, "grad_norm": 1.9653944428299872, "language_loss": 0.82559812, "learning_rate": 2.3824420903853516e-06, "loss": 0.85027224, "num_input_tokens_seen": 81602175, "step": 3789, "time_per_iteration": 2.7948105335235596 }, { "auxiliary_loss_clip": 0.01239507, "auxiliary_loss_mlp": 0.01030792, "balance_loss_clip": 1.05474234, "balance_loss_mlp": 1.02293599, "epoch": 0.45572055552215474, "flos": 22959738443520.0, "grad_norm": 2.2121125141569506, "language_loss": 0.81801242, "learning_rate": 2.3816774641136324e-06, "loss": 0.84071541, "num_input_tokens_seen": 81619430, "step": 3790, "time_per_iteration": 2.7031588554382324 }, { "auxiliary_loss_clip": 0.01239656, "auxiliary_loss_mlp": 0.02572735, "balance_loss_clip": 1.05416226, "balance_loss_mlp": 1.00019193, "epoch": 0.45584079841279385, "flos": 33109925535360.0, "grad_norm": 1.6245423809058133, "language_loss": 0.71372426, "learning_rate": 2.380912779941745e-06, "loss": 0.7518481, "num_input_tokens_seen": 81642550, "step": 3791, "time_per_iteration": 2.7770488262176514 }, { "auxiliary_loss_clip": 0.01242131, "auxiliary_loss_mlp": 0.01030578, "balance_loss_clip": 1.04954028, "balance_loss_mlp": 1.02223337, "epoch": 0.45596104130343296, "flos": 27272179445760.0, "grad_norm": 3.610844004318401, "language_loss": 0.8348943, "learning_rate": 2.3801480379856918e-06, "loss": 0.85762137, "num_input_tokens_seen": 81664260, "step": 3792, "time_per_iteration": 2.7507071495056152 }, { "auxiliary_loss_clip": 0.01288446, "auxiliary_loss_mlp": 0.01025813, "balance_loss_clip": 1.05233777, "balance_loss_mlp": 1.01806152, "epoch": 0.456081284194072, "flos": 21579799697280.0, "grad_norm": 2.647962130462285, "language_loss": 0.83737952, "learning_rate": 2.379383238361484e-06, "loss": 0.86052209, "num_input_tokens_seen": 81683620, "step": 3793, "time_per_iteration": 2.668191432952881 }, { "auxiliary_loss_clip": 0.0123489, "auxiliary_loss_mlp": 0.01022063, "balance_loss_clip": 1.05227828, "balance_loss_mlp": 1.01454079, "epoch": 0.4562015270847111, "flos": 35918822113920.0, "grad_norm": 1.7806433561384112, "language_loss": 0.79418778, "learning_rate": 2.3786183811851407e-06, "loss": 0.81675732, "num_input_tokens_seen": 81704325, "step": 3794, "time_per_iteration": 2.7531847953796387 }, { "auxiliary_loss_clip": 0.01194804, "auxiliary_loss_mlp": 0.01034277, "balance_loss_clip": 1.05826211, "balance_loss_mlp": 1.02596831, "epoch": 0.45632176997535023, "flos": 13589783602560.0, "grad_norm": 2.1113601466581042, "language_loss": 0.80348104, "learning_rate": 2.3778534665726892e-06, "loss": 0.82577187, "num_input_tokens_seen": 81721155, "step": 3795, "time_per_iteration": 2.56135630607605 }, { "auxiliary_loss_clip": 0.01233451, "auxiliary_loss_mlp": 0.01029862, "balance_loss_clip": 1.05144179, "balance_loss_mlp": 1.02203596, "epoch": 0.4564420128659893, "flos": 32635401937920.0, "grad_norm": 2.2324512098020604, "language_loss": 0.72790921, "learning_rate": 2.377088494640168e-06, "loss": 0.7505424, "num_input_tokens_seen": 81742905, "step": 3796, "time_per_iteration": 2.7323715686798096 }, { "auxiliary_loss_clip": 0.01237506, "auxiliary_loss_mlp": 0.01031243, "balance_loss_clip": 1.05449665, "balance_loss_mlp": 1.02391791, "epoch": 0.4565622557566284, "flos": 20377690208640.0, "grad_norm": 1.9540346161211581, "language_loss": 0.78030717, "learning_rate": 2.3763234655036216e-06, "loss": 0.80299473, "num_input_tokens_seen": 81762105, "step": 3797, "time_per_iteration": 2.683030128479004 }, { "auxiliary_loss_clip": 0.01341368, "auxiliary_loss_mlp": 0.01026846, "balance_loss_clip": 1.04600585, "balance_loss_mlp": 1.01935339, "epoch": 0.45668249864726745, "flos": 25374372364800.0, "grad_norm": 2.239520564353719, "language_loss": 0.87056434, "learning_rate": 2.3755583792791046e-06, "loss": 0.89424646, "num_input_tokens_seen": 81781975, "step": 3798, "time_per_iteration": 2.7555291652679443 }, { "auxiliary_loss_clip": 0.0124031, "auxiliary_loss_mlp": 0.01029923, "balance_loss_clip": 1.05193353, "balance_loss_mlp": 1.02197742, "epoch": 0.45680274153790656, "flos": 15559806977280.0, "grad_norm": 2.0792173679350903, "language_loss": 0.74881482, "learning_rate": 2.3747932360826803e-06, "loss": 0.77151716, "num_input_tokens_seen": 81798905, "step": 3799, "time_per_iteration": 2.650646924972534 }, { "auxiliary_loss_clip": 0.01238254, "auxiliary_loss_mlp": 0.01036126, "balance_loss_clip": 1.05324721, "balance_loss_mlp": 1.02773929, "epoch": 0.4569229844285457, "flos": 19792884879360.0, "grad_norm": 3.2662488937391143, "language_loss": 0.81705034, "learning_rate": 2.3740280360304205e-06, "loss": 0.8397941, "num_input_tokens_seen": 81816630, "step": 3800, "time_per_iteration": 2.6398158073425293 }, { "auxiliary_loss_clip": 0.01334947, "auxiliary_loss_mlp": 0.01029002, "balance_loss_clip": 1.05160809, "balance_loss_mlp": 1.02150416, "epoch": 0.45704322731918473, "flos": 24093941270400.0, "grad_norm": 7.082715003444192, "language_loss": 0.68177891, "learning_rate": 2.3732627792384038e-06, "loss": 0.70541841, "num_input_tokens_seen": 81837700, "step": 3801, "time_per_iteration": 2.74052095413208 }, { "auxiliary_loss_clip": 0.01192128, "auxiliary_loss_mlp": 0.01029416, "balance_loss_clip": 1.05408585, "balance_loss_mlp": 1.02112484, "epoch": 0.45716347020982384, "flos": 31317803245440.0, "grad_norm": 1.8609238395939316, "language_loss": 0.75474596, "learning_rate": 2.3724974658227207e-06, "loss": 0.77696133, "num_input_tokens_seen": 81858490, "step": 3802, "time_per_iteration": 2.6296122074127197 }, { "auxiliary_loss_clip": 0.01288963, "auxiliary_loss_mlp": 0.02572513, "balance_loss_clip": 1.05369163, "balance_loss_mlp": 1.00024033, "epoch": 0.45728371310046295, "flos": 26501392471680.0, "grad_norm": 2.0080662262903344, "language_loss": 0.71537262, "learning_rate": 2.3717320958994687e-06, "loss": 0.75398737, "num_input_tokens_seen": 81876050, "step": 3803, "time_per_iteration": 3.5845093727111816 }, { "auxiliary_loss_clip": 0.01341068, "auxiliary_loss_mlp": 0.01025639, "balance_loss_clip": 1.0442735, "balance_loss_mlp": 1.01806927, "epoch": 0.457403955991102, "flos": 17929408222080.0, "grad_norm": 2.648083261223988, "language_loss": 0.70697004, "learning_rate": 2.3709666695847534e-06, "loss": 0.73063707, "num_input_tokens_seen": 81894230, "step": 3804, "time_per_iteration": 2.668367385864258 }, { "auxiliary_loss_clip": 0.01384037, "auxiliary_loss_mlp": 0.01025961, "balance_loss_clip": 1.04565036, "balance_loss_mlp": 1.01827455, "epoch": 0.4575241988817411, "flos": 42230660837760.0, "grad_norm": 1.8976836174292617, "language_loss": 0.70081341, "learning_rate": 2.370201186994689e-06, "loss": 0.72491336, "num_input_tokens_seen": 81917915, "step": 3805, "time_per_iteration": 2.9069762229919434 }, { "auxiliary_loss_clip": 0.01282162, "auxiliary_loss_mlp": 0.01029195, "balance_loss_clip": 1.05015039, "balance_loss_mlp": 1.02154768, "epoch": 0.45764444177238023, "flos": 30117309868800.0, "grad_norm": 2.0271877913603875, "language_loss": 0.69914877, "learning_rate": 2.369435648245399e-06, "loss": 0.72226232, "num_input_tokens_seen": 81938130, "step": 3806, "time_per_iteration": 3.6675643920898438 }, { "auxiliary_loss_clip": 0.01289891, "auxiliary_loss_mlp": 0.01028367, "balance_loss_clip": 1.05225801, "balance_loss_mlp": 1.02069592, "epoch": 0.4577646846630193, "flos": 24060293205120.0, "grad_norm": 1.9119481201830448, "language_loss": 0.85318375, "learning_rate": 2.368670053453015e-06, "loss": 0.87636638, "num_input_tokens_seen": 81959820, "step": 3807, "time_per_iteration": 2.6565229892730713 }, { "auxiliary_loss_clip": 0.01246728, "auxiliary_loss_mlp": 0.01033751, "balance_loss_clip": 1.05562401, "balance_loss_mlp": 1.02545381, "epoch": 0.4578849275536584, "flos": 17418578952960.0, "grad_norm": 11.217891514699565, "language_loss": 0.74100953, "learning_rate": 2.3679044027336757e-06, "loss": 0.76381433, "num_input_tokens_seen": 81975710, "step": 3808, "time_per_iteration": 2.651259660720825 }, { "auxiliary_loss_clip": 0.01192039, "auxiliary_loss_mlp": 0.01033443, "balance_loss_clip": 1.05394518, "balance_loss_mlp": 1.02532446, "epoch": 0.4580051704442975, "flos": 13510169107200.0, "grad_norm": 2.530207605006795, "language_loss": 0.69434786, "learning_rate": 2.3671386962035326e-06, "loss": 0.71660268, "num_input_tokens_seen": 81993180, "step": 3809, "time_per_iteration": 2.605445146560669 }, { "auxiliary_loss_clip": 0.0124379, "auxiliary_loss_mlp": 0.0102653, "balance_loss_clip": 1.05482984, "balance_loss_mlp": 1.01863241, "epoch": 0.45812541333493656, "flos": 18037606965120.0, "grad_norm": 2.251672658932789, "language_loss": 0.68758178, "learning_rate": 2.3663729339787405e-06, "loss": 0.71028495, "num_input_tokens_seen": 82010115, "step": 3810, "time_per_iteration": 3.5178382396698 }, { "auxiliary_loss_clip": 0.01191454, "auxiliary_loss_mlp": 0.01034856, "balance_loss_clip": 1.05425, "balance_loss_mlp": 1.02710724, "epoch": 0.45824565622557567, "flos": 20222196232320.0, "grad_norm": 2.9737099364518675, "language_loss": 0.73558927, "learning_rate": 2.365607116175466e-06, "loss": 0.75785232, "num_input_tokens_seen": 82025540, "step": 3811, "time_per_iteration": 2.6345810890197754 }, { "auxiliary_loss_clip": 0.0118936, "auxiliary_loss_mlp": 0.01027827, "balance_loss_clip": 1.05395615, "balance_loss_mlp": 1.02044511, "epoch": 0.4583658991162148, "flos": 19864885691520.0, "grad_norm": 2.4001923545394446, "language_loss": 0.66926008, "learning_rate": 2.3648412429098825e-06, "loss": 0.691432, "num_input_tokens_seen": 82043890, "step": 3812, "time_per_iteration": 2.585380792617798 }, { "auxiliary_loss_clip": 0.01337742, "auxiliary_loss_mlp": 0.01028219, "balance_loss_clip": 1.04953599, "balance_loss_mlp": 1.01955211, "epoch": 0.45848614200685384, "flos": 21029935322880.0, "grad_norm": 1.9943718656519582, "language_loss": 0.81728518, "learning_rate": 2.364075314298172e-06, "loss": 0.84094483, "num_input_tokens_seen": 82061345, "step": 3813, "time_per_iteration": 3.64162015914917 }, { "auxiliary_loss_clip": 0.01244038, "auxiliary_loss_mlp": 0.02572066, "balance_loss_clip": 1.0525043, "balance_loss_mlp": 1.00015593, "epoch": 0.45860638489749295, "flos": 21069293650560.0, "grad_norm": 1.9566767249174601, "language_loss": 0.70559084, "learning_rate": 2.3633093304565267e-06, "loss": 0.74375188, "num_input_tokens_seen": 82080400, "step": 3814, "time_per_iteration": 2.6926941871643066 }, { "auxiliary_loss_clip": 0.0119563, "auxiliary_loss_mlp": 0.01029397, "balance_loss_clip": 1.05601478, "balance_loss_mlp": 1.02121913, "epoch": 0.458726627788132, "flos": 26833889692800.0, "grad_norm": 1.971035727839775, "language_loss": 0.62916529, "learning_rate": 2.3625432915011443e-06, "loss": 0.65141559, "num_input_tokens_seen": 82102310, "step": 3815, "time_per_iteration": 2.6404056549072266 }, { "auxiliary_loss_clip": 0.01284168, "auxiliary_loss_mlp": 0.01027928, "balance_loss_clip": 1.05022264, "balance_loss_mlp": 1.01932669, "epoch": 0.4588468706787711, "flos": 24097927680000.0, "grad_norm": 1.7610226979676078, "language_loss": 0.65285987, "learning_rate": 2.3617771975482334e-06, "loss": 0.67598081, "num_input_tokens_seen": 82121140, "step": 3816, "time_per_iteration": 2.680150032043457 }, { "auxiliary_loss_clip": 0.01385535, "auxiliary_loss_mlp": 0.01031582, "balance_loss_clip": 1.04764128, "balance_loss_mlp": 1.02426219, "epoch": 0.4589671135694102, "flos": 17889331622400.0, "grad_norm": 2.3390824718578656, "language_loss": 0.74648345, "learning_rate": 2.3610110487140083e-06, "loss": 0.77065462, "num_input_tokens_seen": 82139575, "step": 3817, "time_per_iteration": 2.879978656768799 }, { "auxiliary_loss_clip": 0.01294908, "auxiliary_loss_mlp": 0.01029884, "balance_loss_clip": 1.05451012, "balance_loss_mlp": 1.02167034, "epoch": 0.4590873564600493, "flos": 25626967781760.0, "grad_norm": 1.738336404596747, "language_loss": 0.80646324, "learning_rate": 2.360244845114695e-06, "loss": 0.8297112, "num_input_tokens_seen": 82159195, "step": 3818, "time_per_iteration": 2.844585418701172 }, { "auxiliary_loss_clip": 0.01290455, "auxiliary_loss_mlp": 0.01026389, "balance_loss_clip": 1.05674374, "balance_loss_mlp": 1.01845527, "epoch": 0.4592075993506884, "flos": 18514788168960.0, "grad_norm": 2.3622480972939677, "language_loss": 0.68030834, "learning_rate": 2.3594785868665245e-06, "loss": 0.70347679, "num_input_tokens_seen": 82175500, "step": 3819, "time_per_iteration": 2.6979055404663086 }, { "auxiliary_loss_clip": 0.01335443, "auxiliary_loss_mlp": 0.02571774, "balance_loss_clip": 1.04770648, "balance_loss_mlp": 1.00018692, "epoch": 0.4593278422413275, "flos": 20631111638400.0, "grad_norm": 2.1946609926955007, "language_loss": 0.80950904, "learning_rate": 2.3587122740857386e-06, "loss": 0.84858119, "num_input_tokens_seen": 82192600, "step": 3820, "time_per_iteration": 2.7742745876312256 }, { "auxiliary_loss_clip": 0.01236732, "auxiliary_loss_mlp": 0.01031149, "balance_loss_clip": 1.05138326, "balance_loss_mlp": 1.02316761, "epoch": 0.45944808513196655, "flos": 21358517961600.0, "grad_norm": 1.6766761926432985, "language_loss": 0.78185833, "learning_rate": 2.357945906888586e-06, "loss": 0.80453706, "num_input_tokens_seen": 82212040, "step": 3821, "time_per_iteration": 2.6121435165405273 }, { "auxiliary_loss_clip": 0.01241127, "auxiliary_loss_mlp": 0.01029232, "balance_loss_clip": 1.0538342, "balance_loss_mlp": 1.02045858, "epoch": 0.45956832802260567, "flos": 21427789340160.0, "grad_norm": 2.4003641969147163, "language_loss": 0.800978, "learning_rate": 2.357179485391324e-06, "loss": 0.82368159, "num_input_tokens_seen": 82229895, "step": 3822, "time_per_iteration": 2.6502604484558105 }, { "auxiliary_loss_clip": 0.01185681, "auxiliary_loss_mlp": 0.0102745, "balance_loss_clip": 1.0523448, "balance_loss_mlp": 1.02010357, "epoch": 0.4596885709132448, "flos": 22382654538240.0, "grad_norm": 1.899972200241331, "language_loss": 0.86098677, "learning_rate": 2.3564130097102173e-06, "loss": 0.88311803, "num_input_tokens_seen": 82249550, "step": 3823, "time_per_iteration": 2.5731513500213623 }, { "auxiliary_loss_clip": 0.01293767, "auxiliary_loss_mlp": 0.01035174, "balance_loss_clip": 1.05716705, "balance_loss_mlp": 1.02694845, "epoch": 0.45980881380388383, "flos": 28981957806720.0, "grad_norm": 1.7492497690841724, "language_loss": 0.75084633, "learning_rate": 2.355646479961541e-06, "loss": 0.77413571, "num_input_tokens_seen": 82268860, "step": 3824, "time_per_iteration": 2.7721428871154785 }, { "auxiliary_loss_clip": 0.01189392, "auxiliary_loss_mlp": 0.01031132, "balance_loss_clip": 1.05443263, "balance_loss_mlp": 1.02322865, "epoch": 0.45992905669452294, "flos": 33396599980800.0, "grad_norm": 1.9579369219528553, "language_loss": 0.71623933, "learning_rate": 2.354879896261576e-06, "loss": 0.73844457, "num_input_tokens_seen": 82289070, "step": 3825, "time_per_iteration": 2.6812431812286377 }, { "auxiliary_loss_clip": 0.01331431, "auxiliary_loss_mlp": 0.01032729, "balance_loss_clip": 1.05123973, "balance_loss_mlp": 1.02483165, "epoch": 0.46004929958516205, "flos": 36318184502400.0, "grad_norm": 2.676217698682711, "language_loss": 0.56904352, "learning_rate": 2.3541132587266133e-06, "loss": 0.5926851, "num_input_tokens_seen": 82311790, "step": 3826, "time_per_iteration": 2.8692526817321777 }, { "auxiliary_loss_clip": 0.01343162, "auxiliary_loss_mlp": 0.0102837, "balance_loss_clip": 1.05146766, "balance_loss_mlp": 1.02062702, "epoch": 0.4601695424758011, "flos": 17238451224960.0, "grad_norm": 2.6788369998359505, "language_loss": 0.69467956, "learning_rate": 2.3533465674729515e-06, "loss": 0.71839488, "num_input_tokens_seen": 82329020, "step": 3827, "time_per_iteration": 2.745424747467041 }, { "auxiliary_loss_clip": 0.01192441, "auxiliary_loss_mlp": 0.01027789, "balance_loss_clip": 1.05639625, "balance_loss_mlp": 1.01941478, "epoch": 0.4602897853664402, "flos": 15888425529600.0, "grad_norm": 1.894337244581451, "language_loss": 0.7312004, "learning_rate": 2.352579822616895e-06, "loss": 0.75340271, "num_input_tokens_seen": 82346455, "step": 3828, "time_per_iteration": 3.534074068069458 }, { "auxiliary_loss_clip": 0.01192552, "auxiliary_loss_mlp": 0.01027362, "balance_loss_clip": 1.05332279, "balance_loss_mlp": 1.01979828, "epoch": 0.4604100282570793, "flos": 25412617370880.0, "grad_norm": 1.8920544572995615, "language_loss": 0.777246, "learning_rate": 2.351813024274761e-06, "loss": 0.79944515, "num_input_tokens_seen": 82367810, "step": 3829, "time_per_iteration": 2.7319180965423584 }, { "auxiliary_loss_clip": 0.01341443, "auxiliary_loss_mlp": 0.01020772, "balance_loss_clip": 1.05235386, "balance_loss_mlp": 1.01285672, "epoch": 0.4605302711477184, "flos": 27630711048960.0, "grad_norm": 1.920987220726425, "language_loss": 0.73696518, "learning_rate": 2.3510461725628693e-06, "loss": 0.76058733, "num_input_tokens_seen": 82388275, "step": 3830, "time_per_iteration": 2.7346620559692383 }, { "auxiliary_loss_clip": 0.01338609, "auxiliary_loss_mlp": 0.01027695, "balance_loss_clip": 1.0490036, "balance_loss_mlp": 1.02019119, "epoch": 0.4606505140383575, "flos": 23839657914240.0, "grad_norm": 3.3960501003335546, "language_loss": 0.71244437, "learning_rate": 2.350279267597554e-06, "loss": 0.73610741, "num_input_tokens_seen": 82408915, "step": 3831, "time_per_iteration": 2.7440078258514404 }, { "auxiliary_loss_clip": 0.012409, "auxiliary_loss_mlp": 0.01029263, "balance_loss_clip": 1.05416906, "balance_loss_mlp": 1.02061987, "epoch": 0.46077075692899655, "flos": 16107013745280.0, "grad_norm": 2.5420020654804603, "language_loss": 0.82981455, "learning_rate": 2.3495123094951515e-06, "loss": 0.85251617, "num_input_tokens_seen": 82427260, "step": 3832, "time_per_iteration": 3.604879856109619 }, { "auxiliary_loss_clip": 0.0128251, "auxiliary_loss_mlp": 0.010304, "balance_loss_clip": 1.05017579, "balance_loss_mlp": 1.02262127, "epoch": 0.46089099981963566, "flos": 48798147634560.0, "grad_norm": 2.6648086171973686, "language_loss": 0.76215899, "learning_rate": 2.34874529837201e-06, "loss": 0.7852881, "num_input_tokens_seen": 82450805, "step": 3833, "time_per_iteration": 2.9201557636260986 }, { "auxiliary_loss_clip": 0.0143032, "auxiliary_loss_mlp": 0.01030507, "balance_loss_clip": 1.0441618, "balance_loss_mlp": 1.02197194, "epoch": 0.46101124271027477, "flos": 19099234362240.0, "grad_norm": 1.8672540102189963, "language_loss": 0.79092324, "learning_rate": 2.347978234344483e-06, "loss": 0.81553155, "num_input_tokens_seen": 82467010, "step": 3834, "time_per_iteration": 2.803262233734131 }, { "auxiliary_loss_clip": 0.01245801, "auxiliary_loss_mlp": 0.01031434, "balance_loss_clip": 1.05458593, "balance_loss_mlp": 1.02363205, "epoch": 0.4611314856009138, "flos": 39347931853440.0, "grad_norm": 1.9659964585638647, "language_loss": 0.68971902, "learning_rate": 2.347211117528935e-06, "loss": 0.71249139, "num_input_tokens_seen": 82489310, "step": 3835, "time_per_iteration": 3.6241583824157715 }, { "auxiliary_loss_clip": 0.0134866, "auxiliary_loss_mlp": 0.01029825, "balance_loss_clip": 1.05534399, "balance_loss_mlp": 1.02226686, "epoch": 0.46125172849155294, "flos": 20810772489600.0, "grad_norm": 2.8829025999502917, "language_loss": 0.7159692, "learning_rate": 2.3464439480417374e-06, "loss": 0.73975402, "num_input_tokens_seen": 82508830, "step": 3836, "time_per_iteration": 2.703158378601074 }, { "auxiliary_loss_clip": 0.0124189, "auxiliary_loss_mlp": 0.01024461, "balance_loss_clip": 1.05371499, "balance_loss_mlp": 1.01630092, "epoch": 0.46137197138219205, "flos": 17930808852480.0, "grad_norm": 15.635150617582998, "language_loss": 0.77053308, "learning_rate": 2.3456767259992676e-06, "loss": 0.79319656, "num_input_tokens_seen": 82526475, "step": 3837, "time_per_iteration": 2.6110827922821045 }, { "auxiliary_loss_clip": 0.01193844, "auxiliary_loss_mlp": 0.02568455, "balance_loss_clip": 1.05594635, "balance_loss_mlp": 1.00020957, "epoch": 0.4614922142728311, "flos": 16836610798080.0, "grad_norm": 2.3173641939712284, "language_loss": 0.8901853, "learning_rate": 2.3449094515179135e-06, "loss": 0.92780834, "num_input_tokens_seen": 82543935, "step": 3838, "time_per_iteration": 3.4762136936187744 }, { "auxiliary_loss_clip": 0.01293739, "auxiliary_loss_mlp": 0.0102611, "balance_loss_clip": 1.05033576, "balance_loss_mlp": 1.01845098, "epoch": 0.4616124571634702, "flos": 26614906427520.0, "grad_norm": 1.5480555633087338, "language_loss": 0.81836867, "learning_rate": 2.34414212471407e-06, "loss": 0.84156716, "num_input_tokens_seen": 82563730, "step": 3839, "time_per_iteration": 2.8071155548095703 }, { "auxiliary_loss_clip": 0.01247657, "auxiliary_loss_mlp": 0.01025039, "balance_loss_clip": 1.05414832, "balance_loss_mlp": 1.01689684, "epoch": 0.4617327000541093, "flos": 20340127560960.0, "grad_norm": 2.9636621252214206, "language_loss": 0.73047394, "learning_rate": 2.3433747457041394e-06, "loss": 0.75320089, "num_input_tokens_seen": 82582435, "step": 3840, "time_per_iteration": 2.6484642028808594 }, { "auxiliary_loss_clip": 0.01342393, "auxiliary_loss_mlp": 0.01033308, "balance_loss_clip": 1.05313444, "balance_loss_mlp": 1.02456975, "epoch": 0.4618529429447484, "flos": 29570749545600.0, "grad_norm": 6.568252629198071, "language_loss": 0.84850806, "learning_rate": 2.342607314604533e-06, "loss": 0.87226504, "num_input_tokens_seen": 82602185, "step": 3841, "time_per_iteration": 2.8291783332824707 }, { "auxiliary_loss_clip": 0.01245938, "auxiliary_loss_mlp": 0.01029858, "balance_loss_clip": 1.0587728, "balance_loss_mlp": 1.02181768, "epoch": 0.4619731858353875, "flos": 19787030962560.0, "grad_norm": 1.8522557388596232, "language_loss": 0.84383571, "learning_rate": 2.3418398315316694e-06, "loss": 0.86659372, "num_input_tokens_seen": 82620005, "step": 3842, "time_per_iteration": 2.6408703327178955 }, { "auxiliary_loss_clip": 0.01191556, "auxiliary_loss_mlp": 0.010346, "balance_loss_clip": 1.05556786, "balance_loss_mlp": 1.02697086, "epoch": 0.4620934287260266, "flos": 18951138587520.0, "grad_norm": 2.35199553352666, "language_loss": 0.78359401, "learning_rate": 2.3410722966019755e-06, "loss": 0.80585563, "num_input_tokens_seen": 82635120, "step": 3843, "time_per_iteration": 2.649674415588379 }, { "auxiliary_loss_clip": 0.01238289, "auxiliary_loss_mlp": 0.01030488, "balance_loss_clip": 1.05266333, "balance_loss_mlp": 1.02316308, "epoch": 0.46221367161666566, "flos": 37341674634240.0, "grad_norm": 1.8048458367479314, "language_loss": 0.65530419, "learning_rate": 2.3403047099318848e-06, "loss": 0.67799199, "num_input_tokens_seen": 82659190, "step": 3844, "time_per_iteration": 2.7783634662628174 }, { "auxiliary_loss_clip": 0.0137982, "auxiliary_loss_mlp": 0.01032544, "balance_loss_clip": 1.04550195, "balance_loss_mlp": 1.02492619, "epoch": 0.46233391450730477, "flos": 14428549065600.0, "grad_norm": 2.9047903771485366, "language_loss": 0.75847697, "learning_rate": 2.3395370716378405e-06, "loss": 0.78260064, "num_input_tokens_seen": 82676635, "step": 3845, "time_per_iteration": 2.7800769805908203 }, { "auxiliary_loss_clip": 0.01243597, "auxiliary_loss_mlp": 0.01031303, "balance_loss_clip": 1.05370629, "balance_loss_mlp": 1.02418351, "epoch": 0.4624541573979438, "flos": 22493044010880.0, "grad_norm": 2.0965437585827775, "language_loss": 0.7221607, "learning_rate": 2.338769381836292e-06, "loss": 0.74490964, "num_input_tokens_seen": 82696245, "step": 3846, "time_per_iteration": 2.6813509464263916 }, { "auxiliary_loss_clip": 0.01342958, "auxiliary_loss_mlp": 0.01030042, "balance_loss_clip": 1.05467081, "balance_loss_mlp": 1.02202797, "epoch": 0.46257440028858293, "flos": 14465070218880.0, "grad_norm": 1.984260830798692, "language_loss": 0.73034894, "learning_rate": 2.3380016406436984e-06, "loss": 0.75407898, "num_input_tokens_seen": 82713725, "step": 3847, "time_per_iteration": 2.6821343898773193 }, { "auxiliary_loss_clip": 0.01390271, "auxiliary_loss_mlp": 0.01031649, "balance_loss_clip": 1.05113637, "balance_loss_mlp": 1.02381134, "epoch": 0.46269464317922204, "flos": 23332204523520.0, "grad_norm": 2.01460061179832, "language_loss": 0.81514555, "learning_rate": 2.337233848176524e-06, "loss": 0.83936477, "num_input_tokens_seen": 82731495, "step": 3848, "time_per_iteration": 2.879979372024536 }, { "auxiliary_loss_clip": 0.01383794, "auxiliary_loss_mlp": 0.01031887, "balance_loss_clip": 1.04712367, "balance_loss_mlp": 1.02354813, "epoch": 0.4628148860698611, "flos": 18552027594240.0, "grad_norm": 2.0361011978829855, "language_loss": 0.83033037, "learning_rate": 2.3364660045512435e-06, "loss": 0.85448718, "num_input_tokens_seen": 82750255, "step": 3849, "time_per_iteration": 2.664414405822754 }, { "auxiliary_loss_clip": 0.01185461, "auxiliary_loss_mlp": 0.0100237, "balance_loss_clip": 1.02154899, "balance_loss_mlp": 1.0013566, "epoch": 0.4629351289605002, "flos": 70667569670400.0, "grad_norm": 0.7486996006766862, "language_loss": 0.58196676, "learning_rate": 2.335698109884337e-06, "loss": 0.60384512, "num_input_tokens_seen": 82815460, "step": 3850, "time_per_iteration": 3.4197018146514893 }, { "auxiliary_loss_clip": 0.01313312, "auxiliary_loss_mlp": 0.01019517, "balance_loss_clip": 1.03424931, "balance_loss_mlp": 1.01840806, "epoch": 0.4630553718511393, "flos": 59687200465920.0, "grad_norm": 0.7892008360447, "language_loss": 0.59867829, "learning_rate": 2.334930164292294e-06, "loss": 0.6220066, "num_input_tokens_seen": 82878010, "step": 3851, "time_per_iteration": 3.5182158946990967 }, { "auxiliary_loss_clip": 0.01385191, "auxiliary_loss_mlp": 0.01025816, "balance_loss_clip": 1.04520607, "balance_loss_mlp": 1.01804948, "epoch": 0.4631756147417784, "flos": 15960605909760.0, "grad_norm": 1.8486022245140306, "language_loss": 0.79701912, "learning_rate": 2.334162167891612e-06, "loss": 0.8211292, "num_input_tokens_seen": 82895275, "step": 3852, "time_per_iteration": 2.920813798904419 }, { "auxiliary_loss_clip": 0.01295922, "auxiliary_loss_mlp": 0.0102781, "balance_loss_clip": 1.05124497, "balance_loss_mlp": 1.01997828, "epoch": 0.4632958576324175, "flos": 16472907636480.0, "grad_norm": 2.0389089370066094, "language_loss": 0.74874353, "learning_rate": 2.333394120798795e-06, "loss": 0.77198076, "num_input_tokens_seen": 82914010, "step": 3853, "time_per_iteration": 2.6729819774627686 }, { "auxiliary_loss_clip": 0.01289979, "auxiliary_loss_mlp": 0.01033267, "balance_loss_clip": 1.05015373, "balance_loss_mlp": 1.02538109, "epoch": 0.4634161005230566, "flos": 22346492520960.0, "grad_norm": 2.1931798317197866, "language_loss": 0.71918738, "learning_rate": 2.3326260231303545e-06, "loss": 0.74241984, "num_input_tokens_seen": 82932610, "step": 3854, "time_per_iteration": 3.8385283946990967 }, { "auxiliary_loss_clip": 0.01188669, "auxiliary_loss_mlp": 0.01023279, "balance_loss_clip": 1.05504, "balance_loss_mlp": 1.01595962, "epoch": 0.46353634341369565, "flos": 15742233175680.0, "grad_norm": 1.667287444767265, "language_loss": 0.87012988, "learning_rate": 2.331857875002811e-06, "loss": 0.89224935, "num_input_tokens_seen": 82951210, "step": 3855, "time_per_iteration": 2.603156805038452 }, { "auxiliary_loss_clip": 0.0128935, "auxiliary_loss_mlp": 0.0102733, "balance_loss_clip": 1.0536921, "balance_loss_mlp": 1.01982856, "epoch": 0.46365658630433476, "flos": 28329820433280.0, "grad_norm": 2.7357952467480193, "language_loss": 0.76256466, "learning_rate": 2.3310896765326916e-06, "loss": 0.78573143, "num_input_tokens_seen": 82972210, "step": 3856, "time_per_iteration": 2.7322206497192383 }, { "auxiliary_loss_clip": 0.0134273, "auxiliary_loss_mlp": 0.01030812, "balance_loss_clip": 1.05188942, "balance_loss_mlp": 1.02295017, "epoch": 0.46377682919497387, "flos": 24608074590720.0, "grad_norm": 1.628071144377028, "language_loss": 0.84261227, "learning_rate": 2.330321427836531e-06, "loss": 0.86634767, "num_input_tokens_seen": 82994080, "step": 3857, "time_per_iteration": 2.7455902099609375 }, { "auxiliary_loss_clip": 0.01238758, "auxiliary_loss_mlp": 0.01030552, "balance_loss_clip": 1.05294752, "balance_loss_mlp": 1.0230478, "epoch": 0.4638970720856129, "flos": 19060953442560.0, "grad_norm": 4.937000546848872, "language_loss": 0.82667112, "learning_rate": 2.3295531290308733e-06, "loss": 0.84936416, "num_input_tokens_seen": 83012230, "step": 3858, "time_per_iteration": 3.5424699783325195 }, { "auxiliary_loss_clip": 0.01195501, "auxiliary_loss_mlp": 0.02571163, "balance_loss_clip": 1.05774343, "balance_loss_mlp": 1.00020623, "epoch": 0.46401731497625204, "flos": 18471012468480.0, "grad_norm": 3.1475662327111453, "language_loss": 0.75693989, "learning_rate": 2.3287847802322678e-06, "loss": 0.79460651, "num_input_tokens_seen": 83027800, "step": 3859, "time_per_iteration": 2.5762007236480713 }, { "auxiliary_loss_clip": 0.01207859, "auxiliary_loss_mlp": 0.01028746, "balance_loss_clip": 1.05828667, "balance_loss_mlp": 1.0202527, "epoch": 0.4641375578668911, "flos": 26067053214720.0, "grad_norm": 2.0983025961162025, "language_loss": 0.83909732, "learning_rate": 2.3280163815572723e-06, "loss": 0.86146343, "num_input_tokens_seen": 83048395, "step": 3860, "time_per_iteration": 2.719430923461914 }, { "auxiliary_loss_clip": 0.01284345, "auxiliary_loss_mlp": 0.01033321, "balance_loss_clip": 1.05091989, "balance_loss_mlp": 1.0253582, "epoch": 0.4642578007575302, "flos": 19570382081280.0, "grad_norm": 2.9309296954607698, "language_loss": 0.76878822, "learning_rate": 2.3272479331224522e-06, "loss": 0.79196483, "num_input_tokens_seen": 83065825, "step": 3861, "time_per_iteration": 3.6008126735687256 }, { "auxiliary_loss_clip": 0.01192302, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.05543804, "balance_loss_mlp": 1.02458394, "epoch": 0.4643780436481693, "flos": 28186249772160.0, "grad_norm": 1.8485893133633902, "language_loss": 0.78107607, "learning_rate": 2.3264794350443817e-06, "loss": 0.80332571, "num_input_tokens_seen": 83087920, "step": 3862, "time_per_iteration": 2.6886115074157715 }, { "auxiliary_loss_clip": 0.01238715, "auxiliary_loss_mlp": 0.01031053, "balance_loss_clip": 1.05025005, "balance_loss_mlp": 1.02311397, "epoch": 0.46449828653880837, "flos": 25375270204800.0, "grad_norm": 1.85439789290648, "language_loss": 0.7869975, "learning_rate": 2.3257108874396396e-06, "loss": 0.80969518, "num_input_tokens_seen": 83109015, "step": 3863, "time_per_iteration": 2.6694624423980713 }, { "auxiliary_loss_clip": 0.01288811, "auxiliary_loss_mlp": 0.01032088, "balance_loss_clip": 1.05049086, "balance_loss_mlp": 1.02510548, "epoch": 0.4646185294294475, "flos": 16034330574720.0, "grad_norm": 2.171391523733925, "language_loss": 0.73928106, "learning_rate": 2.3249422904248152e-06, "loss": 0.76249003, "num_input_tokens_seen": 83127450, "step": 3864, "time_per_iteration": 3.5090813636779785 }, { "auxiliary_loss_clip": 0.01243684, "auxiliary_loss_mlp": 0.0102784, "balance_loss_clip": 1.05290926, "balance_loss_mlp": 1.01982927, "epoch": 0.4647387723200866, "flos": 26363101109760.0, "grad_norm": 1.5285880639697718, "language_loss": 0.87379962, "learning_rate": 2.324173644116504e-06, "loss": 0.89651489, "num_input_tokens_seen": 83150300, "step": 3865, "time_per_iteration": 2.758568286895752 }, { "auxiliary_loss_clip": 0.01237388, "auxiliary_loss_mlp": 0.0102636, "balance_loss_clip": 1.05519736, "balance_loss_mlp": 1.01827729, "epoch": 0.46485901521072565, "flos": 27160209774720.0, "grad_norm": 1.9221556911853304, "language_loss": 0.81545585, "learning_rate": 2.3234049486313087e-06, "loss": 0.83809328, "num_input_tokens_seen": 83171750, "step": 3866, "time_per_iteration": 2.6738998889923096 }, { "auxiliary_loss_clip": 0.01238708, "auxiliary_loss_mlp": 0.01029949, "balance_loss_clip": 1.05396533, "balance_loss_mlp": 1.02279902, "epoch": 0.46497925810136476, "flos": 24279851088000.0, "grad_norm": 1.7699653322686977, "language_loss": 0.76188815, "learning_rate": 2.322636204085839e-06, "loss": 0.78457475, "num_input_tokens_seen": 83191820, "step": 3867, "time_per_iteration": 2.6841983795166016 }, { "auxiliary_loss_clip": 0.0128318, "auxiliary_loss_mlp": 0.010329, "balance_loss_clip": 1.04678071, "balance_loss_mlp": 1.02472484, "epoch": 0.46509950099200387, "flos": 16253134272000.0, "grad_norm": 3.3241189315631283, "language_loss": 0.78545845, "learning_rate": 2.3218674105967143e-06, "loss": 0.8086192, "num_input_tokens_seen": 83210085, "step": 3868, "time_per_iteration": 2.6614644527435303 }, { "auxiliary_loss_clip": 0.01285479, "auxiliary_loss_mlp": 0.0103073, "balance_loss_clip": 1.05003321, "balance_loss_mlp": 1.02368522, "epoch": 0.4652197438826429, "flos": 23442270773760.0, "grad_norm": 2.7076702555854495, "language_loss": 0.83645105, "learning_rate": 2.3210985682805593e-06, "loss": 0.85961318, "num_input_tokens_seen": 83231865, "step": 3869, "time_per_iteration": 2.78041934967041 }, { "auxiliary_loss_clip": 0.01195687, "auxiliary_loss_mlp": 0.01029542, "balance_loss_clip": 1.05923581, "balance_loss_mlp": 1.02192175, "epoch": 0.46533998677328203, "flos": 16216397637120.0, "grad_norm": 4.629587961674402, "language_loss": 0.68059623, "learning_rate": 2.320329677254007e-06, "loss": 0.70284849, "num_input_tokens_seen": 83249195, "step": 3870, "time_per_iteration": 2.6413164138793945 }, { "auxiliary_loss_clip": 0.01190632, "auxiliary_loss_mlp": 0.01028145, "balance_loss_clip": 1.05544472, "balance_loss_mlp": 1.02018523, "epoch": 0.46546022966392114, "flos": 21141869080320.0, "grad_norm": 2.4077953343932315, "language_loss": 0.73006856, "learning_rate": 2.319560737633697e-06, "loss": 0.75225633, "num_input_tokens_seen": 83267915, "step": 3871, "time_per_iteration": 2.608645439147949 }, { "auxiliary_loss_clip": 0.01341945, "auxiliary_loss_mlp": 0.01027503, "balance_loss_clip": 1.04761922, "balance_loss_mlp": 1.01966524, "epoch": 0.4655804725545602, "flos": 41171942442240.0, "grad_norm": 1.5565961237058026, "language_loss": 0.68239713, "learning_rate": 2.3187917495362775e-06, "loss": 0.70609164, "num_input_tokens_seen": 83292325, "step": 3872, "time_per_iteration": 2.9852609634399414 }, { "auxiliary_loss_clip": 0.01383008, "auxiliary_loss_mlp": 0.01032713, "balance_loss_clip": 1.04716206, "balance_loss_mlp": 1.02509844, "epoch": 0.4657007154451993, "flos": 19570956698880.0, "grad_norm": 2.5247329692906884, "language_loss": 0.77067667, "learning_rate": 2.318022713078403e-06, "loss": 0.79483384, "num_input_tokens_seen": 83306905, "step": 3873, "time_per_iteration": 2.702501058578491 }, { "auxiliary_loss_clip": 0.01288365, "auxiliary_loss_mlp": 0.0102671, "balance_loss_clip": 1.05135345, "balance_loss_mlp": 1.01914024, "epoch": 0.4658209583358384, "flos": 15517826956800.0, "grad_norm": 4.107848349146468, "language_loss": 0.85276395, "learning_rate": 2.3172536283767354e-06, "loss": 0.87591469, "num_input_tokens_seen": 83320665, "step": 3874, "time_per_iteration": 2.598076581954956 }, { "auxiliary_loss_clip": 0.01336223, "auxiliary_loss_mlp": 0.01029097, "balance_loss_clip": 1.05101824, "balance_loss_mlp": 1.02191448, "epoch": 0.4659412012264775, "flos": 14903180403840.0, "grad_norm": 2.1561596720107854, "language_loss": 0.80348647, "learning_rate": 2.3164844955479447e-06, "loss": 0.82713968, "num_input_tokens_seen": 83336475, "step": 3875, "time_per_iteration": 2.616000175476074 }, { "auxiliary_loss_clip": 0.01299993, "auxiliary_loss_mlp": 0.01031975, "balance_loss_clip": 1.04999661, "balance_loss_mlp": 1.02444065, "epoch": 0.4660614441171166, "flos": 24425612478720.0, "grad_norm": 2.0230906597644274, "language_loss": 0.70597959, "learning_rate": 2.3157153147087082e-06, "loss": 0.72929925, "num_input_tokens_seen": 83358365, "step": 3876, "time_per_iteration": 2.8357338905334473 }, { "auxiliary_loss_clip": 0.01295985, "auxiliary_loss_mlp": 0.01025678, "balance_loss_clip": 1.05066943, "balance_loss_mlp": 1.01872826, "epoch": 0.46618168700775564, "flos": 22091095843200.0, "grad_norm": 1.743677653916453, "language_loss": 0.83377421, "learning_rate": 2.314946085975709e-06, "loss": 0.85699081, "num_input_tokens_seen": 83377345, "step": 3877, "time_per_iteration": 2.7015841007232666 }, { "auxiliary_loss_clip": 0.01327354, "auxiliary_loss_mlp": 0.01025883, "balance_loss_clip": 1.04807866, "balance_loss_mlp": 1.01890326, "epoch": 0.46630192989839475, "flos": 26176975810560.0, "grad_norm": 6.416994234632226, "language_loss": 0.82155848, "learning_rate": 2.3141768094656393e-06, "loss": 0.84509087, "num_input_tokens_seen": 83395920, "step": 3878, "time_per_iteration": 2.721611976623535 }, { "auxiliary_loss_clip": 0.01491748, "auxiliary_loss_mlp": 0.01028202, "balance_loss_clip": 1.04491878, "balance_loss_mlp": 1.02118099, "epoch": 0.46642217278903386, "flos": 11509622150400.0, "grad_norm": 5.994106248219784, "language_loss": 0.82807565, "learning_rate": 2.3134074852951966e-06, "loss": 0.85327512, "num_input_tokens_seen": 83412510, "step": 3879, "time_per_iteration": 2.8420121669769287 }, { "auxiliary_loss_clip": 0.01384264, "auxiliary_loss_mlp": 0.01034234, "balance_loss_clip": 1.04529953, "balance_loss_mlp": 1.02653313, "epoch": 0.4665424156796729, "flos": 32306819299200.0, "grad_norm": 2.306896696275707, "language_loss": 0.78073144, "learning_rate": 2.312638113581088e-06, "loss": 0.80491644, "num_input_tokens_seen": 83432995, "step": 3880, "time_per_iteration": 3.2542166709899902 }, { "auxiliary_loss_clip": 0.01237385, "auxiliary_loss_mlp": 0.01026751, "balance_loss_clip": 1.05006003, "balance_loss_mlp": 1.01860917, "epoch": 0.46666265857031203, "flos": 18436179254400.0, "grad_norm": 2.898512923291977, "language_loss": 0.78067297, "learning_rate": 2.311868694440027e-06, "loss": 0.80331433, "num_input_tokens_seen": 83447415, "step": 3881, "time_per_iteration": 3.666517496109009 }, { "auxiliary_loss_clip": 0.01084922, "auxiliary_loss_mlp": 0.01004907, "balance_loss_clip": 1.02116275, "balance_loss_mlp": 1.00382257, "epoch": 0.46678290146095114, "flos": 68438989221120.0, "grad_norm": 0.7478051995290669, "language_loss": 0.62397528, "learning_rate": 2.3110992279887323e-06, "loss": 0.6448735, "num_input_tokens_seen": 83519340, "step": 3882, "time_per_iteration": 3.356684684753418 }, { "auxiliary_loss_clip": 0.01340633, "auxiliary_loss_mlp": 0.01025102, "balance_loss_clip": 1.05046868, "balance_loss_mlp": 1.01678109, "epoch": 0.4669031443515902, "flos": 17712507945600.0, "grad_norm": 2.5107467784915443, "language_loss": 0.85553062, "learning_rate": 2.310329714343932e-06, "loss": 0.879188, "num_input_tokens_seen": 83535490, "step": 3883, "time_per_iteration": 2.6964762210845947 }, { "auxiliary_loss_clip": 0.01286131, "auxiliary_loss_mlp": 0.01032072, "balance_loss_clip": 1.05197501, "balance_loss_mlp": 1.0245558, "epoch": 0.4670233872422293, "flos": 23947748916480.0, "grad_norm": 1.9243652318358317, "language_loss": 0.81725645, "learning_rate": 2.309560153622361e-06, "loss": 0.84043849, "num_input_tokens_seen": 83552400, "step": 3884, "time_per_iteration": 3.6289138793945312 }, { "auxiliary_loss_clip": 0.01334686, "auxiliary_loss_mlp": 0.01036254, "balance_loss_clip": 1.05118823, "balance_loss_mlp": 1.02808785, "epoch": 0.4671436301328684, "flos": 28111268131200.0, "grad_norm": 2.483906238663339, "language_loss": 0.74869001, "learning_rate": 2.3087905459407602e-06, "loss": 0.77239943, "num_input_tokens_seen": 83571340, "step": 3885, "time_per_iteration": 2.728893995285034 }, { "auxiliary_loss_clip": 0.01039317, "auxiliary_loss_mlp": 0.00999144, "balance_loss_clip": 1.01983309, "balance_loss_mlp": 0.9981184, "epoch": 0.46726387302350747, "flos": 69369684566400.0, "grad_norm": 0.7873649697721482, "language_loss": 0.62854487, "learning_rate": 2.3080208914158795e-06, "loss": 0.64892942, "num_input_tokens_seen": 83634340, "step": 3886, "time_per_iteration": 4.114591121673584 }, { "auxiliary_loss_clip": 0.01284294, "auxiliary_loss_mlp": 0.01020969, "balance_loss_clip": 1.0538063, "balance_loss_mlp": 1.0134083, "epoch": 0.4673841159141466, "flos": 25519666878720.0, "grad_norm": 2.252387317682799, "language_loss": 0.71942103, "learning_rate": 2.3072511901644753e-06, "loss": 0.7424736, "num_input_tokens_seen": 83653410, "step": 3887, "time_per_iteration": 2.638237714767456 }, { "auxiliary_loss_clip": 0.01187938, "auxiliary_loss_mlp": 0.01028839, "balance_loss_clip": 1.05504048, "balance_loss_mlp": 1.02141237, "epoch": 0.4675043588047857, "flos": 24499265316480.0, "grad_norm": 4.248376903970199, "language_loss": 0.80380124, "learning_rate": 2.306481442303309e-06, "loss": 0.82596898, "num_input_tokens_seen": 83672985, "step": 3888, "time_per_iteration": 2.528179883956909 }, { "auxiliary_loss_clip": 0.01239482, "auxiliary_loss_mlp": 0.01030722, "balance_loss_clip": 1.05370271, "balance_loss_mlp": 1.02265763, "epoch": 0.46762460169542475, "flos": 20960771685120.0, "grad_norm": 2.1150221966962386, "language_loss": 0.73359996, "learning_rate": 2.3057116479491515e-06, "loss": 0.756302, "num_input_tokens_seen": 83692395, "step": 3889, "time_per_iteration": 2.530686616897583 }, { "auxiliary_loss_clip": 0.01235788, "auxiliary_loss_mlp": 0.01022515, "balance_loss_clip": 1.04960752, "balance_loss_mlp": 1.01477242, "epoch": 0.46774484458606386, "flos": 19171666137600.0, "grad_norm": 2.1420045730883506, "language_loss": 0.75860715, "learning_rate": 2.30494180721878e-06, "loss": 0.78119016, "num_input_tokens_seen": 83709735, "step": 3890, "time_per_iteration": 3.4020655155181885 }, { "auxiliary_loss_clip": 0.01235641, "auxiliary_loss_mlp": 0.01024363, "balance_loss_clip": 1.05013847, "balance_loss_mlp": 1.01726127, "epoch": 0.4678650874767029, "flos": 17967689141760.0, "grad_norm": 2.375298980528182, "language_loss": 0.90315241, "learning_rate": 2.3041719202289794e-06, "loss": 0.9257524, "num_input_tokens_seen": 83725910, "step": 3891, "time_per_iteration": 2.6203625202178955 }, { "auxiliary_loss_clip": 0.01242271, "auxiliary_loss_mlp": 0.01024031, "balance_loss_clip": 1.05561066, "balance_loss_mlp": 1.01689923, "epoch": 0.467985330367342, "flos": 21360816432000.0, "grad_norm": 1.9127098147427022, "language_loss": 0.80433381, "learning_rate": 2.30340198709654e-06, "loss": 0.8269968, "num_input_tokens_seen": 83745745, "step": 3892, "time_per_iteration": 2.6290643215179443 }, { "auxiliary_loss_clip": 0.01292987, "auxiliary_loss_mlp": 0.01028502, "balance_loss_clip": 1.04982877, "balance_loss_mlp": 1.0212903, "epoch": 0.46810557325798113, "flos": 20521835487360.0, "grad_norm": 2.1037873169842927, "language_loss": 0.74612689, "learning_rate": 2.3026320079382605e-06, "loss": 0.76934183, "num_input_tokens_seen": 83762680, "step": 3893, "time_per_iteration": 2.7236270904541016 }, { "auxiliary_loss_clip": 0.01188937, "auxiliary_loss_mlp": 0.01029152, "balance_loss_clip": 1.055004, "balance_loss_mlp": 1.0217011, "epoch": 0.4682258161486202, "flos": 30117848572800.0, "grad_norm": 2.990807685843572, "language_loss": 0.76568913, "learning_rate": 2.3018619828709454e-06, "loss": 0.78787005, "num_input_tokens_seen": 83784220, "step": 3894, "time_per_iteration": 2.758061647415161 }, { "auxiliary_loss_clip": 0.01233408, "auxiliary_loss_mlp": 0.02567574, "balance_loss_clip": 1.05220079, "balance_loss_mlp": 1.00017691, "epoch": 0.4683460590392593, "flos": 25293357239040.0, "grad_norm": 1.8623998526309142, "language_loss": 0.81980181, "learning_rate": 2.3010919120114084e-06, "loss": 0.85781163, "num_input_tokens_seen": 83800750, "step": 3895, "time_per_iteration": 2.665712356567383 }, { "auxiliary_loss_clip": 0.01236521, "auxiliary_loss_mlp": 0.01029241, "balance_loss_clip": 1.04951477, "balance_loss_mlp": 1.02144432, "epoch": 0.4684663019298984, "flos": 15368330551680.0, "grad_norm": 8.108567294012968, "language_loss": 0.65924871, "learning_rate": 2.3003217954764672e-06, "loss": 0.68190634, "num_input_tokens_seen": 83815455, "step": 3896, "time_per_iteration": 2.6742501258850098 }, { "auxiliary_loss_clip": 0.0123974, "auxiliary_loss_mlp": 0.01023258, "balance_loss_clip": 1.04980767, "balance_loss_mlp": 1.0161413, "epoch": 0.46858654482053747, "flos": 27778842737280.0, "grad_norm": 1.9334657421026111, "language_loss": 0.7889837, "learning_rate": 2.299551633382949e-06, "loss": 0.81161368, "num_input_tokens_seen": 83835765, "step": 3897, "time_per_iteration": 2.7105257511138916 }, { "auxiliary_loss_clip": 0.01284432, "auxiliary_loss_mlp": 0.01029945, "balance_loss_clip": 1.04896653, "balance_loss_mlp": 1.02200592, "epoch": 0.4687067877111766, "flos": 18040623707520.0, "grad_norm": 1.8932394859409858, "language_loss": 0.85997438, "learning_rate": 2.2987814258476854e-06, "loss": 0.88311815, "num_input_tokens_seen": 83853565, "step": 3898, "time_per_iteration": 2.651214122772217 }, { "auxiliary_loss_clip": 0.01392173, "auxiliary_loss_mlp": 0.01031916, "balance_loss_clip": 1.04541063, "balance_loss_mlp": 1.02360129, "epoch": 0.4688270306018157, "flos": 16977380198400.0, "grad_norm": 2.400775946285375, "language_loss": 0.67831135, "learning_rate": 2.2980111729875177e-06, "loss": 0.7025522, "num_input_tokens_seen": 83869815, "step": 3899, "time_per_iteration": 2.7033140659332275 }, { "auxiliary_loss_clip": 0.0128202, "auxiliary_loss_mlp": 0.01026376, "balance_loss_clip": 1.05202007, "balance_loss_mlp": 1.01851463, "epoch": 0.46894727349245474, "flos": 17821640442240.0, "grad_norm": 1.671405268958826, "language_loss": 0.8216784, "learning_rate": 2.2972408749192917e-06, "loss": 0.84476233, "num_input_tokens_seen": 83887545, "step": 3900, "time_per_iteration": 2.6845109462738037 }, { "auxiliary_loss_clip": 0.0123695, "auxiliary_loss_mlp": 0.02565252, "balance_loss_clip": 1.05410206, "balance_loss_mlp": 1.00013745, "epoch": 0.46906751638309385, "flos": 21471349559040.0, "grad_norm": 2.295814603021938, "language_loss": 0.67158353, "learning_rate": 2.296470531759861e-06, "loss": 0.70960546, "num_input_tokens_seen": 83905645, "step": 3901, "time_per_iteration": 2.6139473915100098 }, { "auxiliary_loss_clip": 0.01329459, "auxiliary_loss_mlp": 0.01031749, "balance_loss_clip": 1.04692769, "balance_loss_mlp": 1.02370214, "epoch": 0.46918775927373296, "flos": 20337829090560.0, "grad_norm": 2.4372271378378425, "language_loss": 0.7959249, "learning_rate": 2.2957001436260866e-06, "loss": 0.81953704, "num_input_tokens_seen": 83922705, "step": 3902, "time_per_iteration": 2.7673256397247314 }, { "auxiliary_loss_clip": 0.01287674, "auxiliary_loss_mlp": 0.01028281, "balance_loss_clip": 1.05063128, "balance_loss_mlp": 1.02066898, "epoch": 0.469308002164372, "flos": 18403249461120.0, "grad_norm": 1.6557719706940468, "language_loss": 0.73064095, "learning_rate": 2.294929710634836e-06, "loss": 0.75380045, "num_input_tokens_seen": 83940795, "step": 3903, "time_per_iteration": 2.6921041011810303 }, { "auxiliary_loss_clip": 0.01237465, "auxiliary_loss_mlp": 0.01026341, "balance_loss_clip": 1.04958045, "balance_loss_mlp": 1.01854467, "epoch": 0.46942824505501113, "flos": 37962067363200.0, "grad_norm": 1.7840306153228374, "language_loss": 0.61354971, "learning_rate": 2.2941592329029823e-06, "loss": 0.63618779, "num_input_tokens_seen": 83961900, "step": 3904, "time_per_iteration": 2.805628776550293 }, { "auxiliary_loss_clip": 0.0123443, "auxiliary_loss_mlp": 0.01032033, "balance_loss_clip": 1.05044317, "balance_loss_mlp": 1.02430868, "epoch": 0.46954848794565024, "flos": 21872507627520.0, "grad_norm": 1.8704253847915475, "language_loss": 0.78853863, "learning_rate": 2.2933887105474067e-06, "loss": 0.81120324, "num_input_tokens_seen": 83980075, "step": 3905, "time_per_iteration": 2.6583614349365234 }, { "auxiliary_loss_clip": 0.01233571, "auxiliary_loss_mlp": 0.01025541, "balance_loss_clip": 1.05239332, "balance_loss_mlp": 1.01844251, "epoch": 0.4696687308362893, "flos": 22016545165440.0, "grad_norm": 1.9940618922678748, "language_loss": 0.81432635, "learning_rate": 2.2926181436849974e-06, "loss": 0.8369174, "num_input_tokens_seen": 83999430, "step": 3906, "time_per_iteration": 2.6614344120025635 }, { "auxiliary_loss_clip": 0.01234559, "auxiliary_loss_mlp": 0.01033211, "balance_loss_clip": 1.05116057, "balance_loss_mlp": 1.02503872, "epoch": 0.4697889737269284, "flos": 21613663244160.0, "grad_norm": 1.681803990448591, "language_loss": 0.72715849, "learning_rate": 2.2918475324326478e-06, "loss": 0.74983621, "num_input_tokens_seen": 84019150, "step": 3907, "time_per_iteration": 4.0589306354522705 }, { "auxiliary_loss_clip": 0.01241157, "auxiliary_loss_mlp": 0.02573023, "balance_loss_clip": 1.05415344, "balance_loss_mlp": 1.00013995, "epoch": 0.46990921661756746, "flos": 25228323665280.0, "grad_norm": 7.554441628052622, "language_loss": 0.90862489, "learning_rate": 2.2910768769072603e-06, "loss": 0.94676673, "num_input_tokens_seen": 84037930, "step": 3908, "time_per_iteration": 2.736666679382324 }, { "auxiliary_loss_clip": 0.01235044, "auxiliary_loss_mlp": 0.01027043, "balance_loss_clip": 1.05088413, "balance_loss_mlp": 1.01953268, "epoch": 0.47002945950820657, "flos": 13844031045120.0, "grad_norm": 1.9065051119870893, "language_loss": 0.75851679, "learning_rate": 2.2903061772257417e-06, "loss": 0.78113765, "num_input_tokens_seen": 84055915, "step": 3909, "time_per_iteration": 3.7427358627319336 }, { "auxiliary_loss_clip": 0.01233858, "auxiliary_loss_mlp": 0.01023904, "balance_loss_clip": 1.05204117, "balance_loss_mlp": 1.01616764, "epoch": 0.4701497023988457, "flos": 26247001374720.0, "grad_norm": 1.6738935323694455, "language_loss": 0.78383553, "learning_rate": 2.289535433505007e-06, "loss": 0.80641311, "num_input_tokens_seen": 84077270, "step": 3910, "time_per_iteration": 2.9135754108428955 }, { "auxiliary_loss_clip": 0.0119042, "auxiliary_loss_mlp": 0.01027652, "balance_loss_clip": 1.04832363, "balance_loss_mlp": 1.02013016, "epoch": 0.47026994528948474, "flos": 25629517647360.0, "grad_norm": 2.3431891396469195, "language_loss": 0.63479877, "learning_rate": 2.2887646458619767e-06, "loss": 0.65697944, "num_input_tokens_seen": 84098635, "step": 3911, "time_per_iteration": 2.802206516265869 }, { "auxiliary_loss_clip": 0.01342178, "auxiliary_loss_mlp": 0.01034646, "balance_loss_clip": 1.04953861, "balance_loss_mlp": 1.0262711, "epoch": 0.47039018818012385, "flos": 20554406144640.0, "grad_norm": 3.4738316532290523, "language_loss": 0.76671708, "learning_rate": 2.2879938144135797e-06, "loss": 0.79048526, "num_input_tokens_seen": 84114740, "step": 3912, "time_per_iteration": 3.668818235397339 }, { "auxiliary_loss_clip": 0.01332811, "auxiliary_loss_mlp": 0.02565897, "balance_loss_clip": 1.04567683, "balance_loss_mlp": 1.0002048, "epoch": 0.47051043107076296, "flos": 21577249831680.0, "grad_norm": 1.6527090530638666, "language_loss": 0.75087452, "learning_rate": 2.2872229392767496e-06, "loss": 0.78986162, "num_input_tokens_seen": 84134845, "step": 3913, "time_per_iteration": 2.8628573417663574 }, { "auxiliary_loss_clip": 0.01242193, "auxiliary_loss_mlp": 0.01029499, "balance_loss_clip": 1.05422044, "balance_loss_mlp": 1.02184021, "epoch": 0.470630673961402, "flos": 18953185662720.0, "grad_norm": 1.6113938928246734, "language_loss": 0.74825269, "learning_rate": 2.286452020568428e-06, "loss": 0.77096963, "num_input_tokens_seen": 84152920, "step": 3914, "time_per_iteration": 2.764180898666382 }, { "auxiliary_loss_clip": 0.01191615, "auxiliary_loss_mlp": 0.01032873, "balance_loss_clip": 1.0528394, "balance_loss_mlp": 1.02503812, "epoch": 0.4707509168520411, "flos": 19938969492480.0, "grad_norm": 1.8337817391145739, "language_loss": 0.73128062, "learning_rate": 2.2856810584055637e-06, "loss": 0.75352556, "num_input_tokens_seen": 84170455, "step": 3915, "time_per_iteration": 2.701190710067749 }, { "auxiliary_loss_clip": 0.01235778, "auxiliary_loss_mlp": 0.01026352, "balance_loss_clip": 1.04959679, "balance_loss_mlp": 1.01889277, "epoch": 0.47087115974268023, "flos": 40118754741120.0, "grad_norm": 1.5506755319063072, "language_loss": 0.67993486, "learning_rate": 2.2849100529051085e-06, "loss": 0.70255613, "num_input_tokens_seen": 84197390, "step": 3916, "time_per_iteration": 3.8579235076904297 }, { "auxiliary_loss_clip": 0.01186462, "auxiliary_loss_mlp": 0.01025168, "balance_loss_clip": 1.05357242, "balance_loss_mlp": 1.01772046, "epoch": 0.4709914026333193, "flos": 13552723745280.0, "grad_norm": 2.975055790916585, "language_loss": 0.8044709, "learning_rate": 2.284139004184026e-06, "loss": 0.8265872, "num_input_tokens_seen": 84214620, "step": 3917, "time_per_iteration": 2.660277843475342 }, { "auxiliary_loss_clip": 0.01187203, "auxiliary_loss_mlp": 0.01032052, "balance_loss_clip": 1.05367887, "balance_loss_mlp": 1.02452993, "epoch": 0.4711116455239584, "flos": 19974628719360.0, "grad_norm": 2.43849666289561, "language_loss": 0.74714828, "learning_rate": 2.2833679123592814e-06, "loss": 0.76934075, "num_input_tokens_seen": 84231880, "step": 3918, "time_per_iteration": 2.673644781112671 }, { "auxiliary_loss_clip": 0.0128288, "auxiliary_loss_mlp": 0.01027055, "balance_loss_clip": 1.05097151, "balance_loss_mlp": 1.01941931, "epoch": 0.4712318884145975, "flos": 32124824064000.0, "grad_norm": 1.833962628451763, "language_loss": 0.63441914, "learning_rate": 2.2825967775478508e-06, "loss": 0.65751845, "num_input_tokens_seen": 84252980, "step": 3919, "time_per_iteration": 2.816105842590332 }, { "auxiliary_loss_clip": 0.0118566, "auxiliary_loss_mlp": 0.01032923, "balance_loss_clip": 1.05080593, "balance_loss_mlp": 1.02520466, "epoch": 0.47135213130523657, "flos": 20047850593920.0, "grad_norm": 2.363930401147122, "language_loss": 0.83542126, "learning_rate": 2.2818255998667135e-06, "loss": 0.85760713, "num_input_tokens_seen": 84271490, "step": 3920, "time_per_iteration": 2.725762128829956 }, { "auxiliary_loss_clip": 0.01234836, "auxiliary_loss_mlp": 0.0102426, "balance_loss_clip": 1.052495, "balance_loss_mlp": 1.01680398, "epoch": 0.4714723741958757, "flos": 19426990988160.0, "grad_norm": 1.7958809792003754, "language_loss": 0.79170597, "learning_rate": 2.2810543794328566e-06, "loss": 0.81429696, "num_input_tokens_seen": 84290525, "step": 3921, "time_per_iteration": 2.8254036903381348 }, { "auxiliary_loss_clip": 0.01239801, "auxiliary_loss_mlp": 0.01025675, "balance_loss_clip": 1.05076838, "balance_loss_mlp": 1.01815319, "epoch": 0.4715926170865148, "flos": 20373883367040.0, "grad_norm": 1.705236295464475, "language_loss": 0.82401478, "learning_rate": 2.2802831163632735e-06, "loss": 0.8466695, "num_input_tokens_seen": 84309245, "step": 3922, "time_per_iteration": 2.715057373046875 }, { "auxiliary_loss_clip": 0.01435909, "auxiliary_loss_mlp": 0.01025209, "balance_loss_clip": 1.0472188, "balance_loss_mlp": 1.01721597, "epoch": 0.47171285997715384, "flos": 22672884430080.0, "grad_norm": 1.8173060506324743, "language_loss": 0.74362874, "learning_rate": 2.279511810774965e-06, "loss": 0.76823986, "num_input_tokens_seen": 84330775, "step": 3923, "time_per_iteration": 2.966808319091797 }, { "auxiliary_loss_clip": 0.01189151, "auxiliary_loss_mlp": 0.01031648, "balance_loss_clip": 1.05330479, "balance_loss_mlp": 1.02382791, "epoch": 0.47183310286779295, "flos": 21105419754240.0, "grad_norm": 1.8949857578185092, "language_loss": 0.71778643, "learning_rate": 2.2787404627849364e-06, "loss": 0.73999441, "num_input_tokens_seen": 84349985, "step": 3924, "time_per_iteration": 2.8080499172210693 }, { "auxiliary_loss_clip": 0.01286847, "auxiliary_loss_mlp": 0.01019657, "balance_loss_clip": 1.04826212, "balance_loss_mlp": 1.01232874, "epoch": 0.471953345758432, "flos": 21726566668800.0, "grad_norm": 1.8122090142116205, "language_loss": 0.79063654, "learning_rate": 2.277969072510202e-06, "loss": 0.81370163, "num_input_tokens_seen": 84368965, "step": 3925, "time_per_iteration": 2.67380690574646 }, { "auxiliary_loss_clip": 0.01288044, "auxiliary_loss_mlp": 0.01029117, "balance_loss_clip": 1.05140662, "balance_loss_mlp": 1.02185118, "epoch": 0.4720735886490711, "flos": 19861078849920.0, "grad_norm": 1.613905506677496, "language_loss": 0.81380552, "learning_rate": 2.2771976400677803e-06, "loss": 0.83697712, "num_input_tokens_seen": 84387795, "step": 3926, "time_per_iteration": 2.6948089599609375 }, { "auxiliary_loss_clip": 0.0137358, "auxiliary_loss_mlp": 0.01026088, "balance_loss_clip": 1.04292881, "balance_loss_mlp": 1.01845813, "epoch": 0.47219383153971023, "flos": 19171809792000.0, "grad_norm": 1.892630437250918, "language_loss": 0.78943223, "learning_rate": 2.2764261655746965e-06, "loss": 0.81342894, "num_input_tokens_seen": 84405290, "step": 3927, "time_per_iteration": 2.818697929382324 }, { "auxiliary_loss_clip": 0.01334178, "auxiliary_loss_mlp": 0.01030857, "balance_loss_clip": 1.04755116, "balance_loss_mlp": 1.02312589, "epoch": 0.4723140744303493, "flos": 23224005780480.0, "grad_norm": 1.8359433197767585, "language_loss": 0.75925875, "learning_rate": 2.2756546491479832e-06, "loss": 0.78290904, "num_input_tokens_seen": 84426205, "step": 3928, "time_per_iteration": 2.7749733924865723 }, { "auxiliary_loss_clip": 0.01188793, "auxiliary_loss_mlp": 0.02567814, "balance_loss_clip": 1.0528332, "balance_loss_mlp": 1.00021577, "epoch": 0.4724343173209884, "flos": 18223265387520.0, "grad_norm": 3.6964995985432156, "language_loss": 0.79862845, "learning_rate": 2.274883090904679e-06, "loss": 0.83619452, "num_input_tokens_seen": 84443970, "step": 3929, "time_per_iteration": 2.71064829826355 }, { "auxiliary_loss_clip": 0.01190962, "auxiliary_loss_mlp": 0.01032534, "balance_loss_clip": 1.05483794, "balance_loss_mlp": 1.0253098, "epoch": 0.4725545602116275, "flos": 21251037490560.0, "grad_norm": 2.39846411248549, "language_loss": 0.67616296, "learning_rate": 2.2741114909618283e-06, "loss": 0.69839787, "num_input_tokens_seen": 84459865, "step": 3930, "time_per_iteration": 2.675497531890869 }, { "auxiliary_loss_clip": 0.01332995, "auxiliary_loss_mlp": 0.01026089, "balance_loss_clip": 1.0483408, "balance_loss_mlp": 1.01854324, "epoch": 0.47267480310226656, "flos": 21434002392960.0, "grad_norm": 1.6638324099100241, "language_loss": 0.72121102, "learning_rate": 2.2733398494364828e-06, "loss": 0.74480188, "num_input_tokens_seen": 84479110, "step": 3931, "time_per_iteration": 2.791306734085083 }, { "auxiliary_loss_clip": 0.01287824, "auxiliary_loss_mlp": 0.01030485, "balance_loss_clip": 1.05499268, "balance_loss_mlp": 1.02270615, "epoch": 0.47279504599290567, "flos": 18770508069120.0, "grad_norm": 2.7699453221914396, "language_loss": 0.84438396, "learning_rate": 2.272568166445699e-06, "loss": 0.86756706, "num_input_tokens_seen": 84497675, "step": 3932, "time_per_iteration": 2.6907429695129395 }, { "auxiliary_loss_clip": 0.01236563, "auxiliary_loss_mlp": 0.01024644, "balance_loss_clip": 1.05134284, "balance_loss_mlp": 1.0172528, "epoch": 0.4729152888835448, "flos": 21105742976640.0, "grad_norm": 2.11056153672006, "language_loss": 0.64665806, "learning_rate": 2.271796442106541e-06, "loss": 0.6692701, "num_input_tokens_seen": 84517030, "step": 3933, "time_per_iteration": 4.231297254562378 }, { "auxiliary_loss_clip": 0.01236998, "auxiliary_loss_mlp": 0.01000255, "balance_loss_clip": 1.01772606, "balance_loss_mlp": 0.99903917, "epoch": 0.47303553177418384, "flos": 70201877840640.0, "grad_norm": 0.7974589522435358, "language_loss": 0.56440485, "learning_rate": 2.271024676536079e-06, "loss": 0.58677745, "num_input_tokens_seen": 84577290, "step": 3934, "time_per_iteration": 3.2665016651153564 }, { "auxiliary_loss_clip": 0.01292692, "auxiliary_loss_mlp": 0.01033764, "balance_loss_clip": 1.05306506, "balance_loss_mlp": 1.02556801, "epoch": 0.47315577466482295, "flos": 22455122227200.0, "grad_norm": 2.1421724363186407, "language_loss": 0.73627174, "learning_rate": 2.2702528698513894e-06, "loss": 0.75953627, "num_input_tokens_seen": 84598415, "step": 3935, "time_per_iteration": 3.633065938949585 }, { "auxiliary_loss_clip": 0.01286365, "auxiliary_loss_mlp": 0.01026082, "balance_loss_clip": 1.0466001, "balance_loss_mlp": 1.01815164, "epoch": 0.47327601755546206, "flos": 24352857480960.0, "grad_norm": 1.896690986200204, "language_loss": 0.78522491, "learning_rate": 2.269481022169554e-06, "loss": 0.80834943, "num_input_tokens_seen": 84617010, "step": 3936, "time_per_iteration": 2.7973756790161133 }, { "auxiliary_loss_clip": 0.01293496, "auxiliary_loss_mlp": 0.01029545, "balance_loss_clip": 1.05051684, "balance_loss_mlp": 1.02184963, "epoch": 0.4733962604461011, "flos": 22926772736640.0, "grad_norm": 3.0629398692096808, "language_loss": 0.80557334, "learning_rate": 2.2687091336076614e-06, "loss": 0.82880366, "num_input_tokens_seen": 84636350, "step": 3937, "time_per_iteration": 2.8156309127807617 }, { "auxiliary_loss_clip": 0.01234401, "auxiliary_loss_mlp": 0.01027518, "balance_loss_clip": 1.05138195, "balance_loss_mlp": 1.0199182, "epoch": 0.4735165033367402, "flos": 18327369980160.0, "grad_norm": 2.2958506021619662, "language_loss": 0.80162716, "learning_rate": 2.267937204282807e-06, "loss": 0.82424635, "num_input_tokens_seen": 84653490, "step": 3938, "time_per_iteration": 3.5652012825012207 }, { "auxiliary_loss_clip": 0.01246335, "auxiliary_loss_mlp": 0.01028833, "balance_loss_clip": 1.0558207, "balance_loss_mlp": 1.0196836, "epoch": 0.4736367462273793, "flos": 23037018554880.0, "grad_norm": 2.4874566271943945, "language_loss": 0.79096305, "learning_rate": 2.2671652343120926e-06, "loss": 0.8137148, "num_input_tokens_seen": 84673965, "step": 3939, "time_per_iteration": 2.766378879547119 }, { "auxiliary_loss_clip": 0.01188449, "auxiliary_loss_mlp": 0.01024794, "balance_loss_clip": 1.05473793, "balance_loss_mlp": 1.0175643, "epoch": 0.4737569891180184, "flos": 25374336451200.0, "grad_norm": 1.762274535106121, "language_loss": 0.80377519, "learning_rate": 2.2663932238126236e-06, "loss": 0.82590765, "num_input_tokens_seen": 84692525, "step": 3940, "time_per_iteration": 2.706486940383911 }, { "auxiliary_loss_clip": 0.01233078, "auxiliary_loss_mlp": 0.0102588, "balance_loss_clip": 1.0484035, "balance_loss_mlp": 1.0183574, "epoch": 0.4738772320086575, "flos": 25849326925440.0, "grad_norm": 2.18132645301693, "language_loss": 0.80035657, "learning_rate": 2.265621172901515e-06, "loss": 0.82294619, "num_input_tokens_seen": 84715640, "step": 3941, "time_per_iteration": 2.762964963912964 }, { "auxiliary_loss_clip": 0.01190749, "auxiliary_loss_mlp": 0.01030535, "balance_loss_clip": 1.05653024, "balance_loss_mlp": 1.02254462, "epoch": 0.47399747489929656, "flos": 27564420499200.0, "grad_norm": 4.0417218395442704, "language_loss": 0.71655589, "learning_rate": 2.2648490816958854e-06, "loss": 0.7387687, "num_input_tokens_seen": 84736635, "step": 3942, "time_per_iteration": 3.7417593002319336 }, { "auxiliary_loss_clip": 0.01236401, "auxiliary_loss_mlp": 0.01027215, "balance_loss_clip": 1.04952419, "balance_loss_mlp": 1.01953197, "epoch": 0.47411771778993567, "flos": 24863650836480.0, "grad_norm": 2.2557851228150065, "language_loss": 0.73376787, "learning_rate": 2.264076950312861e-06, "loss": 0.75640404, "num_input_tokens_seen": 84755445, "step": 3943, "time_per_iteration": 2.6926236152648926 }, { "auxiliary_loss_clip": 0.01291465, "auxiliary_loss_mlp": 0.01024201, "balance_loss_clip": 1.05187368, "balance_loss_mlp": 1.01658976, "epoch": 0.4742379606805748, "flos": 22748009725440.0, "grad_norm": 2.249737052717874, "language_loss": 0.82289582, "learning_rate": 2.2633047788695727e-06, "loss": 0.84605253, "num_input_tokens_seen": 84775750, "step": 3944, "time_per_iteration": 2.782825231552124 }, { "auxiliary_loss_clip": 0.01284332, "auxiliary_loss_mlp": 0.01026, "balance_loss_clip": 1.05080104, "balance_loss_mlp": 1.01895213, "epoch": 0.47435820357121383, "flos": 19681130689920.0, "grad_norm": 1.927246801049443, "language_loss": 0.64257133, "learning_rate": 2.262532567483159e-06, "loss": 0.66567469, "num_input_tokens_seen": 84794310, "step": 3945, "time_per_iteration": 2.668940782546997 }, { "auxiliary_loss_clip": 0.01190014, "auxiliary_loss_mlp": 0.02566846, "balance_loss_clip": 1.05527449, "balance_loss_mlp": 1.00031841, "epoch": 0.47447844646185294, "flos": 25228718714880.0, "grad_norm": 4.739263912519275, "language_loss": 0.80288005, "learning_rate": 2.2617603162707635e-06, "loss": 0.84044856, "num_input_tokens_seen": 84814720, "step": 3946, "time_per_iteration": 2.736337184906006 }, { "auxiliary_loss_clip": 0.01187654, "auxiliary_loss_mlp": 0.01029158, "balance_loss_clip": 1.05384421, "balance_loss_mlp": 1.02118635, "epoch": 0.47459868935249205, "flos": 24570619683840.0, "grad_norm": 2.1389869362654697, "language_loss": 0.82371402, "learning_rate": 2.2609880253495363e-06, "loss": 0.84588218, "num_input_tokens_seen": 84834355, "step": 3947, "time_per_iteration": 2.6820895671844482 }, { "auxiliary_loss_clip": 0.01247716, "auxiliary_loss_mlp": 0.01025103, "balance_loss_clip": 1.04741144, "balance_loss_mlp": 1.01769376, "epoch": 0.4747189322431311, "flos": 20558500295040.0, "grad_norm": 2.239605966113392, "language_loss": 0.86783105, "learning_rate": 2.260215694836633e-06, "loss": 0.89055926, "num_input_tokens_seen": 84853530, "step": 3948, "time_per_iteration": 2.79030704498291 }, { "auxiliary_loss_clip": 0.01387552, "auxiliary_loss_mlp": 0.02565579, "balance_loss_clip": 1.04549289, "balance_loss_mlp": 1.00024807, "epoch": 0.4748391751337702, "flos": 25995231970560.0, "grad_norm": 2.1805686669858333, "language_loss": 0.64921361, "learning_rate": 2.2594433248492157e-06, "loss": 0.68874496, "num_input_tokens_seen": 84872505, "step": 3949, "time_per_iteration": 2.885796308517456 }, { "auxiliary_loss_clip": 0.01243497, "auxiliary_loss_mlp": 0.01027942, "balance_loss_clip": 1.05204296, "balance_loss_mlp": 1.01956737, "epoch": 0.47495941802440933, "flos": 22821052032000.0, "grad_norm": 1.6710842657326714, "language_loss": 0.80548239, "learning_rate": 2.2586709155044527e-06, "loss": 0.82819688, "num_input_tokens_seen": 84893105, "step": 3950, "time_per_iteration": 2.791947603225708 }, { "auxiliary_loss_clip": 0.01187589, "auxiliary_loss_mlp": 0.01026322, "balance_loss_clip": 1.05318379, "balance_loss_mlp": 1.01894593, "epoch": 0.4750796609150484, "flos": 27891782075520.0, "grad_norm": 1.5025861796852586, "language_loss": 0.75974464, "learning_rate": 2.2578984669195167e-06, "loss": 0.78188384, "num_input_tokens_seen": 84914070, "step": 3951, "time_per_iteration": 2.7215468883514404 }, { "auxiliary_loss_clip": 0.01230782, "auxiliary_loss_mlp": 0.01024449, "balance_loss_clip": 1.04672122, "balance_loss_mlp": 1.01730275, "epoch": 0.4751999038056875, "flos": 35660085471360.0, "grad_norm": 1.8412168329249965, "language_loss": 0.67742682, "learning_rate": 2.2571259792115887e-06, "loss": 0.69997919, "num_input_tokens_seen": 84935290, "step": 3952, "time_per_iteration": 2.750565528869629 }, { "auxiliary_loss_clip": 0.01232454, "auxiliary_loss_mlp": 0.01027539, "balance_loss_clip": 1.05063593, "balance_loss_mlp": 1.02052045, "epoch": 0.4753201466963266, "flos": 22090880361600.0, "grad_norm": 1.6735227537574664, "language_loss": 0.7958945, "learning_rate": 2.2563534524978544e-06, "loss": 0.81849444, "num_input_tokens_seen": 84952760, "step": 3953, "time_per_iteration": 2.729879856109619 }, { "auxiliary_loss_clip": 0.01339083, "auxiliary_loss_mlp": 0.01026631, "balance_loss_clip": 1.05397201, "balance_loss_mlp": 1.01920474, "epoch": 0.47544038958696566, "flos": 30190854965760.0, "grad_norm": 1.6204710135236071, "language_loss": 0.70550346, "learning_rate": 2.2555808868955052e-06, "loss": 0.72916067, "num_input_tokens_seen": 84974890, "step": 3954, "time_per_iteration": 2.8586926460266113 }, { "auxiliary_loss_clip": 0.01381629, "auxiliary_loss_mlp": 0.01023773, "balance_loss_clip": 1.04702067, "balance_loss_mlp": 1.01631606, "epoch": 0.47556063247760477, "flos": 23472219738240.0, "grad_norm": 3.1507245964209707, "language_loss": 0.73641163, "learning_rate": 2.254808282521738e-06, "loss": 0.76046568, "num_input_tokens_seen": 84993640, "step": 3955, "time_per_iteration": 2.849005937576294 }, { "auxiliary_loss_clip": 0.01341423, "auxiliary_loss_mlp": 0.02566301, "balance_loss_clip": 1.04908919, "balance_loss_mlp": 1.00026488, "epoch": 0.4756808753682438, "flos": 25155209531520.0, "grad_norm": 6.851139644165576, "language_loss": 0.81267321, "learning_rate": 2.2540356394937573e-06, "loss": 0.85175037, "num_input_tokens_seen": 85012340, "step": 3956, "time_per_iteration": 2.798614978790283 }, { "auxiliary_loss_clip": 0.0133892, "auxiliary_loss_mlp": 0.0102838, "balance_loss_clip": 1.0477376, "balance_loss_mlp": 1.01981449, "epoch": 0.47580111825888294, "flos": 15669729573120.0, "grad_norm": 2.4476015448329025, "language_loss": 0.83901536, "learning_rate": 2.253262957928772e-06, "loss": 0.86268836, "num_input_tokens_seen": 85029225, "step": 3957, "time_per_iteration": 2.7657079696655273 }, { "auxiliary_loss_clip": 0.01284287, "auxiliary_loss_mlp": 0.01033326, "balance_loss_clip": 1.04775882, "balance_loss_mlp": 1.02575648, "epoch": 0.47592136114952205, "flos": 17636556637440.0, "grad_norm": 1.7039129765766372, "language_loss": 0.71943408, "learning_rate": 2.2524902379439976e-06, "loss": 0.74261022, "num_input_tokens_seen": 85047895, "step": 3958, "time_per_iteration": 2.7573227882385254 }, { "auxiliary_loss_clip": 0.01360223, "auxiliary_loss_mlp": 0.01014767, "balance_loss_clip": 1.02938914, "balance_loss_mlp": 1.01376581, "epoch": 0.4760416040401611, "flos": 61417159292160.0, "grad_norm": 0.7441109632420955, "language_loss": 0.63659275, "learning_rate": 2.251717479656655e-06, "loss": 0.66034269, "num_input_tokens_seen": 85112690, "step": 3959, "time_per_iteration": 4.5547401905059814 }, { "auxiliary_loss_clip": 0.01190654, "auxiliary_loss_mlp": 0.01031635, "balance_loss_clip": 1.0541631, "balance_loss_mlp": 1.0245657, "epoch": 0.4761618469308002, "flos": 18405871153920.0, "grad_norm": 2.3418153878350645, "language_loss": 0.76518071, "learning_rate": 2.2509446831839704e-06, "loss": 0.78740358, "num_input_tokens_seen": 85132130, "step": 3960, "time_per_iteration": 2.8758199214935303 }, { "auxiliary_loss_clip": 0.01285627, "auxiliary_loss_mlp": 0.01029408, "balance_loss_clip": 1.047768, "balance_loss_mlp": 1.02195752, "epoch": 0.4762820898214393, "flos": 18040911016320.0, "grad_norm": 3.0476294303179765, "language_loss": 0.82148439, "learning_rate": 2.250171848643177e-06, "loss": 0.84463471, "num_input_tokens_seen": 85149420, "step": 3961, "time_per_iteration": 4.0129101276397705 }, { "auxiliary_loss_clip": 0.01278668, "auxiliary_loss_mlp": 0.01030721, "balance_loss_clip": 1.05009365, "balance_loss_mlp": 1.02330291, "epoch": 0.4764023327120784, "flos": 19318253541120.0, "grad_norm": 1.8549003156613535, "language_loss": 0.85983694, "learning_rate": 2.249398976151513e-06, "loss": 0.88293087, "num_input_tokens_seen": 85166970, "step": 3962, "time_per_iteration": 2.7475900650024414 }, { "auxiliary_loss_clip": 0.01186669, "auxiliary_loss_mlp": 0.01026176, "balance_loss_clip": 1.05422711, "balance_loss_mlp": 1.01897013, "epoch": 0.4765225756027175, "flos": 22747255539840.0, "grad_norm": 2.4345417609650157, "language_loss": 0.78772306, "learning_rate": 2.248626065826223e-06, "loss": 0.80985147, "num_input_tokens_seen": 85185175, "step": 3963, "time_per_iteration": 2.7157766819000244 }, { "auxiliary_loss_clip": 0.01083894, "auxiliary_loss_mlp": 0.01004806, "balance_loss_clip": 1.02077532, "balance_loss_mlp": 1.00370932, "epoch": 0.4766428184933566, "flos": 65933392106880.0, "grad_norm": 0.7852950522344894, "language_loss": 0.62578619, "learning_rate": 2.2478531177845564e-06, "loss": 0.64667308, "num_input_tokens_seen": 85246170, "step": 3964, "time_per_iteration": 4.031198263168335 }, { "auxiliary_loss_clip": 0.01290135, "auxiliary_loss_mlp": 0.01032674, "balance_loss_clip": 1.0540688, "balance_loss_mlp": 1.02500272, "epoch": 0.47676306138399566, "flos": 24136495908480.0, "grad_norm": 1.78785069531535, "language_loss": 0.85008192, "learning_rate": 2.247080132143769e-06, "loss": 0.87330997, "num_input_tokens_seen": 85268525, "step": 3965, "time_per_iteration": 2.7172610759735107 }, { "auxiliary_loss_clip": 0.0133418, "auxiliary_loss_mlp": 0.01027052, "balance_loss_clip": 1.0442946, "balance_loss_mlp": 1.01935077, "epoch": 0.47688330427463477, "flos": 12604322995200.0, "grad_norm": 2.572148498916762, "language_loss": 0.68935776, "learning_rate": 2.246307109021121e-06, "loss": 0.71297008, "num_input_tokens_seen": 85285930, "step": 3966, "time_per_iteration": 2.787047863006592 }, { "auxiliary_loss_clip": 0.01285031, "auxiliary_loss_mlp": 0.01026456, "balance_loss_clip": 1.04786777, "balance_loss_mlp": 1.01924658, "epoch": 0.4770035471652739, "flos": 21390585828480.0, "grad_norm": 1.627610406016541, "language_loss": 0.82048762, "learning_rate": 2.2455340485338817e-06, "loss": 0.84360254, "num_input_tokens_seen": 85303565, "step": 3967, "time_per_iteration": 2.643749952316284 }, { "auxiliary_loss_clip": 0.01236945, "auxiliary_loss_mlp": 0.01033621, "balance_loss_clip": 1.0509541, "balance_loss_mlp": 1.02667713, "epoch": 0.47712379005591293, "flos": 25156251025920.0, "grad_norm": 3.269736722692732, "language_loss": 0.6760323, "learning_rate": 2.244760950799322e-06, "loss": 0.69873798, "num_input_tokens_seen": 85321835, "step": 3968, "time_per_iteration": 3.6445305347442627 }, { "auxiliary_loss_clip": 0.01323409, "auxiliary_loss_mlp": 0.01027644, "balance_loss_clip": 1.0492034, "balance_loss_mlp": 1.02062893, "epoch": 0.47724403294655204, "flos": 22054323294720.0, "grad_norm": 2.1609394316323054, "language_loss": 0.72725427, "learning_rate": 2.2439878159347203e-06, "loss": 0.75076485, "num_input_tokens_seen": 85341260, "step": 3969, "time_per_iteration": 2.958996057510376 }, { "auxiliary_loss_clip": 0.01081325, "auxiliary_loss_mlp": 0.01004782, "balance_loss_clip": 1.01822031, "balance_loss_mlp": 1.003703, "epoch": 0.4773642758371911, "flos": 70229387658240.0, "grad_norm": 0.7286375436426726, "language_loss": 0.55233574, "learning_rate": 2.2432146440573616e-06, "loss": 0.57319677, "num_input_tokens_seen": 85407220, "step": 3970, "time_per_iteration": 3.334859848022461 }, { "auxiliary_loss_clip": 0.01188662, "auxiliary_loss_mlp": 0.01024769, "balance_loss_clip": 1.05181313, "balance_loss_mlp": 1.0172739, "epoch": 0.4774845187278302, "flos": 23548602009600.0, "grad_norm": 2.208211564161935, "language_loss": 0.66675234, "learning_rate": 2.242441435284534e-06, "loss": 0.68888664, "num_input_tokens_seen": 85426095, "step": 3971, "time_per_iteration": 2.7501134872436523 }, { "auxiliary_loss_clip": 0.01233546, "auxiliary_loss_mlp": 0.01029786, "balance_loss_clip": 1.05142403, "balance_loss_mlp": 1.02179933, "epoch": 0.4776047616184693, "flos": 23075371301760.0, "grad_norm": 2.720810644724066, "language_loss": 0.85459173, "learning_rate": 2.2416681897335337e-06, "loss": 0.87722498, "num_input_tokens_seen": 85444245, "step": 3972, "time_per_iteration": 2.687241315841675 }, { "auxiliary_loss_clip": 0.0137802, "auxiliary_loss_mlp": 0.01025991, "balance_loss_clip": 1.04877186, "balance_loss_mlp": 1.01840901, "epoch": 0.4777250045091084, "flos": 31898119374720.0, "grad_norm": 2.2456607616726956, "language_loss": 0.67007506, "learning_rate": 2.240894907521661e-06, "loss": 0.69411522, "num_input_tokens_seen": 85463325, "step": 3973, "time_per_iteration": 2.899890184402466 }, { "auxiliary_loss_clip": 0.01283774, "auxiliary_loss_mlp": 0.01027415, "balance_loss_clip": 1.04765558, "balance_loss_mlp": 1.01977956, "epoch": 0.4778452473997475, "flos": 24278163148800.0, "grad_norm": 2.2306348616346727, "language_loss": 0.63761413, "learning_rate": 2.240121588766223e-06, "loss": 0.66072607, "num_input_tokens_seen": 85483375, "step": 3974, "time_per_iteration": 2.763699531555176 }, { "auxiliary_loss_clip": 0.01276002, "auxiliary_loss_mlp": 0.01026766, "balance_loss_clip": 1.04719388, "balance_loss_mlp": 1.01958346, "epoch": 0.4779654902903866, "flos": 31575031516800.0, "grad_norm": 1.8546795276002148, "language_loss": 0.71640593, "learning_rate": 2.239348233584531e-06, "loss": 0.73943365, "num_input_tokens_seen": 85504230, "step": 3975, "time_per_iteration": 2.7645697593688965 }, { "auxiliary_loss_clip": 0.01238331, "auxiliary_loss_mlp": 0.01024291, "balance_loss_clip": 1.05183995, "balance_loss_mlp": 1.01682806, "epoch": 0.47808573318102565, "flos": 19500428344320.0, "grad_norm": 1.8359139387174148, "language_loss": 0.809044, "learning_rate": 2.2385748420939013e-06, "loss": 0.83167022, "num_input_tokens_seen": 85523425, "step": 3976, "time_per_iteration": 2.699110507965088 }, { "auxiliary_loss_clip": 0.01185931, "auxiliary_loss_mlp": 0.01029795, "balance_loss_clip": 1.05499864, "balance_loss_mlp": 1.02292848, "epoch": 0.47820597607166476, "flos": 22601135013120.0, "grad_norm": 1.732870255702771, "language_loss": 0.72781718, "learning_rate": 2.2378014144116583e-06, "loss": 0.74997443, "num_input_tokens_seen": 85542235, "step": 3977, "time_per_iteration": 2.634321451187134 }, { "auxiliary_loss_clip": 0.01187781, "auxiliary_loss_mlp": 0.01030561, "balance_loss_clip": 1.05303824, "balance_loss_mlp": 1.0232414, "epoch": 0.4783262189623039, "flos": 23003011353600.0, "grad_norm": 2.470771129311594, "language_loss": 0.79700643, "learning_rate": 2.23702795065513e-06, "loss": 0.81918991, "num_input_tokens_seen": 85561815, "step": 3978, "time_per_iteration": 2.701625347137451 }, { "auxiliary_loss_clip": 0.01133142, "auxiliary_loss_mlp": 0.01001718, "balance_loss_clip": 1.01624656, "balance_loss_mlp": 1.00072312, "epoch": 0.47844646185294293, "flos": 49772801226240.0, "grad_norm": 0.9826473853767607, "language_loss": 0.67395473, "learning_rate": 2.2362544509416493e-06, "loss": 0.69530332, "num_input_tokens_seen": 85613930, "step": 3979, "time_per_iteration": 3.0818591117858887 }, { "auxiliary_loss_clip": 0.01276178, "auxiliary_loss_mlp": 0.01024332, "balance_loss_clip": 1.04636824, "balance_loss_mlp": 1.01741219, "epoch": 0.47856670474358204, "flos": 20229558520320.0, "grad_norm": 2.4093009860714574, "language_loss": 0.82856303, "learning_rate": 2.2354809153885572e-06, "loss": 0.8515681, "num_input_tokens_seen": 85631000, "step": 3980, "time_per_iteration": 2.736797332763672 }, { "auxiliary_loss_clip": 0.01234592, "auxiliary_loss_mlp": 0.01026413, "balance_loss_clip": 1.05011404, "balance_loss_mlp": 1.018646, "epoch": 0.47868694763422115, "flos": 20990936131200.0, "grad_norm": 2.719277404021331, "language_loss": 0.82800543, "learning_rate": 2.234707344113197e-06, "loss": 0.8506155, "num_input_tokens_seen": 85649095, "step": 3981, "time_per_iteration": 2.6978394985198975 }, { "auxiliary_loss_clip": 0.01182608, "auxiliary_loss_mlp": 0.01023522, "balance_loss_clip": 1.05162776, "balance_loss_mlp": 1.01624155, "epoch": 0.4788071905248602, "flos": 19026551191680.0, "grad_norm": 1.6641792501909178, "language_loss": 0.77446038, "learning_rate": 2.233933737232919e-06, "loss": 0.79652166, "num_input_tokens_seen": 85666875, "step": 3982, "time_per_iteration": 2.6217119693756104 }, { "auxiliary_loss_clip": 0.01379545, "auxiliary_loss_mlp": 0.02562996, "balance_loss_clip": 1.04569101, "balance_loss_mlp": 1.00035119, "epoch": 0.4789274334154993, "flos": 23002221254400.0, "grad_norm": 2.0204986668225327, "language_loss": 0.78319514, "learning_rate": 2.2331600948650793e-06, "loss": 0.82262057, "num_input_tokens_seen": 85687020, "step": 3983, "time_per_iteration": 2.873868465423584 }, { "auxiliary_loss_clip": 0.01335084, "auxiliary_loss_mlp": 0.02571538, "balance_loss_clip": 1.0519942, "balance_loss_mlp": 1.00033474, "epoch": 0.4790476763061384, "flos": 23075586783360.0, "grad_norm": 1.7715765350156634, "language_loss": 0.80095851, "learning_rate": 2.2323864171270386e-06, "loss": 0.84002477, "num_input_tokens_seen": 85708290, "step": 3984, "time_per_iteration": 2.754855155944824 }, { "auxiliary_loss_clip": 0.01336303, "auxiliary_loss_mlp": 0.01025142, "balance_loss_clip": 1.04648423, "balance_loss_mlp": 1.01765847, "epoch": 0.4791679191967775, "flos": 21179288073600.0, "grad_norm": 1.930216739826783, "language_loss": 0.72416496, "learning_rate": 2.231612704136164e-06, "loss": 0.74777949, "num_input_tokens_seen": 85728660, "step": 3985, "time_per_iteration": 3.6903622150421143 }, { "auxiliary_loss_clip": 0.01231419, "auxiliary_loss_mlp": 0.01026632, "balance_loss_clip": 1.04853737, "balance_loss_mlp": 1.01912189, "epoch": 0.4792881620874166, "flos": 22301495758080.0, "grad_norm": 2.491800884423435, "language_loss": 0.75007451, "learning_rate": 2.2308389560098253e-06, "loss": 0.77265501, "num_input_tokens_seen": 85745035, "step": 3986, "time_per_iteration": 2.6690518856048584 }, { "auxiliary_loss_clip": 0.01248974, "auxiliary_loss_mlp": 0.01031561, "balance_loss_clip": 1.05420542, "balance_loss_mlp": 1.0233357, "epoch": 0.47940840497805565, "flos": 17420877423360.0, "grad_norm": 4.479306698256506, "language_loss": 0.77117801, "learning_rate": 2.2300651728654008e-06, "loss": 0.79398328, "num_input_tokens_seen": 85760295, "step": 3987, "time_per_iteration": 3.5420398712158203 }, { "auxiliary_loss_clip": 0.0112577, "auxiliary_loss_mlp": 0.02505656, "balance_loss_clip": 1.01565099, "balance_loss_mlp": 1.00013447, "epoch": 0.47952864786869476, "flos": 65358175708800.0, "grad_norm": 0.7220814177731106, "language_loss": 0.60164118, "learning_rate": 2.229291354820272e-06, "loss": 0.63795543, "num_input_tokens_seen": 85821305, "step": 3988, "time_per_iteration": 3.2551169395446777 }, { "auxiliary_loss_clip": 0.01235458, "auxiliary_loss_mlp": 0.0102678, "balance_loss_clip": 1.04915416, "balance_loss_mlp": 1.01864433, "epoch": 0.47964889075933387, "flos": 16799802336000.0, "grad_norm": 2.3409474135061856, "language_loss": 0.75594026, "learning_rate": 2.228517501991828e-06, "loss": 0.77856261, "num_input_tokens_seen": 85840105, "step": 3989, "time_per_iteration": 2.6436736583709717 }, { "auxiliary_loss_clip": 0.01178879, "auxiliary_loss_mlp": 0.01002444, "balance_loss_clip": 1.0155282, "balance_loss_mlp": 1.00121617, "epoch": 0.4797691336499729, "flos": 70079244808320.0, "grad_norm": 0.8071989060090927, "language_loss": 0.61009216, "learning_rate": 2.22774361449746e-06, "loss": 0.63190532, "num_input_tokens_seen": 85896585, "step": 3990, "time_per_iteration": 4.235157489776611 }, { "auxiliary_loss_clip": 0.01438982, "auxiliary_loss_mlp": 0.01029005, "balance_loss_clip": 1.048491, "balance_loss_mlp": 1.02076149, "epoch": 0.47988937654061203, "flos": 18953329317120.0, "grad_norm": 2.642432576211445, "language_loss": 0.70379174, "learning_rate": 2.2269696924545668e-06, "loss": 0.72847164, "num_input_tokens_seen": 85914415, "step": 3991, "time_per_iteration": 2.854766368865967 }, { "auxiliary_loss_clip": 0.01331106, "auxiliary_loss_mlp": 0.01026927, "balance_loss_clip": 1.04929352, "balance_loss_mlp": 1.02030158, "epoch": 0.48000961943125114, "flos": 14461981649280.0, "grad_norm": 2.3437818584386, "language_loss": 0.78236985, "learning_rate": 2.2261957359805523e-06, "loss": 0.80595016, "num_input_tokens_seen": 85931650, "step": 3992, "time_per_iteration": 2.6875789165496826 }, { "auxiliary_loss_clip": 0.0118451, "auxiliary_loss_mlp": 0.01027323, "balance_loss_clip": 1.0512861, "balance_loss_mlp": 1.01927066, "epoch": 0.4801298623218902, "flos": 27051149105280.0, "grad_norm": 2.1023363148315455, "language_loss": 0.7421025, "learning_rate": 2.225421745192823e-06, "loss": 0.76422083, "num_input_tokens_seen": 85951805, "step": 3993, "time_per_iteration": 2.704941987991333 }, { "auxiliary_loss_clip": 0.01233092, "auxiliary_loss_mlp": 0.01029955, "balance_loss_clip": 1.05104232, "balance_loss_mlp": 1.02112722, "epoch": 0.4802501052125293, "flos": 26355236031360.0, "grad_norm": 3.249295590247212, "language_loss": 0.78285152, "learning_rate": 2.2246477202087955e-06, "loss": 0.80548203, "num_input_tokens_seen": 85972485, "step": 3994, "time_per_iteration": 3.5701184272766113 }, { "auxiliary_loss_clip": 0.01285908, "auxiliary_loss_mlp": 0.01024938, "balance_loss_clip": 1.04760158, "balance_loss_mlp": 1.01790452, "epoch": 0.4803703481031684, "flos": 20993916960000.0, "grad_norm": 1.8445970586096467, "language_loss": 0.83014798, "learning_rate": 2.223873661145887e-06, "loss": 0.85325646, "num_input_tokens_seen": 85992540, "step": 3995, "time_per_iteration": 2.6784660816192627 }, { "auxiliary_loss_clip": 0.01290519, "auxiliary_loss_mlp": 0.02565776, "balance_loss_clip": 1.05639935, "balance_loss_mlp": 1.00037444, "epoch": 0.4804905909938075, "flos": 20703722981760.0, "grad_norm": 1.6338998398076803, "language_loss": 0.7158978, "learning_rate": 2.2230995681215226e-06, "loss": 0.75446075, "num_input_tokens_seen": 86012065, "step": 3996, "time_per_iteration": 2.781186819076538 }, { "auxiliary_loss_clip": 0.01330022, "auxiliary_loss_mlp": 0.01021669, "balance_loss_clip": 1.04665768, "balance_loss_mlp": 1.01404512, "epoch": 0.4806108338844466, "flos": 16654831044480.0, "grad_norm": 3.924458019417071, "language_loss": 0.77876991, "learning_rate": 2.2223254412531305e-06, "loss": 0.80228674, "num_input_tokens_seen": 86029435, "step": 3997, "time_per_iteration": 2.7020909786224365 }, { "auxiliary_loss_clip": 0.01275562, "auxiliary_loss_mlp": 0.01023281, "balance_loss_clip": 1.04579413, "balance_loss_mlp": 1.01683223, "epoch": 0.4807310767750857, "flos": 20011329440640.0, "grad_norm": 2.832967154021592, "language_loss": 0.8174113, "learning_rate": 2.221551280658146e-06, "loss": 0.84039974, "num_input_tokens_seen": 86048495, "step": 3998, "time_per_iteration": 2.7537178993225098 }, { "auxiliary_loss_clip": 0.0138136, "auxiliary_loss_mlp": 0.01039797, "balance_loss_clip": 1.04673147, "balance_loss_mlp": 1.03219116, "epoch": 0.48085131966572475, "flos": 23185257984000.0, "grad_norm": 5.20948616848376, "language_loss": 0.74222827, "learning_rate": 2.2207770864540085e-06, "loss": 0.7664398, "num_input_tokens_seen": 86067470, "step": 3999, "time_per_iteration": 2.8132967948913574 }, { "auxiliary_loss_clip": 0.01281633, "auxiliary_loss_mlp": 0.01024725, "balance_loss_clip": 1.04824853, "balance_loss_mlp": 1.01746511, "epoch": 0.48097156255636386, "flos": 20558643949440.0, "grad_norm": 1.8048584761020843, "language_loss": 0.73304325, "learning_rate": 2.220002858758162e-06, "loss": 0.75610685, "num_input_tokens_seen": 86085460, "step": 4000, "time_per_iteration": 2.692286968231201 }, { "auxiliary_loss_clip": 0.01133084, "auxiliary_loss_mlp": 0.00999427, "balance_loss_clip": 1.0148586, "balance_loss_mlp": 0.99842566, "epoch": 0.481091805447003, "flos": 70511608817280.0, "grad_norm": 0.8806941133959448, "language_loss": 0.60813951, "learning_rate": 2.2192285976880573e-06, "loss": 0.62946463, "num_input_tokens_seen": 86149715, "step": 4001, "time_per_iteration": 3.2378737926483154 }, { "auxiliary_loss_clip": 0.01334106, "auxiliary_loss_mlp": 0.02561533, "balance_loss_clip": 1.04567647, "balance_loss_mlp": 1.00034976, "epoch": 0.48121204833764203, "flos": 36428214839040.0, "grad_norm": 1.668726532589476, "language_loss": 0.81013978, "learning_rate": 2.2184543033611485e-06, "loss": 0.84909618, "num_input_tokens_seen": 86170795, "step": 4002, "time_per_iteration": 2.8525373935699463 }, { "auxiliary_loss_clip": 0.01236075, "auxiliary_loss_mlp": 0.01025706, "balance_loss_clip": 1.049793, "balance_loss_mlp": 1.01844954, "epoch": 0.48133229122828114, "flos": 27490264871040.0, "grad_norm": 2.0979533724727446, "language_loss": 0.81868732, "learning_rate": 2.2176799758948957e-06, "loss": 0.84130514, "num_input_tokens_seen": 86190955, "step": 4003, "time_per_iteration": 2.750671625137329 }, { "auxiliary_loss_clip": 0.0128175, "auxiliary_loss_mlp": 0.01027302, "balance_loss_clip": 1.04811752, "balance_loss_mlp": 1.01981533, "epoch": 0.4814525341189202, "flos": 43072802179200.0, "grad_norm": 2.9896715897573762, "language_loss": 0.73624015, "learning_rate": 2.2169056154067635e-06, "loss": 0.75933069, "num_input_tokens_seen": 86214875, "step": 4004, "time_per_iteration": 2.9134700298309326 }, { "auxiliary_loss_clip": 0.01234935, "auxiliary_loss_mlp": 0.02566943, "balance_loss_clip": 1.05186069, "balance_loss_mlp": 1.00026333, "epoch": 0.4815727770095593, "flos": 24236901400320.0, "grad_norm": 2.289772599992823, "language_loss": 0.82553023, "learning_rate": 2.216131222014222e-06, "loss": 0.86354899, "num_input_tokens_seen": 86232950, "step": 4005, "time_per_iteration": 2.7134668827056885 }, { "auxiliary_loss_clip": 0.0133278, "auxiliary_loss_mlp": 0.01029657, "balance_loss_clip": 1.05023789, "balance_loss_mlp": 1.02145505, "epoch": 0.4816930199001984, "flos": 18113630100480.0, "grad_norm": 3.1721657204080995, "language_loss": 0.80695355, "learning_rate": 2.2153567958347455e-06, "loss": 0.83057791, "num_input_tokens_seen": 86249160, "step": 4006, "time_per_iteration": 2.7942960262298584 }, { "auxiliary_loss_clip": 0.0128484, "auxiliary_loss_mlp": 0.01031954, "balance_loss_clip": 1.05151486, "balance_loss_mlp": 1.02489972, "epoch": 0.48181326279083747, "flos": 17274720983040.0, "grad_norm": 12.734515583041603, "language_loss": 0.79268873, "learning_rate": 2.214582336985815e-06, "loss": 0.81585664, "num_input_tokens_seen": 86267060, "step": 4007, "time_per_iteration": 2.6797971725463867 }, { "auxiliary_loss_clip": 0.01280774, "auxiliary_loss_mlp": 0.01028701, "balance_loss_clip": 1.04815674, "balance_loss_mlp": 1.02074397, "epoch": 0.4819335056814766, "flos": 14903252231040.0, "grad_norm": 2.5296339281498943, "language_loss": 0.66682529, "learning_rate": 2.2138078455849142e-06, "loss": 0.68992001, "num_input_tokens_seen": 86285055, "step": 4008, "time_per_iteration": 2.626786947250366 }, { "auxiliary_loss_clip": 0.01140781, "auxiliary_loss_mlp": 0.01029705, "balance_loss_clip": 1.04978538, "balance_loss_mlp": 1.02199197, "epoch": 0.4820537485721157, "flos": 19244888012160.0, "grad_norm": 2.204931484147545, "language_loss": 0.78783667, "learning_rate": 2.2130333217495334e-06, "loss": 0.80954146, "num_input_tokens_seen": 86304225, "step": 4009, "time_per_iteration": 2.7325327396392822 }, { "auxiliary_loss_clip": 0.01280951, "auxiliary_loss_mlp": 0.01028394, "balance_loss_clip": 1.04807091, "balance_loss_mlp": 1.02033544, "epoch": 0.48217399146275475, "flos": 16033791870720.0, "grad_norm": 2.4978340762360394, "language_loss": 0.67973709, "learning_rate": 2.2122587655971665e-06, "loss": 0.70283055, "num_input_tokens_seen": 86319170, "step": 4010, "time_per_iteration": 2.6405539512634277 }, { "auxiliary_loss_clip": 0.01287087, "auxiliary_loss_mlp": 0.01029657, "balance_loss_clip": 1.04897749, "balance_loss_mlp": 1.02197409, "epoch": 0.48229423435339386, "flos": 24134197438080.0, "grad_norm": 1.7625858856840173, "language_loss": 0.64221549, "learning_rate": 2.211484177245314e-06, "loss": 0.66538286, "num_input_tokens_seen": 86338760, "step": 4011, "time_per_iteration": 3.7310776710510254 }, { "auxiliary_loss_clip": 0.01190579, "auxiliary_loss_mlp": 0.01025552, "balance_loss_clip": 1.05561256, "balance_loss_mlp": 1.01775277, "epoch": 0.48241447724403297, "flos": 23805435231360.0, "grad_norm": 2.546835394600579, "language_loss": 0.72585553, "learning_rate": 2.21070955681148e-06, "loss": 0.74801683, "num_input_tokens_seen": 86357865, "step": 4012, "time_per_iteration": 2.6638896465301514 }, { "auxiliary_loss_clip": 0.0132702, "auxiliary_loss_mlp": 0.01030468, "balance_loss_clip": 1.04797268, "balance_loss_mlp": 1.02283275, "epoch": 0.482534720134672, "flos": 23110312256640.0, "grad_norm": 1.808249246711387, "language_loss": 0.78404933, "learning_rate": 2.209934904413174e-06, "loss": 0.80762422, "num_input_tokens_seen": 86379470, "step": 4013, "time_per_iteration": 3.5591137409210205 }, { "auxiliary_loss_clip": 0.01432751, "auxiliary_loss_mlp": 0.01032718, "balance_loss_clip": 1.03826082, "balance_loss_mlp": 1.02521992, "epoch": 0.48265496302531113, "flos": 20923819568640.0, "grad_norm": 2.35181230164141, "language_loss": 0.71708703, "learning_rate": 2.2091602201679095e-06, "loss": 0.74174178, "num_input_tokens_seen": 86399080, "step": 4014, "time_per_iteration": 2.859541893005371 }, { "auxiliary_loss_clip": 0.0134077, "auxiliary_loss_mlp": 0.01034544, "balance_loss_clip": 1.04958916, "balance_loss_mlp": 1.02718258, "epoch": 0.48277520591595025, "flos": 15231152511360.0, "grad_norm": 2.0658825405787575, "language_loss": 0.83400148, "learning_rate": 2.208385504193206e-06, "loss": 0.85775459, "num_input_tokens_seen": 86416580, "step": 4015, "time_per_iteration": 2.6651864051818848 }, { "auxiliary_loss_clip": 0.01184312, "auxiliary_loss_mlp": 0.0102836, "balance_loss_clip": 1.05036795, "balance_loss_mlp": 1.02045095, "epoch": 0.4828954488065893, "flos": 17858664385920.0, "grad_norm": 2.0870787882864157, "language_loss": 0.81161821, "learning_rate": 2.2076107566065873e-06, "loss": 0.833745, "num_input_tokens_seen": 86434365, "step": 4016, "time_per_iteration": 2.6630756855010986 }, { "auxiliary_loss_clip": 0.01143179, "auxiliary_loss_mlp": 0.01026558, "balance_loss_clip": 1.05296922, "balance_loss_mlp": 1.01896417, "epoch": 0.4830156916972284, "flos": 32087405070720.0, "grad_norm": 2.254671188865721, "language_loss": 0.75408506, "learning_rate": 2.2068359775255816e-06, "loss": 0.77578247, "num_input_tokens_seen": 86452675, "step": 4017, "time_per_iteration": 3.613736391067505 }, { "auxiliary_loss_clip": 0.01378043, "auxiliary_loss_mlp": 0.01028318, "balance_loss_clip": 1.04404855, "balance_loss_mlp": 1.02101636, "epoch": 0.48313593458786747, "flos": 21871717528320.0, "grad_norm": 3.017668213162362, "language_loss": 0.7915355, "learning_rate": 2.206061167067723e-06, "loss": 0.81559914, "num_input_tokens_seen": 86470785, "step": 4018, "time_per_iteration": 2.759812116622925 }, { "auxiliary_loss_clip": 0.01333589, "auxiliary_loss_mlp": 0.01027893, "balance_loss_clip": 1.04325008, "balance_loss_mlp": 1.01995361, "epoch": 0.4832561774785066, "flos": 22601206840320.0, "grad_norm": 2.794478330075195, "language_loss": 0.79228622, "learning_rate": 2.205286325350549e-06, "loss": 0.81590104, "num_input_tokens_seen": 86489850, "step": 4019, "time_per_iteration": 2.7434096336364746 }, { "auxiliary_loss_clip": 0.01384046, "auxiliary_loss_mlp": 0.01028023, "balance_loss_clip": 1.04696274, "balance_loss_mlp": 1.02078676, "epoch": 0.4833764203691457, "flos": 13437342282240.0, "grad_norm": 2.179825280001189, "language_loss": 0.72474223, "learning_rate": 2.204511452491603e-06, "loss": 0.74886298, "num_input_tokens_seen": 86506475, "step": 4020, "time_per_iteration": 3.6907846927642822 }, { "auxiliary_loss_clip": 0.01182805, "auxiliary_loss_mlp": 0.01024547, "balance_loss_clip": 1.05269265, "balance_loss_mlp": 1.01737642, "epoch": 0.48349666325978474, "flos": 44128036955520.0, "grad_norm": 1.9534178265892506, "language_loss": 0.7496835, "learning_rate": 2.2037365486084316e-06, "loss": 0.77175701, "num_input_tokens_seen": 86529715, "step": 4021, "time_per_iteration": 2.786121368408203 }, { "auxiliary_loss_clip": 0.01242848, "auxiliary_loss_mlp": 0.01023392, "balance_loss_clip": 1.04332495, "balance_loss_mlp": 1.01570344, "epoch": 0.48361690615042385, "flos": 26028377245440.0, "grad_norm": 2.316750200538825, "language_loss": 0.78068733, "learning_rate": 2.2029616138185886e-06, "loss": 0.80334967, "num_input_tokens_seen": 86548715, "step": 4022, "time_per_iteration": 2.7821168899536133 }, { "auxiliary_loss_clip": 0.01328097, "auxiliary_loss_mlp": 0.01026439, "balance_loss_clip": 1.0503819, "balance_loss_mlp": 1.01867247, "epoch": 0.48373714904106296, "flos": 22273306560000.0, "grad_norm": 2.314021514105893, "language_loss": 0.82735658, "learning_rate": 2.202186648239629e-06, "loss": 0.85090196, "num_input_tokens_seen": 86568650, "step": 4023, "time_per_iteration": 2.693026065826416 }, { "auxiliary_loss_clip": 0.01229845, "auxiliary_loss_mlp": 0.0102313, "balance_loss_clip": 1.04870367, "balance_loss_mlp": 1.01590335, "epoch": 0.483857391931702, "flos": 28292293699200.0, "grad_norm": 3.9824183091694962, "language_loss": 0.71751487, "learning_rate": 2.201411651989117e-06, "loss": 0.74004459, "num_input_tokens_seen": 86590630, "step": 4024, "time_per_iteration": 2.6909263134002686 }, { "auxiliary_loss_clip": 0.01278185, "auxiliary_loss_mlp": 0.02563724, "balance_loss_clip": 1.0497973, "balance_loss_mlp": 1.00021887, "epoch": 0.48397763482234113, "flos": 27418048577280.0, "grad_norm": 1.847446337958562, "language_loss": 0.78327632, "learning_rate": 2.2006366251846167e-06, "loss": 0.82169539, "num_input_tokens_seen": 86611270, "step": 4025, "time_per_iteration": 2.748288869857788 }, { "auxiliary_loss_clip": 0.01285236, "auxiliary_loss_mlp": 0.01030569, "balance_loss_clip": 1.05182195, "balance_loss_mlp": 1.02333307, "epoch": 0.48409787771298024, "flos": 16797252470400.0, "grad_norm": 2.1116509048645384, "language_loss": 0.75521237, "learning_rate": 2.1998615679436997e-06, "loss": 0.77837038, "num_input_tokens_seen": 86628810, "step": 4026, "time_per_iteration": 2.6406500339508057 }, { "auxiliary_loss_clip": 0.01289314, "auxiliary_loss_mlp": 0.01029219, "balance_loss_clip": 1.04746127, "balance_loss_mlp": 1.02141047, "epoch": 0.4842181206036193, "flos": 25083496028160.0, "grad_norm": 2.093244695413779, "language_loss": 0.77205276, "learning_rate": 2.199086480383942e-06, "loss": 0.79523814, "num_input_tokens_seen": 86648185, "step": 4027, "time_per_iteration": 2.6515297889709473 }, { "auxiliary_loss_clip": 0.01299997, "auxiliary_loss_mlp": 0.01025923, "balance_loss_clip": 1.05233049, "balance_loss_mlp": 1.01729178, "epoch": 0.4843383634942584, "flos": 30372311496960.0, "grad_norm": 8.199483773780651, "language_loss": 0.68539035, "learning_rate": 2.1983113626229234e-06, "loss": 0.70864952, "num_input_tokens_seen": 86667435, "step": 4028, "time_per_iteration": 2.785170555114746 }, { "auxiliary_loss_clip": 0.01331325, "auxiliary_loss_mlp": 0.02564501, "balance_loss_clip": 1.04338896, "balance_loss_mlp": 1.00017452, "epoch": 0.4844586063848975, "flos": 20413564917120.0, "grad_norm": 1.9139298197525163, "language_loss": 0.7877171, "learning_rate": 2.1975362147782293e-06, "loss": 0.82667542, "num_input_tokens_seen": 86686630, "step": 4029, "time_per_iteration": 2.7041947841644287 }, { "auxiliary_loss_clip": 0.0120626, "auxiliary_loss_mlp": 0.01011912, "balance_loss_clip": 1.03012466, "balance_loss_mlp": 1.01089275, "epoch": 0.48457884927553657, "flos": 70303722854400.0, "grad_norm": 0.6906493740568422, "language_loss": 0.54150295, "learning_rate": 2.196761036967448e-06, "loss": 0.5636847, "num_input_tokens_seen": 86754595, "step": 4030, "time_per_iteration": 3.3886163234710693 }, { "auxiliary_loss_clip": 0.01226314, "auxiliary_loss_mlp": 0.01024526, "balance_loss_clip": 1.04674876, "balance_loss_mlp": 1.01746273, "epoch": 0.4846990921661757, "flos": 19934516206080.0, "grad_norm": 2.151617822072996, "language_loss": 0.77638751, "learning_rate": 2.1959858293081743e-06, "loss": 0.79889596, "num_input_tokens_seen": 86773730, "step": 4031, "time_per_iteration": 2.6911773681640625 }, { "auxiliary_loss_clip": 0.01329995, "auxiliary_loss_mlp": 0.01025285, "balance_loss_clip": 1.04743338, "balance_loss_mlp": 1.01800132, "epoch": 0.4848193350568148, "flos": 23075945919360.0, "grad_norm": 2.038446735143694, "language_loss": 0.762555, "learning_rate": 2.1952105919180056e-06, "loss": 0.78610772, "num_input_tokens_seen": 86792985, "step": 4032, "time_per_iteration": 2.6715898513793945 }, { "auxiliary_loss_clip": 0.01280335, "auxiliary_loss_mlp": 0.01028043, "balance_loss_clip": 1.04965508, "balance_loss_mlp": 1.02075028, "epoch": 0.48493957794745385, "flos": 22455481363200.0, "grad_norm": 4.519225891287855, "language_loss": 0.68804157, "learning_rate": 2.1944353249145456e-06, "loss": 0.71112537, "num_input_tokens_seen": 86812095, "step": 4033, "time_per_iteration": 2.7250940799713135 }, { "auxiliary_loss_clip": 0.011865, "auxiliary_loss_mlp": 0.01024442, "balance_loss_clip": 1.05450082, "balance_loss_mlp": 1.01705956, "epoch": 0.48505982083809296, "flos": 25046112948480.0, "grad_norm": 1.857713479995933, "language_loss": 0.74371058, "learning_rate": 2.193660028415401e-06, "loss": 0.76581997, "num_input_tokens_seen": 86832875, "step": 4034, "time_per_iteration": 2.6284420490264893 }, { "auxiliary_loss_clip": 0.0127758, "auxiliary_loss_mlp": 0.01030051, "balance_loss_clip": 1.04645395, "balance_loss_mlp": 1.02214766, "epoch": 0.485180063728732, "flos": 26761386090240.0, "grad_norm": 1.7166658704649305, "language_loss": 0.81545728, "learning_rate": 2.1928847025381852e-06, "loss": 0.83853358, "num_input_tokens_seen": 86853480, "step": 4035, "time_per_iteration": 2.7271432876586914 }, { "auxiliary_loss_clip": 0.01232982, "auxiliary_loss_mlp": 0.01027563, "balance_loss_clip": 1.04640174, "balance_loss_mlp": 1.01926029, "epoch": 0.4853003066193711, "flos": 24059143969920.0, "grad_norm": 1.6816150033570587, "language_loss": 0.83917463, "learning_rate": 2.192109347400512e-06, "loss": 0.86177999, "num_input_tokens_seen": 86873695, "step": 4036, "time_per_iteration": 2.70835542678833 }, { "auxiliary_loss_clip": 0.01285842, "auxiliary_loss_mlp": 0.01029779, "balance_loss_clip": 1.04885614, "balance_loss_mlp": 1.02204776, "epoch": 0.48542054951001024, "flos": 23076376882560.0, "grad_norm": 1.8787355625076705, "language_loss": 0.79001677, "learning_rate": 2.191333963120004e-06, "loss": 0.81317306, "num_input_tokens_seen": 86892675, "step": 4037, "time_per_iteration": 3.6955370903015137 }, { "auxiliary_loss_clip": 0.01280155, "auxiliary_loss_mlp": 0.01020454, "balance_loss_clip": 1.04826057, "balance_loss_mlp": 1.01301503, "epoch": 0.4855407924006493, "flos": 25664889565440.0, "grad_norm": 2.577322850743525, "language_loss": 0.70110315, "learning_rate": 2.190558549814286e-06, "loss": 0.72410923, "num_input_tokens_seen": 86912835, "step": 4038, "time_per_iteration": 2.740154981613159 }, { "auxiliary_loss_clip": 0.01284973, "auxiliary_loss_mlp": 0.01029972, "balance_loss_clip": 1.04680979, "balance_loss_mlp": 1.02272403, "epoch": 0.4856610352912884, "flos": 23987933256960.0, "grad_norm": 1.8724534510300423, "language_loss": 0.79464209, "learning_rate": 2.1897831076009872e-06, "loss": 0.81779152, "num_input_tokens_seen": 86932475, "step": 4039, "time_per_iteration": 3.563478708267212 }, { "auxiliary_loss_clip": 0.01233513, "auxiliary_loss_mlp": 0.01029334, "balance_loss_clip": 1.04965317, "balance_loss_mlp": 1.02213359, "epoch": 0.4857812781819275, "flos": 24096814358400.0, "grad_norm": 5.536514802884875, "language_loss": 0.79977751, "learning_rate": 2.1890076365977426e-06, "loss": 0.82240593, "num_input_tokens_seen": 86952300, "step": 4040, "time_per_iteration": 2.6622445583343506 }, { "auxiliary_loss_clip": 0.01180111, "auxiliary_loss_mlp": 0.01002073, "balance_loss_clip": 1.01513648, "balance_loss_mlp": 1.00098825, "epoch": 0.48590152107256657, "flos": 56266635185280.0, "grad_norm": 0.8513363102495279, "language_loss": 0.52733511, "learning_rate": 2.188232136922189e-06, "loss": 0.54915696, "num_input_tokens_seen": 87010420, "step": 4041, "time_per_iteration": 3.1942598819732666 }, { "auxiliary_loss_clip": 0.01330134, "auxiliary_loss_mlp": 0.0102513, "balance_loss_clip": 1.04313731, "balance_loss_mlp": 1.01740265, "epoch": 0.4860217639632057, "flos": 20046988667520.0, "grad_norm": 2.934324086980654, "language_loss": 0.76255238, "learning_rate": 2.187456608691971e-06, "loss": 0.78610504, "num_input_tokens_seen": 87029295, "step": 4042, "time_per_iteration": 2.764496088027954 }, { "auxiliary_loss_clip": 0.01340569, "auxiliary_loss_mlp": 0.01027429, "balance_loss_clip": 1.05077398, "balance_loss_mlp": 1.01974607, "epoch": 0.4861420068538448, "flos": 17822143232640.0, "grad_norm": 2.3989080757998797, "language_loss": 0.87708789, "learning_rate": 2.1866810520247334e-06, "loss": 0.90076786, "num_input_tokens_seen": 87048165, "step": 4043, "time_per_iteration": 3.6264841556549072 }, { "auxiliary_loss_clip": 0.01237753, "auxiliary_loss_mlp": 0.01027381, "balance_loss_clip": 1.04941964, "balance_loss_mlp": 1.01903009, "epoch": 0.48626224974448384, "flos": 26250125857920.0, "grad_norm": 2.2298147493003944, "language_loss": 0.65172511, "learning_rate": 2.185905467038129e-06, "loss": 0.67437643, "num_input_tokens_seen": 87067070, "step": 4044, "time_per_iteration": 2.6729867458343506 }, { "auxiliary_loss_clip": 0.01185837, "auxiliary_loss_mlp": 0.01032922, "balance_loss_clip": 1.05520606, "balance_loss_mlp": 1.02547777, "epoch": 0.48638249263512295, "flos": 22054502862720.0, "grad_norm": 1.7556145550608777, "language_loss": 0.77540231, "learning_rate": 2.1851298538498127e-06, "loss": 0.7975899, "num_input_tokens_seen": 87086785, "step": 4045, "time_per_iteration": 3.5811498165130615 }, { "auxiliary_loss_clip": 0.01244861, "auxiliary_loss_mlp": 0.02571706, "balance_loss_clip": 1.05551708, "balance_loss_mlp": 1.00021744, "epoch": 0.48650273552576206, "flos": 25119945354240.0, "grad_norm": 2.203234559846567, "language_loss": 0.79781342, "learning_rate": 2.184354212577446e-06, "loss": 0.8359791, "num_input_tokens_seen": 87107090, "step": 4046, "time_per_iteration": 2.722137928009033 }, { "auxiliary_loss_clip": 0.01188955, "auxiliary_loss_mlp": 0.01034831, "balance_loss_clip": 1.05210924, "balance_loss_mlp": 1.02721334, "epoch": 0.4866229784164011, "flos": 17456931699840.0, "grad_norm": 2.9383662867719096, "language_loss": 0.6314677, "learning_rate": 2.1835785433386907e-06, "loss": 0.6537056, "num_input_tokens_seen": 87125905, "step": 4047, "time_per_iteration": 2.5418028831481934 }, { "auxiliary_loss_clip": 0.01330853, "auxiliary_loss_mlp": 0.01027253, "balance_loss_clip": 1.0499692, "balance_loss_mlp": 1.0195998, "epoch": 0.48674322130704023, "flos": 23331127115520.0, "grad_norm": 1.8948497124081514, "language_loss": 0.6558606, "learning_rate": 2.182802846251216e-06, "loss": 0.67944163, "num_input_tokens_seen": 87146175, "step": 4048, "time_per_iteration": 2.7988412380218506 }, { "auxiliary_loss_clip": 0.01334119, "auxiliary_loss_mlp": 0.01027105, "balance_loss_clip": 1.04529989, "balance_loss_mlp": 1.01988709, "epoch": 0.4868634641976793, "flos": 28804344030720.0, "grad_norm": 2.151002499061319, "language_loss": 0.72478008, "learning_rate": 2.182027121432696e-06, "loss": 0.74839234, "num_input_tokens_seen": 87166800, "step": 4049, "time_per_iteration": 2.7443809509277344 }, { "auxiliary_loss_clip": 0.01190696, "auxiliary_loss_mlp": 0.01032898, "balance_loss_clip": 1.05434, "balance_loss_mlp": 1.02521491, "epoch": 0.4869837070883184, "flos": 19025976574080.0, "grad_norm": 1.7493796448457717, "language_loss": 0.82173431, "learning_rate": 2.1812513690008054e-06, "loss": 0.84397018, "num_input_tokens_seen": 87185920, "step": 4050, "time_per_iteration": 2.6153476238250732 }, { "auxiliary_loss_clip": 0.01243943, "auxiliary_loss_mlp": 0.01029785, "balance_loss_clip": 1.05330718, "balance_loss_mlp": 1.02187598, "epoch": 0.4871039499789575, "flos": 15121409483520.0, "grad_norm": 2.1583693310231253, "language_loss": 0.80383146, "learning_rate": 2.180475589073227e-06, "loss": 0.82656872, "num_input_tokens_seen": 87203620, "step": 4051, "time_per_iteration": 2.6712169647216797 }, { "auxiliary_loss_clip": 0.01228235, "auxiliary_loss_mlp": 0.01023953, "balance_loss_clip": 1.04724002, "balance_loss_mlp": 1.0165832, "epoch": 0.48722419286959656, "flos": 26174066808960.0, "grad_norm": 1.8291550169859159, "language_loss": 0.73254263, "learning_rate": 2.1796997817676456e-06, "loss": 0.75506455, "num_input_tokens_seen": 87224630, "step": 4052, "time_per_iteration": 2.6843678951263428 }, { "auxiliary_loss_clip": 0.01239788, "auxiliary_loss_mlp": 0.02561772, "balance_loss_clip": 1.0521332, "balance_loss_mlp": 1.00015545, "epoch": 0.4873444357602357, "flos": 24026142349440.0, "grad_norm": 2.143198221918596, "language_loss": 0.67939043, "learning_rate": 2.1789239472017494e-06, "loss": 0.71740603, "num_input_tokens_seen": 87246280, "step": 4053, "time_per_iteration": 2.7078354358673096 }, { "auxiliary_loss_clip": 0.01332122, "auxiliary_loss_mlp": 0.01026527, "balance_loss_clip": 1.04630566, "balance_loss_mlp": 1.01886177, "epoch": 0.4874646786508748, "flos": 22820441500800.0, "grad_norm": 2.27829822490563, "language_loss": 0.73057711, "learning_rate": 2.1781480854932326e-06, "loss": 0.75416362, "num_input_tokens_seen": 87266045, "step": 4054, "time_per_iteration": 2.7918620109558105 }, { "auxiliary_loss_clip": 0.01381631, "auxiliary_loss_mlp": 0.01028623, "balance_loss_clip": 1.05006933, "balance_loss_mlp": 1.02126181, "epoch": 0.48758492154151384, "flos": 21287594557440.0, "grad_norm": 2.1570043409678323, "language_loss": 0.78997982, "learning_rate": 2.1773721967597933e-06, "loss": 0.81408232, "num_input_tokens_seen": 87284495, "step": 4055, "time_per_iteration": 2.7391819953918457 }, { "auxiliary_loss_clip": 0.01171331, "auxiliary_loss_mlp": 0.0099951, "balance_loss_clip": 1.01212263, "balance_loss_mlp": 0.99848461, "epoch": 0.48770516443215295, "flos": 62244109180800.0, "grad_norm": 0.853015285159282, "language_loss": 0.57344317, "learning_rate": 2.1765962811191322e-06, "loss": 0.5951516, "num_input_tokens_seen": 87338960, "step": 4056, "time_per_iteration": 3.217440366744995 }, { "auxiliary_loss_clip": 0.0128205, "auxiliary_loss_mlp": 0.01002132, "balance_loss_clip": 1.01282871, "balance_loss_mlp": 1.00108325, "epoch": 0.48782540732279206, "flos": 66133451882880.0, "grad_norm": 0.8232219642285158, "language_loss": 0.61960328, "learning_rate": 2.1758203386889566e-06, "loss": 0.64244509, "num_input_tokens_seen": 87401730, "step": 4057, "time_per_iteration": 3.3421006202697754 }, { "auxiliary_loss_clip": 0.01337817, "auxiliary_loss_mlp": 0.02568989, "balance_loss_clip": 1.04965067, "balance_loss_mlp": 1.00015569, "epoch": 0.4879456502134311, "flos": 14607922608000.0, "grad_norm": 2.2858043154276695, "language_loss": 0.84692621, "learning_rate": 2.1750443695869746e-06, "loss": 0.88599432, "num_input_tokens_seen": 87417300, "step": 4058, "time_per_iteration": 2.7047276496887207 }, { "auxiliary_loss_clip": 0.01237779, "auxiliary_loss_mlp": 0.01031002, "balance_loss_clip": 1.05080652, "balance_loss_mlp": 1.02290797, "epoch": 0.4880658931040702, "flos": 19500464257920.0, "grad_norm": 1.9678700378595788, "language_loss": 0.85934412, "learning_rate": 2.174268373930901e-06, "loss": 0.88203198, "num_input_tokens_seen": 87434815, "step": 4059, "time_per_iteration": 2.699521780014038 }, { "auxiliary_loss_clip": 0.01328945, "auxiliary_loss_mlp": 0.02566388, "balance_loss_clip": 1.05094659, "balance_loss_mlp": 1.00017095, "epoch": 0.48818613599470934, "flos": 16723060928640.0, "grad_norm": 2.1011662137601284, "language_loss": 0.80159765, "learning_rate": 2.1734923518384537e-06, "loss": 0.84055102, "num_input_tokens_seen": 87451420, "step": 4060, "time_per_iteration": 2.6909468173980713 }, { "auxiliary_loss_clip": 0.0132489, "auxiliary_loss_mlp": 0.01029363, "balance_loss_clip": 1.05001557, "balance_loss_mlp": 1.02237201, "epoch": 0.4883063788853484, "flos": 26756932803840.0, "grad_norm": 2.0312641605487167, "language_loss": 0.82507312, "learning_rate": 2.1727163034273547e-06, "loss": 0.84861565, "num_input_tokens_seen": 87469585, "step": 4061, "time_per_iteration": 2.792018175125122 }, { "auxiliary_loss_clip": 0.01238178, "auxiliary_loss_mlp": 0.01027197, "balance_loss_clip": 1.05030322, "balance_loss_mlp": 1.01980889, "epoch": 0.4884266217759875, "flos": 16763388923520.0, "grad_norm": 2.1124358990082253, "language_loss": 0.78641951, "learning_rate": 2.17194022881533e-06, "loss": 0.80907321, "num_input_tokens_seen": 87485675, "step": 4062, "time_per_iteration": 2.7032992839813232 }, { "auxiliary_loss_clip": 0.01287614, "auxiliary_loss_mlp": 0.01031086, "balance_loss_clip": 1.05010986, "balance_loss_mlp": 1.02343893, "epoch": 0.4885468646666266, "flos": 24207132003840.0, "grad_norm": 1.9797445643938614, "language_loss": 0.67703128, "learning_rate": 2.1711641281201092e-06, "loss": 0.70021832, "num_input_tokens_seen": 87505605, "step": 4063, "time_per_iteration": 3.846893548965454 }, { "auxiliary_loss_clip": 0.01235949, "auxiliary_loss_mlp": 0.01024862, "balance_loss_clip": 1.05315888, "balance_loss_mlp": 1.01765037, "epoch": 0.48866710755726567, "flos": 14610795696000.0, "grad_norm": 2.041597914281744, "language_loss": 0.79196149, "learning_rate": 2.1703880014594264e-06, "loss": 0.81456959, "num_input_tokens_seen": 87523195, "step": 4064, "time_per_iteration": 2.7315523624420166 }, { "auxiliary_loss_clip": 0.01385519, "auxiliary_loss_mlp": 0.0102382, "balance_loss_clip": 1.0530839, "balance_loss_mlp": 1.01667666, "epoch": 0.4887873504479048, "flos": 28804451771520.0, "grad_norm": 2.199166503907879, "language_loss": 0.73894882, "learning_rate": 2.1696118489510182e-06, "loss": 0.76304221, "num_input_tokens_seen": 87544125, "step": 4065, "time_per_iteration": 3.713191270828247 }, { "auxiliary_loss_clip": 0.01339338, "auxiliary_loss_mlp": 0.025665, "balance_loss_clip": 1.04899788, "balance_loss_mlp": 1.00017607, "epoch": 0.48890759333854383, "flos": 22784387224320.0, "grad_norm": 2.569873835685232, "language_loss": 0.72806352, "learning_rate": 2.1688356707126286e-06, "loss": 0.76712191, "num_input_tokens_seen": 87563745, "step": 4066, "time_per_iteration": 2.7563745975494385 }, { "auxiliary_loss_clip": 0.01329647, "auxiliary_loss_mlp": 0.01032476, "balance_loss_clip": 1.04827261, "balance_loss_mlp": 1.02399468, "epoch": 0.48902783622918294, "flos": 17786088956160.0, "grad_norm": 2.2853673090312157, "language_loss": 0.70046687, "learning_rate": 2.168059466862001e-06, "loss": 0.72408807, "num_input_tokens_seen": 87581895, "step": 4067, "time_per_iteration": 2.708004951477051 }, { "auxiliary_loss_clip": 0.01282561, "auxiliary_loss_mlp": 0.01026205, "balance_loss_clip": 1.04610324, "balance_loss_mlp": 1.0193119, "epoch": 0.48914807911982205, "flos": 22310294590080.0, "grad_norm": 1.8764557716521835, "language_loss": 0.81804717, "learning_rate": 2.167283237516887e-06, "loss": 0.84113479, "num_input_tokens_seen": 87600170, "step": 4068, "time_per_iteration": 2.691319465637207 }, { "auxiliary_loss_clip": 0.01287546, "auxiliary_loss_mlp": 0.01026409, "balance_loss_clip": 1.04881167, "balance_loss_mlp": 1.01873171, "epoch": 0.4892683220104611, "flos": 16363020954240.0, "grad_norm": 1.924332755272138, "language_loss": 0.747159, "learning_rate": 2.1665069827950383e-06, "loss": 0.77029854, "num_input_tokens_seen": 87617455, "step": 4069, "time_per_iteration": 3.533473014831543 }, { "auxiliary_loss_clip": 0.01282348, "auxiliary_loss_mlp": 0.01021654, "balance_loss_clip": 1.04929638, "balance_loss_mlp": 1.01459718, "epoch": 0.4893885649011002, "flos": 15739144606080.0, "grad_norm": 2.1778333826309026, "language_loss": 0.86840725, "learning_rate": 2.1657307028142126e-06, "loss": 0.89144731, "num_input_tokens_seen": 87634995, "step": 4070, "time_per_iteration": 2.6145646572113037 }, { "auxiliary_loss_clip": 0.01284168, "auxiliary_loss_mlp": 0.01027815, "balance_loss_clip": 1.04968762, "balance_loss_mlp": 1.01937461, "epoch": 0.48950880779173933, "flos": 28581984887040.0, "grad_norm": 2.588690343760685, "language_loss": 0.67528522, "learning_rate": 2.164954397692171e-06, "loss": 0.69840503, "num_input_tokens_seen": 87654420, "step": 4071, "time_per_iteration": 3.6981096267700195 }, { "auxiliary_loss_clip": 0.01185293, "auxiliary_loss_mlp": 0.00999859, "balance_loss_clip": 1.01450849, "balance_loss_mlp": 0.9989174, "epoch": 0.4896290506823784, "flos": 66186310746240.0, "grad_norm": 1.08742452803563, "language_loss": 0.77302003, "learning_rate": 2.164178067546678e-06, "loss": 0.79487157, "num_input_tokens_seen": 87713585, "step": 4072, "time_per_iteration": 3.363568067550659 }, { "auxiliary_loss_clip": 0.0129047, "auxiliary_loss_mlp": 0.01026379, "balance_loss_clip": 1.04785371, "balance_loss_mlp": 1.01892281, "epoch": 0.4897492935730175, "flos": 12531065207040.0, "grad_norm": 2.625214398679279, "language_loss": 0.91025341, "learning_rate": 2.163401712495504e-06, "loss": 0.93342197, "num_input_tokens_seen": 87731280, "step": 4073, "time_per_iteration": 2.695124626159668 }, { "auxiliary_loss_clip": 0.0129785, "auxiliary_loss_mlp": 0.01034959, "balance_loss_clip": 1.05186009, "balance_loss_mlp": 1.02712131, "epoch": 0.4898695364636566, "flos": 23476816679040.0, "grad_norm": 1.6759869821067048, "language_loss": 0.79355657, "learning_rate": 2.1626253326564194e-06, "loss": 0.8168847, "num_input_tokens_seen": 87750230, "step": 4074, "time_per_iteration": 2.844325304031372 }, { "auxiliary_loss_clip": 0.01287172, "auxiliary_loss_mlp": 0.01029783, "balance_loss_clip": 1.05012655, "balance_loss_mlp": 1.02142692, "epoch": 0.48998977935429566, "flos": 27160209774720.0, "grad_norm": 1.8195501910112515, "language_loss": 0.76881802, "learning_rate": 2.161848928147201e-06, "loss": 0.7919876, "num_input_tokens_seen": 87770500, "step": 4075, "time_per_iteration": 2.8167190551757812 }, { "auxiliary_loss_clip": 0.01235346, "auxiliary_loss_mlp": 0.01031504, "balance_loss_clip": 1.05185914, "balance_loss_mlp": 1.02355301, "epoch": 0.4901100222449348, "flos": 20339588856960.0, "grad_norm": 2.830602692894066, "language_loss": 0.81023985, "learning_rate": 2.161072499085629e-06, "loss": 0.83290833, "num_input_tokens_seen": 87789495, "step": 4076, "time_per_iteration": 2.697890520095825 }, { "auxiliary_loss_clip": 0.0133568, "auxiliary_loss_mlp": 0.0103276, "balance_loss_clip": 1.04733777, "balance_loss_mlp": 1.02527928, "epoch": 0.4902302651355739, "flos": 30446359384320.0, "grad_norm": 1.719481161503393, "language_loss": 0.83239222, "learning_rate": 2.160296045589487e-06, "loss": 0.8560766, "num_input_tokens_seen": 87812955, "step": 4077, "time_per_iteration": 2.8560867309570312 }, { "auxiliary_loss_clip": 0.01236304, "auxiliary_loss_mlp": 0.01029314, "balance_loss_clip": 1.05264294, "balance_loss_mlp": 1.02179778, "epoch": 0.49035050802621294, "flos": 19174180089600.0, "grad_norm": 6.099918873185634, "language_loss": 0.70260644, "learning_rate": 2.159519567776562e-06, "loss": 0.72526264, "num_input_tokens_seen": 87832605, "step": 4078, "time_per_iteration": 2.7209858894348145 }, { "auxiliary_loss_clip": 0.01294442, "auxiliary_loss_mlp": 0.01026982, "balance_loss_clip": 1.04439187, "balance_loss_mlp": 1.01912034, "epoch": 0.49047075091685205, "flos": 22228489365120.0, "grad_norm": 3.0994053941026714, "language_loss": 0.70817995, "learning_rate": 2.1587430657646463e-06, "loss": 0.73139417, "num_input_tokens_seen": 87846040, "step": 4079, "time_per_iteration": 2.8434557914733887 }, { "auxiliary_loss_clip": 0.01282201, "auxiliary_loss_mlp": 0.01032007, "balance_loss_clip": 1.05185914, "balance_loss_mlp": 1.02458668, "epoch": 0.4905909938074911, "flos": 20156516213760.0, "grad_norm": 2.274945405022825, "language_loss": 0.78118086, "learning_rate": 2.157966539671533e-06, "loss": 0.80432296, "num_input_tokens_seen": 87865680, "step": 4080, "time_per_iteration": 2.7114460468292236 }, { "auxiliary_loss_clip": 0.01334718, "auxiliary_loss_mlp": 0.01027401, "balance_loss_clip": 1.04729223, "balance_loss_mlp": 1.02030802, "epoch": 0.4907112366981302, "flos": 17202217380480.0, "grad_norm": 1.916759785111905, "language_loss": 0.67214251, "learning_rate": 2.157189989615021e-06, "loss": 0.69576371, "num_input_tokens_seen": 87884270, "step": 4081, "time_per_iteration": 2.891043186187744 }, { "auxiliary_loss_clip": 0.01240931, "auxiliary_loss_mlp": 0.02572182, "balance_loss_clip": 1.05157208, "balance_loss_mlp": 1.00020254, "epoch": 0.4908314795887693, "flos": 21688968107520.0, "grad_norm": 3.6581874914413195, "language_loss": 0.74826384, "learning_rate": 2.156413415712913e-06, "loss": 0.78639501, "num_input_tokens_seen": 87906320, "step": 4082, "time_per_iteration": 2.761448621749878 }, { "auxiliary_loss_clip": 0.0129025, "auxiliary_loss_mlp": 0.02569688, "balance_loss_clip": 1.05090094, "balance_loss_mlp": 1.0002265, "epoch": 0.4909517224794084, "flos": 26213676531840.0, "grad_norm": 1.8617298862042326, "language_loss": 0.78987825, "learning_rate": 2.155636818083014e-06, "loss": 0.82847768, "num_input_tokens_seen": 87927690, "step": 4083, "time_per_iteration": 2.7744667530059814 }, { "auxiliary_loss_clip": 0.01278553, "auxiliary_loss_mlp": 0.01023399, "balance_loss_clip": 1.05027938, "balance_loss_mlp": 1.01618052, "epoch": 0.4910719653700475, "flos": 23148377694720.0, "grad_norm": 2.164436453279822, "language_loss": 0.83973384, "learning_rate": 2.154860196843134e-06, "loss": 0.86275333, "num_input_tokens_seen": 87946885, "step": 4084, "time_per_iteration": 2.7947726249694824 }, { "auxiliary_loss_clip": 0.01187373, "auxiliary_loss_mlp": 0.01030278, "balance_loss_clip": 1.05212045, "balance_loss_mlp": 1.02292013, "epoch": 0.4911922082606866, "flos": 23331845387520.0, "grad_norm": 1.9358775967418005, "language_loss": 0.76707506, "learning_rate": 2.154083552111085e-06, "loss": 0.78925151, "num_input_tokens_seen": 87966055, "step": 4085, "time_per_iteration": 2.7005770206451416 }, { "auxiliary_loss_clip": 0.01189607, "auxiliary_loss_mlp": 0.0102843, "balance_loss_clip": 1.05369461, "balance_loss_mlp": 1.02029991, "epoch": 0.49131245115132566, "flos": 29203239542400.0, "grad_norm": 2.221277937280941, "language_loss": 0.81707418, "learning_rate": 2.1533068840046834e-06, "loss": 0.83925462, "num_input_tokens_seen": 87986320, "step": 4086, "time_per_iteration": 2.7311692237854004 }, { "auxiliary_loss_clip": 0.01281672, "auxiliary_loss_mlp": 0.02571027, "balance_loss_clip": 1.0500474, "balance_loss_mlp": 1.00021267, "epoch": 0.49143269404196477, "flos": 20147465986560.0, "grad_norm": 8.024639708311575, "language_loss": 0.6168806, "learning_rate": 2.152530192641749e-06, "loss": 0.65540755, "num_input_tokens_seen": 88001230, "step": 4087, "time_per_iteration": 2.6845226287841797 }, { "auxiliary_loss_clip": 0.0114111, "auxiliary_loss_mlp": 0.01024815, "balance_loss_clip": 1.05155313, "balance_loss_mlp": 1.01740313, "epoch": 0.4915529369326039, "flos": 24389809597440.0, "grad_norm": 1.6920368708968616, "language_loss": 0.72350204, "learning_rate": 2.1517534781401068e-06, "loss": 0.74516129, "num_input_tokens_seen": 88019110, "step": 4088, "time_per_iteration": 2.7221028804779053 }, { "auxiliary_loss_clip": 0.01234108, "auxiliary_loss_mlp": 0.01025442, "balance_loss_clip": 1.04994154, "balance_loss_mlp": 1.01781535, "epoch": 0.49167317982324293, "flos": 10524305197440.0, "grad_norm": 3.3991941057863, "language_loss": 0.69562048, "learning_rate": 2.150976740617581e-06, "loss": 0.71821594, "num_input_tokens_seen": 88035670, "step": 4089, "time_per_iteration": 3.6098384857177734 }, { "auxiliary_loss_clip": 0.01289601, "auxiliary_loss_mlp": 0.01028228, "balance_loss_clip": 1.05084586, "balance_loss_mlp": 1.02088785, "epoch": 0.49179342271388204, "flos": 25593427457280.0, "grad_norm": 1.8711344686404499, "language_loss": 0.71469527, "learning_rate": 2.150199980192006e-06, "loss": 0.73787355, "num_input_tokens_seen": 88054790, "step": 4090, "time_per_iteration": 2.732696771621704 }, { "auxiliary_loss_clip": 0.01279554, "auxiliary_loss_mlp": 0.01027998, "balance_loss_clip": 1.04744697, "balance_loss_mlp": 1.02075577, "epoch": 0.49191366560452116, "flos": 21102043875840.0, "grad_norm": 1.8921588816844086, "language_loss": 0.80599666, "learning_rate": 2.1494231969812114e-06, "loss": 0.82907212, "num_input_tokens_seen": 88073780, "step": 4091, "time_per_iteration": 3.616732358932495 }, { "auxiliary_loss_clip": 0.01342677, "auxiliary_loss_mlp": 0.01028141, "balance_loss_clip": 1.05230904, "balance_loss_mlp": 1.02077985, "epoch": 0.4920339084951602, "flos": 26067520091520.0, "grad_norm": 2.1815065462757124, "language_loss": 0.81269741, "learning_rate": 2.1486463911030372e-06, "loss": 0.83640558, "num_input_tokens_seen": 88094430, "step": 4092, "time_per_iteration": 2.7819273471832275 }, { "auxiliary_loss_clip": 0.01281257, "auxiliary_loss_mlp": 0.01026325, "balance_loss_clip": 1.04741979, "balance_loss_mlp": 1.01932454, "epoch": 0.4921541513857993, "flos": 25081269384960.0, "grad_norm": 1.8011909947546807, "language_loss": 0.74700707, "learning_rate": 2.147869562675324e-06, "loss": 0.77008289, "num_input_tokens_seen": 88113400, "step": 4093, "time_per_iteration": 2.737931489944458 }, { "auxiliary_loss_clip": 0.01235235, "auxiliary_loss_mlp": 0.01023218, "balance_loss_clip": 1.05195022, "balance_loss_mlp": 1.01524878, "epoch": 0.49227439427643843, "flos": 24389809597440.0, "grad_norm": 2.3884726910761764, "language_loss": 0.72565341, "learning_rate": 2.147092711815915e-06, "loss": 0.74823797, "num_input_tokens_seen": 88132750, "step": 4094, "time_per_iteration": 2.782163619995117 }, { "auxiliary_loss_clip": 0.01329956, "auxiliary_loss_mlp": 0.0102591, "balance_loss_clip": 1.05011284, "balance_loss_mlp": 1.01860857, "epoch": 0.4923946371670775, "flos": 11363753018880.0, "grad_norm": 2.2295396392128706, "language_loss": 0.86331928, "learning_rate": 2.1463158386426593e-06, "loss": 0.88687789, "num_input_tokens_seen": 88150560, "step": 4095, "time_per_iteration": 3.529221296310425 }, { "auxiliary_loss_clip": 0.01288965, "auxiliary_loss_mlp": 0.0102779, "balance_loss_clip": 1.05057907, "balance_loss_mlp": 1.01999629, "epoch": 0.4925148800577166, "flos": 30445964334720.0, "grad_norm": 1.902701111938967, "language_loss": 0.76874733, "learning_rate": 2.145538943273407e-06, "loss": 0.79191488, "num_input_tokens_seen": 88170835, "step": 4096, "time_per_iteration": 2.8493595123291016 }, { "auxiliary_loss_clip": 0.01188854, "auxiliary_loss_mlp": 0.01032103, "balance_loss_clip": 1.05439425, "balance_loss_mlp": 1.02450633, "epoch": 0.49263512294835565, "flos": 20850454039680.0, "grad_norm": 2.1293576258097278, "language_loss": 0.71816349, "learning_rate": 2.144762025826013e-06, "loss": 0.74037302, "num_input_tokens_seen": 88189925, "step": 4097, "time_per_iteration": 3.4842960834503174 }, { "auxiliary_loss_clip": 0.01238199, "auxiliary_loss_mlp": 0.01028508, "balance_loss_clip": 1.04968524, "balance_loss_mlp": 1.02094412, "epoch": 0.49275536583899476, "flos": 23767477534080.0, "grad_norm": 2.51941093502445, "language_loss": 0.87001157, "learning_rate": 2.143985086418334e-06, "loss": 0.89267862, "num_input_tokens_seen": 88205105, "step": 4098, "time_per_iteration": 2.8092446327209473 }, { "auxiliary_loss_clip": 0.01285027, "auxiliary_loss_mlp": 0.0102496, "balance_loss_clip": 1.04794168, "balance_loss_mlp": 1.01783085, "epoch": 0.4928756087296339, "flos": 22273522041600.0, "grad_norm": 1.7898924274984098, "language_loss": 0.76638609, "learning_rate": 2.1432081251682324e-06, "loss": 0.78948599, "num_input_tokens_seen": 88225475, "step": 4099, "time_per_iteration": 2.718900203704834 }, { "auxiliary_loss_clip": 0.01245585, "auxiliary_loss_mlp": 0.01026588, "balance_loss_clip": 1.05926132, "balance_loss_mlp": 1.0193584, "epoch": 0.49299585162027293, "flos": 19645471463040.0, "grad_norm": 1.832696759631203, "language_loss": 0.8698895, "learning_rate": 2.142431142193572e-06, "loss": 0.89261121, "num_input_tokens_seen": 88243255, "step": 4100, "time_per_iteration": 2.673250675201416 }, { "auxiliary_loss_clip": 0.01184203, "auxiliary_loss_mlp": 0.0102769, "balance_loss_clip": 1.05251741, "balance_loss_mlp": 1.02034426, "epoch": 0.49311609451091204, "flos": 38837138497920.0, "grad_norm": 2.8837080944613485, "language_loss": 0.71365535, "learning_rate": 2.1416541376122207e-06, "loss": 0.73577428, "num_input_tokens_seen": 88263435, "step": 4101, "time_per_iteration": 2.7697925567626953 }, { "auxiliary_loss_clip": 0.01184945, "auxiliary_loss_mlp": 0.01026237, "balance_loss_clip": 1.04997706, "balance_loss_mlp": 1.01870263, "epoch": 0.49323633740155115, "flos": 28329102161280.0, "grad_norm": 2.0264516454909995, "language_loss": 0.73169315, "learning_rate": 2.1408771115420496e-06, "loss": 0.75380498, "num_input_tokens_seen": 88283295, "step": 4102, "time_per_iteration": 2.6863934993743896 }, { "auxiliary_loss_clip": 0.01397027, "auxiliary_loss_mlp": 0.01031422, "balance_loss_clip": 1.05727899, "balance_loss_mlp": 1.02417743, "epoch": 0.4933565802921902, "flos": 21135584200320.0, "grad_norm": 2.008328418644424, "language_loss": 0.64935887, "learning_rate": 2.140100064100932e-06, "loss": 0.67364329, "num_input_tokens_seen": 88299270, "step": 4103, "time_per_iteration": 2.858157157897949 }, { "auxiliary_loss_clip": 0.01231575, "auxiliary_loss_mlp": 0.01025943, "balance_loss_clip": 1.05229402, "balance_loss_mlp": 1.0189693, "epoch": 0.4934768231828293, "flos": 18039007595520.0, "grad_norm": 3.1755702641989827, "language_loss": 0.75709891, "learning_rate": 2.139322995406746e-06, "loss": 0.77967405, "num_input_tokens_seen": 88316905, "step": 4104, "time_per_iteration": 2.6984400749206543 }, { "auxiliary_loss_clip": 0.01186743, "auxiliary_loss_mlp": 0.01031155, "balance_loss_clip": 1.05378222, "balance_loss_mlp": 1.02349579, "epoch": 0.4935970660734684, "flos": 23469957181440.0, "grad_norm": 2.1668601685003255, "language_loss": 0.79699898, "learning_rate": 2.1385459055773727e-06, "loss": 0.81917799, "num_input_tokens_seen": 88335095, "step": 4105, "time_per_iteration": 2.685393810272217 }, { "auxiliary_loss_clip": 0.01368972, "auxiliary_loss_mlp": 0.02561272, "balance_loss_clip": 1.04313493, "balance_loss_mlp": 1.00019491, "epoch": 0.4937173089641075, "flos": 64479258840960.0, "grad_norm": 2.55958037116647, "language_loss": 0.7370832, "learning_rate": 2.137768794730696e-06, "loss": 0.77638566, "num_input_tokens_seen": 88358545, "step": 4106, "time_per_iteration": 3.2117631435394287 }, { "auxiliary_loss_clip": 0.01291072, "auxiliary_loss_mlp": 0.01028483, "balance_loss_clip": 1.05547488, "balance_loss_mlp": 1.02000082, "epoch": 0.4938375518547466, "flos": 22346025644160.0, "grad_norm": 1.8044477092487992, "language_loss": 0.80359882, "learning_rate": 2.1369916629846026e-06, "loss": 0.82679439, "num_input_tokens_seen": 88378295, "step": 4107, "time_per_iteration": 2.7997939586639404 }, { "auxiliary_loss_clip": 0.0128547, "auxiliary_loss_mlp": 0.01024446, "balance_loss_clip": 1.0492177, "balance_loss_mlp": 1.017061, "epoch": 0.4939577947453857, "flos": 17858700299520.0, "grad_norm": 1.8581552381994435, "language_loss": 0.75212002, "learning_rate": 2.136214510456983e-06, "loss": 0.7752192, "num_input_tokens_seen": 88396750, "step": 4108, "time_per_iteration": 2.961371421813965 }, { "auxiliary_loss_clip": 0.01294481, "auxiliary_loss_mlp": 0.02505666, "balance_loss_clip": 1.01515555, "balance_loss_mlp": 1.00014925, "epoch": 0.49407803763602476, "flos": 70066746875520.0, "grad_norm": 0.9234287445643253, "language_loss": 0.63131922, "learning_rate": 2.1354373372657296e-06, "loss": 0.6693207, "num_input_tokens_seen": 88455190, "step": 4109, "time_per_iteration": 3.343641519546509 }, { "auxiliary_loss_clip": 0.01186518, "auxiliary_loss_mlp": 0.01031284, "balance_loss_clip": 1.05332255, "balance_loss_mlp": 1.02403665, "epoch": 0.49419828052666387, "flos": 24317485562880.0, "grad_norm": 4.680266806208277, "language_loss": 0.7100426, "learning_rate": 2.1346601435287404e-06, "loss": 0.73222059, "num_input_tokens_seen": 88477460, "step": 4110, "time_per_iteration": 2.668780565261841 }, { "auxiliary_loss_clip": 0.01283876, "auxiliary_loss_mlp": 0.01026941, "balance_loss_clip": 1.04887652, "balance_loss_mlp": 1.01941621, "epoch": 0.494318523417303, "flos": 29386060790400.0, "grad_norm": 1.8642869406623956, "language_loss": 0.80206895, "learning_rate": 2.1338829293639144e-06, "loss": 0.82517713, "num_input_tokens_seen": 88497820, "step": 4111, "time_per_iteration": 2.7817623615264893 }, { "auxiliary_loss_clip": 0.01390324, "auxiliary_loss_mlp": 0.01032528, "balance_loss_clip": 1.0515151, "balance_loss_mlp": 1.02495217, "epoch": 0.49443876630794203, "flos": 15268284195840.0, "grad_norm": 3.018294106867742, "language_loss": 0.83110428, "learning_rate": 2.1331056948891547e-06, "loss": 0.85533279, "num_input_tokens_seen": 88514920, "step": 4112, "time_per_iteration": 2.835930824279785 }, { "auxiliary_loss_clip": 0.01280888, "auxiliary_loss_mlp": 0.01028117, "balance_loss_clip": 1.04872656, "balance_loss_mlp": 1.02061236, "epoch": 0.49455900919858115, "flos": 12347453859840.0, "grad_norm": 2.2283260197248094, "language_loss": 0.75715554, "learning_rate": 2.1323284402223666e-06, "loss": 0.78024566, "num_input_tokens_seen": 88530910, "step": 4113, "time_per_iteration": 2.7047362327575684 }, { "auxiliary_loss_clip": 0.01182911, "auxiliary_loss_mlp": 0.02558821, "balance_loss_clip": 1.05447924, "balance_loss_mlp": 1.00030851, "epoch": 0.4946792520892202, "flos": 22779610715520.0, "grad_norm": 1.777977710740508, "language_loss": 0.87982798, "learning_rate": 2.1315511654814597e-06, "loss": 0.91724533, "num_input_tokens_seen": 88549320, "step": 4114, "time_per_iteration": 2.6819467544555664 }, { "auxiliary_loss_clip": 0.0127598, "auxiliary_loss_mlp": 0.0102562, "balance_loss_clip": 1.05074704, "balance_loss_mlp": 1.01847041, "epoch": 0.4947994949798593, "flos": 23148126299520.0, "grad_norm": 2.000460230129795, "language_loss": 0.78479874, "learning_rate": 2.1307738707843456e-06, "loss": 0.80781472, "num_input_tokens_seen": 88568985, "step": 4115, "time_per_iteration": 3.734989643096924 }, { "auxiliary_loss_clip": 0.01242246, "auxiliary_loss_mlp": 0.01022292, "balance_loss_clip": 1.05353498, "balance_loss_mlp": 1.01424301, "epoch": 0.4949197378704984, "flos": 23659997063040.0, "grad_norm": 2.2713729443568265, "language_loss": 0.69336343, "learning_rate": 2.1299965562489385e-06, "loss": 0.71600878, "num_input_tokens_seen": 88588790, "step": 4116, "time_per_iteration": 2.6821272373199463 }, { "auxiliary_loss_clip": 0.01234056, "auxiliary_loss_mlp": 0.01024241, "balance_loss_clip": 1.04947352, "balance_loss_mlp": 1.01654053, "epoch": 0.4950399807611375, "flos": 26911493026560.0, "grad_norm": 1.6561417931797737, "language_loss": 0.79005951, "learning_rate": 2.129219221993158e-06, "loss": 0.81264246, "num_input_tokens_seen": 88613575, "step": 4117, "time_per_iteration": 3.688939094543457 }, { "auxiliary_loss_clip": 0.0123443, "auxiliary_loss_mlp": 0.00999281, "balance_loss_clip": 1.01385188, "balance_loss_mlp": 0.9983328, "epoch": 0.4951602236517766, "flos": 67315270187520.0, "grad_norm": 0.7883896176894193, "language_loss": 0.59912694, "learning_rate": 2.128441868134924e-06, "loss": 0.62146401, "num_input_tokens_seen": 88675510, "step": 4118, "time_per_iteration": 3.3732128143310547 }, { "auxiliary_loss_clip": 0.01334338, "auxiliary_loss_mlp": 0.01030946, "balance_loss_clip": 1.04743481, "balance_loss_mlp": 1.02312636, "epoch": 0.4952804665424157, "flos": 19901442758400.0, "grad_norm": 2.576263561979592, "language_loss": 0.82660443, "learning_rate": 2.1276644947921606e-06, "loss": 0.85025728, "num_input_tokens_seen": 88694425, "step": 4119, "time_per_iteration": 2.7892258167266846 }, { "auxiliary_loss_clip": 0.01234679, "auxiliary_loss_mlp": 0.01029331, "balance_loss_clip": 1.05057621, "balance_loss_mlp": 1.0217731, "epoch": 0.49540070943305475, "flos": 18806813740800.0, "grad_norm": 1.896184670079025, "language_loss": 0.82611477, "learning_rate": 2.126887102082795e-06, "loss": 0.84875488, "num_input_tokens_seen": 88714450, "step": 4120, "time_per_iteration": 2.743011236190796 }, { "auxiliary_loss_clip": 0.01329868, "auxiliary_loss_mlp": 0.01022568, "balance_loss_clip": 1.04468274, "balance_loss_mlp": 1.01512921, "epoch": 0.49552095232369386, "flos": 24934179191040.0, "grad_norm": 2.0209503693419366, "language_loss": 0.70768607, "learning_rate": 2.126109690124757e-06, "loss": 0.73121047, "num_input_tokens_seen": 88735265, "step": 4121, "time_per_iteration": 3.665943145751953 }, { "auxiliary_loss_clip": 0.01381985, "auxiliary_loss_mlp": 0.01033814, "balance_loss_clip": 1.04541183, "balance_loss_mlp": 1.02663171, "epoch": 0.495641195214333, "flos": 22857249962880.0, "grad_norm": 1.662271360464674, "language_loss": 0.70968306, "learning_rate": 2.1253322590359786e-06, "loss": 0.73384106, "num_input_tokens_seen": 88754600, "step": 4122, "time_per_iteration": 2.8502326011657715 }, { "auxiliary_loss_clip": 0.01236865, "auxiliary_loss_mlp": 0.01031389, "balance_loss_clip": 1.0512054, "balance_loss_mlp": 1.02375388, "epoch": 0.49576143810497203, "flos": 25769748343680.0, "grad_norm": 1.9843581420581209, "language_loss": 0.74111766, "learning_rate": 2.124554808934397e-06, "loss": 0.7638002, "num_input_tokens_seen": 88775180, "step": 4123, "time_per_iteration": 3.661850929260254 }, { "auxiliary_loss_clip": 0.01383197, "auxiliary_loss_mlp": 0.0102725, "balance_loss_clip": 1.04421806, "balance_loss_mlp": 1.01979947, "epoch": 0.49588168099561114, "flos": 22128838058880.0, "grad_norm": 1.959225912449428, "language_loss": 0.73079509, "learning_rate": 2.1237773399379496e-06, "loss": 0.75489956, "num_input_tokens_seen": 88796145, "step": 4124, "time_per_iteration": 2.771360158920288 }, { "auxiliary_loss_clip": 0.01188454, "auxiliary_loss_mlp": 0.01028127, "balance_loss_clip": 1.04475677, "balance_loss_mlp": 1.02087331, "epoch": 0.49600192388625025, "flos": 24387331559040.0, "grad_norm": 2.1712499920897956, "language_loss": 0.87085676, "learning_rate": 2.122999852164578e-06, "loss": 0.8930226, "num_input_tokens_seen": 88816765, "step": 4125, "time_per_iteration": 2.806997776031494 }, { "auxiliary_loss_clip": 0.01286484, "auxiliary_loss_mlp": 0.01029957, "balance_loss_clip": 1.05077624, "balance_loss_mlp": 1.0223279, "epoch": 0.4961221667768893, "flos": 22857429530880.0, "grad_norm": 2.72483758024556, "language_loss": 0.58656317, "learning_rate": 2.122222345732227e-06, "loss": 0.60972756, "num_input_tokens_seen": 88836680, "step": 4126, "time_per_iteration": 2.769883155822754 }, { "auxiliary_loss_clip": 0.01336034, "auxiliary_loss_mlp": 0.01033973, "balance_loss_clip": 1.04873097, "balance_loss_mlp": 1.02655792, "epoch": 0.4962424096675284, "flos": 17858089768320.0, "grad_norm": 1.9269496863668125, "language_loss": 0.83362901, "learning_rate": 2.121444820758843e-06, "loss": 0.85732913, "num_input_tokens_seen": 88855320, "step": 4127, "time_per_iteration": 2.675095796585083 }, { "auxiliary_loss_clip": 0.01383968, "auxiliary_loss_mlp": 0.01026862, "balance_loss_clip": 1.04857564, "balance_loss_mlp": 1.01877332, "epoch": 0.49636265255816747, "flos": 21793611404160.0, "grad_norm": 2.1996909702834, "language_loss": 0.7898978, "learning_rate": 2.120667277362376e-06, "loss": 0.81400609, "num_input_tokens_seen": 88874035, "step": 4128, "time_per_iteration": 2.8890936374664307 }, { "auxiliary_loss_clip": 0.01193313, "auxiliary_loss_mlp": 0.01039329, "balance_loss_clip": 1.05761123, "balance_loss_mlp": 1.03124046, "epoch": 0.4964828954488066, "flos": 16358603581440.0, "grad_norm": 2.0442151403814095, "language_loss": 0.8543098, "learning_rate": 2.1198897156607796e-06, "loss": 0.87663627, "num_input_tokens_seen": 88891390, "step": 4129, "time_per_iteration": 2.6068060398101807 }, { "auxiliary_loss_clip": 0.01143447, "auxiliary_loss_mlp": 0.01025197, "balance_loss_clip": 1.05213952, "balance_loss_mlp": 1.01737738, "epoch": 0.4966031383394457, "flos": 24711101775360.0, "grad_norm": 2.2830114380656044, "language_loss": 0.74228281, "learning_rate": 2.1191121357720085e-06, "loss": 0.7639693, "num_input_tokens_seen": 88909450, "step": 4130, "time_per_iteration": 2.687800645828247 }, { "auxiliary_loss_clip": 0.01385584, "auxiliary_loss_mlp": 0.01028703, "balance_loss_clip": 1.04923987, "balance_loss_mlp": 1.02073956, "epoch": 0.49672338123008475, "flos": 22930615491840.0, "grad_norm": 1.7530694012475256, "language_loss": 0.7465927, "learning_rate": 2.1183345378140206e-06, "loss": 0.77073562, "num_input_tokens_seen": 88929195, "step": 4131, "time_per_iteration": 2.7826993465423584 }, { "auxiliary_loss_clip": 0.01129958, "auxiliary_loss_mlp": 0.00998167, "balance_loss_clip": 1.01160979, "balance_loss_mlp": 0.99710578, "epoch": 0.49684362412072386, "flos": 65976736844160.0, "grad_norm": 0.8536412097201204, "language_loss": 0.61993551, "learning_rate": 2.1175569219047783e-06, "loss": 0.64121675, "num_input_tokens_seen": 88990635, "step": 4132, "time_per_iteration": 3.391906499862671 }, { "auxiliary_loss_clip": 0.01185225, "auxiliary_loss_mlp": 0.0103217, "balance_loss_clip": 1.05251336, "balance_loss_mlp": 1.02390838, "epoch": 0.49696386701136297, "flos": 19971288754560.0, "grad_norm": 3.2406818973788813, "language_loss": 0.73328173, "learning_rate": 2.1167792881622437e-06, "loss": 0.75545567, "num_input_tokens_seen": 89009655, "step": 4133, "time_per_iteration": 2.6174063682556152 }, { "auxiliary_loss_clip": 0.01280159, "auxiliary_loss_mlp": 0.01028959, "balance_loss_clip": 1.05011284, "balance_loss_mlp": 1.02169371, "epoch": 0.497084109902002, "flos": 24750819239040.0, "grad_norm": 2.3839916320621595, "language_loss": 0.809376, "learning_rate": 2.116001636704384e-06, "loss": 0.8324672, "num_input_tokens_seen": 89030040, "step": 4134, "time_per_iteration": 2.7341067790985107 }, { "auxiliary_loss_clip": 0.01296179, "auxiliary_loss_mlp": 0.01033046, "balance_loss_clip": 1.04823005, "balance_loss_mlp": 1.02592611, "epoch": 0.49720435279264114, "flos": 21871825269120.0, "grad_norm": 2.000744197278107, "language_loss": 0.80563879, "learning_rate": 2.1152239676491685e-06, "loss": 0.82893097, "num_input_tokens_seen": 89048145, "step": 4135, "time_per_iteration": 2.771641969680786 }, { "auxiliary_loss_clip": 0.01289147, "auxiliary_loss_mlp": 0.01024978, "balance_loss_clip": 1.0470649, "balance_loss_mlp": 1.01802778, "epoch": 0.49732459568328025, "flos": 23805794367360.0, "grad_norm": 1.7735905969710455, "language_loss": 0.73600388, "learning_rate": 2.114446281114569e-06, "loss": 0.75914514, "num_input_tokens_seen": 89067165, "step": 4136, "time_per_iteration": 2.7590909004211426 }, { "auxiliary_loss_clip": 0.01278161, "auxiliary_loss_mlp": 0.01024806, "balance_loss_clip": 1.04982471, "balance_loss_mlp": 1.01783586, "epoch": 0.4974448385739193, "flos": 20047742853120.0, "grad_norm": 2.0366895695850125, "language_loss": 0.76135612, "learning_rate": 2.1136685772185587e-06, "loss": 0.7843858, "num_input_tokens_seen": 89086190, "step": 4137, "time_per_iteration": 2.7410471439361572 }, { "auxiliary_loss_clip": 0.01286062, "auxiliary_loss_mlp": 0.02568736, "balance_loss_clip": 1.045946, "balance_loss_mlp": 1.00043595, "epoch": 0.4975650814645584, "flos": 24821347593600.0, "grad_norm": 1.990783538618622, "language_loss": 0.77984929, "learning_rate": 2.1128908560791163e-06, "loss": 0.81839734, "num_input_tokens_seen": 89106020, "step": 4138, "time_per_iteration": 2.75510573387146 }, { "auxiliary_loss_clip": 0.0118583, "auxiliary_loss_mlp": 0.01021808, "balance_loss_clip": 1.0529933, "balance_loss_mlp": 1.01460195, "epoch": 0.4976853243551975, "flos": 19829477859840.0, "grad_norm": 2.1673512861386497, "language_loss": 0.78505754, "learning_rate": 2.1121131178142203e-06, "loss": 0.80713391, "num_input_tokens_seen": 89125385, "step": 4139, "time_per_iteration": 2.6770565509796143 }, { "auxiliary_loss_clip": 0.01281165, "auxiliary_loss_mlp": 0.01029763, "balance_loss_clip": 1.04693365, "balance_loss_mlp": 1.02181745, "epoch": 0.4978055672458366, "flos": 23142990654720.0, "grad_norm": 1.5166479663822954, "language_loss": 0.82638192, "learning_rate": 2.1113353625418544e-06, "loss": 0.84949124, "num_input_tokens_seen": 89143935, "step": 4140, "time_per_iteration": 4.192366600036621 }, { "auxiliary_loss_clip": 0.01228932, "auxiliary_loss_mlp": 0.01025331, "balance_loss_clip": 1.05331278, "balance_loss_mlp": 1.01876569, "epoch": 0.4979258101364757, "flos": 15559914718080.0, "grad_norm": 1.767591121728797, "language_loss": 0.78912973, "learning_rate": 2.1105575903800017e-06, "loss": 0.81167233, "num_input_tokens_seen": 89162655, "step": 4141, "time_per_iteration": 2.7218494415283203 }, { "auxiliary_loss_clip": 0.01142432, "auxiliary_loss_mlp": 0.01030382, "balance_loss_clip": 1.05168831, "balance_loss_mlp": 1.02313113, "epoch": 0.4980460530271148, "flos": 26356169784960.0, "grad_norm": 2.1399130191919413, "language_loss": 0.85309893, "learning_rate": 2.1097798014466502e-06, "loss": 0.87482703, "num_input_tokens_seen": 89182255, "step": 4142, "time_per_iteration": 2.7035794258117676 }, { "auxiliary_loss_clip": 0.01239797, "auxiliary_loss_mlp": 0.01026651, "balance_loss_clip": 1.05290616, "balance_loss_mlp": 1.01895571, "epoch": 0.49816629591775385, "flos": 17274541415040.0, "grad_norm": 8.457942669996953, "language_loss": 0.59238994, "learning_rate": 2.109001995859791e-06, "loss": 0.61505437, "num_input_tokens_seen": 89201155, "step": 4143, "time_per_iteration": 3.457157611846924 }, { "auxiliary_loss_clip": 0.01176141, "auxiliary_loss_mlp": 0.01012074, "balance_loss_clip": 1.0135684, "balance_loss_mlp": 1.01117361, "epoch": 0.49828653880839296, "flos": 64930947344640.0, "grad_norm": 0.8071210640303216, "language_loss": 0.60024107, "learning_rate": 2.108224173737415e-06, "loss": 0.62212324, "num_input_tokens_seen": 89264455, "step": 4144, "time_per_iteration": 3.2433512210845947 }, { "auxiliary_loss_clip": 0.01280665, "auxiliary_loss_mlp": 0.01030796, "balance_loss_clip": 1.04719234, "balance_loss_mlp": 1.02231431, "epoch": 0.498406781699032, "flos": 27484806003840.0, "grad_norm": 15.776375112544851, "language_loss": 0.75651228, "learning_rate": 2.1074463351975183e-06, "loss": 0.77962697, "num_input_tokens_seen": 89283340, "step": 4145, "time_per_iteration": 2.6877658367156982 }, { "auxiliary_loss_clip": 0.01242824, "auxiliary_loss_mlp": 0.01025006, "balance_loss_clip": 1.05129957, "balance_loss_mlp": 1.01768064, "epoch": 0.49852702458967113, "flos": 31499870307840.0, "grad_norm": 1.7591895907636477, "language_loss": 0.71433794, "learning_rate": 2.106668480358098e-06, "loss": 0.7370162, "num_input_tokens_seen": 89303565, "step": 4146, "time_per_iteration": 2.8086867332458496 }, { "auxiliary_loss_clip": 0.01247041, "auxiliary_loss_mlp": 0.01030543, "balance_loss_clip": 1.04713202, "balance_loss_mlp": 1.02207315, "epoch": 0.49864726748031024, "flos": 22852868503680.0, "grad_norm": 2.1804721297455645, "language_loss": 0.71078098, "learning_rate": 2.105890609337154e-06, "loss": 0.73355681, "num_input_tokens_seen": 89322080, "step": 4147, "time_per_iteration": 3.666923999786377 }, { "auxiliary_loss_clip": 0.01072866, "auxiliary_loss_mlp": 0.01003056, "balance_loss_clip": 1.01148808, "balance_loss_mlp": 1.00210786, "epoch": 0.4987675103709493, "flos": 70405708544640.0, "grad_norm": 0.6902665917836267, "language_loss": 0.63811994, "learning_rate": 2.1051127222526883e-06, "loss": 0.65887916, "num_input_tokens_seen": 89394195, "step": 4148, "time_per_iteration": 3.2904574871063232 }, { "auxiliary_loss_clip": 0.01232325, "auxiliary_loss_mlp": 0.01029012, "balance_loss_clip": 1.05274963, "balance_loss_mlp": 1.021842, "epoch": 0.4988877532615884, "flos": 28767571482240.0, "grad_norm": 1.7297559188118679, "language_loss": 0.80757177, "learning_rate": 2.1043348192227067e-06, "loss": 0.83018517, "num_input_tokens_seen": 89414565, "step": 4149, "time_per_iteration": 3.670820951461792 }, { "auxiliary_loss_clip": 0.01327635, "auxiliary_loss_mlp": 0.01021702, "balance_loss_clip": 1.04896104, "balance_loss_mlp": 1.01471615, "epoch": 0.4990079961522275, "flos": 16872700988160.0, "grad_norm": 1.840613908049841, "language_loss": 0.6196779, "learning_rate": 2.1035569003652156e-06, "loss": 0.64317131, "num_input_tokens_seen": 89433195, "step": 4150, "time_per_iteration": 2.7445223331451416 }, { "auxiliary_loss_clip": 0.0138814, "auxiliary_loss_mlp": 0.01029009, "balance_loss_clip": 1.04949856, "balance_loss_mlp": 1.02074814, "epoch": 0.4991282390428666, "flos": 13291042187520.0, "grad_norm": 2.0128663384647028, "language_loss": 0.81533909, "learning_rate": 2.1027789657982255e-06, "loss": 0.83951062, "num_input_tokens_seen": 89447410, "step": 4151, "time_per_iteration": 2.704967737197876 }, { "auxiliary_loss_clip": 0.01283563, "auxiliary_loss_mlp": 0.0102936, "balance_loss_clip": 1.04754543, "balance_loss_mlp": 1.02208495, "epoch": 0.4992484819335057, "flos": 21537496454400.0, "grad_norm": 2.010187240126557, "language_loss": 0.77461052, "learning_rate": 2.1020010156397482e-06, "loss": 0.79773968, "num_input_tokens_seen": 89464630, "step": 4152, "time_per_iteration": 2.7819645404815674 }, { "auxiliary_loss_clip": 0.01237203, "auxiliary_loss_mlp": 0.01033576, "balance_loss_clip": 1.05195916, "balance_loss_mlp": 1.02580357, "epoch": 0.4993687248241448, "flos": 24860095390080.0, "grad_norm": 2.0869407868930256, "language_loss": 0.77526015, "learning_rate": 2.101223050007797e-06, "loss": 0.79796791, "num_input_tokens_seen": 89483180, "step": 4153, "time_per_iteration": 2.6645030975341797 }, { "auxiliary_loss_clip": 0.01072512, "auxiliary_loss_mlp": 0.01001767, "balance_loss_clip": 1.01132619, "balance_loss_mlp": 1.00081944, "epoch": 0.49948896771478385, "flos": 62941602453120.0, "grad_norm": 0.8101946151117984, "language_loss": 0.53807145, "learning_rate": 2.1004450690203904e-06, "loss": 0.55881423, "num_input_tokens_seen": 89539260, "step": 4154, "time_per_iteration": 3.2060556411743164 }, { "auxiliary_loss_clip": 0.01072374, "auxiliary_loss_mlp": 0.01000375, "balance_loss_clip": 1.01115263, "balance_loss_mlp": 0.99943954, "epoch": 0.49960921060542296, "flos": 68284213516800.0, "grad_norm": 0.8522600901112465, "language_loss": 0.63299453, "learning_rate": 2.099667072795546e-06, "loss": 0.65372199, "num_input_tokens_seen": 89601380, "step": 4155, "time_per_iteration": 3.2112271785736084 }, { "auxiliary_loss_clip": 0.01232141, "auxiliary_loss_mlp": 0.01023518, "balance_loss_clip": 1.05015349, "balance_loss_mlp": 1.01637793, "epoch": 0.49972945349606207, "flos": 23659350618240.0, "grad_norm": 1.9862424875835254, "language_loss": 0.79621822, "learning_rate": 2.0988890614512864e-06, "loss": 0.81877482, "num_input_tokens_seen": 89621270, "step": 4156, "time_per_iteration": 2.664026975631714 }, { "auxiliary_loss_clip": 0.01285848, "auxiliary_loss_mlp": 0.01029702, "balance_loss_clip": 1.05180836, "balance_loss_mlp": 1.0221622, "epoch": 0.4998496963867011, "flos": 19755825022080.0, "grad_norm": 1.7470603831291087, "language_loss": 0.84282792, "learning_rate": 2.098111035105635e-06, "loss": 0.86598337, "num_input_tokens_seen": 89639695, "step": 4157, "time_per_iteration": 2.6992902755737305 }, { "auxiliary_loss_clip": 0.01281077, "auxiliary_loss_mlp": 0.01027089, "balance_loss_clip": 1.04899049, "balance_loss_mlp": 1.01921558, "epoch": 0.49996993927734024, "flos": 22265728790400.0, "grad_norm": 2.2433393757423588, "language_loss": 0.73282903, "learning_rate": 2.0973329938766176e-06, "loss": 0.75591075, "num_input_tokens_seen": 89657125, "step": 4158, "time_per_iteration": 2.738851547241211 }, { "auxiliary_loss_clip": 0.01242852, "auxiliary_loss_mlp": 0.0103466, "balance_loss_clip": 1.05370235, "balance_loss_mlp": 1.02705789, "epoch": 0.5000901821679793, "flos": 23327212533120.0, "grad_norm": 2.0303538648133053, "language_loss": 0.78890651, "learning_rate": 2.0965549378822618e-06, "loss": 0.81168163, "num_input_tokens_seen": 89678415, "step": 4159, "time_per_iteration": 2.7415287494659424 }, { "auxiliary_loss_clip": 0.0143216, "auxiliary_loss_mlp": 0.01031685, "balance_loss_clip": 1.04184544, "balance_loss_mlp": 1.02366853, "epoch": 0.5002104250586185, "flos": 20339014239360.0, "grad_norm": 2.0146628136041844, "language_loss": 0.84187973, "learning_rate": 2.095776867240599e-06, "loss": 0.8665182, "num_input_tokens_seen": 89695405, "step": 4160, "time_per_iteration": 2.911146402359009 }, { "auxiliary_loss_clip": 0.01331172, "auxiliary_loss_mlp": 0.01029009, "balance_loss_clip": 1.04958868, "balance_loss_mlp": 1.02139807, "epoch": 0.5003306679492575, "flos": 13991372634240.0, "grad_norm": 2.8157786245615264, "language_loss": 0.82708722, "learning_rate": 2.094998782069661e-06, "loss": 0.85068905, "num_input_tokens_seen": 89713110, "step": 4161, "time_per_iteration": 2.91485333442688 }, { "auxiliary_loss_clip": 0.01187479, "auxiliary_loss_mlp": 0.01026727, "balance_loss_clip": 1.05373144, "balance_loss_mlp": 1.01942897, "epoch": 0.5004509108398966, "flos": 27672762896640.0, "grad_norm": 1.6805356846806552, "language_loss": 0.75627899, "learning_rate": 2.0942206824874845e-06, "loss": 0.77842104, "num_input_tokens_seen": 89735885, "step": 4162, "time_per_iteration": 2.640352487564087 }, { "auxiliary_loss_clip": 0.01240632, "auxiliary_loss_mlp": 0.01027818, "balance_loss_clip": 1.05512667, "balance_loss_mlp": 1.02004242, "epoch": 0.5005711537305357, "flos": 14976186796800.0, "grad_norm": 2.1457899289953666, "language_loss": 0.79001355, "learning_rate": 2.093442568612105e-06, "loss": 0.81269801, "num_input_tokens_seen": 89753690, "step": 4163, "time_per_iteration": 2.6331303119659424 }, { "auxiliary_loss_clip": 0.01187753, "auxiliary_loss_mlp": 0.01028757, "balance_loss_clip": 1.05287504, "balance_loss_mlp": 1.02149153, "epoch": 0.5006913966211748, "flos": 26503259978880.0, "grad_norm": 1.587308737945686, "language_loss": 0.85126543, "learning_rate": 2.0926644405615613e-06, "loss": 0.87343055, "num_input_tokens_seen": 89774590, "step": 4164, "time_per_iteration": 2.707958936691284 }, { "auxiliary_loss_clip": 0.01329532, "auxiliary_loss_mlp": 0.01024689, "balance_loss_clip": 1.04966211, "balance_loss_mlp": 1.01722646, "epoch": 0.5008116395118138, "flos": 20449295971200.0, "grad_norm": 9.649132619440723, "language_loss": 0.80976325, "learning_rate": 2.091886298453897e-06, "loss": 0.83330548, "num_input_tokens_seen": 89792775, "step": 4165, "time_per_iteration": 2.7181267738342285 }, { "auxiliary_loss_clip": 0.01233665, "auxiliary_loss_mlp": 0.01031561, "balance_loss_clip": 1.05107296, "balance_loss_mlp": 1.02436352, "epoch": 0.500931882402453, "flos": 21579871524480.0, "grad_norm": 1.912825706434224, "language_loss": 0.72579354, "learning_rate": 2.091108142407153e-06, "loss": 0.74844581, "num_input_tokens_seen": 89811515, "step": 4166, "time_per_iteration": 3.884100914001465 }, { "auxiliary_loss_clip": 0.01197489, "auxiliary_loss_mlp": 0.01008342, "balance_loss_clip": 1.02698922, "balance_loss_mlp": 1.00731647, "epoch": 0.5010521252930921, "flos": 57785011925760.0, "grad_norm": 0.8401286919300145, "language_loss": 0.62291181, "learning_rate": 2.090329972539377e-06, "loss": 0.64497012, "num_input_tokens_seen": 89870080, "step": 4167, "time_per_iteration": 3.32360577583313 }, { "auxiliary_loss_clip": 0.01521977, "auxiliary_loss_mlp": 0.01031405, "balance_loss_clip": 1.04220855, "balance_loss_mlp": 1.02397788, "epoch": 0.5011723681837311, "flos": 18625500864000.0, "grad_norm": 3.5072877160300338, "language_loss": 0.68609804, "learning_rate": 2.089551788968616e-06, "loss": 0.71163189, "num_input_tokens_seen": 89888045, "step": 4168, "time_per_iteration": 3.78326153755188 }, { "auxiliary_loss_clip": 0.01070641, "auxiliary_loss_mlp": 0.01001765, "balance_loss_clip": 1.00962353, "balance_loss_mlp": 1.00081182, "epoch": 0.5012926110743702, "flos": 55883146608000.0, "grad_norm": 0.8410451818228101, "language_loss": 0.60762852, "learning_rate": 2.08877359181292e-06, "loss": 0.62835258, "num_input_tokens_seen": 89944610, "step": 4169, "time_per_iteration": 3.8049914836883545 }, { "auxiliary_loss_clip": 0.01342119, "auxiliary_loss_mlp": 0.01028836, "balance_loss_clip": 1.04808426, "balance_loss_mlp": 1.02088451, "epoch": 0.5014128539650093, "flos": 24238266117120.0, "grad_norm": 2.426558944242154, "language_loss": 0.85644233, "learning_rate": 2.0879953811903396e-06, "loss": 0.88015187, "num_input_tokens_seen": 89959495, "step": 4170, "time_per_iteration": 2.837725877761841 }, { "auxiliary_loss_clip": 0.01235814, "auxiliary_loss_mlp": 0.01028845, "balance_loss_clip": 1.05456996, "balance_loss_mlp": 1.01998758, "epoch": 0.5015330968556484, "flos": 27527468382720.0, "grad_norm": 1.9730809085668162, "language_loss": 0.78524286, "learning_rate": 2.08721715721893e-06, "loss": 0.80788946, "num_input_tokens_seen": 89978820, "step": 4171, "time_per_iteration": 2.6633224487304688 }, { "auxiliary_loss_clip": 0.01234404, "auxiliary_loss_mlp": 0.01024426, "balance_loss_clip": 1.05093741, "balance_loss_mlp": 1.01646876, "epoch": 0.5016533397462875, "flos": 23800802376960.0, "grad_norm": 1.905723966956785, "language_loss": 0.76909906, "learning_rate": 2.0864389200167477e-06, "loss": 0.79168737, "num_input_tokens_seen": 89997075, "step": 4172, "time_per_iteration": 2.7141213417053223 }, { "auxiliary_loss_clip": 0.01239386, "auxiliary_loss_mlp": 0.02565915, "balance_loss_clip": 1.05244291, "balance_loss_mlp": 1.00048578, "epoch": 0.5017735826369266, "flos": 25295009264640.0, "grad_norm": 2.116187847421211, "language_loss": 0.79133689, "learning_rate": 2.0856606697018504e-06, "loss": 0.82938993, "num_input_tokens_seen": 90015085, "step": 4173, "time_per_iteration": 3.571125030517578 }, { "auxiliary_loss_clip": 0.01284435, "auxiliary_loss_mlp": 0.01027567, "balance_loss_clip": 1.04952598, "balance_loss_mlp": 1.01871574, "epoch": 0.5018938255275657, "flos": 16873203778560.0, "grad_norm": 3.5182449984103172, "language_loss": 0.73382968, "learning_rate": 2.084882406392297e-06, "loss": 0.75694966, "num_input_tokens_seen": 90033045, "step": 4174, "time_per_iteration": 2.6927130222320557 }, { "auxiliary_loss_clip": 0.01240715, "auxiliary_loss_mlp": 0.01028782, "balance_loss_clip": 1.05541301, "balance_loss_mlp": 1.02146208, "epoch": 0.5020140684182047, "flos": 25515429073920.0, "grad_norm": 2.8084566104658393, "language_loss": 0.7156949, "learning_rate": 2.0841041302061496e-06, "loss": 0.73838985, "num_input_tokens_seen": 90052505, "step": 4175, "time_per_iteration": 3.7973413467407227 }, { "auxiliary_loss_clip": 0.01278505, "auxiliary_loss_mlp": 0.01028959, "balance_loss_clip": 1.04601645, "balance_loss_mlp": 1.02121079, "epoch": 0.5021343113088439, "flos": 23659278791040.0, "grad_norm": 1.9015787389038572, "language_loss": 0.75530612, "learning_rate": 2.083325841261473e-06, "loss": 0.77838081, "num_input_tokens_seen": 90071565, "step": 4176, "time_per_iteration": 2.6683974266052246 }, { "auxiliary_loss_clip": 0.01283065, "auxiliary_loss_mlp": 0.01027186, "balance_loss_clip": 1.04815793, "balance_loss_mlp": 1.01935947, "epoch": 0.502254554199483, "flos": 24534673148160.0, "grad_norm": 2.261233225763131, "language_loss": 0.65920979, "learning_rate": 2.0825475396763322e-06, "loss": 0.68231237, "num_input_tokens_seen": 90092215, "step": 4177, "time_per_iteration": 2.726410150527954 }, { "auxiliary_loss_clip": 0.01534559, "auxiliary_loss_mlp": 0.01027558, "balance_loss_clip": 1.04472065, "balance_loss_mlp": 1.0194819, "epoch": 0.502374797090122, "flos": 34240285607040.0, "grad_norm": 1.7478749717544888, "language_loss": 0.65740216, "learning_rate": 2.081769225568796e-06, "loss": 0.68302333, "num_input_tokens_seen": 90114665, "step": 4178, "time_per_iteration": 3.1406784057617188 }, { "auxiliary_loss_clip": 0.01237608, "auxiliary_loss_mlp": 0.01028301, "balance_loss_clip": 1.04973626, "balance_loss_mlp": 1.02050161, "epoch": 0.5024950399807612, "flos": 26031106679040.0, "grad_norm": 1.566373579950536, "language_loss": 0.7585972, "learning_rate": 2.0809908990569327e-06, "loss": 0.78125632, "num_input_tokens_seen": 90136445, "step": 4179, "time_per_iteration": 3.1817538738250732 }, { "auxiliary_loss_clip": 0.01284186, "auxiliary_loss_mlp": 0.01030164, "balance_loss_clip": 1.05062604, "balance_loss_mlp": 1.02204251, "epoch": 0.5026152828714002, "flos": 21252438120960.0, "grad_norm": 1.750413583068922, "language_loss": 0.78999078, "learning_rate": 2.0802125602588146e-06, "loss": 0.81313431, "num_input_tokens_seen": 90155710, "step": 4180, "time_per_iteration": 2.7144672870635986 }, { "auxiliary_loss_clip": 0.01190055, "auxiliary_loss_mlp": 0.01028905, "balance_loss_clip": 1.0556668, "balance_loss_mlp": 1.02059019, "epoch": 0.5027355257620393, "flos": 30956111245440.0, "grad_norm": 2.214441960041844, "language_loss": 0.66843641, "learning_rate": 2.0794342092925146e-06, "loss": 0.69062603, "num_input_tokens_seen": 90176845, "step": 4181, "time_per_iteration": 2.677269458770752 }, { "auxiliary_loss_clip": 0.01242279, "auxiliary_loss_mlp": 0.01028336, "balance_loss_clip": 1.05615783, "balance_loss_mlp": 1.02090883, "epoch": 0.5028557686526784, "flos": 24791147233920.0, "grad_norm": 2.287320450264016, "language_loss": 0.678177, "learning_rate": 2.078655846276108e-06, "loss": 0.70088309, "num_input_tokens_seen": 90197175, "step": 4182, "time_per_iteration": 2.7539780139923096 }, { "auxiliary_loss_clip": 0.01284764, "auxiliary_loss_mlp": 0.01029413, "balance_loss_clip": 1.05112958, "balance_loss_mlp": 1.02141988, "epoch": 0.5029760115433175, "flos": 22966992990720.0, "grad_norm": 1.8946451812654899, "language_loss": 0.68885142, "learning_rate": 2.0778774713276727e-06, "loss": 0.71199322, "num_input_tokens_seen": 90216650, "step": 4183, "time_per_iteration": 2.6224398612976074 }, { "auxiliary_loss_clip": 0.01235754, "auxiliary_loss_mlp": 0.01025452, "balance_loss_clip": 1.04925478, "balance_loss_mlp": 1.01732826, "epoch": 0.5030962544339566, "flos": 15305164485120.0, "grad_norm": 2.5403464611550466, "language_loss": 0.68414193, "learning_rate": 2.077099084565287e-06, "loss": 0.70675397, "num_input_tokens_seen": 90234055, "step": 4184, "time_per_iteration": 2.719621419906616 }, { "auxiliary_loss_clip": 0.01284044, "auxiliary_loss_mlp": 0.01033391, "balance_loss_clip": 1.04665053, "balance_loss_mlp": 1.02530301, "epoch": 0.5032164973245957, "flos": 24494847943680.0, "grad_norm": 2.3741521794356055, "language_loss": 0.65171528, "learning_rate": 2.0763206861070313e-06, "loss": 0.67488962, "num_input_tokens_seen": 90253115, "step": 4185, "time_per_iteration": 2.634406566619873 }, { "auxiliary_loss_clip": 0.01189975, "auxiliary_loss_mlp": 0.01031747, "balance_loss_clip": 1.05536473, "balance_loss_mlp": 1.02401042, "epoch": 0.5033367402152348, "flos": 16213452721920.0, "grad_norm": 2.1040253436877943, "language_loss": 0.75981146, "learning_rate": 2.0755422760709876e-06, "loss": 0.78202868, "num_input_tokens_seen": 90270515, "step": 4186, "time_per_iteration": 2.6740145683288574 }, { "auxiliary_loss_clip": 0.01284398, "auxiliary_loss_mlp": 0.01026977, "balance_loss_clip": 1.04457641, "balance_loss_mlp": 1.01888263, "epoch": 0.5034569831058738, "flos": 21391375927680.0, "grad_norm": 1.9083566981690696, "language_loss": 0.77168965, "learning_rate": 2.0747638545752417e-06, "loss": 0.79480338, "num_input_tokens_seen": 90289075, "step": 4187, "time_per_iteration": 2.7839784622192383 }, { "auxiliary_loss_clip": 0.01283621, "auxiliary_loss_mlp": 0.01025892, "balance_loss_clip": 1.05370426, "balance_loss_mlp": 1.01790464, "epoch": 0.503577225996513, "flos": 20558751690240.0, "grad_norm": 1.8970379046166974, "language_loss": 0.83245009, "learning_rate": 2.073985421737878e-06, "loss": 0.85554522, "num_input_tokens_seen": 90306385, "step": 4188, "time_per_iteration": 2.690967082977295 }, { "auxiliary_loss_clip": 0.01241776, "auxiliary_loss_mlp": 0.01025468, "balance_loss_clip": 1.05461788, "balance_loss_mlp": 1.01727879, "epoch": 0.5036974688871521, "flos": 27229157930880.0, "grad_norm": 2.305684477621911, "language_loss": 0.74217683, "learning_rate": 2.0732069776769844e-06, "loss": 0.76484931, "num_input_tokens_seen": 90323795, "step": 4189, "time_per_iteration": 2.673722743988037 }, { "auxiliary_loss_clip": 0.01191145, "auxiliary_loss_mlp": 0.01029879, "balance_loss_clip": 1.05723321, "balance_loss_mlp": 1.0222733, "epoch": 0.5038177117777911, "flos": 20412164286720.0, "grad_norm": 10.46057131021812, "language_loss": 0.73487639, "learning_rate": 2.072428522510651e-06, "loss": 0.75708663, "num_input_tokens_seen": 90340360, "step": 4190, "time_per_iteration": 2.612955331802368 }, { "auxiliary_loss_clip": 0.01331866, "auxiliary_loss_mlp": 0.01032406, "balance_loss_clip": 1.04935002, "balance_loss_mlp": 1.02526498, "epoch": 0.5039379546684303, "flos": 21907987286400.0, "grad_norm": 2.2039779530997445, "language_loss": 0.76280499, "learning_rate": 2.071650056356968e-06, "loss": 0.78644776, "num_input_tokens_seen": 90357900, "step": 4191, "time_per_iteration": 2.685763359069824 }, { "auxiliary_loss_clip": 0.01191054, "auxiliary_loss_mlp": 0.01027691, "balance_loss_clip": 1.05636883, "balance_loss_mlp": 1.02035999, "epoch": 0.5040581975590693, "flos": 20010718909440.0, "grad_norm": 1.9832538753152447, "language_loss": 0.80039144, "learning_rate": 2.070871579334028e-06, "loss": 0.82257885, "num_input_tokens_seen": 90377010, "step": 4192, "time_per_iteration": 2.5858988761901855 }, { "auxiliary_loss_clip": 0.0118812, "auxiliary_loss_mlp": 0.01031708, "balance_loss_clip": 1.0535295, "balance_loss_mlp": 1.02358413, "epoch": 0.5041784404497084, "flos": 20959837931520.0, "grad_norm": 1.6006667600027416, "language_loss": 0.71854258, "learning_rate": 2.0700930915599264e-06, "loss": 0.7407409, "num_input_tokens_seen": 90396740, "step": 4193, "time_per_iteration": 3.750080108642578 }, { "auxiliary_loss_clip": 0.01186393, "auxiliary_loss_mlp": 0.01021453, "balance_loss_clip": 1.05391502, "balance_loss_mlp": 1.01403236, "epoch": 0.5042986833403476, "flos": 12495082757760.0, "grad_norm": 3.2738584540713105, "language_loss": 0.78579724, "learning_rate": 2.0693145931527583e-06, "loss": 0.80787569, "num_input_tokens_seen": 90413220, "step": 4194, "time_per_iteration": 3.4346187114715576 }, { "auxiliary_loss_clip": 0.01279121, "auxiliary_loss_mlp": 0.01027659, "balance_loss_clip": 1.04917383, "balance_loss_mlp": 1.01985669, "epoch": 0.5044189262309866, "flos": 29202305788800.0, "grad_norm": 1.9869202257030683, "language_loss": 0.78253293, "learning_rate": 2.068536084230622e-06, "loss": 0.80560076, "num_input_tokens_seen": 90435085, "step": 4195, "time_per_iteration": 2.7597763538360596 }, { "auxiliary_loss_clip": 0.01239354, "auxiliary_loss_mlp": 0.01028235, "balance_loss_clip": 1.05551744, "balance_loss_mlp": 1.02031386, "epoch": 0.5045391691216257, "flos": 23873198238720.0, "grad_norm": 2.537407436432198, "language_loss": 0.889521, "learning_rate": 2.067757564911616e-06, "loss": 0.91219687, "num_input_tokens_seen": 90453660, "step": 4196, "time_per_iteration": 2.6904523372650146 }, { "auxiliary_loss_clip": 0.0129691, "auxiliary_loss_mlp": 0.02569666, "balance_loss_clip": 1.05273366, "balance_loss_mlp": 1.0005393, "epoch": 0.5046594120122648, "flos": 24644990793600.0, "grad_norm": 2.2254169153339043, "language_loss": 0.92705804, "learning_rate": 2.0669790353138407e-06, "loss": 0.96572375, "num_input_tokens_seen": 90472625, "step": 4197, "time_per_iteration": 2.6739180088043213 }, { "auxiliary_loss_clip": 0.01328878, "auxiliary_loss_mlp": 0.02570618, "balance_loss_clip": 1.05130911, "balance_loss_mlp": 1.00055337, "epoch": 0.5047796549029039, "flos": 23362835846400.0, "grad_norm": 2.1347324845978695, "language_loss": 0.72884488, "learning_rate": 2.0662004955553995e-06, "loss": 0.76783985, "num_input_tokens_seen": 90492325, "step": 4198, "time_per_iteration": 2.73905611038208 }, { "auxiliary_loss_clip": 0.01284692, "auxiliary_loss_mlp": 0.01028187, "balance_loss_clip": 1.0485723, "balance_loss_mlp": 1.02081943, "epoch": 0.5048998977935429, "flos": 17304095329920.0, "grad_norm": 1.9477206343537974, "language_loss": 0.769292, "learning_rate": 2.065421945754395e-06, "loss": 0.79242074, "num_input_tokens_seen": 90510055, "step": 4199, "time_per_iteration": 3.543562412261963 }, { "auxiliary_loss_clip": 0.01389835, "auxiliary_loss_mlp": 0.01037343, "balance_loss_clip": 1.05311096, "balance_loss_mlp": 1.03017211, "epoch": 0.505020140684182, "flos": 34856979235200.0, "grad_norm": 1.6791075064865157, "language_loss": 0.7816667, "learning_rate": 2.0646433860289344e-06, "loss": 0.80593848, "num_input_tokens_seen": 90528980, "step": 4200, "time_per_iteration": 2.8779854774475098 }, { "auxiliary_loss_clip": 0.0124322, "auxiliary_loss_mlp": 0.02573353, "balance_loss_clip": 1.05360389, "balance_loss_mlp": 1.00055194, "epoch": 0.5051403835748212, "flos": 24863974058880.0, "grad_norm": 2.8214318576296606, "language_loss": 0.82905763, "learning_rate": 2.0638648164971233e-06, "loss": 0.86722338, "num_input_tokens_seen": 90547445, "step": 4201, "time_per_iteration": 3.58199405670166 }, { "auxiliary_loss_clip": 0.0128707, "auxiliary_loss_mlp": 0.01023906, "balance_loss_clip": 1.05251765, "balance_loss_mlp": 1.0162437, "epoch": 0.5052606264654602, "flos": 20959694277120.0, "grad_norm": 2.3428560908845606, "language_loss": 0.88343406, "learning_rate": 2.06308623727707e-06, "loss": 0.90654373, "num_input_tokens_seen": 90567545, "step": 4202, "time_per_iteration": 2.663426399230957 }, { "auxiliary_loss_clip": 0.01235687, "auxiliary_loss_mlp": 0.01022011, "balance_loss_clip": 1.053424, "balance_loss_mlp": 1.0139879, "epoch": 0.5053808693560993, "flos": 19642382893440.0, "grad_norm": 2.5719106333400243, "language_loss": 0.76371419, "learning_rate": 2.0623076484868846e-06, "loss": 0.78629118, "num_input_tokens_seen": 90585000, "step": 4203, "time_per_iteration": 2.6759989261627197 }, { "auxiliary_loss_clip": 0.0117674, "auxiliary_loss_mlp": 0.01006927, "balance_loss_clip": 1.01415157, "balance_loss_mlp": 1.00602663, "epoch": 0.5055011122467384, "flos": 67504915019520.0, "grad_norm": 0.8289232627244497, "language_loss": 0.60675198, "learning_rate": 2.061529050244679e-06, "loss": 0.62858862, "num_input_tokens_seen": 90644745, "step": 4204, "time_per_iteration": 3.156445026397705 }, { "auxiliary_loss_clip": 0.0124618, "auxiliary_loss_mlp": 0.01025355, "balance_loss_clip": 1.05105078, "balance_loss_mlp": 1.01817858, "epoch": 0.5056213551373775, "flos": 16872952383360.0, "grad_norm": 5.989393459890356, "language_loss": 0.74401772, "learning_rate": 2.060750442668565e-06, "loss": 0.76673305, "num_input_tokens_seen": 90662500, "step": 4205, "time_per_iteration": 2.682685136795044 }, { "auxiliary_loss_clip": 0.01239139, "auxiliary_loss_mlp": 0.01025416, "balance_loss_clip": 1.05547762, "balance_loss_mlp": 1.01828766, "epoch": 0.5057415980280165, "flos": 15334179696000.0, "grad_norm": 2.9219108892922705, "language_loss": 0.63892376, "learning_rate": 2.059971825876657e-06, "loss": 0.66156936, "num_input_tokens_seen": 90677010, "step": 4206, "time_per_iteration": 2.573573350906372 }, { "auxiliary_loss_clip": 0.01239888, "auxiliary_loss_mlp": 0.01030922, "balance_loss_clip": 1.05309677, "balance_loss_mlp": 1.02347708, "epoch": 0.5058618409186557, "flos": 19025976574080.0, "grad_norm": 2.7387074534438556, "language_loss": 0.76459318, "learning_rate": 2.0591931999870713e-06, "loss": 0.7873013, "num_input_tokens_seen": 90695935, "step": 4207, "time_per_iteration": 2.6271414756774902 }, { "auxiliary_loss_clip": 0.01122366, "auxiliary_loss_mlp": 0.01001601, "balance_loss_clip": 1.01337314, "balance_loss_mlp": 1.00068903, "epoch": 0.5059820838092948, "flos": 63453114080640.0, "grad_norm": 0.8284379637430926, "language_loss": 0.57508707, "learning_rate": 2.0584145651179234e-06, "loss": 0.59632683, "num_input_tokens_seen": 90751645, "step": 4208, "time_per_iteration": 3.200458288192749 }, { "auxiliary_loss_clip": 0.01282613, "auxiliary_loss_mlp": 0.02569526, "balance_loss_clip": 1.05268574, "balance_loss_mlp": 1.00052142, "epoch": 0.5061023266999338, "flos": 15441803821440.0, "grad_norm": 3.063184148546686, "language_loss": 0.80095911, "learning_rate": 2.0576359213873327e-06, "loss": 0.83948046, "num_input_tokens_seen": 90766795, "step": 4209, "time_per_iteration": 2.6157121658325195 }, { "auxiliary_loss_clip": 0.01295187, "auxiliary_loss_mlp": 0.01032884, "balance_loss_clip": 1.05045652, "balance_loss_mlp": 1.02488542, "epoch": 0.506222569590573, "flos": 22451063990400.0, "grad_norm": 5.444401714201307, "language_loss": 0.70604479, "learning_rate": 2.056857268913419e-06, "loss": 0.72932553, "num_input_tokens_seen": 90786845, "step": 4210, "time_per_iteration": 2.6921019554138184 }, { "auxiliary_loss_clip": 0.012376, "auxiliary_loss_mlp": 0.01034598, "balance_loss_clip": 1.05429411, "balance_loss_mlp": 1.02671862, "epoch": 0.506342812481212, "flos": 17558665994880.0, "grad_norm": 2.1400263757346654, "language_loss": 0.8387205, "learning_rate": 2.056078607814303e-06, "loss": 0.86144245, "num_input_tokens_seen": 90802630, "step": 4211, "time_per_iteration": 2.5700321197509766 }, { "auxiliary_loss_clip": 0.01233984, "auxiliary_loss_mlp": 0.01024565, "balance_loss_clip": 1.05309415, "balance_loss_mlp": 1.01684058, "epoch": 0.5064630553718511, "flos": 23402050519680.0, "grad_norm": 1.9596587596091506, "language_loss": 0.78643012, "learning_rate": 2.055299938208106e-06, "loss": 0.80901563, "num_input_tokens_seen": 90823620, "step": 4212, "time_per_iteration": 2.6806366443634033 }, { "auxiliary_loss_clip": 0.0124339, "auxiliary_loss_mlp": 0.01031669, "balance_loss_clip": 1.05478168, "balance_loss_mlp": 1.02394402, "epoch": 0.5065832982624903, "flos": 23987035416960.0, "grad_norm": 1.728046828147528, "language_loss": 0.8603934, "learning_rate": 2.0545212602129526e-06, "loss": 0.88314402, "num_input_tokens_seen": 90843475, "step": 4213, "time_per_iteration": 2.642712354660034 }, { "auxiliary_loss_clip": 0.01282021, "auxiliary_loss_mlp": 0.01026563, "balance_loss_clip": 1.04925275, "balance_loss_mlp": 1.01860607, "epoch": 0.5067035411531293, "flos": 21503058289920.0, "grad_norm": 2.1247600311230426, "language_loss": 0.66554207, "learning_rate": 2.0537425739469673e-06, "loss": 0.6886279, "num_input_tokens_seen": 90862410, "step": 4214, "time_per_iteration": 2.7188549041748047 }, { "auxiliary_loss_clip": 0.01127196, "auxiliary_loss_mlp": 0.01005, "balance_loss_clip": 1.01230872, "balance_loss_mlp": 1.00401103, "epoch": 0.5068237840437684, "flos": 65934397687680.0, "grad_norm": 0.8431149404073137, "language_loss": 0.59403682, "learning_rate": 2.052963879528276e-06, "loss": 0.61535877, "num_input_tokens_seen": 90922280, "step": 4215, "time_per_iteration": 3.220952033996582 }, { "auxiliary_loss_clip": 0.0123939, "auxiliary_loss_mlp": 0.01031699, "balance_loss_clip": 1.05442142, "balance_loss_mlp": 1.02352738, "epoch": 0.5069440269344075, "flos": 27264206626560.0, "grad_norm": 2.0269640873184636, "language_loss": 0.76393896, "learning_rate": 2.052185177075007e-06, "loss": 0.78664988, "num_input_tokens_seen": 90941850, "step": 4216, "time_per_iteration": 2.6739485263824463 }, { "auxiliary_loss_clip": 0.01240241, "auxiliary_loss_mlp": 0.01030818, "balance_loss_clip": 1.05228543, "balance_loss_mlp": 1.02285504, "epoch": 0.5070642698250466, "flos": 23366319465600.0, "grad_norm": 1.6238099056653537, "language_loss": 0.82751811, "learning_rate": 2.051406466705288e-06, "loss": 0.85022867, "num_input_tokens_seen": 90961390, "step": 4217, "time_per_iteration": 2.597687244415283 }, { "auxiliary_loss_clip": 0.01187301, "auxiliary_loss_mlp": 0.01029807, "balance_loss_clip": 1.05295885, "balance_loss_mlp": 1.02220726, "epoch": 0.5071845127156857, "flos": 20340127560960.0, "grad_norm": 1.8460121900165614, "language_loss": 0.81105375, "learning_rate": 2.0506277485372486e-06, "loss": 0.83322483, "num_input_tokens_seen": 90980215, "step": 4218, "time_per_iteration": 2.6549878120422363 }, { "auxiliary_loss_clip": 0.01236668, "auxiliary_loss_mlp": 0.01030534, "balance_loss_clip": 1.05251944, "balance_loss_mlp": 1.02281499, "epoch": 0.5073047556063248, "flos": 12092955022080.0, "grad_norm": 5.878785431905208, "language_loss": 0.67412138, "learning_rate": 2.04984902268902e-06, "loss": 0.69679344, "num_input_tokens_seen": 90997415, "step": 4219, "time_per_iteration": 3.5617048740386963 }, { "auxiliary_loss_clip": 0.01245154, "auxiliary_loss_mlp": 0.01027959, "balance_loss_clip": 1.05242956, "balance_loss_mlp": 1.01924467, "epoch": 0.5074249984969639, "flos": 19682854542720.0, "grad_norm": 5.971070889515067, "language_loss": 0.75483704, "learning_rate": 2.0490702892787345e-06, "loss": 0.77756816, "num_input_tokens_seen": 91016475, "step": 4220, "time_per_iteration": 3.5786798000335693 }, { "auxiliary_loss_clip": 0.01231295, "auxiliary_loss_mlp": 0.0102868, "balance_loss_clip": 1.05019808, "balance_loss_mlp": 1.02128863, "epoch": 0.5075452413876029, "flos": 28765703975040.0, "grad_norm": 3.7536291219538898, "language_loss": 0.62510169, "learning_rate": 2.0482915484245246e-06, "loss": 0.6477015, "num_input_tokens_seen": 91038095, "step": 4221, "time_per_iteration": 2.7080938816070557 }, { "auxiliary_loss_clip": 0.01382383, "auxiliary_loss_mlp": 0.01027722, "balance_loss_clip": 1.04826188, "balance_loss_mlp": 1.01942551, "epoch": 0.5076654842782421, "flos": 20339445202560.0, "grad_norm": 3.3378579871423626, "language_loss": 0.84123242, "learning_rate": 2.047512800244526e-06, "loss": 0.86533344, "num_input_tokens_seen": 91053360, "step": 4222, "time_per_iteration": 2.771091938018799 }, { "auxiliary_loss_clip": 0.01237817, "auxiliary_loss_mlp": 0.01033165, "balance_loss_clip": 1.05505002, "balance_loss_mlp": 1.02481461, "epoch": 0.5077857271688812, "flos": 26359653404160.0, "grad_norm": 1.9705214048520783, "language_loss": 0.79063851, "learning_rate": 2.046734044856873e-06, "loss": 0.81334829, "num_input_tokens_seen": 91072770, "step": 4223, "time_per_iteration": 2.662229537963867 }, { "auxiliary_loss_clip": 0.01236442, "auxiliary_loss_mlp": 0.01028252, "balance_loss_clip": 1.05267775, "balance_loss_mlp": 1.02013421, "epoch": 0.5079059700595202, "flos": 21798962530560.0, "grad_norm": 1.9023128432605896, "language_loss": 0.8152343, "learning_rate": 2.045955282379702e-06, "loss": 0.83788121, "num_input_tokens_seen": 91091430, "step": 4224, "time_per_iteration": 2.6408963203430176 }, { "auxiliary_loss_clip": 0.01236168, "auxiliary_loss_mlp": 0.01029535, "balance_loss_clip": 1.05070519, "balance_loss_mlp": 1.02183759, "epoch": 0.5080262129501594, "flos": 13187943175680.0, "grad_norm": 2.9954433392108704, "language_loss": 0.76118684, "learning_rate": 2.045176512931152e-06, "loss": 0.78384382, "num_input_tokens_seen": 91106060, "step": 4225, "time_per_iteration": 3.531169891357422 }, { "auxiliary_loss_clip": 0.01239522, "auxiliary_loss_mlp": 0.01032781, "balance_loss_clip": 1.04839158, "balance_loss_mlp": 1.02516985, "epoch": 0.5081464558407984, "flos": 25301473712640.0, "grad_norm": 1.9848145498258551, "language_loss": 0.76051116, "learning_rate": 2.0443977366293604e-06, "loss": 0.78323424, "num_input_tokens_seen": 91124100, "step": 4226, "time_per_iteration": 2.728571653366089 }, { "auxiliary_loss_clip": 0.01440728, "auxiliary_loss_mlp": 0.01032557, "balance_loss_clip": 1.04666281, "balance_loss_mlp": 1.02396834, "epoch": 0.5082666987314375, "flos": 30951226995840.0, "grad_norm": 1.6389621680992141, "language_loss": 0.76954323, "learning_rate": 2.043618953592468e-06, "loss": 0.79427612, "num_input_tokens_seen": 91146555, "step": 4227, "time_per_iteration": 3.812769651412964 }, { "auxiliary_loss_clip": 0.01287987, "auxiliary_loss_mlp": 0.01027318, "balance_loss_clip": 1.05336356, "balance_loss_mlp": 1.01934886, "epoch": 0.5083869416220766, "flos": 19682495406720.0, "grad_norm": 2.17422309593626, "language_loss": 0.81266522, "learning_rate": 2.0428401639386144e-06, "loss": 0.83581829, "num_input_tokens_seen": 91167120, "step": 4228, "time_per_iteration": 2.668104887008667 }, { "auxiliary_loss_clip": 0.01170093, "auxiliary_loss_mlp": 0.0100215, "balance_loss_clip": 1.00923491, "balance_loss_mlp": 1.00131571, "epoch": 0.5085071845127157, "flos": 71817535589760.0, "grad_norm": 0.8215744846731133, "language_loss": 0.58070374, "learning_rate": 2.042061367785943e-06, "loss": 0.60242617, "num_input_tokens_seen": 91220260, "step": 4229, "time_per_iteration": 3.2362186908721924 }, { "auxiliary_loss_clip": 0.01335754, "auxiliary_loss_mlp": 0.01028367, "balance_loss_clip": 1.04765582, "balance_loss_mlp": 1.0200398, "epoch": 0.5086274274033548, "flos": 35951608252800.0, "grad_norm": 3.0495571992711183, "language_loss": 0.7539506, "learning_rate": 2.041282565252594e-06, "loss": 0.77759176, "num_input_tokens_seen": 91240425, "step": 4230, "time_per_iteration": 2.823314666748047 }, { "auxiliary_loss_clip": 0.01340157, "auxiliary_loss_mlp": 0.01027805, "balance_loss_clip": 1.04914904, "balance_loss_mlp": 1.01994884, "epoch": 0.5087476702939938, "flos": 23513732881920.0, "grad_norm": 1.7369516380144414, "language_loss": 0.77350593, "learning_rate": 2.040503756456714e-06, "loss": 0.79718554, "num_input_tokens_seen": 91259635, "step": 4231, "time_per_iteration": 2.7437095642089844 }, { "auxiliary_loss_clip": 0.01234241, "auxiliary_loss_mlp": 0.01028323, "balance_loss_clip": 1.05049086, "balance_loss_mlp": 1.02046096, "epoch": 0.508867913184633, "flos": 15122091841920.0, "grad_norm": 2.0555860998315936, "language_loss": 0.78925979, "learning_rate": 2.0397249415164456e-06, "loss": 0.81188536, "num_input_tokens_seen": 91276990, "step": 4232, "time_per_iteration": 2.616964340209961 }, { "auxiliary_loss_clip": 0.01283965, "auxiliary_loss_mlp": 0.01034695, "balance_loss_clip": 1.04798937, "balance_loss_mlp": 1.02617717, "epoch": 0.508988156075272, "flos": 25885309374720.0, "grad_norm": 2.508623787729711, "language_loss": 0.80017138, "learning_rate": 2.0389461205499354e-06, "loss": 0.82335794, "num_input_tokens_seen": 91296125, "step": 4233, "time_per_iteration": 2.7122769355773926 }, { "auxiliary_loss_clip": 0.01334185, "auxiliary_loss_mlp": 0.01025387, "balance_loss_clip": 1.04744315, "balance_loss_mlp": 1.01763213, "epoch": 0.5091083989659111, "flos": 13844857057920.0, "grad_norm": 1.806077978939886, "language_loss": 0.73307949, "learning_rate": 2.03816729367533e-06, "loss": 0.75667524, "num_input_tokens_seen": 91314280, "step": 4234, "time_per_iteration": 2.6615328788757324 }, { "auxiliary_loss_clip": 0.01299333, "auxiliary_loss_mlp": 0.01033273, "balance_loss_clip": 1.05762625, "balance_loss_mlp": 1.02523279, "epoch": 0.5092286418565503, "flos": 21104881050240.0, "grad_norm": 2.116606876631003, "language_loss": 0.71598321, "learning_rate": 2.0373884610107765e-06, "loss": 0.73930925, "num_input_tokens_seen": 91334595, "step": 4235, "time_per_iteration": 2.7179880142211914 }, { "auxiliary_loss_clip": 0.01240771, "auxiliary_loss_mlp": 0.01026968, "balance_loss_clip": 1.05146158, "balance_loss_mlp": 1.01969075, "epoch": 0.5093488847471893, "flos": 18621298972800.0, "grad_norm": 2.3324043164801593, "language_loss": 0.69568568, "learning_rate": 2.0366096226744225e-06, "loss": 0.71836311, "num_input_tokens_seen": 91349790, "step": 4236, "time_per_iteration": 2.640141725540161 }, { "auxiliary_loss_clip": 0.0123305, "auxiliary_loss_mlp": 0.01032805, "balance_loss_clip": 1.05184615, "balance_loss_mlp": 1.02496409, "epoch": 0.5094691276378284, "flos": 23803783205760.0, "grad_norm": 1.6931631147167616, "language_loss": 0.77264929, "learning_rate": 2.035830778784418e-06, "loss": 0.79530787, "num_input_tokens_seen": 91370465, "step": 4237, "time_per_iteration": 2.663135290145874 }, { "auxiliary_loss_clip": 0.01194743, "auxiliary_loss_mlp": 0.01034952, "balance_loss_clip": 1.05619037, "balance_loss_mlp": 1.02655387, "epoch": 0.5095893705284675, "flos": 17420410546560.0, "grad_norm": 1.9718020760316641, "language_loss": 0.80211222, "learning_rate": 2.0350519294589134e-06, "loss": 0.82440913, "num_input_tokens_seen": 91388505, "step": 4238, "time_per_iteration": 2.6724162101745605 }, { "auxiliary_loss_clip": 0.0138154, "auxiliary_loss_mlp": 0.01026812, "balance_loss_clip": 1.04576206, "balance_loss_mlp": 1.01849115, "epoch": 0.5097096134191066, "flos": 25849362839040.0, "grad_norm": 1.7910671899559647, "language_loss": 0.83052605, "learning_rate": 2.0342730748160588e-06, "loss": 0.85460961, "num_input_tokens_seen": 91408970, "step": 4239, "time_per_iteration": 2.742356061935425 }, { "auxiliary_loss_clip": 0.01284879, "auxiliary_loss_mlp": 0.01023936, "balance_loss_clip": 1.04792595, "balance_loss_mlp": 1.01578164, "epoch": 0.5098298563097456, "flos": 27745122844800.0, "grad_norm": 2.0385225423691273, "language_loss": 0.70159161, "learning_rate": 2.033494214974006e-06, "loss": 0.72467983, "num_input_tokens_seen": 91430115, "step": 4240, "time_per_iteration": 2.803779363632202 }, { "auxiliary_loss_clip": 0.01277811, "auxiliary_loss_mlp": 0.0102678, "balance_loss_clip": 1.05006266, "balance_loss_mlp": 1.01934683, "epoch": 0.5099500992003848, "flos": 21358913011200.0, "grad_norm": 1.8080347863697357, "language_loss": 0.83853912, "learning_rate": 2.0327153500509067e-06, "loss": 0.86158502, "num_input_tokens_seen": 91449140, "step": 4241, "time_per_iteration": 2.672940969467163 }, { "auxiliary_loss_clip": 0.01289795, "auxiliary_loss_mlp": 0.01027703, "balance_loss_clip": 1.05437326, "balance_loss_mlp": 1.01948929, "epoch": 0.5100703420910239, "flos": 19865999013120.0, "grad_norm": 3.006609471102507, "language_loss": 0.85113436, "learning_rate": 2.031936480164916e-06, "loss": 0.8743093, "num_input_tokens_seen": 91466880, "step": 4242, "time_per_iteration": 2.657680034637451 }, { "auxiliary_loss_clip": 0.01282713, "auxiliary_loss_mlp": 0.01030451, "balance_loss_clip": 1.05206311, "balance_loss_mlp": 1.02287471, "epoch": 0.5101905849816629, "flos": 24648797635200.0, "grad_norm": 2.8059184832033415, "language_loss": 0.79845887, "learning_rate": 2.0311576054341857e-06, "loss": 0.82159048, "num_input_tokens_seen": 91487495, "step": 4243, "time_per_iteration": 2.688277006149292 }, { "auxiliary_loss_clip": 0.0119046, "auxiliary_loss_mlp": 0.01027799, "balance_loss_clip": 1.0563972, "balance_loss_mlp": 1.01999664, "epoch": 0.5103108278723021, "flos": 22930076787840.0, "grad_norm": 1.6153995750640837, "language_loss": 0.62437236, "learning_rate": 2.0303787259768715e-06, "loss": 0.64655495, "num_input_tokens_seen": 91508395, "step": 4244, "time_per_iteration": 2.6395647525787354 }, { "auxiliary_loss_clip": 0.01289369, "auxiliary_loss_mlp": 0.01030685, "balance_loss_clip": 1.05465412, "balance_loss_mlp": 1.02262032, "epoch": 0.5104310707629411, "flos": 21506613736320.0, "grad_norm": 2.516874305093746, "language_loss": 0.68901467, "learning_rate": 2.0295998419111294e-06, "loss": 0.71221519, "num_input_tokens_seen": 91525685, "step": 4245, "time_per_iteration": 3.658639907836914 }, { "auxiliary_loss_clip": 0.01439348, "auxiliary_loss_mlp": 0.01027257, "balance_loss_clip": 1.0440495, "balance_loss_mlp": 1.01965714, "epoch": 0.5105513136535802, "flos": 14903180403840.0, "grad_norm": 4.458959566805954, "language_loss": 0.74135137, "learning_rate": 2.028820953355115e-06, "loss": 0.76601738, "num_input_tokens_seen": 91543785, "step": 4246, "time_per_iteration": 2.7507364749908447 }, { "auxiliary_loss_clip": 0.01289544, "auxiliary_loss_mlp": 0.0102835, "balance_loss_clip": 1.04961109, "balance_loss_mlp": 1.01996386, "epoch": 0.5106715565442194, "flos": 22602212421120.0, "grad_norm": 2.0004514258679467, "language_loss": 0.78386688, "learning_rate": 2.0280420604269834e-06, "loss": 0.80704588, "num_input_tokens_seen": 91563325, "step": 4247, "time_per_iteration": 3.659754514694214 }, { "auxiliary_loss_clip": 0.0111878, "auxiliary_loss_mlp": 0.00999592, "balance_loss_clip": 1.00897229, "balance_loss_mlp": 0.99862051, "epoch": 0.5107917994348584, "flos": 71027645558400.0, "grad_norm": 0.7114740193597943, "language_loss": 0.58912587, "learning_rate": 2.027263163244895e-06, "loss": 0.6103096, "num_input_tokens_seen": 91632450, "step": 4248, "time_per_iteration": 3.3793118000030518 }, { "auxiliary_loss_clip": 0.01237731, "auxiliary_loss_mlp": 0.01026249, "balance_loss_clip": 1.05460024, "balance_loss_mlp": 1.01870358, "epoch": 0.5109120423254975, "flos": 24827416992000.0, "grad_norm": 1.5920930467843653, "language_loss": 0.74736702, "learning_rate": 2.026484261927005e-06, "loss": 0.77000684, "num_input_tokens_seen": 91651945, "step": 4249, "time_per_iteration": 2.647348165512085 }, { "auxiliary_loss_clip": 0.01245208, "auxiliary_loss_mlp": 0.01031251, "balance_loss_clip": 1.05622792, "balance_loss_mlp": 1.02309704, "epoch": 0.5110322852161366, "flos": 21247661612160.0, "grad_norm": 2.781167373177434, "language_loss": 0.73800516, "learning_rate": 2.025705356591475e-06, "loss": 0.76076984, "num_input_tokens_seen": 91669635, "step": 4250, "time_per_iteration": 2.6712985038757324 }, { "auxiliary_loss_clip": 0.01224596, "auxiliary_loss_mlp": 0.02507124, "balance_loss_clip": 1.00825834, "balance_loss_mlp": 1.00042903, "epoch": 0.5111525281067757, "flos": 66457114358400.0, "grad_norm": 0.7725360168070031, "language_loss": 0.57937801, "learning_rate": 2.024926447356462e-06, "loss": 0.61669523, "num_input_tokens_seen": 91731920, "step": 4251, "time_per_iteration": 4.049286365509033 }, { "auxiliary_loss_clip": 0.01238061, "auxiliary_loss_mlp": 0.01032132, "balance_loss_clip": 1.05401576, "balance_loss_mlp": 1.02344179, "epoch": 0.5112727709974147, "flos": 14866731077760.0, "grad_norm": 14.610791524427741, "language_loss": 0.78675854, "learning_rate": 2.024147534340127e-06, "loss": 0.80946052, "num_input_tokens_seen": 91749780, "step": 4252, "time_per_iteration": 2.662172555923462 }, { "auxiliary_loss_clip": 0.01186746, "auxiliary_loss_mlp": 0.01032284, "balance_loss_clip": 1.0478071, "balance_loss_mlp": 1.02471161, "epoch": 0.5113930138880539, "flos": 21177600134400.0, "grad_norm": 1.6818946675170077, "language_loss": 0.80127656, "learning_rate": 2.02336861766063e-06, "loss": 0.8234669, "num_input_tokens_seen": 91768840, "step": 4253, "time_per_iteration": 3.579575538635254 }, { "auxiliary_loss_clip": 0.01246008, "auxiliary_loss_mlp": 0.01033563, "balance_loss_clip": 1.05392623, "balance_loss_mlp": 1.02576065, "epoch": 0.511513256778693, "flos": 20409111630720.0, "grad_norm": 2.6929512944112926, "language_loss": 0.79048747, "learning_rate": 2.0225896974361327e-06, "loss": 0.81328321, "num_input_tokens_seen": 91788945, "step": 4254, "time_per_iteration": 2.6093294620513916 }, { "auxiliary_loss_clip": 0.0122833, "auxiliary_loss_mlp": 0.00999602, "balance_loss_clip": 1.01029587, "balance_loss_mlp": 0.99855345, "epoch": 0.511633499669332, "flos": 69879975131520.0, "grad_norm": 0.8531134335463075, "language_loss": 0.59903169, "learning_rate": 2.0218107737847962e-06, "loss": 0.62131095, "num_input_tokens_seen": 91850990, "step": 4255, "time_per_iteration": 3.343179702758789 }, { "auxiliary_loss_clip": 0.01187575, "auxiliary_loss_mlp": 0.01029709, "balance_loss_clip": 1.05419636, "balance_loss_mlp": 1.02182281, "epoch": 0.5117537425599712, "flos": 24097855852800.0, "grad_norm": 1.8594719890262934, "language_loss": 0.74970937, "learning_rate": 2.0210318468247826e-06, "loss": 0.77188218, "num_input_tokens_seen": 91869960, "step": 4256, "time_per_iteration": 2.6054012775421143 }, { "auxiliary_loss_clip": 0.01282454, "auxiliary_loss_mlp": 0.01029309, "balance_loss_clip": 1.04832077, "balance_loss_mlp": 1.02204609, "epoch": 0.5118739854506102, "flos": 20959550622720.0, "grad_norm": 1.9452928956673092, "language_loss": 0.81803334, "learning_rate": 2.020252916674255e-06, "loss": 0.841151, "num_input_tokens_seen": 91889075, "step": 4257, "time_per_iteration": 2.6939377784729004 }, { "auxiliary_loss_clip": 0.01238826, "auxiliary_loss_mlp": 0.01026808, "balance_loss_clip": 1.05132389, "balance_loss_mlp": 1.01858854, "epoch": 0.5119942283412493, "flos": 17457326749440.0, "grad_norm": 1.7521781587305745, "language_loss": 0.80921811, "learning_rate": 2.019473983451375e-06, "loss": 0.83187449, "num_input_tokens_seen": 91907495, "step": 4258, "time_per_iteration": 2.6142094135284424 }, { "auxiliary_loss_clip": 0.01242092, "auxiliary_loss_mlp": 0.01033789, "balance_loss_clip": 1.04723024, "balance_loss_mlp": 1.02551627, "epoch": 0.5121144712318885, "flos": 21066743784960.0, "grad_norm": 1.823092720779081, "language_loss": 0.71387392, "learning_rate": 2.0186950472743076e-06, "loss": 0.73663276, "num_input_tokens_seen": 91927400, "step": 4259, "time_per_iteration": 2.71830415725708 }, { "auxiliary_loss_clip": 0.01190253, "auxiliary_loss_mlp": 0.01026817, "balance_loss_clip": 1.05512059, "balance_loss_mlp": 1.01951218, "epoch": 0.5122347141225275, "flos": 19860791541120.0, "grad_norm": 1.5989056658616305, "language_loss": 0.73877674, "learning_rate": 2.0179161082612162e-06, "loss": 0.76094741, "num_input_tokens_seen": 91946790, "step": 4260, "time_per_iteration": 2.6248505115509033 }, { "auxiliary_loss_clip": 0.01281789, "auxiliary_loss_mlp": 0.01025409, "balance_loss_clip": 1.04923868, "balance_loss_mlp": 1.01797032, "epoch": 0.5123549570131666, "flos": 22528487756160.0, "grad_norm": 2.064618032287243, "language_loss": 0.72970164, "learning_rate": 2.017137166530266e-06, "loss": 0.75277364, "num_input_tokens_seen": 91966325, "step": 4261, "time_per_iteration": 2.6486153602600098 }, { "auxiliary_loss_clip": 0.01293864, "auxiliary_loss_mlp": 0.01029053, "balance_loss_clip": 1.05302405, "balance_loss_mlp": 1.02142906, "epoch": 0.5124751999038056, "flos": 20333375804160.0, "grad_norm": 1.8940156636822334, "language_loss": 0.80092394, "learning_rate": 2.0163582221996213e-06, "loss": 0.82415318, "num_input_tokens_seen": 91984700, "step": 4262, "time_per_iteration": 2.7023940086364746 }, { "auxiliary_loss_clip": 0.01288048, "auxiliary_loss_mlp": 0.01036277, "balance_loss_clip": 1.05241191, "balance_loss_mlp": 1.02819419, "epoch": 0.5125954427944448, "flos": 39785970211200.0, "grad_norm": 2.341472384373308, "language_loss": 0.68159473, "learning_rate": 2.015579275387446e-06, "loss": 0.70483798, "num_input_tokens_seen": 92010020, "step": 4263, "time_per_iteration": 2.7968685626983643 }, { "auxiliary_loss_clip": 0.01283087, "auxiliary_loss_mlp": 0.01032956, "balance_loss_clip": 1.05216002, "balance_loss_mlp": 1.02490306, "epoch": 0.5127156856850839, "flos": 29205394358400.0, "grad_norm": 2.4082693937591304, "language_loss": 0.68681848, "learning_rate": 2.0148003262119085e-06, "loss": 0.70997894, "num_input_tokens_seen": 92030990, "step": 4264, "time_per_iteration": 2.723036766052246 }, { "auxiliary_loss_clip": 0.01333685, "auxiliary_loss_mlp": 0.01030335, "balance_loss_clip": 1.0490737, "balance_loss_mlp": 1.022259, "epoch": 0.5128359285757229, "flos": 13553693412480.0, "grad_norm": 1.8038179350853971, "language_loss": 0.7661432, "learning_rate": 2.0140213747911728e-06, "loss": 0.78978336, "num_input_tokens_seen": 92049525, "step": 4265, "time_per_iteration": 2.647580862045288 }, { "auxiliary_loss_clip": 0.0133555, "auxiliary_loss_mlp": 0.0102716, "balance_loss_clip": 1.05336642, "balance_loss_mlp": 1.01861238, "epoch": 0.5129561714663621, "flos": 25192089820800.0, "grad_norm": 2.8848474023546635, "language_loss": 0.80472827, "learning_rate": 2.013242421243406e-06, "loss": 0.82835537, "num_input_tokens_seen": 92068430, "step": 4266, "time_per_iteration": 2.765040397644043 }, { "auxiliary_loss_clip": 0.01386744, "auxiliary_loss_mlp": 0.01028761, "balance_loss_clip": 1.05184996, "balance_loss_mlp": 1.02158141, "epoch": 0.5130764143570011, "flos": 18150223080960.0, "grad_norm": 1.7232823809930522, "language_loss": 0.79030257, "learning_rate": 2.012463465686774e-06, "loss": 0.8144576, "num_input_tokens_seen": 92088180, "step": 4267, "time_per_iteration": 2.6798245906829834 }, { "auxiliary_loss_clip": 0.01209287, "auxiliary_loss_mlp": 0.01001669, "balance_loss_clip": 1.03021502, "balance_loss_mlp": 1.00049484, "epoch": 0.5131966572476402, "flos": 59794896418560.0, "grad_norm": 1.168157537934399, "language_loss": 0.54784179, "learning_rate": 2.0116845082394446e-06, "loss": 0.56995142, "num_input_tokens_seen": 92153015, "step": 4268, "time_per_iteration": 3.3915586471557617 }, { "auxiliary_loss_clip": 0.01241829, "auxiliary_loss_mlp": 0.01027058, "balance_loss_clip": 1.05357826, "balance_loss_mlp": 1.01972628, "epoch": 0.5133169001382794, "flos": 18515219132160.0, "grad_norm": 2.381449979427126, "language_loss": 0.78605795, "learning_rate": 2.0109055490195836e-06, "loss": 0.80874681, "num_input_tokens_seen": 92171470, "step": 4269, "time_per_iteration": 2.7886970043182373 }, { "auxiliary_loss_clip": 0.01438463, "auxiliary_loss_mlp": 0.01027745, "balance_loss_clip": 1.03992963, "balance_loss_mlp": 1.01964426, "epoch": 0.5134371430289184, "flos": 15523537219200.0, "grad_norm": 2.112732016406456, "language_loss": 0.64699078, "learning_rate": 2.0101265881453605e-06, "loss": 0.67165285, "num_input_tokens_seen": 92189945, "step": 4270, "time_per_iteration": 3.7338459491729736 }, { "auxiliary_loss_clip": 0.01284309, "auxiliary_loss_mlp": 0.01037885, "balance_loss_clip": 1.05405831, "balance_loss_mlp": 1.03025556, "epoch": 0.5135573859195575, "flos": 21433786911360.0, "grad_norm": 2.2956386984582706, "language_loss": 0.78291684, "learning_rate": 2.009347625734941e-06, "loss": 0.80613875, "num_input_tokens_seen": 92209855, "step": 4271, "time_per_iteration": 2.724912166595459 }, { "auxiliary_loss_clip": 0.01193157, "auxiliary_loss_mlp": 0.01031826, "balance_loss_clip": 1.05759168, "balance_loss_mlp": 1.02396464, "epoch": 0.5136776288101966, "flos": 17712651600000.0, "grad_norm": 2.4464263475880634, "language_loss": 0.74854326, "learning_rate": 2.0085686619064954e-06, "loss": 0.77079308, "num_input_tokens_seen": 92226295, "step": 4272, "time_per_iteration": 3.581024408340454 }, { "auxiliary_loss_clip": 0.01242672, "auxiliary_loss_mlp": 0.01038599, "balance_loss_clip": 1.05399919, "balance_loss_mlp": 1.03059399, "epoch": 0.5137978717008357, "flos": 16581680997120.0, "grad_norm": 2.1526519812786646, "language_loss": 0.82929325, "learning_rate": 2.00778969677819e-06, "loss": 0.85210592, "num_input_tokens_seen": 92243330, "step": 4273, "time_per_iteration": 2.6501219272613525 }, { "auxiliary_loss_clip": 0.01284642, "auxiliary_loss_mlp": 0.01027495, "balance_loss_clip": 1.04964423, "balance_loss_mlp": 1.0197432, "epoch": 0.5139181145914747, "flos": 20668243322880.0, "grad_norm": 1.754762230018292, "language_loss": 0.6419667, "learning_rate": 2.0070107304681934e-06, "loss": 0.66508806, "num_input_tokens_seen": 92262285, "step": 4274, "time_per_iteration": 2.719332218170166 }, { "auxiliary_loss_clip": 0.01332546, "auxiliary_loss_mlp": 0.01025178, "balance_loss_clip": 1.05067003, "balance_loss_mlp": 1.01744092, "epoch": 0.5140383574821139, "flos": 32926996546560.0, "grad_norm": 1.8494326660740519, "language_loss": 0.78349459, "learning_rate": 2.006231763094675e-06, "loss": 0.80707186, "num_input_tokens_seen": 92283305, "step": 4275, "time_per_iteration": 2.8273448944091797 }, { "auxiliary_loss_clip": 0.01280575, "auxiliary_loss_mlp": 0.0102841, "balance_loss_clip": 1.05287683, "balance_loss_mlp": 1.02082217, "epoch": 0.514158600372753, "flos": 19537093152000.0, "grad_norm": 1.9713472316979852, "language_loss": 0.8778044, "learning_rate": 2.0054527947758027e-06, "loss": 0.90089428, "num_input_tokens_seen": 92302105, "step": 4276, "time_per_iteration": 2.7359025478363037 }, { "auxiliary_loss_clip": 0.01118641, "auxiliary_loss_mlp": 0.00998716, "balance_loss_clip": 1.00860357, "balance_loss_mlp": 0.99779224, "epoch": 0.514278843263392, "flos": 62523855279360.0, "grad_norm": 0.7216920381716123, "language_loss": 0.55852109, "learning_rate": 2.004673825629746e-06, "loss": 0.57969469, "num_input_tokens_seen": 92362885, "step": 4277, "time_per_iteration": 4.058689117431641 }, { "auxiliary_loss_clip": 0.01285013, "auxiliary_loss_mlp": 0.0102624, "balance_loss_clip": 1.04882884, "balance_loss_mlp": 1.0184319, "epoch": 0.5143990861540312, "flos": 25882328545920.0, "grad_norm": 1.6528576129152224, "language_loss": 0.72561109, "learning_rate": 2.0038948557746744e-06, "loss": 0.74872357, "num_input_tokens_seen": 92384740, "step": 4278, "time_per_iteration": 2.7297794818878174 }, { "auxiliary_loss_clip": 0.0123543, "auxiliary_loss_mlp": 0.01028896, "balance_loss_clip": 1.0552907, "balance_loss_mlp": 1.02152634, "epoch": 0.5145193290446702, "flos": 23330660238720.0, "grad_norm": 1.835274850110765, "language_loss": 0.75228453, "learning_rate": 2.0031158853287558e-06, "loss": 0.77492779, "num_input_tokens_seen": 92405175, "step": 4279, "time_per_iteration": 3.5825581550598145 }, { "auxiliary_loss_clip": 0.01283678, "auxiliary_loss_mlp": 0.01030266, "balance_loss_clip": 1.05291915, "balance_loss_mlp": 1.0222013, "epoch": 0.5146395719353093, "flos": 22856603518080.0, "grad_norm": 2.3332309026224904, "language_loss": 0.70217454, "learning_rate": 2.0023369144101593e-06, "loss": 0.72531396, "num_input_tokens_seen": 92423345, "step": 4280, "time_per_iteration": 2.710209608078003 }, { "auxiliary_loss_clip": 0.01278442, "auxiliary_loss_mlp": 0.01027735, "balance_loss_clip": 1.04855108, "balance_loss_mlp": 1.02038598, "epoch": 0.5147598148259485, "flos": 26391577616640.0, "grad_norm": 2.5871030695860484, "language_loss": 0.76766217, "learning_rate": 2.0015579431370555e-06, "loss": 0.79072392, "num_input_tokens_seen": 92445025, "step": 4281, "time_per_iteration": 2.698371171951294 }, { "auxiliary_loss_clip": 0.01236429, "auxiliary_loss_mlp": 0.01031451, "balance_loss_clip": 1.05368066, "balance_loss_mlp": 1.02429867, "epoch": 0.5148800577165875, "flos": 29965694561280.0, "grad_norm": 1.9258201872589449, "language_loss": 0.69634855, "learning_rate": 2.000778971627612e-06, "loss": 0.7190274, "num_input_tokens_seen": 92464490, "step": 4282, "time_per_iteration": 2.7236742973327637 }, { "auxiliary_loss_clip": 0.01282756, "auxiliary_loss_mlp": 0.01031456, "balance_loss_clip": 1.04888916, "balance_loss_mlp": 1.02401114, "epoch": 0.5150003006072266, "flos": 17931383470080.0, "grad_norm": 1.9187140600716615, "language_loss": 0.90327203, "learning_rate": 2e-06, "loss": 0.92641413, "num_input_tokens_seen": 92482085, "step": 4283, "time_per_iteration": 2.6352896690368652 }, { "auxiliary_loss_clip": 0.01189091, "auxiliary_loss_mlp": 0.01031602, "balance_loss_clip": 1.05735302, "balance_loss_mlp": 1.02417207, "epoch": 0.5151205434978657, "flos": 18478733892480.0, "grad_norm": 6.434070578374005, "language_loss": 0.85754848, "learning_rate": 1.9992210283723878e-06, "loss": 0.87975538, "num_input_tokens_seen": 92499325, "step": 4284, "time_per_iteration": 2.608607530593872 }, { "auxiliary_loss_clip": 0.01187267, "auxiliary_loss_mlp": 0.01031304, "balance_loss_clip": 1.05706251, "balance_loss_mlp": 1.02424085, "epoch": 0.5152407863885048, "flos": 25341263003520.0, "grad_norm": 1.6222002262950437, "language_loss": 0.79616487, "learning_rate": 1.9984420568629448e-06, "loss": 0.81835055, "num_input_tokens_seen": 92522090, "step": 4285, "time_per_iteration": 2.6762595176696777 }, { "auxiliary_loss_clip": 0.01238807, "auxiliary_loss_mlp": 0.0102745, "balance_loss_clip": 1.05366778, "balance_loss_mlp": 1.02024651, "epoch": 0.5153610292791438, "flos": 18329740277760.0, "grad_norm": 2.193212647535323, "language_loss": 0.78316987, "learning_rate": 1.9976630855898405e-06, "loss": 0.80583245, "num_input_tokens_seen": 92539845, "step": 4286, "time_per_iteration": 2.6561484336853027 }, { "auxiliary_loss_clip": 0.01282507, "auxiliary_loss_mlp": 0.01027409, "balance_loss_clip": 1.04712129, "balance_loss_mlp": 1.02014315, "epoch": 0.515481272169783, "flos": 30409945971840.0, "grad_norm": 2.2467997312766483, "language_loss": 0.75075918, "learning_rate": 1.9968841146712445e-06, "loss": 0.77385831, "num_input_tokens_seen": 92559460, "step": 4287, "time_per_iteration": 2.7067112922668457 }, { "auxiliary_loss_clip": 0.01431291, "auxiliary_loss_mlp": 0.02569499, "balance_loss_clip": 1.04663324, "balance_loss_mlp": 1.00067377, "epoch": 0.5156015150604221, "flos": 23037305863680.0, "grad_norm": 1.8705311528836928, "language_loss": 0.7142396, "learning_rate": 1.996105144225326e-06, "loss": 0.75424749, "num_input_tokens_seen": 92579695, "step": 4288, "time_per_iteration": 2.880500555038452 }, { "auxiliary_loss_clip": 0.01237584, "auxiliary_loss_mlp": 0.01031558, "balance_loss_clip": 1.05594945, "balance_loss_mlp": 1.02389336, "epoch": 0.5157217579510611, "flos": 17858556645120.0, "grad_norm": 2.1892422949518906, "language_loss": 0.79114175, "learning_rate": 1.995326174370254e-06, "loss": 0.81383324, "num_input_tokens_seen": 92598795, "step": 4289, "time_per_iteration": 2.61956524848938 }, { "auxiliary_loss_clip": 0.0123511, "auxiliary_loss_mlp": 0.0256255, "balance_loss_clip": 1.05257833, "balance_loss_mlp": 1.00063884, "epoch": 0.5158420008417003, "flos": 19171486569600.0, "grad_norm": 1.6906978312900784, "language_loss": 0.73179364, "learning_rate": 1.994547205224197e-06, "loss": 0.76977026, "num_input_tokens_seen": 92617700, "step": 4290, "time_per_iteration": 2.6912481784820557 }, { "auxiliary_loss_clip": 0.01280326, "auxiliary_loss_mlp": 0.01036663, "balance_loss_clip": 1.05169928, "balance_loss_mlp": 1.02942741, "epoch": 0.5159622437323393, "flos": 22419534827520.0, "grad_norm": 2.203955283101594, "language_loss": 0.67531121, "learning_rate": 1.993768236905325e-06, "loss": 0.69848108, "num_input_tokens_seen": 92638370, "step": 4291, "time_per_iteration": 2.6730446815490723 }, { "auxiliary_loss_clip": 0.01282904, "auxiliary_loss_mlp": 0.01028538, "balance_loss_clip": 1.04981017, "balance_loss_mlp": 1.02090859, "epoch": 0.5160824866229784, "flos": 24603010773120.0, "grad_norm": 2.5137559908477467, "language_loss": 0.65991074, "learning_rate": 1.992989269531807e-06, "loss": 0.68302518, "num_input_tokens_seen": 92657180, "step": 4292, "time_per_iteration": 2.733698844909668 }, { "auxiliary_loss_clip": 0.01284829, "auxiliary_loss_mlp": 0.01030852, "balance_loss_clip": 1.05051112, "balance_loss_mlp": 1.0226748, "epoch": 0.5162027295136175, "flos": 18002737837440.0, "grad_norm": 2.4138290103333766, "language_loss": 0.6777814, "learning_rate": 1.99221030322181e-06, "loss": 0.70093822, "num_input_tokens_seen": 92673985, "step": 4293, "time_per_iteration": 2.6164045333862305 }, { "auxiliary_loss_clip": 0.01289191, "auxiliary_loss_mlp": 0.0102258, "balance_loss_clip": 1.05067301, "balance_loss_mlp": 1.01583529, "epoch": 0.5163229724042566, "flos": 27344611221120.0, "grad_norm": 3.611248364540869, "language_loss": 0.81007981, "learning_rate": 1.991431338093505e-06, "loss": 0.83319759, "num_input_tokens_seen": 92696340, "step": 4294, "time_per_iteration": 2.7282557487487793 }, { "auxiliary_loss_clip": 0.01286524, "auxiliary_loss_mlp": 0.0103272, "balance_loss_clip": 1.05554175, "balance_loss_mlp": 1.02556181, "epoch": 0.5164432152948957, "flos": 21762764599680.0, "grad_norm": 1.7891481541501169, "language_loss": 0.79374963, "learning_rate": 1.9906523742650587e-06, "loss": 0.8169421, "num_input_tokens_seen": 92715200, "step": 4295, "time_per_iteration": 2.6433379650115967 }, { "auxiliary_loss_clip": 0.01186875, "auxiliary_loss_mlp": 0.01028921, "balance_loss_clip": 1.05154443, "balance_loss_mlp": 1.02018881, "epoch": 0.5165634581855347, "flos": 25550334115200.0, "grad_norm": 2.4376782443202942, "language_loss": 0.77664089, "learning_rate": 1.9898734118546397e-06, "loss": 0.7987988, "num_input_tokens_seen": 92735150, "step": 4296, "time_per_iteration": 2.7678468227386475 }, { "auxiliary_loss_clip": 0.01484065, "auxiliary_loss_mlp": 0.01027802, "balance_loss_clip": 1.04541314, "balance_loss_mlp": 1.02026224, "epoch": 0.5166837010761739, "flos": 19901191363200.0, "grad_norm": 1.5684395904048862, "language_loss": 0.80615342, "learning_rate": 1.989094450980416e-06, "loss": 0.83127213, "num_input_tokens_seen": 92755250, "step": 4297, "time_per_iteration": 4.031324148178101 }, { "auxiliary_loss_clip": 0.01231964, "auxiliary_loss_mlp": 0.0103011, "balance_loss_clip": 1.0522604, "balance_loss_mlp": 1.02254665, "epoch": 0.516803943966813, "flos": 26646076454400.0, "grad_norm": 2.1416244666748825, "language_loss": 0.76982868, "learning_rate": 1.9883154917605556e-06, "loss": 0.79244941, "num_input_tokens_seen": 92774460, "step": 4298, "time_per_iteration": 4.15547776222229 }, { "auxiliary_loss_clip": 0.01187958, "auxiliary_loss_mlp": 0.01029003, "balance_loss_clip": 1.05472851, "balance_loss_mlp": 1.02161813, "epoch": 0.516924186857452, "flos": 19682854542720.0, "grad_norm": 1.690022109053751, "language_loss": 0.83576149, "learning_rate": 1.9875365343132262e-06, "loss": 0.85793108, "num_input_tokens_seen": 92791580, "step": 4299, "time_per_iteration": 2.6159043312072754 }, { "auxiliary_loss_clip": 0.01239496, "auxiliary_loss_mlp": 0.02566568, "balance_loss_clip": 1.05692339, "balance_loss_mlp": 1.00054502, "epoch": 0.5170444297480912, "flos": 15956583586560.0, "grad_norm": 3.9756936073571087, "language_loss": 0.84977263, "learning_rate": 1.9867575787565946e-06, "loss": 0.88783324, "num_input_tokens_seen": 92806240, "step": 4300, "time_per_iteration": 2.7158455848693848 }, { "auxiliary_loss_clip": 0.01238748, "auxiliary_loss_mlp": 0.01033712, "balance_loss_clip": 1.05584979, "balance_loss_mlp": 1.02601659, "epoch": 0.5171646726387302, "flos": 14174157968640.0, "grad_norm": 1.8735579176970072, "language_loss": 0.85688365, "learning_rate": 1.9859786252088275e-06, "loss": 0.87960827, "num_input_tokens_seen": 92823420, "step": 4301, "time_per_iteration": 2.599133014678955 }, { "auxiliary_loss_clip": 0.01337262, "auxiliary_loss_mlp": 0.0102568, "balance_loss_clip": 1.0509305, "balance_loss_mlp": 1.01752555, "epoch": 0.5172849155293693, "flos": 23578550974080.0, "grad_norm": 2.4928819911120708, "language_loss": 0.66851121, "learning_rate": 1.9851996737880914e-06, "loss": 0.69214058, "num_input_tokens_seen": 92838605, "step": 4302, "time_per_iteration": 3.6310720443725586 }, { "auxiliary_loss_clip": 0.01239182, "auxiliary_loss_mlp": 0.01027593, "balance_loss_clip": 1.05212033, "balance_loss_mlp": 1.0196476, "epoch": 0.5174051584200084, "flos": 14283541860480.0, "grad_norm": 2.2591712061277085, "language_loss": 0.74668694, "learning_rate": 1.9844207246125537e-06, "loss": 0.7693547, "num_input_tokens_seen": 92855185, "step": 4303, "time_per_iteration": 2.5822744369506836 }, { "auxiliary_loss_clip": 0.01284541, "auxiliary_loss_mlp": 0.0102683, "balance_loss_clip": 1.05104566, "balance_loss_mlp": 1.0194627, "epoch": 0.5175254013106475, "flos": 37889384192640.0, "grad_norm": 1.7908900442780344, "language_loss": 0.68280149, "learning_rate": 1.983641777800379e-06, "loss": 0.70591521, "num_input_tokens_seen": 92877830, "step": 4304, "time_per_iteration": 2.820469617843628 }, { "auxiliary_loss_clip": 0.01185049, "auxiliary_loss_mlp": 0.01006471, "balance_loss_clip": 1.00879979, "balance_loss_mlp": 1.00553536, "epoch": 0.5176456442012866, "flos": 68549737829760.0, "grad_norm": 0.7442690253247953, "language_loss": 0.5876438, "learning_rate": 1.9828628334697343e-06, "loss": 0.609559, "num_input_tokens_seen": 92945040, "step": 4305, "time_per_iteration": 4.31735897064209 }, { "auxiliary_loss_clip": 0.01179371, "auxiliary_loss_mlp": 0.01005507, "balance_loss_clip": 1.00886405, "balance_loss_mlp": 1.00456512, "epoch": 0.5177658870919257, "flos": 64084137235200.0, "grad_norm": 0.7595450418083342, "language_loss": 0.54622197, "learning_rate": 1.982083891738784e-06, "loss": 0.56807077, "num_input_tokens_seen": 93005910, "step": 4306, "time_per_iteration": 3.2857279777526855 }, { "auxiliary_loss_clip": 0.01279529, "auxiliary_loss_mlp": 0.01029128, "balance_loss_clip": 1.05340171, "balance_loss_mlp": 1.02215099, "epoch": 0.5178861299825648, "flos": 26651248012800.0, "grad_norm": 1.497768734514173, "language_loss": 0.82938784, "learning_rate": 1.9813049527256923e-06, "loss": 0.85247445, "num_input_tokens_seen": 93026305, "step": 4307, "time_per_iteration": 2.675826072692871 }, { "auxiliary_loss_clip": 0.01337312, "auxiliary_loss_mlp": 0.01027603, "balance_loss_clip": 1.04719174, "balance_loss_mlp": 1.01968157, "epoch": 0.5180063728732038, "flos": 17931886260480.0, "grad_norm": 2.3384362165082995, "language_loss": 0.82050824, "learning_rate": 1.9805260165486252e-06, "loss": 0.8441574, "num_input_tokens_seen": 93045675, "step": 4308, "time_per_iteration": 2.7308170795440674 }, { "auxiliary_loss_clip": 0.012368, "auxiliary_loss_mlp": 0.01028328, "balance_loss_clip": 1.05358243, "balance_loss_mlp": 1.02047205, "epoch": 0.518126615763843, "flos": 19500895221120.0, "grad_norm": 3.4179440234518865, "language_loss": 0.86617184, "learning_rate": 1.9797470833257457e-06, "loss": 0.88882315, "num_input_tokens_seen": 93065375, "step": 4309, "time_per_iteration": 2.6335628032684326 }, { "auxiliary_loss_clip": 0.01241751, "auxiliary_loss_mlp": 0.01031321, "balance_loss_clip": 1.05748165, "balance_loss_mlp": 1.02380538, "epoch": 0.5182468586544821, "flos": 20704082117760.0, "grad_norm": 1.916818794600053, "language_loss": 0.77740729, "learning_rate": 1.9789681531752177e-06, "loss": 0.800138, "num_input_tokens_seen": 93085595, "step": 4310, "time_per_iteration": 2.6624913215637207 }, { "auxiliary_loss_clip": 0.01381287, "auxiliary_loss_mlp": 0.01028435, "balance_loss_clip": 1.05009615, "balance_loss_mlp": 1.02124715, "epoch": 0.5183671015451211, "flos": 23112107936640.0, "grad_norm": 1.8533470607369493, "language_loss": 0.72616667, "learning_rate": 1.978189226215204e-06, "loss": 0.75026387, "num_input_tokens_seen": 93106140, "step": 4311, "time_per_iteration": 2.752135992050171 }, { "auxiliary_loss_clip": 0.01188117, "auxiliary_loss_mlp": 0.01027192, "balance_loss_clip": 1.05510724, "balance_loss_mlp": 1.01894248, "epoch": 0.5184873444357603, "flos": 17597090568960.0, "grad_norm": 1.9373131513217623, "language_loss": 0.76788878, "learning_rate": 1.9774103025638675e-06, "loss": 0.7900418, "num_input_tokens_seen": 93124265, "step": 4312, "time_per_iteration": 2.583927631378174 }, { "auxiliary_loss_clip": 0.01390578, "auxiliary_loss_mlp": 0.01032314, "balance_loss_clip": 1.05479085, "balance_loss_mlp": 1.02463078, "epoch": 0.5186075873263993, "flos": 24936800883840.0, "grad_norm": 1.6329006416483713, "language_loss": 0.76581007, "learning_rate": 1.9766313823393696e-06, "loss": 0.79003894, "num_input_tokens_seen": 93145130, "step": 4313, "time_per_iteration": 2.769239664077759 }, { "auxiliary_loss_clip": 0.01374245, "auxiliary_loss_mlp": 0.01026938, "balance_loss_clip": 1.04327226, "balance_loss_mlp": 1.01964593, "epoch": 0.5187278302170384, "flos": 15190106244480.0, "grad_norm": 2.5115238212886224, "language_loss": 0.69066679, "learning_rate": 1.975852465659873e-06, "loss": 0.71467865, "num_input_tokens_seen": 93161110, "step": 4314, "time_per_iteration": 2.726191759109497 }, { "auxiliary_loss_clip": 0.01238677, "auxiliary_loss_mlp": 0.01032128, "balance_loss_clip": 1.05460048, "balance_loss_mlp": 1.02403963, "epoch": 0.5188480731076776, "flos": 25009412227200.0, "grad_norm": 2.92627360893441, "language_loss": 0.70018351, "learning_rate": 1.9750735526435377e-06, "loss": 0.72289157, "num_input_tokens_seen": 93178055, "step": 4315, "time_per_iteration": 2.6177966594696045 }, { "auxiliary_loss_clip": 0.01289634, "auxiliary_loss_mlp": 0.0102779, "balance_loss_clip": 1.05500746, "balance_loss_mlp": 1.01951051, "epoch": 0.5189683159983166, "flos": 24790141653120.0, "grad_norm": 2.58351472849173, "language_loss": 0.79185927, "learning_rate": 1.974294643408525e-06, "loss": 0.8150335, "num_input_tokens_seen": 93195850, "step": 4316, "time_per_iteration": 2.7322659492492676 }, { "auxiliary_loss_clip": 0.01241952, "auxiliary_loss_mlp": 0.01026869, "balance_loss_clip": 1.05245519, "balance_loss_mlp": 1.01909614, "epoch": 0.5190885588889557, "flos": 24754266944640.0, "grad_norm": 1.9478848300427416, "language_loss": 0.66817045, "learning_rate": 1.9735157380729947e-06, "loss": 0.69085866, "num_input_tokens_seen": 93216260, "step": 4317, "time_per_iteration": 2.6463534832000732 }, { "auxiliary_loss_clip": 0.01287366, "auxiliary_loss_mlp": 0.01025707, "balance_loss_clip": 1.05083704, "balance_loss_mlp": 1.01862931, "epoch": 0.5192088017795948, "flos": 24712646060160.0, "grad_norm": 2.7198960169598494, "language_loss": 0.84327376, "learning_rate": 1.9727368367551053e-06, "loss": 0.86640447, "num_input_tokens_seen": 93234810, "step": 4318, "time_per_iteration": 2.742995500564575 }, { "auxiliary_loss_clip": 0.01279769, "auxiliary_loss_mlp": 0.01028256, "balance_loss_clip": 1.05071568, "balance_loss_mlp": 1.02099574, "epoch": 0.5193290446702339, "flos": 27229588894080.0, "grad_norm": 1.937388422785079, "language_loss": 0.68397534, "learning_rate": 1.9719579395730164e-06, "loss": 0.70705557, "num_input_tokens_seen": 93254185, "step": 4319, "time_per_iteration": 2.7188711166381836 }, { "auxiliary_loss_clip": 0.01190514, "auxiliary_loss_mlp": 0.01032241, "balance_loss_clip": 1.05775952, "balance_loss_mlp": 1.0239892, "epoch": 0.5194492875608729, "flos": 11473352392320.0, "grad_norm": 2.9196186893576774, "language_loss": 0.93654859, "learning_rate": 1.9711790466448854e-06, "loss": 0.95877618, "num_input_tokens_seen": 93268205, "step": 4320, "time_per_iteration": 2.5437870025634766 }, { "auxiliary_loss_clip": 0.01385766, "auxiliary_loss_mlp": 0.01031599, "balance_loss_clip": 1.04968274, "balance_loss_mlp": 1.02247965, "epoch": 0.5195695304515121, "flos": 20338906498560.0, "grad_norm": 2.260103335477977, "language_loss": 0.71671426, "learning_rate": 1.9704001580888704e-06, "loss": 0.74088788, "num_input_tokens_seen": 93286945, "step": 4321, "time_per_iteration": 2.7915499210357666 }, { "auxiliary_loss_clip": 0.01280698, "auxiliary_loss_mlp": 0.02568368, "balance_loss_clip": 1.04977894, "balance_loss_mlp": 1.00063062, "epoch": 0.5196897733421512, "flos": 20048317470720.0, "grad_norm": 2.0477472260874032, "language_loss": 0.86790967, "learning_rate": 1.9696212740231283e-06, "loss": 0.90640032, "num_input_tokens_seen": 93305595, "step": 4322, "time_per_iteration": 3.5986363887786865 }, { "auxiliary_loss_clip": 0.0124189, "auxiliary_loss_mlp": 0.01030264, "balance_loss_clip": 1.05070102, "balance_loss_mlp": 1.0220449, "epoch": 0.5198100162327902, "flos": 23805507058560.0, "grad_norm": 2.1409868787869013, "language_loss": 0.82088947, "learning_rate": 1.9688423945658146e-06, "loss": 0.843611, "num_input_tokens_seen": 93326460, "step": 4323, "time_per_iteration": 2.6712229251861572 }, { "auxiliary_loss_clip": 0.01377446, "auxiliary_loss_mlp": 0.01029909, "balance_loss_clip": 1.04176986, "balance_loss_mlp": 1.02141571, "epoch": 0.5199302591234293, "flos": 24023951619840.0, "grad_norm": 2.4201044278583255, "language_loss": 0.72016865, "learning_rate": 1.9680635198350845e-06, "loss": 0.74424219, "num_input_tokens_seen": 93346170, "step": 4324, "time_per_iteration": 3.683738946914673 }, { "auxiliary_loss_clip": 0.01236639, "auxiliary_loss_mlp": 0.01034318, "balance_loss_clip": 1.05049133, "balance_loss_mlp": 1.02624726, "epoch": 0.5200505020140684, "flos": 26359366095360.0, "grad_norm": 5.9312439333795774, "language_loss": 0.72845531, "learning_rate": 1.967284649949093e-06, "loss": 0.75116491, "num_input_tokens_seen": 93365380, "step": 4325, "time_per_iteration": 2.621351718902588 }, { "auxiliary_loss_clip": 0.01338244, "auxiliary_loss_mlp": 0.01025585, "balance_loss_clip": 1.04957962, "balance_loss_mlp": 1.01788425, "epoch": 0.5201707449047075, "flos": 39604262284800.0, "grad_norm": 2.2014526690159575, "language_loss": 0.7213614, "learning_rate": 1.966505785025994e-06, "loss": 0.74499965, "num_input_tokens_seen": 93387285, "step": 4326, "time_per_iteration": 2.91732120513916 }, { "auxiliary_loss_clip": 0.01237802, "auxiliary_loss_mlp": 0.01027064, "balance_loss_clip": 1.05159366, "balance_loss_mlp": 1.01944625, "epoch": 0.5202909877953465, "flos": 53682788292480.0, "grad_norm": 2.5647337287136676, "language_loss": 0.76324397, "learning_rate": 1.965726925183941e-06, "loss": 0.78589261, "num_input_tokens_seen": 93410390, "step": 4327, "time_per_iteration": 3.0606443881988525 }, { "auxiliary_loss_clip": 0.01187729, "auxiliary_loss_mlp": 0.01025954, "balance_loss_clip": 1.05490661, "balance_loss_mlp": 1.01828909, "epoch": 0.5204112306859857, "flos": 19537021324800.0, "grad_norm": 1.95108666452016, "language_loss": 0.85008502, "learning_rate": 1.964948070541087e-06, "loss": 0.87222183, "num_input_tokens_seen": 93429050, "step": 4328, "time_per_iteration": 3.4612600803375244 }, { "auxiliary_loss_clip": 0.01230145, "auxiliary_loss_mlp": 0.01030903, "balance_loss_clip": 1.0490334, "balance_loss_mlp": 1.02330303, "epoch": 0.5205314735766248, "flos": 15304697608320.0, "grad_norm": 2.2101095567880615, "language_loss": 0.70133555, "learning_rate": 1.9641692212155816e-06, "loss": 0.72394603, "num_input_tokens_seen": 93446815, "step": 4329, "time_per_iteration": 2.6175167560577393 }, { "auxiliary_loss_clip": 0.01384678, "auxiliary_loss_mlp": 0.01032771, "balance_loss_clip": 1.05166292, "balance_loss_mlp": 1.02530205, "epoch": 0.5206517164672638, "flos": 59263701160320.0, "grad_norm": 2.188769119920752, "language_loss": 0.72715378, "learning_rate": 1.9633903773255777e-06, "loss": 0.75132829, "num_input_tokens_seen": 93469130, "step": 4330, "time_per_iteration": 3.9817237854003906 }, { "auxiliary_loss_clip": 0.01185065, "auxiliary_loss_mlp": 0.01029118, "balance_loss_clip": 1.05131555, "balance_loss_mlp": 1.02160788, "epoch": 0.520771959357903, "flos": 26871129118080.0, "grad_norm": 1.5605933611123073, "language_loss": 0.74599254, "learning_rate": 1.9626115389892237e-06, "loss": 0.76813436, "num_input_tokens_seen": 93489920, "step": 4331, "time_per_iteration": 2.639089345932007 }, { "auxiliary_loss_clip": 0.01243736, "auxiliary_loss_mlp": 0.01033271, "balance_loss_clip": 1.05101168, "balance_loss_mlp": 1.0251472, "epoch": 0.520892202248542, "flos": 26907075653760.0, "grad_norm": 6.277888899689654, "language_loss": 0.85904956, "learning_rate": 1.96183270632467e-06, "loss": 0.88181973, "num_input_tokens_seen": 93509770, "step": 4332, "time_per_iteration": 2.70280122756958 }, { "auxiliary_loss_clip": 0.01234204, "auxiliary_loss_mlp": 0.0257199, "balance_loss_clip": 1.04754663, "balance_loss_mlp": 1.00046062, "epoch": 0.5210124451391811, "flos": 25849434666240.0, "grad_norm": 1.8837176155762203, "language_loss": 0.79119194, "learning_rate": 1.9610538794500644e-06, "loss": 0.82925385, "num_input_tokens_seen": 93529320, "step": 4333, "time_per_iteration": 2.7595667839050293 }, { "auxiliary_loss_clip": 0.01234965, "auxiliary_loss_mlp": 0.01000573, "balance_loss_clip": 1.0105623, "balance_loss_mlp": 0.99962503, "epoch": 0.5211326880298203, "flos": 70553804319360.0, "grad_norm": 0.7672028944906849, "language_loss": 0.59413958, "learning_rate": 1.9602750584835542e-06, "loss": 0.61649501, "num_input_tokens_seen": 93595255, "step": 4334, "time_per_iteration": 3.379917621612549 }, { "auxiliary_loss_clip": 0.01285563, "auxiliary_loss_mlp": 0.01027955, "balance_loss_clip": 1.04904974, "balance_loss_mlp": 1.0208261, "epoch": 0.5212529309204593, "flos": 15628898787840.0, "grad_norm": 1.8762834780335629, "language_loss": 0.82510853, "learning_rate": 1.959496243543286e-06, "loss": 0.84824371, "num_input_tokens_seen": 93613135, "step": 4335, "time_per_iteration": 2.629377603530884 }, { "auxiliary_loss_clip": 0.01243694, "auxiliary_loss_mlp": 0.010284, "balance_loss_clip": 1.05906928, "balance_loss_mlp": 1.02053809, "epoch": 0.5213731738110984, "flos": 26242655829120.0, "grad_norm": 2.1934465193050703, "language_loss": 0.79182589, "learning_rate": 1.9587174347474057e-06, "loss": 0.81454682, "num_input_tokens_seen": 93629645, "step": 4336, "time_per_iteration": 2.6937756538391113 }, { "auxiliary_loss_clip": 0.01374732, "auxiliary_loss_mlp": 0.01031651, "balance_loss_clip": 1.04581821, "balance_loss_mlp": 1.02346683, "epoch": 0.5214934167017375, "flos": 19418407637760.0, "grad_norm": 2.1327781413731706, "language_loss": 0.82022023, "learning_rate": 1.9579386322140574e-06, "loss": 0.84428406, "num_input_tokens_seen": 93645325, "step": 4337, "time_per_iteration": 2.6899569034576416 }, { "auxiliary_loss_clip": 0.01192087, "auxiliary_loss_mlp": 0.02571493, "balance_loss_clip": 1.05725825, "balance_loss_mlp": 1.00039113, "epoch": 0.5216136595923766, "flos": 30955788023040.0, "grad_norm": 1.6876930415929043, "language_loss": 0.80760574, "learning_rate": 1.9571598360613854e-06, "loss": 0.84524155, "num_input_tokens_seen": 93668200, "step": 4338, "time_per_iteration": 2.7201995849609375 }, { "auxiliary_loss_clip": 0.01277848, "auxiliary_loss_mlp": 0.01026432, "balance_loss_clip": 1.0465858, "balance_loss_mlp": 1.01872778, "epoch": 0.5217339024830157, "flos": 21945047143680.0, "grad_norm": 4.031970000213029, "language_loss": 0.69579482, "learning_rate": 1.956381046407532e-06, "loss": 0.71883762, "num_input_tokens_seen": 93688495, "step": 4339, "time_per_iteration": 2.630032777786255 }, { "auxiliary_loss_clip": 0.01337434, "auxiliary_loss_mlp": 0.0103186, "balance_loss_clip": 1.0501523, "balance_loss_mlp": 1.02436161, "epoch": 0.5218541453736548, "flos": 20923209037440.0, "grad_norm": 1.7576520104422564, "language_loss": 0.86462438, "learning_rate": 1.9556022633706394e-06, "loss": 0.88831723, "num_input_tokens_seen": 93707285, "step": 4340, "time_per_iteration": 2.7641727924346924 }, { "auxiliary_loss_clip": 0.01281698, "auxiliary_loss_mlp": 0.01027574, "balance_loss_clip": 1.05162716, "balance_loss_mlp": 1.01975393, "epoch": 0.5219743882642939, "flos": 23951663498880.0, "grad_norm": 1.6896153027725294, "language_loss": 0.79706281, "learning_rate": 1.954823487068848e-06, "loss": 0.82015562, "num_input_tokens_seen": 93727495, "step": 4341, "time_per_iteration": 2.6593313217163086 }, { "auxiliary_loss_clip": 0.01234508, "auxiliary_loss_mlp": 0.0103069, "balance_loss_clip": 1.05590737, "balance_loss_mlp": 1.02363002, "epoch": 0.5220946311549329, "flos": 28799280213120.0, "grad_norm": 2.3372791669616655, "language_loss": 0.80849111, "learning_rate": 1.9540447176202976e-06, "loss": 0.83114302, "num_input_tokens_seen": 93748740, "step": 4342, "time_per_iteration": 2.724114179611206 }, { "auxiliary_loss_clip": 0.01119268, "auxiliary_loss_mlp": 0.01001363, "balance_loss_clip": 1.01079834, "balance_loss_mlp": 1.00046277, "epoch": 0.5222148740455721, "flos": 67189369017600.0, "grad_norm": 0.8748230963469118, "language_loss": 0.60737705, "learning_rate": 1.9532659551431272e-06, "loss": 0.62858331, "num_input_tokens_seen": 93815770, "step": 4343, "time_per_iteration": 3.372098684310913 }, { "auxiliary_loss_clip": 0.0123698, "auxiliary_loss_mlp": 0.0102807, "balance_loss_clip": 1.05072343, "balance_loss_mlp": 1.02082777, "epoch": 0.5223351169362112, "flos": 61856164339200.0, "grad_norm": 2.2231224961209737, "language_loss": 0.672966, "learning_rate": 1.9524871997554744e-06, "loss": 0.69561648, "num_input_tokens_seen": 93843530, "step": 4344, "time_per_iteration": 2.997997522354126 }, { "auxiliary_loss_clip": 0.01235729, "auxiliary_loss_mlp": 0.0103425, "balance_loss_clip": 1.053105, "balance_loss_mlp": 1.02654314, "epoch": 0.5224553598268502, "flos": 14647388676480.0, "grad_norm": 2.5540092213146552, "language_loss": 0.80514133, "learning_rate": 1.951708451575475e-06, "loss": 0.82784116, "num_input_tokens_seen": 93860595, "step": 4345, "time_per_iteration": 2.6836438179016113 }, { "auxiliary_loss_clip": 0.01346205, "auxiliary_loss_mlp": 0.01026581, "balance_loss_clip": 1.04903185, "balance_loss_mlp": 1.01912141, "epoch": 0.5225756027174894, "flos": 14826043946880.0, "grad_norm": 2.0565095741394144, "language_loss": 0.82200706, "learning_rate": 1.9509297107212657e-06, "loss": 0.84573495, "num_input_tokens_seen": 93877365, "step": 4346, "time_per_iteration": 2.6244020462036133 }, { "auxiliary_loss_clip": 0.01187757, "auxiliary_loss_mlp": 0.0103305, "balance_loss_clip": 1.05580723, "balance_loss_mlp": 1.02557564, "epoch": 0.5226958456081284, "flos": 23512009029120.0, "grad_norm": 1.678582240832953, "language_loss": 0.79004824, "learning_rate": 1.95015097731098e-06, "loss": 0.81225628, "num_input_tokens_seen": 93896855, "step": 4347, "time_per_iteration": 2.6659224033355713 }, { "auxiliary_loss_clip": 0.01187519, "auxiliary_loss_mlp": 0.01029213, "balance_loss_clip": 1.05551696, "balance_loss_mlp": 1.02135444, "epoch": 0.5228160884987675, "flos": 19062928690560.0, "grad_norm": 2.722424368247757, "language_loss": 0.82144129, "learning_rate": 1.949372251462751e-06, "loss": 0.84360862, "num_input_tokens_seen": 93914270, "step": 4348, "time_per_iteration": 3.5072882175445557 }, { "auxiliary_loss_clip": 0.01336087, "auxiliary_loss_mlp": 0.02565476, "balance_loss_clip": 1.05181313, "balance_loss_mlp": 1.00046873, "epoch": 0.5229363313894067, "flos": 21063224252160.0, "grad_norm": 2.065775578812693, "language_loss": 0.82822466, "learning_rate": 1.9485935332947124e-06, "loss": 0.86724031, "num_input_tokens_seen": 93932180, "step": 4349, "time_per_iteration": 2.7020936012268066 }, { "auxiliary_loss_clip": 0.01279234, "auxiliary_loss_mlp": 0.01028943, "balance_loss_clip": 1.05061436, "balance_loss_mlp": 1.02184057, "epoch": 0.5230565742800457, "flos": 14830389492480.0, "grad_norm": 2.5782680764155597, "language_loss": 0.83511668, "learning_rate": 1.947814822924993e-06, "loss": 0.85819846, "num_input_tokens_seen": 93949690, "step": 4350, "time_per_iteration": 3.510423183441162 }, { "auxiliary_loss_clip": 0.01188548, "auxiliary_loss_mlp": 0.01022607, "balance_loss_clip": 1.05723906, "balance_loss_mlp": 1.01555848, "epoch": 0.5231768171706848, "flos": 25813021253760.0, "grad_norm": 1.9080543316385168, "language_loss": 0.82965744, "learning_rate": 1.9470361204717236e-06, "loss": 0.85176897, "num_input_tokens_seen": 93968830, "step": 4351, "time_per_iteration": 2.584832191467285 }, { "auxiliary_loss_clip": 0.01336179, "auxiliary_loss_mlp": 0.02569373, "balance_loss_clip": 1.04802775, "balance_loss_mlp": 1.00040722, "epoch": 0.5232970600613239, "flos": 22743807834240.0, "grad_norm": 1.6683820784087944, "language_loss": 0.8090679, "learning_rate": 1.9462574260530326e-06, "loss": 0.84812343, "num_input_tokens_seen": 93989110, "step": 4352, "time_per_iteration": 2.774475574493408 }, { "auxiliary_loss_clip": 0.01229245, "auxiliary_loss_mlp": 0.01029366, "balance_loss_clip": 1.04942822, "balance_loss_mlp": 1.0213728, "epoch": 0.523417302951963, "flos": 17310703432320.0, "grad_norm": 1.713449073014465, "language_loss": 0.81057239, "learning_rate": 1.9454787397870472e-06, "loss": 0.83315849, "num_input_tokens_seen": 94006430, "step": 4353, "time_per_iteration": 2.5859947204589844 }, { "auxiliary_loss_clip": 0.01333458, "auxiliary_loss_mlp": 0.01028411, "balance_loss_clip": 1.05029106, "balance_loss_mlp": 1.02129698, "epoch": 0.523537545842602, "flos": 18551740285440.0, "grad_norm": 2.2572265607389963, "language_loss": 0.71807611, "learning_rate": 1.944700061791894e-06, "loss": 0.74169487, "num_input_tokens_seen": 94024825, "step": 4354, "time_per_iteration": 3.773970365524292 }, { "auxiliary_loss_clip": 0.01236019, "auxiliary_loss_mlp": 0.01028984, "balance_loss_clip": 1.054268, "balance_loss_mlp": 1.0216819, "epoch": 0.5236577887332411, "flos": 19719267955200.0, "grad_norm": 2.388066339957428, "language_loss": 0.65084988, "learning_rate": 1.943921392185698e-06, "loss": 0.67349994, "num_input_tokens_seen": 94043450, "step": 4355, "time_per_iteration": 2.8274667263031006 }, { "auxiliary_loss_clip": 0.01190584, "auxiliary_loss_mlp": 0.01027486, "balance_loss_clip": 1.04949367, "balance_loss_mlp": 1.02045834, "epoch": 0.5237780316238803, "flos": 23550218121600.0, "grad_norm": 2.0070271081645994, "language_loss": 0.7719239, "learning_rate": 1.9431427310865814e-06, "loss": 0.79410452, "num_input_tokens_seen": 94063055, "step": 4356, "time_per_iteration": 4.049981355667114 }, { "auxiliary_loss_clip": 0.01329103, "auxiliary_loss_mlp": 0.01029566, "balance_loss_clip": 1.05029154, "balance_loss_mlp": 1.02253246, "epoch": 0.5238982745145193, "flos": 22491894775680.0, "grad_norm": 2.560035319514936, "language_loss": 0.78547043, "learning_rate": 1.942364078612667e-06, "loss": 0.80905712, "num_input_tokens_seen": 94081785, "step": 4357, "time_per_iteration": 2.781801462173462 }, { "auxiliary_loss_clip": 0.01241783, "auxiliary_loss_mlp": 0.01031577, "balance_loss_clip": 1.04830813, "balance_loss_mlp": 1.0239594, "epoch": 0.5240185174051584, "flos": 27088927234560.0, "grad_norm": 1.8398002272572562, "language_loss": 0.75898385, "learning_rate": 1.9415854348820765e-06, "loss": 0.78171748, "num_input_tokens_seen": 94101635, "step": 4358, "time_per_iteration": 2.7578771114349365 }, { "auxiliary_loss_clip": 0.01139789, "auxiliary_loss_mlp": 0.01024916, "balance_loss_clip": 1.05058813, "balance_loss_mlp": 1.0172689, "epoch": 0.5241387602957975, "flos": 22674680110080.0, "grad_norm": 2.71802765816635, "language_loss": 0.6839357, "learning_rate": 1.940806800012929e-06, "loss": 0.70558274, "num_input_tokens_seen": 94121705, "step": 4359, "time_per_iteration": 2.5751709938049316 }, { "auxiliary_loss_clip": 0.0137596, "auxiliary_loss_mlp": 0.02570481, "balance_loss_clip": 1.04819143, "balance_loss_mlp": 1.00035024, "epoch": 0.5242590031864366, "flos": 40553453134080.0, "grad_norm": 10.445961543803138, "language_loss": 0.63726026, "learning_rate": 1.9400281741233432e-06, "loss": 0.67672467, "num_input_tokens_seen": 94146595, "step": 4360, "time_per_iteration": 2.9346792697906494 }, { "auxiliary_loss_clip": 0.01222633, "auxiliary_loss_mlp": 0.01002627, "balance_loss_clip": 1.00940406, "balance_loss_mlp": 1.00153661, "epoch": 0.5243792460770756, "flos": 66676313105280.0, "grad_norm": 0.6581290227637631, "language_loss": 0.52531052, "learning_rate": 1.939249557331435e-06, "loss": 0.5475632, "num_input_tokens_seen": 94212410, "step": 4361, "time_per_iteration": 3.300522804260254 }, { "auxiliary_loss_clip": 0.01342039, "auxiliary_loss_mlp": 0.01025389, "balance_loss_clip": 1.05023849, "balance_loss_mlp": 1.01771796, "epoch": 0.5244994889677148, "flos": 28183663992960.0, "grad_norm": 1.9515354800268836, "language_loss": 0.73284018, "learning_rate": 1.938470949755321e-06, "loss": 0.75651443, "num_input_tokens_seen": 94232290, "step": 4362, "time_per_iteration": 2.8130288124084473 }, { "auxiliary_loss_clip": 0.01230465, "auxiliary_loss_mlp": 0.01002695, "balance_loss_clip": 1.00902855, "balance_loss_mlp": 1.00166976, "epoch": 0.5246197318583539, "flos": 65950379239680.0, "grad_norm": 0.8085968802254015, "language_loss": 0.55663246, "learning_rate": 1.937692351513115e-06, "loss": 0.57896405, "num_input_tokens_seen": 94291285, "step": 4363, "time_per_iteration": 3.2948267459869385 }, { "auxiliary_loss_clip": 0.01238788, "auxiliary_loss_mlp": 0.01032416, "balance_loss_clip": 1.05140913, "balance_loss_mlp": 1.02436328, "epoch": 0.5247399747489929, "flos": 21033490769280.0, "grad_norm": 1.7289018902683848, "language_loss": 0.80647659, "learning_rate": 1.9369137627229297e-06, "loss": 0.82918859, "num_input_tokens_seen": 94309685, "step": 4364, "time_per_iteration": 2.652724504470825 }, { "auxiliary_loss_clip": 0.01232174, "auxiliary_loss_mlp": 0.01027017, "balance_loss_clip": 1.05250573, "balance_loss_mlp": 1.01932776, "epoch": 0.5248602176396321, "flos": 19025940660480.0, "grad_norm": 2.368646272956753, "language_loss": 0.88653195, "learning_rate": 1.936135183502877e-06, "loss": 0.90912384, "num_input_tokens_seen": 94326985, "step": 4365, "time_per_iteration": 2.833144187927246 }, { "auxiliary_loss_clip": 0.0123712, "auxiliary_loss_mlp": 0.01027038, "balance_loss_clip": 1.0492959, "balance_loss_mlp": 1.01935506, "epoch": 0.5249804605302711, "flos": 22200084685440.0, "grad_norm": 2.816656488995863, "language_loss": 0.80988425, "learning_rate": 1.935356613971066e-06, "loss": 0.83252585, "num_input_tokens_seen": 94347645, "step": 4366, "time_per_iteration": 2.721973180770874 }, { "auxiliary_loss_clip": 0.01282273, "auxiliary_loss_mlp": 0.02567534, "balance_loss_clip": 1.05166769, "balance_loss_mlp": 1.00040197, "epoch": 0.5251007034209102, "flos": 23805686626560.0, "grad_norm": 1.823372909035621, "language_loss": 0.76930439, "learning_rate": 1.9345780542456047e-06, "loss": 0.8078025, "num_input_tokens_seen": 94367020, "step": 4367, "time_per_iteration": 2.7309157848358154 }, { "auxiliary_loss_clip": 0.01230664, "auxiliary_loss_mlp": 0.01029062, "balance_loss_clip": 1.05181372, "balance_loss_mlp": 1.02142644, "epoch": 0.5252209463115494, "flos": 23294605962240.0, "grad_norm": 1.8886989854972738, "language_loss": 0.7149393, "learning_rate": 1.9337995044446007e-06, "loss": 0.73753655, "num_input_tokens_seen": 94385860, "step": 4368, "time_per_iteration": 2.6662302017211914 }, { "auxiliary_loss_clip": 0.01240054, "auxiliary_loss_mlp": 0.01024249, "balance_loss_clip": 1.05300331, "balance_loss_mlp": 1.01658988, "epoch": 0.5253411892021884, "flos": 19828687760640.0, "grad_norm": 2.6404054414212483, "language_loss": 0.79840845, "learning_rate": 1.9330209646861596e-06, "loss": 0.82105148, "num_input_tokens_seen": 94405010, "step": 4369, "time_per_iteration": 2.6257431507110596 }, { "auxiliary_loss_clip": 0.01279554, "auxiliary_loss_mlp": 0.01026532, "balance_loss_clip": 1.04967117, "balance_loss_mlp": 1.01942134, "epoch": 0.5254614320928275, "flos": 24133730561280.0, "grad_norm": 2.1095451305151585, "language_loss": 0.77695131, "learning_rate": 1.9322424350883843e-06, "loss": 0.80001211, "num_input_tokens_seen": 94426845, "step": 4370, "time_per_iteration": 2.705080509185791 }, { "auxiliary_loss_clip": 0.01283788, "auxiliary_loss_mlp": 0.01029936, "balance_loss_clip": 1.04891789, "balance_loss_mlp": 1.02288771, "epoch": 0.5255816749834666, "flos": 24644954880000.0, "grad_norm": 1.644014474061873, "language_loss": 0.78909898, "learning_rate": 1.931463915769379e-06, "loss": 0.81223619, "num_input_tokens_seen": 94446960, "step": 4371, "time_per_iteration": 2.736386775970459 }, { "auxiliary_loss_clip": 0.01380426, "auxiliary_loss_mlp": 0.01025952, "balance_loss_clip": 1.04610276, "balance_loss_mlp": 1.0186801, "epoch": 0.5257019178741057, "flos": 14136595320960.0, "grad_norm": 2.8298840898551103, "language_loss": 0.74177456, "learning_rate": 1.930685406847242e-06, "loss": 0.76583838, "num_input_tokens_seen": 94461535, "step": 4372, "time_per_iteration": 2.7104313373565674 }, { "auxiliary_loss_clip": 0.01279657, "auxiliary_loss_mlp": 0.01028835, "balance_loss_clip": 1.05026329, "balance_loss_mlp": 1.02117026, "epoch": 0.5258221607647448, "flos": 23548961145600.0, "grad_norm": 1.5513260804614328, "language_loss": 0.81631428, "learning_rate": 1.9299069084400734e-06, "loss": 0.83939922, "num_input_tokens_seen": 94482395, "step": 4373, "time_per_iteration": 2.730428695678711 }, { "auxiliary_loss_clip": 0.01328703, "auxiliary_loss_mlp": 0.01024081, "balance_loss_clip": 1.05078983, "balance_loss_mlp": 1.01653481, "epoch": 0.5259424036553839, "flos": 24966103403520.0, "grad_norm": 1.979560062245474, "language_loss": 0.69693476, "learning_rate": 1.9291284206659717e-06, "loss": 0.72046262, "num_input_tokens_seen": 94500580, "step": 4374, "time_per_iteration": 3.5764782428741455 }, { "auxiliary_loss_clip": 0.01186814, "auxiliary_loss_mlp": 0.01024917, "balance_loss_clip": 1.05377269, "balance_loss_mlp": 1.017627, "epoch": 0.526062646546023, "flos": 28763908295040.0, "grad_norm": 2.222806581810258, "language_loss": 0.7176162, "learning_rate": 1.928349943643032e-06, "loss": 0.73973346, "num_input_tokens_seen": 94519680, "step": 4375, "time_per_iteration": 2.6171255111694336 }, { "auxiliary_loss_clip": 0.01234359, "auxiliary_loss_mlp": 0.0102855, "balance_loss_clip": 1.05569887, "balance_loss_mlp": 1.02134109, "epoch": 0.526182889436662, "flos": 22821375254400.0, "grad_norm": 1.837858489579107, "language_loss": 0.82025409, "learning_rate": 1.9275714774893493e-06, "loss": 0.84288317, "num_input_tokens_seen": 94539135, "step": 4376, "time_per_iteration": 3.5779716968536377 }, { "auxiliary_loss_clip": 0.01325056, "auxiliary_loss_mlp": 0.01024399, "balance_loss_clip": 1.04433155, "balance_loss_mlp": 1.01606345, "epoch": 0.5263031323273012, "flos": 22929466256640.0, "grad_norm": 2.817806838562993, "language_loss": 0.72796285, "learning_rate": 1.9267930223230154e-06, "loss": 0.75145745, "num_input_tokens_seen": 94557610, "step": 4377, "time_per_iteration": 2.688720464706421 }, { "auxiliary_loss_clip": 0.01287244, "auxiliary_loss_mlp": 0.01029649, "balance_loss_clip": 1.05192029, "balance_loss_mlp": 1.02227581, "epoch": 0.5264233752179402, "flos": 17748634049280.0, "grad_norm": 2.2678925788639877, "language_loss": 0.78067261, "learning_rate": 1.9260145782621224e-06, "loss": 0.80384159, "num_input_tokens_seen": 94575390, "step": 4378, "time_per_iteration": 2.7068588733673096 }, { "auxiliary_loss_clip": 0.01279275, "auxiliary_loss_mlp": 0.01029823, "balance_loss_clip": 1.05218554, "balance_loss_mlp": 1.02253306, "epoch": 0.5265436181085793, "flos": 24421626069120.0, "grad_norm": 2.1262928666574044, "language_loss": 0.88012147, "learning_rate": 1.925236145424758e-06, "loss": 0.90321249, "num_input_tokens_seen": 94594210, "step": 4379, "time_per_iteration": 2.705787181854248 }, { "auxiliary_loss_clip": 0.01125212, "auxiliary_loss_mlp": 0.01002515, "balance_loss_clip": 1.01034737, "balance_loss_mlp": 1.00152004, "epoch": 0.5266638609992185, "flos": 69207298156800.0, "grad_norm": 0.6955536957429622, "language_loss": 0.57536066, "learning_rate": 1.924457723929012e-06, "loss": 0.59663796, "num_input_tokens_seen": 94665020, "step": 4380, "time_per_iteration": 4.267252206802368 }, { "auxiliary_loss_clip": 0.01232179, "auxiliary_loss_mlp": 0.01027393, "balance_loss_clip": 1.05097461, "balance_loss_mlp": 1.01972759, "epoch": 0.5267841038898575, "flos": 20738699850240.0, "grad_norm": 1.5197347547548996, "language_loss": 0.82527137, "learning_rate": 1.9236793138929685e-06, "loss": 0.84786713, "num_input_tokens_seen": 94684290, "step": 4381, "time_per_iteration": 2.6301937103271484 }, { "auxiliary_loss_clip": 0.01237902, "auxiliary_loss_mlp": 0.01024702, "balance_loss_clip": 1.05120325, "balance_loss_mlp": 1.01723409, "epoch": 0.5269043467804966, "flos": 17234392988160.0, "grad_norm": 2.6381634810182035, "language_loss": 0.81363046, "learning_rate": 1.9229009154347133e-06, "loss": 0.83625644, "num_input_tokens_seen": 94701880, "step": 4382, "time_per_iteration": 2.619692087173462 }, { "auxiliary_loss_clip": 0.01371745, "auxiliary_loss_mlp": 0.02564912, "balance_loss_clip": 1.04472291, "balance_loss_mlp": 1.000314, "epoch": 0.5270245896711357, "flos": 18223157646720.0, "grad_norm": 2.0412931460655424, "language_loss": 0.81044722, "learning_rate": 1.922122528672327e-06, "loss": 0.8498137, "num_input_tokens_seen": 94720545, "step": 4383, "time_per_iteration": 3.6411266326904297 }, { "auxiliary_loss_clip": 0.01182648, "auxiliary_loss_mlp": 0.01025453, "balance_loss_clip": 1.05365562, "balance_loss_mlp": 1.01817799, "epoch": 0.5271448325617748, "flos": 21287558643840.0, "grad_norm": 3.8609273611739106, "language_loss": 0.78727651, "learning_rate": 1.9213441537238914e-06, "loss": 0.80935752, "num_input_tokens_seen": 94737420, "step": 4384, "time_per_iteration": 2.6276931762695312 }, { "auxiliary_loss_clip": 0.01279281, "auxiliary_loss_mlp": 0.01003541, "balance_loss_clip": 1.01622629, "balance_loss_mlp": 1.00260568, "epoch": 0.5272650754524139, "flos": 65495497403520.0, "grad_norm": 0.9692769502701037, "language_loss": 0.57322073, "learning_rate": 1.920565790707485e-06, "loss": 0.59604895, "num_input_tokens_seen": 94802810, "step": 4385, "time_per_iteration": 3.407212495803833 }, { "auxiliary_loss_clip": 0.01289363, "auxiliary_loss_mlp": 0.01027933, "balance_loss_clip": 1.04477811, "balance_loss_mlp": 1.01984477, "epoch": 0.527385318343053, "flos": 19676426008320.0, "grad_norm": 2.0414887742862007, "language_loss": 0.66100335, "learning_rate": 1.9197874397411853e-06, "loss": 0.68417633, "num_input_tokens_seen": 94819440, "step": 4386, "time_per_iteration": 2.7652432918548584 }, { "auxiliary_loss_clip": 0.01335735, "auxiliary_loss_mlp": 0.01023068, "balance_loss_clip": 1.04485822, "balance_loss_mlp": 1.01527119, "epoch": 0.5275055612336921, "flos": 12712018947840.0, "grad_norm": 3.206329760329269, "language_loss": 0.67306674, "learning_rate": 1.919009100943067e-06, "loss": 0.69665468, "num_input_tokens_seen": 94835130, "step": 4387, "time_per_iteration": 2.7600257396698 }, { "auxiliary_loss_clip": 0.01390785, "auxiliary_loss_mlp": 0.01030066, "balance_loss_clip": 1.0498184, "balance_loss_mlp": 1.02200747, "epoch": 0.5276258041243311, "flos": 17749029098880.0, "grad_norm": 1.9415243942071785, "language_loss": 0.65773731, "learning_rate": 1.9182307744312043e-06, "loss": 0.68194586, "num_input_tokens_seen": 94852235, "step": 4388, "time_per_iteration": 2.829319477081299 }, { "auxiliary_loss_clip": 0.01288593, "auxiliary_loss_mlp": 0.01030596, "balance_loss_clip": 1.05049539, "balance_loss_mlp": 1.02293718, "epoch": 0.5277460470149702, "flos": 22710447077760.0, "grad_norm": 1.9537225621420309, "language_loss": 0.76615983, "learning_rate": 1.9174524603236676e-06, "loss": 0.7893517, "num_input_tokens_seen": 94871185, "step": 4389, "time_per_iteration": 2.698467969894409 }, { "auxiliary_loss_clip": 0.01282301, "auxiliary_loss_mlp": 0.01027874, "balance_loss_clip": 1.05097365, "balance_loss_mlp": 1.02035785, "epoch": 0.5278662899056094, "flos": 19902699734400.0, "grad_norm": 2.173295876279596, "language_loss": 0.76662105, "learning_rate": 1.916674158738527e-06, "loss": 0.7897228, "num_input_tokens_seen": 94890090, "step": 4390, "time_per_iteration": 2.6360151767730713 }, { "auxiliary_loss_clip": 0.01325762, "auxiliary_loss_mlp": 0.02572572, "balance_loss_clip": 1.04967356, "balance_loss_mlp": 1.00032401, "epoch": 0.5279865327962484, "flos": 18005215875840.0, "grad_norm": 2.2596280714456154, "language_loss": 0.60418236, "learning_rate": 1.9158958697938506e-06, "loss": 0.64316571, "num_input_tokens_seen": 94908470, "step": 4391, "time_per_iteration": 2.697342872619629 }, { "auxiliary_loss_clip": 0.01281044, "auxiliary_loss_mlp": 0.01030816, "balance_loss_clip": 1.04882574, "balance_loss_mlp": 1.02329421, "epoch": 0.5281067756868875, "flos": 15924443892480.0, "grad_norm": 2.417463263801839, "language_loss": 0.85552049, "learning_rate": 1.9151175936077032e-06, "loss": 0.8786391, "num_input_tokens_seen": 94923440, "step": 4392, "time_per_iteration": 2.6112420558929443 }, { "auxiliary_loss_clip": 0.01230658, "auxiliary_loss_mlp": 0.01024686, "balance_loss_clip": 1.05122983, "balance_loss_mlp": 1.01714325, "epoch": 0.5282270185775266, "flos": 19426488197760.0, "grad_norm": 1.669200631043679, "language_loss": 0.79697526, "learning_rate": 1.9143393302981507e-06, "loss": 0.8195287, "num_input_tokens_seen": 94941125, "step": 4393, "time_per_iteration": 2.6463088989257812 }, { "auxiliary_loss_clip": 0.01285358, "auxiliary_loss_mlp": 0.01030193, "balance_loss_clip": 1.05105245, "balance_loss_mlp": 1.02177095, "epoch": 0.5283472614681657, "flos": 16399613934720.0, "grad_norm": 1.9047231284573756, "language_loss": 0.83469629, "learning_rate": 1.913561079983252e-06, "loss": 0.85785186, "num_input_tokens_seen": 94959950, "step": 4394, "time_per_iteration": 2.647071599960327 }, { "auxiliary_loss_clip": 0.01191913, "auxiliary_loss_mlp": 0.01028861, "balance_loss_clip": 1.04992223, "balance_loss_mlp": 1.02099919, "epoch": 0.5284675043588047, "flos": 26760524163840.0, "grad_norm": 2.142992167236776, "language_loss": 0.74373066, "learning_rate": 1.9127828427810693e-06, "loss": 0.7659384, "num_input_tokens_seen": 94980515, "step": 4395, "time_per_iteration": 2.715075731277466 }, { "auxiliary_loss_clip": 0.01339537, "auxiliary_loss_mlp": 0.01023563, "balance_loss_clip": 1.04886317, "balance_loss_mlp": 1.01592731, "epoch": 0.5285877472494439, "flos": 19899898473600.0, "grad_norm": 2.0298545238946333, "language_loss": 0.80912155, "learning_rate": 1.9120046188096607e-06, "loss": 0.83275259, "num_input_tokens_seen": 94998560, "step": 4396, "time_per_iteration": 2.720177173614502 }, { "auxiliary_loss_clip": 0.01287982, "auxiliary_loss_mlp": 0.01031983, "balance_loss_clip": 1.05643344, "balance_loss_mlp": 1.02422881, "epoch": 0.528707990140083, "flos": 20011257613440.0, "grad_norm": 2.0737285155104286, "language_loss": 0.74596846, "learning_rate": 1.9112264081870804e-06, "loss": 0.76916814, "num_input_tokens_seen": 95016950, "step": 4397, "time_per_iteration": 2.686845064163208 }, { "auxiliary_loss_clip": 0.0133825, "auxiliary_loss_mlp": 0.01029815, "balance_loss_clip": 1.05553603, "balance_loss_mlp": 1.02197671, "epoch": 0.528828233030722, "flos": 20667956014080.0, "grad_norm": 2.2207256564153735, "language_loss": 0.75835323, "learning_rate": 1.9104482110313843e-06, "loss": 0.78203386, "num_input_tokens_seen": 95036540, "step": 4398, "time_per_iteration": 2.7054479122161865 }, { "auxiliary_loss_clip": 0.01232492, "auxiliary_loss_mlp": 0.0102959, "balance_loss_clip": 1.05141401, "balance_loss_mlp": 1.02197862, "epoch": 0.5289484759213612, "flos": 25192448956800.0, "grad_norm": 2.0857759582847404, "language_loss": 0.74337035, "learning_rate": 1.909670027460623e-06, "loss": 0.76599121, "num_input_tokens_seen": 95053840, "step": 4399, "time_per_iteration": 2.6722099781036377 }, { "auxiliary_loss_clip": 0.0123543, "auxiliary_loss_mlp": 0.01030242, "balance_loss_clip": 1.05461144, "balance_loss_mlp": 1.02191544, "epoch": 0.5290687188120002, "flos": 31139255715840.0, "grad_norm": 2.756565584566713, "language_loss": 0.71794248, "learning_rate": 1.908891857592847e-06, "loss": 0.74059916, "num_input_tokens_seen": 95074910, "step": 4400, "time_per_iteration": 3.7423958778381348 }, { "auxiliary_loss_clip": 0.01333694, "auxiliary_loss_mlp": 0.01024018, "balance_loss_clip": 1.05411553, "balance_loss_mlp": 1.01654959, "epoch": 0.5291889617026393, "flos": 20119851406080.0, "grad_norm": 2.2837454189663857, "language_loss": 0.90501851, "learning_rate": 1.9081137015461034e-06, "loss": 0.9285956, "num_input_tokens_seen": 95090985, "step": 4401, "time_per_iteration": 2.6742448806762695 }, { "auxiliary_loss_clip": 0.01376058, "auxiliary_loss_mlp": 0.01029815, "balance_loss_clip": 1.04969597, "balance_loss_mlp": 1.02179861, "epoch": 0.5293092045932785, "flos": 19643747610240.0, "grad_norm": 1.9460034158419945, "language_loss": 0.90556818, "learning_rate": 1.9073355594384383e-06, "loss": 0.92962694, "num_input_tokens_seen": 95109225, "step": 4402, "time_per_iteration": 3.6304914951324463 }, { "auxiliary_loss_clip": 0.01335616, "auxiliary_loss_mlp": 0.01028694, "balance_loss_clip": 1.05191314, "balance_loss_mlp": 1.02092791, "epoch": 0.5294294474839175, "flos": 24317736958080.0, "grad_norm": 1.8481395496933284, "language_loss": 0.80487388, "learning_rate": 1.906557431387895e-06, "loss": 0.82851696, "num_input_tokens_seen": 95128215, "step": 4403, "time_per_iteration": 2.7530767917633057 }, { "auxiliary_loss_clip": 0.01337377, "auxiliary_loss_mlp": 0.01032794, "balance_loss_clip": 1.05899882, "balance_loss_mlp": 1.02444029, "epoch": 0.5295496903745566, "flos": 18875941464960.0, "grad_norm": 1.8662470317671334, "language_loss": 0.78707546, "learning_rate": 1.905779317512516e-06, "loss": 0.81077719, "num_input_tokens_seen": 95145760, "step": 4404, "time_per_iteration": 2.727649450302124 }, { "auxiliary_loss_clip": 0.01232257, "auxiliary_loss_mlp": 0.01031049, "balance_loss_clip": 1.05264139, "balance_loss_mlp": 1.02370536, "epoch": 0.5296699332651957, "flos": 20923101296640.0, "grad_norm": 1.993246985336603, "language_loss": 0.80904454, "learning_rate": 1.9050012179303385e-06, "loss": 0.83167756, "num_input_tokens_seen": 95164270, "step": 4405, "time_per_iteration": 3.533539056777954 }, { "auxiliary_loss_clip": 0.01235257, "auxiliary_loss_mlp": 0.01022432, "balance_loss_clip": 1.04970407, "balance_loss_mlp": 1.01506507, "epoch": 0.5297901761558348, "flos": 22046745525120.0, "grad_norm": 2.3141382399903487, "language_loss": 0.68799257, "learning_rate": 1.904223132759401e-06, "loss": 0.7105695, "num_input_tokens_seen": 95182870, "step": 4406, "time_per_iteration": 2.7328834533691406 }, { "auxiliary_loss_clip": 0.0123597, "auxiliary_loss_mlp": 0.01028032, "balance_loss_clip": 1.05316055, "balance_loss_mlp": 1.02069497, "epoch": 0.5299104190464738, "flos": 21798495653760.0, "grad_norm": 2.4017844351390214, "language_loss": 0.69310069, "learning_rate": 1.9034450621177383e-06, "loss": 0.71574074, "num_input_tokens_seen": 95201190, "step": 4407, "time_per_iteration": 2.690855026245117 }, { "auxiliary_loss_clip": 0.0123552, "auxiliary_loss_mlp": 0.0102798, "balance_loss_clip": 1.05535436, "balance_loss_mlp": 1.0204246, "epoch": 0.530030661937113, "flos": 14720790119040.0, "grad_norm": 1.7973449600078506, "language_loss": 0.70299506, "learning_rate": 1.9026670061233824e-06, "loss": 0.72563004, "num_input_tokens_seen": 95218625, "step": 4408, "time_per_iteration": 3.6432478427886963 }, { "auxiliary_loss_clip": 0.01278348, "auxiliary_loss_mlp": 0.01031469, "balance_loss_clip": 1.05134296, "balance_loss_mlp": 1.02432823, "epoch": 0.5301509048277521, "flos": 21251504367360.0, "grad_norm": 1.8749033949663176, "language_loss": 0.80544186, "learning_rate": 1.901888964894365e-06, "loss": 0.82854003, "num_input_tokens_seen": 95237665, "step": 4409, "time_per_iteration": 2.6385996341705322 }, { "auxiliary_loss_clip": 0.01187763, "auxiliary_loss_mlp": 0.01025454, "balance_loss_clip": 1.05444288, "balance_loss_mlp": 1.01797998, "epoch": 0.5302711477183911, "flos": 25957058791680.0, "grad_norm": 2.328274124813815, "language_loss": 0.67797303, "learning_rate": 1.9011109385487134e-06, "loss": 0.70010525, "num_input_tokens_seen": 95258915, "step": 4410, "time_per_iteration": 2.7030937671661377 }, { "auxiliary_loss_clip": 0.01183491, "auxiliary_loss_mlp": 0.01026544, "balance_loss_clip": 1.05117822, "balance_loss_mlp": 1.01869369, "epoch": 0.5303913906090303, "flos": 22273126992000.0, "grad_norm": 3.572836870142444, "language_loss": 0.66284347, "learning_rate": 1.900332927204454e-06, "loss": 0.6849438, "num_input_tokens_seen": 95277365, "step": 4411, "time_per_iteration": 2.644507884979248 }, { "auxiliary_loss_clip": 0.01192884, "auxiliary_loss_mlp": 0.0102694, "balance_loss_clip": 1.05240774, "balance_loss_mlp": 1.01879764, "epoch": 0.5305116334996693, "flos": 24936010784640.0, "grad_norm": 1.962416289814965, "language_loss": 0.77031374, "learning_rate": 1.8995549309796097e-06, "loss": 0.792512, "num_input_tokens_seen": 95296670, "step": 4412, "time_per_iteration": 2.7139759063720703 }, { "auxiliary_loss_clip": 0.01142561, "auxiliary_loss_mlp": 0.01034981, "balance_loss_clip": 1.05387807, "balance_loss_mlp": 1.02759576, "epoch": 0.5306318763903084, "flos": 20189338266240.0, "grad_norm": 1.7148704070479583, "language_loss": 0.76322293, "learning_rate": 1.8987769499922028e-06, "loss": 0.78499836, "num_input_tokens_seen": 95315640, "step": 4413, "time_per_iteration": 2.615846633911133 }, { "auxiliary_loss_clip": 0.01232393, "auxiliary_loss_mlp": 0.02569418, "balance_loss_clip": 1.05216885, "balance_loss_mlp": 1.00033283, "epoch": 0.5307521192809476, "flos": 20266366982400.0, "grad_norm": 2.451933707398109, "language_loss": 0.71305704, "learning_rate": 1.897998984360252e-06, "loss": 0.75107515, "num_input_tokens_seen": 95334610, "step": 4414, "time_per_iteration": 2.644105911254883 }, { "auxiliary_loss_clip": 0.01282508, "auxiliary_loss_mlp": 0.01025881, "balance_loss_clip": 1.05069721, "balance_loss_mlp": 1.01862097, "epoch": 0.5308723621715866, "flos": 28844276976000.0, "grad_norm": 1.4405899041750962, "language_loss": 0.78390163, "learning_rate": 1.897221034201775e-06, "loss": 0.8069855, "num_input_tokens_seen": 95358350, "step": 4415, "time_per_iteration": 2.7365224361419678 }, { "auxiliary_loss_clip": 0.01234402, "auxiliary_loss_mlp": 0.01028851, "balance_loss_clip": 1.04821467, "balance_loss_mlp": 1.02211308, "epoch": 0.5309926050622257, "flos": 27457766040960.0, "grad_norm": 1.6650359898731233, "language_loss": 0.6668098, "learning_rate": 1.8964430996347842e-06, "loss": 0.68944228, "num_input_tokens_seen": 95379900, "step": 4416, "time_per_iteration": 2.781513214111328 }, { "auxiliary_loss_clip": 0.0128032, "auxiliary_loss_mlp": 0.0102768, "balance_loss_clip": 1.04844666, "balance_loss_mlp": 1.01982367, "epoch": 0.5311128479528648, "flos": 20514545026560.0, "grad_norm": 2.5482128225261467, "language_loss": 0.82303858, "learning_rate": 1.8956651807772931e-06, "loss": 0.84611857, "num_input_tokens_seen": 95397935, "step": 4417, "time_per_iteration": 2.63783860206604 }, { "auxiliary_loss_clip": 0.0123194, "auxiliary_loss_mlp": 0.01027937, "balance_loss_clip": 1.05372298, "balance_loss_mlp": 1.02066541, "epoch": 0.5312330908435039, "flos": 21397660807680.0, "grad_norm": 1.8349017987870098, "language_loss": 0.83970398, "learning_rate": 1.8948872777473115e-06, "loss": 0.86230278, "num_input_tokens_seen": 95415890, "step": 4418, "time_per_iteration": 2.6132283210754395 }, { "auxiliary_loss_clip": 0.0128452, "auxiliary_loss_mlp": 0.01026621, "balance_loss_clip": 1.05284858, "balance_loss_mlp": 1.01933193, "epoch": 0.531353333734143, "flos": 24717350741760.0, "grad_norm": 2.1121984100085873, "language_loss": 0.63418818, "learning_rate": 1.8941093906628458e-06, "loss": 0.65729952, "num_input_tokens_seen": 95433675, "step": 4419, "time_per_iteration": 2.7168147563934326 }, { "auxiliary_loss_clip": 0.01274532, "auxiliary_loss_mlp": 0.01029318, "balance_loss_clip": 1.04712641, "balance_loss_mlp": 1.02173066, "epoch": 0.531473576624782, "flos": 30480689808000.0, "grad_norm": 1.84051968648129, "language_loss": 0.71102995, "learning_rate": 1.893331519641902e-06, "loss": 0.73406845, "num_input_tokens_seen": 95455820, "step": 4420, "time_per_iteration": 2.7308335304260254 }, { "auxiliary_loss_clip": 0.01324801, "auxiliary_loss_mlp": 0.01026443, "balance_loss_clip": 1.04425144, "balance_loss_mlp": 1.01917171, "epoch": 0.5315938195154212, "flos": 23002975440000.0, "grad_norm": 2.4684487689074937, "language_loss": 0.73930788, "learning_rate": 1.8925536648024815e-06, "loss": 0.76282024, "num_input_tokens_seen": 95473240, "step": 4421, "time_per_iteration": 2.733017921447754 }, { "auxiliary_loss_clip": 0.01185747, "auxiliary_loss_mlp": 0.0102654, "balance_loss_clip": 1.05382848, "balance_loss_mlp": 1.01942956, "epoch": 0.5317140624060602, "flos": 22748584343040.0, "grad_norm": 2.8530276167130597, "language_loss": 0.75926602, "learning_rate": 1.8917758262625849e-06, "loss": 0.78138888, "num_input_tokens_seen": 95493480, "step": 4422, "time_per_iteration": 2.614306926727295 }, { "auxiliary_loss_clip": 0.01275069, "auxiliary_loss_mlp": 0.0102761, "balance_loss_clip": 1.05066037, "balance_loss_mlp": 1.02074623, "epoch": 0.5318343052966993, "flos": 22821087945600.0, "grad_norm": 1.7179753305538454, "language_loss": 0.80912864, "learning_rate": 1.8909980041402089e-06, "loss": 0.83215541, "num_input_tokens_seen": 95512075, "step": 4423, "time_per_iteration": 2.713853120803833 }, { "auxiliary_loss_clip": 0.01231037, "auxiliary_loss_mlp": 0.0102744, "balance_loss_clip": 1.05002475, "balance_loss_mlp": 1.01990294, "epoch": 0.5319545481873384, "flos": 13626089274240.0, "grad_norm": 2.954552652520029, "language_loss": 0.65331769, "learning_rate": 1.8902201985533494e-06, "loss": 0.67590249, "num_input_tokens_seen": 95529340, "step": 4424, "time_per_iteration": 2.5758488178253174 }, { "auxiliary_loss_clip": 0.01279161, "auxiliary_loss_mlp": 0.01024179, "balance_loss_clip": 1.05034637, "balance_loss_mlp": 1.01658547, "epoch": 0.5320747910779775, "flos": 22162522037760.0, "grad_norm": 1.7278943971376004, "language_loss": 0.74747902, "learning_rate": 1.8894424096199983e-06, "loss": 0.77051246, "num_input_tokens_seen": 95548545, "step": 4425, "time_per_iteration": 2.676978349685669 }, { "auxiliary_loss_clip": 0.01233542, "auxiliary_loss_mlp": 0.01027911, "balance_loss_clip": 1.05353498, "balance_loss_mlp": 1.02005553, "epoch": 0.5321950339686166, "flos": 18588081870720.0, "grad_norm": 2.3582776413376503, "language_loss": 0.85971999, "learning_rate": 1.8886646374581463e-06, "loss": 0.88233453, "num_input_tokens_seen": 95567770, "step": 4426, "time_per_iteration": 2.5816051959991455 }, { "auxiliary_loss_clip": 0.01232612, "auxiliary_loss_mlp": 0.0102908, "balance_loss_clip": 1.04971552, "balance_loss_mlp": 1.02184439, "epoch": 0.5323152768592557, "flos": 22856818999680.0, "grad_norm": 1.7382575540923493, "language_loss": 0.71058547, "learning_rate": 1.8878868821857795e-06, "loss": 0.73320246, "num_input_tokens_seen": 95587420, "step": 4427, "time_per_iteration": 3.639629602432251 }, { "auxiliary_loss_clip": 0.0138912, "auxiliary_loss_mlp": 0.01030532, "balance_loss_clip": 1.04870915, "balance_loss_mlp": 1.02187717, "epoch": 0.5324355197498948, "flos": 33948690998400.0, "grad_norm": 5.9573634477542345, "language_loss": 0.7541123, "learning_rate": 1.8871091439208838e-06, "loss": 0.77830881, "num_input_tokens_seen": 95609030, "step": 4428, "time_per_iteration": 2.8746516704559326 }, { "auxiliary_loss_clip": 0.01379223, "auxiliary_loss_mlp": 0.01033514, "balance_loss_clip": 1.04623222, "balance_loss_mlp": 1.02537179, "epoch": 0.5325557626405338, "flos": 23256720092160.0, "grad_norm": 2.0643864885502, "language_loss": 0.77679288, "learning_rate": 1.8863314227814414e-06, "loss": 0.80092019, "num_input_tokens_seen": 95627340, "step": 4429, "time_per_iteration": 3.6129109859466553 }, { "auxiliary_loss_clip": 0.01240028, "auxiliary_loss_mlp": 0.0103095, "balance_loss_clip": 1.05242968, "balance_loss_mlp": 1.02312934, "epoch": 0.532676005531173, "flos": 26718687797760.0, "grad_norm": 2.4008873478968735, "language_loss": 0.49043664, "learning_rate": 1.8855537188854313e-06, "loss": 0.5131464, "num_input_tokens_seen": 95646315, "step": 4430, "time_per_iteration": 2.668762445449829 }, { "auxiliary_loss_clip": 0.01234486, "auxiliary_loss_mlp": 0.01024015, "balance_loss_clip": 1.04832506, "balance_loss_mlp": 1.01664233, "epoch": 0.5327962484218121, "flos": 17894610921600.0, "grad_norm": 2.237363689320833, "language_loss": 0.78353643, "learning_rate": 1.8847760323508315e-06, "loss": 0.80612135, "num_input_tokens_seen": 95665220, "step": 4431, "time_per_iteration": 3.500570774078369 }, { "auxiliary_loss_clip": 0.01273079, "auxiliary_loss_mlp": 0.01033984, "balance_loss_clip": 1.04977298, "balance_loss_mlp": 1.0269084, "epoch": 0.5329164913124511, "flos": 17925385898880.0, "grad_norm": 2.095234196097736, "language_loss": 0.75783962, "learning_rate": 1.883998363295616e-06, "loss": 0.78091025, "num_input_tokens_seen": 95682700, "step": 4432, "time_per_iteration": 2.6286723613739014 }, { "auxiliary_loss_clip": 0.01177983, "auxiliary_loss_mlp": 0.01000131, "balance_loss_clip": 1.00955594, "balance_loss_mlp": 0.99929088, "epoch": 0.5330367342030903, "flos": 57254178781440.0, "grad_norm": 0.8799564695616859, "language_loss": 0.62592113, "learning_rate": 1.8832207118377565e-06, "loss": 0.64770222, "num_input_tokens_seen": 95738070, "step": 4433, "time_per_iteration": 3.1451516151428223 }, { "auxiliary_loss_clip": 0.01181565, "auxiliary_loss_mlp": 0.01025175, "balance_loss_clip": 1.05279422, "balance_loss_mlp": 1.01811218, "epoch": 0.5331569770937293, "flos": 17420518287360.0, "grad_norm": 2.168248543413643, "language_loss": 0.69501495, "learning_rate": 1.882443078095222e-06, "loss": 0.71708238, "num_input_tokens_seen": 95756950, "step": 4434, "time_per_iteration": 3.477083206176758 }, { "auxiliary_loss_clip": 0.01285994, "auxiliary_loss_mlp": 0.00998741, "balance_loss_clip": 1.01296687, "balance_loss_mlp": 0.99771559, "epoch": 0.5332772199843684, "flos": 56750783627520.0, "grad_norm": 0.8679342972089867, "language_loss": 0.66727257, "learning_rate": 1.8816654621859794e-06, "loss": 0.69011986, "num_input_tokens_seen": 95816615, "step": 4435, "time_per_iteration": 3.150754928588867 }, { "auxiliary_loss_clip": 0.01181802, "auxiliary_loss_mlp": 0.01026896, "balance_loss_clip": 1.05259717, "balance_loss_mlp": 1.01951146, "epoch": 0.5333974628750076, "flos": 18697753071360.0, "grad_norm": 2.9920142596969486, "language_loss": 0.72616208, "learning_rate": 1.8808878642279915e-06, "loss": 0.74824899, "num_input_tokens_seen": 95832020, "step": 4436, "time_per_iteration": 2.6199655532836914 }, { "auxiliary_loss_clip": 0.0133444, "auxiliary_loss_mlp": 0.01029574, "balance_loss_clip": 1.04364347, "balance_loss_mlp": 1.02204013, "epoch": 0.5335177057656466, "flos": 23805507058560.0, "grad_norm": 2.383317645569247, "language_loss": 0.6529274, "learning_rate": 1.8801102843392209e-06, "loss": 0.67656755, "num_input_tokens_seen": 95851425, "step": 4437, "time_per_iteration": 2.692514181137085 }, { "auxiliary_loss_clip": 0.01327756, "auxiliary_loss_mlp": 0.01032382, "balance_loss_clip": 1.04587352, "balance_loss_mlp": 1.02479768, "epoch": 0.5336379486562857, "flos": 25078683605760.0, "grad_norm": 1.648635795586077, "language_loss": 0.85297298, "learning_rate": 1.8793327226376238e-06, "loss": 0.8765744, "num_input_tokens_seen": 95870745, "step": 4438, "time_per_iteration": 2.7795395851135254 }, { "auxiliary_loss_clip": 0.01190682, "auxiliary_loss_mlp": 0.01028628, "balance_loss_clip": 1.04843211, "balance_loss_mlp": 1.02111435, "epoch": 0.5337581915469248, "flos": 21396691140480.0, "grad_norm": 1.840081934934768, "language_loss": 0.80241752, "learning_rate": 1.8785551792411569e-06, "loss": 0.82461059, "num_input_tokens_seen": 95889755, "step": 4439, "time_per_iteration": 2.646946907043457 }, { "auxiliary_loss_clip": 0.01283101, "auxiliary_loss_mlp": 0.01029977, "balance_loss_clip": 1.05275893, "balance_loss_mlp": 1.02329564, "epoch": 0.5338784344375639, "flos": 14865905064960.0, "grad_norm": 2.2743884325532417, "language_loss": 0.82838506, "learning_rate": 1.8777776542677733e-06, "loss": 0.85151589, "num_input_tokens_seen": 95907805, "step": 4440, "time_per_iteration": 2.664463520050049 }, { "auxiliary_loss_clip": 0.01329824, "auxiliary_loss_mlp": 0.01024842, "balance_loss_clip": 1.04469323, "balance_loss_mlp": 1.01752281, "epoch": 0.5339986773282029, "flos": 20813501923200.0, "grad_norm": 1.8779537978569611, "language_loss": 0.73376364, "learning_rate": 1.8770001478354216e-06, "loss": 0.75731027, "num_input_tokens_seen": 95927480, "step": 4441, "time_per_iteration": 2.6752665042877197 }, { "auxiliary_loss_clip": 0.0123362, "auxiliary_loss_mlp": 0.01031153, "balance_loss_clip": 1.05058801, "balance_loss_mlp": 1.02333295, "epoch": 0.5341189202188421, "flos": 17969089772160.0, "grad_norm": 2.1510530917278454, "language_loss": 0.83798563, "learning_rate": 1.8762226600620504e-06, "loss": 0.86063337, "num_input_tokens_seen": 95946095, "step": 4442, "time_per_iteration": 2.637486219406128 }, { "auxiliary_loss_clip": 0.01291935, "auxiliary_loss_mlp": 0.01027906, "balance_loss_clip": 1.04950392, "balance_loss_mlp": 1.01974034, "epoch": 0.5342391631094812, "flos": 11031866328960.0, "grad_norm": 2.396025824633304, "language_loss": 0.58787549, "learning_rate": 1.8754451910656031e-06, "loss": 0.61107385, "num_input_tokens_seen": 95959995, "step": 4443, "time_per_iteration": 2.5972797870635986 }, { "auxiliary_loss_clip": 0.01386649, "auxiliary_loss_mlp": 0.01030986, "balance_loss_clip": 1.04778862, "balance_loss_mlp": 1.02377415, "epoch": 0.5343594060001202, "flos": 15339135772800.0, "grad_norm": 1.8201204011138943, "language_loss": 0.8263858, "learning_rate": 1.8746677409640212e-06, "loss": 0.85056221, "num_input_tokens_seen": 95977095, "step": 4444, "time_per_iteration": 2.8030319213867188 }, { "auxiliary_loss_clip": 0.01240782, "auxiliary_loss_mlp": 0.01026178, "balance_loss_clip": 1.05538774, "balance_loss_mlp": 1.01889408, "epoch": 0.5344796488907594, "flos": 26900898514560.0, "grad_norm": 1.877989119785938, "language_loss": 0.84568352, "learning_rate": 1.8738903098752432e-06, "loss": 0.86835313, "num_input_tokens_seen": 95996225, "step": 4445, "time_per_iteration": 2.665954351425171 }, { "auxiliary_loss_clip": 0.01285127, "auxiliary_loss_mlp": 0.01026512, "balance_loss_clip": 1.04944932, "balance_loss_mlp": 1.0190165, "epoch": 0.5345998917813984, "flos": 25411216740480.0, "grad_norm": 2.2100647932407593, "language_loss": 0.73337996, "learning_rate": 1.8731128979172052e-06, "loss": 0.75649631, "num_input_tokens_seen": 96015425, "step": 4446, "time_per_iteration": 2.732459783554077 }, { "auxiliary_loss_clip": 0.01277291, "auxiliary_loss_mlp": 0.01023585, "balance_loss_clip": 1.04982626, "balance_loss_mlp": 1.01608062, "epoch": 0.5347201346720375, "flos": 32853379622400.0, "grad_norm": 2.2684301209695485, "language_loss": 0.67189038, "learning_rate": 1.8723355052078394e-06, "loss": 0.69489914, "num_input_tokens_seen": 96035460, "step": 4447, "time_per_iteration": 2.7272303104400635 }, { "auxiliary_loss_clip": 0.01235709, "auxiliary_loss_mlp": 0.01030463, "balance_loss_clip": 1.05191898, "balance_loss_mlp": 1.02230346, "epoch": 0.5348403775626767, "flos": 17967940536960.0, "grad_norm": 2.5287701583912825, "language_loss": 0.77675337, "learning_rate": 1.8715581318650765e-06, "loss": 0.79941511, "num_input_tokens_seen": 96054515, "step": 4448, "time_per_iteration": 2.6476898193359375 }, { "auxiliary_loss_clip": 0.01340867, "auxiliary_loss_mlp": 0.01033898, "balance_loss_clip": 1.04978836, "balance_loss_mlp": 1.0256424, "epoch": 0.5349606204533157, "flos": 17603339535360.0, "grad_norm": 3.3293249875912116, "language_loss": 0.82055134, "learning_rate": 1.8707807780068422e-06, "loss": 0.84429896, "num_input_tokens_seen": 96072330, "step": 4449, "time_per_iteration": 2.653832197189331 }, { "auxiliary_loss_clip": 0.0128224, "auxiliary_loss_mlp": 0.01028461, "balance_loss_clip": 1.05048192, "balance_loss_mlp": 1.02096534, "epoch": 0.5350808633439548, "flos": 29167831710720.0, "grad_norm": 2.458454015749908, "language_loss": 0.66511816, "learning_rate": 1.8700034437510611e-06, "loss": 0.68822521, "num_input_tokens_seen": 96092425, "step": 4450, "time_per_iteration": 2.6986446380615234 }, { "auxiliary_loss_clip": 0.01329006, "auxiliary_loss_mlp": 0.01030594, "balance_loss_clip": 1.04759598, "balance_loss_mlp": 1.02304244, "epoch": 0.5352011062345938, "flos": 19499997381120.0, "grad_norm": 2.4758487427762654, "language_loss": 0.81562459, "learning_rate": 1.8692261292156549e-06, "loss": 0.83922064, "num_input_tokens_seen": 96111660, "step": 4451, "time_per_iteration": 2.7031590938568115 }, { "auxiliary_loss_clip": 0.01182278, "auxiliary_loss_mlp": 0.01026737, "balance_loss_clip": 1.05501032, "balance_loss_mlp": 1.01920843, "epoch": 0.535321349125233, "flos": 23477642691840.0, "grad_norm": 2.0952953375274985, "language_loss": 0.81372982, "learning_rate": 1.8684488345185401e-06, "loss": 0.83581996, "num_input_tokens_seen": 96131835, "step": 4452, "time_per_iteration": 3.531764507293701 }, { "auxiliary_loss_clip": 0.0118612, "auxiliary_loss_mlp": 0.01028308, "balance_loss_clip": 1.05406499, "balance_loss_mlp": 1.02067327, "epoch": 0.535441592015872, "flos": 20478059786880.0, "grad_norm": 2.651664230588639, "language_loss": 0.79256016, "learning_rate": 1.8676715597776332e-06, "loss": 0.81470442, "num_input_tokens_seen": 96150180, "step": 4453, "time_per_iteration": 2.5915939807891846 }, { "auxiliary_loss_clip": 0.01369431, "auxiliary_loss_mlp": 0.0102492, "balance_loss_clip": 1.04292798, "balance_loss_mlp": 1.01729608, "epoch": 0.5355618349065111, "flos": 19573147428480.0, "grad_norm": 2.2575438676725015, "language_loss": 0.76202881, "learning_rate": 1.8668943051108455e-06, "loss": 0.78597224, "num_input_tokens_seen": 96167485, "step": 4454, "time_per_iteration": 2.7119991779327393 }, { "auxiliary_loss_clip": 0.01283186, "auxiliary_loss_mlp": 0.01026985, "balance_loss_clip": 1.04986429, "balance_loss_mlp": 1.0189085, "epoch": 0.5356820777971503, "flos": 24024633978240.0, "grad_norm": 2.002130354135141, "language_loss": 0.76859021, "learning_rate": 1.8661170706360856e-06, "loss": 0.7916919, "num_input_tokens_seen": 96186650, "step": 4455, "time_per_iteration": 3.665884256362915 }, { "auxiliary_loss_clip": 0.01227541, "auxiliary_loss_mlp": 0.01027977, "balance_loss_clip": 1.05181181, "balance_loss_mlp": 1.0208962, "epoch": 0.5358023206877893, "flos": 20884676722560.0, "grad_norm": 2.0759518817941722, "language_loss": 0.81333995, "learning_rate": 1.8653398564712594e-06, "loss": 0.83589518, "num_input_tokens_seen": 96205595, "step": 4456, "time_per_iteration": 2.68106746673584 }, { "auxiliary_loss_clip": 0.0123023, "auxiliary_loss_mlp": 0.01027946, "balance_loss_clip": 1.05139089, "balance_loss_mlp": 1.02055204, "epoch": 0.5359225635784284, "flos": 22418996123520.0, "grad_norm": 2.088769941457838, "language_loss": 0.82364392, "learning_rate": 1.8645626627342704e-06, "loss": 0.84622562, "num_input_tokens_seen": 96226360, "step": 4457, "time_per_iteration": 3.5616838932037354 }, { "auxiliary_loss_clip": 0.01237639, "auxiliary_loss_mlp": 0.01026638, "balance_loss_clip": 1.05225742, "balance_loss_mlp": 1.01932406, "epoch": 0.5360428064690675, "flos": 24097784025600.0, "grad_norm": 2.108636518134639, "language_loss": 0.81347299, "learning_rate": 1.8637854895430172e-06, "loss": 0.83611572, "num_input_tokens_seen": 96245625, "step": 4458, "time_per_iteration": 2.68507719039917 }, { "auxiliary_loss_clip": 0.0133553, "auxiliary_loss_mlp": 0.01030743, "balance_loss_clip": 1.05080569, "balance_loss_mlp": 1.02319694, "epoch": 0.5361630493597066, "flos": 21434505183360.0, "grad_norm": 2.308592280950981, "language_loss": 0.69802582, "learning_rate": 1.8630083370153978e-06, "loss": 0.72168857, "num_input_tokens_seen": 96265265, "step": 4459, "time_per_iteration": 2.680406093597412 }, { "auxiliary_loss_clip": 0.01340701, "auxiliary_loss_mlp": 0.00999757, "balance_loss_clip": 1.00906229, "balance_loss_mlp": 0.99872553, "epoch": 0.5362832922503457, "flos": 68888696520960.0, "grad_norm": 0.7424601735233134, "language_loss": 0.55357742, "learning_rate": 1.8622312052693041e-06, "loss": 0.57698202, "num_input_tokens_seen": 96326445, "step": 4460, "time_per_iteration": 4.30298924446106 }, { "auxiliary_loss_clip": 0.01228452, "auxiliary_loss_mlp": 0.01030988, "balance_loss_clip": 1.04871798, "balance_loss_mlp": 1.02350187, "epoch": 0.5364035351409848, "flos": 9793702563840.0, "grad_norm": 2.163201669690553, "language_loss": 0.72058737, "learning_rate": 1.8614540944226267e-06, "loss": 0.74318182, "num_input_tokens_seen": 96343115, "step": 4461, "time_per_iteration": 2.7324352264404297 }, { "auxiliary_loss_clip": 0.01280466, "auxiliary_loss_mlp": 0.01030302, "balance_loss_clip": 1.05351055, "balance_loss_mlp": 1.02326608, "epoch": 0.5365237780316239, "flos": 23290080848640.0, "grad_norm": 1.8662429531375802, "language_loss": 0.68007171, "learning_rate": 1.8606770045932537e-06, "loss": 0.70317942, "num_input_tokens_seen": 96362230, "step": 4462, "time_per_iteration": 2.6839284896850586 }, { "auxiliary_loss_clip": 0.0133058, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.04447007, "balance_loss_mlp": 1.02108169, "epoch": 0.5366440209222629, "flos": 26578133879040.0, "grad_norm": 1.8051391264652918, "language_loss": 0.81611609, "learning_rate": 1.859899935899068e-06, "loss": 0.83971369, "num_input_tokens_seen": 96382085, "step": 4463, "time_per_iteration": 2.7336862087249756 }, { "auxiliary_loss_clip": 0.01286476, "auxiliary_loss_mlp": 0.01024877, "balance_loss_clip": 1.05569458, "balance_loss_mlp": 1.01722074, "epoch": 0.5367642638129021, "flos": 19608052469760.0, "grad_norm": 1.668052983023667, "language_loss": 0.79300559, "learning_rate": 1.8591228884579506e-06, "loss": 0.81611907, "num_input_tokens_seen": 96400580, "step": 4464, "time_per_iteration": 2.621901512145996 }, { "auxiliary_loss_clip": 0.01334533, "auxiliary_loss_mlp": 0.01032528, "balance_loss_clip": 1.0492481, "balance_loss_mlp": 1.02501798, "epoch": 0.5368845067035412, "flos": 23915214172800.0, "grad_norm": 3.1392902232799496, "language_loss": 0.82464683, "learning_rate": 1.8583458623877795e-06, "loss": 0.8483175, "num_input_tokens_seen": 96419680, "step": 4465, "time_per_iteration": 2.7451260089874268 }, { "auxiliary_loss_clip": 0.01236215, "auxiliary_loss_mlp": 0.01031846, "balance_loss_clip": 1.0531311, "balance_loss_mlp": 1.02429414, "epoch": 0.5370047495941802, "flos": 16873131951360.0, "grad_norm": 1.9977581024357034, "language_loss": 0.74183148, "learning_rate": 1.8575688578064281e-06, "loss": 0.76451206, "num_input_tokens_seen": 96437805, "step": 4466, "time_per_iteration": 2.5836660861968994 }, { "auxiliary_loss_clip": 0.01238193, "auxiliary_loss_mlp": 0.01027027, "balance_loss_clip": 1.05459416, "balance_loss_mlp": 1.01903391, "epoch": 0.5371249924848194, "flos": 20740926493440.0, "grad_norm": 1.8722604868300439, "language_loss": 0.76916516, "learning_rate": 1.8567918748317674e-06, "loss": 0.79181737, "num_input_tokens_seen": 96457155, "step": 4467, "time_per_iteration": 2.6660170555114746 }, { "auxiliary_loss_clip": 0.01334079, "auxiliary_loss_mlp": 0.01035683, "balance_loss_clip": 1.04629922, "balance_loss_mlp": 1.02747583, "epoch": 0.5372452353754584, "flos": 17968120104960.0, "grad_norm": 1.8216586885843669, "language_loss": 0.82725358, "learning_rate": 1.8560149135816659e-06, "loss": 0.85095125, "num_input_tokens_seen": 96473990, "step": 4468, "time_per_iteration": 2.6919517517089844 }, { "auxiliary_loss_clip": 0.0123092, "auxiliary_loss_mlp": 0.0102783, "balance_loss_clip": 1.04974604, "balance_loss_mlp": 1.02005792, "epoch": 0.5373654782660975, "flos": 15377021642880.0, "grad_norm": 2.271618752112108, "language_loss": 0.84555787, "learning_rate": 1.8552379741739873e-06, "loss": 0.86814541, "num_input_tokens_seen": 96491335, "step": 4469, "time_per_iteration": 2.578207015991211 }, { "auxiliary_loss_clip": 0.01232019, "auxiliary_loss_mlp": 0.02507041, "balance_loss_clip": 1.01211643, "balance_loss_mlp": 1.00014198, "epoch": 0.5374857211567367, "flos": 69000091574400.0, "grad_norm": 0.8953746917126124, "language_loss": 0.55594438, "learning_rate": 1.8544610567265935e-06, "loss": 0.59333503, "num_input_tokens_seen": 96545275, "step": 4470, "time_per_iteration": 3.2288105487823486 }, { "auxiliary_loss_clip": 0.01280468, "auxiliary_loss_mlp": 0.0256823, "balance_loss_clip": 1.05268359, "balance_loss_mlp": 1.00017738, "epoch": 0.5376059640473757, "flos": 15085355207040.0, "grad_norm": 1.9516414112150426, "language_loss": 0.83214235, "learning_rate": 1.853684161357341e-06, "loss": 0.87062931, "num_input_tokens_seen": 96562935, "step": 4471, "time_per_iteration": 2.6232361793518066 }, { "auxiliary_loss_clip": 0.01234006, "auxiliary_loss_mlp": 0.02571642, "balance_loss_clip": 1.05478323, "balance_loss_mlp": 1.00018191, "epoch": 0.5377262069380148, "flos": 19792597570560.0, "grad_norm": 1.744406824581951, "language_loss": 0.77215898, "learning_rate": 1.852907288184085e-06, "loss": 0.81021547, "num_input_tokens_seen": 96581820, "step": 4472, "time_per_iteration": 2.6645655632019043 }, { "auxiliary_loss_clip": 0.01393296, "auxiliary_loss_mlp": 0.01025884, "balance_loss_clip": 1.05339622, "balance_loss_mlp": 1.01808476, "epoch": 0.5378464498286539, "flos": 30003077640960.0, "grad_norm": 1.9827958682681817, "language_loss": 0.70177197, "learning_rate": 1.8521304373246762e-06, "loss": 0.72596377, "num_input_tokens_seen": 96602865, "step": 4473, "time_per_iteration": 2.7705183029174805 }, { "auxiliary_loss_clip": 0.01237346, "auxiliary_loss_mlp": 0.01031154, "balance_loss_clip": 1.05104136, "balance_loss_mlp": 1.0230242, "epoch": 0.537966692719293, "flos": 21251217058560.0, "grad_norm": 3.9830663156830726, "language_loss": 0.89168549, "learning_rate": 1.8513536088969626e-06, "loss": 0.91437054, "num_input_tokens_seen": 96620530, "step": 4474, "time_per_iteration": 2.6913797855377197 }, { "auxiliary_loss_clip": 0.0123796, "auxiliary_loss_mlp": 0.01028596, "balance_loss_clip": 1.05624938, "balance_loss_mlp": 1.02066278, "epoch": 0.538086935609932, "flos": 21543170803200.0, "grad_norm": 1.8888907137139854, "language_loss": 0.80153477, "learning_rate": 1.8505768030187884e-06, "loss": 0.82420039, "num_input_tokens_seen": 96640660, "step": 4475, "time_per_iteration": 2.611132860183716 }, { "auxiliary_loss_clip": 0.01280675, "auxiliary_loss_mlp": 0.01032339, "balance_loss_clip": 1.05290806, "balance_loss_mlp": 1.02469754, "epoch": 0.5382071785005712, "flos": 22747219626240.0, "grad_norm": 1.6122809789026693, "language_loss": 0.79985315, "learning_rate": 1.849800019807995e-06, "loss": 0.82298326, "num_input_tokens_seen": 96661885, "step": 4476, "time_per_iteration": 2.760871171951294 }, { "auxiliary_loss_clip": 0.01333229, "auxiliary_loss_mlp": 0.01026469, "balance_loss_clip": 1.05125594, "balance_loss_mlp": 1.01845253, "epoch": 0.5383274213912103, "flos": 24934574240640.0, "grad_norm": 2.5351578031954416, "language_loss": 0.71024728, "learning_rate": 1.8490232593824186e-06, "loss": 0.73384428, "num_input_tokens_seen": 96678340, "step": 4477, "time_per_iteration": 2.666750192642212 }, { "auxiliary_loss_clip": 0.01277838, "auxiliary_loss_mlp": 0.0102779, "balance_loss_clip": 1.05238307, "balance_loss_mlp": 1.02010095, "epoch": 0.5384476642818493, "flos": 22310186849280.0, "grad_norm": 2.1199186884475947, "language_loss": 0.84909761, "learning_rate": 1.8482465218598935e-06, "loss": 0.87215388, "num_input_tokens_seen": 96698285, "step": 4478, "time_per_iteration": 3.93097186088562 }, { "auxiliary_loss_clip": 0.01335415, "auxiliary_loss_mlp": 0.01033104, "balance_loss_clip": 1.04879832, "balance_loss_mlp": 1.02499807, "epoch": 0.5385679071724885, "flos": 22711021695360.0, "grad_norm": 6.8765341992454445, "language_loss": 0.83740377, "learning_rate": 1.8474698073582508e-06, "loss": 0.86108899, "num_input_tokens_seen": 96719655, "step": 4479, "time_per_iteration": 2.7604830265045166 }, { "auxiliary_loss_clip": 0.01335198, "auxiliary_loss_mlp": 0.01028712, "balance_loss_clip": 1.04781747, "balance_loss_mlp": 1.02097583, "epoch": 0.5386881500631275, "flos": 15953746412160.0, "grad_norm": 84.01387600767887, "language_loss": 0.87722957, "learning_rate": 1.8466931159953166e-06, "loss": 0.90086871, "num_input_tokens_seen": 96736290, "step": 4480, "time_per_iteration": 2.697531223297119 }, { "auxiliary_loss_clip": 0.01286567, "auxiliary_loss_mlp": 0.01034512, "balance_loss_clip": 1.05371952, "balance_loss_mlp": 1.02592289, "epoch": 0.5388083929537666, "flos": 24060041809920.0, "grad_norm": 1.7187021731527345, "language_loss": 0.84615839, "learning_rate": 1.8459164478889158e-06, "loss": 0.86936915, "num_input_tokens_seen": 96757685, "step": 4481, "time_per_iteration": 3.5353281497955322 }, { "auxiliary_loss_clip": 0.01323668, "auxiliary_loss_mlp": 0.01025189, "balance_loss_clip": 1.04321766, "balance_loss_mlp": 1.01793492, "epoch": 0.5389286358444056, "flos": 22236893147520.0, "grad_norm": 1.934824225279793, "language_loss": 0.75817955, "learning_rate": 1.8451398031568663e-06, "loss": 0.78166819, "num_input_tokens_seen": 96777310, "step": 4482, "time_per_iteration": 2.759193181991577 }, { "auxiliary_loss_clip": 0.01329542, "auxiliary_loss_mlp": 0.01033483, "balance_loss_clip": 1.04790926, "balance_loss_mlp": 1.02526379, "epoch": 0.5390488787350448, "flos": 24281718595200.0, "grad_norm": 1.7603867054941869, "language_loss": 0.74447489, "learning_rate": 1.844363181916986e-06, "loss": 0.76810515, "num_input_tokens_seen": 96798035, "step": 4483, "time_per_iteration": 3.6051831245422363 }, { "auxiliary_loss_clip": 0.012364, "auxiliary_loss_mlp": 0.01027018, "balance_loss_clip": 1.05080962, "balance_loss_mlp": 1.01884627, "epoch": 0.5391691216256839, "flos": 16581393688320.0, "grad_norm": 1.8023594792739972, "language_loss": 0.8304863, "learning_rate": 1.8435865842870868e-06, "loss": 0.85312051, "num_input_tokens_seen": 96815975, "step": 4484, "time_per_iteration": 2.6228084564208984 }, { "auxiliary_loss_clip": 0.01279873, "auxiliary_loss_mlp": 0.02572097, "balance_loss_clip": 1.04685783, "balance_loss_mlp": 1.00013149, "epoch": 0.5392893645163229, "flos": 23330049707520.0, "grad_norm": 1.7860447664803163, "language_loss": 0.71614051, "learning_rate": 1.8428100103849787e-06, "loss": 0.75466025, "num_input_tokens_seen": 96835770, "step": 4485, "time_per_iteration": 2.659630298614502 }, { "auxiliary_loss_clip": 0.01279754, "auxiliary_loss_mlp": 0.01030758, "balance_loss_clip": 1.0531466, "balance_loss_mlp": 1.02278328, "epoch": 0.5394096074069621, "flos": 15669801400320.0, "grad_norm": 2.1943808600938386, "language_loss": 0.73252809, "learning_rate": 1.842033460328467e-06, "loss": 0.75563318, "num_input_tokens_seen": 96854490, "step": 4486, "time_per_iteration": 3.5680630207061768 }, { "auxiliary_loss_clip": 0.01285215, "auxiliary_loss_mlp": 0.02569911, "balance_loss_clip": 1.04815054, "balance_loss_mlp": 1.00008488, "epoch": 0.5395298502976011, "flos": 22893447893760.0, "grad_norm": 1.828554318849219, "language_loss": 0.75214338, "learning_rate": 1.8412569342353541e-06, "loss": 0.79069459, "num_input_tokens_seen": 96874645, "step": 4487, "time_per_iteration": 2.7275190353393555 }, { "auxiliary_loss_clip": 0.01288374, "auxiliary_loss_mlp": 0.01032049, "balance_loss_clip": 1.05279601, "balance_loss_mlp": 1.02309656, "epoch": 0.5396500931882402, "flos": 23842135952640.0, "grad_norm": 2.153080314040614, "language_loss": 0.85085917, "learning_rate": 1.840480432223438e-06, "loss": 0.87406337, "num_input_tokens_seen": 96893650, "step": 4488, "time_per_iteration": 2.754060983657837 }, { "auxiliary_loss_clip": 0.01285937, "auxiliary_loss_mlp": 0.01030779, "balance_loss_clip": 1.0503118, "balance_loss_mlp": 1.02334595, "epoch": 0.5397703360788794, "flos": 26322988596480.0, "grad_norm": 2.3353168442384624, "language_loss": 0.7787292, "learning_rate": 1.8397039544105131e-06, "loss": 0.80189633, "num_input_tokens_seen": 96912735, "step": 4489, "time_per_iteration": 2.742990732192993 }, { "auxiliary_loss_clip": 0.01282915, "auxiliary_loss_mlp": 0.01030281, "balance_loss_clip": 1.04760683, "balance_loss_mlp": 1.02228832, "epoch": 0.5398905789695184, "flos": 21214588164480.0, "grad_norm": 2.0294604865489307, "language_loss": 0.69619083, "learning_rate": 1.8389275009143711e-06, "loss": 0.7193228, "num_input_tokens_seen": 96932475, "step": 4490, "time_per_iteration": 2.720344305038452 }, { "auxiliary_loss_clip": 0.01183139, "auxiliary_loss_mlp": 0.01029915, "balance_loss_clip": 1.05277562, "balance_loss_mlp": 1.02271473, "epoch": 0.5400108218601575, "flos": 25080335631360.0, "grad_norm": 1.7068144331066497, "language_loss": 0.73812258, "learning_rate": 1.8381510718527988e-06, "loss": 0.76025307, "num_input_tokens_seen": 96952085, "step": 4491, "time_per_iteration": 2.622429609298706 }, { "auxiliary_loss_clip": 0.01285631, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.04764557, "balance_loss_mlp": 1.027233, "epoch": 0.5401310647507966, "flos": 26357498588160.0, "grad_norm": 2.1186050753799677, "language_loss": 0.63425952, "learning_rate": 1.8373746673435812e-06, "loss": 0.65746772, "num_input_tokens_seen": 96973110, "step": 4492, "time_per_iteration": 2.7408626079559326 }, { "auxiliary_loss_clip": 0.0118726, "auxiliary_loss_mlp": 0.01026415, "balance_loss_clip": 1.05554032, "balance_loss_mlp": 1.01832032, "epoch": 0.5402513076414357, "flos": 27855332749440.0, "grad_norm": 1.5734292605496722, "language_loss": 0.7904737, "learning_rate": 1.8365982875044964e-06, "loss": 0.81261039, "num_input_tokens_seen": 96993420, "step": 4493, "time_per_iteration": 2.6595115661621094 }, { "auxiliary_loss_clip": 0.01240675, "auxiliary_loss_mlp": 0.02574928, "balance_loss_clip": 1.0539763, "balance_loss_mlp": 1.00012183, "epoch": 0.5403715505320748, "flos": 22893771116160.0, "grad_norm": 2.403059831138205, "language_loss": 0.7599628, "learning_rate": 1.8358219324533217e-06, "loss": 0.79811883, "num_input_tokens_seen": 97013685, "step": 4494, "time_per_iteration": 2.6333398818969727 }, { "auxiliary_loss_clip": 0.01279005, "auxiliary_loss_mlp": 0.01025114, "balance_loss_clip": 1.04934728, "balance_loss_mlp": 1.01823545, "epoch": 0.5404917934227139, "flos": 30224143895040.0, "grad_norm": 1.6994184572004745, "language_loss": 0.70376778, "learning_rate": 1.8350456023078292e-06, "loss": 0.72680891, "num_input_tokens_seen": 97036060, "step": 4495, "time_per_iteration": 2.715528964996338 }, { "auxiliary_loss_clip": 0.01189006, "auxiliary_loss_mlp": 0.01029381, "balance_loss_clip": 1.05282462, "balance_loss_mlp": 1.02132237, "epoch": 0.540612036313353, "flos": 19938502615680.0, "grad_norm": 2.527437182760878, "language_loss": 0.78297204, "learning_rate": 1.8342692971857874e-06, "loss": 0.80515587, "num_input_tokens_seen": 97055260, "step": 4496, "time_per_iteration": 2.637469530105591 }, { "auxiliary_loss_clip": 0.01278419, "auxiliary_loss_mlp": 0.01027076, "balance_loss_clip": 1.04976058, "balance_loss_mlp": 1.01939893, "epoch": 0.540732279203992, "flos": 24279599692800.0, "grad_norm": 6.590252840968521, "language_loss": 0.71451843, "learning_rate": 1.833493017204962e-06, "loss": 0.73757339, "num_input_tokens_seen": 97075365, "step": 4497, "time_per_iteration": 2.6559832096099854 }, { "auxiliary_loss_clip": 0.01186252, "auxiliary_loss_mlp": 0.01027985, "balance_loss_clip": 1.05397558, "balance_loss_mlp": 1.01939631, "epoch": 0.5408525220946312, "flos": 20193216935040.0, "grad_norm": 6.049448983686824, "language_loss": 0.77720964, "learning_rate": 1.8327167624831134e-06, "loss": 0.79935193, "num_input_tokens_seen": 97093095, "step": 4498, "time_per_iteration": 2.6645667552948 }, { "auxiliary_loss_clip": 0.01187516, "auxiliary_loss_mlp": 0.01027365, "balance_loss_clip": 1.05645776, "balance_loss_mlp": 1.01919961, "epoch": 0.5409727649852702, "flos": 24134448833280.0, "grad_norm": 2.1629203208861334, "language_loss": 0.70788091, "learning_rate": 1.831940533137999e-06, "loss": 0.7300297, "num_input_tokens_seen": 97112000, "step": 4499, "time_per_iteration": 2.5630605220794678 }, { "auxiliary_loss_clip": 0.01238949, "auxiliary_loss_mlp": 0.01030086, "balance_loss_clip": 1.05666375, "balance_loss_mlp": 1.02241516, "epoch": 0.5410930078759093, "flos": 23912700220800.0, "grad_norm": 1.6920747243818073, "language_loss": 0.72545588, "learning_rate": 1.8311643292873718e-06, "loss": 0.74814624, "num_input_tokens_seen": 97130820, "step": 4500, "time_per_iteration": 2.749964714050293 }, { "auxiliary_loss_clip": 0.01227367, "auxiliary_loss_mlp": 0.01030112, "balance_loss_clip": 1.05212712, "balance_loss_mlp": 1.02289379, "epoch": 0.5412132507665485, "flos": 21105132445440.0, "grad_norm": 1.8655982564667606, "language_loss": 0.88017476, "learning_rate": 1.8303881510489818e-06, "loss": 0.90274954, "num_input_tokens_seen": 97149210, "step": 4501, "time_per_iteration": 2.6332592964172363 }, { "auxiliary_loss_clip": 0.01284191, "auxiliary_loss_mlp": 0.01025602, "balance_loss_clip": 1.05249977, "balance_loss_mlp": 1.01756692, "epoch": 0.5413334936571875, "flos": 30227340205440.0, "grad_norm": 7.174409592603019, "language_loss": 0.69060445, "learning_rate": 1.829611998540574e-06, "loss": 0.71370238, "num_input_tokens_seen": 97170415, "step": 4502, "time_per_iteration": 2.731602191925049 }, { "auxiliary_loss_clip": 0.01232185, "auxiliary_loss_mlp": 0.02571935, "balance_loss_clip": 1.05078101, "balance_loss_mlp": 1.00010061, "epoch": 0.5414537365478266, "flos": 24279635606400.0, "grad_norm": 1.6719564317558226, "language_loss": 0.7986939, "learning_rate": 1.8288358718798914e-06, "loss": 0.83673513, "num_input_tokens_seen": 97189605, "step": 4503, "time_per_iteration": 2.6927952766418457 }, { "auxiliary_loss_clip": 0.01229492, "auxiliary_loss_mlp": 0.02568765, "balance_loss_clip": 1.05172086, "balance_loss_mlp": 1.00001073, "epoch": 0.5415739794384657, "flos": 16654543735680.0, "grad_norm": 1.6520341686985553, "language_loss": 0.72547799, "learning_rate": 1.8280597711846703e-06, "loss": 0.76346052, "num_input_tokens_seen": 97207845, "step": 4504, "time_per_iteration": 3.5981485843658447 }, { "auxiliary_loss_clip": 0.01239058, "auxiliary_loss_mlp": 0.01029801, "balance_loss_clip": 1.0572989, "balance_loss_mlp": 1.02239847, "epoch": 0.5416942223291048, "flos": 23185724860800.0, "grad_norm": 1.8969772841744423, "language_loss": 0.83687192, "learning_rate": 1.8272836965726455e-06, "loss": 0.85956049, "num_input_tokens_seen": 97226780, "step": 4505, "time_per_iteration": 2.6601197719573975 }, { "auxiliary_loss_clip": 0.01434436, "auxiliary_loss_mlp": 0.01030774, "balance_loss_clip": 1.0441308, "balance_loss_mlp": 1.02287006, "epoch": 0.5418144652197439, "flos": 20303247271680.0, "grad_norm": 1.8435670840164178, "language_loss": 0.78607178, "learning_rate": 1.8265076481615461e-06, "loss": 0.81072384, "num_input_tokens_seen": 97246695, "step": 4506, "time_per_iteration": 2.8759567737579346 }, { "auxiliary_loss_clip": 0.01283053, "auxiliary_loss_mlp": 0.01031091, "balance_loss_clip": 1.05370307, "balance_loss_mlp": 1.02329183, "epoch": 0.541934708110383, "flos": 12458633431680.0, "grad_norm": 2.130627239378055, "language_loss": 0.87794524, "learning_rate": 1.8257316260690987e-06, "loss": 0.90108669, "num_input_tokens_seen": 97264480, "step": 4507, "time_per_iteration": 3.5824501514434814 }, { "auxiliary_loss_clip": 0.01137866, "auxiliary_loss_mlp": 0.01024355, "balance_loss_clip": 1.05399561, "balance_loss_mlp": 1.01718497, "epoch": 0.5420549510010221, "flos": 21253802837760.0, "grad_norm": 1.4897702558176484, "language_loss": 0.76170838, "learning_rate": 1.8249556304130254e-06, "loss": 0.78333056, "num_input_tokens_seen": 97285760, "step": 4508, "time_per_iteration": 2.612170457839966 }, { "auxiliary_loss_clip": 0.01276188, "auxiliary_loss_mlp": 0.01030319, "balance_loss_clip": 1.04818344, "balance_loss_mlp": 1.02239788, "epoch": 0.5421751938916611, "flos": 29490524519040.0, "grad_norm": 18.734975432679548, "language_loss": 0.68405402, "learning_rate": 1.824179661311044e-06, "loss": 0.70711911, "num_input_tokens_seen": 97304510, "step": 4509, "time_per_iteration": 3.590867042541504 }, { "auxiliary_loss_clip": 0.01379585, "auxiliary_loss_mlp": 0.01028302, "balance_loss_clip": 1.042009, "balance_loss_mlp": 1.02106047, "epoch": 0.5422954367823003, "flos": 18734238311040.0, "grad_norm": 1.9151834614788033, "language_loss": 0.7952081, "learning_rate": 1.823403718880868e-06, "loss": 0.81928694, "num_input_tokens_seen": 97323270, "step": 4510, "time_per_iteration": 2.7538390159606934 }, { "auxiliary_loss_clip": 0.01285111, "auxiliary_loss_mlp": 0.01031238, "balance_loss_clip": 1.04706228, "balance_loss_mlp": 1.02322769, "epoch": 0.5424156796729394, "flos": 39969006940800.0, "grad_norm": 2.231119656455754, "language_loss": 0.66278648, "learning_rate": 1.822627803240207e-06, "loss": 0.68594998, "num_input_tokens_seen": 97345600, "step": 4511, "time_per_iteration": 2.8428750038146973 }, { "auxiliary_loss_clip": 0.01334292, "auxiliary_loss_mlp": 0.01028313, "balance_loss_clip": 1.04843974, "balance_loss_mlp": 1.02116346, "epoch": 0.5425359225635784, "flos": 11546538353280.0, "grad_norm": 2.207829462459169, "language_loss": 0.85388541, "learning_rate": 1.8218519145067675e-06, "loss": 0.87751144, "num_input_tokens_seen": 97361220, "step": 4512, "time_per_iteration": 3.561455726623535 }, { "auxiliary_loss_clip": 0.01328903, "auxiliary_loss_mlp": 0.01026957, "balance_loss_clip": 1.04729211, "balance_loss_mlp": 1.01933408, "epoch": 0.5426561654542175, "flos": 20229702174720.0, "grad_norm": 2.0436150692627417, "language_loss": 0.89546764, "learning_rate": 1.8210760527982508e-06, "loss": 0.9190262, "num_input_tokens_seen": 97381505, "step": 4513, "time_per_iteration": 2.7052371501922607 }, { "auxiliary_loss_clip": 0.01285174, "auxiliary_loss_mlp": 0.02567924, "balance_loss_clip": 1.05340958, "balance_loss_mlp": 0.99999392, "epoch": 0.5427764083448566, "flos": 21871681614720.0, "grad_norm": 4.559731883494288, "language_loss": 0.75159764, "learning_rate": 1.8203002182323552e-06, "loss": 0.79012871, "num_input_tokens_seen": 97399060, "step": 4514, "time_per_iteration": 2.727151393890381 }, { "auxiliary_loss_clip": 0.01291444, "auxiliary_loss_mlp": 0.01029084, "balance_loss_clip": 1.0534302, "balance_loss_mlp": 1.02112079, "epoch": 0.5428966512354957, "flos": 19640946349440.0, "grad_norm": 1.7681589357186471, "language_loss": 0.75639832, "learning_rate": 1.819524410926773e-06, "loss": 0.77960366, "num_input_tokens_seen": 97416740, "step": 4515, "time_per_iteration": 2.658545732498169 }, { "auxiliary_loss_clip": 0.01428535, "auxiliary_loss_mlp": 0.01026859, "balance_loss_clip": 1.04891884, "balance_loss_mlp": 1.01939631, "epoch": 0.5430168941261347, "flos": 22382187661440.0, "grad_norm": 1.495903969937153, "language_loss": 0.77149403, "learning_rate": 1.8187486309991944e-06, "loss": 0.79604793, "num_input_tokens_seen": 97437620, "step": 4516, "time_per_iteration": 2.849848985671997 }, { "auxiliary_loss_clip": 0.01141279, "auxiliary_loss_mlp": 0.01033135, "balance_loss_clip": 1.05400169, "balance_loss_mlp": 1.02585697, "epoch": 0.5431371370167739, "flos": 18764187275520.0, "grad_norm": 2.2904099498481565, "language_loss": 0.77523255, "learning_rate": 1.817972878567304e-06, "loss": 0.79697669, "num_input_tokens_seen": 97456275, "step": 4517, "time_per_iteration": 2.7956297397613525 }, { "auxiliary_loss_clip": 0.01289402, "auxiliary_loss_mlp": 0.01033158, "balance_loss_clip": 1.05074358, "balance_loss_mlp": 1.02535558, "epoch": 0.543257379907413, "flos": 18806023641600.0, "grad_norm": 1.8034055347374058, "language_loss": 0.76341069, "learning_rate": 1.8171971537487834e-06, "loss": 0.78663635, "num_input_tokens_seen": 97474925, "step": 4518, "time_per_iteration": 2.659496784210205 }, { "auxiliary_loss_clip": 0.01187936, "auxiliary_loss_mlp": 0.01026874, "balance_loss_clip": 1.05330682, "balance_loss_mlp": 1.01934588, "epoch": 0.543377622798052, "flos": 17493381025920.0, "grad_norm": 1.790860051453592, "language_loss": 0.80629349, "learning_rate": 1.8164214566613093e-06, "loss": 0.8284415, "num_input_tokens_seen": 97493550, "step": 4519, "time_per_iteration": 2.6098196506500244 }, { "auxiliary_loss_clip": 0.01182227, "auxiliary_loss_mlp": 0.01029875, "balance_loss_clip": 1.05164289, "balance_loss_mlp": 1.02226305, "epoch": 0.5434978656886912, "flos": 18989311766400.0, "grad_norm": 22.717279894417217, "language_loss": 0.66310996, "learning_rate": 1.8156457874225547e-06, "loss": 0.68523097, "num_input_tokens_seen": 97512010, "step": 4520, "time_per_iteration": 2.5993218421936035 }, { "auxiliary_loss_clip": 0.01273562, "auxiliary_loss_mlp": 0.01027522, "balance_loss_clip": 1.04960048, "balance_loss_mlp": 1.020787, "epoch": 0.5436181085793302, "flos": 17274936464640.0, "grad_norm": 2.1268214146062587, "language_loss": 0.80624598, "learning_rate": 1.814870146150187e-06, "loss": 0.82925683, "num_input_tokens_seen": 97530120, "step": 4521, "time_per_iteration": 2.632239580154419 }, { "auxiliary_loss_clip": 0.01189802, "auxiliary_loss_mlp": 0.01031256, "balance_loss_clip": 1.04858398, "balance_loss_mlp": 1.02362084, "epoch": 0.5437383514699693, "flos": 19098587917440.0, "grad_norm": 1.9880651742824034, "language_loss": 0.78589112, "learning_rate": 1.814094532961871e-06, "loss": 0.80810165, "num_input_tokens_seen": 97548695, "step": 4522, "time_per_iteration": 2.664611577987671 }, { "auxiliary_loss_clip": 0.01385493, "auxiliary_loss_mlp": 0.01029213, "balance_loss_clip": 1.0449785, "balance_loss_mlp": 1.02141118, "epoch": 0.5438585943606085, "flos": 22602715211520.0, "grad_norm": 1.846198873535851, "language_loss": 0.83866107, "learning_rate": 1.8133189479752666e-06, "loss": 0.86280811, "num_input_tokens_seen": 97567625, "step": 4523, "time_per_iteration": 2.8159639835357666 }, { "auxiliary_loss_clip": 0.011806, "auxiliary_loss_mlp": 0.01032773, "balance_loss_clip": 1.05122495, "balance_loss_mlp": 1.02554929, "epoch": 0.5439788372512475, "flos": 21798495653760.0, "grad_norm": 9.38377310482567, "language_loss": 0.82213652, "learning_rate": 1.8125433913080292e-06, "loss": 0.84427023, "num_input_tokens_seen": 97585325, "step": 4524, "time_per_iteration": 2.5612564086914062 }, { "auxiliary_loss_clip": 0.01583549, "auxiliary_loss_mlp": 0.01030194, "balance_loss_clip": 1.04400575, "balance_loss_mlp": 1.02289844, "epoch": 0.5440990801418866, "flos": 16399362539520.0, "grad_norm": 2.67320244953315, "language_loss": 0.82473689, "learning_rate": 1.811767863077811e-06, "loss": 0.8508743, "num_input_tokens_seen": 97604275, "step": 4525, "time_per_iteration": 2.988917827606201 }, { "auxiliary_loss_clip": 0.01422784, "auxiliary_loss_mlp": 0.01028875, "balance_loss_clip": 1.04622662, "balance_loss_mlp": 1.02184796, "epoch": 0.5442193230325257, "flos": 21615638492160.0, "grad_norm": 1.5842614954494652, "language_loss": 0.78483033, "learning_rate": 1.8109923634022577e-06, "loss": 0.80934691, "num_input_tokens_seen": 97624300, "step": 4526, "time_per_iteration": 2.9779064655303955 }, { "auxiliary_loss_clip": 0.01186187, "auxiliary_loss_mlp": 0.01026552, "balance_loss_clip": 1.05318046, "balance_loss_mlp": 1.01920557, "epoch": 0.5443395659231648, "flos": 15481198062720.0, "grad_norm": 2.1603266645149946, "language_loss": 0.86411786, "learning_rate": 1.8102168923990128e-06, "loss": 0.88624519, "num_input_tokens_seen": 97637845, "step": 4527, "time_per_iteration": 2.7571797370910645 }, { "auxiliary_loss_clip": 0.01237, "auxiliary_loss_mlp": 0.02568195, "balance_loss_clip": 1.0537076, "balance_loss_mlp": 1.00003242, "epoch": 0.5444598088138038, "flos": 18770436241920.0, "grad_norm": 2.997992636863713, "language_loss": 0.7961989, "learning_rate": 1.809441450185714e-06, "loss": 0.83425081, "num_input_tokens_seen": 97656330, "step": 4528, "time_per_iteration": 2.7207083702087402 }, { "auxiliary_loss_clip": 0.012863, "auxiliary_loss_mlp": 0.01025833, "balance_loss_clip": 1.04706907, "balance_loss_mlp": 1.0180006, "epoch": 0.544580051704443, "flos": 21142335957120.0, "grad_norm": 2.3716861460386442, "language_loss": 0.73596263, "learning_rate": 1.8086660368799958e-06, "loss": 0.75908399, "num_input_tokens_seen": 97674380, "step": 4529, "time_per_iteration": 2.667907953262329 }, { "auxiliary_loss_clip": 0.01283593, "auxiliary_loss_mlp": 0.01029788, "balance_loss_clip": 1.05099499, "balance_loss_mlp": 1.02139592, "epoch": 0.5447002945950821, "flos": 32491508054400.0, "grad_norm": 1.745474619489347, "language_loss": 0.77647173, "learning_rate": 1.807890652599488e-06, "loss": 0.79960549, "num_input_tokens_seen": 97698765, "step": 4530, "time_per_iteration": 4.2622387409210205 }, { "auxiliary_loss_clip": 0.01181446, "auxiliary_loss_mlp": 0.01027939, "balance_loss_clip": 1.05260086, "balance_loss_mlp": 1.02097154, "epoch": 0.5448205374857211, "flos": 11798307757440.0, "grad_norm": 2.0374502603550955, "language_loss": 0.82604313, "learning_rate": 1.8071152974618156e-06, "loss": 0.84813696, "num_input_tokens_seen": 97716565, "step": 4531, "time_per_iteration": 2.622394561767578 }, { "auxiliary_loss_clip": 0.01332129, "auxiliary_loss_mlp": 0.02569175, "balance_loss_clip": 1.04577947, "balance_loss_mlp": 1.00003767, "epoch": 0.5449407803763603, "flos": 24133766474880.0, "grad_norm": 2.2034734964559073, "language_loss": 0.78263915, "learning_rate": 1.806339971584599e-06, "loss": 0.82165217, "num_input_tokens_seen": 97733225, "step": 4532, "time_per_iteration": 2.681488513946533 }, { "auxiliary_loss_clip": 0.01184245, "auxiliary_loss_mlp": 0.01025221, "balance_loss_clip": 1.05327892, "balance_loss_mlp": 1.01825368, "epoch": 0.5450610232669993, "flos": 23258551685760.0, "grad_norm": 1.8577325122815846, "language_loss": 0.85463971, "learning_rate": 1.8055646750854546e-06, "loss": 0.87673438, "num_input_tokens_seen": 97752735, "step": 4533, "time_per_iteration": 3.44612717628479 }, { "auxiliary_loss_clip": 0.01287062, "auxiliary_loss_mlp": 0.01033081, "balance_loss_clip": 1.05132365, "balance_loss_mlp": 1.02547503, "epoch": 0.5451812661576384, "flos": 17785083375360.0, "grad_norm": 2.404428848962965, "language_loss": 0.81787121, "learning_rate": 1.8047894080819945e-06, "loss": 0.84107268, "num_input_tokens_seen": 97769985, "step": 4534, "time_per_iteration": 2.6227357387542725 }, { "auxiliary_loss_clip": 0.01071526, "auxiliary_loss_mlp": 0.01007921, "balance_loss_clip": 1.01297879, "balance_loss_mlp": 1.00700307, "epoch": 0.5453015090482776, "flos": 71062586513280.0, "grad_norm": 0.722843420769958, "language_loss": 0.63172281, "learning_rate": 1.8040141706918258e-06, "loss": 0.65251732, "num_input_tokens_seen": 97831225, "step": 4535, "time_per_iteration": 4.191967487335205 }, { "auxiliary_loss_clip": 0.01184263, "auxiliary_loss_mlp": 0.01025745, "balance_loss_clip": 1.05089355, "balance_loss_mlp": 1.01859236, "epoch": 0.5454217519389166, "flos": 25552201622400.0, "grad_norm": 1.9239383213910626, "language_loss": 0.77068847, "learning_rate": 1.8032389630325525e-06, "loss": 0.79278857, "num_input_tokens_seen": 97849975, "step": 4536, "time_per_iteration": 2.7548232078552246 }, { "auxiliary_loss_clip": 0.01280904, "auxiliary_loss_mlp": 0.01027561, "balance_loss_clip": 1.04486597, "balance_loss_mlp": 1.02066135, "epoch": 0.5455419948295557, "flos": 23658345037440.0, "grad_norm": 1.9732184564176583, "language_loss": 0.75688046, "learning_rate": 1.8024637852217707e-06, "loss": 0.77996516, "num_input_tokens_seen": 97869700, "step": 4537, "time_per_iteration": 2.675246000289917 }, { "auxiliary_loss_clip": 0.01280984, "auxiliary_loss_mlp": 0.01027753, "balance_loss_clip": 1.05081511, "balance_loss_mlp": 1.02078819, "epoch": 0.5456622377201948, "flos": 23403989854080.0, "grad_norm": 1.927195178386429, "language_loss": 0.84745532, "learning_rate": 1.8016886373770766e-06, "loss": 0.87054271, "num_input_tokens_seen": 97888215, "step": 4538, "time_per_iteration": 2.7440197467803955 }, { "auxiliary_loss_clip": 0.01283694, "auxiliary_loss_mlp": 0.01027614, "balance_loss_clip": 1.04930031, "balance_loss_mlp": 1.02009165, "epoch": 0.5457824806108339, "flos": 23988040997760.0, "grad_norm": 1.7333558321307618, "language_loss": 0.79244781, "learning_rate": 1.8009135196160579e-06, "loss": 0.81556094, "num_input_tokens_seen": 97907090, "step": 4539, "time_per_iteration": 3.633702516555786 }, { "auxiliary_loss_clip": 0.01328266, "auxiliary_loss_mlp": 0.01031144, "balance_loss_clip": 1.04686546, "balance_loss_mlp": 1.02390516, "epoch": 0.545902723501473, "flos": 22565870835840.0, "grad_norm": 2.1583400820763705, "language_loss": 0.84296107, "learning_rate": 1.8001384320563e-06, "loss": 0.86655509, "num_input_tokens_seen": 97927345, "step": 4540, "time_per_iteration": 2.7704291343688965 }, { "auxiliary_loss_clip": 0.01071155, "auxiliary_loss_mlp": 0.01004808, "balance_loss_clip": 1.01249921, "balance_loss_mlp": 1.00386071, "epoch": 0.5460229663921121, "flos": 55198399685760.0, "grad_norm": 0.7712229241989073, "language_loss": 0.57726693, "learning_rate": 1.7993633748153833e-06, "loss": 0.59802657, "num_input_tokens_seen": 97981950, "step": 4541, "time_per_iteration": 3.054633378982544 }, { "auxiliary_loss_clip": 0.01238785, "auxiliary_loss_mlp": 0.01024681, "balance_loss_clip": 1.05315721, "balance_loss_mlp": 1.01764131, "epoch": 0.5461432092827512, "flos": 15413866018560.0, "grad_norm": 1.9407270689064575, "language_loss": 0.72759199, "learning_rate": 1.7985883480108834e-06, "loss": 0.75022662, "num_input_tokens_seen": 97999585, "step": 4542, "time_per_iteration": 2.593532085418701 }, { "auxiliary_loss_clip": 0.01234194, "auxiliary_loss_mlp": 0.0102539, "balance_loss_clip": 1.05098093, "balance_loss_mlp": 1.01839209, "epoch": 0.5462634521733902, "flos": 24024921287040.0, "grad_norm": 2.6504388740649856, "language_loss": 0.72464907, "learning_rate": 1.797813351760371e-06, "loss": 0.74724489, "num_input_tokens_seen": 98021290, "step": 4543, "time_per_iteration": 2.7103240489959717 }, { "auxiliary_loss_clip": 0.01184304, "auxiliary_loss_mlp": 0.01028163, "balance_loss_clip": 1.05220294, "balance_loss_mlp": 1.02083755, "epoch": 0.5463836950640293, "flos": 22820944291200.0, "grad_norm": 2.028097284284308, "language_loss": 0.78036577, "learning_rate": 1.7970383861814116e-06, "loss": 0.80249047, "num_input_tokens_seen": 98041060, "step": 4544, "time_per_iteration": 2.6060426235198975 }, { "auxiliary_loss_clip": 0.01232552, "auxiliary_loss_mlp": 0.01028406, "balance_loss_clip": 1.05304623, "balance_loss_mlp": 1.02028155, "epoch": 0.5465039379546685, "flos": 20448290390400.0, "grad_norm": 2.1866419856603345, "language_loss": 0.74155116, "learning_rate": 1.7962634513915684e-06, "loss": 0.76416069, "num_input_tokens_seen": 98058410, "step": 4545, "time_per_iteration": 2.667836904525757 }, { "auxiliary_loss_clip": 0.01182284, "auxiliary_loss_mlp": 0.0102707, "balance_loss_clip": 1.0525347, "balance_loss_mlp": 1.02000701, "epoch": 0.5466241808453075, "flos": 17343310003200.0, "grad_norm": 1.5617020868691744, "language_loss": 0.79374468, "learning_rate": 1.7954885475083969e-06, "loss": 0.81583822, "num_input_tokens_seen": 98076080, "step": 4546, "time_per_iteration": 2.608644962310791 }, { "auxiliary_loss_clip": 0.01186926, "auxiliary_loss_mlp": 0.01025106, "balance_loss_clip": 1.05423212, "balance_loss_mlp": 1.01738429, "epoch": 0.5467444237359466, "flos": 21617039122560.0, "grad_norm": 2.709724757275978, "language_loss": 0.73025602, "learning_rate": 1.7947136746494513e-06, "loss": 0.75237632, "num_input_tokens_seen": 98096995, "step": 4547, "time_per_iteration": 2.647827625274658 }, { "auxiliary_loss_clip": 0.01231108, "auxiliary_loss_mlp": 0.0102524, "balance_loss_clip": 1.0529213, "balance_loss_mlp": 1.01813793, "epoch": 0.5468646666265857, "flos": 24170467196160.0, "grad_norm": 1.977722154551884, "language_loss": 0.88379419, "learning_rate": 1.793938832932277e-06, "loss": 0.90635765, "num_input_tokens_seen": 98115105, "step": 4548, "time_per_iteration": 2.73384428024292 }, { "auxiliary_loss_clip": 0.01182334, "auxiliary_loss_mlp": 0.01026735, "balance_loss_clip": 1.05122972, "balance_loss_mlp": 1.01933777, "epoch": 0.5469849095172248, "flos": 27527001505920.0, "grad_norm": 1.9389726926407547, "language_loss": 0.70361549, "learning_rate": 1.7931640224744185e-06, "loss": 0.72570622, "num_input_tokens_seen": 98135655, "step": 4549, "time_per_iteration": 2.6903538703918457 }, { "auxiliary_loss_clip": 0.01321598, "auxiliary_loss_mlp": 0.01028925, "balance_loss_clip": 1.0408783, "balance_loss_mlp": 1.0213728, "epoch": 0.5471051524078638, "flos": 27964680727680.0, "grad_norm": 1.612362214963138, "language_loss": 0.73541135, "learning_rate": 1.7923892433934127e-06, "loss": 0.75891662, "num_input_tokens_seen": 98156730, "step": 4550, "time_per_iteration": 2.7857728004455566 }, { "auxiliary_loss_clip": 0.01286035, "auxiliary_loss_mlp": 0.02572371, "balance_loss_clip": 1.05168438, "balance_loss_mlp": 1.00001192, "epoch": 0.547225395298503, "flos": 18150510389760.0, "grad_norm": 2.1548236682635062, "language_loss": 0.78824866, "learning_rate": 1.7916144958067939e-06, "loss": 0.82683277, "num_input_tokens_seen": 98174590, "step": 4551, "time_per_iteration": 2.6917479038238525 }, { "auxiliary_loss_clip": 0.01231756, "auxiliary_loss_mlp": 0.01026825, "balance_loss_clip": 1.05020177, "balance_loss_mlp": 1.01969051, "epoch": 0.5473456381891421, "flos": 21361498790400.0, "grad_norm": 1.956477709628978, "language_loss": 0.78853166, "learning_rate": 1.7908397798320905e-06, "loss": 0.81111747, "num_input_tokens_seen": 98194325, "step": 4552, "time_per_iteration": 2.717228412628174 }, { "auxiliary_loss_clip": 0.01233161, "auxiliary_loss_mlp": 0.02570686, "balance_loss_clip": 1.04976487, "balance_loss_mlp": 0.99998915, "epoch": 0.5474658810797811, "flos": 19932145908480.0, "grad_norm": 1.953683815224385, "language_loss": 0.7482565, "learning_rate": 1.7900650955868265e-06, "loss": 0.786295, "num_input_tokens_seen": 98213970, "step": 4553, "time_per_iteration": 2.634089469909668 }, { "auxiliary_loss_clip": 0.01230538, "auxiliary_loss_mlp": 0.02568502, "balance_loss_clip": 1.05175233, "balance_loss_mlp": 0.99999541, "epoch": 0.5475861239704203, "flos": 50476217264640.0, "grad_norm": 1.429820852738912, "language_loss": 0.76358128, "learning_rate": 1.7892904431885202e-06, "loss": 0.80157173, "num_input_tokens_seen": 98241145, "step": 4554, "time_per_iteration": 2.971467971801758 }, { "auxiliary_loss_clip": 0.0137719, "auxiliary_loss_mlp": 0.0103775, "balance_loss_clip": 1.0429678, "balance_loss_mlp": 1.03045464, "epoch": 0.5477063668610593, "flos": 20705123612160.0, "grad_norm": 3.4296478902106786, "language_loss": 0.75548542, "learning_rate": 1.788515822754686e-06, "loss": 0.77963477, "num_input_tokens_seen": 98261565, "step": 4555, "time_per_iteration": 2.786411762237549 }, { "auxiliary_loss_clip": 0.01336255, "auxiliary_loss_mlp": 0.01025055, "balance_loss_clip": 1.0455333, "balance_loss_mlp": 1.01732421, "epoch": 0.5478266097516984, "flos": 19609740408960.0, "grad_norm": 2.7209734079417354, "language_loss": 0.78468239, "learning_rate": 1.7877412344028335e-06, "loss": 0.80829543, "num_input_tokens_seen": 98281370, "step": 4556, "time_per_iteration": 3.6621856689453125 }, { "auxiliary_loss_clip": 0.01234482, "auxiliary_loss_mlp": 0.01039401, "balance_loss_clip": 1.05068612, "balance_loss_mlp": 1.03191435, "epoch": 0.5479468526423376, "flos": 12896599962240.0, "grad_norm": 2.2328470229357387, "language_loss": 0.7738322, "learning_rate": 1.7869666782504668e-06, "loss": 0.79657102, "num_input_tokens_seen": 98297950, "step": 4557, "time_per_iteration": 2.6051809787750244 }, { "auxiliary_loss_clip": 0.01274134, "auxiliary_loss_mlp": 0.01027028, "balance_loss_clip": 1.04632258, "balance_loss_mlp": 1.01992583, "epoch": 0.5480670955329766, "flos": 18588800142720.0, "grad_norm": 1.763582687951691, "language_loss": 0.68646902, "learning_rate": 1.7861921544150867e-06, "loss": 0.70948058, "num_input_tokens_seen": 98316800, "step": 4558, "time_per_iteration": 2.6647815704345703 }, { "auxiliary_loss_clip": 0.01420766, "auxiliary_loss_mlp": 0.02566921, "balance_loss_clip": 1.04587042, "balance_loss_mlp": 0.99998569, "epoch": 0.5481873384236157, "flos": 15954608338560.0, "grad_norm": 2.6158473172140715, "language_loss": 0.76695633, "learning_rate": 1.7854176630141856e-06, "loss": 0.80683315, "num_input_tokens_seen": 98333935, "step": 4559, "time_per_iteration": 3.6588268280029297 }, { "auxiliary_loss_clip": 0.01187977, "auxiliary_loss_mlp": 0.01029373, "balance_loss_clip": 1.05519128, "balance_loss_mlp": 1.02154672, "epoch": 0.5483075813142548, "flos": 22783812606720.0, "grad_norm": 2.403270253988939, "language_loss": 0.84664434, "learning_rate": 1.784643204165255e-06, "loss": 0.86881787, "num_input_tokens_seen": 98353255, "step": 4560, "time_per_iteration": 2.6644489765167236 }, { "auxiliary_loss_clip": 0.01227845, "auxiliary_loss_mlp": 0.01023994, "balance_loss_clip": 1.05223608, "balance_loss_mlp": 1.01677537, "epoch": 0.5484278242048939, "flos": 19317212046720.0, "grad_norm": 1.8351761355650127, "language_loss": 0.7749095, "learning_rate": 1.7838687779857783e-06, "loss": 0.79742789, "num_input_tokens_seen": 98371130, "step": 4561, "time_per_iteration": 3.598186492919922 }, { "auxiliary_loss_clip": 0.01278825, "auxiliary_loss_mlp": 0.01027723, "balance_loss_clip": 1.0482316, "balance_loss_mlp": 1.02014756, "epoch": 0.5485480670955329, "flos": 22816024128000.0, "grad_norm": 2.206789147246992, "language_loss": 0.64496541, "learning_rate": 1.7830943845932366e-06, "loss": 0.66803098, "num_input_tokens_seen": 98390455, "step": 4562, "time_per_iteration": 2.74924373626709 }, { "auxiliary_loss_clip": 0.01288911, "auxiliary_loss_mlp": 0.0102924, "balance_loss_clip": 1.05283606, "balance_loss_mlp": 1.02163458, "epoch": 0.5486683099861721, "flos": 22671304231680.0, "grad_norm": 2.7234820120898715, "language_loss": 0.75220937, "learning_rate": 1.7823200241051044e-06, "loss": 0.77539086, "num_input_tokens_seen": 98409370, "step": 4563, "time_per_iteration": 2.6385128498077393 }, { "auxiliary_loss_clip": 0.01186259, "auxiliary_loss_mlp": 0.01035216, "balance_loss_clip": 1.05574226, "balance_loss_mlp": 1.02739549, "epoch": 0.5487885528768112, "flos": 23149383275520.0, "grad_norm": 2.162138675944536, "language_loss": 0.80351186, "learning_rate": 1.7815456966388513e-06, "loss": 0.82572663, "num_input_tokens_seen": 98428465, "step": 4564, "time_per_iteration": 2.634920120239258 }, { "auxiliary_loss_clip": 0.012354, "auxiliary_loss_mlp": 0.01025486, "balance_loss_clip": 1.0469749, "balance_loss_mlp": 1.01849401, "epoch": 0.5489087957674502, "flos": 22053928245120.0, "grad_norm": 5.703596231249841, "language_loss": 0.8094312, "learning_rate": 1.780771402311943e-06, "loss": 0.83204007, "num_input_tokens_seen": 98447300, "step": 4565, "time_per_iteration": 3.554806709289551 }, { "auxiliary_loss_clip": 0.01285985, "auxiliary_loss_mlp": 0.01033624, "balance_loss_clip": 1.05234861, "balance_loss_mlp": 1.02543974, "epoch": 0.5490290386580894, "flos": 24315977191680.0, "grad_norm": 2.199132603250437, "language_loss": 0.78660667, "learning_rate": 1.7799971412418374e-06, "loss": 0.80980277, "num_input_tokens_seen": 98468695, "step": 4566, "time_per_iteration": 2.71284556388855 }, { "auxiliary_loss_clip": 0.01236602, "auxiliary_loss_mlp": 0.01028901, "balance_loss_clip": 1.04817593, "balance_loss_mlp": 1.02041292, "epoch": 0.5491492815487284, "flos": 18294942977280.0, "grad_norm": 3.3116210034817164, "language_loss": 0.74298841, "learning_rate": 1.7792229135459918e-06, "loss": 0.76564342, "num_input_tokens_seen": 98485345, "step": 4567, "time_per_iteration": 2.566453695297241 }, { "auxiliary_loss_clip": 0.01297039, "auxiliary_loss_mlp": 0.01001359, "balance_loss_clip": 1.02930641, "balance_loss_mlp": 1.00036931, "epoch": 0.5492695244393675, "flos": 64550257050240.0, "grad_norm": 0.7366953679895234, "language_loss": 0.61591882, "learning_rate": 1.7784487193418538e-06, "loss": 0.63890278, "num_input_tokens_seen": 98543195, "step": 4568, "time_per_iteration": 3.097522258758545 }, { "auxiliary_loss_clip": 0.01320614, "auxiliary_loss_mlp": 0.01028311, "balance_loss_clip": 1.04266357, "balance_loss_mlp": 1.02032447, "epoch": 0.5493897673300067, "flos": 17379579761280.0, "grad_norm": 2.3707279325351176, "language_loss": 0.60995948, "learning_rate": 1.7776745587468698e-06, "loss": 0.63344872, "num_input_tokens_seen": 98560620, "step": 4569, "time_per_iteration": 2.5947883129119873 }, { "auxiliary_loss_clip": 0.01183921, "auxiliary_loss_mlp": 0.01026625, "balance_loss_clip": 1.05203688, "balance_loss_mlp": 1.01950264, "epoch": 0.5495100102206457, "flos": 19901765980800.0, "grad_norm": 2.8430824723016763, "language_loss": 0.82338929, "learning_rate": 1.7769004318784776e-06, "loss": 0.84549487, "num_input_tokens_seen": 98578265, "step": 4570, "time_per_iteration": 2.704746961593628 }, { "auxiliary_loss_clip": 0.01231271, "auxiliary_loss_mlp": 0.01025451, "balance_loss_clip": 1.04989791, "balance_loss_mlp": 1.01835489, "epoch": 0.5496302531112848, "flos": 16727190992640.0, "grad_norm": 1.6934261350444662, "language_loss": 0.80537617, "learning_rate": 1.776126338854113e-06, "loss": 0.82794338, "num_input_tokens_seen": 98596055, "step": 4571, "time_per_iteration": 2.5802130699157715 }, { "auxiliary_loss_clip": 0.01238288, "auxiliary_loss_mlp": 0.01027132, "balance_loss_clip": 1.05776596, "balance_loss_mlp": 1.01962459, "epoch": 0.5497504960019239, "flos": 24572343536640.0, "grad_norm": 1.7851914431886786, "language_loss": 0.84453171, "learning_rate": 1.7753522797912044e-06, "loss": 0.86718589, "num_input_tokens_seen": 98616140, "step": 4572, "time_per_iteration": 2.6681549549102783 }, { "auxiliary_loss_clip": 0.01289196, "auxiliary_loss_mlp": 0.01028152, "balance_loss_clip": 1.04980206, "balance_loss_mlp": 1.02014112, "epoch": 0.549870738892563, "flos": 15450494912640.0, "grad_norm": 2.260560481052079, "language_loss": 0.70017695, "learning_rate": 1.7745782548071765e-06, "loss": 0.72335041, "num_input_tokens_seen": 98633035, "step": 4573, "time_per_iteration": 2.643233299255371 }, { "auxiliary_loss_clip": 0.01335236, "auxiliary_loss_mlp": 0.01025006, "balance_loss_clip": 1.05563998, "balance_loss_mlp": 1.01778483, "epoch": 0.549990981783202, "flos": 21069114082560.0, "grad_norm": 1.6912043240157393, "language_loss": 0.74257016, "learning_rate": 1.7738042640194482e-06, "loss": 0.76617259, "num_input_tokens_seen": 98652700, "step": 4574, "time_per_iteration": 2.688095808029175 }, { "auxiliary_loss_clip": 0.01183492, "auxiliary_loss_mlp": 0.01028838, "balance_loss_clip": 1.05232441, "balance_loss_mlp": 1.02102959, "epoch": 0.5501112246738411, "flos": 21395901041280.0, "grad_norm": 1.509259475590616, "language_loss": 0.70195031, "learning_rate": 1.7730303075454335e-06, "loss": 0.72407359, "num_input_tokens_seen": 98671590, "step": 4575, "time_per_iteration": 2.577312707901001 }, { "auxiliary_loss_clip": 0.01336363, "auxiliary_loss_mlp": 0.0102851, "balance_loss_clip": 1.04796553, "balance_loss_mlp": 1.02122593, "epoch": 0.5502314675644803, "flos": 17456931699840.0, "grad_norm": 1.9385994768995296, "language_loss": 0.84782892, "learning_rate": 1.7722563855025402e-06, "loss": 0.8714776, "num_input_tokens_seen": 98689620, "step": 4576, "time_per_iteration": 2.616804599761963 }, { "auxiliary_loss_clip": 0.01283625, "auxiliary_loss_mlp": 0.01022733, "balance_loss_clip": 1.04624486, "balance_loss_mlp": 1.01521957, "epoch": 0.5503517104551193, "flos": 24310410583680.0, "grad_norm": 2.527740756005029, "language_loss": 0.70783651, "learning_rate": 1.7714824980081721e-06, "loss": 0.73090011, "num_input_tokens_seen": 98708915, "step": 4577, "time_per_iteration": 2.648982286453247 }, { "auxiliary_loss_clip": 0.01230736, "auxiliary_loss_mlp": 0.01022201, "balance_loss_clip": 1.05358207, "balance_loss_mlp": 1.01505113, "epoch": 0.5504719533457584, "flos": 22419427086720.0, "grad_norm": 5.021141381762656, "language_loss": 0.73714876, "learning_rate": 1.7707086451797276e-06, "loss": 0.75967813, "num_input_tokens_seen": 98729790, "step": 4578, "time_per_iteration": 2.6209757328033447 }, { "auxiliary_loss_clip": 0.01230321, "auxiliary_loss_mlp": 0.010006, "balance_loss_clip": 1.00923371, "balance_loss_mlp": 0.99950927, "epoch": 0.5505921962363975, "flos": 67294155968640.0, "grad_norm": 0.7148361405147315, "language_loss": 0.52288961, "learning_rate": 1.7699348271345993e-06, "loss": 0.54519892, "num_input_tokens_seen": 98792415, "step": 4579, "time_per_iteration": 3.1932172775268555 }, { "auxiliary_loss_clip": 0.01292067, "auxiliary_loss_mlp": 0.00998735, "balance_loss_clip": 1.01070595, "balance_loss_mlp": 0.99767399, "epoch": 0.5507124391270366, "flos": 45685125578880.0, "grad_norm": 0.7071810800813072, "language_loss": 0.54426432, "learning_rate": 1.7691610439901753e-06, "loss": 0.56717229, "num_input_tokens_seen": 98855350, "step": 4580, "time_per_iteration": 3.328946828842163 }, { "auxiliary_loss_clip": 0.01235601, "auxiliary_loss_mlp": 0.01026215, "balance_loss_clip": 1.05187166, "balance_loss_mlp": 1.01912534, "epoch": 0.5508326820176757, "flos": 22273845264000.0, "grad_norm": 1.8088972977191407, "language_loss": 0.75374925, "learning_rate": 1.7683872958638367e-06, "loss": 0.77636737, "num_input_tokens_seen": 98874230, "step": 4581, "time_per_iteration": 2.602073907852173 }, { "auxiliary_loss_clip": 0.01281298, "auxiliary_loss_mlp": 0.01029888, "balance_loss_clip": 1.04812503, "balance_loss_mlp": 1.02235353, "epoch": 0.5509529249083148, "flos": 20012442762240.0, "grad_norm": 2.2097415108079685, "language_loss": 0.84065425, "learning_rate": 1.7676135828729614e-06, "loss": 0.86376619, "num_input_tokens_seen": 98893940, "step": 4582, "time_per_iteration": 3.5674266815185547 }, { "auxiliary_loss_clip": 0.0123105, "auxiliary_loss_mlp": 0.0102732, "balance_loss_clip": 1.05130231, "balance_loss_mlp": 1.01997924, "epoch": 0.5510731677989539, "flos": 21834801325440.0, "grad_norm": 2.217713858330521, "language_loss": 0.83121961, "learning_rate": 1.7668399051349205e-06, "loss": 0.85380328, "num_input_tokens_seen": 98913620, "step": 4583, "time_per_iteration": 2.808623790740967 }, { "auxiliary_loss_clip": 0.01332972, "auxiliary_loss_mlp": 0.01023862, "balance_loss_clip": 1.04965162, "balance_loss_mlp": 1.01614904, "epoch": 0.5511934106895929, "flos": 21467901853440.0, "grad_norm": 1.8908481794429806, "language_loss": 0.83432186, "learning_rate": 1.766066262767081e-06, "loss": 0.85789019, "num_input_tokens_seen": 98931460, "step": 4584, "time_per_iteration": 3.6763291358947754 }, { "auxiliary_loss_clip": 0.01276185, "auxiliary_loss_mlp": 0.01026947, "balance_loss_clip": 1.04996133, "balance_loss_mlp": 1.0195117, "epoch": 0.5513136535802321, "flos": 21068934514560.0, "grad_norm": 2.128123405659414, "language_loss": 0.77010506, "learning_rate": 1.765292655886803e-06, "loss": 0.79313636, "num_input_tokens_seen": 98950105, "step": 4585, "time_per_iteration": 2.6600000858306885 }, { "auxiliary_loss_clip": 0.01338911, "auxiliary_loss_mlp": 0.01028288, "balance_loss_clip": 1.04756963, "balance_loss_mlp": 1.02055407, "epoch": 0.5514338964708712, "flos": 27815004754560.0, "grad_norm": 3.6720807319292916, "language_loss": 0.70712692, "learning_rate": 1.764519084611443e-06, "loss": 0.7307989, "num_input_tokens_seen": 98970560, "step": 4586, "time_per_iteration": 2.7389307022094727 }, { "auxiliary_loss_clip": 0.01283338, "auxiliary_loss_mlp": 0.01024156, "balance_loss_clip": 1.04944062, "balance_loss_mlp": 1.01630056, "epoch": 0.5515541393615102, "flos": 21908525990400.0, "grad_norm": 2.0478188633420316, "language_loss": 0.78095233, "learning_rate": 1.7637455490583505e-06, "loss": 0.80402732, "num_input_tokens_seen": 98989885, "step": 4587, "time_per_iteration": 3.559061288833618 }, { "auxiliary_loss_clip": 0.01235237, "auxiliary_loss_mlp": 0.01027283, "balance_loss_clip": 1.05341852, "balance_loss_mlp": 1.0203836, "epoch": 0.5516743822521494, "flos": 20485422074880.0, "grad_norm": 2.1955105760240534, "language_loss": 0.7750935, "learning_rate": 1.7629720493448701e-06, "loss": 0.7977187, "num_input_tokens_seen": 99007180, "step": 4588, "time_per_iteration": 2.61098313331604 }, { "auxiliary_loss_clip": 0.01290539, "auxiliary_loss_mlp": 0.01026211, "balance_loss_clip": 1.04974961, "balance_loss_mlp": 1.01895165, "epoch": 0.5517946251427884, "flos": 14940383915520.0, "grad_norm": 1.8161021040661782, "language_loss": 0.8503474, "learning_rate": 1.7621985855883418e-06, "loss": 0.87351489, "num_input_tokens_seen": 99023880, "step": 4589, "time_per_iteration": 2.7922823429107666 }, { "auxiliary_loss_clip": 0.01275385, "auxiliary_loss_mlp": 0.01028398, "balance_loss_clip": 1.04781747, "balance_loss_mlp": 1.02100968, "epoch": 0.5519148680334275, "flos": 18404865573120.0, "grad_norm": 2.2286114814836293, "language_loss": 0.72150129, "learning_rate": 1.7614251579060983e-06, "loss": 0.74453908, "num_input_tokens_seen": 99042475, "step": 4590, "time_per_iteration": 3.622863531112671 }, { "auxiliary_loss_clip": 0.01232624, "auxiliary_loss_mlp": 0.01027211, "balance_loss_clip": 1.048141, "balance_loss_mlp": 1.01979637, "epoch": 0.5520351109240667, "flos": 25113337251840.0, "grad_norm": 1.8416988980400224, "language_loss": 0.84881127, "learning_rate": 1.76065176641547e-06, "loss": 0.87140965, "num_input_tokens_seen": 99065185, "step": 4591, "time_per_iteration": 2.771068811416626 }, { "auxiliary_loss_clip": 0.0123162, "auxiliary_loss_mlp": 0.01022859, "balance_loss_clip": 1.04757953, "balance_loss_mlp": 1.0154804, "epoch": 0.5521553538147057, "flos": 21069545045760.0, "grad_norm": 1.9072031534947504, "language_loss": 0.7813319, "learning_rate": 1.759878411233777e-06, "loss": 0.8038767, "num_input_tokens_seen": 99083645, "step": 4592, "time_per_iteration": 2.675658702850342 }, { "auxiliary_loss_clip": 0.01232721, "auxiliary_loss_mlp": 0.010318, "balance_loss_clip": 1.05220127, "balance_loss_mlp": 1.02446544, "epoch": 0.5522755967053448, "flos": 18879999701760.0, "grad_norm": 2.2654592697140687, "language_loss": 0.76516283, "learning_rate": 1.7591050924783388e-06, "loss": 0.78780806, "num_input_tokens_seen": 99100835, "step": 4593, "time_per_iteration": 2.568741798400879 }, { "auxiliary_loss_clip": 0.01277512, "auxiliary_loss_mlp": 0.01005862, "balance_loss_clip": 1.00898504, "balance_loss_mlp": 1.00483119, "epoch": 0.5523958395959839, "flos": 64675622494080.0, "grad_norm": 2.0976446927768464, "language_loss": 0.57864106, "learning_rate": 1.7583318102664661e-06, "loss": 0.60147488, "num_input_tokens_seen": 99168400, "step": 4594, "time_per_iteration": 3.356858491897583 }, { "auxiliary_loss_clip": 0.01236042, "auxiliary_loss_mlp": 0.01024194, "balance_loss_clip": 1.04795313, "balance_loss_mlp": 1.01660085, "epoch": 0.552516082486623, "flos": 10889732211840.0, "grad_norm": 1.9252315158910271, "language_loss": 0.79389113, "learning_rate": 1.757558564715466e-06, "loss": 0.81649351, "num_input_tokens_seen": 99186475, "step": 4595, "time_per_iteration": 2.5958051681518555 }, { "auxiliary_loss_clip": 0.01235232, "auxiliary_loss_mlp": 0.01030965, "balance_loss_clip": 1.05102575, "balance_loss_mlp": 1.02326679, "epoch": 0.552636325377262, "flos": 22199797376640.0, "grad_norm": 2.7147134546213865, "language_loss": 0.73973954, "learning_rate": 1.7567853559426386e-06, "loss": 0.76240152, "num_input_tokens_seen": 99203525, "step": 4596, "time_per_iteration": 2.6043500900268555 }, { "auxiliary_loss_clip": 0.01236613, "auxiliary_loss_mlp": 0.01030497, "balance_loss_clip": 1.05240512, "balance_loss_mlp": 1.02332091, "epoch": 0.5527565682679012, "flos": 23988184652160.0, "grad_norm": 2.950695040998139, "language_loss": 0.75328052, "learning_rate": 1.7560121840652797e-06, "loss": 0.77595162, "num_input_tokens_seen": 99222910, "step": 4597, "time_per_iteration": 2.717092752456665 }, { "auxiliary_loss_clip": 0.01322061, "auxiliary_loss_mlp": 0.01027472, "balance_loss_clip": 1.04667473, "balance_loss_mlp": 1.01961017, "epoch": 0.5528768111585403, "flos": 19719267955200.0, "grad_norm": 1.909875245096945, "language_loss": 0.69231892, "learning_rate": 1.7552390492006782e-06, "loss": 0.71581429, "num_input_tokens_seen": 99241230, "step": 4598, "time_per_iteration": 2.6672170162200928 }, { "auxiliary_loss_clip": 0.01388209, "auxiliary_loss_mlp": 0.02572355, "balance_loss_clip": 1.0440619, "balance_loss_mlp": 0.99999082, "epoch": 0.5529970540491793, "flos": 26215975002240.0, "grad_norm": 2.088140350309763, "language_loss": 0.64853007, "learning_rate": 1.7544659514661184e-06, "loss": 0.68813574, "num_input_tokens_seen": 99264320, "step": 4599, "time_per_iteration": 2.7999765872955322 }, { "auxiliary_loss_clip": 0.01278701, "auxiliary_loss_mlp": 0.01027726, "balance_loss_clip": 1.04581499, "balance_loss_mlp": 1.02039754, "epoch": 0.5531172969398185, "flos": 24425971614720.0, "grad_norm": 2.044326664767469, "language_loss": 0.79761249, "learning_rate": 1.7536928909788786e-06, "loss": 0.8206768, "num_input_tokens_seen": 99283625, "step": 4600, "time_per_iteration": 2.6741347312927246 }, { "auxiliary_loss_clip": 0.01185131, "auxiliary_loss_mlp": 0.01002928, "balance_loss_clip": 1.01003587, "balance_loss_mlp": 1.00190914, "epoch": 0.5532375398304575, "flos": 64907316195840.0, "grad_norm": 0.8788862221009657, "language_loss": 0.61936533, "learning_rate": 1.752919867856231e-06, "loss": 0.64124584, "num_input_tokens_seen": 99335270, "step": 4601, "time_per_iteration": 3.145951747894287 }, { "auxiliary_loss_clip": 0.0127482, "auxiliary_loss_mlp": 0.01029756, "balance_loss_clip": 1.04664063, "balance_loss_mlp": 1.02234745, "epoch": 0.5533577827210966, "flos": 19683105937920.0, "grad_norm": 1.771566795645043, "language_loss": 0.78894484, "learning_rate": 1.7521468822154436e-06, "loss": 0.81199062, "num_input_tokens_seen": 99354185, "step": 4602, "time_per_iteration": 2.6239748001098633 }, { "auxiliary_loss_clip": 0.01276727, "auxiliary_loss_mlp": 0.01025697, "balance_loss_clip": 1.05110717, "balance_loss_mlp": 1.01855624, "epoch": 0.5534780256117358, "flos": 32306496076800.0, "grad_norm": 2.4641819182355658, "language_loss": 0.75349146, "learning_rate": 1.751373934173777e-06, "loss": 0.77651572, "num_input_tokens_seen": 99376930, "step": 4603, "time_per_iteration": 2.730943441390991 }, { "auxiliary_loss_clip": 0.01184125, "auxiliary_loss_mlp": 0.01023418, "balance_loss_clip": 1.05093706, "balance_loss_mlp": 1.01603293, "epoch": 0.5535982685023748, "flos": 23222425582080.0, "grad_norm": 1.6760280858645769, "language_loss": 0.7351684, "learning_rate": 1.750601023848487e-06, "loss": 0.75724387, "num_input_tokens_seen": 99397655, "step": 4604, "time_per_iteration": 2.620096445083618 }, { "auxiliary_loss_clip": 0.01181776, "auxiliary_loss_mlp": 0.02568561, "balance_loss_clip": 1.05286217, "balance_loss_mlp": 1.00001252, "epoch": 0.5537185113930139, "flos": 24352534258560.0, "grad_norm": 2.0717294825212202, "language_loss": 0.74182379, "learning_rate": 1.749828151356823e-06, "loss": 0.77932715, "num_input_tokens_seen": 99417850, "step": 4605, "time_per_iteration": 2.6339786052703857 }, { "auxiliary_loss_clip": 0.01283453, "auxiliary_loss_mlp": 0.01029421, "balance_loss_clip": 1.05067563, "balance_loss_mlp": 1.02258742, "epoch": 0.553838754283653, "flos": 23549068886400.0, "grad_norm": 1.7188970466643212, "language_loss": 0.75727987, "learning_rate": 1.7490553168160297e-06, "loss": 0.78040862, "num_input_tokens_seen": 99438920, "step": 4606, "time_per_iteration": 2.645874500274658 }, { "auxiliary_loss_clip": 0.01282672, "auxiliary_loss_mlp": 0.01029457, "balance_loss_clip": 1.04875743, "balance_loss_mlp": 1.02234936, "epoch": 0.5539589971742921, "flos": 17275044205440.0, "grad_norm": 2.258578803181166, "language_loss": 0.76319629, "learning_rate": 1.748282520343345e-06, "loss": 0.78631759, "num_input_tokens_seen": 99457950, "step": 4607, "time_per_iteration": 2.6670987606048584 }, { "auxiliary_loss_clip": 0.01241606, "auxiliary_loss_mlp": 0.01027349, "balance_loss_clip": 1.05173945, "balance_loss_mlp": 1.01950538, "epoch": 0.5540792400649311, "flos": 27564169104000.0, "grad_norm": 2.245526154493969, "language_loss": 0.78669131, "learning_rate": 1.7475097620560023e-06, "loss": 0.80938089, "num_input_tokens_seen": 99478015, "step": 4608, "time_per_iteration": 3.649614095687866 }, { "auxiliary_loss_clip": 0.01183915, "auxiliary_loss_mlp": 0.01028948, "balance_loss_clip": 1.05322611, "balance_loss_mlp": 1.02134287, "epoch": 0.5541994829555702, "flos": 23878657105920.0, "grad_norm": 2.2809867914893207, "language_loss": 0.70971668, "learning_rate": 1.746737042071228e-06, "loss": 0.73184526, "num_input_tokens_seen": 99496520, "step": 4609, "time_per_iteration": 2.6223933696746826 }, { "auxiliary_loss_clip": 0.01273625, "auxiliary_loss_mlp": 0.01028577, "balance_loss_clip": 1.04935086, "balance_loss_mlp": 1.02122509, "epoch": 0.5543197258462094, "flos": 20115721342080.0, "grad_norm": 2.9668012067239125, "language_loss": 0.79102975, "learning_rate": 1.7459643605062424e-06, "loss": 0.81405175, "num_input_tokens_seen": 99513780, "step": 4610, "time_per_iteration": 3.5975637435913086 }, { "auxiliary_loss_clip": 0.01375754, "auxiliary_loss_mlp": 0.01029461, "balance_loss_clip": 1.04833913, "balance_loss_mlp": 1.02201891, "epoch": 0.5544399687368484, "flos": 20916565021440.0, "grad_norm": 3.5795488261394373, "language_loss": 0.80638444, "learning_rate": 1.745191717478262e-06, "loss": 0.83043659, "num_input_tokens_seen": 99532360, "step": 4611, "time_per_iteration": 2.77559232711792 }, { "auxiliary_loss_clip": 0.01284747, "auxiliary_loss_mlp": 0.01032397, "balance_loss_clip": 1.05256796, "balance_loss_mlp": 1.02535152, "epoch": 0.5545602116274875, "flos": 25518661297920.0, "grad_norm": 2.1804098760504633, "language_loss": 0.79651022, "learning_rate": 1.7444191131044948e-06, "loss": 0.81968164, "num_input_tokens_seen": 99552635, "step": 4612, "time_per_iteration": 2.7006454467773438 }, { "auxiliary_loss_clip": 0.01281095, "auxiliary_loss_mlp": 0.01027196, "balance_loss_clip": 1.0508945, "balance_loss_mlp": 1.01933396, "epoch": 0.5546804545181266, "flos": 20995568985600.0, "grad_norm": 2.1170997143835835, "language_loss": 0.72867382, "learning_rate": 1.7436465475021456e-06, "loss": 0.75175667, "num_input_tokens_seen": 99572685, "step": 4613, "time_per_iteration": 2.777911424636841 }, { "auxiliary_loss_clip": 0.01324994, "auxiliary_loss_mlp": 0.01030577, "balance_loss_clip": 1.0481894, "balance_loss_mlp": 1.02321613, "epoch": 0.5548006974087657, "flos": 26833638297600.0, "grad_norm": 2.4249659414913034, "language_loss": 0.71457011, "learning_rate": 1.7428740207884111e-06, "loss": 0.7381258, "num_input_tokens_seen": 99593565, "step": 4614, "time_per_iteration": 3.6802845001220703 }, { "auxiliary_loss_clip": 0.01285062, "auxiliary_loss_mlp": 0.0102707, "balance_loss_clip": 1.0479939, "balance_loss_mlp": 1.01973248, "epoch": 0.5549209402994048, "flos": 33656414031360.0, "grad_norm": 1.605776057227853, "language_loss": 0.61057639, "learning_rate": 1.7421015330804833e-06, "loss": 0.63369769, "num_input_tokens_seen": 99613485, "step": 4615, "time_per_iteration": 2.9002208709716797 }, { "auxiliary_loss_clip": 0.01182592, "auxiliary_loss_mlp": 0.01026041, "balance_loss_clip": 1.05117118, "balance_loss_mlp": 1.01842999, "epoch": 0.5550411831900439, "flos": 23769524609280.0, "grad_norm": 1.9911530438905423, "language_loss": 0.72632152, "learning_rate": 1.7413290844955475e-06, "loss": 0.7484079, "num_input_tokens_seen": 99633515, "step": 4616, "time_per_iteration": 3.5866050720214844 }, { "auxiliary_loss_clip": 0.0122906, "auxiliary_loss_mlp": 0.01032885, "balance_loss_clip": 1.05109692, "balance_loss_mlp": 1.0256846, "epoch": 0.555161426080683, "flos": 21651189978240.0, "grad_norm": 1.8771519641287169, "language_loss": 0.78017861, "learning_rate": 1.7405566751507843e-06, "loss": 0.80279803, "num_input_tokens_seen": 99651560, "step": 4617, "time_per_iteration": 2.5815443992614746 }, { "auxiliary_loss_clip": 0.01328748, "auxiliary_loss_mlp": 0.01030318, "balance_loss_clip": 1.04720891, "balance_loss_mlp": 1.02318931, "epoch": 0.555281668971322, "flos": 49563116605440.0, "grad_norm": 1.486885687801532, "language_loss": 0.67416346, "learning_rate": 1.7397843051633668e-06, "loss": 0.69775414, "num_input_tokens_seen": 99674255, "step": 4618, "time_per_iteration": 2.9624972343444824 }, { "auxiliary_loss_clip": 0.01230279, "auxiliary_loss_mlp": 0.01030175, "balance_loss_clip": 1.05186629, "balance_loss_mlp": 1.02240229, "epoch": 0.5554019118619612, "flos": 20741608851840.0, "grad_norm": 1.6952314365326349, "language_loss": 0.7139436, "learning_rate": 1.739011974650464e-06, "loss": 0.73654813, "num_input_tokens_seen": 99693585, "step": 4619, "time_per_iteration": 2.5826756954193115 }, { "auxiliary_loss_clip": 0.01381963, "auxiliary_loss_mlp": 0.0102563, "balance_loss_clip": 1.04642737, "balance_loss_mlp": 1.01754785, "epoch": 0.5555221547526003, "flos": 25483217552640.0, "grad_norm": 1.9855115091093647, "language_loss": 0.76956093, "learning_rate": 1.7382396837292365e-06, "loss": 0.79363686, "num_input_tokens_seen": 99714045, "step": 4620, "time_per_iteration": 2.773322820663452 }, { "auxiliary_loss_clip": 0.01183577, "auxiliary_loss_mlp": 0.0103249, "balance_loss_clip": 1.0527215, "balance_loss_mlp": 1.02445579, "epoch": 0.5556423976432393, "flos": 21762513204480.0, "grad_norm": 2.29905138215585, "language_loss": 0.73556507, "learning_rate": 1.737467432516841e-06, "loss": 0.75772572, "num_input_tokens_seen": 99734145, "step": 4621, "time_per_iteration": 2.607388734817505 }, { "auxiliary_loss_clip": 0.01282279, "auxiliary_loss_mlp": 0.01026685, "balance_loss_clip": 1.046296, "balance_loss_mlp": 1.01907325, "epoch": 0.5557626405338785, "flos": 24900171989760.0, "grad_norm": 3.4753835554769714, "language_loss": 0.74679452, "learning_rate": 1.7366952211304274e-06, "loss": 0.76988423, "num_input_tokens_seen": 99751990, "step": 4622, "time_per_iteration": 2.668647050857544 }, { "auxiliary_loss_clip": 0.0127583, "auxiliary_loss_mlp": 0.01025084, "balance_loss_clip": 1.04861283, "balance_loss_mlp": 1.01797307, "epoch": 0.5558828834245175, "flos": 18697501676160.0, "grad_norm": 2.7817891683496003, "language_loss": 0.83466876, "learning_rate": 1.735923049687139e-06, "loss": 0.85767794, "num_input_tokens_seen": 99768565, "step": 4623, "time_per_iteration": 2.618623971939087 }, { "auxiliary_loss_clip": 0.01278129, "auxiliary_loss_mlp": 0.01029634, "balance_loss_clip": 1.04580212, "balance_loss_mlp": 1.0219394, "epoch": 0.5560031263151566, "flos": 27272179445760.0, "grad_norm": 1.5203832898271694, "language_loss": 0.73956919, "learning_rate": 1.7351509183041144e-06, "loss": 0.76264679, "num_input_tokens_seen": 99788895, "step": 4624, "time_per_iteration": 2.712900161743164 }, { "auxiliary_loss_clip": 0.01186175, "auxiliary_loss_mlp": 0.01028498, "balance_loss_clip": 1.05372369, "balance_loss_mlp": 1.02072561, "epoch": 0.5561233692057957, "flos": 23403738458880.0, "grad_norm": 1.8114967153717172, "language_loss": 0.71930814, "learning_rate": 1.7343788270984852e-06, "loss": 0.74145484, "num_input_tokens_seen": 99808035, "step": 4625, "time_per_iteration": 2.61588454246521 }, { "auxiliary_loss_clip": 0.01281217, "auxiliary_loss_mlp": 0.01021129, "balance_loss_clip": 1.05168366, "balance_loss_mlp": 1.01360035, "epoch": 0.5562436120964348, "flos": 37670867804160.0, "grad_norm": 1.9595745358462602, "language_loss": 0.74957156, "learning_rate": 1.7336067761873764e-06, "loss": 0.77259493, "num_input_tokens_seen": 99830460, "step": 4626, "time_per_iteration": 2.8278305530548096 }, { "auxiliary_loss_clip": 0.01238717, "auxiliary_loss_mlp": 0.01032531, "balance_loss_clip": 1.05077291, "balance_loss_mlp": 1.02431726, "epoch": 0.5563638549870739, "flos": 25155245445120.0, "grad_norm": 1.8352517639817474, "language_loss": 0.76375127, "learning_rate": 1.7328347656879076e-06, "loss": 0.78646374, "num_input_tokens_seen": 99850320, "step": 4627, "time_per_iteration": 2.641871213912964 }, { "auxiliary_loss_clip": 0.01333469, "auxiliary_loss_mlp": 0.01026631, "balance_loss_clip": 1.04497838, "balance_loss_mlp": 1.01938021, "epoch": 0.556484097877713, "flos": 13581810783360.0, "grad_norm": 4.499199770203564, "language_loss": 0.68640804, "learning_rate": 1.7320627957171927e-06, "loss": 0.7100091, "num_input_tokens_seen": 99864980, "step": 4628, "time_per_iteration": 2.643577814102173 }, { "auxiliary_loss_clip": 0.01184262, "auxiliary_loss_mlp": 0.01026117, "balance_loss_clip": 1.05455232, "balance_loss_mlp": 1.01828504, "epoch": 0.5566043407683521, "flos": 24681368292480.0, "grad_norm": 2.9036872126386633, "language_loss": 0.81494492, "learning_rate": 1.7312908663923382e-06, "loss": 0.83704865, "num_input_tokens_seen": 99881155, "step": 4629, "time_per_iteration": 2.647935628890991 }, { "auxiliary_loss_clip": 0.01225841, "auxiliary_loss_mlp": 0.01024729, "balance_loss_clip": 1.04697347, "balance_loss_mlp": 1.0170517, "epoch": 0.5567245836589911, "flos": 20588161950720.0, "grad_norm": 2.756942277045708, "language_loss": 0.67861199, "learning_rate": 1.7305189778304463e-06, "loss": 0.70111763, "num_input_tokens_seen": 99899330, "step": 4630, "time_per_iteration": 2.5871338844299316 }, { "auxiliary_loss_clip": 0.01281891, "auxiliary_loss_mlp": 0.0102837, "balance_loss_clip": 1.05186605, "balance_loss_mlp": 1.02117562, "epoch": 0.5568448265496303, "flos": 20704189858560.0, "grad_norm": 1.8372563623035347, "language_loss": 0.800623, "learning_rate": 1.729747130148611e-06, "loss": 0.82372564, "num_input_tokens_seen": 99918525, "step": 4631, "time_per_iteration": 2.657994031906128 }, { "auxiliary_loss_clip": 0.01334805, "auxiliary_loss_mlp": 0.01026817, "balance_loss_clip": 1.04736495, "balance_loss_mlp": 1.01832628, "epoch": 0.5569650694402694, "flos": 25302910256640.0, "grad_norm": 1.9403013848577986, "language_loss": 0.76982647, "learning_rate": 1.7289753234639208e-06, "loss": 0.79344273, "num_input_tokens_seen": 99937500, "step": 4632, "time_per_iteration": 2.7456836700439453 }, { "auxiliary_loss_clip": 0.0123661, "auxiliary_loss_mlp": 0.01029433, "balance_loss_clip": 1.05145121, "balance_loss_mlp": 1.02158344, "epoch": 0.5570853123309084, "flos": 19712623939200.0, "grad_norm": 2.2537286677341184, "language_loss": 0.76653457, "learning_rate": 1.7282035578934592e-06, "loss": 0.789195, "num_input_tokens_seen": 99955665, "step": 4633, "time_per_iteration": 2.653689384460449 }, { "auxiliary_loss_clip": 0.01274655, "auxiliary_loss_mlp": 0.01029777, "balance_loss_clip": 1.05029702, "balance_loss_mlp": 1.02181363, "epoch": 0.5572055552215476, "flos": 16108091153280.0, "grad_norm": 1.8241874152798876, "language_loss": 0.7920717, "learning_rate": 1.727431833554301e-06, "loss": 0.81511593, "num_input_tokens_seen": 99974140, "step": 4634, "time_per_iteration": 3.549848794937134 }, { "auxiliary_loss_clip": 0.01339411, "auxiliary_loss_mlp": 0.01028095, "balance_loss_clip": 1.04276419, "balance_loss_mlp": 1.02076352, "epoch": 0.5573257981121866, "flos": 17128815937920.0, "grad_norm": 2.1164139413523437, "language_loss": 0.7776686, "learning_rate": 1.7266601505635175e-06, "loss": 0.80134362, "num_input_tokens_seen": 99991480, "step": 4635, "time_per_iteration": 2.6800999641418457 }, { "auxiliary_loss_clip": 0.0123185, "auxiliary_loss_mlp": 0.01033281, "balance_loss_clip": 1.05145025, "balance_loss_mlp": 1.0258007, "epoch": 0.5574460410028257, "flos": 18807029222400.0, "grad_norm": 3.1189186649040668, "language_loss": 0.75888985, "learning_rate": 1.7258885090381717e-06, "loss": 0.78154117, "num_input_tokens_seen": 100009520, "step": 4636, "time_per_iteration": 2.6703035831451416 }, { "auxiliary_loss_clip": 0.01285851, "auxiliary_loss_mlp": 0.01028033, "balance_loss_clip": 1.04968286, "balance_loss_mlp": 1.02054393, "epoch": 0.5575662838934649, "flos": 29642678530560.0, "grad_norm": 1.8935357211592778, "language_loss": 0.78752142, "learning_rate": 1.7251169090953213e-06, "loss": 0.8106603, "num_input_tokens_seen": 100029995, "step": 4637, "time_per_iteration": 3.7283318042755127 }, { "auxiliary_loss_clip": 0.01228387, "auxiliary_loss_mlp": 0.01026141, "balance_loss_clip": 1.04929137, "balance_loss_mlp": 1.01868403, "epoch": 0.5576865267841039, "flos": 22054466949120.0, "grad_norm": 3.7337356271998283, "language_loss": 0.76413727, "learning_rate": 1.7243453508520168e-06, "loss": 0.78668255, "num_input_tokens_seen": 100046980, "step": 4638, "time_per_iteration": 2.71344256401062 }, { "auxiliary_loss_clip": 0.01282653, "auxiliary_loss_mlp": 0.01033313, "balance_loss_clip": 1.04839945, "balance_loss_mlp": 1.02540684, "epoch": 0.557806769674743, "flos": 17196040241280.0, "grad_norm": 1.9499342363012724, "language_loss": 0.84583902, "learning_rate": 1.7235738344253038e-06, "loss": 0.86899865, "num_input_tokens_seen": 100060610, "step": 4639, "time_per_iteration": 2.6307010650634766 }, { "auxiliary_loss_clip": 0.01237735, "auxiliary_loss_mlp": 0.01026838, "balance_loss_clip": 1.05596423, "balance_loss_mlp": 1.01892841, "epoch": 0.557927012565382, "flos": 24712717887360.0, "grad_norm": 1.988011514503129, "language_loss": 0.82837552, "learning_rate": 1.72280235993222e-06, "loss": 0.85102123, "num_input_tokens_seen": 100078915, "step": 4640, "time_per_iteration": 3.596297264099121 }, { "auxiliary_loss_clip": 0.01231687, "auxiliary_loss_mlp": 0.02573475, "balance_loss_clip": 1.05292463, "balance_loss_mlp": 1.0000391, "epoch": 0.5580472554560212, "flos": 16983090460800.0, "grad_norm": 2.4213848103777327, "language_loss": 0.69776511, "learning_rate": 1.722030927489798e-06, "loss": 0.73581672, "num_input_tokens_seen": 100096195, "step": 4641, "time_per_iteration": 3.5228207111358643 }, { "auxiliary_loss_clip": 0.01332229, "auxiliary_loss_mlp": 0.01029436, "balance_loss_clip": 1.04988527, "balance_loss_mlp": 1.02099657, "epoch": 0.5581674983466602, "flos": 23509100027520.0, "grad_norm": 2.324881766587197, "language_loss": 0.74025601, "learning_rate": 1.7212595372150634e-06, "loss": 0.76387268, "num_input_tokens_seen": 100116175, "step": 4642, "time_per_iteration": 2.6850671768188477 }, { "auxiliary_loss_clip": 0.01180601, "auxiliary_loss_mlp": 0.01029287, "balance_loss_clip": 1.05148935, "balance_loss_mlp": 1.02183008, "epoch": 0.5582877412372993, "flos": 13480291969920.0, "grad_norm": 2.1310174329459106, "language_loss": 0.7307834, "learning_rate": 1.720488189225035e-06, "loss": 0.7528823, "num_input_tokens_seen": 100133875, "step": 4643, "time_per_iteration": 2.5933361053466797 }, { "auxiliary_loss_clip": 0.012352, "auxiliary_loss_mlp": 0.01023942, "balance_loss_clip": 1.0506165, "balance_loss_mlp": 1.01636016, "epoch": 0.5584079841279385, "flos": 21903605827200.0, "grad_norm": 2.3637434539954802, "language_loss": 0.79030955, "learning_rate": 1.7197168836367265e-06, "loss": 0.8129009, "num_input_tokens_seen": 100150685, "step": 4644, "time_per_iteration": 2.614452600479126 }, { "auxiliary_loss_clip": 0.01228032, "auxiliary_loss_mlp": 0.02568718, "balance_loss_clip": 1.04883265, "balance_loss_mlp": 1.00001359, "epoch": 0.5585282270185775, "flos": 18843550375680.0, "grad_norm": 2.1452355535023044, "language_loss": 0.81934577, "learning_rate": 1.7189456205671433e-06, "loss": 0.85731328, "num_input_tokens_seen": 100169530, "step": 4645, "time_per_iteration": 2.6258387565612793 }, { "auxiliary_loss_clip": 0.01139764, "auxiliary_loss_mlp": 0.01030478, "balance_loss_clip": 1.05206752, "balance_loss_mlp": 1.02211523, "epoch": 0.5586484699092166, "flos": 21868449390720.0, "grad_norm": 1.9522779551608997, "language_loss": 0.82291847, "learning_rate": 1.7181744001332866e-06, "loss": 0.84462088, "num_input_tokens_seen": 100188140, "step": 4646, "time_per_iteration": 2.6066930294036865 }, { "auxiliary_loss_clip": 0.01181779, "auxiliary_loss_mlp": 0.01027845, "balance_loss_clip": 1.0532968, "balance_loss_mlp": 1.02060318, "epoch": 0.5587687127998557, "flos": 22893232412160.0, "grad_norm": 1.941043804896725, "language_loss": 0.63564718, "learning_rate": 1.7174032224521493e-06, "loss": 0.65774345, "num_input_tokens_seen": 100206850, "step": 4647, "time_per_iteration": 2.6045773029327393 }, { "auxiliary_loss_clip": 0.01231789, "auxiliary_loss_mlp": 0.01029787, "balance_loss_clip": 1.05070364, "balance_loss_mlp": 1.0223093, "epoch": 0.5588889556904948, "flos": 20303067703680.0, "grad_norm": 1.8025397438608308, "language_loss": 0.69516063, "learning_rate": 1.7166320876407184e-06, "loss": 0.71777642, "num_input_tokens_seen": 100226270, "step": 4648, "time_per_iteration": 2.579118251800537 }, { "auxiliary_loss_clip": 0.01184809, "auxiliary_loss_mlp": 0.02572379, "balance_loss_clip": 1.05292201, "balance_loss_mlp": 1.00003338, "epoch": 0.5590091985811338, "flos": 16472153450880.0, "grad_norm": 1.9055282525413095, "language_loss": 0.6774435, "learning_rate": 1.7158609958159742e-06, "loss": 0.71501535, "num_input_tokens_seen": 100243675, "step": 4649, "time_per_iteration": 2.6062703132629395 }, { "auxiliary_loss_clip": 0.01352273, "auxiliary_loss_mlp": 0.01029042, "balance_loss_clip": 1.05060911, "balance_loss_mlp": 1.02115631, "epoch": 0.559129441471773, "flos": 14532186781440.0, "grad_norm": 2.140975441452478, "language_loss": 0.78457433, "learning_rate": 1.7150899470948911e-06, "loss": 0.80838752, "num_input_tokens_seen": 100258940, "step": 4650, "time_per_iteration": 2.778404712677002 }, { "auxiliary_loss_clip": 0.01175333, "auxiliary_loss_mlp": 0.01012218, "balance_loss_clip": 1.01083279, "balance_loss_mlp": 1.01122212, "epoch": 0.5592496843624121, "flos": 60521009852160.0, "grad_norm": 0.8036619941416573, "language_loss": 0.56622702, "learning_rate": 1.7143189415944365e-06, "loss": 0.58810252, "num_input_tokens_seen": 100323400, "step": 4651, "time_per_iteration": 3.252281665802002 }, { "auxiliary_loss_clip": 0.01228853, "auxiliary_loss_mlp": 0.01029665, "balance_loss_clip": 1.05189455, "balance_loss_mlp": 1.02201223, "epoch": 0.5593699272530511, "flos": 20886256920960.0, "grad_norm": 1.8019906175585627, "language_loss": 0.76631767, "learning_rate": 1.7135479794315714e-06, "loss": 0.78890288, "num_input_tokens_seen": 100340355, "step": 4652, "time_per_iteration": 2.6411983966827393 }, { "auxiliary_loss_clip": 0.01326746, "auxiliary_loss_mlp": 0.01029165, "balance_loss_clip": 1.04812968, "balance_loss_mlp": 1.02196157, "epoch": 0.5594901701436903, "flos": 12896743616640.0, "grad_norm": 3.443036862145537, "language_loss": 0.78918296, "learning_rate": 1.7127770607232502e-06, "loss": 0.81274211, "num_input_tokens_seen": 100358900, "step": 4653, "time_per_iteration": 2.619926929473877 }, { "auxiliary_loss_clip": 0.01335633, "auxiliary_loss_mlp": 0.01025404, "balance_loss_clip": 1.04651523, "balance_loss_mlp": 1.01837111, "epoch": 0.5596104130343293, "flos": 23112107936640.0, "grad_norm": 4.770354925760323, "language_loss": 0.79737961, "learning_rate": 1.7120061855864204e-06, "loss": 0.82099003, "num_input_tokens_seen": 100378910, "step": 4654, "time_per_iteration": 2.747706174850464 }, { "auxiliary_loss_clip": 0.0123117, "auxiliary_loss_mlp": 0.01030956, "balance_loss_clip": 1.05188608, "balance_loss_mlp": 1.02367282, "epoch": 0.5597306559249684, "flos": 25957812977280.0, "grad_norm": 1.9998948507211953, "language_loss": 0.71432221, "learning_rate": 1.7112353541380233e-06, "loss": 0.73694342, "num_input_tokens_seen": 100398770, "step": 4655, "time_per_iteration": 2.7289676666259766 }, { "auxiliary_loss_clip": 0.01280903, "auxiliary_loss_mlp": 0.01035243, "balance_loss_clip": 1.05080879, "balance_loss_mlp": 1.02807307, "epoch": 0.5598508988156076, "flos": 22492289825280.0, "grad_norm": 1.4554971271600567, "language_loss": 0.72495371, "learning_rate": 1.7104645664949931e-06, "loss": 0.74811518, "num_input_tokens_seen": 100421240, "step": 4656, "time_per_iteration": 2.8559632301330566 }, { "auxiliary_loss_clip": 0.0127771, "auxiliary_loss_mlp": 0.01029477, "balance_loss_clip": 1.04540801, "balance_loss_mlp": 1.02130556, "epoch": 0.5599711417062466, "flos": 23112538899840.0, "grad_norm": 1.8418306429117164, "language_loss": 0.71602023, "learning_rate": 1.7096938227742584e-06, "loss": 0.73909211, "num_input_tokens_seen": 100442370, "step": 4657, "time_per_iteration": 2.662653684616089 }, { "auxiliary_loss_clip": 0.01182643, "auxiliary_loss_mlp": 0.01028347, "balance_loss_clip": 1.05259192, "balance_loss_mlp": 1.0205332, "epoch": 0.5600913845968857, "flos": 22339345714560.0, "grad_norm": 1.8499586851300305, "language_loss": 0.84233725, "learning_rate": 1.70892312309274e-06, "loss": 0.86444718, "num_input_tokens_seen": 100460260, "step": 4658, "time_per_iteration": 2.54872465133667 }, { "auxiliary_loss_clip": 0.01280643, "auxiliary_loss_mlp": 0.01024743, "balance_loss_clip": 1.04306233, "balance_loss_mlp": 1.01711941, "epoch": 0.5602116274875248, "flos": 17633791290240.0, "grad_norm": 2.5275863332042934, "language_loss": 0.68290979, "learning_rate": 1.7081524675673523e-06, "loss": 0.70596361, "num_input_tokens_seen": 100475750, "step": 4659, "time_per_iteration": 2.668945789337158 }, { "auxiliary_loss_clip": 0.01178797, "auxiliary_loss_mlp": 0.01001901, "balance_loss_clip": 1.01051688, "balance_loss_mlp": 1.00095892, "epoch": 0.5603318703781639, "flos": 70115945529600.0, "grad_norm": 0.7765296955693761, "language_loss": 0.59574258, "learning_rate": 1.7073818563150026e-06, "loss": 0.61754954, "num_input_tokens_seen": 100537830, "step": 4660, "time_per_iteration": 4.205419063568115 }, { "auxiliary_loss_clip": 0.01231048, "auxiliary_loss_mlp": 0.01026662, "balance_loss_clip": 1.04844224, "balance_loss_mlp": 1.01829314, "epoch": 0.560452113268803, "flos": 18545850455040.0, "grad_norm": 2.7674134060749975, "language_loss": 0.86828279, "learning_rate": 1.7066112894525935e-06, "loss": 0.89085984, "num_input_tokens_seen": 100555910, "step": 4661, "time_per_iteration": 2.65032958984375 }, { "auxiliary_loss_clip": 0.01274972, "auxiliary_loss_mlp": 0.01033722, "balance_loss_clip": 1.04596639, "balance_loss_mlp": 1.02563953, "epoch": 0.5605723561594421, "flos": 25264665250560.0, "grad_norm": 2.255901216324383, "language_loss": 0.72723269, "learning_rate": 1.7058407670970177e-06, "loss": 0.75031966, "num_input_tokens_seen": 100577385, "step": 4662, "time_per_iteration": 2.693502426147461 }, { "auxiliary_loss_clip": 0.01138393, "auxiliary_loss_mlp": 0.01031612, "balance_loss_clip": 1.05013776, "balance_loss_mlp": 1.02397633, "epoch": 0.5606925990500812, "flos": 20594949621120.0, "grad_norm": 2.712008108302704, "language_loss": 0.614223, "learning_rate": 1.7050702893651643e-06, "loss": 0.63592303, "num_input_tokens_seen": 100596965, "step": 4663, "time_per_iteration": 3.478433132171631 }, { "auxiliary_loss_clip": 0.01232093, "auxiliary_loss_mlp": 0.01032615, "balance_loss_clip": 1.05217409, "balance_loss_mlp": 1.02510476, "epoch": 0.5608128419407202, "flos": 35006044677120.0, "grad_norm": 5.566435952659172, "language_loss": 0.75625336, "learning_rate": 1.7042998563739134e-06, "loss": 0.7789005, "num_input_tokens_seen": 100615315, "step": 4664, "time_per_iteration": 2.7922983169555664 }, { "auxiliary_loss_clip": 0.01288865, "auxiliary_loss_mlp": 0.01030156, "balance_loss_clip": 1.04789448, "balance_loss_mlp": 1.02208591, "epoch": 0.5609330848313594, "flos": 24639819235200.0, "grad_norm": 2.0382645362397733, "language_loss": 0.7170819, "learning_rate": 1.703529468240139e-06, "loss": 0.74027205, "num_input_tokens_seen": 100634185, "step": 4665, "time_per_iteration": 3.594461679458618 }, { "auxiliary_loss_clip": 0.01275824, "auxiliary_loss_mlp": 0.01028777, "balance_loss_clip": 1.04950833, "balance_loss_mlp": 1.02045608, "epoch": 0.5610533277219985, "flos": 18762894385920.0, "grad_norm": 2.5684094060693066, "language_loss": 0.73723954, "learning_rate": 1.7027591250807088e-06, "loss": 0.76028562, "num_input_tokens_seen": 100651360, "step": 4666, "time_per_iteration": 2.6891298294067383 }, { "auxiliary_loss_clip": 0.01183357, "auxiliary_loss_mlp": 0.01025575, "balance_loss_clip": 1.05176854, "balance_loss_mlp": 1.01812708, "epoch": 0.5611735706126375, "flos": 15012384727680.0, "grad_norm": 4.527439790597977, "language_loss": 0.84441602, "learning_rate": 1.7019888270124825e-06, "loss": 0.86650527, "num_input_tokens_seen": 100668525, "step": 4667, "time_per_iteration": 2.609100341796875 }, { "auxiliary_loss_clip": 0.01233213, "auxiliary_loss_mlp": 0.01031937, "balance_loss_clip": 1.05086374, "balance_loss_mlp": 1.02515149, "epoch": 0.5612938135032767, "flos": 16468167041280.0, "grad_norm": 2.139555887559749, "language_loss": 0.82388103, "learning_rate": 1.7012185741523147e-06, "loss": 0.84653252, "num_input_tokens_seen": 100684850, "step": 4668, "time_per_iteration": 3.48781418800354 }, { "auxiliary_loss_clip": 0.01184551, "auxiliary_loss_mlp": 0.01026453, "balance_loss_clip": 1.05373824, "balance_loss_mlp": 1.01868951, "epoch": 0.5614140563939157, "flos": 25666433850240.0, "grad_norm": 3.1512086763689324, "language_loss": 0.6303333, "learning_rate": 1.7004483666170514e-06, "loss": 0.65244341, "num_input_tokens_seen": 100705345, "step": 4669, "time_per_iteration": 2.658010721206665 }, { "auxiliary_loss_clip": 0.01225372, "auxiliary_loss_mlp": 0.01029865, "balance_loss_clip": 1.04705572, "balance_loss_mlp": 1.02256596, "epoch": 0.5615342992845548, "flos": 24717566223360.0, "grad_norm": 1.9576714254078929, "language_loss": 0.80472094, "learning_rate": 1.699678204523533e-06, "loss": 0.82727325, "num_input_tokens_seen": 100725210, "step": 4670, "time_per_iteration": 2.6189095973968506 }, { "auxiliary_loss_clip": 0.01282686, "auxiliary_loss_mlp": 0.01031352, "balance_loss_clip": 1.05229354, "balance_loss_mlp": 1.02298915, "epoch": 0.5616545421751938, "flos": 22015934634240.0, "grad_norm": 4.330300691770355, "language_loss": 0.68810105, "learning_rate": 1.6989080879885918e-06, "loss": 0.71124136, "num_input_tokens_seen": 100743070, "step": 4671, "time_per_iteration": 2.6749045848846436 }, { "auxiliary_loss_clip": 0.0122647, "auxiliary_loss_mlp": 0.01001655, "balance_loss_clip": 1.01111853, "balance_loss_mlp": 1.00070155, "epoch": 0.561774785065833, "flos": 53760358690560.0, "grad_norm": 0.9017038453871216, "language_loss": 0.60913855, "learning_rate": 1.6981380171290544e-06, "loss": 0.63141984, "num_input_tokens_seen": 100804095, "step": 4672, "time_per_iteration": 3.240976572036743 }, { "auxiliary_loss_clip": 0.01179461, "auxiliary_loss_mlp": 0.01028836, "balance_loss_clip": 1.04478621, "balance_loss_mlp": 1.0205152, "epoch": 0.5618950279564721, "flos": 19750007018880.0, "grad_norm": 1.9512698036326073, "language_loss": 0.74541628, "learning_rate": 1.6973679920617396e-06, "loss": 0.76749921, "num_input_tokens_seen": 100821630, "step": 4673, "time_per_iteration": 2.66615891456604 }, { "auxiliary_loss_clip": 0.0128069, "auxiliary_loss_mlp": 0.01022279, "balance_loss_clip": 1.05287766, "balance_loss_mlp": 1.01480508, "epoch": 0.5620152708471111, "flos": 16800592435200.0, "grad_norm": 2.808089972360874, "language_loss": 0.85596716, "learning_rate": 1.6965980129034603e-06, "loss": 0.87899685, "num_input_tokens_seen": 100839015, "step": 4674, "time_per_iteration": 2.6055829524993896 }, { "auxiliary_loss_clip": 0.01281279, "auxiliary_loss_mlp": 0.01024722, "balance_loss_clip": 1.05031753, "balance_loss_mlp": 1.01740301, "epoch": 0.5621355137377503, "flos": 26797799502720.0, "grad_norm": 1.7941612388210035, "language_loss": 0.7662794, "learning_rate": 1.6958280797710209e-06, "loss": 0.78933942, "num_input_tokens_seen": 100860940, "step": 4675, "time_per_iteration": 2.7271995544433594 }, { "auxiliary_loss_clip": 0.01169866, "auxiliary_loss_mlp": 0.00995839, "balance_loss_clip": 1.0104413, "balance_loss_mlp": 0.9948799, "epoch": 0.5622557566283893, "flos": 61207046686080.0, "grad_norm": 0.7130211753556019, "language_loss": 0.54751605, "learning_rate": 1.6950581927812198e-06, "loss": 0.5691731, "num_input_tokens_seen": 100920510, "step": 4676, "time_per_iteration": 3.107309579849243 }, { "auxiliary_loss_clip": 0.01231912, "auxiliary_loss_mlp": 0.01026716, "balance_loss_clip": 1.04997468, "balance_loss_mlp": 1.01946211, "epoch": 0.5623759995190284, "flos": 26468534505600.0, "grad_norm": 3.6765822988844357, "language_loss": 0.79359126, "learning_rate": 1.6942883520508486e-06, "loss": 0.81617749, "num_input_tokens_seen": 100939245, "step": 4677, "time_per_iteration": 2.649785041809082 }, { "auxiliary_loss_clip": 0.01134995, "auxiliary_loss_mlp": 0.01026436, "balance_loss_clip": 1.05162096, "balance_loss_mlp": 1.0193578, "epoch": 0.5624962424096676, "flos": 19390900798080.0, "grad_norm": 5.234084230818149, "language_loss": 0.77151108, "learning_rate": 1.693518557696691e-06, "loss": 0.79312539, "num_input_tokens_seen": 100958385, "step": 4678, "time_per_iteration": 2.668201208114624 }, { "auxiliary_loss_clip": 0.01228871, "auxiliary_loss_mlp": 0.01025598, "balance_loss_clip": 1.04681706, "balance_loss_mlp": 1.01789761, "epoch": 0.5626164853003066, "flos": 20667345482880.0, "grad_norm": 4.373260342334784, "language_loss": 0.89272559, "learning_rate": 1.6927488098355252e-06, "loss": 0.91527021, "num_input_tokens_seen": 100976015, "step": 4679, "time_per_iteration": 2.5730412006378174 }, { "auxiliary_loss_clip": 0.0119205, "auxiliary_loss_mlp": 0.00997553, "balance_loss_clip": 1.01012266, "balance_loss_mlp": 0.99659389, "epoch": 0.5627367281909457, "flos": 62766071665920.0, "grad_norm": 0.9146476598368409, "language_loss": 0.6318121, "learning_rate": 1.6919791085841201e-06, "loss": 0.65370822, "num_input_tokens_seen": 101033425, "step": 4680, "time_per_iteration": 3.2836501598358154 }, { "auxiliary_loss_clip": 0.01229185, "auxiliary_loss_mlp": 0.01024392, "balance_loss_clip": 1.04669976, "balance_loss_mlp": 1.01653063, "epoch": 0.5628569710815848, "flos": 12787144243200.0, "grad_norm": 2.5717088151973484, "language_loss": 0.78960043, "learning_rate": 1.6912094540592396e-06, "loss": 0.81213617, "num_input_tokens_seen": 101048945, "step": 4681, "time_per_iteration": 2.5542104244232178 }, { "auxiliary_loss_clip": 0.01227957, "auxiliary_loss_mlp": 0.01031893, "balance_loss_clip": 1.04979849, "balance_loss_mlp": 1.02438867, "epoch": 0.5629772139722239, "flos": 13762082165760.0, "grad_norm": 2.7073232235888414, "language_loss": 0.81776285, "learning_rate": 1.6904398463776393e-06, "loss": 0.84036136, "num_input_tokens_seen": 101062745, "step": 4682, "time_per_iteration": 2.610830068588257 }, { "auxiliary_loss_clip": 0.01234636, "auxiliary_loss_mlp": 0.01023576, "balance_loss_clip": 1.04955316, "balance_loss_mlp": 1.01622939, "epoch": 0.5630974568628629, "flos": 21467830026240.0, "grad_norm": 2.3524799253325135, "language_loss": 0.72755492, "learning_rate": 1.6896702856560683e-06, "loss": 0.75013697, "num_input_tokens_seen": 101081840, "step": 4683, "time_per_iteration": 2.601748466491699 }, { "auxiliary_loss_clip": 0.01325958, "auxiliary_loss_mlp": 0.01029405, "balance_loss_clip": 1.04162431, "balance_loss_mlp": 1.02182364, "epoch": 0.5632176997535021, "flos": 14245907385600.0, "grad_norm": 3.531567475944299, "language_loss": 0.69412905, "learning_rate": 1.6889007720112677e-06, "loss": 0.71768272, "num_input_tokens_seen": 101099585, "step": 4684, "time_per_iteration": 2.730217695236206 }, { "auxiliary_loss_clip": 0.01232737, "auxiliary_loss_mlp": 0.01023521, "balance_loss_clip": 1.05011964, "balance_loss_mlp": 1.01574826, "epoch": 0.5633379426441412, "flos": 20812244947200.0, "grad_norm": 1.649058852890975, "language_loss": 0.77249432, "learning_rate": 1.6881313055599734e-06, "loss": 0.79505694, "num_input_tokens_seen": 101119515, "step": 4685, "time_per_iteration": 2.657871961593628 }, { "auxiliary_loss_clip": 0.0127332, "auxiliary_loss_mlp": 0.01032263, "balance_loss_clip": 1.04474604, "balance_loss_mlp": 1.02379906, "epoch": 0.5634581855347802, "flos": 22600883617920.0, "grad_norm": 2.425841398581771, "language_loss": 0.82448572, "learning_rate": 1.6873618864189117e-06, "loss": 0.84754157, "num_input_tokens_seen": 101135285, "step": 4686, "time_per_iteration": 3.6981089115142822 }, { "auxiliary_loss_clip": 0.0122895, "auxiliary_loss_mlp": 0.01026774, "balance_loss_clip": 1.04831648, "balance_loss_mlp": 1.01932597, "epoch": 0.5635784284254194, "flos": 21506972872320.0, "grad_norm": 3.6422782039701285, "language_loss": 0.7790885, "learning_rate": 1.686592514704803e-06, "loss": 0.80164576, "num_input_tokens_seen": 101152680, "step": 4687, "time_per_iteration": 2.6639463901519775 }, { "auxiliary_loss_clip": 0.0127949, "auxiliary_loss_mlp": 0.01025295, "balance_loss_clip": 1.05100131, "balance_loss_mlp": 1.0181452, "epoch": 0.5636986713160584, "flos": 19827466698240.0, "grad_norm": 4.0972591405213725, "language_loss": 0.70670587, "learning_rate": 1.685823190534361e-06, "loss": 0.72975367, "num_input_tokens_seen": 101170920, "step": 4688, "time_per_iteration": 2.6268081665039062 }, { "auxiliary_loss_clip": 0.01184544, "auxiliary_loss_mlp": 0.01028977, "balance_loss_clip": 1.05038559, "balance_loss_mlp": 1.02096653, "epoch": 0.5638189142066975, "flos": 19792453916160.0, "grad_norm": 1.980051542105391, "language_loss": 0.83885252, "learning_rate": 1.6850539140242907e-06, "loss": 0.86098772, "num_input_tokens_seen": 101190180, "step": 4689, "time_per_iteration": 3.5100197792053223 }, { "auxiliary_loss_clip": 0.01135303, "auxiliary_loss_mlp": 0.01028939, "balance_loss_clip": 1.04996407, "balance_loss_mlp": 1.02122617, "epoch": 0.5639391570973367, "flos": 22893771116160.0, "grad_norm": 1.9922590216665472, "language_loss": 0.82043463, "learning_rate": 1.684284685291292e-06, "loss": 0.84207708, "num_input_tokens_seen": 101211825, "step": 4690, "time_per_iteration": 2.6841273307800293 }, { "auxiliary_loss_clip": 0.01183124, "auxiliary_loss_mlp": 0.01026597, "balance_loss_clip": 1.05174899, "balance_loss_mlp": 1.01865149, "epoch": 0.5640593999879757, "flos": 23727077712000.0, "grad_norm": 2.033110568566966, "language_loss": 0.81385946, "learning_rate": 1.683515504452055e-06, "loss": 0.83595669, "num_input_tokens_seen": 101229200, "step": 4691, "time_per_iteration": 3.450983762741089 }, { "auxiliary_loss_clip": 0.01322503, "auxiliary_loss_mlp": 0.01029144, "balance_loss_clip": 1.04455781, "balance_loss_mlp": 1.02097201, "epoch": 0.5641796428786148, "flos": 22710123855360.0, "grad_norm": 1.6035310502849736, "language_loss": 0.66584229, "learning_rate": 1.6827463716232648e-06, "loss": 0.68935883, "num_input_tokens_seen": 101249860, "step": 4692, "time_per_iteration": 2.7288155555725098 }, { "auxiliary_loss_clip": 0.01231856, "auxiliary_loss_mlp": 0.02568757, "balance_loss_clip": 1.04845953, "balance_loss_mlp": 1.00007129, "epoch": 0.5642998857692539, "flos": 19791987039360.0, "grad_norm": 1.843466453379911, "language_loss": 0.75746787, "learning_rate": 1.6819772869215972e-06, "loss": 0.79547393, "num_input_tokens_seen": 101268940, "step": 4693, "time_per_iteration": 2.599470615386963 }, { "auxiliary_loss_clip": 0.0128717, "auxiliary_loss_mlp": 0.01026992, "balance_loss_clip": 1.05018401, "balance_loss_mlp": 1.01963651, "epoch": 0.564420128659893, "flos": 23185904428800.0, "grad_norm": 1.6840945608597229, "language_loss": 0.8209182, "learning_rate": 1.6812082504637228e-06, "loss": 0.84405982, "num_input_tokens_seen": 101290260, "step": 4694, "time_per_iteration": 3.590618848800659 }, { "auxiliary_loss_clip": 0.01228047, "auxiliary_loss_mlp": 0.0102421, "balance_loss_clip": 1.05165887, "balance_loss_mlp": 1.01715553, "epoch": 0.564540371550532, "flos": 23258264376960.0, "grad_norm": 1.4921170230097676, "language_loss": 0.74489367, "learning_rate": 1.6804392623663025e-06, "loss": 0.7674163, "num_input_tokens_seen": 101311465, "step": 4695, "time_per_iteration": 2.6966826915740967 }, { "auxiliary_loss_clip": 0.01225576, "auxiliary_loss_mlp": 0.01022924, "balance_loss_clip": 1.0486846, "balance_loss_mlp": 1.01560438, "epoch": 0.5646606144411712, "flos": 25010058672000.0, "grad_norm": 1.824233541456398, "language_loss": 0.78414363, "learning_rate": 1.6796703227459935e-06, "loss": 0.80662864, "num_input_tokens_seen": 101329420, "step": 4696, "time_per_iteration": 2.6232736110687256 }, { "auxiliary_loss_clip": 0.01373189, "auxiliary_loss_mlp": 0.01027794, "balance_loss_clip": 1.04246616, "balance_loss_mlp": 1.02000988, "epoch": 0.5647808573318103, "flos": 36539645806080.0, "grad_norm": 1.9825329287984166, "language_loss": 0.75894904, "learning_rate": 1.6789014317194407e-06, "loss": 0.78295887, "num_input_tokens_seen": 101350900, "step": 4697, "time_per_iteration": 2.8789963722229004 }, { "auxiliary_loss_clip": 0.01188601, "auxiliary_loss_mlp": 0.01030787, "balance_loss_clip": 1.04987526, "balance_loss_mlp": 1.02182293, "epoch": 0.5649011002224493, "flos": 22528451842560.0, "grad_norm": 2.646647085873654, "language_loss": 0.73006546, "learning_rate": 1.6781325894032853e-06, "loss": 0.75225937, "num_input_tokens_seen": 101369860, "step": 4698, "time_per_iteration": 2.6647844314575195 }, { "auxiliary_loss_clip": 0.01281907, "auxiliary_loss_mlp": 0.01037256, "balance_loss_clip": 1.05253637, "balance_loss_mlp": 1.0298059, "epoch": 0.5650213431130885, "flos": 18515147304960.0, "grad_norm": 3.2959709536054587, "language_loss": 0.92130446, "learning_rate": 1.6773637959141608e-06, "loss": 0.94449604, "num_input_tokens_seen": 101386835, "step": 4699, "time_per_iteration": 2.6307921409606934 }, { "auxiliary_loss_clip": 0.01276514, "auxiliary_loss_mlp": 0.01027453, "balance_loss_clip": 1.04816806, "balance_loss_mlp": 1.02007365, "epoch": 0.5651415860037275, "flos": 17526310819200.0, "grad_norm": 2.3929731708763375, "language_loss": 0.664846, "learning_rate": 1.6765950513686915e-06, "loss": 0.6878857, "num_input_tokens_seen": 101404945, "step": 4700, "time_per_iteration": 2.637324571609497 }, { "auxiliary_loss_clip": 0.01280724, "auxiliary_loss_mlp": 0.01025011, "balance_loss_clip": 1.04278123, "balance_loss_mlp": 1.01696408, "epoch": 0.5652618288943666, "flos": 25520026014720.0, "grad_norm": 6.03203248891467, "language_loss": 0.76383317, "learning_rate": 1.675826355883496e-06, "loss": 0.78689057, "num_input_tokens_seen": 101424160, "step": 4701, "time_per_iteration": 2.783923625946045 }, { "auxiliary_loss_clip": 0.01276535, "auxiliary_loss_mlp": 0.01025246, "balance_loss_clip": 1.05049276, "balance_loss_mlp": 1.01746488, "epoch": 0.5653820717850057, "flos": 19683105937920.0, "grad_norm": 2.197176249053272, "language_loss": 0.79145443, "learning_rate": 1.6750577095751848e-06, "loss": 0.81447226, "num_input_tokens_seen": 101443270, "step": 4702, "time_per_iteration": 2.6167078018188477 }, { "auxiliary_loss_clip": 0.01180372, "auxiliary_loss_mlp": 0.01027787, "balance_loss_clip": 1.05182016, "balance_loss_mlp": 1.02035403, "epoch": 0.5655023146756448, "flos": 26979722910720.0, "grad_norm": 1.6730662366314633, "language_loss": 0.73080349, "learning_rate": 1.6742891125603605e-06, "loss": 0.75288498, "num_input_tokens_seen": 101464175, "step": 4703, "time_per_iteration": 2.6359941959381104 }, { "auxiliary_loss_clip": 0.01230644, "auxiliary_loss_mlp": 0.01023222, "balance_loss_clip": 1.05044568, "balance_loss_mlp": 1.01530623, "epoch": 0.5656225575662839, "flos": 27669351104640.0, "grad_norm": 2.0248688320972748, "language_loss": 0.7241081, "learning_rate": 1.6735205649556185e-06, "loss": 0.74664676, "num_input_tokens_seen": 101484045, "step": 4704, "time_per_iteration": 2.6490819454193115 }, { "auxiliary_loss_clip": 0.01233289, "auxiliary_loss_mlp": 0.01026673, "balance_loss_clip": 1.04736328, "balance_loss_mlp": 1.0195142, "epoch": 0.5657428004569229, "flos": 24349732997760.0, "grad_norm": 1.9860805306156282, "language_loss": 0.85010183, "learning_rate": 1.6727520668775476e-06, "loss": 0.87270141, "num_input_tokens_seen": 101504330, "step": 4705, "time_per_iteration": 2.6958301067352295 }, { "auxiliary_loss_clip": 0.01184141, "auxiliary_loss_mlp": 0.0102419, "balance_loss_clip": 1.0518024, "balance_loss_mlp": 1.01609254, "epoch": 0.5658630433475621, "flos": 21944041562880.0, "grad_norm": 1.7528722096283198, "language_loss": 0.75604194, "learning_rate": 1.6719836184427275e-06, "loss": 0.77812523, "num_input_tokens_seen": 101524635, "step": 4706, "time_per_iteration": 2.5875251293182373 }, { "auxiliary_loss_clip": 0.01274911, "auxiliary_loss_mlp": 0.01020466, "balance_loss_clip": 1.04589415, "balance_loss_mlp": 1.01339364, "epoch": 0.5659832862382012, "flos": 30409012218240.0, "grad_norm": 1.7687027827728092, "language_loss": 0.64248878, "learning_rate": 1.671215219767733e-06, "loss": 0.66544259, "num_input_tokens_seen": 101544095, "step": 4707, "time_per_iteration": 2.905208110809326 }, { "auxiliary_loss_clip": 0.01384729, "auxiliary_loss_mlp": 0.01029098, "balance_loss_clip": 1.04485345, "balance_loss_mlp": 1.02125454, "epoch": 0.5661035291288402, "flos": 13188194570880.0, "grad_norm": 1.9350664604565126, "language_loss": 0.76118815, "learning_rate": 1.670446870969127e-06, "loss": 0.78532642, "num_input_tokens_seen": 101561760, "step": 4708, "time_per_iteration": 2.783738374710083 }, { "auxiliary_loss_clip": 0.01187144, "auxiliary_loss_mlp": 0.01024037, "balance_loss_clip": 1.05129743, "balance_loss_mlp": 1.01580548, "epoch": 0.5662237720194794, "flos": 16143032108160.0, "grad_norm": 2.3527862832707496, "language_loss": 0.79792035, "learning_rate": 1.6696785721634685e-06, "loss": 0.82003218, "num_input_tokens_seen": 101576245, "step": 4709, "time_per_iteration": 2.6452348232269287 }, { "auxiliary_loss_clip": 0.01234386, "auxiliary_loss_mlp": 0.01025039, "balance_loss_clip": 1.05035305, "balance_loss_mlp": 1.0170908, "epoch": 0.5663440149101184, "flos": 17676848718720.0, "grad_norm": 1.9024627975701927, "language_loss": 0.73521495, "learning_rate": 1.6689103234673086e-06, "loss": 0.75780916, "num_input_tokens_seen": 101594565, "step": 4710, "time_per_iteration": 2.6219842433929443 }, { "auxiliary_loss_clip": 0.01276932, "auxiliary_loss_mlp": 0.01033042, "balance_loss_clip": 1.04852462, "balance_loss_mlp": 1.02520728, "epoch": 0.5664642578007575, "flos": 23368330627200.0, "grad_norm": 1.926486180993532, "language_loss": 0.77101666, "learning_rate": 1.668142124997189e-06, "loss": 0.79411638, "num_input_tokens_seen": 101614225, "step": 4711, "time_per_iteration": 2.6872847080230713 }, { "auxiliary_loss_clip": 0.01162531, "auxiliary_loss_mlp": 0.01012715, "balance_loss_clip": 1.00840223, "balance_loss_mlp": 1.01169622, "epoch": 0.5665845006913967, "flos": 65516470945920.0, "grad_norm": 0.7348412636497615, "language_loss": 0.59740937, "learning_rate": 1.6673739768696453e-06, "loss": 0.61916184, "num_input_tokens_seen": 101680795, "step": 4712, "time_per_iteration": 4.234415054321289 }, { "auxiliary_loss_clip": 0.01286881, "auxiliary_loss_mlp": 0.01025057, "balance_loss_clip": 1.04747808, "balance_loss_mlp": 1.01708817, "epoch": 0.5667047435820357, "flos": 26140885620480.0, "grad_norm": 1.672834526337638, "language_loss": 0.77430654, "learning_rate": 1.6666058792012052e-06, "loss": 0.79742599, "num_input_tokens_seen": 101701680, "step": 4713, "time_per_iteration": 2.7485244274139404 }, { "auxiliary_loss_clip": 0.0112479, "auxiliary_loss_mlp": 0.01003468, "balance_loss_clip": 1.01072264, "balance_loss_mlp": 1.00249064, "epoch": 0.5668249864726748, "flos": 71866949725440.0, "grad_norm": 0.8766652000769987, "language_loss": 0.68780035, "learning_rate": 1.6658378321083878e-06, "loss": 0.70908296, "num_input_tokens_seen": 101766010, "step": 4714, "time_per_iteration": 3.2573964595794678 }, { "auxiliary_loss_clip": 0.01333834, "auxiliary_loss_mlp": 0.01030111, "balance_loss_clip": 1.04602265, "balance_loss_mlp": 1.02259529, "epoch": 0.5669452293633139, "flos": 22195667312640.0, "grad_norm": 2.1279814622783078, "language_loss": 0.82416368, "learning_rate": 1.6650698357077055e-06, "loss": 0.84780312, "num_input_tokens_seen": 101783055, "step": 4715, "time_per_iteration": 3.5946412086486816 }, { "auxiliary_loss_clip": 0.01285041, "auxiliary_loss_mlp": 0.01030702, "balance_loss_clip": 1.04857123, "balance_loss_mlp": 1.02252448, "epoch": 0.567065472253953, "flos": 18223193560320.0, "grad_norm": 2.1348224791753005, "language_loss": 0.80991089, "learning_rate": 1.6643018901156632e-06, "loss": 0.83306831, "num_input_tokens_seen": 101802150, "step": 4716, "time_per_iteration": 2.6212716102600098 }, { "auxiliary_loss_clip": 0.012838, "auxiliary_loss_mlp": 0.01028561, "balance_loss_clip": 1.04792452, "balance_loss_mlp": 1.02116418, "epoch": 0.567185715144592, "flos": 20371548983040.0, "grad_norm": 3.6230044664303396, "language_loss": 0.79498303, "learning_rate": 1.6635339954487566e-06, "loss": 0.81810665, "num_input_tokens_seen": 101818025, "step": 4717, "time_per_iteration": 3.5175905227661133 }, { "auxiliary_loss_clip": 0.01285173, "auxiliary_loss_mlp": 0.01032906, "balance_loss_clip": 1.05040729, "balance_loss_mlp": 1.02507114, "epoch": 0.5673059580352312, "flos": 23221348174080.0, "grad_norm": 1.717969033922128, "language_loss": 0.82374787, "learning_rate": 1.6627661518234765e-06, "loss": 0.84692872, "num_input_tokens_seen": 101837280, "step": 4718, "time_per_iteration": 2.7227799892425537 }, { "auxiliary_loss_clip": 0.01388534, "auxiliary_loss_mlp": 0.01032872, "balance_loss_clip": 1.05158341, "balance_loss_mlp": 1.025141, "epoch": 0.5674262009258703, "flos": 21719599430400.0, "grad_norm": 2.183539477077222, "language_loss": 0.85408044, "learning_rate": 1.661998359356302e-06, "loss": 0.87829447, "num_input_tokens_seen": 101856310, "step": 4719, "time_per_iteration": 2.732398509979248 }, { "auxiliary_loss_clip": 0.01068073, "auxiliary_loss_mlp": 0.01001556, "balance_loss_clip": 1.01003444, "balance_loss_mlp": 1.00059021, "epoch": 0.5675464438165093, "flos": 67470369114240.0, "grad_norm": 0.7446859585483963, "language_loss": 0.55785942, "learning_rate": 1.6612306181637077e-06, "loss": 0.5785557, "num_input_tokens_seen": 101915635, "step": 4720, "time_per_iteration": 4.031984329223633 }, { "auxiliary_loss_clip": 0.0132986, "auxiliary_loss_mlp": 0.01022855, "balance_loss_clip": 1.04655886, "balance_loss_mlp": 1.01517773, "epoch": 0.5676666867071485, "flos": 18879173688960.0, "grad_norm": 2.768132014296347, "language_loss": 0.66014767, "learning_rate": 1.6604629283621598e-06, "loss": 0.68367481, "num_input_tokens_seen": 101933565, "step": 4721, "time_per_iteration": 2.7024054527282715 }, { "auxiliary_loss_clip": 0.01186506, "auxiliary_loss_mlp": 0.01026381, "balance_loss_clip": 1.0534029, "balance_loss_mlp": 1.01834655, "epoch": 0.5677869295977875, "flos": 33546778744320.0, "grad_norm": 3.019931851807437, "language_loss": 0.74390626, "learning_rate": 1.6596952900681152e-06, "loss": 0.7660352, "num_input_tokens_seen": 101954325, "step": 4722, "time_per_iteration": 2.714780569076538 }, { "auxiliary_loss_clip": 0.01382465, "auxiliary_loss_mlp": 0.01031604, "balance_loss_clip": 1.04989052, "balance_loss_mlp": 1.02372384, "epoch": 0.5679071724884266, "flos": 28037256157440.0, "grad_norm": 2.0566151128570125, "language_loss": 0.81963181, "learning_rate": 1.658927703398025e-06, "loss": 0.84377253, "num_input_tokens_seen": 101974390, "step": 4723, "time_per_iteration": 2.856614589691162 }, { "auxiliary_loss_clip": 0.01377219, "auxiliary_loss_mlp": 0.01029312, "balance_loss_clip": 1.03978407, "balance_loss_mlp": 1.02211165, "epoch": 0.5680274153790658, "flos": 23550110380800.0, "grad_norm": 2.68317589989917, "language_loss": 0.77754414, "learning_rate": 1.6581601684683309e-06, "loss": 0.80160952, "num_input_tokens_seen": 101994815, "step": 4724, "time_per_iteration": 2.733449697494507 }, { "auxiliary_loss_clip": 0.01232431, "auxiliary_loss_mlp": 0.01033766, "balance_loss_clip": 1.05279422, "balance_loss_mlp": 1.02630949, "epoch": 0.5681476582697048, "flos": 22455158140800.0, "grad_norm": 3.9345079217982946, "language_loss": 0.68586653, "learning_rate": 1.6573926853954674e-06, "loss": 0.70852852, "num_input_tokens_seen": 102012400, "step": 4725, "time_per_iteration": 2.6822268962860107 }, { "auxiliary_loss_clip": 0.01278691, "auxiliary_loss_mlp": 0.010332, "balance_loss_clip": 1.04723775, "balance_loss_mlp": 1.02595246, "epoch": 0.5682679011603439, "flos": 19536913584000.0, "grad_norm": 2.0827819691479648, "language_loss": 0.83000362, "learning_rate": 1.6566252542958608e-06, "loss": 0.85312259, "num_input_tokens_seen": 102031900, "step": 4726, "time_per_iteration": 2.6618611812591553 }, { "auxiliary_loss_clip": 0.01320812, "auxiliary_loss_mlp": 0.01030475, "balance_loss_clip": 1.04624081, "balance_loss_mlp": 1.02263653, "epoch": 0.568388144050983, "flos": 28765488493440.0, "grad_norm": 1.8127639163180953, "language_loss": 0.7834909, "learning_rate": 1.6558578752859305e-06, "loss": 0.80700374, "num_input_tokens_seen": 102050860, "step": 4727, "time_per_iteration": 2.8196334838867188 }, { "auxiliary_loss_clip": 0.01332848, "auxiliary_loss_mlp": 0.01027296, "balance_loss_clip": 1.04595995, "balance_loss_mlp": 1.01962519, "epoch": 0.5685083869416221, "flos": 21209452519680.0, "grad_norm": 1.8878860813702019, "language_loss": 0.78787422, "learning_rate": 1.6550905484820865e-06, "loss": 0.81147569, "num_input_tokens_seen": 102069320, "step": 4728, "time_per_iteration": 2.7042295932769775 }, { "auxiliary_loss_clip": 0.0118455, "auxiliary_loss_mlp": 0.0102599, "balance_loss_clip": 1.05194616, "balance_loss_mlp": 1.01832163, "epoch": 0.5686286298322611, "flos": 24827021942400.0, "grad_norm": 2.3858696096509178, "language_loss": 0.78584313, "learning_rate": 1.6543232740007328e-06, "loss": 0.80794853, "num_input_tokens_seen": 102086435, "step": 4729, "time_per_iteration": 2.660003900527954 }, { "auxiliary_loss_clip": 0.01236853, "auxiliary_loss_mlp": 0.01032421, "balance_loss_clip": 1.05313361, "balance_loss_mlp": 1.02348614, "epoch": 0.5687488727229003, "flos": 26615121909120.0, "grad_norm": 2.594383047312774, "language_loss": 0.66641808, "learning_rate": 1.653556051958263e-06, "loss": 0.68911088, "num_input_tokens_seen": 102106115, "step": 4730, "time_per_iteration": 2.6903789043426514 }, { "auxiliary_loss_clip": 0.01416586, "auxiliary_loss_mlp": 0.0102725, "balance_loss_clip": 1.04191947, "balance_loss_mlp": 1.01988339, "epoch": 0.5688691156135394, "flos": 20808725414400.0, "grad_norm": 1.8614552345688236, "language_loss": 0.73947752, "learning_rate": 1.6527888824710642e-06, "loss": 0.7639159, "num_input_tokens_seen": 102125715, "step": 4731, "time_per_iteration": 2.760223150253296 }, { "auxiliary_loss_clip": 0.01325365, "auxiliary_loss_mlp": 0.01032294, "balance_loss_clip": 1.0439502, "balance_loss_mlp": 1.02396166, "epoch": 0.5689893585041784, "flos": 25880963829120.0, "grad_norm": 2.3402966168695256, "language_loss": 0.76499438, "learning_rate": 1.6520217656555166e-06, "loss": 0.788571, "num_input_tokens_seen": 102145005, "step": 4732, "time_per_iteration": 2.8031606674194336 }, { "auxiliary_loss_clip": 0.01275265, "auxiliary_loss_mlp": 0.01029398, "balance_loss_clip": 1.04711843, "balance_loss_mlp": 1.02266002, "epoch": 0.5691096013948175, "flos": 23477463123840.0, "grad_norm": 1.55225773506792, "language_loss": 0.70916206, "learning_rate": 1.65125470162799e-06, "loss": 0.73220867, "num_input_tokens_seen": 102165360, "step": 4733, "time_per_iteration": 2.7007946968078613 }, { "auxiliary_loss_clip": 0.01330993, "auxiliary_loss_mlp": 0.01024601, "balance_loss_clip": 1.0456574, "balance_loss_mlp": 1.01678729, "epoch": 0.5692298442854566, "flos": 18075600576000.0, "grad_norm": 2.7934266399625884, "language_loss": 0.69971073, "learning_rate": 1.6504876905048485e-06, "loss": 0.72326666, "num_input_tokens_seen": 102182320, "step": 4734, "time_per_iteration": 2.687138557434082 }, { "auxiliary_loss_clip": 0.01180043, "auxiliary_loss_mlp": 0.01029181, "balance_loss_clip": 1.05171227, "balance_loss_mlp": 1.02179003, "epoch": 0.5693500871760957, "flos": 23039317025280.0, "grad_norm": 1.8294155897288726, "language_loss": 0.72204638, "learning_rate": 1.6497207324024464e-06, "loss": 0.7441386, "num_input_tokens_seen": 102201220, "step": 4735, "time_per_iteration": 2.572396993637085 }, { "auxiliary_loss_clip": 0.01186302, "auxiliary_loss_mlp": 0.01025623, "balance_loss_clip": 1.04574955, "balance_loss_mlp": 1.01756406, "epoch": 0.5694703300667348, "flos": 18989670902400.0, "grad_norm": 2.2622934219791118, "language_loss": 0.82757545, "learning_rate": 1.6489538274371305e-06, "loss": 0.84969473, "num_input_tokens_seen": 102219825, "step": 4736, "time_per_iteration": 2.6892971992492676 }, { "auxiliary_loss_clip": 0.01227225, "auxiliary_loss_mlp": 0.01027096, "balance_loss_clip": 1.05156422, "balance_loss_mlp": 1.01991415, "epoch": 0.5695905729573739, "flos": 21908705558400.0, "grad_norm": 1.889202669486198, "language_loss": 0.83407092, "learning_rate": 1.6481869757252396e-06, "loss": 0.85661423, "num_input_tokens_seen": 102238160, "step": 4737, "time_per_iteration": 2.609886884689331 }, { "auxiliary_loss_clip": 0.01231026, "auxiliary_loss_mlp": 0.01026462, "balance_loss_clip": 1.05156934, "balance_loss_mlp": 1.01947367, "epoch": 0.569710815848013, "flos": 28476659232000.0, "grad_norm": 1.6708278704792228, "language_loss": 0.71827161, "learning_rate": 1.647420177383105e-06, "loss": 0.74084651, "num_input_tokens_seen": 102261030, "step": 4738, "time_per_iteration": 3.7080414295196533 }, { "auxiliary_loss_clip": 0.01227952, "auxiliary_loss_mlp": 0.01027004, "balance_loss_clip": 1.05338717, "balance_loss_mlp": 1.01979816, "epoch": 0.569831058738652, "flos": 28366162018560.0, "grad_norm": 2.3871244355536305, "language_loss": 0.72529101, "learning_rate": 1.646653432527049e-06, "loss": 0.74784058, "num_input_tokens_seen": 102281670, "step": 4739, "time_per_iteration": 2.687326192855835 }, { "auxiliary_loss_clip": 0.01331907, "auxiliary_loss_mlp": 0.01026925, "balance_loss_clip": 1.04849565, "balance_loss_mlp": 1.01967144, "epoch": 0.5699513016292912, "flos": 25849973370240.0, "grad_norm": 1.6086021504869235, "language_loss": 0.746391, "learning_rate": 1.645886741273387e-06, "loss": 0.76997936, "num_input_tokens_seen": 102303485, "step": 4740, "time_per_iteration": 2.7366433143615723 }, { "auxiliary_loss_clip": 0.01334321, "auxiliary_loss_mlp": 0.01027602, "balance_loss_clip": 1.05642986, "balance_loss_mlp": 1.01953757, "epoch": 0.5700715445199303, "flos": 18037858360320.0, "grad_norm": 2.885650611618166, "language_loss": 0.74122882, "learning_rate": 1.645120103738424e-06, "loss": 0.76484805, "num_input_tokens_seen": 102320995, "step": 4741, "time_per_iteration": 3.5803277492523193 }, { "auxiliary_loss_clip": 0.01223904, "auxiliary_loss_mlp": 0.02564896, "balance_loss_clip": 1.04794443, "balance_loss_mlp": 1.00008357, "epoch": 0.5701917874105693, "flos": 11473352392320.0, "grad_norm": 2.285784452093703, "language_loss": 0.83795005, "learning_rate": 1.6443535200384591e-06, "loss": 0.87583804, "num_input_tokens_seen": 102339170, "step": 4742, "time_per_iteration": 3.51320219039917 }, { "auxiliary_loss_clip": 0.0118073, "auxiliary_loss_mlp": 0.01025841, "balance_loss_clip": 1.05222857, "balance_loss_mlp": 1.01815212, "epoch": 0.5703120303012085, "flos": 21761759018880.0, "grad_norm": 1.646453015490757, "language_loss": 0.70799458, "learning_rate": 1.6435869902897827e-06, "loss": 0.73006028, "num_input_tokens_seen": 102357750, "step": 4743, "time_per_iteration": 2.598745346069336 }, { "auxiliary_loss_clip": 0.01225295, "auxiliary_loss_mlp": 0.01002685, "balance_loss_clip": 1.00845027, "balance_loss_mlp": 1.00164819, "epoch": 0.5704322731918475, "flos": 56746258513920.0, "grad_norm": 0.7946960506215589, "language_loss": 0.61964142, "learning_rate": 1.6428205146086764e-06, "loss": 0.64192122, "num_input_tokens_seen": 102419730, "step": 4744, "time_per_iteration": 3.291846752166748 }, { "auxiliary_loss_clip": 0.01286415, "auxiliary_loss_mlp": 0.01029276, "balance_loss_clip": 1.04816854, "balance_loss_mlp": 1.02185535, "epoch": 0.5705525160824866, "flos": 20741141975040.0, "grad_norm": 1.6863916626238111, "language_loss": 0.71478951, "learning_rate": 1.6420540931114142e-06, "loss": 0.73794651, "num_input_tokens_seen": 102440320, "step": 4745, "time_per_iteration": 2.6983258724212646 }, { "auxiliary_loss_clip": 0.0128054, "auxiliary_loss_mlp": 0.01031758, "balance_loss_clip": 1.04906225, "balance_loss_mlp": 1.02464151, "epoch": 0.5706727589731257, "flos": 18771262254720.0, "grad_norm": 1.7118497089299085, "language_loss": 0.79157442, "learning_rate": 1.6412877259142616e-06, "loss": 0.81469744, "num_input_tokens_seen": 102460240, "step": 4746, "time_per_iteration": 3.568301200866699 }, { "auxiliary_loss_clip": 0.01280078, "auxiliary_loss_mlp": 0.01027889, "balance_loss_clip": 1.04833984, "balance_loss_mlp": 1.02041423, "epoch": 0.5707930018637648, "flos": 27634733372160.0, "grad_norm": 2.0877140401518823, "language_loss": 0.74110162, "learning_rate": 1.6405214131334757e-06, "loss": 0.76418126, "num_input_tokens_seen": 102478765, "step": 4747, "time_per_iteration": 2.72139573097229 }, { "auxiliary_loss_clip": 0.01371706, "auxiliary_loss_mlp": 0.01027555, "balance_loss_clip": 1.04703116, "balance_loss_mlp": 1.02016723, "epoch": 0.5709132447544039, "flos": 27597673514880.0, "grad_norm": 2.228295198985117, "language_loss": 0.79798913, "learning_rate": 1.6397551548853052e-06, "loss": 0.82198179, "num_input_tokens_seen": 102496930, "step": 4748, "time_per_iteration": 2.7810299396514893 }, { "auxiliary_loss_clip": 0.0127982, "auxiliary_loss_mlp": 0.01024947, "balance_loss_clip": 1.04910517, "balance_loss_mlp": 1.01756191, "epoch": 0.571033487645043, "flos": 21686095019520.0, "grad_norm": 2.023012243093012, "language_loss": 0.70833284, "learning_rate": 1.6389889512859917e-06, "loss": 0.73138052, "num_input_tokens_seen": 102516590, "step": 4749, "time_per_iteration": 2.7019872665405273 }, { "auxiliary_loss_clip": 0.01171612, "auxiliary_loss_mlp": 0.01000871, "balance_loss_clip": 1.00864136, "balance_loss_mlp": 0.99988121, "epoch": 0.5711537305356821, "flos": 70181445980160.0, "grad_norm": 0.8152350481735203, "language_loss": 0.60373354, "learning_rate": 1.638222802451767e-06, "loss": 0.62545836, "num_input_tokens_seen": 102578070, "step": 4750, "time_per_iteration": 3.254171848297119 }, { "auxiliary_loss_clip": 0.01226207, "auxiliary_loss_mlp": 0.01024779, "balance_loss_clip": 1.05091751, "balance_loss_mlp": 1.0176568, "epoch": 0.5712739734263211, "flos": 24717494396160.0, "grad_norm": 1.7825903793577667, "language_loss": 0.75307274, "learning_rate": 1.6374567084988561e-06, "loss": 0.77558255, "num_input_tokens_seen": 102599255, "step": 4751, "time_per_iteration": 2.7250876426696777 }, { "auxiliary_loss_clip": 0.01184452, "auxiliary_loss_mlp": 0.01029798, "balance_loss_clip": 1.05096173, "balance_loss_mlp": 1.02220738, "epoch": 0.5713942163169603, "flos": 26578169792640.0, "grad_norm": 1.8946219068546517, "language_loss": 0.76580131, "learning_rate": 1.6366906695434738e-06, "loss": 0.78794384, "num_input_tokens_seen": 102621775, "step": 4752, "time_per_iteration": 2.683281898498535 }, { "auxiliary_loss_clip": 0.01234149, "auxiliary_loss_mlp": 0.01026632, "balance_loss_clip": 1.05283856, "balance_loss_mlp": 1.0193727, "epoch": 0.5715144592075994, "flos": 21142443697920.0, "grad_norm": 2.3435150546467614, "language_loss": 0.86230248, "learning_rate": 1.6359246857018275e-06, "loss": 0.88491035, "num_input_tokens_seen": 102639305, "step": 4753, "time_per_iteration": 2.629426956176758 }, { "auxiliary_loss_clip": 0.01279442, "auxiliary_loss_mlp": 0.01030749, "balance_loss_clip": 1.04321933, "balance_loss_mlp": 1.02302456, "epoch": 0.5716347020982384, "flos": 23330265189120.0, "grad_norm": 1.8868508085526876, "language_loss": 0.78260285, "learning_rate": 1.6351587570901178e-06, "loss": 0.80570471, "num_input_tokens_seen": 102659430, "step": 4754, "time_per_iteration": 2.7108097076416016 }, { "auxiliary_loss_clip": 0.01336294, "auxiliary_loss_mlp": 0.01027186, "balance_loss_clip": 1.05178928, "balance_loss_mlp": 1.01980114, "epoch": 0.5717549449888776, "flos": 17009555806080.0, "grad_norm": 2.4075153750023595, "language_loss": 0.75876999, "learning_rate": 1.634392883824534e-06, "loss": 0.78240484, "num_input_tokens_seen": 102671430, "step": 4755, "time_per_iteration": 2.6844069957733154 }, { "auxiliary_loss_clip": 0.0128181, "auxiliary_loss_mlp": 0.01033232, "balance_loss_clip": 1.04571664, "balance_loss_mlp": 1.02553749, "epoch": 0.5718751878795166, "flos": 35518130922240.0, "grad_norm": 1.8128056282474951, "language_loss": 0.67995918, "learning_rate": 1.6336270660212595e-06, "loss": 0.70310962, "num_input_tokens_seen": 102693025, "step": 4756, "time_per_iteration": 2.8281702995300293 }, { "auxiliary_loss_clip": 0.01288166, "auxiliary_loss_mlp": 0.01028963, "balance_loss_clip": 1.05744934, "balance_loss_mlp": 1.0213809, "epoch": 0.5719954307701557, "flos": 38613989255040.0, "grad_norm": 2.196331051810071, "language_loss": 0.6605503, "learning_rate": 1.6328613037964676e-06, "loss": 0.68372154, "num_input_tokens_seen": 102716090, "step": 4757, "time_per_iteration": 2.8131961822509766 }, { "auxiliary_loss_clip": 0.01229302, "auxiliary_loss_mlp": 0.01026797, "balance_loss_clip": 1.04924059, "balance_loss_mlp": 1.01914954, "epoch": 0.5721156736607949, "flos": 20631111638400.0, "grad_norm": 4.9294899827133785, "language_loss": 0.68539453, "learning_rate": 1.6320955972663241e-06, "loss": 0.70795548, "num_input_tokens_seen": 102735685, "step": 4758, "time_per_iteration": 2.6700899600982666 }, { "auxiliary_loss_clip": 0.0122822, "auxiliary_loss_mlp": 0.01024415, "balance_loss_clip": 1.04845619, "balance_loss_mlp": 1.01664805, "epoch": 0.5722359165514339, "flos": 37415076076800.0, "grad_norm": 1.7586470769756641, "language_loss": 0.65874839, "learning_rate": 1.6313299465469857e-06, "loss": 0.68127477, "num_input_tokens_seen": 102758415, "step": 4759, "time_per_iteration": 2.72322678565979 }, { "auxiliary_loss_clip": 0.01229917, "auxiliary_loss_mlp": 0.01026873, "balance_loss_clip": 1.050367, "balance_loss_mlp": 1.01898122, "epoch": 0.572356159442073, "flos": 21972877205760.0, "grad_norm": 2.7249803208593937, "language_loss": 0.7940011, "learning_rate": 1.6305643517546014e-06, "loss": 0.81656897, "num_input_tokens_seen": 102773795, "step": 4760, "time_per_iteration": 2.6486854553222656 }, { "auxiliary_loss_clip": 0.01182381, "auxiliary_loss_mlp": 0.01029881, "balance_loss_clip": 1.0530386, "balance_loss_mlp": 1.02297878, "epoch": 0.5724764023327121, "flos": 19135540033920.0, "grad_norm": 2.714352537631004, "language_loss": 0.84753388, "learning_rate": 1.629798813005311e-06, "loss": 0.86965644, "num_input_tokens_seen": 102793515, "step": 4761, "time_per_iteration": 2.5895488262176514 }, { "auxiliary_loss_clip": 0.01377627, "auxiliary_loss_mlp": 0.01024256, "balance_loss_clip": 1.05005383, "balance_loss_mlp": 1.01721621, "epoch": 0.5725966452233512, "flos": 22819759142400.0, "grad_norm": 1.9783247664762593, "language_loss": 0.71254176, "learning_rate": 1.6290333304152473e-06, "loss": 0.73656058, "num_input_tokens_seen": 102813390, "step": 4762, "time_per_iteration": 2.7515203952789307 }, { "auxiliary_loss_clip": 0.01284858, "auxiliary_loss_mlp": 0.01026797, "balance_loss_clip": 1.05471921, "balance_loss_mlp": 1.01919389, "epoch": 0.5727168881139902, "flos": 41496610498560.0, "grad_norm": 2.0936765108429323, "language_loss": 0.57054651, "learning_rate": 1.6282679041005314e-06, "loss": 0.59366304, "num_input_tokens_seen": 102838980, "step": 4763, "time_per_iteration": 2.8122787475585938 }, { "auxiliary_loss_clip": 0.01271858, "auxiliary_loss_mlp": 0.01025876, "balance_loss_clip": 1.04616606, "balance_loss_mlp": 1.0183332, "epoch": 0.5728371310046293, "flos": 14647675985280.0, "grad_norm": 2.405139475555805, "language_loss": 0.87432778, "learning_rate": 1.6275025341772789e-06, "loss": 0.89730519, "num_input_tokens_seen": 102855285, "step": 4764, "time_per_iteration": 3.5205283164978027 }, { "auxiliary_loss_clip": 0.01281936, "auxiliary_loss_mlp": 0.01030425, "balance_loss_clip": 1.04799461, "balance_loss_mlp": 1.02242064, "epoch": 0.5729573738952685, "flos": 21506613736320.0, "grad_norm": 2.3118542527215062, "language_loss": 0.8187874, "learning_rate": 1.626737220761596e-06, "loss": 0.84191096, "num_input_tokens_seen": 102872750, "step": 4765, "time_per_iteration": 2.713064193725586 }, { "auxiliary_loss_clip": 0.01231227, "auxiliary_loss_mlp": 0.01031991, "balance_loss_clip": 1.0535028, "balance_loss_mlp": 1.02451015, "epoch": 0.5730776167859075, "flos": 23621680229760.0, "grad_norm": 5.0461902993065575, "language_loss": 0.7863825, "learning_rate": 1.62597196396958e-06, "loss": 0.80901468, "num_input_tokens_seen": 102890920, "step": 4766, "time_per_iteration": 3.5387163162231445 }, { "auxiliary_loss_clip": 0.01230889, "auxiliary_loss_mlp": 0.01032208, "balance_loss_clip": 1.05162191, "balance_loss_mlp": 1.02459645, "epoch": 0.5731978596765466, "flos": 25739224761600.0, "grad_norm": 1.6976408354357884, "language_loss": 0.85474682, "learning_rate": 1.6252067639173197e-06, "loss": 0.87737775, "num_input_tokens_seen": 102912830, "step": 4767, "time_per_iteration": 2.7085955142974854 }, { "auxiliary_loss_clip": 0.01230759, "auxiliary_loss_mlp": 0.0102477, "balance_loss_clip": 1.04969323, "balance_loss_mlp": 1.0170424, "epoch": 0.5733181025671857, "flos": 26359509749760.0, "grad_norm": 2.027305000684339, "language_loss": 0.69777262, "learning_rate": 1.6244416207208956e-06, "loss": 0.72032791, "num_input_tokens_seen": 102933765, "step": 4768, "time_per_iteration": 3.640482187271118 }, { "auxiliary_loss_clip": 0.01328235, "auxiliary_loss_mlp": 0.01028491, "balance_loss_clip": 1.04790068, "balance_loss_mlp": 1.02120125, "epoch": 0.5734383454578248, "flos": 29423874833280.0, "grad_norm": 1.9391388821562516, "language_loss": 0.73919892, "learning_rate": 1.6236765344963787e-06, "loss": 0.76276618, "num_input_tokens_seen": 102955025, "step": 4769, "time_per_iteration": 2.749765396118164 }, { "auxiliary_loss_clip": 0.01278855, "auxiliary_loss_mlp": 0.01028013, "balance_loss_clip": 1.05117011, "balance_loss_mlp": 1.02021098, "epoch": 0.5735585883484638, "flos": 34969954487040.0, "grad_norm": 33.005400086409196, "language_loss": 0.69000524, "learning_rate": 1.6229115053598322e-06, "loss": 0.71307391, "num_input_tokens_seen": 102976780, "step": 4770, "time_per_iteration": 2.802699089050293 }, { "auxiliary_loss_clip": 0.01232643, "auxiliary_loss_mlp": 0.01028676, "balance_loss_clip": 1.05331159, "balance_loss_mlp": 1.02165139, "epoch": 0.573678831239103, "flos": 18770759464320.0, "grad_norm": 1.9341655720662432, "language_loss": 0.72266603, "learning_rate": 1.6221465334273108e-06, "loss": 0.74527919, "num_input_tokens_seen": 102995990, "step": 4771, "time_per_iteration": 3.4763355255126953 }, { "auxiliary_loss_clip": 0.01331984, "auxiliary_loss_mlp": 0.01027262, "balance_loss_clip": 1.04738533, "balance_loss_mlp": 1.01959062, "epoch": 0.5737990741297421, "flos": 25702883176320.0, "grad_norm": 1.9693663351122328, "language_loss": 0.61632776, "learning_rate": 1.6213816188148593e-06, "loss": 0.63992023, "num_input_tokens_seen": 103014695, "step": 4772, "time_per_iteration": 2.757981777191162 }, { "auxiliary_loss_clip": 0.01284327, "auxiliary_loss_mlp": 0.01026777, "balance_loss_clip": 1.05461812, "balance_loss_mlp": 1.01948786, "epoch": 0.5739193170203811, "flos": 27269234530560.0, "grad_norm": 1.8582079800691271, "language_loss": 0.77346468, "learning_rate": 1.6206167616385162e-06, "loss": 0.79657567, "num_input_tokens_seen": 103035760, "step": 4773, "time_per_iteration": 2.7102389335632324 }, { "auxiliary_loss_clip": 0.01284967, "auxiliary_loss_mlp": 0.01031974, "balance_loss_clip": 1.05087304, "balance_loss_mlp": 1.02394843, "epoch": 0.5740395599110203, "flos": 12239721993600.0, "grad_norm": 2.620752325614196, "language_loss": 0.73480672, "learning_rate": 1.6198519620143078e-06, "loss": 0.75797611, "num_input_tokens_seen": 103052915, "step": 4774, "time_per_iteration": 2.6490001678466797 }, { "auxiliary_loss_clip": 0.01336293, "auxiliary_loss_mlp": 0.01025111, "balance_loss_clip": 1.05148721, "balance_loss_mlp": 1.0177201, "epoch": 0.5741598028016593, "flos": 25921399564800.0, "grad_norm": 1.7585444962970336, "language_loss": 0.78129947, "learning_rate": 1.6190872200582546e-06, "loss": 0.80491352, "num_input_tokens_seen": 103074655, "step": 4775, "time_per_iteration": 2.7527284622192383 }, { "auxiliary_loss_clip": 0.01278203, "auxiliary_loss_mlp": 0.02568472, "balance_loss_clip": 1.0477773, "balance_loss_mlp": 1.00005782, "epoch": 0.5742800456922984, "flos": 19244133826560.0, "grad_norm": 2.441121299611465, "language_loss": 0.78540587, "learning_rate": 1.6183225358863676e-06, "loss": 0.82387257, "num_input_tokens_seen": 103091550, "step": 4776, "time_per_iteration": 2.6765708923339844 }, { "auxiliary_loss_clip": 0.01275132, "auxiliary_loss_mlp": 0.01030507, "balance_loss_clip": 1.04663718, "balance_loss_mlp": 1.0230031, "epoch": 0.5744002885829376, "flos": 30920487932160.0, "grad_norm": 2.206865837308673, "language_loss": 0.71679688, "learning_rate": 1.617557909614648e-06, "loss": 0.73985326, "num_input_tokens_seen": 103110985, "step": 4777, "time_per_iteration": 2.8889877796173096 }, { "auxiliary_loss_clip": 0.01324471, "auxiliary_loss_mlp": 0.01032354, "balance_loss_clip": 1.04623103, "balance_loss_mlp": 1.02477789, "epoch": 0.5745205314735766, "flos": 23840017050240.0, "grad_norm": 3.5343510766312516, "language_loss": 0.8627497, "learning_rate": 1.6167933413590899e-06, "loss": 0.88631797, "num_input_tokens_seen": 103129890, "step": 4778, "time_per_iteration": 2.671372652053833 }, { "auxiliary_loss_clip": 0.01229526, "auxiliary_loss_mlp": 0.01026896, "balance_loss_clip": 1.04926944, "balance_loss_mlp": 1.01982737, "epoch": 0.5746407743642157, "flos": 12311902373760.0, "grad_norm": 2.344647590696276, "language_loss": 0.91036248, "learning_rate": 1.6160288312356773e-06, "loss": 0.93292665, "num_input_tokens_seen": 103147020, "step": 4779, "time_per_iteration": 2.685467004776001 }, { "auxiliary_loss_clip": 0.01236754, "auxiliary_loss_mlp": 0.01027177, "balance_loss_clip": 1.05105448, "balance_loss_mlp": 1.01973271, "epoch": 0.5747610172548548, "flos": 24133658734080.0, "grad_norm": 2.0485407968829588, "language_loss": 0.81603503, "learning_rate": 1.6152643793603857e-06, "loss": 0.83867443, "num_input_tokens_seen": 103167370, "step": 4780, "time_per_iteration": 2.6075875759124756 }, { "auxiliary_loss_clip": 0.01183151, "auxiliary_loss_mlp": 0.01031297, "balance_loss_clip": 1.05378914, "balance_loss_mlp": 1.02336311, "epoch": 0.5748812601454939, "flos": 25408451393280.0, "grad_norm": 1.6955943475555175, "language_loss": 0.87719882, "learning_rate": 1.6144999858491815e-06, "loss": 0.89934325, "num_input_tokens_seen": 103186000, "step": 4781, "time_per_iteration": 2.6581413745880127 }, { "auxiliary_loss_clip": 0.01280963, "auxiliary_loss_mlp": 0.01029804, "balance_loss_clip": 1.04727006, "balance_loss_mlp": 1.02171516, "epoch": 0.575001503036133, "flos": 30624942827520.0, "grad_norm": 1.8596238483620298, "language_loss": 0.85902679, "learning_rate": 1.6137356508180232e-06, "loss": 0.8821345, "num_input_tokens_seen": 103207710, "step": 4782, "time_per_iteration": 2.717061758041382 }, { "auxiliary_loss_clip": 0.01181511, "auxiliary_loss_mlp": 0.02569651, "balance_loss_clip": 1.05044818, "balance_loss_mlp": 1.00008798, "epoch": 0.5751217459267721, "flos": 21726566668800.0, "grad_norm": 1.7015866024400699, "language_loss": 0.81449699, "learning_rate": 1.6129713743828593e-06, "loss": 0.85200858, "num_input_tokens_seen": 103226720, "step": 4783, "time_per_iteration": 2.6363980770111084 }, { "auxiliary_loss_clip": 0.0128056, "auxiliary_loss_mlp": 0.01029488, "balance_loss_clip": 1.04612374, "balance_loss_mlp": 1.02203405, "epoch": 0.5752419888174112, "flos": 21651620941440.0, "grad_norm": 1.4084662880832812, "language_loss": 0.75483149, "learning_rate": 1.6122071566596306e-06, "loss": 0.77793199, "num_input_tokens_seen": 103246995, "step": 4784, "time_per_iteration": 2.6607863903045654 }, { "auxiliary_loss_clip": 0.01231764, "auxiliary_loss_mlp": 0.01032237, "balance_loss_clip": 1.0511601, "balance_loss_mlp": 1.02486384, "epoch": 0.5753622317080502, "flos": 17775997234560.0, "grad_norm": 2.58845500733219, "language_loss": 0.83317429, "learning_rate": 1.6114429977642674e-06, "loss": 0.85581434, "num_input_tokens_seen": 103261500, "step": 4785, "time_per_iteration": 2.611516237258911 }, { "auxiliary_loss_clip": 0.01233879, "auxiliary_loss_mlp": 0.01028584, "balance_loss_clip": 1.05386901, "balance_loss_mlp": 1.02132988, "epoch": 0.5754824745986894, "flos": 19789616741760.0, "grad_norm": 1.8373139357813981, "language_loss": 0.73555815, "learning_rate": 1.6106788978126926e-06, "loss": 0.75818276, "num_input_tokens_seen": 103280475, "step": 4786, "time_per_iteration": 2.598598003387451 }, { "auxiliary_loss_clip": 0.01372212, "auxiliary_loss_mlp": 0.01025848, "balance_loss_clip": 1.04151642, "balance_loss_mlp": 1.0182482, "epoch": 0.5756027174893285, "flos": 30985665160320.0, "grad_norm": 2.2664482877017083, "language_loss": 0.78696305, "learning_rate": 1.6099148569208196e-06, "loss": 0.81094366, "num_input_tokens_seen": 103297695, "step": 4787, "time_per_iteration": 2.797889232635498 }, { "auxiliary_loss_clip": 0.01288813, "auxiliary_loss_mlp": 0.01030765, "balance_loss_clip": 1.05602598, "balance_loss_mlp": 1.02183616, "epoch": 0.5757229603799675, "flos": 28546864364160.0, "grad_norm": 1.7499104360936648, "language_loss": 0.63176894, "learning_rate": 1.6091508752045523e-06, "loss": 0.65496469, "num_input_tokens_seen": 103318575, "step": 4788, "time_per_iteration": 2.701235055923462 }, { "auxiliary_loss_clip": 0.01320579, "auxiliary_loss_mlp": 0.01026554, "balance_loss_clip": 1.04305041, "balance_loss_mlp": 1.01965809, "epoch": 0.5758432032706067, "flos": 22999024944000.0, "grad_norm": 2.2593761522529636, "language_loss": 0.86351216, "learning_rate": 1.608386952779787e-06, "loss": 0.88698351, "num_input_tokens_seen": 103337945, "step": 4789, "time_per_iteration": 2.718374490737915 }, { "auxiliary_loss_clip": 0.01284928, "auxiliary_loss_mlp": 0.01023089, "balance_loss_clip": 1.04852533, "balance_loss_mlp": 1.01582932, "epoch": 0.5759634461612457, "flos": 25739727552000.0, "grad_norm": 1.620829515571466, "language_loss": 0.74826694, "learning_rate": 1.6076230897624098e-06, "loss": 0.77134705, "num_input_tokens_seen": 103360150, "step": 4790, "time_per_iteration": 2.7147178649902344 }, { "auxiliary_loss_clip": 0.01230552, "auxiliary_loss_mlp": 0.01024564, "balance_loss_clip": 1.04741836, "balance_loss_mlp": 1.0167861, "epoch": 0.5760836890518848, "flos": 30591761639040.0, "grad_norm": 2.4910817551699167, "language_loss": 0.77762294, "learning_rate": 1.6068592862682974e-06, "loss": 0.80017406, "num_input_tokens_seen": 103378305, "step": 4791, "time_per_iteration": 3.648292064666748 }, { "auxiliary_loss_clip": 0.01282511, "auxiliary_loss_mlp": 0.0102936, "balance_loss_clip": 1.05112052, "balance_loss_mlp": 1.02185297, "epoch": 0.576203931942524, "flos": 36538963447680.0, "grad_norm": 1.9900896808974597, "language_loss": 0.73815036, "learning_rate": 1.6060955424133187e-06, "loss": 0.76126909, "num_input_tokens_seen": 103399230, "step": 4792, "time_per_iteration": 2.812748432159424 }, { "auxiliary_loss_clip": 0.01229674, "auxiliary_loss_mlp": 0.01031952, "balance_loss_clip": 1.05092263, "balance_loss_mlp": 1.02445376, "epoch": 0.576324174833163, "flos": 25516937445120.0, "grad_norm": 1.8217243133592864, "language_loss": 0.89712846, "learning_rate": 1.6053318583133332e-06, "loss": 0.91974473, "num_input_tokens_seen": 103420100, "step": 4793, "time_per_iteration": 3.5296506881713867 }, { "auxiliary_loss_clip": 0.01231167, "auxiliary_loss_mlp": 0.01031628, "balance_loss_clip": 1.050843, "balance_loss_mlp": 1.02371573, "epoch": 0.5764444177238021, "flos": 25119262995840.0, "grad_norm": 2.4490043230058647, "language_loss": 0.75203204, "learning_rate": 1.6045682340841907e-06, "loss": 0.77466005, "num_input_tokens_seen": 103439025, "step": 4794, "time_per_iteration": 3.5852842330932617 }, { "auxiliary_loss_clip": 0.01227798, "auxiliary_loss_mlp": 0.02508821, "balance_loss_clip": 1.01057696, "balance_loss_mlp": 1.00023866, "epoch": 0.5765646606144411, "flos": 62212687758720.0, "grad_norm": 0.7500367498041917, "language_loss": 0.57912683, "learning_rate": 1.6038046698417336e-06, "loss": 0.61649299, "num_input_tokens_seen": 103499920, "step": 4795, "time_per_iteration": 3.226593255996704 }, { "auxiliary_loss_clip": 0.01228305, "auxiliary_loss_mlp": 0.01031319, "balance_loss_clip": 1.04791689, "balance_loss_mlp": 1.02351701, "epoch": 0.5766849035050803, "flos": 25118760205440.0, "grad_norm": 2.435746811852171, "language_loss": 0.6910429, "learning_rate": 1.6030411657017919e-06, "loss": 0.71363902, "num_input_tokens_seen": 103519575, "step": 4796, "time_per_iteration": 2.6472907066345215 }, { "auxiliary_loss_clip": 0.0122648, "auxiliary_loss_mlp": 0.01035648, "balance_loss_clip": 1.04936624, "balance_loss_mlp": 1.02796197, "epoch": 0.5768051463957193, "flos": 15991093578240.0, "grad_norm": 1.799870099401235, "language_loss": 0.84533966, "learning_rate": 1.6022777217801903e-06, "loss": 0.86796099, "num_input_tokens_seen": 103536530, "step": 4797, "time_per_iteration": 2.6663472652435303 }, { "auxiliary_loss_clip": 0.01330156, "auxiliary_loss_mlp": 0.0102886, "balance_loss_clip": 1.05097127, "balance_loss_mlp": 1.02124274, "epoch": 0.5769253892863584, "flos": 22163635359360.0, "grad_norm": 1.9129731547245359, "language_loss": 0.74081618, "learning_rate": 1.601514338192742e-06, "loss": 0.76440632, "num_input_tokens_seen": 103556460, "step": 4798, "time_per_iteration": 3.6281416416168213 }, { "auxiliary_loss_clip": 0.01175211, "auxiliary_loss_mlp": 0.01027946, "balance_loss_clip": 1.04895139, "balance_loss_mlp": 1.02116561, "epoch": 0.5770456321769976, "flos": 22856388036480.0, "grad_norm": 2.363700419654705, "language_loss": 0.71050644, "learning_rate": 1.6007510150552514e-06, "loss": 0.73253804, "num_input_tokens_seen": 103574520, "step": 4799, "time_per_iteration": 2.6197187900543213 }, { "auxiliary_loss_clip": 0.01233523, "auxiliary_loss_mlp": 0.01031001, "balance_loss_clip": 1.04839468, "balance_loss_mlp": 1.02299631, "epoch": 0.5771658750676366, "flos": 46353672489600.0, "grad_norm": 1.6437784289167907, "language_loss": 0.62070101, "learning_rate": 1.599987752483515e-06, "loss": 0.64334619, "num_input_tokens_seen": 103598965, "step": 4800, "time_per_iteration": 2.8816757202148438 }, { "auxiliary_loss_clip": 0.01326414, "auxiliary_loss_mlp": 0.01026065, "balance_loss_clip": 1.04547834, "balance_loss_mlp": 1.01860797, "epoch": 0.5772861179582757, "flos": 22159972172160.0, "grad_norm": 1.5896672190292311, "language_loss": 0.68435818, "learning_rate": 1.5992245505933184e-06, "loss": 0.707883, "num_input_tokens_seen": 103618665, "step": 4801, "time_per_iteration": 2.686903715133667 }, { "auxiliary_loss_clip": 0.01184726, "auxiliary_loss_mlp": 0.01025504, "balance_loss_clip": 1.05364716, "balance_loss_mlp": 1.01855123, "epoch": 0.5774063608489148, "flos": 31248926916480.0, "grad_norm": 2.3316194907503975, "language_loss": 0.71480864, "learning_rate": 1.5984614095004388e-06, "loss": 0.736911, "num_input_tokens_seen": 103639800, "step": 4802, "time_per_iteration": 2.6644363403320312 }, { "auxiliary_loss_clip": 0.01231609, "auxiliary_loss_mlp": 0.01028491, "balance_loss_clip": 1.05129528, "balance_loss_mlp": 1.02118301, "epoch": 0.5775266037395539, "flos": 22527123039360.0, "grad_norm": 2.5689693837002783, "language_loss": 0.81068051, "learning_rate": 1.5976983293206438e-06, "loss": 0.83328152, "num_input_tokens_seen": 103655605, "step": 4803, "time_per_iteration": 2.633852481842041 }, { "auxiliary_loss_clip": 0.01278456, "auxiliary_loss_mlp": 0.01031252, "balance_loss_clip": 1.04483461, "balance_loss_mlp": 1.02309251, "epoch": 0.577646846630193, "flos": 21068790860160.0, "grad_norm": 2.182558673707391, "language_loss": 0.71141517, "learning_rate": 1.5969353101696928e-06, "loss": 0.73451221, "num_input_tokens_seen": 103674045, "step": 4804, "time_per_iteration": 2.6870031356811523 }, { "auxiliary_loss_clip": 0.0123228, "auxiliary_loss_mlp": 0.01028569, "balance_loss_clip": 1.05033112, "balance_loss_mlp": 1.0213654, "epoch": 0.5777670895208321, "flos": 29714284293120.0, "grad_norm": 1.6791778892028784, "language_loss": 0.80036741, "learning_rate": 1.5961723521633341e-06, "loss": 0.82297593, "num_input_tokens_seen": 103695285, "step": 4805, "time_per_iteration": 2.7238833904266357 }, { "auxiliary_loss_clip": 0.01179374, "auxiliary_loss_mlp": 0.01026359, "balance_loss_clip": 1.04644418, "balance_loss_mlp": 1.01917696, "epoch": 0.5778873324114712, "flos": 19500428344320.0, "grad_norm": 1.9900594277492383, "language_loss": 0.90959299, "learning_rate": 1.5954094554173097e-06, "loss": 0.93165028, "num_input_tokens_seen": 103713275, "step": 4806, "time_per_iteration": 2.6554222106933594 }, { "auxiliary_loss_clip": 0.01285622, "auxiliary_loss_mlp": 0.01025242, "balance_loss_clip": 1.05098796, "balance_loss_mlp": 1.01832175, "epoch": 0.5780075753021102, "flos": 14136846716160.0, "grad_norm": 2.321269763078272, "language_loss": 0.79485208, "learning_rate": 1.5946466200473482e-06, "loss": 0.81796074, "num_input_tokens_seen": 103731185, "step": 4807, "time_per_iteration": 2.7147319316864014 }, { "auxiliary_loss_clip": 0.01284718, "auxiliary_loss_mlp": 0.01025807, "balance_loss_clip": 1.04787457, "balance_loss_mlp": 1.01863074, "epoch": 0.5781278181927494, "flos": 15262178883840.0, "grad_norm": 1.731544401088325, "language_loss": 0.83327675, "learning_rate": 1.5938838461691723e-06, "loss": 0.85638201, "num_input_tokens_seen": 103748095, "step": 4808, "time_per_iteration": 2.6082839965820312 }, { "auxiliary_loss_clip": 0.0118352, "auxiliary_loss_mlp": 0.01028444, "balance_loss_clip": 1.05403674, "balance_loss_mlp": 1.02117765, "epoch": 0.5782480610833884, "flos": 16726831856640.0, "grad_norm": 2.234158267709471, "language_loss": 0.83296025, "learning_rate": 1.593121133898494e-06, "loss": 0.85507989, "num_input_tokens_seen": 103765300, "step": 4809, "time_per_iteration": 2.6428661346435547 }, { "auxiliary_loss_clip": 0.01238558, "auxiliary_loss_mlp": 0.0103027, "balance_loss_clip": 1.05293691, "balance_loss_mlp": 1.02301645, "epoch": 0.5783683039740275, "flos": 25482140144640.0, "grad_norm": 2.5637402119041974, "language_loss": 0.78982174, "learning_rate": 1.592358483351016e-06, "loss": 0.81251001, "num_input_tokens_seen": 103785475, "step": 4810, "time_per_iteration": 2.6304194927215576 }, { "auxiliary_loss_clip": 0.01226018, "auxiliary_loss_mlp": 0.01027218, "balance_loss_clip": 1.04992926, "balance_loss_mlp": 1.02000546, "epoch": 0.5784885468646667, "flos": 18405835240320.0, "grad_norm": 2.118972807050266, "language_loss": 0.72088492, "learning_rate": 1.5915958946424326e-06, "loss": 0.74341726, "num_input_tokens_seen": 103804160, "step": 4811, "time_per_iteration": 2.63069748878479 }, { "auxiliary_loss_clip": 0.01339621, "auxiliary_loss_mlp": 0.02570954, "balance_loss_clip": 1.05106974, "balance_loss_mlp": 1.00013638, "epoch": 0.5786087897553057, "flos": 46100717936640.0, "grad_norm": 5.3087604541277065, "language_loss": 0.74325836, "learning_rate": 1.5908333678884271e-06, "loss": 0.78236401, "num_input_tokens_seen": 103830580, "step": 4812, "time_per_iteration": 2.881743907928467 }, { "auxiliary_loss_clip": 0.01230544, "auxiliary_loss_mlp": 0.01027829, "balance_loss_clip": 1.05251563, "balance_loss_mlp": 1.02077138, "epoch": 0.5787290326459448, "flos": 12385950261120.0, "grad_norm": 1.9179577014403295, "language_loss": 0.73726285, "learning_rate": 1.5900709032046743e-06, "loss": 0.75984657, "num_input_tokens_seen": 103848655, "step": 4813, "time_per_iteration": 2.646627187728882 }, { "auxiliary_loss_clip": 0.01278729, "auxiliary_loss_mlp": 0.0102982, "balance_loss_clip": 1.05207324, "balance_loss_mlp": 1.02217317, "epoch": 0.5788492755365839, "flos": 23290332243840.0, "grad_norm": 1.9720035265371234, "language_loss": 0.78438139, "learning_rate": 1.5893085007068391e-06, "loss": 0.80746686, "num_input_tokens_seen": 103866215, "step": 4814, "time_per_iteration": 2.640532970428467 }, { "auxiliary_loss_clip": 0.01272979, "auxiliary_loss_mlp": 0.01030868, "balance_loss_clip": 1.04479694, "balance_loss_mlp": 1.02205825, "epoch": 0.578969518427223, "flos": 24061047390720.0, "grad_norm": 2.241207615653504, "language_loss": 0.71115792, "learning_rate": 1.5885461605105786e-06, "loss": 0.73419636, "num_input_tokens_seen": 103887815, "step": 4815, "time_per_iteration": 2.727259635925293 }, { "auxiliary_loss_clip": 0.01289972, "auxiliary_loss_mlp": 0.0102773, "balance_loss_clip": 1.05245268, "balance_loss_mlp": 1.01993394, "epoch": 0.579089761317862, "flos": 21871825269120.0, "grad_norm": 2.272715159166717, "language_loss": 0.77203906, "learning_rate": 1.5877838827315375e-06, "loss": 0.79521608, "num_input_tokens_seen": 103906360, "step": 4816, "time_per_iteration": 2.685189962387085 }, { "auxiliary_loss_clip": 0.01182644, "auxiliary_loss_mlp": 0.01033407, "balance_loss_clip": 1.05453122, "balance_loss_mlp": 1.02605486, "epoch": 0.5792100042085012, "flos": 22929681738240.0, "grad_norm": 2.795079008196703, "language_loss": 0.70458865, "learning_rate": 1.587021667485355e-06, "loss": 0.72674918, "num_input_tokens_seen": 103925730, "step": 4817, "time_per_iteration": 3.5299205780029297 }, { "auxiliary_loss_clip": 0.01283405, "auxiliary_loss_mlp": 0.01025328, "balance_loss_clip": 1.04765105, "balance_loss_mlp": 1.0184021, "epoch": 0.5793302470991403, "flos": 21470056669440.0, "grad_norm": 1.75175457919788, "language_loss": 0.78471959, "learning_rate": 1.5862595148876559e-06, "loss": 0.80780685, "num_input_tokens_seen": 103945835, "step": 4818, "time_per_iteration": 2.6879210472106934 }, { "auxiliary_loss_clip": 0.01387243, "auxiliary_loss_mlp": 0.01033536, "balance_loss_clip": 1.04950726, "balance_loss_mlp": 1.02523923, "epoch": 0.5794504899897793, "flos": 12711013367040.0, "grad_norm": 2.33178823390954, "language_loss": 0.76065099, "learning_rate": 1.58549742505406e-06, "loss": 0.78485882, "num_input_tokens_seen": 103960580, "step": 4819, "time_per_iteration": 3.6230781078338623 }, { "auxiliary_loss_clip": 0.01180911, "auxiliary_loss_mlp": 0.01028008, "balance_loss_clip": 1.0504508, "balance_loss_mlp": 1.02004433, "epoch": 0.5795707328804185, "flos": 14867054300160.0, "grad_norm": 2.5995514640334965, "language_loss": 0.75689006, "learning_rate": 1.5847353981001747e-06, "loss": 0.77897924, "num_input_tokens_seen": 103977760, "step": 4820, "time_per_iteration": 3.4900455474853516 }, { "auxiliary_loss_clip": 0.01278362, "auxiliary_loss_mlp": 0.01029009, "balance_loss_clip": 1.04513121, "balance_loss_mlp": 1.02093256, "epoch": 0.5796909757710575, "flos": 36430046432640.0, "grad_norm": 1.744666156371003, "language_loss": 0.69842374, "learning_rate": 1.5839734341415993e-06, "loss": 0.72149742, "num_input_tokens_seen": 103999960, "step": 4821, "time_per_iteration": 2.874349355697632 }, { "auxiliary_loss_clip": 0.01227997, "auxiliary_loss_mlp": 0.01027065, "balance_loss_clip": 1.05479157, "balance_loss_mlp": 1.02055883, "epoch": 0.5798112186616966, "flos": 23039891642880.0, "grad_norm": 1.5709638015616176, "language_loss": 0.76552582, "learning_rate": 1.5832115332939238e-06, "loss": 0.78807646, "num_input_tokens_seen": 104018400, "step": 4822, "time_per_iteration": 2.662214756011963 }, { "auxiliary_loss_clip": 0.01232956, "auxiliary_loss_mlp": 0.01028738, "balance_loss_clip": 1.05211067, "balance_loss_mlp": 1.02162111, "epoch": 0.5799314615523358, "flos": 16652604401280.0, "grad_norm": 1.5766886873598256, "language_loss": 0.74722499, "learning_rate": 1.5824496956727272e-06, "loss": 0.76984191, "num_input_tokens_seen": 104035605, "step": 4823, "time_per_iteration": 2.6852262020111084 }, { "auxiliary_loss_clip": 0.01278623, "auxiliary_loss_mlp": 0.01024486, "balance_loss_clip": 1.04698515, "balance_loss_mlp": 1.01735163, "epoch": 0.5800517044429748, "flos": 20485673470080.0, "grad_norm": 2.472379708660372, "language_loss": 0.73089969, "learning_rate": 1.5816879213935797e-06, "loss": 0.75393081, "num_input_tokens_seen": 104054415, "step": 4824, "time_per_iteration": 3.501483201980591 }, { "auxiliary_loss_clip": 0.0122719, "auxiliary_loss_mlp": 0.01026735, "balance_loss_clip": 1.05097592, "balance_loss_mlp": 1.01983857, "epoch": 0.5801719473336139, "flos": 31538258968320.0, "grad_norm": 1.806251251230941, "language_loss": 0.79887593, "learning_rate": 1.5809262105720416e-06, "loss": 0.82141519, "num_input_tokens_seen": 104075455, "step": 4825, "time_per_iteration": 2.693979263305664 }, { "auxiliary_loss_clip": 0.01179161, "auxiliary_loss_mlp": 0.01028089, "balance_loss_clip": 1.05166435, "balance_loss_mlp": 1.02097821, "epoch": 0.580292190224253, "flos": 20375966355840.0, "grad_norm": 1.869206833053959, "language_loss": 0.7947346, "learning_rate": 1.5801645633236644e-06, "loss": 0.81680715, "num_input_tokens_seen": 104096440, "step": 4826, "time_per_iteration": 2.630711078643799 }, { "auxiliary_loss_clip": 0.01273623, "auxiliary_loss_mlp": 0.01028873, "balance_loss_clip": 1.04527676, "balance_loss_mlp": 1.02136636, "epoch": 0.5804124331148921, "flos": 26615373304320.0, "grad_norm": 1.90727517965331, "language_loss": 0.77544498, "learning_rate": 1.579402979763989e-06, "loss": 0.79846996, "num_input_tokens_seen": 104116775, "step": 4827, "time_per_iteration": 2.6901824474334717 }, { "auxiliary_loss_clip": 0.01342219, "auxiliary_loss_mlp": 0.01024883, "balance_loss_clip": 1.04652309, "balance_loss_mlp": 1.01785302, "epoch": 0.5805326760055312, "flos": 13478496289920.0, "grad_norm": 2.174281632677387, "language_loss": 0.81211573, "learning_rate": 1.578641460008548e-06, "loss": 0.8357867, "num_input_tokens_seen": 104134510, "step": 4828, "time_per_iteration": 2.7782506942749023 }, { "auxiliary_loss_clip": 0.01225803, "auxiliary_loss_mlp": 0.01026086, "balance_loss_clip": 1.04972959, "balance_loss_mlp": 1.01895452, "epoch": 0.5806529188961702, "flos": 12091374823680.0, "grad_norm": 2.2979740790139536, "language_loss": 0.68355614, "learning_rate": 1.5778800041728613e-06, "loss": 0.70607501, "num_input_tokens_seen": 104150800, "step": 4829, "time_per_iteration": 2.592829942703247 }, { "auxiliary_loss_clip": 0.01225674, "auxiliary_loss_mlp": 0.01023594, "balance_loss_clip": 1.05074883, "balance_loss_mlp": 1.0166831, "epoch": 0.5807731617868094, "flos": 26214107495040.0, "grad_norm": 1.5467400022033504, "language_loss": 0.66102952, "learning_rate": 1.577118612372443e-06, "loss": 0.68352222, "num_input_tokens_seen": 104172640, "step": 4830, "time_per_iteration": 2.8792619705200195 }, { "auxiliary_loss_clip": 0.01276108, "auxiliary_loss_mlp": 0.02570292, "balance_loss_clip": 1.04701471, "balance_loss_mlp": 1.00004578, "epoch": 0.5808934046774484, "flos": 37962139190400.0, "grad_norm": 2.4570303343346596, "language_loss": 0.70617473, "learning_rate": 1.5763572847227943e-06, "loss": 0.74463868, "num_input_tokens_seen": 104193525, "step": 4831, "time_per_iteration": 2.797685146331787 }, { "auxiliary_loss_clip": 0.01227385, "auxiliary_loss_mlp": 0.01025088, "balance_loss_clip": 1.04743028, "balance_loss_mlp": 1.01861811, "epoch": 0.5810136475680875, "flos": 20485853038080.0, "grad_norm": 2.3535103702669082, "language_loss": 0.81662625, "learning_rate": 1.5755960213394091e-06, "loss": 0.83915102, "num_input_tokens_seen": 104210625, "step": 4832, "time_per_iteration": 2.703148603439331 }, { "auxiliary_loss_clip": 0.01233476, "auxiliary_loss_mlp": 0.01025771, "balance_loss_clip": 1.04698122, "balance_loss_mlp": 1.01896095, "epoch": 0.5811338904587267, "flos": 17530153574400.0, "grad_norm": 1.8644100318575374, "language_loss": 0.78451765, "learning_rate": 1.5748348223377703e-06, "loss": 0.80711007, "num_input_tokens_seen": 104228180, "step": 4833, "time_per_iteration": 2.807342529296875 }, { "auxiliary_loss_clip": 0.012774, "auxiliary_loss_mlp": 0.01023925, "balance_loss_clip": 1.0494554, "balance_loss_mlp": 1.0169754, "epoch": 0.5812541333493657, "flos": 19458017360640.0, "grad_norm": 1.5944308543982018, "language_loss": 0.78098243, "learning_rate": 1.5740736878333507e-06, "loss": 0.80399573, "num_input_tokens_seen": 104246020, "step": 4834, "time_per_iteration": 2.6747958660125732 }, { "auxiliary_loss_clip": 0.01283665, "auxiliary_loss_mlp": 0.01027039, "balance_loss_clip": 1.04823315, "balance_loss_mlp": 1.0192194, "epoch": 0.5813743762400048, "flos": 20594949621120.0, "grad_norm": 4.201151641071148, "language_loss": 0.78006279, "learning_rate": 1.5733126179416143e-06, "loss": 0.80316985, "num_input_tokens_seen": 104260505, "step": 4835, "time_per_iteration": 2.6477737426757812 }, { "auxiliary_loss_clip": 0.01226509, "auxiliary_loss_mlp": 0.0102201, "balance_loss_clip": 1.04924583, "balance_loss_mlp": 1.01481545, "epoch": 0.5814946191306439, "flos": 33178227246720.0, "grad_norm": 2.215275937590883, "language_loss": 0.72159445, "learning_rate": 1.5725516127780137e-06, "loss": 0.74407959, "num_input_tokens_seen": 104282640, "step": 4836, "time_per_iteration": 2.802097797393799 }, { "auxiliary_loss_clip": 0.01234239, "auxiliary_loss_mlp": 0.01024288, "balance_loss_clip": 1.04861546, "balance_loss_mlp": 1.01698601, "epoch": 0.581614862021283, "flos": 16143283503360.0, "grad_norm": 3.713376823607042, "language_loss": 0.88857073, "learning_rate": 1.5717906724579943e-06, "loss": 0.91115594, "num_input_tokens_seen": 104299700, "step": 4837, "time_per_iteration": 2.6694746017456055 }, { "auxiliary_loss_clip": 0.01337403, "auxiliary_loss_mlp": 0.01026032, "balance_loss_clip": 1.0460906, "balance_loss_mlp": 1.01874816, "epoch": 0.581735104911922, "flos": 33802642298880.0, "grad_norm": 2.1032924858356212, "language_loss": 0.68200749, "learning_rate": 1.571029797096989e-06, "loss": 0.70564181, "num_input_tokens_seen": 104320805, "step": 4838, "time_per_iteration": 2.7962183952331543 }, { "auxiliary_loss_clip": 0.01177896, "auxiliary_loss_mlp": 0.01026035, "balance_loss_clip": 1.04995227, "balance_loss_mlp": 1.01908529, "epoch": 0.5818553478025612, "flos": 23331163029120.0, "grad_norm": 1.871598908674626, "language_loss": 0.79230797, "learning_rate": 1.570268986810423e-06, "loss": 0.81434727, "num_input_tokens_seen": 104340700, "step": 4839, "time_per_iteration": 2.6555752754211426 }, { "auxiliary_loss_clip": 0.01275842, "auxiliary_loss_mlp": 0.01025112, "balance_loss_clip": 1.04710174, "balance_loss_mlp": 1.01799846, "epoch": 0.5819755906932003, "flos": 20996143603200.0, "grad_norm": 4.941044375805735, "language_loss": 0.74943435, "learning_rate": 1.5695082417137096e-06, "loss": 0.77244389, "num_input_tokens_seen": 104358575, "step": 4840, "time_per_iteration": 2.612604856491089 }, { "auxiliary_loss_clip": 0.01275816, "auxiliary_loss_mlp": 0.01027993, "balance_loss_clip": 1.04491973, "balance_loss_mlp": 1.02139497, "epoch": 0.5820958335838393, "flos": 21431668008960.0, "grad_norm": 1.7521577694618597, "language_loss": 0.75805008, "learning_rate": 1.5687475619222539e-06, "loss": 0.78108817, "num_input_tokens_seen": 104378530, "step": 4841, "time_per_iteration": 2.7020676136016846 }, { "auxiliary_loss_clip": 0.01278406, "auxiliary_loss_mlp": 0.01025577, "balance_loss_clip": 1.04645181, "balance_loss_mlp": 1.01849937, "epoch": 0.5822160764744785, "flos": 17967473660160.0, "grad_norm": 2.185038307732515, "language_loss": 0.73468983, "learning_rate": 1.5679869475514496e-06, "loss": 0.75772965, "num_input_tokens_seen": 104395465, "step": 4842, "time_per_iteration": 2.6540772914886475 }, { "auxiliary_loss_clip": 0.01225694, "auxiliary_loss_mlp": 0.01030649, "balance_loss_clip": 1.04824972, "balance_loss_mlp": 1.02313638, "epoch": 0.5823363193651175, "flos": 23033858158080.0, "grad_norm": 2.014111977216435, "language_loss": 0.81173813, "learning_rate": 1.567226398716682e-06, "loss": 0.83430159, "num_input_tokens_seen": 104415380, "step": 4843, "time_per_iteration": 3.5804665088653564 }, { "auxiliary_loss_clip": 0.01288374, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.05057728, "balance_loss_mlp": 1.02728224, "epoch": 0.5824565622557566, "flos": 32891840110080.0, "grad_norm": 1.729620737474344, "language_loss": 0.61962694, "learning_rate": 1.566465915533326e-06, "loss": 0.64286274, "num_input_tokens_seen": 104437410, "step": 4844, "time_per_iteration": 3.764631748199463 }, { "auxiliary_loss_clip": 0.01225497, "auxiliary_loss_mlp": 0.0102711, "balance_loss_clip": 1.04804242, "balance_loss_mlp": 1.01986182, "epoch": 0.5825768051463958, "flos": 22229674513920.0, "grad_norm": 2.6272636903491002, "language_loss": 0.8823477, "learning_rate": 1.5657054981167458e-06, "loss": 0.90487379, "num_input_tokens_seen": 104456305, "step": 4845, "time_per_iteration": 3.5296711921691895 }, { "auxiliary_loss_clip": 0.01225208, "auxiliary_loss_mlp": 0.01023056, "balance_loss_clip": 1.04730153, "balance_loss_mlp": 1.0163238, "epoch": 0.5826970480370348, "flos": 28001561016960.0, "grad_norm": 1.8206936142666568, "language_loss": 0.67994767, "learning_rate": 1.5649451465822965e-06, "loss": 0.70243037, "num_input_tokens_seen": 104477695, "step": 4846, "time_per_iteration": 2.704608201980591 }, { "auxiliary_loss_clip": 0.01376499, "auxiliary_loss_mlp": 0.01028414, "balance_loss_clip": 1.0482645, "balance_loss_mlp": 1.02147293, "epoch": 0.5828172909276739, "flos": 17858053854720.0, "grad_norm": 1.73099298143603, "language_loss": 0.83861411, "learning_rate": 1.5641848610453218e-06, "loss": 0.86266327, "num_input_tokens_seen": 104496355, "step": 4847, "time_per_iteration": 2.705509662628174 }, { "auxiliary_loss_clip": 0.01226127, "auxiliary_loss_mlp": 0.0102534, "balance_loss_clip": 1.04932523, "balance_loss_mlp": 1.01787758, "epoch": 0.582937533818313, "flos": 19865244827520.0, "grad_norm": 2.4619939873528494, "language_loss": 0.86238885, "learning_rate": 1.563424641621158e-06, "loss": 0.88490349, "num_input_tokens_seen": 104515535, "step": 4848, "time_per_iteration": 2.622224807739258 }, { "auxiliary_loss_clip": 0.01282426, "auxiliary_loss_mlp": 0.01031623, "balance_loss_clip": 1.0486939, "balance_loss_mlp": 1.02404141, "epoch": 0.5830577767089521, "flos": 26870734068480.0, "grad_norm": 2.339824930721859, "language_loss": 0.69978344, "learning_rate": 1.5626644884251282e-06, "loss": 0.72292393, "num_input_tokens_seen": 104535055, "step": 4849, "time_per_iteration": 2.7268850803375244 }, { "auxiliary_loss_clip": 0.01180555, "auxiliary_loss_mlp": 0.01022902, "balance_loss_clip": 1.05125833, "balance_loss_mlp": 1.01630068, "epoch": 0.5831780195995911, "flos": 25298205575040.0, "grad_norm": 1.8180830366339165, "language_loss": 0.88092351, "learning_rate": 1.5619044015725488e-06, "loss": 0.9029581, "num_input_tokens_seen": 104554745, "step": 4850, "time_per_iteration": 3.5213418006896973 }, { "auxiliary_loss_clip": 0.01188686, "auxiliary_loss_mlp": 0.0102968, "balance_loss_clip": 1.05549383, "balance_loss_mlp": 1.02192533, "epoch": 0.5832982624902303, "flos": 14756988049920.0, "grad_norm": 2.286041133119409, "language_loss": 0.86902404, "learning_rate": 1.5611443811787224e-06, "loss": 0.8912077, "num_input_tokens_seen": 104568870, "step": 4851, "time_per_iteration": 2.629037618637085 }, { "auxiliary_loss_clip": 0.01223223, "auxiliary_loss_mlp": 0.01027564, "balance_loss_clip": 1.04888868, "balance_loss_mlp": 1.02030969, "epoch": 0.5834185053808694, "flos": 20444555376000.0, "grad_norm": 2.262701474149554, "language_loss": 0.69566929, "learning_rate": 1.560384427358945e-06, "loss": 0.71817714, "num_input_tokens_seen": 104588415, "step": 4852, "time_per_iteration": 2.6456990242004395 }, { "auxiliary_loss_clip": 0.01274999, "auxiliary_loss_mlp": 0.01027871, "balance_loss_clip": 1.04356802, "balance_loss_mlp": 1.02019954, "epoch": 0.5835387482715084, "flos": 27200394115200.0, "grad_norm": 3.4025726155310583, "language_loss": 0.73109829, "learning_rate": 1.5596245402284998e-06, "loss": 0.75412703, "num_input_tokens_seen": 104611940, "step": 4853, "time_per_iteration": 2.7701127529144287 }, { "auxiliary_loss_clip": 0.01233587, "auxiliary_loss_mlp": 0.0102899, "balance_loss_clip": 1.05390954, "balance_loss_mlp": 1.02195024, "epoch": 0.5836589911621476, "flos": 16654615562880.0, "grad_norm": 1.8162073619363863, "language_loss": 0.82020545, "learning_rate": 1.5588647199026619e-06, "loss": 0.84283125, "num_input_tokens_seen": 104629675, "step": 4854, "time_per_iteration": 2.5997965335845947 }, { "auxiliary_loss_clip": 0.01186812, "auxiliary_loss_mlp": 0.0102854, "balance_loss_clip": 1.05503798, "balance_loss_mlp": 1.02069044, "epoch": 0.5837792340527866, "flos": 20446817932800.0, "grad_norm": 2.136248848528647, "language_loss": 0.87464166, "learning_rate": 1.5581049664966956e-06, "loss": 0.89679515, "num_input_tokens_seen": 104647435, "step": 4855, "time_per_iteration": 2.6021649837493896 }, { "auxiliary_loss_clip": 0.01341042, "auxiliary_loss_mlp": 0.01001956, "balance_loss_clip": 1.01051557, "balance_loss_mlp": 1.0008831, "epoch": 0.5838994769434257, "flos": 65995480765440.0, "grad_norm": 0.9887608753668745, "language_loss": 0.65036178, "learning_rate": 1.5573452801258545e-06, "loss": 0.67379177, "num_input_tokens_seen": 104694605, "step": 4856, "time_per_iteration": 3.121966600418091 }, { "auxiliary_loss_clip": 0.01239117, "auxiliary_loss_mlp": 0.01027216, "balance_loss_clip": 1.05273092, "balance_loss_mlp": 1.01985431, "epoch": 0.5840197198340649, "flos": 21470523546240.0, "grad_norm": 2.2228892841275285, "language_loss": 0.63716245, "learning_rate": 1.5565856609053824e-06, "loss": 0.6598258, "num_input_tokens_seen": 104713400, "step": 4857, "time_per_iteration": 2.6598527431488037 }, { "auxiliary_loss_clip": 0.01182467, "auxiliary_loss_mlp": 0.0103396, "balance_loss_clip": 1.05282044, "balance_loss_mlp": 1.02590704, "epoch": 0.5841399627247039, "flos": 19135144984320.0, "grad_norm": 2.194533631779001, "language_loss": 0.80119431, "learning_rate": 1.5558261089505127e-06, "loss": 0.82335854, "num_input_tokens_seen": 104732130, "step": 4858, "time_per_iteration": 2.615567684173584 }, { "auxiliary_loss_clip": 0.01230197, "auxiliary_loss_mlp": 0.010275, "balance_loss_clip": 1.05140138, "balance_loss_mlp": 1.02010274, "epoch": 0.584260205615343, "flos": 26425692558720.0, "grad_norm": 2.0890239022740587, "language_loss": 0.80079508, "learning_rate": 1.5550666243764697e-06, "loss": 0.82337201, "num_input_tokens_seen": 104750290, "step": 4859, "time_per_iteration": 2.6441969871520996 }, { "auxiliary_loss_clip": 0.01229584, "auxiliary_loss_mlp": 0.01027088, "balance_loss_clip": 1.05061173, "balance_loss_mlp": 1.01978958, "epoch": 0.584380448505982, "flos": 13881809174400.0, "grad_norm": 2.35505343464797, "language_loss": 0.77842104, "learning_rate": 1.554307207298465e-06, "loss": 0.80098778, "num_input_tokens_seen": 104768550, "step": 4860, "time_per_iteration": 2.62872314453125 }, { "auxiliary_loss_clip": 0.01183554, "auxiliary_loss_mlp": 0.01024696, "balance_loss_clip": 1.05257416, "balance_loss_mlp": 1.0166254, "epoch": 0.5845006913966212, "flos": 21543709507200.0, "grad_norm": 3.6071499289872677, "language_loss": 0.79036343, "learning_rate": 1.553547857831704e-06, "loss": 0.81244594, "num_input_tokens_seen": 104785060, "step": 4861, "time_per_iteration": 2.554435968399048 }, { "auxiliary_loss_clip": 0.01067526, "auxiliary_loss_mlp": 0.01002128, "balance_loss_clip": 1.00978303, "balance_loss_mlp": 1.00121641, "epoch": 0.5846209342872603, "flos": 58375452712320.0, "grad_norm": 0.8844613864978814, "language_loss": 0.64139724, "learning_rate": 1.5527885760913771e-06, "loss": 0.66209376, "num_input_tokens_seen": 104834950, "step": 4862, "time_per_iteration": 2.985801935195923 }, { "auxiliary_loss_clip": 0.01278866, "auxiliary_loss_mlp": 0.0102881, "balance_loss_clip": 1.05106258, "balance_loss_mlp": 1.02202106, "epoch": 0.5847411771778993, "flos": 18588045957120.0, "grad_norm": 1.624249311798478, "language_loss": 0.7677232, "learning_rate": 1.552029362192668e-06, "loss": 0.79079998, "num_input_tokens_seen": 104854210, "step": 4863, "time_per_iteration": 2.644968032836914 }, { "auxiliary_loss_clip": 0.01325126, "auxiliary_loss_mlp": 0.01026871, "balance_loss_clip": 1.0466907, "balance_loss_mlp": 1.0194298, "epoch": 0.5848614200685385, "flos": 24240780069120.0, "grad_norm": 3.3786523461187525, "language_loss": 0.72816324, "learning_rate": 1.5512702162507478e-06, "loss": 0.75168318, "num_input_tokens_seen": 104874525, "step": 4864, "time_per_iteration": 2.7942891120910645 }, { "auxiliary_loss_clip": 0.01171997, "auxiliary_loss_mlp": 0.01001188, "balance_loss_clip": 1.00777459, "balance_loss_mlp": 1.00025773, "epoch": 0.5849816629591775, "flos": 71660245933440.0, "grad_norm": 1.0844009218728101, "language_loss": 0.55655968, "learning_rate": 1.5505111383807792e-06, "loss": 0.57829154, "num_input_tokens_seen": 104937195, "step": 4865, "time_per_iteration": 3.297513484954834 }, { "auxiliary_loss_clip": 0.01380598, "auxiliary_loss_mlp": 0.01033974, "balance_loss_clip": 1.04572761, "balance_loss_mlp": 1.02654767, "epoch": 0.5851019058498166, "flos": 23802095266560.0, "grad_norm": 1.9194578783197533, "language_loss": 0.80840737, "learning_rate": 1.5497521286979138e-06, "loss": 0.83255303, "num_input_tokens_seen": 104957435, "step": 4866, "time_per_iteration": 2.741168737411499 }, { "auxiliary_loss_clip": 0.01335428, "auxiliary_loss_mlp": 0.01025441, "balance_loss_clip": 1.04782403, "balance_loss_mlp": 1.01721573, "epoch": 0.5852221487404557, "flos": 24388516707840.0, "grad_norm": 2.0782930992104696, "language_loss": 0.74765921, "learning_rate": 1.5489931873172927e-06, "loss": 0.77126789, "num_input_tokens_seen": 104978755, "step": 4867, "time_per_iteration": 2.766758680343628 }, { "auxiliary_loss_clip": 0.0141486, "auxiliary_loss_mlp": 0.01029973, "balance_loss_clip": 1.038481, "balance_loss_mlp": 1.02252555, "epoch": 0.5853423916310948, "flos": 27271425260160.0, "grad_norm": 4.446565618672845, "language_loss": 0.79279333, "learning_rate": 1.5482343143540467e-06, "loss": 0.81724173, "num_input_tokens_seen": 105000020, "step": 4868, "time_per_iteration": 2.8766002655029297 }, { "auxiliary_loss_clip": 0.01231756, "auxiliary_loss_mlp": 0.02565804, "balance_loss_clip": 1.04653251, "balance_loss_mlp": 1.00007367, "epoch": 0.5854626345217339, "flos": 11983786611840.0, "grad_norm": 2.213372081468802, "language_loss": 0.82905316, "learning_rate": 1.547475509923295e-06, "loss": 0.86702877, "num_input_tokens_seen": 105017060, "step": 4869, "time_per_iteration": 3.7288763523101807 }, { "auxiliary_loss_clip": 0.01183239, "auxiliary_loss_mlp": 0.01000303, "balance_loss_clip": 1.00789404, "balance_loss_mlp": 0.99930811, "epoch": 0.585582877412373, "flos": 64342335173760.0, "grad_norm": 0.7251557890195105, "language_loss": 0.56085795, "learning_rate": 1.5467167741401495e-06, "loss": 0.5826934, "num_input_tokens_seen": 105078540, "step": 4870, "time_per_iteration": 4.171045541763306 }, { "auxiliary_loss_clip": 0.01278809, "auxiliary_loss_mlp": 0.01025082, "balance_loss_clip": 1.04507041, "balance_loss_mlp": 1.01781023, "epoch": 0.5857031203030121, "flos": 17011926103680.0, "grad_norm": 2.4884213988673, "language_loss": 0.71343338, "learning_rate": 1.5459581071197083e-06, "loss": 0.73647225, "num_input_tokens_seen": 105094200, "step": 4871, "time_per_iteration": 3.5228490829467773 }, { "auxiliary_loss_clip": 0.01232077, "auxiliary_loss_mlp": 0.01028523, "balance_loss_clip": 1.05172849, "balance_loss_mlp": 1.02141809, "epoch": 0.5858233631936511, "flos": 20885682303360.0, "grad_norm": 2.590804599974693, "language_loss": 0.83063281, "learning_rate": 1.5451995089770624e-06, "loss": 0.85323882, "num_input_tokens_seen": 105113985, "step": 4872, "time_per_iteration": 2.6350066661834717 }, { "auxiliary_loss_clip": 0.01179327, "auxiliary_loss_mlp": 0.0102591, "balance_loss_clip": 1.05222344, "balance_loss_mlp": 1.01925802, "epoch": 0.5859436060842903, "flos": 23191902000000.0, "grad_norm": 1.4767662141615823, "language_loss": 0.71815538, "learning_rate": 1.5444409798272885e-06, "loss": 0.74020773, "num_input_tokens_seen": 105138075, "step": 4873, "time_per_iteration": 2.697554349899292 }, { "auxiliary_loss_clip": 0.01328315, "auxiliary_loss_mlp": 0.01025342, "balance_loss_clip": 1.0461278, "balance_loss_mlp": 1.01828814, "epoch": 0.5860638489749294, "flos": 22492648961280.0, "grad_norm": 1.735769549963971, "language_loss": 0.80273783, "learning_rate": 1.543682519785456e-06, "loss": 0.82627439, "num_input_tokens_seen": 105156555, "step": 4874, "time_per_iteration": 2.725426435470581 }, { "auxiliary_loss_clip": 0.01277407, "auxiliary_loss_mlp": 0.01027198, "balance_loss_clip": 1.04674172, "balance_loss_mlp": 1.02033424, "epoch": 0.5861840918655684, "flos": 17566243764480.0, "grad_norm": 2.9006918176079295, "language_loss": 0.80534518, "learning_rate": 1.5429241289666219e-06, "loss": 0.82839125, "num_input_tokens_seen": 105174055, "step": 4875, "time_per_iteration": 2.663512706756592 }, { "auxiliary_loss_clip": 0.01274737, "auxiliary_loss_mlp": 0.01028416, "balance_loss_clip": 1.04751182, "balance_loss_mlp": 1.02128422, "epoch": 0.5863043347562076, "flos": 25556152118400.0, "grad_norm": 2.094731679885173, "language_loss": 0.69787788, "learning_rate": 1.5421658074858342e-06, "loss": 0.72090936, "num_input_tokens_seen": 105192160, "step": 4876, "time_per_iteration": 3.6098787784576416 }, { "auxiliary_loss_clip": 0.01275336, "auxiliary_loss_mlp": 0.01028756, "balance_loss_clip": 1.04776287, "balance_loss_mlp": 1.0206883, "epoch": 0.5864245776468466, "flos": 20667525050880.0, "grad_norm": 2.1159662685825618, "language_loss": 0.66213822, "learning_rate": 1.5414075554581298e-06, "loss": 0.68517923, "num_input_tokens_seen": 105210205, "step": 4877, "time_per_iteration": 2.7206687927246094 }, { "auxiliary_loss_clip": 0.01183381, "auxiliary_loss_mlp": 0.010311, "balance_loss_clip": 1.05252516, "balance_loss_mlp": 1.02341104, "epoch": 0.5865448205374857, "flos": 28913907490560.0, "grad_norm": 2.4450014153567, "language_loss": 0.780402, "learning_rate": 1.5406493729985348e-06, "loss": 0.80254686, "num_input_tokens_seen": 105229400, "step": 4878, "time_per_iteration": 2.6856584548950195 }, { "auxiliary_loss_clip": 0.01286675, "auxiliary_loss_mlp": 0.02566839, "balance_loss_clip": 1.04997826, "balance_loss_mlp": 1.0000304, "epoch": 0.5866650634281249, "flos": 25842575168640.0, "grad_norm": 2.049020735797245, "language_loss": 0.720415, "learning_rate": 1.5398912602220644e-06, "loss": 0.75895011, "num_input_tokens_seen": 105248675, "step": 4879, "time_per_iteration": 2.747319459915161 }, { "auxiliary_loss_clip": 0.01292535, "auxiliary_loss_mlp": 0.01030212, "balance_loss_clip": 1.04828262, "balance_loss_mlp": 1.02278793, "epoch": 0.5867853063187639, "flos": 17052325925760.0, "grad_norm": 2.779498105598538, "language_loss": 0.78762424, "learning_rate": 1.539133217243724e-06, "loss": 0.81085169, "num_input_tokens_seen": 105265695, "step": 4880, "time_per_iteration": 2.7096054553985596 }, { "auxiliary_loss_clip": 0.01334114, "auxiliary_loss_mlp": 0.01032632, "balance_loss_clip": 1.0479629, "balance_loss_mlp": 1.02470422, "epoch": 0.586905549209403, "flos": 24645026707200.0, "grad_norm": 2.561190026680902, "language_loss": 0.75993133, "learning_rate": 1.5383752441785081e-06, "loss": 0.78359878, "num_input_tokens_seen": 105284920, "step": 4881, "time_per_iteration": 2.7251193523406982 }, { "auxiliary_loss_clip": 0.01234975, "auxiliary_loss_mlp": 0.01032836, "balance_loss_clip": 1.05078316, "balance_loss_mlp": 1.02502489, "epoch": 0.5870257921000421, "flos": 14720538723840.0, "grad_norm": 2.218530458053412, "language_loss": 0.8600018, "learning_rate": 1.5376173411414003e-06, "loss": 0.88267994, "num_input_tokens_seen": 105302960, "step": 4882, "time_per_iteration": 2.619295120239258 }, { "auxiliary_loss_clip": 0.01280071, "auxiliary_loss_mlp": 0.01026762, "balance_loss_clip": 1.04471278, "balance_loss_mlp": 1.01933801, "epoch": 0.5871460349906812, "flos": 23914998691200.0, "grad_norm": 1.8752600933302197, "language_loss": 0.78839397, "learning_rate": 1.5368595082473753e-06, "loss": 0.81146234, "num_input_tokens_seen": 105321260, "step": 4883, "time_per_iteration": 2.6768808364868164 }, { "auxiliary_loss_clip": 0.01232054, "auxiliary_loss_mlp": 0.01024328, "balance_loss_clip": 1.04954875, "balance_loss_mlp": 1.01713407, "epoch": 0.5872662778813202, "flos": 22164174063360.0, "grad_norm": 1.7917421240308316, "language_loss": 0.77925515, "learning_rate": 1.5361017456113935e-06, "loss": 0.80181897, "num_input_tokens_seen": 105341610, "step": 4884, "time_per_iteration": 2.6603455543518066 }, { "auxiliary_loss_clip": 0.01231894, "auxiliary_loss_mlp": 0.01028189, "balance_loss_clip": 1.04840565, "balance_loss_mlp": 1.02090216, "epoch": 0.5873865207719594, "flos": 18441925430400.0, "grad_norm": 2.1950775166792242, "language_loss": 0.85587752, "learning_rate": 1.5353440533484085e-06, "loss": 0.87847829, "num_input_tokens_seen": 105360465, "step": 4885, "time_per_iteration": 2.6024210453033447 }, { "auxiliary_loss_clip": 0.01282539, "auxiliary_loss_mlp": 0.01031288, "balance_loss_clip": 1.04921067, "balance_loss_mlp": 1.02365279, "epoch": 0.5875067636625985, "flos": 54015321427200.0, "grad_norm": 2.0214986564758655, "language_loss": 0.66419852, "learning_rate": 1.534586431573361e-06, "loss": 0.6873368, "num_input_tokens_seen": 105385405, "step": 4886, "time_per_iteration": 2.9422740936279297 }, { "auxiliary_loss_clip": 0.01426463, "auxiliary_loss_mlp": 0.01027698, "balance_loss_clip": 1.04163194, "balance_loss_mlp": 1.01939523, "epoch": 0.5876270065532375, "flos": 27995707100160.0, "grad_norm": 2.2577997180893403, "language_loss": 0.79083103, "learning_rate": 1.5338288804011817e-06, "loss": 0.81537265, "num_input_tokens_seen": 105404905, "step": 4887, "time_per_iteration": 2.800546884536743 }, { "auxiliary_loss_clip": 0.01279589, "auxiliary_loss_mlp": 0.01028535, "balance_loss_clip": 1.04636335, "balance_loss_mlp": 1.02118015, "epoch": 0.5877472494438767, "flos": 21361462876800.0, "grad_norm": 2.2275535363432515, "language_loss": 0.71010906, "learning_rate": 1.533071399946791e-06, "loss": 0.7331903, "num_input_tokens_seen": 105423650, "step": 4888, "time_per_iteration": 2.6662819385528564 }, { "auxiliary_loss_clip": 0.01180035, "auxiliary_loss_mlp": 0.01025186, "balance_loss_clip": 1.04598379, "balance_loss_mlp": 1.01808667, "epoch": 0.5878674923345157, "flos": 22383013674240.0, "grad_norm": 1.8765733569299288, "language_loss": 0.57339597, "learning_rate": 1.5323139903250977e-06, "loss": 0.5954482, "num_input_tokens_seen": 105444255, "step": 4889, "time_per_iteration": 2.6916184425354004 }, { "auxiliary_loss_clip": 0.01283737, "auxiliary_loss_mlp": 0.01030906, "balance_loss_clip": 1.05308914, "balance_loss_mlp": 1.02345252, "epoch": 0.5879877352251548, "flos": 21868664872320.0, "grad_norm": 1.505714500069263, "language_loss": 0.76921403, "learning_rate": 1.5315566516510002e-06, "loss": 0.79236042, "num_input_tokens_seen": 105462425, "step": 4890, "time_per_iteration": 2.693025827407837 }, { "auxiliary_loss_clip": 0.01182664, "auxiliary_loss_mlp": 0.01028408, "balance_loss_clip": 1.05311668, "balance_loss_mlp": 1.02071846, "epoch": 0.5881079781157939, "flos": 17493811989120.0, "grad_norm": 2.1091547560871056, "language_loss": 0.67522407, "learning_rate": 1.5307993840393857e-06, "loss": 0.69733477, "num_input_tokens_seen": 105480505, "step": 4891, "time_per_iteration": 2.5640146732330322 }, { "auxiliary_loss_clip": 0.01177848, "auxiliary_loss_mlp": 0.01026347, "balance_loss_clip": 1.04925764, "balance_loss_mlp": 1.01960874, "epoch": 0.588228221006433, "flos": 22601853285120.0, "grad_norm": 3.2765494452427752, "language_loss": 0.80537146, "learning_rate": 1.530042187605132e-06, "loss": 0.82741344, "num_input_tokens_seen": 105499760, "step": 4892, "time_per_iteration": 2.660757541656494 }, { "auxiliary_loss_clip": 0.01232932, "auxiliary_loss_mlp": 0.02559297, "balance_loss_clip": 1.05228662, "balance_loss_mlp": 1.00002623, "epoch": 0.5883484638970721, "flos": 26176939896960.0, "grad_norm": 1.4238618828705611, "language_loss": 0.8424834, "learning_rate": 1.5292850624631044e-06, "loss": 0.88040566, "num_input_tokens_seen": 105521955, "step": 4893, "time_per_iteration": 2.709372043609619 }, { "auxiliary_loss_clip": 0.01225392, "auxiliary_loss_mlp": 0.0102701, "balance_loss_clip": 1.04937983, "balance_loss_mlp": 1.01923776, "epoch": 0.5884687067877111, "flos": 30443737691520.0, "grad_norm": 1.9065226455021858, "language_loss": 0.80295539, "learning_rate": 1.5285280087281593e-06, "loss": 0.82547939, "num_input_tokens_seen": 105542685, "step": 4894, "time_per_iteration": 2.7115821838378906 }, { "auxiliary_loss_clip": 0.01174719, "auxiliary_loss_mlp": 0.01001952, "balance_loss_clip": 1.01021886, "balance_loss_mlp": 1.00104654, "epoch": 0.5885889496783503, "flos": 70507550580480.0, "grad_norm": 0.6382227889364126, "language_loss": 0.56590366, "learning_rate": 1.5277710265151398e-06, "loss": 0.58767033, "num_input_tokens_seen": 105612165, "step": 4895, "time_per_iteration": 4.418470859527588 }, { "auxiliary_loss_clip": 0.01231861, "auxiliary_loss_mlp": 0.01032544, "balance_loss_clip": 1.05003333, "balance_loss_mlp": 1.02432489, "epoch": 0.5887091925689893, "flos": 19098767485440.0, "grad_norm": 3.6914189502320265, "language_loss": 0.76945555, "learning_rate": 1.5270141159388803e-06, "loss": 0.7920996, "num_input_tokens_seen": 105629185, "step": 4896, "time_per_iteration": 3.5653738975524902 }, { "auxiliary_loss_clip": 0.0117933, "auxiliary_loss_mlp": 0.01030189, "balance_loss_clip": 1.04902351, "balance_loss_mlp": 1.02277994, "epoch": 0.5888294354596284, "flos": 23294282739840.0, "grad_norm": 1.6588395038730217, "language_loss": 0.80567241, "learning_rate": 1.526257277114203e-06, "loss": 0.82776761, "num_input_tokens_seen": 105650260, "step": 4897, "time_per_iteration": 3.5308895111083984 }, { "auxiliary_loss_clip": 0.01276996, "auxiliary_loss_mlp": 0.01023888, "balance_loss_clip": 1.04985988, "balance_loss_mlp": 1.01698852, "epoch": 0.5889496783502676, "flos": 21981532383360.0, "grad_norm": 2.185215540655273, "language_loss": 0.79906261, "learning_rate": 1.5255005101559201e-06, "loss": 0.82207143, "num_input_tokens_seen": 105667870, "step": 4898, "time_per_iteration": 2.7029061317443848 }, { "auxiliary_loss_clip": 0.01134194, "auxiliary_loss_mlp": 0.01029969, "balance_loss_clip": 1.05113661, "balance_loss_mlp": 1.02272153, "epoch": 0.5890699212409066, "flos": 21685233093120.0, "grad_norm": 1.9460835167240103, "language_loss": 0.76792991, "learning_rate": 1.524743815178833e-06, "loss": 0.78957152, "num_input_tokens_seen": 105685830, "step": 4899, "time_per_iteration": 2.607790946960449 }, { "auxiliary_loss_clip": 0.01280209, "auxiliary_loss_mlp": 0.01021578, "balance_loss_clip": 1.04641616, "balance_loss_mlp": 1.01505768, "epoch": 0.5891901641315457, "flos": 19464553635840.0, "grad_norm": 1.6834040087677489, "language_loss": 0.80887157, "learning_rate": 1.5239871922977315e-06, "loss": 0.83188939, "num_input_tokens_seen": 105705745, "step": 4900, "time_per_iteration": 2.8031342029571533 }, { "auxiliary_loss_clip": 0.01277642, "auxiliary_loss_mlp": 0.01026296, "balance_loss_clip": 1.04499936, "balance_loss_mlp": 1.01932204, "epoch": 0.5893104070221848, "flos": 19609884063360.0, "grad_norm": 2.0273388057386414, "language_loss": 0.8984099, "learning_rate": 1.523230641627394e-06, "loss": 0.9214493, "num_input_tokens_seen": 105724730, "step": 4901, "time_per_iteration": 2.6556529998779297 }, { "auxiliary_loss_clip": 0.01379361, "auxiliary_loss_mlp": 0.01029659, "balance_loss_clip": 1.04088378, "balance_loss_mlp": 1.02274787, "epoch": 0.5894306499128239, "flos": 29060063930880.0, "grad_norm": 2.5961645722358235, "language_loss": 0.72723007, "learning_rate": 1.5224741632825888e-06, "loss": 0.7513203, "num_input_tokens_seen": 105744920, "step": 4902, "time_per_iteration": 2.961007595062256 }, { "auxiliary_loss_clip": 0.01184268, "auxiliary_loss_mlp": 0.01034613, "balance_loss_clip": 1.05353367, "balance_loss_mlp": 1.02636361, "epoch": 0.589550892803463, "flos": 42298890721920.0, "grad_norm": 1.9051272877245542, "language_loss": 0.69417393, "learning_rate": 1.521717757378074e-06, "loss": 0.71636283, "num_input_tokens_seen": 105765465, "step": 4903, "time_per_iteration": 3.630335807800293 }, { "auxiliary_loss_clip": 0.01234591, "auxiliary_loss_mlp": 0.01028472, "balance_loss_clip": 1.05143094, "balance_loss_mlp": 1.02076554, "epoch": 0.5896711356941021, "flos": 14137062197760.0, "grad_norm": 1.7770570064866504, "language_loss": 0.69033635, "learning_rate": 1.5209614240285943e-06, "loss": 0.71296692, "num_input_tokens_seen": 105783120, "step": 4904, "time_per_iteration": 2.5994129180908203 }, { "auxiliary_loss_clip": 0.0117948, "auxiliary_loss_mlp": 0.02566016, "balance_loss_clip": 1.05041111, "balance_loss_mlp": 1.000054, "epoch": 0.5897913785847412, "flos": 17201355454080.0, "grad_norm": 2.422222914744051, "language_loss": 0.84641343, "learning_rate": 1.520205163348887e-06, "loss": 0.8838684, "num_input_tokens_seen": 105801055, "step": 4905, "time_per_iteration": 2.6055867671966553 }, { "auxiliary_loss_clip": 0.01233068, "auxiliary_loss_mlp": 0.0099815, "balance_loss_clip": 1.01135731, "balance_loss_mlp": 0.99717247, "epoch": 0.5899116214753802, "flos": 48794164202880.0, "grad_norm": 0.7243282852635303, "language_loss": 0.56908643, "learning_rate": 1.519448975453674e-06, "loss": 0.59139872, "num_input_tokens_seen": 105856155, "step": 4906, "time_per_iteration": 3.127502202987671 }, { "auxiliary_loss_clip": 0.01230268, "auxiliary_loss_mlp": 0.02566224, "balance_loss_clip": 1.05317533, "balance_loss_mlp": 1.00002551, "epoch": 0.5900318643660194, "flos": 21103659987840.0, "grad_norm": 1.9462892785717094, "language_loss": 0.76484525, "learning_rate": 1.5186928604576696e-06, "loss": 0.80281013, "num_input_tokens_seen": 105873350, "step": 4907, "time_per_iteration": 2.61940336227417 }, { "auxiliary_loss_clip": 0.01280488, "auxiliary_loss_mlp": 0.01031598, "balance_loss_clip": 1.04807997, "balance_loss_mlp": 1.02454078, "epoch": 0.5901521072566585, "flos": 21178390233600.0, "grad_norm": 2.283521560852937, "language_loss": 0.77184325, "learning_rate": 1.5179368184755752e-06, "loss": 0.79496413, "num_input_tokens_seen": 105891435, "step": 4908, "time_per_iteration": 2.6896493434906006 }, { "auxiliary_loss_clip": 0.01279083, "auxiliary_loss_mlp": 0.01029412, "balance_loss_clip": 1.05018151, "balance_loss_mlp": 1.02263153, "epoch": 0.5902723501472975, "flos": 20225967160320.0, "grad_norm": 1.7979602650968298, "language_loss": 0.82472658, "learning_rate": 1.5171808496220821e-06, "loss": 0.84781158, "num_input_tokens_seen": 105910190, "step": 4909, "time_per_iteration": 2.6496469974517822 }, { "auxiliary_loss_clip": 0.01286847, "auxiliary_loss_mlp": 0.01024161, "balance_loss_clip": 1.05041599, "balance_loss_mlp": 1.01701403, "epoch": 0.5903925930379367, "flos": 22964407211520.0, "grad_norm": 1.7614502752277854, "language_loss": 0.8129425, "learning_rate": 1.5164249540118708e-06, "loss": 0.83605254, "num_input_tokens_seen": 105929315, "step": 4910, "time_per_iteration": 2.7177062034606934 }, { "auxiliary_loss_clip": 0.01430621, "auxiliary_loss_mlp": 0.01027805, "balance_loss_clip": 1.04378343, "balance_loss_mlp": 1.02104568, "epoch": 0.5905128359285757, "flos": 23367720096000.0, "grad_norm": 1.5648577063070823, "language_loss": 0.83342493, "learning_rate": 1.5156691317596093e-06, "loss": 0.85800922, "num_input_tokens_seen": 105950740, "step": 4911, "time_per_iteration": 2.8109965324401855 }, { "auxiliary_loss_clip": 0.01236536, "auxiliary_loss_mlp": 0.02562734, "balance_loss_clip": 1.05217075, "balance_loss_mlp": 1.0000453, "epoch": 0.5906330788192148, "flos": 28032335994240.0, "grad_norm": 2.896457067939656, "language_loss": 0.6688832, "learning_rate": 1.5149133829799556e-06, "loss": 0.70687592, "num_input_tokens_seen": 105968735, "step": 4912, "time_per_iteration": 2.750356912612915 }, { "auxiliary_loss_clip": 0.01189123, "auxiliary_loss_mlp": 0.01029235, "balance_loss_clip": 1.04974449, "balance_loss_mlp": 1.02230024, "epoch": 0.590753321709854, "flos": 18477943793280.0, "grad_norm": 1.9356435555514857, "language_loss": 0.81350088, "learning_rate": 1.5141577077875556e-06, "loss": 0.83568448, "num_input_tokens_seen": 105986060, "step": 4913, "time_per_iteration": 2.650779962539673 }, { "auxiliary_loss_clip": 0.01233878, "auxiliary_loss_mlp": 0.01024777, "balance_loss_clip": 1.0508728, "balance_loss_mlp": 1.01699555, "epoch": 0.590873564600493, "flos": 16873706568960.0, "grad_norm": 2.1738160727642573, "language_loss": 0.7233454, "learning_rate": 1.5134021062970451e-06, "loss": 0.74593198, "num_input_tokens_seen": 106004440, "step": 4914, "time_per_iteration": 2.6439640522003174 }, { "auxiliary_loss_clip": 0.01326422, "auxiliary_loss_mlp": 0.01028952, "balance_loss_clip": 1.04918301, "balance_loss_mlp": 1.02182615, "epoch": 0.5909938074911321, "flos": 13516166678400.0, "grad_norm": 2.0589331540156492, "language_loss": 0.81199527, "learning_rate": 1.5126465786230483e-06, "loss": 0.83554906, "num_input_tokens_seen": 106021215, "step": 4915, "time_per_iteration": 2.6942994594573975 }, { "auxiliary_loss_clip": 0.01179533, "auxiliary_loss_mlp": 0.01022063, "balance_loss_clip": 1.0504967, "balance_loss_mlp": 1.01452327, "epoch": 0.5911140503817712, "flos": 26024067613440.0, "grad_norm": 2.481435285146199, "language_loss": 0.81960499, "learning_rate": 1.5118911248801787e-06, "loss": 0.84162098, "num_input_tokens_seen": 106039225, "step": 4916, "time_per_iteration": 2.6322696208953857 }, { "auxiliary_loss_clip": 0.01222207, "auxiliary_loss_mlp": 0.01029187, "balance_loss_clip": 1.04722643, "balance_loss_mlp": 1.02186716, "epoch": 0.5912342932724103, "flos": 23258731253760.0, "grad_norm": 2.5739457616673587, "language_loss": 0.79651517, "learning_rate": 1.5111357451830364e-06, "loss": 0.81902909, "num_input_tokens_seen": 106057920, "step": 4917, "time_per_iteration": 2.593173027038574 }, { "auxiliary_loss_clip": 0.01229775, "auxiliary_loss_mlp": 0.01023019, "balance_loss_clip": 1.04991901, "balance_loss_mlp": 1.01624191, "epoch": 0.5913545361630493, "flos": 19573039687680.0, "grad_norm": 3.5723430638978972, "language_loss": 0.71113139, "learning_rate": 1.5103804396462131e-06, "loss": 0.73365933, "num_input_tokens_seen": 106077855, "step": 4918, "time_per_iteration": 2.6730809211730957 }, { "auxiliary_loss_clip": 0.01234179, "auxiliary_loss_mlp": 0.0102702, "balance_loss_clip": 1.04944599, "balance_loss_mlp": 1.01907432, "epoch": 0.5914747790536885, "flos": 26213532877440.0, "grad_norm": 2.1222232227162876, "language_loss": 0.80418116, "learning_rate": 1.5096252083842877e-06, "loss": 0.82679313, "num_input_tokens_seen": 106097065, "step": 4919, "time_per_iteration": 2.6532180309295654 }, { "auxiliary_loss_clip": 0.01227002, "auxiliary_loss_mlp": 0.01027542, "balance_loss_clip": 1.04527259, "balance_loss_mlp": 1.02004385, "epoch": 0.5915950219443276, "flos": 27417545786880.0, "grad_norm": 1.7839762179434189, "language_loss": 0.85452539, "learning_rate": 1.5088700515118285e-06, "loss": 0.87707078, "num_input_tokens_seen": 106116385, "step": 4920, "time_per_iteration": 2.6998281478881836 }, { "auxiliary_loss_clip": 0.0133033, "auxiliary_loss_mlp": 0.01026579, "balance_loss_clip": 1.04916203, "balance_loss_mlp": 1.01966751, "epoch": 0.5917152648349666, "flos": 21907879545600.0, "grad_norm": 1.6475585561793864, "language_loss": 0.66462827, "learning_rate": 1.508114969143392e-06, "loss": 0.68819737, "num_input_tokens_seen": 106136370, "step": 4921, "time_per_iteration": 3.693866014480591 }, { "auxiliary_loss_clip": 0.01280198, "auxiliary_loss_mlp": 0.01024085, "balance_loss_clip": 1.04542899, "balance_loss_mlp": 1.01710796, "epoch": 0.5918355077256057, "flos": 28109185142400.0, "grad_norm": 1.3789306119883358, "language_loss": 0.77831978, "learning_rate": 1.5073599613935238e-06, "loss": 0.80136263, "num_input_tokens_seen": 106158490, "step": 4922, "time_per_iteration": 3.6270229816436768 }, { "auxiliary_loss_clip": 0.01280638, "auxiliary_loss_mlp": 0.01022469, "balance_loss_clip": 1.04827595, "balance_loss_mlp": 1.01512909, "epoch": 0.5919557506162448, "flos": 28183807647360.0, "grad_norm": 2.216935361769543, "language_loss": 0.57385898, "learning_rate": 1.5066050283767574e-06, "loss": 0.59689003, "num_input_tokens_seen": 106179170, "step": 4923, "time_per_iteration": 2.720912456512451 }, { "auxiliary_loss_clip": 0.0117888, "auxiliary_loss_mlp": 0.01022965, "balance_loss_clip": 1.04827142, "balance_loss_mlp": 1.01576138, "epoch": 0.5920759935068839, "flos": 12094355652480.0, "grad_norm": 1.9925239205227272, "language_loss": 0.83073866, "learning_rate": 1.505850170207616e-06, "loss": 0.85275716, "num_input_tokens_seen": 106196035, "step": 4924, "time_per_iteration": 3.55501127243042 }, { "auxiliary_loss_clip": 0.01275836, "auxiliary_loss_mlp": 0.0102524, "balance_loss_clip": 1.04557347, "balance_loss_mlp": 1.01783442, "epoch": 0.592196236397523, "flos": 29424772673280.0, "grad_norm": 2.1492739769620712, "language_loss": 0.78149772, "learning_rate": 1.505095387000611e-06, "loss": 0.80450857, "num_input_tokens_seen": 106218335, "step": 4925, "time_per_iteration": 2.7500159740448 }, { "auxiliary_loss_clip": 0.01273896, "auxiliary_loss_mlp": 0.01028995, "balance_loss_clip": 1.04765463, "balance_loss_mlp": 1.02217662, "epoch": 0.5923164792881621, "flos": 24384709866240.0, "grad_norm": 2.1627234024104367, "language_loss": 0.74355823, "learning_rate": 1.504340678870242e-06, "loss": 0.76658714, "num_input_tokens_seen": 106236550, "step": 4926, "time_per_iteration": 2.7132694721221924 }, { "auxiliary_loss_clip": 0.01228207, "auxiliary_loss_mlp": 0.01028087, "balance_loss_clip": 1.04966593, "balance_loss_mlp": 1.02050519, "epoch": 0.5924367221788012, "flos": 24024238928640.0, "grad_norm": 2.0879309541886797, "language_loss": 0.89954334, "learning_rate": 1.5035860459309989e-06, "loss": 0.92210627, "num_input_tokens_seen": 106254265, "step": 4927, "time_per_iteration": 2.686352014541626 }, { "auxiliary_loss_clip": 0.01277956, "auxiliary_loss_mlp": 0.01024063, "balance_loss_clip": 1.04862285, "balance_loss_mlp": 1.01664853, "epoch": 0.5925569650694402, "flos": 26870590414080.0, "grad_norm": 1.8361767061074357, "language_loss": 0.63782811, "learning_rate": 1.5028314882973568e-06, "loss": 0.66084832, "num_input_tokens_seen": 106274670, "step": 4928, "time_per_iteration": 2.709462881088257 }, { "auxiliary_loss_clip": 0.0128094, "auxiliary_loss_mlp": 0.01027338, "balance_loss_clip": 1.0498302, "balance_loss_mlp": 1.02029312, "epoch": 0.5926772079600794, "flos": 22302788647680.0, "grad_norm": 1.8701327108025374, "language_loss": 0.84340405, "learning_rate": 1.502077006083783e-06, "loss": 0.86648685, "num_input_tokens_seen": 106293330, "step": 4929, "time_per_iteration": 3.572216749191284 }, { "auxiliary_loss_clip": 0.01134519, "auxiliary_loss_mlp": 0.0256223, "balance_loss_clip": 1.05126119, "balance_loss_mlp": 1.00005686, "epoch": 0.5927974508507184, "flos": 19865244827520.0, "grad_norm": 2.116306467028422, "language_loss": 0.7642563, "learning_rate": 1.5013225994047315e-06, "loss": 0.80122375, "num_input_tokens_seen": 106310960, "step": 4930, "time_per_iteration": 2.629594564437866 }, { "auxiliary_loss_clip": 0.01232615, "auxiliary_loss_mlp": 0.02564226, "balance_loss_clip": 1.05161572, "balance_loss_mlp": 1.00005913, "epoch": 0.5929176937413575, "flos": 15776743167360.0, "grad_norm": 1.8486755150075518, "language_loss": 0.80764556, "learning_rate": 1.5005682683746452e-06, "loss": 0.84561396, "num_input_tokens_seen": 106329475, "step": 4931, "time_per_iteration": 2.6687803268432617 }, { "auxiliary_loss_clip": 0.01232639, "auxiliary_loss_mlp": 0.01027521, "balance_loss_clip": 1.05399752, "balance_loss_mlp": 1.02032053, "epoch": 0.5930379366319967, "flos": 17601472028160.0, "grad_norm": 2.4326342815862176, "language_loss": 0.72541404, "learning_rate": 1.4998140131079553e-06, "loss": 0.74801564, "num_input_tokens_seen": 106345565, "step": 4932, "time_per_iteration": 2.6272292137145996 }, { "auxiliary_loss_clip": 0.01422108, "auxiliary_loss_mlp": 0.02561526, "balance_loss_clip": 1.04321837, "balance_loss_mlp": 1.00011218, "epoch": 0.5931581795226357, "flos": 17704283731200.0, "grad_norm": 3.7415046094473148, "language_loss": 0.73167503, "learning_rate": 1.4990598337190821e-06, "loss": 0.77151144, "num_input_tokens_seen": 106361920, "step": 4933, "time_per_iteration": 2.8123438358306885 }, { "auxiliary_loss_clip": 0.01181622, "auxiliary_loss_mlp": 0.02564264, "balance_loss_clip": 1.05157948, "balance_loss_mlp": 1.00007033, "epoch": 0.5932784224132748, "flos": 24280102483200.0, "grad_norm": 1.698412608235749, "language_loss": 0.68097746, "learning_rate": 1.4983057303224338e-06, "loss": 0.71843624, "num_input_tokens_seen": 106381735, "step": 4934, "time_per_iteration": 2.6316516399383545 }, { "auxiliary_loss_clip": 0.01376666, "auxiliary_loss_mlp": 0.01035017, "balance_loss_clip": 1.04480839, "balance_loss_mlp": 1.02782857, "epoch": 0.5933986653039139, "flos": 22926700909440.0, "grad_norm": 2.1688876545382545, "language_loss": 0.87645298, "learning_rate": 1.4975517030324072e-06, "loss": 0.90056986, "num_input_tokens_seen": 106399745, "step": 4935, "time_per_iteration": 2.796238660812378 }, { "auxiliary_loss_clip": 0.01068339, "auxiliary_loss_mlp": 0.02507365, "balance_loss_clip": 1.01116765, "balance_loss_mlp": 1.0001229, "epoch": 0.593518908194553, "flos": 71121730256640.0, "grad_norm": 0.7812739531134544, "language_loss": 0.6176458, "learning_rate": 1.4967977519633882e-06, "loss": 0.65340275, "num_input_tokens_seen": 106457205, "step": 4936, "time_per_iteration": 3.28973388671875 }, { "auxiliary_loss_clip": 0.01321913, "auxiliary_loss_mlp": 0.01024754, "balance_loss_clip": 1.0449307, "balance_loss_mlp": 1.01725578, "epoch": 0.593639151085192, "flos": 20448649526400.0, "grad_norm": 2.2282156462265257, "language_loss": 0.78465092, "learning_rate": 1.4960438772297494e-06, "loss": 0.80811757, "num_input_tokens_seen": 106474250, "step": 4937, "time_per_iteration": 2.7128188610076904 }, { "auxiliary_loss_clip": 0.01280302, "auxiliary_loss_mlp": 0.01024261, "balance_loss_clip": 1.04538846, "balance_loss_mlp": 1.01692653, "epoch": 0.5937593939758312, "flos": 30883428074880.0, "grad_norm": 2.431686285218414, "language_loss": 0.73852468, "learning_rate": 1.495290078945855e-06, "loss": 0.76157033, "num_input_tokens_seen": 106494015, "step": 4938, "time_per_iteration": 2.7139196395874023 }, { "auxiliary_loss_clip": 0.01179208, "auxiliary_loss_mlp": 0.01027898, "balance_loss_clip": 1.05173886, "balance_loss_mlp": 1.02082944, "epoch": 0.5938796368664703, "flos": 36898069668480.0, "grad_norm": 2.295080198097253, "language_loss": 0.74331701, "learning_rate": 1.4945363572260529e-06, "loss": 0.76538807, "num_input_tokens_seen": 106515010, "step": 4939, "time_per_iteration": 2.6969687938690186 }, { "auxiliary_loss_clip": 0.01225013, "auxiliary_loss_mlp": 0.01028041, "balance_loss_clip": 1.04714251, "balance_loss_mlp": 1.02104318, "epoch": 0.5939998797571093, "flos": 23842926051840.0, "grad_norm": 2.314074953676868, "language_loss": 0.68264091, "learning_rate": 1.4937827121846845e-06, "loss": 0.70517147, "num_input_tokens_seen": 106535265, "step": 4940, "time_per_iteration": 2.669736862182617 }, { "auxiliary_loss_clip": 0.01328956, "auxiliary_loss_mlp": 0.01025839, "balance_loss_clip": 1.0499835, "balance_loss_mlp": 1.01930904, "epoch": 0.5941201226477485, "flos": 25191407462400.0, "grad_norm": 1.5387300989986363, "language_loss": 0.73549068, "learning_rate": 1.4930291439360755e-06, "loss": 0.75903863, "num_input_tokens_seen": 106557830, "step": 4941, "time_per_iteration": 2.697032928466797 }, { "auxiliary_loss_clip": 0.01229711, "auxiliary_loss_mlp": 0.01034052, "balance_loss_clip": 1.05002201, "balance_loss_mlp": 1.02716768, "epoch": 0.5942403655383875, "flos": 22418996123520.0, "grad_norm": 2.396466780630469, "language_loss": 0.79355788, "learning_rate": 1.4922756525945427e-06, "loss": 0.81619549, "num_input_tokens_seen": 106577140, "step": 4942, "time_per_iteration": 2.717913866043091 }, { "auxiliary_loss_clip": 0.01022486, "auxiliary_loss_mlp": 0.01006837, "balance_loss_clip": 1.01134241, "balance_loss_mlp": 1.0058955, "epoch": 0.5943606084290266, "flos": 67629310796160.0, "grad_norm": 0.7717893035362148, "language_loss": 0.59478658, "learning_rate": 1.4915222382743894e-06, "loss": 0.61507982, "num_input_tokens_seen": 106635975, "step": 4943, "time_per_iteration": 3.200469732284546 }, { "auxiliary_loss_clip": 0.01229933, "auxiliary_loss_mlp": 0.01024695, "balance_loss_clip": 1.05160725, "balance_loss_mlp": 1.01725674, "epoch": 0.5944808513196658, "flos": 18223157646720.0, "grad_norm": 2.1842079324228143, "language_loss": 0.72307658, "learning_rate": 1.4907689010899085e-06, "loss": 0.74562287, "num_input_tokens_seen": 106653555, "step": 4944, "time_per_iteration": 2.614574909210205 }, { "auxiliary_loss_clip": 0.01276664, "auxiliary_loss_mlp": 0.01029601, "balance_loss_clip": 1.04832339, "balance_loss_mlp": 1.02284801, "epoch": 0.5946010942103048, "flos": 24790824011520.0, "grad_norm": 1.969972762925356, "language_loss": 0.62712991, "learning_rate": 1.4900156411553804e-06, "loss": 0.65019262, "num_input_tokens_seen": 106673385, "step": 4945, "time_per_iteration": 2.711216926574707 }, { "auxiliary_loss_clip": 0.0127951, "auxiliary_loss_mlp": 0.01030182, "balance_loss_clip": 1.04968905, "balance_loss_mlp": 1.02244568, "epoch": 0.5947213371009439, "flos": 15231619388160.0, "grad_norm": 1.9708919071621118, "language_loss": 0.85604429, "learning_rate": 1.4892624585850739e-06, "loss": 0.87914127, "num_input_tokens_seen": 106691740, "step": 4946, "time_per_iteration": 3.6050212383270264 }, { "auxiliary_loss_clip": 0.01180981, "auxiliary_loss_mlp": 0.01031775, "balance_loss_clip": 1.05148673, "balance_loss_mlp": 1.02473593, "epoch": 0.594841579991583, "flos": 25848069949440.0, "grad_norm": 2.6010535308205065, "language_loss": 0.79724896, "learning_rate": 1.4885093534932465e-06, "loss": 0.81937647, "num_input_tokens_seen": 106709705, "step": 4947, "time_per_iteration": 2.6808226108551025 }, { "auxiliary_loss_clip": 0.01275019, "auxiliary_loss_mlp": 0.01027664, "balance_loss_clip": 1.04973412, "balance_loss_mlp": 1.02011502, "epoch": 0.5949618228822221, "flos": 23981109672960.0, "grad_norm": 2.1904105338403363, "language_loss": 0.7145583, "learning_rate": 1.4877563259941433e-06, "loss": 0.73758513, "num_input_tokens_seen": 106727560, "step": 4948, "time_per_iteration": 2.599902391433716 }, { "auxiliary_loss_clip": 0.01236247, "auxiliary_loss_mlp": 0.01024621, "balance_loss_clip": 1.05095506, "balance_loss_mlp": 1.01718187, "epoch": 0.5950820657728612, "flos": 40547491476480.0, "grad_norm": 1.987947884797166, "language_loss": 0.67675436, "learning_rate": 1.4870033762019988e-06, "loss": 0.69936305, "num_input_tokens_seen": 106747725, "step": 4949, "time_per_iteration": 4.535677433013916 }, { "auxiliary_loss_clip": 0.01280518, "auxiliary_loss_mlp": 0.01029172, "balance_loss_clip": 1.0479027, "balance_loss_mlp": 1.02206421, "epoch": 0.5952023086635003, "flos": 23184467884800.0, "grad_norm": 2.4506688107665737, "language_loss": 0.73587537, "learning_rate": 1.4862505042310334e-06, "loss": 0.75897229, "num_input_tokens_seen": 106767010, "step": 4950, "time_per_iteration": 2.679192304611206 }, { "auxiliary_loss_clip": 0.01271943, "auxiliary_loss_mlp": 0.01026736, "balance_loss_clip": 1.04805958, "balance_loss_mlp": 1.01958382, "epoch": 0.5953225515541394, "flos": 33653289548160.0, "grad_norm": 2.5377553200190257, "language_loss": 0.69745159, "learning_rate": 1.4854977101954587e-06, "loss": 0.72043836, "num_input_tokens_seen": 106789230, "step": 4951, "time_per_iteration": 2.7574377059936523 }, { "auxiliary_loss_clip": 0.01229963, "auxiliary_loss_mlp": 0.01024066, "balance_loss_clip": 1.04714119, "balance_loss_mlp": 1.01676154, "epoch": 0.5954427944447784, "flos": 24459619680000.0, "grad_norm": 1.968153578359552, "language_loss": 0.86315805, "learning_rate": 1.4847449942094716e-06, "loss": 0.88569832, "num_input_tokens_seen": 106808110, "step": 4952, "time_per_iteration": 2.681190252304077 }, { "auxiliary_loss_clip": 0.01272038, "auxiliary_loss_mlp": 0.01026136, "balance_loss_clip": 1.04723287, "balance_loss_mlp": 1.01868224, "epoch": 0.5955630373354175, "flos": 18551848026240.0, "grad_norm": 2.192264759373798, "language_loss": 0.85907388, "learning_rate": 1.4839923563872598e-06, "loss": 0.88205558, "num_input_tokens_seen": 106826650, "step": 4953, "time_per_iteration": 2.62654185295105 }, { "auxiliary_loss_clip": 0.01335185, "auxiliary_loss_mlp": 0.01028863, "balance_loss_clip": 1.05126452, "balance_loss_mlp": 1.02190685, "epoch": 0.5956832802260567, "flos": 19791699730560.0, "grad_norm": 1.763263001429737, "language_loss": 0.75947183, "learning_rate": 1.483239796842997e-06, "loss": 0.78311229, "num_input_tokens_seen": 106844680, "step": 4954, "time_per_iteration": 3.682220220565796 }, { "auxiliary_loss_clip": 0.01226676, "auxiliary_loss_mlp": 0.01026003, "balance_loss_clip": 1.04913998, "balance_loss_mlp": 1.01922297, "epoch": 0.5958035231166957, "flos": 19750868945280.0, "grad_norm": 1.841145731353735, "language_loss": 0.83943522, "learning_rate": 1.4824873156908462e-06, "loss": 0.86196202, "num_input_tokens_seen": 106862605, "step": 4955, "time_per_iteration": 2.71268892288208 }, { "auxiliary_loss_clip": 0.0123167, "auxiliary_loss_mlp": 0.02570765, "balance_loss_clip": 1.05208731, "balance_loss_mlp": 1.00001335, "epoch": 0.5959237660073348, "flos": 21652806090240.0, "grad_norm": 1.573787749032818, "language_loss": 0.75697714, "learning_rate": 1.4817349130449584e-06, "loss": 0.79500151, "num_input_tokens_seen": 106882325, "step": 4956, "time_per_iteration": 2.673875570297241 }, { "auxiliary_loss_clip": 0.01227735, "auxiliary_loss_mlp": 0.01023402, "balance_loss_clip": 1.05040646, "balance_loss_mlp": 1.01621628, "epoch": 0.5960440088979739, "flos": 21171207513600.0, "grad_norm": 1.8655585309736469, "language_loss": 0.83121669, "learning_rate": 1.4809825890194717e-06, "loss": 0.85372806, "num_input_tokens_seen": 106900995, "step": 4957, "time_per_iteration": 2.6471030712127686 }, { "auxiliary_loss_clip": 0.01276872, "auxiliary_loss_mlp": 0.01029128, "balance_loss_clip": 1.0446744, "balance_loss_mlp": 1.02205312, "epoch": 0.596164251788613, "flos": 14757526753920.0, "grad_norm": 1.8709159935305104, "language_loss": 0.77381819, "learning_rate": 1.4802303437285139e-06, "loss": 0.79687822, "num_input_tokens_seen": 106918265, "step": 4958, "time_per_iteration": 2.648054599761963 }, { "auxiliary_loss_clip": 0.01277616, "auxiliary_loss_mlp": 0.01026358, "balance_loss_clip": 1.04781818, "balance_loss_mlp": 1.01897252, "epoch": 0.596284494679252, "flos": 20485924865280.0, "grad_norm": 2.28495750844855, "language_loss": 0.80980414, "learning_rate": 1.4794781772861994e-06, "loss": 0.8328439, "num_input_tokens_seen": 106934760, "step": 4959, "time_per_iteration": 2.6609606742858887 }, { "auxiliary_loss_clip": 0.01278991, "auxiliary_loss_mlp": 0.02563107, "balance_loss_clip": 1.04830647, "balance_loss_mlp": 1.00001359, "epoch": 0.5964047375698912, "flos": 31212262108800.0, "grad_norm": 2.085054705174388, "language_loss": 0.67044973, "learning_rate": 1.4787260898066324e-06, "loss": 0.70887071, "num_input_tokens_seen": 106954760, "step": 4960, "time_per_iteration": 2.788867712020874 }, { "auxiliary_loss_clip": 0.01179637, "auxiliary_loss_mlp": 0.01024997, "balance_loss_clip": 1.05265188, "balance_loss_mlp": 1.0181334, "epoch": 0.5965249804605303, "flos": 27483620855040.0, "grad_norm": 2.4366707052510197, "language_loss": 0.85894567, "learning_rate": 1.4779740814039023e-06, "loss": 0.880992, "num_input_tokens_seen": 106974845, "step": 4961, "time_per_iteration": 2.6918458938598633 }, { "auxiliary_loss_clip": 0.01179255, "auxiliary_loss_mlp": 0.01025572, "balance_loss_clip": 1.05098319, "balance_loss_mlp": 1.01864624, "epoch": 0.5966452233511693, "flos": 30773936442240.0, "grad_norm": 3.53947034211198, "language_loss": 0.68515122, "learning_rate": 1.4772221521920894e-06, "loss": 0.70719957, "num_input_tokens_seen": 106994870, "step": 4962, "time_per_iteration": 2.6488308906555176 }, { "auxiliary_loss_clip": 0.01285044, "auxiliary_loss_mlp": 0.01029597, "balance_loss_clip": 1.05251014, "balance_loss_mlp": 1.0223285, "epoch": 0.5967654662418085, "flos": 25481170477440.0, "grad_norm": 2.422183017124287, "language_loss": 0.74271607, "learning_rate": 1.4764703022852598e-06, "loss": 0.76586246, "num_input_tokens_seen": 107015390, "step": 4963, "time_per_iteration": 2.695699453353882 }, { "auxiliary_loss_clip": 0.01415664, "auxiliary_loss_mlp": 0.01026249, "balance_loss_clip": 1.04093313, "balance_loss_mlp": 1.01929927, "epoch": 0.5968857091324475, "flos": 19099126621440.0, "grad_norm": 1.862581002981766, "language_loss": 0.77134001, "learning_rate": 1.4757185317974696e-06, "loss": 0.7957592, "num_input_tokens_seen": 107033775, "step": 4964, "time_per_iteration": 2.748850107192993 }, { "auxiliary_loss_clip": 0.01229159, "auxiliary_loss_mlp": 0.01027246, "balance_loss_clip": 1.04876649, "balance_loss_mlp": 1.01934862, "epoch": 0.5970059520230866, "flos": 23692711374720.0, "grad_norm": 2.4984421177537945, "language_loss": 0.70798975, "learning_rate": 1.474966840842761e-06, "loss": 0.73055375, "num_input_tokens_seen": 107053355, "step": 4965, "time_per_iteration": 2.6595513820648193 }, { "auxiliary_loss_clip": 0.01231776, "auxiliary_loss_mlp": 0.01024328, "balance_loss_clip": 1.04913902, "balance_loss_mlp": 1.01746452, "epoch": 0.5971261949137258, "flos": 23185545292800.0, "grad_norm": 1.8099492978715312, "language_loss": 0.87059081, "learning_rate": 1.4742152295351655e-06, "loss": 0.89315182, "num_input_tokens_seen": 107072510, "step": 4966, "time_per_iteration": 2.647264242172241 }, { "auxiliary_loss_clip": 0.0123178, "auxiliary_loss_mlp": 0.02568442, "balance_loss_clip": 1.05082095, "balance_loss_mlp": 1.00004745, "epoch": 0.5972464378043648, "flos": 20557710195840.0, "grad_norm": 2.5619880514731483, "language_loss": 0.64141893, "learning_rate": 1.4734636979887016e-06, "loss": 0.67942119, "num_input_tokens_seen": 107089970, "step": 4967, "time_per_iteration": 2.6911959648132324 }, { "auxiliary_loss_clip": 0.01332021, "auxiliary_loss_mlp": 0.01026602, "balance_loss_clip": 1.0457437, "balance_loss_mlp": 1.01883531, "epoch": 0.5973666806950039, "flos": 29387030457600.0, "grad_norm": 2.4279775343243575, "language_loss": 0.90534782, "learning_rate": 1.4727122463173755e-06, "loss": 0.9289341, "num_input_tokens_seen": 107108500, "step": 4968, "time_per_iteration": 2.7546751499176025 }, { "auxiliary_loss_clip": 0.01276981, "auxiliary_loss_mlp": 0.01027149, "balance_loss_clip": 1.05022955, "balance_loss_mlp": 1.02009773, "epoch": 0.597486923585643, "flos": 22273522041600.0, "grad_norm": 1.7783423812992145, "language_loss": 0.6413914, "learning_rate": 1.471960874635183e-06, "loss": 0.6644327, "num_input_tokens_seen": 107128060, "step": 4969, "time_per_iteration": 2.626188039779663 }, { "auxiliary_loss_clip": 0.01272129, "auxiliary_loss_mlp": 0.01029794, "balance_loss_clip": 1.04549348, "balance_loss_mlp": 1.02202201, "epoch": 0.5976071664762821, "flos": 13772461196160.0, "grad_norm": 2.316053087415415, "language_loss": 0.70836508, "learning_rate": 1.4712095830561055e-06, "loss": 0.7313844, "num_input_tokens_seen": 107146550, "step": 4970, "time_per_iteration": 2.651453733444214 }, { "auxiliary_loss_clip": 0.01279566, "auxiliary_loss_mlp": 0.01026394, "balance_loss_clip": 1.04671311, "balance_loss_mlp": 1.01960528, "epoch": 0.5977274093669211, "flos": 19098623831040.0, "grad_norm": 5.382848437052313, "language_loss": 0.8113845, "learning_rate": 1.4704583716941147e-06, "loss": 0.83444411, "num_input_tokens_seen": 107165415, "step": 4971, "time_per_iteration": 2.666487693786621 }, { "auxiliary_loss_clip": 0.01226824, "auxiliary_loss_mlp": 0.01029582, "balance_loss_clip": 1.05078816, "balance_loss_mlp": 1.02264738, "epoch": 0.5978476522575603, "flos": 20376002269440.0, "grad_norm": 2.0551511584108977, "language_loss": 0.72301042, "learning_rate": 1.4697072406631672e-06, "loss": 0.74557459, "num_input_tokens_seen": 107185320, "step": 4972, "time_per_iteration": 2.7093098163604736 }, { "auxiliary_loss_clip": 0.01381805, "auxiliary_loss_mlp": 0.01028587, "balance_loss_clip": 1.04878592, "balance_loss_mlp": 1.02161908, "epoch": 0.5979678951481994, "flos": 29023147728000.0, "grad_norm": 1.8861516180703375, "language_loss": 0.73140049, "learning_rate": 1.4689561900772097e-06, "loss": 0.75550437, "num_input_tokens_seen": 107205380, "step": 4973, "time_per_iteration": 3.739610195159912 }, { "auxiliary_loss_clip": 0.01274923, "auxiliary_loss_mlp": 0.01027048, "balance_loss_clip": 1.04625249, "balance_loss_mlp": 1.01986313, "epoch": 0.5980881380388384, "flos": 17967689141760.0, "grad_norm": 2.621648589853824, "language_loss": 0.72831249, "learning_rate": 1.4682052200501758e-06, "loss": 0.75133222, "num_input_tokens_seen": 107222585, "step": 4974, "time_per_iteration": 3.6090829372406006 }, { "auxiliary_loss_clip": 0.01180588, "auxiliary_loss_mlp": 0.01024625, "balance_loss_clip": 1.05198085, "balance_loss_mlp": 1.01771092, "epoch": 0.5982083809294776, "flos": 22962827013120.0, "grad_norm": 1.8607055152965746, "language_loss": 0.80234241, "learning_rate": 1.4674543306959876e-06, "loss": 0.82439452, "num_input_tokens_seen": 107242055, "step": 4975, "time_per_iteration": 3.5440964698791504 }, { "auxiliary_loss_clip": 0.01285404, "auxiliary_loss_mlp": 0.01034873, "balance_loss_clip": 1.04918861, "balance_loss_mlp": 1.02723789, "epoch": 0.5983286238201166, "flos": 20991941712000.0, "grad_norm": 2.554889394101337, "language_loss": 0.84562808, "learning_rate": 1.4667035221285535e-06, "loss": 0.8688308, "num_input_tokens_seen": 107259695, "step": 4976, "time_per_iteration": 2.6623101234436035 }, { "auxiliary_loss_clip": 0.01226586, "auxiliary_loss_mlp": 0.01021299, "balance_loss_clip": 1.05080211, "balance_loss_mlp": 1.01470125, "epoch": 0.5984488667107557, "flos": 28183448511360.0, "grad_norm": 2.5621422620279986, "language_loss": 0.74745607, "learning_rate": 1.4659527944617715e-06, "loss": 0.76993489, "num_input_tokens_seen": 107279640, "step": 4977, "time_per_iteration": 2.7912042140960693 }, { "auxiliary_loss_clip": 0.01422003, "auxiliary_loss_mlp": 0.01027565, "balance_loss_clip": 1.0410049, "balance_loss_mlp": 1.02078474, "epoch": 0.5985691096013949, "flos": 16471794314880.0, "grad_norm": 1.870878767918522, "language_loss": 0.76324403, "learning_rate": 1.465202147809526e-06, "loss": 0.78773975, "num_input_tokens_seen": 107298135, "step": 4978, "time_per_iteration": 2.7675669193267822 }, { "auxiliary_loss_clip": 0.01179147, "auxiliary_loss_mlp": 0.01023689, "balance_loss_clip": 1.05019879, "balance_loss_mlp": 1.01705217, "epoch": 0.5986893524920339, "flos": 26719046933760.0, "grad_norm": 2.061824626798423, "language_loss": 0.76322949, "learning_rate": 1.4644515822856888e-06, "loss": 0.78525788, "num_input_tokens_seen": 107316570, "step": 4979, "time_per_iteration": 2.670954465866089 }, { "auxiliary_loss_clip": 0.01226338, "auxiliary_loss_mlp": 0.01006214, "balance_loss_clip": 1.01022124, "balance_loss_mlp": 1.00528419, "epoch": 0.598809595382673, "flos": 61608061100160.0, "grad_norm": 0.7565022048817791, "language_loss": 0.5650444, "learning_rate": 1.4637010980041215e-06, "loss": 0.58736992, "num_input_tokens_seen": 107378680, "step": 4980, "time_per_iteration": 3.280036449432373 }, { "auxiliary_loss_clip": 0.01178606, "auxiliary_loss_mlp": 0.01029672, "balance_loss_clip": 1.04910588, "balance_loss_mlp": 1.02198017, "epoch": 0.5989298382733121, "flos": 11801719549440.0, "grad_norm": 2.170206332495691, "language_loss": 0.8988874, "learning_rate": 1.4629506950786707e-06, "loss": 0.9209702, "num_input_tokens_seen": 107394860, "step": 4981, "time_per_iteration": 3.511972188949585 }, { "auxiliary_loss_clip": 0.01067739, "auxiliary_loss_mlp": 0.01003679, "balance_loss_clip": 1.01057708, "balance_loss_mlp": 1.002702, "epoch": 0.5990500811639512, "flos": 60025800021120.0, "grad_norm": 0.8116728744637332, "language_loss": 0.56057322, "learning_rate": 1.4622003736231733e-06, "loss": 0.58128738, "num_input_tokens_seen": 107453850, "step": 4982, "time_per_iteration": 3.2264480590820312 }, { "auxiliary_loss_clip": 0.0123592, "auxiliary_loss_mlp": 0.01029301, "balance_loss_clip": 1.05350912, "balance_loss_mlp": 1.02164733, "epoch": 0.5991703240545903, "flos": 18222726683520.0, "grad_norm": 3.2796118829485557, "language_loss": 0.80513275, "learning_rate": 1.461450133751451e-06, "loss": 0.8277849, "num_input_tokens_seen": 107471920, "step": 4983, "time_per_iteration": 2.674395799636841 }, { "auxiliary_loss_clip": 0.0113435, "auxiliary_loss_mlp": 0.01024964, "balance_loss_clip": 1.05054021, "balance_loss_mlp": 1.01798987, "epoch": 0.5992905669452293, "flos": 27709894581120.0, "grad_norm": 1.778179844774408, "language_loss": 0.7619313, "learning_rate": 1.4606999755773153e-06, "loss": 0.78352439, "num_input_tokens_seen": 107493125, "step": 4984, "time_per_iteration": 2.6875221729278564 }, { "auxiliary_loss_clip": 0.01179337, "auxiliary_loss_mlp": 0.01027663, "balance_loss_clip": 1.05208433, "balance_loss_mlp": 1.02084744, "epoch": 0.5994108098358685, "flos": 20449008662400.0, "grad_norm": 1.6551710610030634, "language_loss": 0.82652843, "learning_rate": 1.4599498992145643e-06, "loss": 0.84859848, "num_input_tokens_seen": 107513150, "step": 4985, "time_per_iteration": 2.6555368900299072 }, { "auxiliary_loss_clip": 0.01284935, "auxiliary_loss_mlp": 0.02565454, "balance_loss_clip": 1.04961622, "balance_loss_mlp": 1.00005198, "epoch": 0.5995310527265075, "flos": 22269966595200.0, "grad_norm": 1.8889249690480252, "language_loss": 0.7107867, "learning_rate": 1.4591999047769846e-06, "loss": 0.74929059, "num_input_tokens_seen": 107532005, "step": 4986, "time_per_iteration": 2.6539177894592285 }, { "auxiliary_loss_clip": 0.01426845, "auxiliary_loss_mlp": 0.01034477, "balance_loss_clip": 1.04193783, "balance_loss_mlp": 1.02708638, "epoch": 0.5996512956171466, "flos": 18916951818240.0, "grad_norm": 5.608182133896914, "language_loss": 0.75120193, "learning_rate": 1.4584499923783486e-06, "loss": 0.77581513, "num_input_tokens_seen": 107550585, "step": 4987, "time_per_iteration": 2.7686898708343506 }, { "auxiliary_loss_clip": 0.01278591, "auxiliary_loss_mlp": 0.0102605, "balance_loss_clip": 1.04977345, "balance_loss_mlp": 1.0191896, "epoch": 0.5997715385077858, "flos": 15370916330880.0, "grad_norm": 2.630237564523838, "language_loss": 0.76292837, "learning_rate": 1.457700162132419e-06, "loss": 0.78597474, "num_input_tokens_seen": 107567575, "step": 4988, "time_per_iteration": 2.6642487049102783 }, { "auxiliary_loss_clip": 0.01374981, "auxiliary_loss_mlp": 0.01026522, "balance_loss_clip": 1.04688025, "balance_loss_mlp": 1.01962829, "epoch": 0.5998917813984248, "flos": 25264844818560.0, "grad_norm": 2.3493794544032003, "language_loss": 0.72507954, "learning_rate": 1.4569504141529433e-06, "loss": 0.74909461, "num_input_tokens_seen": 107585410, "step": 4989, "time_per_iteration": 2.7434444427490234 }, { "auxiliary_loss_clip": 0.01235842, "auxiliary_loss_mlp": 0.01028632, "balance_loss_clip": 1.05495656, "balance_loss_mlp": 1.02115726, "epoch": 0.6000120242890639, "flos": 22054502862720.0, "grad_norm": 2.655564464153192, "language_loss": 0.72349858, "learning_rate": 1.456200748553658e-06, "loss": 0.74614328, "num_input_tokens_seen": 107603405, "step": 4990, "time_per_iteration": 2.6351711750030518 }, { "auxiliary_loss_clip": 0.01182083, "auxiliary_loss_mlp": 0.01024129, "balance_loss_clip": 1.05373394, "balance_loss_mlp": 1.01677966, "epoch": 0.600132267179703, "flos": 29863421562240.0, "grad_norm": 1.6185906012753408, "language_loss": 0.78932512, "learning_rate": 1.455451165448287e-06, "loss": 0.81138724, "num_input_tokens_seen": 107626060, "step": 4991, "time_per_iteration": 2.704078197479248 }, { "auxiliary_loss_clip": 0.01277605, "auxiliary_loss_mlp": 0.01026438, "balance_loss_clip": 1.04905009, "balance_loss_mlp": 1.0190109, "epoch": 0.6002525100703421, "flos": 25045358762880.0, "grad_norm": 2.633404017167911, "language_loss": 0.73187929, "learning_rate": 1.4547016649505407e-06, "loss": 0.75491977, "num_input_tokens_seen": 107644070, "step": 4992, "time_per_iteration": 2.6411120891571045 }, { "auxiliary_loss_clip": 0.01328455, "auxiliary_loss_mlp": 0.01025853, "balance_loss_clip": 1.04413712, "balance_loss_mlp": 1.01879025, "epoch": 0.6003727529609811, "flos": 20849592113280.0, "grad_norm": 16.03565795513229, "language_loss": 0.84734726, "learning_rate": 1.4539522471741193e-06, "loss": 0.87089032, "num_input_tokens_seen": 107661495, "step": 4993, "time_per_iteration": 2.7058205604553223 }, { "auxiliary_loss_clip": 0.01234317, "auxiliary_loss_mlp": 0.01031659, "balance_loss_clip": 1.05124021, "balance_loss_mlp": 1.02398181, "epoch": 0.6004929958516203, "flos": 15594604277760.0, "grad_norm": 2.897092949821549, "language_loss": 0.70965207, "learning_rate": 1.4532029122327067e-06, "loss": 0.73231179, "num_input_tokens_seen": 107678280, "step": 4994, "time_per_iteration": 2.690169334411621 }, { "auxiliary_loss_clip": 0.01325091, "auxiliary_loss_mlp": 0.01024731, "balance_loss_clip": 1.05053258, "balance_loss_mlp": 1.01809049, "epoch": 0.6006132387422594, "flos": 21763267390080.0, "grad_norm": 2.2076842159742087, "language_loss": 0.75611639, "learning_rate": 1.4524536602399783e-06, "loss": 0.77961463, "num_input_tokens_seen": 107697370, "step": 4995, "time_per_iteration": 2.7360687255859375 }, { "auxiliary_loss_clip": 0.0127236, "auxiliary_loss_mlp": 0.01037277, "balance_loss_clip": 1.04996502, "balance_loss_mlp": 1.02959991, "epoch": 0.6007334816328984, "flos": 22858542852480.0, "grad_norm": 2.020546614684073, "language_loss": 0.77177954, "learning_rate": 1.4517044913095938e-06, "loss": 0.79487592, "num_input_tokens_seen": 107717790, "step": 4996, "time_per_iteration": 2.6986284255981445 }, { "auxiliary_loss_clip": 0.01228264, "auxiliary_loss_mlp": 0.0102379, "balance_loss_clip": 1.05115879, "balance_loss_mlp": 1.01677799, "epoch": 0.6008537245235376, "flos": 28324577047680.0, "grad_norm": 1.797895157939401, "language_loss": 0.81607342, "learning_rate": 1.4509554055552022e-06, "loss": 0.83859396, "num_input_tokens_seen": 107738020, "step": 4997, "time_per_iteration": 2.7828688621520996 }, { "auxiliary_loss_clip": 0.01277227, "auxiliary_loss_mlp": 0.01030338, "balance_loss_clip": 1.04780316, "balance_loss_mlp": 1.02290511, "epoch": 0.6009739674141766, "flos": 20886113266560.0, "grad_norm": 2.3437118836755393, "language_loss": 0.83947289, "learning_rate": 1.450206403090439e-06, "loss": 0.86254847, "num_input_tokens_seen": 107756215, "step": 4998, "time_per_iteration": 3.755180597305298 }, { "auxiliary_loss_clip": 0.01222979, "auxiliary_loss_mlp": 0.01026444, "balance_loss_clip": 1.04960907, "balance_loss_mlp": 1.02001858, "epoch": 0.6010942103048157, "flos": 20481004702080.0, "grad_norm": 2.0609127894036656, "language_loss": 0.86758608, "learning_rate": 1.4494574840289274e-06, "loss": 0.89008033, "num_input_tokens_seen": 107773330, "step": 4999, "time_per_iteration": 2.658665895462036 }, { "auxiliary_loss_clip": 0.0123595, "auxiliary_loss_mlp": 0.01028193, "balance_loss_clip": 1.05032873, "balance_loss_mlp": 1.02045012, "epoch": 0.6012144531954549, "flos": 23805973935360.0, "grad_norm": 2.428905565698822, "language_loss": 0.73938626, "learning_rate": 1.4487086484842782e-06, "loss": 0.76202768, "num_input_tokens_seen": 107791975, "step": 5000, "time_per_iteration": 3.602520227432251 }, { "auxiliary_loss_clip": 0.01179325, "auxiliary_loss_mlp": 0.01025892, "balance_loss_clip": 1.05158162, "balance_loss_mlp": 1.01897764, "epoch": 0.6013346960860939, "flos": 18988378012800.0, "grad_norm": 2.008251606523724, "language_loss": 0.60384429, "learning_rate": 1.4479598965700878e-06, "loss": 0.62589645, "num_input_tokens_seen": 107809240, "step": 5001, "time_per_iteration": 3.495940923690796 }, { "auxiliary_loss_clip": 0.01325189, "auxiliary_loss_mlp": 0.0102827, "balance_loss_clip": 1.04440928, "balance_loss_mlp": 1.02105188, "epoch": 0.601454938976733, "flos": 24025316336640.0, "grad_norm": 2.2058892719794896, "language_loss": 0.68816835, "learning_rate": 1.4472112283999427e-06, "loss": 0.71170294, "num_input_tokens_seen": 107827895, "step": 5002, "time_per_iteration": 2.711676597595215 }, { "auxiliary_loss_clip": 0.01230128, "auxiliary_loss_mlp": 0.01024781, "balance_loss_clip": 1.05254197, "balance_loss_mlp": 1.01760435, "epoch": 0.6015751818673721, "flos": 26427129102720.0, "grad_norm": 1.9882913523248578, "language_loss": 0.69265223, "learning_rate": 1.4464626440874143e-06, "loss": 0.71520138, "num_input_tokens_seen": 107847010, "step": 5003, "time_per_iteration": 2.6581599712371826 }, { "auxiliary_loss_clip": 0.01287127, "auxiliary_loss_mlp": 0.0102872, "balance_loss_clip": 1.04364741, "balance_loss_mlp": 1.02126944, "epoch": 0.6016954247580112, "flos": 13115260005120.0, "grad_norm": 2.4728459276616945, "language_loss": 0.74232656, "learning_rate": 1.4457141437460636e-06, "loss": 0.76548505, "num_input_tokens_seen": 107864235, "step": 5004, "time_per_iteration": 2.7309627532958984 }, { "auxiliary_loss_clip": 0.01278933, "auxiliary_loss_mlp": 0.01026381, "balance_loss_clip": 1.04918361, "balance_loss_mlp": 1.01852548, "epoch": 0.6018156676486502, "flos": 23768447201280.0, "grad_norm": 1.689996030921193, "language_loss": 0.73184574, "learning_rate": 1.444965727489436e-06, "loss": 0.75489891, "num_input_tokens_seen": 107883680, "step": 5005, "time_per_iteration": 2.715041399002075 }, { "auxiliary_loss_clip": 0.01328689, "auxiliary_loss_mlp": 0.01027083, "balance_loss_clip": 1.04526901, "balance_loss_mlp": 1.01970649, "epoch": 0.6019359105392894, "flos": 26469360518400.0, "grad_norm": 1.7003024254803598, "language_loss": 0.63348317, "learning_rate": 1.444217395431066e-06, "loss": 0.65704089, "num_input_tokens_seen": 107906220, "step": 5006, "time_per_iteration": 3.7348692417144775 }, { "auxiliary_loss_clip": 0.01228912, "auxiliary_loss_mlp": 0.01003527, "balance_loss_clip": 1.01576447, "balance_loss_mlp": 1.00258529, "epoch": 0.6020561534299285, "flos": 69190849728000.0, "grad_norm": 0.7972476205367459, "language_loss": 0.55762768, "learning_rate": 1.4434691476844755e-06, "loss": 0.57995212, "num_input_tokens_seen": 107967195, "step": 5007, "time_per_iteration": 3.3232083320617676 }, { "auxiliary_loss_clip": 0.01276316, "auxiliary_loss_mlp": 0.01032397, "balance_loss_clip": 1.05035603, "balance_loss_mlp": 1.02501237, "epoch": 0.6021763963205675, "flos": 21835304115840.0, "grad_norm": 2.218038815036032, "language_loss": 0.66917479, "learning_rate": 1.4427209843631729e-06, "loss": 0.69226193, "num_input_tokens_seen": 107984245, "step": 5008, "time_per_iteration": 2.8046412467956543 }, { "auxiliary_loss_clip": 0.01178219, "auxiliary_loss_mlp": 0.02566774, "balance_loss_clip": 1.0514133, "balance_loss_mlp": 1.00004959, "epoch": 0.6022966392112067, "flos": 26578636669440.0, "grad_norm": 1.904332361372777, "language_loss": 0.81346107, "learning_rate": 1.4419729055806534e-06, "loss": 0.85091102, "num_input_tokens_seen": 108003680, "step": 5009, "time_per_iteration": 2.660416841506958 }, { "auxiliary_loss_clip": 0.01272898, "auxiliary_loss_mlp": 0.02564853, "balance_loss_clip": 1.04989588, "balance_loss_mlp": 0.99999154, "epoch": 0.6024168821018457, "flos": 20703722981760.0, "grad_norm": 1.7338182783881266, "language_loss": 0.82429445, "learning_rate": 1.441224911450401e-06, "loss": 0.86267197, "num_input_tokens_seen": 108019635, "step": 5010, "time_per_iteration": 2.7176027297973633 }, { "auxiliary_loss_clip": 0.01232457, "auxiliary_loss_mlp": 0.01027394, "balance_loss_clip": 1.04979181, "balance_loss_mlp": 1.01986313, "epoch": 0.6025371249924848, "flos": 24680973242880.0, "grad_norm": 1.9081200557476563, "language_loss": 0.82389176, "learning_rate": 1.4404770020858851e-06, "loss": 0.84649026, "num_input_tokens_seen": 108039120, "step": 5011, "time_per_iteration": 2.7317960262298584 }, { "auxiliary_loss_clip": 0.01223152, "auxiliary_loss_mlp": 0.01023632, "balance_loss_clip": 1.04894447, "balance_loss_mlp": 1.01681352, "epoch": 0.602657367883124, "flos": 25955801815680.0, "grad_norm": 2.080285334622596, "language_loss": 0.86318272, "learning_rate": 1.439729177600563e-06, "loss": 0.88565052, "num_input_tokens_seen": 108059615, "step": 5012, "time_per_iteration": 2.796569585800171 }, { "auxiliary_loss_clip": 0.01228813, "auxiliary_loss_mlp": 0.01029264, "balance_loss_clip": 1.05380964, "balance_loss_mlp": 1.02217376, "epoch": 0.602777610773763, "flos": 16690633925760.0, "grad_norm": 2.2254281548957224, "language_loss": 0.73425752, "learning_rate": 1.4389814381078793e-06, "loss": 0.75683832, "num_input_tokens_seen": 108078855, "step": 5013, "time_per_iteration": 2.686742067337036 }, { "auxiliary_loss_clip": 0.01577815, "auxiliary_loss_mlp": 0.01027471, "balance_loss_clip": 1.04248786, "balance_loss_mlp": 1.02048874, "epoch": 0.6028978536644021, "flos": 13334243270400.0, "grad_norm": 2.1875956001995553, "language_loss": 0.80085951, "learning_rate": 1.438233783721265e-06, "loss": 0.8269124, "num_input_tokens_seen": 108095020, "step": 5014, "time_per_iteration": 3.048121452331543 }, { "auxiliary_loss_clip": 0.01278946, "auxiliary_loss_mlp": 0.0102523, "balance_loss_clip": 1.05517852, "balance_loss_mlp": 1.0184145, "epoch": 0.6030180965550412, "flos": 19644825018240.0, "grad_norm": 2.102401068104626, "language_loss": 0.78084451, "learning_rate": 1.43748621455414e-06, "loss": 0.80388629, "num_input_tokens_seen": 108111455, "step": 5015, "time_per_iteration": 3.2835686206817627 }, { "auxiliary_loss_clip": 0.01274135, "auxiliary_loss_mlp": 0.01028942, "balance_loss_clip": 1.04849255, "balance_loss_mlp": 1.02089548, "epoch": 0.6031383394456803, "flos": 14458390289280.0, "grad_norm": 2.5356648149773457, "language_loss": 0.80820858, "learning_rate": 1.4367387307199082e-06, "loss": 0.83123934, "num_input_tokens_seen": 108128305, "step": 5016, "time_per_iteration": 2.686060667037964 }, { "auxiliary_loss_clip": 0.01225617, "auxiliary_loss_mlp": 0.01025777, "balance_loss_clip": 1.047194, "balance_loss_mlp": 1.01810026, "epoch": 0.6032585823363193, "flos": 13917791623680.0, "grad_norm": 1.7355313547302338, "language_loss": 0.82548952, "learning_rate": 1.4359913323319632e-06, "loss": 0.84800351, "num_input_tokens_seen": 108145475, "step": 5017, "time_per_iteration": 2.7059874534606934 }, { "auxiliary_loss_clip": 0.01420945, "auxiliary_loss_mlp": 0.01031628, "balance_loss_clip": 1.0419662, "balance_loss_mlp": 1.02430582, "epoch": 0.6033788252269584, "flos": 24353252530560.0, "grad_norm": 2.010110050340027, "language_loss": 0.77951711, "learning_rate": 1.4352440195036847e-06, "loss": 0.80404282, "num_input_tokens_seen": 108165650, "step": 5018, "time_per_iteration": 2.886763095855713 }, { "auxiliary_loss_clip": 0.01380064, "auxiliary_loss_mlp": 0.01031275, "balance_loss_clip": 1.0405798, "balance_loss_mlp": 1.02423286, "epoch": 0.6034990681175976, "flos": 25521247077120.0, "grad_norm": 2.1098677521393507, "language_loss": 0.80119169, "learning_rate": 1.4344967923484395e-06, "loss": 0.8253051, "num_input_tokens_seen": 108187620, "step": 5019, "time_per_iteration": 2.916806221008301 }, { "auxiliary_loss_clip": 0.01226752, "auxiliary_loss_mlp": 0.01029331, "balance_loss_clip": 1.05001616, "balance_loss_mlp": 1.0218749, "epoch": 0.6036193110082366, "flos": 25958387594880.0, "grad_norm": 2.1711853162579815, "language_loss": 0.72255075, "learning_rate": 1.433749650979581e-06, "loss": 0.74511158, "num_input_tokens_seen": 108207605, "step": 5020, "time_per_iteration": 2.7875478267669678 }, { "auxiliary_loss_clip": 0.01233862, "auxiliary_loss_mlp": 0.01021005, "balance_loss_clip": 1.04860198, "balance_loss_mlp": 1.01411819, "epoch": 0.6037395538988757, "flos": 25593427457280.0, "grad_norm": 1.8945913533465975, "language_loss": 0.68201149, "learning_rate": 1.433002595510451e-06, "loss": 0.70456016, "num_input_tokens_seen": 108226385, "step": 5021, "time_per_iteration": 2.8953819274902344 }, { "auxiliary_loss_clip": 0.0127711, "auxiliary_loss_mlp": 0.0256761, "balance_loss_clip": 1.04688931, "balance_loss_mlp": 0.99998057, "epoch": 0.6038597967895148, "flos": 17816253402240.0, "grad_norm": 1.871680781186828, "language_loss": 0.71912295, "learning_rate": 1.4322556260543757e-06, "loss": 0.75757015, "num_input_tokens_seen": 108242960, "step": 5022, "time_per_iteration": 2.783935785293579 }, { "auxiliary_loss_clip": 0.01226116, "auxiliary_loss_mlp": 0.01002395, "balance_loss_clip": 1.00887036, "balance_loss_mlp": 1.00148857, "epoch": 0.6039800396801539, "flos": 65169213235200.0, "grad_norm": 0.8983325597508501, "language_loss": 0.62655783, "learning_rate": 1.4315087427246703e-06, "loss": 0.64884293, "num_input_tokens_seen": 108296785, "step": 5023, "time_per_iteration": 3.2141313552856445 }, { "auxiliary_loss_clip": 0.01065338, "auxiliary_loss_mlp": 0.01004061, "balance_loss_clip": 1.00782537, "balance_loss_mlp": 1.00310695, "epoch": 0.604100282570793, "flos": 67386409073280.0, "grad_norm": 0.8689754599828391, "language_loss": 0.58425117, "learning_rate": 1.4307619456346372e-06, "loss": 0.60494512, "num_input_tokens_seen": 108341090, "step": 5024, "time_per_iteration": 4.5787739753723145 }, { "auxiliary_loss_clip": 0.01232447, "auxiliary_loss_mlp": 0.01025663, "balance_loss_clip": 1.04818475, "balance_loss_mlp": 1.01825452, "epoch": 0.6042205254614321, "flos": 35297495631360.0, "grad_norm": 2.9019303056002057, "language_loss": 0.74097514, "learning_rate": 1.430015234897564e-06, "loss": 0.76355624, "num_input_tokens_seen": 108364370, "step": 5025, "time_per_iteration": 2.8195230960845947 }, { "auxiliary_loss_clip": 0.01179874, "auxiliary_loss_mlp": 0.02568029, "balance_loss_clip": 1.05033517, "balance_loss_mlp": 0.99997336, "epoch": 0.6043407683520712, "flos": 45658262206080.0, "grad_norm": 1.6076980803066245, "language_loss": 0.66399097, "learning_rate": 1.4292686106267274e-06, "loss": 0.70146996, "num_input_tokens_seen": 108387220, "step": 5026, "time_per_iteration": 3.7468481063842773 }, { "auxiliary_loss_clip": 0.01236322, "auxiliary_loss_mlp": 0.01032866, "balance_loss_clip": 1.05159044, "balance_loss_mlp": 1.02570176, "epoch": 0.6044610112427102, "flos": 16180020138240.0, "grad_norm": 1.7525948249560988, "language_loss": 0.7750802, "learning_rate": 1.4285220729353876e-06, "loss": 0.79777205, "num_input_tokens_seen": 108405760, "step": 5027, "time_per_iteration": 3.929567813873291 }, { "auxiliary_loss_clip": 0.01279253, "auxiliary_loss_mlp": 0.01030591, "balance_loss_clip": 1.04539084, "balance_loss_mlp": 1.02280688, "epoch": 0.6045812541333494, "flos": 13804062186240.0, "grad_norm": 3.966907182364601, "language_loss": 0.77950674, "learning_rate": 1.4277756219367957e-06, "loss": 0.80260515, "num_input_tokens_seen": 108422785, "step": 5028, "time_per_iteration": 2.7531237602233887 }, { "auxiliary_loss_clip": 0.01339315, "auxiliary_loss_mlp": 0.01028041, "balance_loss_clip": 1.05094194, "balance_loss_mlp": 1.01992321, "epoch": 0.6047014970239885, "flos": 19975059682560.0, "grad_norm": 3.0033499232898087, "language_loss": 0.7974084, "learning_rate": 1.4270292577441864e-06, "loss": 0.82108188, "num_input_tokens_seen": 108442290, "step": 5029, "time_per_iteration": 2.7097952365875244 }, { "auxiliary_loss_clip": 0.01233704, "auxiliary_loss_mlp": 0.01026336, "balance_loss_clip": 1.04827952, "balance_loss_mlp": 1.01881957, "epoch": 0.6048217399146275, "flos": 25337097025920.0, "grad_norm": 1.9545509554948204, "language_loss": 0.71638322, "learning_rate": 1.4262829804707836e-06, "loss": 0.73898357, "num_input_tokens_seen": 108464280, "step": 5030, "time_per_iteration": 2.7280526161193848 }, { "auxiliary_loss_clip": 0.01233521, "auxiliary_loss_mlp": 0.01034262, "balance_loss_clip": 1.04931927, "balance_loss_mlp": 1.02647817, "epoch": 0.6049419828052667, "flos": 26030819370240.0, "grad_norm": 1.4066600806637084, "language_loss": 0.70025283, "learning_rate": 1.4255367902297958e-06, "loss": 0.72293067, "num_input_tokens_seen": 108485610, "step": 5031, "time_per_iteration": 2.7799429893493652 }, { "auxiliary_loss_clip": 0.011801, "auxiliary_loss_mlp": 0.01029391, "balance_loss_clip": 1.05210042, "balance_loss_mlp": 1.0225575, "epoch": 0.6050622256959057, "flos": 14648106948480.0, "grad_norm": 2.6613396842999713, "language_loss": 0.78924572, "learning_rate": 1.4247906871344215e-06, "loss": 0.81134063, "num_input_tokens_seen": 108501005, "step": 5032, "time_per_iteration": 3.550607919692993 }, { "auxiliary_loss_clip": 0.01272555, "auxiliary_loss_mlp": 0.01025608, "balance_loss_clip": 1.04404557, "balance_loss_mlp": 1.0186516, "epoch": 0.6051824685865448, "flos": 23331450337920.0, "grad_norm": 2.5644566588622633, "language_loss": 0.74938524, "learning_rate": 1.4240446712978415e-06, "loss": 0.77236682, "num_input_tokens_seen": 108519990, "step": 5033, "time_per_iteration": 2.817582607269287 }, { "auxiliary_loss_clip": 0.01234036, "auxiliary_loss_mlp": 0.01029941, "balance_loss_clip": 1.04955792, "balance_loss_mlp": 1.02169716, "epoch": 0.605302711477184, "flos": 27563307177600.0, "grad_norm": 1.9070273822012955, "language_loss": 0.74593818, "learning_rate": 1.423298742833227e-06, "loss": 0.76857793, "num_input_tokens_seen": 108538650, "step": 5034, "time_per_iteration": 2.7709474563598633 }, { "auxiliary_loss_clip": 0.01333371, "auxiliary_loss_mlp": 0.01024834, "balance_loss_clip": 1.04626477, "balance_loss_mlp": 1.01726973, "epoch": 0.605422954367823, "flos": 15154698412800.0, "grad_norm": 2.3823539755416467, "language_loss": 0.71914679, "learning_rate": 1.4225529018537352e-06, "loss": 0.74272883, "num_input_tokens_seen": 108554155, "step": 5035, "time_per_iteration": 2.739236354827881 }, { "auxiliary_loss_clip": 0.01181977, "auxiliary_loss_mlp": 0.01034175, "balance_loss_clip": 1.05219126, "balance_loss_mlp": 1.02677214, "epoch": 0.6055431972584621, "flos": 27673912131840.0, "grad_norm": 3.603574733174264, "language_loss": 0.77924377, "learning_rate": 1.4218071484725082e-06, "loss": 0.80140531, "num_input_tokens_seen": 108576275, "step": 5036, "time_per_iteration": 2.693207025527954 }, { "auxiliary_loss_clip": 0.01280212, "auxiliary_loss_mlp": 0.01031364, "balance_loss_clip": 1.05347753, "balance_loss_mlp": 1.02349019, "epoch": 0.6056634401491012, "flos": 19387489006080.0, "grad_norm": 2.086728143538502, "language_loss": 0.76417339, "learning_rate": 1.4210614828026786e-06, "loss": 0.78728914, "num_input_tokens_seen": 108594125, "step": 5037, "time_per_iteration": 2.709721326828003 }, { "auxiliary_loss_clip": 0.011792, "auxiliary_loss_mlp": 0.01031934, "balance_loss_clip": 1.04998922, "balance_loss_mlp": 1.02493072, "epoch": 0.6057836830397403, "flos": 24789459294720.0, "grad_norm": 2.031425669881099, "language_loss": 0.74455422, "learning_rate": 1.4203159049573605e-06, "loss": 0.76666558, "num_input_tokens_seen": 108615360, "step": 5038, "time_per_iteration": 2.8645055294036865 }, { "auxiliary_loss_clip": 0.01187378, "auxiliary_loss_mlp": 0.01026192, "balance_loss_clip": 1.04781139, "balance_loss_mlp": 1.01887512, "epoch": 0.6059039259303793, "flos": 20558248899840.0, "grad_norm": 2.2937301483063397, "language_loss": 0.87160778, "learning_rate": 1.4195704150496593e-06, "loss": 0.89374352, "num_input_tokens_seen": 108633075, "step": 5039, "time_per_iteration": 2.7440223693847656 }, { "auxiliary_loss_clip": 0.01281362, "auxiliary_loss_mlp": 0.01028728, "balance_loss_clip": 1.05087733, "balance_loss_mlp": 1.02100921, "epoch": 0.6060241688210185, "flos": 21069724613760.0, "grad_norm": 1.7075092883101335, "language_loss": 0.73981327, "learning_rate": 1.4188250131926639e-06, "loss": 0.76291418, "num_input_tokens_seen": 108651875, "step": 5040, "time_per_iteration": 2.6357831954956055 }, { "auxiliary_loss_clip": 0.01278966, "auxiliary_loss_mlp": 0.01025436, "balance_loss_clip": 1.04639375, "balance_loss_mlp": 1.01812506, "epoch": 0.6061444117116576, "flos": 16361081619840.0, "grad_norm": 2.3033657665835627, "language_loss": 0.80372596, "learning_rate": 1.4180796994994525e-06, "loss": 0.82677001, "num_input_tokens_seen": 108669290, "step": 5041, "time_per_iteration": 2.6870367527008057 }, { "auxiliary_loss_clip": 0.01276345, "auxiliary_loss_mlp": 0.01028131, "balance_loss_clip": 1.04726529, "balance_loss_mlp": 1.0204004, "epoch": 0.6062646546022966, "flos": 21507296094720.0, "grad_norm": 1.9399255968810547, "language_loss": 0.71836913, "learning_rate": 1.4173344740830877e-06, "loss": 0.74141389, "num_input_tokens_seen": 108688420, "step": 5042, "time_per_iteration": 2.7284653186798096 }, { "auxiliary_loss_clip": 0.01283088, "auxiliary_loss_mlp": 0.01026125, "balance_loss_clip": 1.05286574, "balance_loss_mlp": 1.01877522, "epoch": 0.6063848974929358, "flos": 38983151283840.0, "grad_norm": 1.6117692071480423, "language_loss": 0.70862311, "learning_rate": 1.4165893370566206e-06, "loss": 0.7317152, "num_input_tokens_seen": 108712175, "step": 5043, "time_per_iteration": 2.7989611625671387 }, { "auxiliary_loss_clip": 0.01229442, "auxiliary_loss_mlp": 0.01027001, "balance_loss_clip": 1.04831886, "balance_loss_mlp": 1.01930881, "epoch": 0.6065051403835748, "flos": 19646584784640.0, "grad_norm": 1.8178210417728258, "language_loss": 0.77845156, "learning_rate": 1.4158442885330865e-06, "loss": 0.80101597, "num_input_tokens_seen": 108730745, "step": 5044, "time_per_iteration": 2.6994550228118896 }, { "auxiliary_loss_clip": 0.01226311, "auxiliary_loss_mlp": 0.01026883, "balance_loss_clip": 1.04766738, "balance_loss_mlp": 1.01953411, "epoch": 0.6066253832742139, "flos": 23513086437120.0, "grad_norm": 1.9846539028871306, "language_loss": 0.78971744, "learning_rate": 1.4150993286255094e-06, "loss": 0.81224942, "num_input_tokens_seen": 108749995, "step": 5045, "time_per_iteration": 2.6694443225860596 }, { "auxiliary_loss_clip": 0.01178727, "auxiliary_loss_mlp": 0.01031582, "balance_loss_clip": 1.05026615, "balance_loss_mlp": 1.02461398, "epoch": 0.6067456261648531, "flos": 19133708440320.0, "grad_norm": 2.1178600239081207, "language_loss": 0.79970717, "learning_rate": 1.4143544574468993e-06, "loss": 0.82181025, "num_input_tokens_seen": 108768355, "step": 5046, "time_per_iteration": 2.6635169982910156 }, { "auxiliary_loss_clip": 0.01225376, "auxiliary_loss_mlp": 0.01026207, "balance_loss_clip": 1.04951727, "balance_loss_mlp": 1.01891458, "epoch": 0.6068658690554921, "flos": 20520614424960.0, "grad_norm": 1.705358468667441, "language_loss": 0.82405257, "learning_rate": 1.4136096751102523e-06, "loss": 0.84656841, "num_input_tokens_seen": 108786685, "step": 5047, "time_per_iteration": 2.773298740386963 }, { "auxiliary_loss_clip": 0.01281522, "auxiliary_loss_mlp": 0.01031037, "balance_loss_clip": 1.05042124, "balance_loss_mlp": 1.02362788, "epoch": 0.6069861119461312, "flos": 27374560185600.0, "grad_norm": 2.382053825167616, "language_loss": 0.8273164, "learning_rate": 1.4128649817285516e-06, "loss": 0.85044199, "num_input_tokens_seen": 108804820, "step": 5048, "time_per_iteration": 2.830150842666626 }, { "auxiliary_loss_clip": 0.01278899, "auxiliary_loss_mlp": 0.01031956, "balance_loss_clip": 1.04491341, "balance_loss_mlp": 1.02444839, "epoch": 0.6071063548367702, "flos": 25626500904960.0, "grad_norm": 1.8738239737712616, "language_loss": 0.63194084, "learning_rate": 1.412120377414766e-06, "loss": 0.65504938, "num_input_tokens_seen": 108825010, "step": 5049, "time_per_iteration": 2.8006887435913086 }, { "auxiliary_loss_clip": 0.01178871, "auxiliary_loss_mlp": 0.01035334, "balance_loss_clip": 1.05214274, "balance_loss_mlp": 1.02740908, "epoch": 0.6072265977274094, "flos": 24460517520000.0, "grad_norm": 1.6620979789864923, "language_loss": 0.71454352, "learning_rate": 1.4113758622818522e-06, "loss": 0.73668551, "num_input_tokens_seen": 108845075, "step": 5050, "time_per_iteration": 4.141252040863037 }, { "auxiliary_loss_clip": 0.01286883, "auxiliary_loss_mlp": 0.02564543, "balance_loss_clip": 1.05218256, "balance_loss_mlp": 0.99996841, "epoch": 0.6073468406180484, "flos": 18149253413760.0, "grad_norm": 2.0492725985942313, "language_loss": 0.83405542, "learning_rate": 1.410631436442751e-06, "loss": 0.87256968, "num_input_tokens_seen": 108863870, "step": 5051, "time_per_iteration": 3.732515573501587 }, { "auxiliary_loss_clip": 0.01233554, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.04850221, "balance_loss_mlp": 1.02479386, "epoch": 0.6074670835086875, "flos": 20697617669760.0, "grad_norm": 1.9869314742199573, "language_loss": 0.86571693, "learning_rate": 1.4098871000103936e-06, "loss": 0.88837427, "num_input_tokens_seen": 108882470, "step": 5052, "time_per_iteration": 2.7047574520111084 }, { "auxiliary_loss_clip": 0.01180527, "auxiliary_loss_mlp": 0.01025095, "balance_loss_clip": 1.04677737, "balance_loss_mlp": 1.01800203, "epoch": 0.6075873263993267, "flos": 23769955572480.0, "grad_norm": 2.1586860286235723, "language_loss": 0.82618189, "learning_rate": 1.409142853097693e-06, "loss": 0.84823811, "num_input_tokens_seen": 108902710, "step": 5053, "time_per_iteration": 3.6650140285491943 }, { "auxiliary_loss_clip": 0.01278518, "auxiliary_loss_mlp": 0.01027451, "balance_loss_clip": 1.04855883, "balance_loss_mlp": 1.01957679, "epoch": 0.6077075692899657, "flos": 24454484035200.0, "grad_norm": 2.056654171044102, "language_loss": 0.79671979, "learning_rate": 1.408398695817553e-06, "loss": 0.81977946, "num_input_tokens_seen": 108919935, "step": 5054, "time_per_iteration": 2.812471628189087 }, { "auxiliary_loss_clip": 0.01279205, "auxiliary_loss_mlp": 0.01029177, "balance_loss_clip": 1.04618406, "balance_loss_mlp": 1.02042747, "epoch": 0.6078278121806048, "flos": 27382102041600.0, "grad_norm": 3.19384642144775, "language_loss": 0.70598334, "learning_rate": 1.4076546282828593e-06, "loss": 0.72906721, "num_input_tokens_seen": 108942790, "step": 5055, "time_per_iteration": 2.776175022125244 }, { "auxiliary_loss_clip": 0.0128233, "auxiliary_loss_mlp": 0.01027296, "balance_loss_clip": 1.04473627, "balance_loss_mlp": 1.01953495, "epoch": 0.6079480550712439, "flos": 38436447306240.0, "grad_norm": 3.8082812372556285, "language_loss": 0.66488147, "learning_rate": 1.4069106506064874e-06, "loss": 0.68797779, "num_input_tokens_seen": 108964215, "step": 5056, "time_per_iteration": 2.9406931400299072 }, { "auxiliary_loss_clip": 0.01275921, "auxiliary_loss_mlp": 0.01024484, "balance_loss_clip": 1.05092275, "balance_loss_mlp": 1.0178473, "epoch": 0.608068297961883, "flos": 25336271013120.0, "grad_norm": 2.153407766123738, "language_loss": 0.78668827, "learning_rate": 1.4061667629012989e-06, "loss": 0.80969232, "num_input_tokens_seen": 108984885, "step": 5057, "time_per_iteration": 2.7238872051239014 }, { "auxiliary_loss_clip": 0.01270663, "auxiliary_loss_mlp": 0.01026188, "balance_loss_clip": 1.04790413, "balance_loss_mlp": 1.01894581, "epoch": 0.608188540852522, "flos": 24202463235840.0, "grad_norm": 1.7336160507821639, "language_loss": 0.83341753, "learning_rate": 1.40542296528014e-06, "loss": 0.85638607, "num_input_tokens_seen": 109004545, "step": 5058, "time_per_iteration": 3.622704267501831 }, { "auxiliary_loss_clip": 0.01229735, "auxiliary_loss_mlp": 0.01031717, "balance_loss_clip": 1.0479095, "balance_loss_mlp": 1.02399206, "epoch": 0.6083087837431612, "flos": 21284146851840.0, "grad_norm": 2.2880985935399076, "language_loss": 0.76162452, "learning_rate": 1.4046792578558452e-06, "loss": 0.78423899, "num_input_tokens_seen": 109022440, "step": 5059, "time_per_iteration": 2.7045700550079346 }, { "auxiliary_loss_clip": 0.0127794, "auxiliary_loss_mlp": 0.01026913, "balance_loss_clip": 1.04683256, "balance_loss_mlp": 1.02023149, "epoch": 0.6084290266338003, "flos": 16471435178880.0, "grad_norm": 2.6860940550840278, "language_loss": 0.76101631, "learning_rate": 1.4039356407412325e-06, "loss": 0.78406489, "num_input_tokens_seen": 109035680, "step": 5060, "time_per_iteration": 2.730055809020996 }, { "auxiliary_loss_clip": 0.01118837, "auxiliary_loss_mlp": 0.00999149, "balance_loss_clip": 1.00738192, "balance_loss_mlp": 0.99816573, "epoch": 0.6085492695244393, "flos": 66443574931200.0, "grad_norm": 0.7771161623019952, "language_loss": 0.57084423, "learning_rate": 1.40319211404911e-06, "loss": 0.59202409, "num_input_tokens_seen": 109090680, "step": 5061, "time_per_iteration": 3.218367576599121 }, { "auxiliary_loss_clip": 0.01182033, "auxiliary_loss_mlp": 0.01027899, "balance_loss_clip": 1.05204296, "balance_loss_mlp": 1.02093983, "epoch": 0.6086695124150785, "flos": 23618986709760.0, "grad_norm": 1.7604036922074076, "language_loss": 0.90510488, "learning_rate": 1.4024486778922691e-06, "loss": 0.92720419, "num_input_tokens_seen": 109108995, "step": 5062, "time_per_iteration": 2.684919834136963 }, { "auxiliary_loss_clip": 0.01284927, "auxiliary_loss_mlp": 0.01024403, "balance_loss_clip": 1.04760289, "balance_loss_mlp": 1.01724124, "epoch": 0.6087897553057176, "flos": 20157054917760.0, "grad_norm": 2.4989852901873943, "language_loss": 0.77817535, "learning_rate": 1.4017053323834884e-06, "loss": 0.80126864, "num_input_tokens_seen": 109128825, "step": 5063, "time_per_iteration": 2.6925270557403564 }, { "auxiliary_loss_clip": 0.01279401, "auxiliary_loss_mlp": 0.01026013, "balance_loss_clip": 1.04566479, "balance_loss_mlp": 1.01909876, "epoch": 0.6089099981963566, "flos": 25482535194240.0, "grad_norm": 2.0483751071076877, "language_loss": 0.76034105, "learning_rate": 1.4009620776355333e-06, "loss": 0.78339517, "num_input_tokens_seen": 109150425, "step": 5064, "time_per_iteration": 2.8085622787475586 }, { "auxiliary_loss_clip": 0.01228504, "auxiliary_loss_mlp": 0.01023567, "balance_loss_clip": 1.04720771, "balance_loss_mlp": 1.01616967, "epoch": 0.6090302410869958, "flos": 25332895134720.0, "grad_norm": 1.747191434384784, "language_loss": 0.79150331, "learning_rate": 1.4002189137611553e-06, "loss": 0.81402403, "num_input_tokens_seen": 109169765, "step": 5065, "time_per_iteration": 2.7434160709381104 }, { "auxiliary_loss_clip": 0.01227366, "auxiliary_loss_mlp": 0.01026951, "balance_loss_clip": 1.04738891, "balance_loss_mlp": 1.01976275, "epoch": 0.6091504839776348, "flos": 23987358639360.0, "grad_norm": 1.983116778925235, "language_loss": 0.69790727, "learning_rate": 1.3994758408730901e-06, "loss": 0.7204504, "num_input_tokens_seen": 109188950, "step": 5066, "time_per_iteration": 2.733985185623169 }, { "auxiliary_loss_clip": 0.01281335, "auxiliary_loss_mlp": 0.01026771, "balance_loss_clip": 1.05075538, "balance_loss_mlp": 1.01964247, "epoch": 0.6092707268682739, "flos": 29643037666560.0, "grad_norm": 24.778478681251023, "language_loss": 0.76264465, "learning_rate": 1.3987328590840629e-06, "loss": 0.78572571, "num_input_tokens_seen": 109209895, "step": 5067, "time_per_iteration": 2.7102322578430176 }, { "auxiliary_loss_clip": 0.01229304, "auxiliary_loss_mlp": 0.01032149, "balance_loss_clip": 1.04787052, "balance_loss_mlp": 1.02432275, "epoch": 0.609390969758913, "flos": 24024957200640.0, "grad_norm": 1.8250819287650555, "language_loss": 0.86709768, "learning_rate": 1.397989968506783e-06, "loss": 0.88971221, "num_input_tokens_seen": 109228905, "step": 5068, "time_per_iteration": 2.662245512008667 }, { "auxiliary_loss_clip": 0.01185095, "auxiliary_loss_mlp": 0.01027226, "balance_loss_clip": 1.05434442, "balance_loss_mlp": 1.01970053, "epoch": 0.6095112126495521, "flos": 11102143288320.0, "grad_norm": 2.4832002341530743, "language_loss": 0.72801512, "learning_rate": 1.3972471692539458e-06, "loss": 0.7501384, "num_input_tokens_seen": 109243620, "step": 5069, "time_per_iteration": 2.5485498905181885 }, { "auxiliary_loss_clip": 0.01276053, "auxiliary_loss_mlp": 0.01025908, "balance_loss_clip": 1.04706156, "balance_loss_mlp": 1.01842201, "epoch": 0.6096314555401912, "flos": 17265491187840.0, "grad_norm": 2.4331394185749806, "language_loss": 0.75345838, "learning_rate": 1.3965044614382348e-06, "loss": 0.77647799, "num_input_tokens_seen": 109259070, "step": 5070, "time_per_iteration": 2.6407153606414795 }, { "auxiliary_loss_clip": 0.01183915, "auxiliary_loss_mlp": 0.01023759, "balance_loss_clip": 1.05279875, "balance_loss_mlp": 1.01569438, "epoch": 0.6097516984308303, "flos": 21645910679040.0, "grad_norm": 3.1914612237330293, "language_loss": 0.75773156, "learning_rate": 1.3957618451723162e-06, "loss": 0.77980828, "num_input_tokens_seen": 109275100, "step": 5071, "time_per_iteration": 2.586203098297119 }, { "auxiliary_loss_clip": 0.01279355, "auxiliary_loss_mlp": 0.01032586, "balance_loss_clip": 1.04743218, "balance_loss_mlp": 1.02506638, "epoch": 0.6098719413214694, "flos": 27199208966400.0, "grad_norm": 2.339498541173769, "language_loss": 0.71637714, "learning_rate": 1.3950193205688457e-06, "loss": 0.73949659, "num_input_tokens_seen": 109294825, "step": 5072, "time_per_iteration": 2.799733877182007 }, { "auxiliary_loss_clip": 0.01277481, "auxiliary_loss_mlp": 0.01026017, "balance_loss_clip": 1.04876447, "balance_loss_mlp": 1.01918304, "epoch": 0.6099921842121084, "flos": 20412954385920.0, "grad_norm": 1.9241321393487125, "language_loss": 0.83883077, "learning_rate": 1.3942768877404627e-06, "loss": 0.86186576, "num_input_tokens_seen": 109313790, "step": 5073, "time_per_iteration": 2.6849329471588135 }, { "auxiliary_loss_clip": 0.01178805, "auxiliary_loss_mlp": 0.01022837, "balance_loss_clip": 1.04876351, "balance_loss_mlp": 1.01544023, "epoch": 0.6101124271027476, "flos": 23366139897600.0, "grad_norm": 1.7002385726661227, "language_loss": 0.73780036, "learning_rate": 1.393534546799795e-06, "loss": 0.75981677, "num_input_tokens_seen": 109333490, "step": 5074, "time_per_iteration": 2.6749420166015625 }, { "auxiliary_loss_clip": 0.01271104, "auxiliary_loss_mlp": 0.01029556, "balance_loss_clip": 1.04722333, "balance_loss_mlp": 1.02143168, "epoch": 0.6102326699933867, "flos": 26687840993280.0, "grad_norm": 1.6414674550216768, "language_loss": 0.68011057, "learning_rate": 1.3927922978594536e-06, "loss": 0.70311713, "num_input_tokens_seen": 109354575, "step": 5075, "time_per_iteration": 2.786829710006714 }, { "auxiliary_loss_clip": 0.01115416, "auxiliary_loss_mlp": 0.01002627, "balance_loss_clip": 1.00775933, "balance_loss_mlp": 1.00154257, "epoch": 0.6103529128840257, "flos": 60644612551680.0, "grad_norm": 0.7721370157636712, "language_loss": 0.57393837, "learning_rate": 1.3920501410320387e-06, "loss": 0.59511882, "num_input_tokens_seen": 109410690, "step": 5076, "time_per_iteration": 4.255307912826538 }, { "auxiliary_loss_clip": 0.01277201, "auxiliary_loss_mlp": 0.01022673, "balance_loss_clip": 1.04540157, "balance_loss_mlp": 1.01501966, "epoch": 0.6104731557746649, "flos": 19021307806080.0, "grad_norm": 2.6902751429848393, "language_loss": 0.75746793, "learning_rate": 1.3913080764301333e-06, "loss": 0.78046668, "num_input_tokens_seen": 109427650, "step": 5077, "time_per_iteration": 3.625964403152466 }, { "auxiliary_loss_clip": 0.01386087, "auxiliary_loss_mlp": 0.0103124, "balance_loss_clip": 1.04314232, "balance_loss_mlp": 1.02367306, "epoch": 0.6105933986653039, "flos": 23366894083200.0, "grad_norm": 1.7802188404077606, "language_loss": 0.71476889, "learning_rate": 1.3905661041663085e-06, "loss": 0.73894221, "num_input_tokens_seen": 109448835, "step": 5078, "time_per_iteration": 2.841189384460449 }, { "auxiliary_loss_clip": 0.01231068, "auxiliary_loss_mlp": 0.01030497, "balance_loss_clip": 1.05143046, "balance_loss_mlp": 1.02277219, "epoch": 0.610713641555943, "flos": 34637565006720.0, "grad_norm": 2.054643632275565, "language_loss": 0.64722079, "learning_rate": 1.389824224353122e-06, "loss": 0.66983646, "num_input_tokens_seen": 109470425, "step": 5079, "time_per_iteration": 3.659517526626587 }, { "auxiliary_loss_clip": 0.01227704, "auxiliary_loss_mlp": 0.01027715, "balance_loss_clip": 1.05134881, "balance_loss_mlp": 1.02026463, "epoch": 0.610833884446582, "flos": 26646471504000.0, "grad_norm": 1.6337457802526285, "language_loss": 0.76893508, "learning_rate": 1.389082437103115e-06, "loss": 0.79148924, "num_input_tokens_seen": 109489695, "step": 5080, "time_per_iteration": 2.769146203994751 }, { "auxiliary_loss_clip": 0.01327851, "auxiliary_loss_mlp": 0.01026775, "balance_loss_clip": 1.04477632, "balance_loss_mlp": 1.01913965, "epoch": 0.6109541273372212, "flos": 21215126868480.0, "grad_norm": 1.934015409492076, "language_loss": 0.78669846, "learning_rate": 1.3883407425288172e-06, "loss": 0.81024474, "num_input_tokens_seen": 109510030, "step": 5081, "time_per_iteration": 2.769529342651367 }, { "auxiliary_loss_clip": 0.01278341, "auxiliary_loss_mlp": 0.01026901, "balance_loss_clip": 1.04550993, "balance_loss_mlp": 1.01964092, "epoch": 0.6110743702278603, "flos": 20084084438400.0, "grad_norm": 2.8838529712910326, "language_loss": 0.79736185, "learning_rate": 1.3875991407427417e-06, "loss": 0.82041425, "num_input_tokens_seen": 109528255, "step": 5082, "time_per_iteration": 2.6802284717559814 }, { "auxiliary_loss_clip": 0.01231161, "auxiliary_loss_mlp": 0.01001822, "balance_loss_clip": 1.00893283, "balance_loss_mlp": 1.00076711, "epoch": 0.6111946131184993, "flos": 68302957438080.0, "grad_norm": 0.7831935192920694, "language_loss": 0.58144033, "learning_rate": 1.38685763185739e-06, "loss": 0.60377014, "num_input_tokens_seen": 109581915, "step": 5083, "time_per_iteration": 3.2711102962493896 }, { "auxiliary_loss_clip": 0.01177929, "auxiliary_loss_mlp": 0.01024386, "balance_loss_clip": 1.05007136, "balance_loss_mlp": 1.01696253, "epoch": 0.6113148560091385, "flos": 19937676602880.0, "grad_norm": 2.62657510639017, "language_loss": 0.6770398, "learning_rate": 1.3861162159852476e-06, "loss": 0.69906294, "num_input_tokens_seen": 109600050, "step": 5084, "time_per_iteration": 3.5393564701080322 }, { "auxiliary_loss_clip": 0.01284866, "auxiliary_loss_mlp": 0.01024148, "balance_loss_clip": 1.05095577, "balance_loss_mlp": 1.01670909, "epoch": 0.6114350988997775, "flos": 23731854220800.0, "grad_norm": 2.075124553832144, "language_loss": 0.79806733, "learning_rate": 1.3853748932387875e-06, "loss": 0.8211574, "num_input_tokens_seen": 109620690, "step": 5085, "time_per_iteration": 2.7660224437713623 }, { "auxiliary_loss_clip": 0.0127205, "auxiliary_loss_mlp": 0.01024575, "balance_loss_clip": 1.04629374, "balance_loss_mlp": 1.01711798, "epoch": 0.6115553417904166, "flos": 24023700224640.0, "grad_norm": 6.067525705034636, "language_loss": 0.75035137, "learning_rate": 1.3846336637304671e-06, "loss": 0.77331758, "num_input_tokens_seen": 109638960, "step": 5086, "time_per_iteration": 2.735663414001465 }, { "auxiliary_loss_clip": 0.01280817, "auxiliary_loss_mlp": 0.01024088, "balance_loss_clip": 1.05004752, "balance_loss_mlp": 1.01700687, "epoch": 0.6116755846810558, "flos": 23733542160000.0, "grad_norm": 2.096768264603153, "language_loss": 0.83327842, "learning_rate": 1.3838925275727316e-06, "loss": 0.85632747, "num_input_tokens_seen": 109659700, "step": 5087, "time_per_iteration": 2.8134238719940186 }, { "auxiliary_loss_clip": 0.01181002, "auxiliary_loss_mlp": 0.01029125, "balance_loss_clip": 1.05196285, "balance_loss_mlp": 1.02164507, "epoch": 0.6117958275716948, "flos": 18661626967680.0, "grad_norm": 1.7086477015624633, "language_loss": 0.78824651, "learning_rate": 1.3831514848780089e-06, "loss": 0.8103478, "num_input_tokens_seen": 109679275, "step": 5088, "time_per_iteration": 2.6036839485168457 }, { "auxiliary_loss_clip": 0.01226345, "auxiliary_loss_mlp": 0.01033034, "balance_loss_clip": 1.04780459, "balance_loss_mlp": 1.02561879, "epoch": 0.6119160704623339, "flos": 16471183783680.0, "grad_norm": 2.33555512071763, "language_loss": 0.91948283, "learning_rate": 1.3824105357587152e-06, "loss": 0.94207656, "num_input_tokens_seen": 109696380, "step": 5089, "time_per_iteration": 2.6591315269470215 }, { "auxiliary_loss_clip": 0.01271779, "auxiliary_loss_mlp": 0.01027277, "balance_loss_clip": 1.04431272, "balance_loss_mlp": 1.0199815, "epoch": 0.612036313352973, "flos": 23915465568000.0, "grad_norm": 1.734549861503467, "language_loss": 0.82519281, "learning_rate": 1.381669680327253e-06, "loss": 0.84818339, "num_input_tokens_seen": 109718060, "step": 5090, "time_per_iteration": 2.865927219390869 }, { "auxiliary_loss_clip": 0.01272866, "auxiliary_loss_mlp": 0.0102704, "balance_loss_clip": 1.04737306, "balance_loss_mlp": 1.019557, "epoch": 0.6121565562436121, "flos": 26974766833920.0, "grad_norm": 2.614178639255029, "language_loss": 0.7128185, "learning_rate": 1.380928918696008e-06, "loss": 0.73581755, "num_input_tokens_seen": 109736830, "step": 5091, "time_per_iteration": 2.6894822120666504 }, { "auxiliary_loss_clip": 0.01223882, "auxiliary_loss_mlp": 0.01024908, "balance_loss_clip": 1.04712176, "balance_loss_mlp": 1.01757407, "epoch": 0.6122767991342511, "flos": 15668867646720.0, "grad_norm": 2.3477450039779018, "language_loss": 0.71829361, "learning_rate": 1.3801882509773548e-06, "loss": 0.74078155, "num_input_tokens_seen": 109754690, "step": 5092, "time_per_iteration": 2.662302255630493 }, { "auxiliary_loss_clip": 0.01226464, "auxiliary_loss_mlp": 0.01029988, "balance_loss_clip": 1.04677665, "balance_loss_mlp": 1.02280021, "epoch": 0.6123970420248903, "flos": 27964321591680.0, "grad_norm": 1.8333544805554682, "language_loss": 0.81670219, "learning_rate": 1.3794476772836503e-06, "loss": 0.83926672, "num_input_tokens_seen": 109775790, "step": 5093, "time_per_iteration": 2.708315372467041 }, { "auxiliary_loss_clip": 0.01324966, "auxiliary_loss_mlp": 0.01025211, "balance_loss_clip": 1.04746175, "balance_loss_mlp": 1.01716387, "epoch": 0.6125172849155294, "flos": 21468727866240.0, "grad_norm": 1.6891738140571082, "language_loss": 0.84518534, "learning_rate": 1.3787071977272402e-06, "loss": 0.86868709, "num_input_tokens_seen": 109795050, "step": 5094, "time_per_iteration": 2.738004207611084 }, { "auxiliary_loss_clip": 0.01372547, "auxiliary_loss_mlp": 0.01025522, "balance_loss_clip": 1.04763174, "balance_loss_mlp": 1.01758575, "epoch": 0.6126375278061684, "flos": 16248321849600.0, "grad_norm": 2.5081761880912308, "language_loss": 0.72080177, "learning_rate": 1.3779668124204535e-06, "loss": 0.74478245, "num_input_tokens_seen": 109811465, "step": 5095, "time_per_iteration": 2.6716678142547607 }, { "auxiliary_loss_clip": 0.01271413, "auxiliary_loss_mlp": 0.01023229, "balance_loss_clip": 1.04971433, "balance_loss_mlp": 1.01674092, "epoch": 0.6127577706968076, "flos": 20448865008000.0, "grad_norm": 1.6724381783352378, "language_loss": 0.809461, "learning_rate": 1.3772265214756074e-06, "loss": 0.83240741, "num_input_tokens_seen": 109831225, "step": 5096, "time_per_iteration": 2.722831964492798 }, { "auxiliary_loss_clip": 0.01230669, "auxiliary_loss_mlp": 0.01029308, "balance_loss_clip": 1.04776382, "balance_loss_mlp": 1.02192628, "epoch": 0.6128780135874466, "flos": 18260397072000.0, "grad_norm": 1.8926896620040188, "language_loss": 0.75520289, "learning_rate": 1.3764863250050025e-06, "loss": 0.77780271, "num_input_tokens_seen": 109849465, "step": 5097, "time_per_iteration": 2.651766300201416 }, { "auxiliary_loss_clip": 0.01325393, "auxiliary_loss_mlp": 0.0102705, "balance_loss_clip": 1.04397845, "balance_loss_mlp": 1.02004623, "epoch": 0.6129982564780857, "flos": 24937088192640.0, "grad_norm": 2.0660251192747747, "language_loss": 0.80419147, "learning_rate": 1.3757462231209272e-06, "loss": 0.82771587, "num_input_tokens_seen": 109869770, "step": 5098, "time_per_iteration": 2.7281382083892822 }, { "auxiliary_loss_clip": 0.01269513, "auxiliary_loss_mlp": 0.01029217, "balance_loss_clip": 1.0448128, "balance_loss_mlp": 1.02142692, "epoch": 0.6131184993687249, "flos": 22492038430080.0, "grad_norm": 2.371458902252567, "language_loss": 0.88863951, "learning_rate": 1.3750062159356525e-06, "loss": 0.91162682, "num_input_tokens_seen": 109889120, "step": 5099, "time_per_iteration": 2.69254732131958 }, { "auxiliary_loss_clip": 0.01315366, "auxiliary_loss_mlp": 0.01020693, "balance_loss_clip": 1.04239547, "balance_loss_mlp": 1.01385617, "epoch": 0.6132387422593639, "flos": 15885839750400.0, "grad_norm": 1.8859435975993797, "language_loss": 0.83045381, "learning_rate": 1.3742663035614382e-06, "loss": 0.85381442, "num_input_tokens_seen": 109906490, "step": 5100, "time_per_iteration": 2.762586832046509 }, { "auxiliary_loss_clip": 0.01179121, "auxiliary_loss_mlp": 0.01029823, "balance_loss_clip": 1.05024481, "balance_loss_mlp": 1.02252102, "epoch": 0.613358985150003, "flos": 25411539962880.0, "grad_norm": 2.034664740934993, "language_loss": 0.80163676, "learning_rate": 1.3735264861105283e-06, "loss": 0.82372618, "num_input_tokens_seen": 109927130, "step": 5101, "time_per_iteration": 2.7184720039367676 }, { "auxiliary_loss_clip": 0.01328545, "auxiliary_loss_mlp": 0.0102511, "balance_loss_clip": 1.04625583, "balance_loss_mlp": 1.01807117, "epoch": 0.6134792280406421, "flos": 21361283308800.0, "grad_norm": 5.907338035391929, "language_loss": 0.78415602, "learning_rate": 1.372786763695152e-06, "loss": 0.80769253, "num_input_tokens_seen": 109945890, "step": 5102, "time_per_iteration": 3.7518231868743896 }, { "auxiliary_loss_clip": 0.01233429, "auxiliary_loss_mlp": 0.0102917, "balance_loss_clip": 1.05000186, "balance_loss_mlp": 1.0212009, "epoch": 0.6135994709312812, "flos": 21211248199680.0, "grad_norm": 2.443113099558924, "language_loss": 0.77373075, "learning_rate": 1.3720471364275257e-06, "loss": 0.79635674, "num_input_tokens_seen": 109965535, "step": 5103, "time_per_iteration": 3.6105997562408447 }, { "auxiliary_loss_clip": 0.01329178, "auxiliary_loss_mlp": 0.02568586, "balance_loss_clip": 1.04647756, "balance_loss_mlp": 0.99998236, "epoch": 0.6137197138219203, "flos": 14794047907200.0, "grad_norm": 2.0997370297787237, "language_loss": 0.78796077, "learning_rate": 1.3713076044198486e-06, "loss": 0.82693839, "num_input_tokens_seen": 109982345, "step": 5104, "time_per_iteration": 2.7435214519500732 }, { "auxiliary_loss_clip": 0.01274272, "auxiliary_loss_mlp": 0.01029668, "balance_loss_clip": 1.04676259, "balance_loss_mlp": 1.02200246, "epoch": 0.6138399567125594, "flos": 20084515401600.0, "grad_norm": 2.810606703306165, "language_loss": 0.81094229, "learning_rate": 1.3705681677843086e-06, "loss": 0.83398175, "num_input_tokens_seen": 110000940, "step": 5105, "time_per_iteration": 3.58648419380188 }, { "auxiliary_loss_clip": 0.01064833, "auxiliary_loss_mlp": 0.01000372, "balance_loss_clip": 1.00756872, "balance_loss_mlp": 0.9993704, "epoch": 0.6139601996031985, "flos": 60123838193280.0, "grad_norm": 0.7936162611209164, "language_loss": 0.60491866, "learning_rate": 1.3698288266330768e-06, "loss": 0.62557071, "num_input_tokens_seen": 110061565, "step": 5106, "time_per_iteration": 3.2810003757476807 }, { "auxiliary_loss_clip": 0.01275077, "auxiliary_loss_mlp": 0.01022894, "balance_loss_clip": 1.05172014, "balance_loss_mlp": 1.01603329, "epoch": 0.6140804424938375, "flos": 23586703361280.0, "grad_norm": 2.1929564730477136, "language_loss": 0.72329116, "learning_rate": 1.3690895810783113e-06, "loss": 0.74627084, "num_input_tokens_seen": 110080360, "step": 5107, "time_per_iteration": 2.776644468307495 }, { "auxiliary_loss_clip": 0.01329843, "auxiliary_loss_mlp": 0.02567217, "balance_loss_clip": 1.04040289, "balance_loss_mlp": 0.9999795, "epoch": 0.6142006853844767, "flos": 21398199511680.0, "grad_norm": 2.2964756821147727, "language_loss": 0.71614981, "learning_rate": 1.3683504312321543e-06, "loss": 0.7551204, "num_input_tokens_seen": 110100695, "step": 5108, "time_per_iteration": 2.9300575256347656 }, { "auxiliary_loss_clip": 0.01234873, "auxiliary_loss_mlp": 0.01022743, "balance_loss_clip": 1.05085647, "balance_loss_mlp": 1.01547468, "epoch": 0.6143209282751158, "flos": 12057367622400.0, "grad_norm": 1.9803998944057724, "language_loss": 0.80130732, "learning_rate": 1.3676113772067355e-06, "loss": 0.82388353, "num_input_tokens_seen": 110117750, "step": 5109, "time_per_iteration": 2.9907724857330322 }, { "auxiliary_loss_clip": 0.01383491, "auxiliary_loss_mlp": 0.01028166, "balance_loss_clip": 1.04642177, "balance_loss_mlp": 1.02039361, "epoch": 0.6144411711657548, "flos": 25082274965760.0, "grad_norm": 1.961649429490757, "language_loss": 0.72380131, "learning_rate": 1.3668724191141671e-06, "loss": 0.74791789, "num_input_tokens_seen": 110137020, "step": 5110, "time_per_iteration": 3.729192018508911 }, { "auxiliary_loss_clip": 0.01329212, "auxiliary_loss_mlp": 0.01032072, "balance_loss_clip": 1.05345905, "balance_loss_mlp": 1.02488422, "epoch": 0.6145614140563939, "flos": 20114069316480.0, "grad_norm": 2.4881895040003394, "language_loss": 0.66502643, "learning_rate": 1.3661335570665493e-06, "loss": 0.68863928, "num_input_tokens_seen": 110154930, "step": 5111, "time_per_iteration": 2.7587826251983643 }, { "auxiliary_loss_clip": 0.01283091, "auxiliary_loss_mlp": 0.01027299, "balance_loss_clip": 1.05135393, "balance_loss_mlp": 1.02039075, "epoch": 0.614681656947033, "flos": 16800376953600.0, "grad_norm": 2.6182252829555157, "language_loss": 0.69748318, "learning_rate": 1.3653947911759676e-06, "loss": 0.72058713, "num_input_tokens_seen": 110172480, "step": 5112, "time_per_iteration": 2.7654523849487305 }, { "auxiliary_loss_clip": 0.01373038, "auxiliary_loss_mlp": 0.01032129, "balance_loss_clip": 1.04583406, "balance_loss_mlp": 1.02497029, "epoch": 0.6148018998376721, "flos": 38801587011840.0, "grad_norm": 1.6807348041616355, "language_loss": 0.74602413, "learning_rate": 1.3646561215544904e-06, "loss": 0.7700758, "num_input_tokens_seen": 110197120, "step": 5113, "time_per_iteration": 2.9578561782836914 }, { "auxiliary_loss_clip": 0.01227424, "auxiliary_loss_mlp": 0.01024866, "balance_loss_clip": 1.05005574, "balance_loss_mlp": 1.01743889, "epoch": 0.6149221427283111, "flos": 23327032965120.0, "grad_norm": 2.3323324350479084, "language_loss": 0.79660273, "learning_rate": 1.363917548314176e-06, "loss": 0.81912565, "num_input_tokens_seen": 110216385, "step": 5114, "time_per_iteration": 2.775977373123169 }, { "auxiliary_loss_clip": 0.01134192, "auxiliary_loss_mlp": 0.01034406, "balance_loss_clip": 1.04886377, "balance_loss_mlp": 1.02700651, "epoch": 0.6150423856189503, "flos": 22379494141440.0, "grad_norm": 1.7972440066675273, "language_loss": 0.72958148, "learning_rate": 1.3631790715670626e-06, "loss": 0.75126743, "num_input_tokens_seen": 110234790, "step": 5115, "time_per_iteration": 2.7443110942840576 }, { "auxiliary_loss_clip": 0.01463147, "auxiliary_loss_mlp": 0.01023862, "balance_loss_clip": 1.04335523, "balance_loss_mlp": 1.01720452, "epoch": 0.6151626285095894, "flos": 18692078722560.0, "grad_norm": 2.173761382545158, "language_loss": 0.85850108, "learning_rate": 1.3624406914251783e-06, "loss": 0.88337123, "num_input_tokens_seen": 110251910, "step": 5116, "time_per_iteration": 2.9799575805664062 }, { "auxiliary_loss_clip": 0.01228751, "auxiliary_loss_mlp": 0.01023522, "balance_loss_clip": 1.04750562, "balance_loss_mlp": 1.01681685, "epoch": 0.6152828714002284, "flos": 15851688894720.0, "grad_norm": 1.8983580874208135, "language_loss": 0.88651681, "learning_rate": 1.3617024080005335e-06, "loss": 0.9090395, "num_input_tokens_seen": 110268810, "step": 5117, "time_per_iteration": 2.7704038619995117 }, { "auxiliary_loss_clip": 0.01178742, "auxiliary_loss_mlp": 0.02569393, "balance_loss_clip": 1.04594541, "balance_loss_mlp": 0.99995601, "epoch": 0.6154031142908676, "flos": 24869792062080.0, "grad_norm": 1.6271135355913813, "language_loss": 0.74328035, "learning_rate": 1.3609642214051266e-06, "loss": 0.78076166, "num_input_tokens_seen": 110293035, "step": 5118, "time_per_iteration": 2.8735740184783936 }, { "auxiliary_loss_clip": 0.01281828, "auxiliary_loss_mlp": 0.0102717, "balance_loss_clip": 1.0529449, "balance_loss_mlp": 1.01941574, "epoch": 0.6155233571815066, "flos": 19244744357760.0, "grad_norm": 1.8610784354118166, "language_loss": 0.66343129, "learning_rate": 1.3602261317509385e-06, "loss": 0.68652129, "num_input_tokens_seen": 110309695, "step": 5119, "time_per_iteration": 2.745975971221924 }, { "auxiliary_loss_clip": 0.01230423, "auxiliary_loss_mlp": 0.01022505, "balance_loss_clip": 1.05054808, "balance_loss_mlp": 1.01517069, "epoch": 0.6156436000721457, "flos": 18770077105920.0, "grad_norm": 3.1905826166171742, "language_loss": 0.82967532, "learning_rate": 1.3594881391499387e-06, "loss": 0.85220456, "num_input_tokens_seen": 110328610, "step": 5120, "time_per_iteration": 2.6698200702667236 }, { "auxiliary_loss_clip": 0.01277466, "auxiliary_loss_mlp": 0.01028885, "balance_loss_clip": 1.04678631, "balance_loss_mlp": 1.02164936, "epoch": 0.6157638429627849, "flos": 18041198325120.0, "grad_norm": 1.8514314356588815, "language_loss": 0.79604226, "learning_rate": 1.3587502437140778e-06, "loss": 0.81910574, "num_input_tokens_seen": 110346775, "step": 5121, "time_per_iteration": 2.7267770767211914 }, { "auxiliary_loss_clip": 0.01281474, "auxiliary_loss_mlp": 0.01031836, "balance_loss_clip": 1.04691994, "balance_loss_mlp": 1.02471876, "epoch": 0.6158840858534239, "flos": 25556726736000.0, "grad_norm": 2.5099835089305578, "language_loss": 0.85009122, "learning_rate": 1.3580124455552952e-06, "loss": 0.87322426, "num_input_tokens_seen": 110366140, "step": 5122, "time_per_iteration": 2.8027498722076416 }, { "auxiliary_loss_clip": 0.01226615, "auxiliary_loss_mlp": 0.02563958, "balance_loss_clip": 1.04950523, "balance_loss_mlp": 0.99996656, "epoch": 0.616004328744063, "flos": 24640788902400.0, "grad_norm": 1.7755147425596598, "language_loss": 0.8731975, "learning_rate": 1.3572747447855148e-06, "loss": 0.91110319, "num_input_tokens_seen": 110386550, "step": 5123, "time_per_iteration": 2.760728597640991 }, { "auxiliary_loss_clip": 0.01183512, "auxiliary_loss_mlp": 0.01029246, "balance_loss_clip": 1.05328405, "balance_loss_mlp": 1.02139604, "epoch": 0.6161245716347021, "flos": 21689686379520.0, "grad_norm": 1.8922802823042264, "language_loss": 0.69351304, "learning_rate": 1.356537141516644e-06, "loss": 0.71564054, "num_input_tokens_seen": 110403970, "step": 5124, "time_per_iteration": 2.6115829944610596 }, { "auxiliary_loss_clip": 0.01226506, "auxiliary_loss_mlp": 0.01028682, "balance_loss_clip": 1.05128622, "balance_loss_mlp": 1.02142787, "epoch": 0.6162448145253412, "flos": 35189225061120.0, "grad_norm": 2.489054613470497, "language_loss": 0.61631799, "learning_rate": 1.3557996358605775e-06, "loss": 0.63886982, "num_input_tokens_seen": 110423890, "step": 5125, "time_per_iteration": 2.819504737854004 }, { "auxiliary_loss_clip": 0.01228565, "auxiliary_loss_mlp": 0.01026568, "balance_loss_clip": 1.0485934, "balance_loss_mlp": 1.01973724, "epoch": 0.6163650574159802, "flos": 21615279356160.0, "grad_norm": 2.2466334089035707, "language_loss": 0.70352024, "learning_rate": 1.3550622279291941e-06, "loss": 0.7260716, "num_input_tokens_seen": 110442035, "step": 5126, "time_per_iteration": 2.672429323196411 }, { "auxiliary_loss_clip": 0.01367066, "auxiliary_loss_mlp": 0.01024251, "balance_loss_clip": 1.04225373, "balance_loss_mlp": 1.01709008, "epoch": 0.6164853003066194, "flos": 24572163968640.0, "grad_norm": 1.418581236876818, "language_loss": 0.83451629, "learning_rate": 1.354324917834358e-06, "loss": 0.85842949, "num_input_tokens_seen": 110463280, "step": 5127, "time_per_iteration": 2.848684310913086 }, { "auxiliary_loss_clip": 0.01426828, "auxiliary_loss_mlp": 0.02567051, "balance_loss_clip": 1.04590964, "balance_loss_mlp": 0.99992347, "epoch": 0.6166055431972585, "flos": 21835986474240.0, "grad_norm": 3.5088033132772725, "language_loss": 0.77156854, "learning_rate": 1.353587705687918e-06, "loss": 0.81150734, "num_input_tokens_seen": 110481455, "step": 5128, "time_per_iteration": 4.22551703453064 }, { "auxiliary_loss_clip": 0.01287001, "auxiliary_loss_mlp": 0.01032403, "balance_loss_clip": 1.05263257, "balance_loss_mlp": 1.02402818, "epoch": 0.6167257860878975, "flos": 17785262943360.0, "grad_norm": 7.130138144029775, "language_loss": 0.72627622, "learning_rate": 1.3528505916017096e-06, "loss": 0.74947023, "num_input_tokens_seen": 110499155, "step": 5129, "time_per_iteration": 3.6219944953918457 }, { "auxiliary_loss_clip": 0.01230193, "auxiliary_loss_mlp": 0.01024439, "balance_loss_clip": 1.04780316, "balance_loss_mlp": 1.01704192, "epoch": 0.6168460289785367, "flos": 23214811898880.0, "grad_norm": 2.0887777051120198, "language_loss": 0.88790703, "learning_rate": 1.3521135756875514e-06, "loss": 0.91045332, "num_input_tokens_seen": 110515470, "step": 5130, "time_per_iteration": 2.7215476036071777 }, { "auxiliary_loss_clip": 0.01419657, "auxiliary_loss_mlp": 0.01022724, "balance_loss_clip": 1.04278588, "balance_loss_mlp": 1.01589656, "epoch": 0.6169662718691757, "flos": 26213281482240.0, "grad_norm": 1.4668499678122036, "language_loss": 0.86190706, "learning_rate": 1.3513766580572496e-06, "loss": 0.88633084, "num_input_tokens_seen": 110538290, "step": 5131, "time_per_iteration": 3.7690961360931396 }, { "auxiliary_loss_clip": 0.01229049, "auxiliary_loss_mlp": 0.01027295, "balance_loss_clip": 1.05060124, "balance_loss_mlp": 1.02122188, "epoch": 0.6170865147598148, "flos": 19026120228480.0, "grad_norm": 2.1272319484678173, "language_loss": 0.76946324, "learning_rate": 1.3506398388225924e-06, "loss": 0.79202664, "num_input_tokens_seen": 110555610, "step": 5132, "time_per_iteration": 2.655075788497925 }, { "auxiliary_loss_clip": 0.011782, "auxiliary_loss_mlp": 0.01028956, "balance_loss_clip": 1.05032694, "balance_loss_mlp": 1.02223277, "epoch": 0.617206757650454, "flos": 18260361158400.0, "grad_norm": 2.0125856185551663, "language_loss": 0.72027028, "learning_rate": 1.349903118095355e-06, "loss": 0.74234188, "num_input_tokens_seen": 110574745, "step": 5133, "time_per_iteration": 2.636082887649536 }, { "auxiliary_loss_clip": 0.01233631, "auxiliary_loss_mlp": 0.01025224, "balance_loss_clip": 1.05049384, "balance_loss_mlp": 1.01786828, "epoch": 0.617327000541093, "flos": 18186959715840.0, "grad_norm": 2.0770400971114977, "language_loss": 0.73945928, "learning_rate": 1.349166495987298e-06, "loss": 0.76204783, "num_input_tokens_seen": 110593310, "step": 5134, "time_per_iteration": 2.6061668395996094 }, { "auxiliary_loss_clip": 0.01192672, "auxiliary_loss_mlp": 0.00998517, "balance_loss_clip": 1.02265882, "balance_loss_mlp": 0.99740809, "epoch": 0.6174472434317321, "flos": 61833796122240.0, "grad_norm": 0.8196722587977189, "language_loss": 0.60813844, "learning_rate": 1.348429972610166e-06, "loss": 0.6300503, "num_input_tokens_seen": 110657615, "step": 5135, "time_per_iteration": 3.3039610385894775 }, { "auxiliary_loss_clip": 0.01288837, "auxiliary_loss_mlp": 0.00999255, "balance_loss_clip": 1.02446675, "balance_loss_mlp": 0.99816382, "epoch": 0.6175674863223712, "flos": 71230970494080.0, "grad_norm": 0.8437007096204273, "language_loss": 0.57816678, "learning_rate": 1.3476935480756897e-06, "loss": 0.60104769, "num_input_tokens_seen": 110714365, "step": 5136, "time_per_iteration": 4.062406063079834 }, { "auxiliary_loss_clip": 0.01326372, "auxiliary_loss_mlp": 0.01025083, "balance_loss_clip": 1.04460621, "balance_loss_mlp": 1.0175786, "epoch": 0.6176877292130103, "flos": 21835447770240.0, "grad_norm": 2.3417702857954237, "language_loss": 0.7524969, "learning_rate": 1.346957222495583e-06, "loss": 0.77601147, "num_input_tokens_seen": 110732160, "step": 5137, "time_per_iteration": 2.724106788635254 }, { "auxiliary_loss_clip": 0.01183666, "auxiliary_loss_mlp": 0.02568968, "balance_loss_clip": 1.05090725, "balance_loss_mlp": 0.99993706, "epoch": 0.6178079721036493, "flos": 17741738638080.0, "grad_norm": 2.5238112744606442, "language_loss": 0.71018457, "learning_rate": 1.3462209959815466e-06, "loss": 0.74771094, "num_input_tokens_seen": 110746900, "step": 5138, "time_per_iteration": 2.682579755783081 }, { "auxiliary_loss_clip": 0.01279668, "auxiliary_loss_mlp": 0.01025857, "balance_loss_clip": 1.04887605, "balance_loss_mlp": 1.01882362, "epoch": 0.6179282149942885, "flos": 22633131052800.0, "grad_norm": 2.214197109132397, "language_loss": 0.74303365, "learning_rate": 1.345484868645265e-06, "loss": 0.76608884, "num_input_tokens_seen": 110765710, "step": 5139, "time_per_iteration": 2.707698106765747 }, { "auxiliary_loss_clip": 0.01232497, "auxiliary_loss_mlp": 0.01026158, "balance_loss_clip": 1.04474545, "balance_loss_mlp": 1.01869297, "epoch": 0.6180484578849276, "flos": 22310330503680.0, "grad_norm": 1.935779881278282, "language_loss": 0.78419697, "learning_rate": 1.3447488405984088e-06, "loss": 0.80678356, "num_input_tokens_seen": 110783970, "step": 5140, "time_per_iteration": 2.79646372795105 }, { "auxiliary_loss_clip": 0.01277792, "auxiliary_loss_mlp": 0.01025788, "balance_loss_clip": 1.04856277, "balance_loss_mlp": 1.01838207, "epoch": 0.6181687007755666, "flos": 35225458905600.0, "grad_norm": 2.5057163566772243, "language_loss": 0.70282161, "learning_rate": 1.3440129119526322e-06, "loss": 0.72585738, "num_input_tokens_seen": 110806395, "step": 5141, "time_per_iteration": 2.8213860988616943 }, { "auxiliary_loss_clip": 0.01065416, "auxiliary_loss_mlp": 0.01002901, "balance_loss_clip": 1.00838947, "balance_loss_mlp": 1.00191772, "epoch": 0.6182889436662057, "flos": 61547370094080.0, "grad_norm": 0.80676825491449, "language_loss": 0.51128531, "learning_rate": 1.3432770828195762e-06, "loss": 0.53196847, "num_input_tokens_seen": 110867380, "step": 5142, "time_per_iteration": 3.346045732498169 }, { "auxiliary_loss_clip": 0.01323207, "auxiliary_loss_mlp": 0.01027944, "balance_loss_clip": 1.04455066, "balance_loss_mlp": 1.02054143, "epoch": 0.6184091865568448, "flos": 19609991804160.0, "grad_norm": 2.465603132389759, "language_loss": 0.70284605, "learning_rate": 1.3425413533108635e-06, "loss": 0.72635758, "num_input_tokens_seen": 110885980, "step": 5143, "time_per_iteration": 2.7806894779205322 }, { "auxiliary_loss_clip": 0.0137812, "auxiliary_loss_mlp": 0.01029738, "balance_loss_clip": 1.04899263, "balance_loss_mlp": 1.02169752, "epoch": 0.6185294294474839, "flos": 23586882929280.0, "grad_norm": 3.5990455474269822, "language_loss": 0.70301932, "learning_rate": 1.341805723538105e-06, "loss": 0.72709787, "num_input_tokens_seen": 110906085, "step": 5144, "time_per_iteration": 2.7901458740234375 }, { "auxiliary_loss_clip": 0.01185658, "auxiliary_loss_mlp": 0.01028904, "balance_loss_clip": 1.04879022, "balance_loss_mlp": 1.02215934, "epoch": 0.618649672338123, "flos": 26762032535040.0, "grad_norm": 1.5945778671009538, "language_loss": 0.7723093, "learning_rate": 1.3410701936128948e-06, "loss": 0.79445493, "num_input_tokens_seen": 110928865, "step": 5145, "time_per_iteration": 2.756326675415039 }, { "auxiliary_loss_clip": 0.01228628, "auxiliary_loss_mlp": 0.01025341, "balance_loss_clip": 1.05052125, "balance_loss_mlp": 1.01820028, "epoch": 0.6187699152287621, "flos": 14456630522880.0, "grad_norm": 2.40433890243612, "language_loss": 0.85429478, "learning_rate": 1.340334763646812e-06, "loss": 0.87683445, "num_input_tokens_seen": 110943000, "step": 5146, "time_per_iteration": 2.615948438644409 }, { "auxiliary_loss_clip": 0.01181811, "auxiliary_loss_mlp": 0.01026074, "balance_loss_clip": 1.05045247, "balance_loss_mlp": 1.01834655, "epoch": 0.6188901581194012, "flos": 20084766796800.0, "grad_norm": 1.8257583671502058, "language_loss": 0.74144173, "learning_rate": 1.3395994337514218e-06, "loss": 0.7635206, "num_input_tokens_seen": 110963170, "step": 5147, "time_per_iteration": 2.673292636871338 }, { "auxiliary_loss_clip": 0.0122238, "auxiliary_loss_mlp": 0.0102744, "balance_loss_clip": 1.04573131, "balance_loss_mlp": 1.01998377, "epoch": 0.6190104010100402, "flos": 25700728360320.0, "grad_norm": 1.9065971433094664, "language_loss": 0.78674018, "learning_rate": 1.3388642040382725e-06, "loss": 0.80923843, "num_input_tokens_seen": 110983595, "step": 5148, "time_per_iteration": 2.710770845413208 }, { "auxiliary_loss_clip": 0.0133179, "auxiliary_loss_mlp": 0.0102733, "balance_loss_clip": 1.04314017, "balance_loss_mlp": 1.02051687, "epoch": 0.6191306439006794, "flos": 30442372974720.0, "grad_norm": 1.6437833153464387, "language_loss": 0.83941132, "learning_rate": 1.3381290746188975e-06, "loss": 0.86300254, "num_input_tokens_seen": 111002965, "step": 5149, "time_per_iteration": 2.810241222381592 }, { "auxiliary_loss_clip": 0.01230633, "auxiliary_loss_mlp": 0.01033657, "balance_loss_clip": 1.05348873, "balance_loss_mlp": 1.02665639, "epoch": 0.6192508867913185, "flos": 26685793918080.0, "grad_norm": 1.6715343022056415, "language_loss": 0.67448115, "learning_rate": 1.3373940456048152e-06, "loss": 0.69712406, "num_input_tokens_seen": 111022990, "step": 5150, "time_per_iteration": 2.7225418090820312 }, { "auxiliary_loss_clip": 0.01178223, "auxiliary_loss_mlp": 0.01026926, "balance_loss_clip": 1.05042207, "balance_loss_mlp": 1.01942778, "epoch": 0.6193711296819575, "flos": 36722036090880.0, "grad_norm": 1.6128307554999184, "language_loss": 0.59127796, "learning_rate": 1.3366591171075299e-06, "loss": 0.61332947, "num_input_tokens_seen": 111046495, "step": 5151, "time_per_iteration": 2.733884572982788 }, { "auxiliary_loss_clip": 0.01278503, "auxiliary_loss_mlp": 0.01028217, "balance_loss_clip": 1.05011201, "balance_loss_mlp": 1.02092481, "epoch": 0.6194913725725967, "flos": 25192556697600.0, "grad_norm": 1.9990743827208126, "language_loss": 0.91105294, "learning_rate": 1.335924289238529e-06, "loss": 0.93412024, "num_input_tokens_seen": 111065705, "step": 5152, "time_per_iteration": 2.7388012409210205 }, { "auxiliary_loss_clip": 0.01236512, "auxiliary_loss_mlp": 0.02568971, "balance_loss_clip": 1.05523515, "balance_loss_mlp": 0.99996525, "epoch": 0.6196116154632357, "flos": 21178821196800.0, "grad_norm": 1.6877600453122135, "language_loss": 0.76779288, "learning_rate": 1.3351895621092859e-06, "loss": 0.80584776, "num_input_tokens_seen": 111086050, "step": 5153, "time_per_iteration": 2.6688716411590576 }, { "auxiliary_loss_clip": 0.01524993, "auxiliary_loss_mlp": 0.01026513, "balance_loss_clip": 1.03589749, "balance_loss_mlp": 1.01842499, "epoch": 0.6197318583538748, "flos": 16253744803200.0, "grad_norm": 1.8058373888521724, "language_loss": 0.76798999, "learning_rate": 1.3344549358312567e-06, "loss": 0.79350507, "num_input_tokens_seen": 111104450, "step": 5154, "time_per_iteration": 4.180957794189453 }, { "auxiliary_loss_clip": 0.01231941, "auxiliary_loss_mlp": 0.01036059, "balance_loss_clip": 1.05124617, "balance_loss_mlp": 1.02851343, "epoch": 0.619852101244514, "flos": 24425612478720.0, "grad_norm": 2.6121029146967607, "language_loss": 0.78211188, "learning_rate": 1.3337204105158852e-06, "loss": 0.80479193, "num_input_tokens_seen": 111123320, "step": 5155, "time_per_iteration": 3.913201332092285 }, { "auxiliary_loss_clip": 0.0131982, "auxiliary_loss_mlp": 0.01032735, "balance_loss_clip": 1.03885245, "balance_loss_mlp": 1.0253861, "epoch": 0.619972344135153, "flos": 16727298733440.0, "grad_norm": 1.8835582428471784, "language_loss": 0.72564393, "learning_rate": 1.332985986274597e-06, "loss": 0.74916947, "num_input_tokens_seen": 111140950, "step": 5156, "time_per_iteration": 2.717623710632324 }, { "auxiliary_loss_clip": 0.01421813, "auxiliary_loss_mlp": 0.02565676, "balance_loss_clip": 1.0468595, "balance_loss_mlp": 0.99998248, "epoch": 0.6200925870257921, "flos": 12495190498560.0, "grad_norm": 2.0585769288183875, "language_loss": 0.75411177, "learning_rate": 1.3322516632188047e-06, "loss": 0.79398668, "num_input_tokens_seen": 111157845, "step": 5157, "time_per_iteration": 3.928401231765747 }, { "auxiliary_loss_clip": 0.01323846, "auxiliary_loss_mlp": 0.01023678, "balance_loss_clip": 1.04470432, "balance_loss_mlp": 1.01616192, "epoch": 0.6202128299164312, "flos": 26539350168960.0, "grad_norm": 3.890564585720792, "language_loss": 0.66819715, "learning_rate": 1.3315174414599045e-06, "loss": 0.69167244, "num_input_tokens_seen": 111179165, "step": 5158, "time_per_iteration": 2.7968170642852783 }, { "auxiliary_loss_clip": 0.01226453, "auxiliary_loss_mlp": 0.01034148, "balance_loss_clip": 1.04640496, "balance_loss_mlp": 1.02655995, "epoch": 0.6203330728070703, "flos": 18770508069120.0, "grad_norm": 1.7355635331986912, "language_loss": 0.75344729, "learning_rate": 1.3307833211092768e-06, "loss": 0.77605325, "num_input_tokens_seen": 111197830, "step": 5159, "time_per_iteration": 2.618452310562134 }, { "auxiliary_loss_clip": 0.01183503, "auxiliary_loss_mlp": 0.01028034, "balance_loss_clip": 1.05455685, "balance_loss_mlp": 1.0206579, "epoch": 0.6204533156977093, "flos": 20629782835200.0, "grad_norm": 1.7041583296043412, "language_loss": 0.75317687, "learning_rate": 1.3300493022782873e-06, "loss": 0.77529222, "num_input_tokens_seen": 111218400, "step": 5160, "time_per_iteration": 2.708855628967285 }, { "auxiliary_loss_clip": 0.01368004, "auxiliary_loss_mlp": 0.02572247, "balance_loss_clip": 1.04322314, "balance_loss_mlp": 0.99998617, "epoch": 0.6205735585883485, "flos": 17348050598400.0, "grad_norm": 2.887793092692339, "language_loss": 0.72628164, "learning_rate": 1.3293153850782855e-06, "loss": 0.76568413, "num_input_tokens_seen": 111236720, "step": 5161, "time_per_iteration": 2.709351062774658 }, { "auxiliary_loss_clip": 0.01321851, "auxiliary_loss_mlp": 0.0102822, "balance_loss_clip": 1.04454434, "balance_loss_mlp": 1.02023554, "epoch": 0.6206938014789876, "flos": 22965017742720.0, "grad_norm": 2.213024311017882, "language_loss": 0.71471488, "learning_rate": 1.3285815696206069e-06, "loss": 0.73821557, "num_input_tokens_seen": 111258265, "step": 5162, "time_per_iteration": 3.8434739112854004 }, { "auxiliary_loss_clip": 0.01329123, "auxiliary_loss_mlp": 0.01028307, "balance_loss_clip": 1.04506755, "balance_loss_mlp": 1.02085328, "epoch": 0.6208140443696266, "flos": 23983192661760.0, "grad_norm": 1.9361157871888601, "language_loss": 0.76934248, "learning_rate": 1.32784785601657e-06, "loss": 0.79291677, "num_input_tokens_seen": 111277675, "step": 5163, "time_per_iteration": 2.7412614822387695 }, { "auxiliary_loss_clip": 0.01282134, "auxiliary_loss_mlp": 0.01024814, "balance_loss_clip": 1.04841113, "balance_loss_mlp": 1.01748872, "epoch": 0.6209342872602658, "flos": 35077291303680.0, "grad_norm": 2.098074174440486, "language_loss": 0.7388038, "learning_rate": 1.3271142443774798e-06, "loss": 0.76187336, "num_input_tokens_seen": 111299910, "step": 5164, "time_per_iteration": 2.7375049591064453 }, { "auxiliary_loss_clip": 0.01273092, "auxiliary_loss_mlp": 0.01022584, "balance_loss_clip": 1.0490433, "balance_loss_mlp": 1.0155623, "epoch": 0.6210545301509048, "flos": 26979327861120.0, "grad_norm": 2.1665871642082313, "language_loss": 0.81863391, "learning_rate": 1.3263807348146228e-06, "loss": 0.84159064, "num_input_tokens_seen": 111319765, "step": 5165, "time_per_iteration": 2.6797335147857666 }, { "auxiliary_loss_clip": 0.01277095, "auxiliary_loss_mlp": 0.01025248, "balance_loss_clip": 1.0446341, "balance_loss_mlp": 1.0168736, "epoch": 0.6211747730415439, "flos": 33618240852480.0, "grad_norm": 1.8697945923739856, "language_loss": 0.73497301, "learning_rate": 1.3256473274392733e-06, "loss": 0.75799644, "num_input_tokens_seen": 111341110, "step": 5166, "time_per_iteration": 2.6757049560546875 }, { "auxiliary_loss_clip": 0.0118123, "auxiliary_loss_mlp": 0.01027196, "balance_loss_clip": 1.05261362, "balance_loss_mlp": 1.01923931, "epoch": 0.6212950159321831, "flos": 34167099646080.0, "grad_norm": 2.242564901148075, "language_loss": 0.70174891, "learning_rate": 1.3249140223626873e-06, "loss": 0.7238332, "num_input_tokens_seen": 111362730, "step": 5167, "time_per_iteration": 2.608325481414795 }, { "auxiliary_loss_clip": 0.01225788, "auxiliary_loss_mlp": 0.01024809, "balance_loss_clip": 1.05050254, "balance_loss_mlp": 1.01771951, "epoch": 0.6214152588228221, "flos": 27965758135680.0, "grad_norm": 2.3505204332417717, "language_loss": 0.7547614, "learning_rate": 1.3241808196961077e-06, "loss": 0.77726734, "num_input_tokens_seen": 111383855, "step": 5168, "time_per_iteration": 2.597623348236084 }, { "auxiliary_loss_clip": 0.01270181, "auxiliary_loss_mlp": 0.01028354, "balance_loss_clip": 1.04712796, "balance_loss_mlp": 1.02160692, "epoch": 0.6215355017134612, "flos": 20230204965120.0, "grad_norm": 1.8451329640585628, "language_loss": 0.70896196, "learning_rate": 1.3234477195507608e-06, "loss": 0.7319473, "num_input_tokens_seen": 111402685, "step": 5169, "time_per_iteration": 2.7126755714416504 }, { "auxiliary_loss_clip": 0.01327595, "auxiliary_loss_mlp": 0.01025741, "balance_loss_clip": 1.05054879, "balance_loss_mlp": 1.01820374, "epoch": 0.6216557446041003, "flos": 41428129219200.0, "grad_norm": 1.8530362377880827, "language_loss": 0.62816429, "learning_rate": 1.322714722037857e-06, "loss": 0.65169764, "num_input_tokens_seen": 111424130, "step": 5170, "time_per_iteration": 2.842797040939331 }, { "auxiliary_loss_clip": 0.01338388, "auxiliary_loss_mlp": 0.01028576, "balance_loss_clip": 1.04795551, "balance_loss_mlp": 1.02076733, "epoch": 0.6217759874947394, "flos": 27928770105600.0, "grad_norm": 3.30963765509893, "language_loss": 0.76940215, "learning_rate": 1.321981827268591e-06, "loss": 0.79307175, "num_input_tokens_seen": 111444785, "step": 5171, "time_per_iteration": 2.9384877681732178 }, { "auxiliary_loss_clip": 0.01280948, "auxiliary_loss_mlp": 0.01026844, "balance_loss_clip": 1.0473423, "balance_loss_mlp": 1.01934004, "epoch": 0.6218962303853784, "flos": 21765673601280.0, "grad_norm": 1.7103725717863467, "language_loss": 0.81408799, "learning_rate": 1.3212490353541426e-06, "loss": 0.83716583, "num_input_tokens_seen": 111467045, "step": 5172, "time_per_iteration": 2.7653162479400635 }, { "auxiliary_loss_clip": 0.01181833, "auxiliary_loss_mlp": 0.01026263, "balance_loss_clip": 1.05262268, "balance_loss_mlp": 1.01865411, "epoch": 0.6220164732760175, "flos": 21246260981760.0, "grad_norm": 2.0431840836157504, "language_loss": 0.80544585, "learning_rate": 1.3205163464056762e-06, "loss": 0.82752681, "num_input_tokens_seen": 111483650, "step": 5173, "time_per_iteration": 2.6027352809906006 }, { "auxiliary_loss_clip": 0.01226503, "auxiliary_loss_mlp": 0.01024947, "balance_loss_clip": 1.04924464, "balance_loss_mlp": 1.01813984, "epoch": 0.6221367161666567, "flos": 26136360506880.0, "grad_norm": 1.9340305325811804, "language_loss": 0.72704864, "learning_rate": 1.319783760534339e-06, "loss": 0.74956316, "num_input_tokens_seen": 111502895, "step": 5174, "time_per_iteration": 2.6610209941864014 }, { "auxiliary_loss_clip": 0.01230182, "auxiliary_loss_mlp": 0.01029371, "balance_loss_clip": 1.05012846, "balance_loss_mlp": 1.02186704, "epoch": 0.6222569590572957, "flos": 16284196558080.0, "grad_norm": 2.1270659508916143, "language_loss": 0.75668931, "learning_rate": 1.319051277851266e-06, "loss": 0.77928483, "num_input_tokens_seen": 111519180, "step": 5175, "time_per_iteration": 2.573702335357666 }, { "auxiliary_loss_clip": 0.01132172, "auxiliary_loss_mlp": 0.01028079, "balance_loss_clip": 1.05075049, "balance_loss_mlp": 1.02055109, "epoch": 0.6223772019479348, "flos": 18223840005120.0, "grad_norm": 1.8757151761914947, "language_loss": 0.84061658, "learning_rate": 1.3183188984675716e-06, "loss": 0.8622191, "num_input_tokens_seen": 111537545, "step": 5176, "time_per_iteration": 2.6659088134765625 }, { "auxiliary_loss_clip": 0.01277793, "auxiliary_loss_mlp": 0.01032056, "balance_loss_clip": 1.05070949, "balance_loss_mlp": 1.02477217, "epoch": 0.6224974448385739, "flos": 27489797994240.0, "grad_norm": 2.1480520963358565, "language_loss": 0.71681678, "learning_rate": 1.3175866224943586e-06, "loss": 0.73991531, "num_input_tokens_seen": 111556265, "step": 5177, "time_per_iteration": 2.676151752471924 }, { "auxiliary_loss_clip": 0.01284018, "auxiliary_loss_mlp": 0.01028273, "balance_loss_clip": 1.05092418, "balance_loss_mlp": 1.02076316, "epoch": 0.622617687729213, "flos": 19791951125760.0, "grad_norm": 2.2846404041403208, "language_loss": 0.73349935, "learning_rate": 1.316854450042712e-06, "loss": 0.7566222, "num_input_tokens_seen": 111574205, "step": 5178, "time_per_iteration": 2.6740732192993164 }, { "auxiliary_loss_clip": 0.01232915, "auxiliary_loss_mlp": 0.01031981, "balance_loss_clip": 1.05099905, "balance_loss_mlp": 1.02414894, "epoch": 0.622737930619852, "flos": 23038886062080.0, "grad_norm": 1.9291144280621508, "language_loss": 0.74416292, "learning_rate": 1.3161223812237024e-06, "loss": 0.76681191, "num_input_tokens_seen": 111593560, "step": 5179, "time_per_iteration": 3.5173895359039307 }, { "auxiliary_loss_clip": 0.01179974, "auxiliary_loss_mlp": 0.01030945, "balance_loss_clip": 1.04969859, "balance_loss_mlp": 1.02293694, "epoch": 0.6228581735104912, "flos": 12634271959680.0, "grad_norm": 2.89019033178809, "language_loss": 0.85476196, "learning_rate": 1.3153904161483842e-06, "loss": 0.87687123, "num_input_tokens_seen": 111608860, "step": 5180, "time_per_iteration": 2.5421395301818848 }, { "auxiliary_loss_clip": 0.01326616, "auxiliary_loss_mlp": 0.01024359, "balance_loss_clip": 1.0459609, "balance_loss_mlp": 1.01699781, "epoch": 0.6229784164011303, "flos": 23802813538560.0, "grad_norm": 2.288192458924868, "language_loss": 0.85684264, "learning_rate": 1.3146585549277953e-06, "loss": 0.88035238, "num_input_tokens_seen": 111627500, "step": 5181, "time_per_iteration": 3.587916851043701 }, { "auxiliary_loss_clip": 0.0119138, "auxiliary_loss_mlp": 0.01024439, "balance_loss_clip": 1.05265808, "balance_loss_mlp": 1.01730728, "epoch": 0.6230986592917693, "flos": 22414219614720.0, "grad_norm": 2.0657661511649277, "language_loss": 0.78731394, "learning_rate": 1.3139267976729591e-06, "loss": 0.8094722, "num_input_tokens_seen": 111647690, "step": 5182, "time_per_iteration": 2.6124725341796875 }, { "auxiliary_loss_clip": 0.01233559, "auxiliary_loss_mlp": 0.01024946, "balance_loss_clip": 1.05161548, "balance_loss_mlp": 1.0177846, "epoch": 0.6232189021824085, "flos": 34528217028480.0, "grad_norm": 1.9745758488708953, "language_loss": 0.7176975, "learning_rate": 1.3131951444948815e-06, "loss": 0.7402826, "num_input_tokens_seen": 111667090, "step": 5183, "time_per_iteration": 3.6561830043792725 }, { "auxiliary_loss_clip": 0.01285339, "auxiliary_loss_mlp": 0.01032713, "balance_loss_clip": 1.05320978, "balance_loss_mlp": 1.02491045, "epoch": 0.6233391450730476, "flos": 22237000888320.0, "grad_norm": 1.9103366897641774, "language_loss": 0.7642498, "learning_rate": 1.3124635955045546e-06, "loss": 0.78743035, "num_input_tokens_seen": 111686905, "step": 5184, "time_per_iteration": 2.7294771671295166 }, { "auxiliary_loss_clip": 0.0127125, "auxiliary_loss_mlp": 0.02567668, "balance_loss_clip": 1.04175615, "balance_loss_mlp": 0.99995434, "epoch": 0.6234593879636866, "flos": 20332693445760.0, "grad_norm": 1.969891007262231, "language_loss": 0.84271288, "learning_rate": 1.3117321508129537e-06, "loss": 0.88110209, "num_input_tokens_seen": 111704985, "step": 5185, "time_per_iteration": 2.764082431793213 }, { "auxiliary_loss_clip": 0.01281492, "auxiliary_loss_mlp": 0.01031103, "balance_loss_clip": 1.04946971, "balance_loss_mlp": 1.02371788, "epoch": 0.6235796308543258, "flos": 20664903358080.0, "grad_norm": 1.6974515740998122, "language_loss": 0.76268929, "learning_rate": 1.3110008105310388e-06, "loss": 0.78581524, "num_input_tokens_seen": 111724805, "step": 5186, "time_per_iteration": 2.709744930267334 }, { "auxiliary_loss_clip": 0.01183065, "auxiliary_loss_mlp": 0.01025044, "balance_loss_clip": 1.05122101, "balance_loss_mlp": 1.01740241, "epoch": 0.6236998737449648, "flos": 26618641441920.0, "grad_norm": 2.8923904763969612, "language_loss": 0.78076375, "learning_rate": 1.3102695747697526e-06, "loss": 0.80284476, "num_input_tokens_seen": 111747675, "step": 5187, "time_per_iteration": 2.670607566833496 }, { "auxiliary_loss_clip": 0.01427667, "auxiliary_loss_mlp": 0.01028597, "balance_loss_clip": 1.04726493, "balance_loss_mlp": 1.02039492, "epoch": 0.6238201166356039, "flos": 12674599954560.0, "grad_norm": 2.205005947132641, "language_loss": 0.90754461, "learning_rate": 1.3095384436400237e-06, "loss": 0.93210721, "num_input_tokens_seen": 111759205, "step": 5188, "time_per_iteration": 3.8011813163757324 }, { "auxiliary_loss_clip": 0.01187387, "auxiliary_loss_mlp": 0.01028554, "balance_loss_clip": 1.04876733, "balance_loss_mlp": 1.02080512, "epoch": 0.623940359526243, "flos": 10452160730880.0, "grad_norm": 2.1058535734879684, "language_loss": 0.82734048, "learning_rate": 1.3088074172527633e-06, "loss": 0.84949994, "num_input_tokens_seen": 111776335, "step": 5189, "time_per_iteration": 2.6194944381713867 }, { "auxiliary_loss_clip": 0.01284781, "auxiliary_loss_mlp": 0.01032071, "balance_loss_clip": 1.04957616, "balance_loss_mlp": 1.02432227, "epoch": 0.6240606024168821, "flos": 29059525226880.0, "grad_norm": 2.221692202759293, "language_loss": 0.7168895, "learning_rate": 1.3080764957188684e-06, "loss": 0.74005806, "num_input_tokens_seen": 111796580, "step": 5190, "time_per_iteration": 2.7706222534179688 }, { "auxiliary_loss_clip": 0.012857, "auxiliary_loss_mlp": 0.01030169, "balance_loss_clip": 1.04545879, "balance_loss_mlp": 1.0230819, "epoch": 0.6241808453075212, "flos": 22018089450240.0, "grad_norm": 1.8103341700646838, "language_loss": 0.70599806, "learning_rate": 1.3073456791492192e-06, "loss": 0.72915673, "num_input_tokens_seen": 111816290, "step": 5191, "time_per_iteration": 2.7717013359069824 }, { "auxiliary_loss_clip": 0.01281926, "auxiliary_loss_mlp": 0.01030474, "balance_loss_clip": 1.0480082, "balance_loss_mlp": 1.02340245, "epoch": 0.6243010881981603, "flos": 21138708683520.0, "grad_norm": 1.842808891261161, "language_loss": 0.78428793, "learning_rate": 1.3066149676546801e-06, "loss": 0.80741197, "num_input_tokens_seen": 111834470, "step": 5192, "time_per_iteration": 2.660034656524658 }, { "auxiliary_loss_clip": 0.01284699, "auxiliary_loss_mlp": 0.01024654, "balance_loss_clip": 1.0551666, "balance_loss_mlp": 1.01751888, "epoch": 0.6244213310887994, "flos": 22344948236160.0, "grad_norm": 7.61983536006757, "language_loss": 0.66427243, "learning_rate": 1.3058843613460985e-06, "loss": 0.68736595, "num_input_tokens_seen": 111852410, "step": 5193, "time_per_iteration": 2.723572015762329 }, { "auxiliary_loss_clip": 0.01239102, "auxiliary_loss_mlp": 0.01028982, "balance_loss_clip": 1.0494467, "balance_loss_mlp": 1.02162659, "epoch": 0.6245415739794384, "flos": 15231978524160.0, "grad_norm": 2.334462620099171, "language_loss": 0.74450731, "learning_rate": 1.3051538603343075e-06, "loss": 0.76718819, "num_input_tokens_seen": 111870340, "step": 5194, "time_per_iteration": 2.7307798862457275 }, { "auxiliary_loss_clip": 0.01227499, "auxiliary_loss_mlp": 0.01026477, "balance_loss_clip": 1.05135751, "balance_loss_mlp": 1.01923466, "epoch": 0.6246618168700776, "flos": 18879891960960.0, "grad_norm": 2.0457238825788147, "language_loss": 0.67678607, "learning_rate": 1.3044234647301235e-06, "loss": 0.69932586, "num_input_tokens_seen": 111888365, "step": 5195, "time_per_iteration": 2.6403303146362305 }, { "auxiliary_loss_clip": 0.01225466, "auxiliary_loss_mlp": 0.01029998, "balance_loss_clip": 1.05149782, "balance_loss_mlp": 1.02255893, "epoch": 0.6247820597607167, "flos": 14319201087360.0, "grad_norm": 1.7156315443861265, "language_loss": 0.72776508, "learning_rate": 1.303693174644347e-06, "loss": 0.75031972, "num_input_tokens_seen": 111905840, "step": 5196, "time_per_iteration": 2.6161599159240723 }, { "auxiliary_loss_clip": 0.01272262, "auxiliary_loss_mlp": 0.0102456, "balance_loss_clip": 1.04532743, "balance_loss_mlp": 1.01690054, "epoch": 0.6249023026513557, "flos": 22637979388800.0, "grad_norm": 2.029532668543354, "language_loss": 0.80717981, "learning_rate": 1.3029629901877625e-06, "loss": 0.83014798, "num_input_tokens_seen": 111925215, "step": 5197, "time_per_iteration": 2.7429633140563965 }, { "auxiliary_loss_clip": 0.01137751, "auxiliary_loss_mlp": 0.01029809, "balance_loss_clip": 1.0507803, "balance_loss_mlp": 1.02282667, "epoch": 0.6250225455419949, "flos": 20266690204800.0, "grad_norm": 4.249011547311132, "language_loss": 0.77396667, "learning_rate": 1.3022329114711376e-06, "loss": 0.79564226, "num_input_tokens_seen": 111943925, "step": 5198, "time_per_iteration": 2.6942427158355713 }, { "auxiliary_loss_clip": 0.01276326, "auxiliary_loss_mlp": 0.01028715, "balance_loss_clip": 1.04841995, "balance_loss_mlp": 1.02135921, "epoch": 0.6251427884326339, "flos": 23437853400960.0, "grad_norm": 1.868194934020265, "language_loss": 0.6968022, "learning_rate": 1.3015029386052256e-06, "loss": 0.71985269, "num_input_tokens_seen": 111964095, "step": 5199, "time_per_iteration": 2.838588237762451 }, { "auxiliary_loss_clip": 0.01242112, "auxiliary_loss_mlp": 0.01029085, "balance_loss_clip": 1.05128682, "balance_loss_mlp": 1.02180719, "epoch": 0.625263031323273, "flos": 31723055464320.0, "grad_norm": 2.6019472876714422, "language_loss": 0.72895747, "learning_rate": 1.3007730717007622e-06, "loss": 0.75166947, "num_input_tokens_seen": 111984910, "step": 5200, "time_per_iteration": 2.7884914875030518 }, { "auxiliary_loss_clip": 0.01186305, "auxiliary_loss_mlp": 0.01029607, "balance_loss_clip": 1.05355215, "balance_loss_mlp": 1.02130973, "epoch": 0.6253832742139122, "flos": 24134341092480.0, "grad_norm": 1.982189716871274, "language_loss": 0.75760674, "learning_rate": 1.3000433108684676e-06, "loss": 0.77976584, "num_input_tokens_seen": 112005410, "step": 5201, "time_per_iteration": 2.666295051574707 }, { "auxiliary_loss_clip": 0.01227432, "auxiliary_loss_mlp": 0.01027327, "balance_loss_clip": 1.05048203, "balance_loss_mlp": 1.01982892, "epoch": 0.6255035171045512, "flos": 27668812400640.0, "grad_norm": 2.371057201589831, "language_loss": 0.80644453, "learning_rate": 1.2993136562190467e-06, "loss": 0.82899213, "num_input_tokens_seen": 112024530, "step": 5202, "time_per_iteration": 2.6810905933380127 }, { "auxiliary_loss_clip": 0.01280967, "auxiliary_loss_mlp": 0.010269, "balance_loss_clip": 1.04903197, "balance_loss_mlp": 1.01965809, "epoch": 0.6256237599951903, "flos": 20227798753920.0, "grad_norm": 1.6107068382200134, "language_loss": 0.70357275, "learning_rate": 1.2985841078631871e-06, "loss": 0.72665143, "num_input_tokens_seen": 112043850, "step": 5203, "time_per_iteration": 2.7017014026641846 }, { "auxiliary_loss_clip": 0.01426394, "auxiliary_loss_mlp": 0.01024183, "balance_loss_clip": 1.04036295, "balance_loss_mlp": 1.01657176, "epoch": 0.6257440028858293, "flos": 24170574936960.0, "grad_norm": 1.6946631554952534, "language_loss": 0.781528, "learning_rate": 1.2978546659115608e-06, "loss": 0.80603373, "num_input_tokens_seen": 112061930, "step": 5204, "time_per_iteration": 2.8244731426239014 }, { "auxiliary_loss_clip": 0.01282019, "auxiliary_loss_mlp": 0.01027633, "balance_loss_clip": 1.04928863, "balance_loss_mlp": 1.02057004, "epoch": 0.6258642457764685, "flos": 15851940289920.0, "grad_norm": 1.8291364951069515, "language_loss": 0.84975839, "learning_rate": 1.2971253304748228e-06, "loss": 0.87285489, "num_input_tokens_seen": 112079645, "step": 5205, "time_per_iteration": 3.6474146842956543 }, { "auxiliary_loss_clip": 0.01233389, "auxiliary_loss_mlp": 0.01031344, "balance_loss_clip": 1.05276132, "balance_loss_mlp": 1.02292776, "epoch": 0.6259844886671075, "flos": 11911354836480.0, "grad_norm": 1.9337071688269172, "language_loss": 0.75007188, "learning_rate": 1.296396101663614e-06, "loss": 0.77271926, "num_input_tokens_seen": 112096205, "step": 5206, "time_per_iteration": 2.65181303024292 }, { "auxiliary_loss_clip": 0.01234778, "auxiliary_loss_mlp": 0.01021093, "balance_loss_clip": 1.05160522, "balance_loss_mlp": 1.01422358, "epoch": 0.6261047315577466, "flos": 15887958652800.0, "grad_norm": 6.066796636192211, "language_loss": 0.84447604, "learning_rate": 1.2956669795885565e-06, "loss": 0.86703479, "num_input_tokens_seen": 112112835, "step": 5207, "time_per_iteration": 3.560659646987915 }, { "auxiliary_loss_clip": 0.01322944, "auxiliary_loss_mlp": 0.01030105, "balance_loss_clip": 1.04858685, "balance_loss_mlp": 1.02283287, "epoch": 0.6262249744483858, "flos": 31248926916480.0, "grad_norm": 1.6792332758772577, "language_loss": 0.68574381, "learning_rate": 1.294937964360259e-06, "loss": 0.70927429, "num_input_tokens_seen": 112133105, "step": 5208, "time_per_iteration": 3.7277519702911377 }, { "auxiliary_loss_clip": 0.01185757, "auxiliary_loss_mlp": 0.01030481, "balance_loss_clip": 1.04892087, "balance_loss_mlp": 1.0222733, "epoch": 0.6263452173390248, "flos": 27198598435200.0, "grad_norm": 2.302988849097869, "language_loss": 0.71555918, "learning_rate": 1.2942090560893108e-06, "loss": 0.73772156, "num_input_tokens_seen": 112152510, "step": 5209, "time_per_iteration": 2.7351410388946533 }, { "auxiliary_loss_clip": 0.01179645, "auxiliary_loss_mlp": 0.0102425, "balance_loss_clip": 1.05172396, "balance_loss_mlp": 1.01753545, "epoch": 0.6264654602296639, "flos": 37342069683840.0, "grad_norm": 2.002068888090633, "language_loss": 0.60676467, "learning_rate": 1.2934802548862882e-06, "loss": 0.62880361, "num_input_tokens_seen": 112175295, "step": 5210, "time_per_iteration": 2.705451726913452 }, { "auxiliary_loss_clip": 0.01276181, "auxiliary_loss_mlp": 0.01025531, "balance_loss_clip": 1.04669738, "balance_loss_mlp": 1.01819336, "epoch": 0.626585703120303, "flos": 14756952136320.0, "grad_norm": 1.9284353282893616, "language_loss": 0.82640642, "learning_rate": 1.292751560861749e-06, "loss": 0.84942359, "num_input_tokens_seen": 112190200, "step": 5211, "time_per_iteration": 2.714510440826416 }, { "auxiliary_loss_clip": 0.01184253, "auxiliary_loss_mlp": 0.01027813, "balance_loss_clip": 1.05230856, "balance_loss_mlp": 1.02032375, "epoch": 0.6267059460109421, "flos": 22347318533760.0, "grad_norm": 1.8368973995079338, "language_loss": 0.79722935, "learning_rate": 1.2920229741262354e-06, "loss": 0.81935, "num_input_tokens_seen": 112208205, "step": 5212, "time_per_iteration": 2.598170757293701 }, { "auxiliary_loss_clip": 0.01278148, "auxiliary_loss_mlp": 0.0102821, "balance_loss_clip": 1.04873872, "balance_loss_mlp": 1.02066445, "epoch": 0.6268261889015811, "flos": 17748813617280.0, "grad_norm": 2.0939179913414883, "language_loss": 0.7512297, "learning_rate": 1.2912944947902739e-06, "loss": 0.7742933, "num_input_tokens_seen": 112224690, "step": 5213, "time_per_iteration": 3.597698926925659 }, { "auxiliary_loss_clip": 0.01286469, "auxiliary_loss_mlp": 0.01027159, "balance_loss_clip": 1.04905844, "balance_loss_mlp": 1.01929116, "epoch": 0.6269464317922203, "flos": 32846484211200.0, "grad_norm": 2.5395770104276134, "language_loss": 0.71853673, "learning_rate": 1.2905661229643742e-06, "loss": 0.74167299, "num_input_tokens_seen": 112244450, "step": 5214, "time_per_iteration": 2.8033905029296875 }, { "auxiliary_loss_clip": 0.01179926, "auxiliary_loss_mlp": 0.01025233, "balance_loss_clip": 1.05004597, "balance_loss_mlp": 1.01764488, "epoch": 0.6270666746828594, "flos": 17929192740480.0, "grad_norm": 2.4503313023663167, "language_loss": 0.83980465, "learning_rate": 1.2898378587590299e-06, "loss": 0.86185622, "num_input_tokens_seen": 112261050, "step": 5215, "time_per_iteration": 2.5678577423095703 }, { "auxiliary_loss_clip": 0.01229179, "auxiliary_loss_mlp": 0.010274, "balance_loss_clip": 1.05122018, "balance_loss_mlp": 1.02010477, "epoch": 0.6271869175734984, "flos": 17457326749440.0, "grad_norm": 2.0411573757529347, "language_loss": 0.87129951, "learning_rate": 1.2891097022847173e-06, "loss": 0.89386523, "num_input_tokens_seen": 112278395, "step": 5216, "time_per_iteration": 2.7097458839416504 }, { "auxiliary_loss_clip": 0.01276205, "auxiliary_loss_mlp": 0.01031115, "balance_loss_clip": 1.04763198, "balance_loss_mlp": 1.0236882, "epoch": 0.6273071604641376, "flos": 26868615166080.0, "grad_norm": 1.9269455583442034, "language_loss": 0.66565782, "learning_rate": 1.2883816536518978e-06, "loss": 0.68873101, "num_input_tokens_seen": 112299535, "step": 5217, "time_per_iteration": 2.7054436206817627 }, { "auxiliary_loss_clip": 0.01224365, "auxiliary_loss_mlp": 0.0102505, "balance_loss_clip": 1.04935503, "balance_loss_mlp": 1.01791811, "epoch": 0.6274274033547766, "flos": 26062384446720.0, "grad_norm": 2.2171834010980986, "language_loss": 0.81656194, "learning_rate": 1.2876537129710155e-06, "loss": 0.83905613, "num_input_tokens_seen": 112317265, "step": 5218, "time_per_iteration": 2.672982931137085 }, { "auxiliary_loss_clip": 0.01284211, "auxiliary_loss_mlp": 0.01030949, "balance_loss_clip": 1.05570459, "balance_loss_mlp": 1.02259851, "epoch": 0.6275476462454157, "flos": 20266259241600.0, "grad_norm": 3.78961055908765, "language_loss": 0.75605667, "learning_rate": 1.286925880352499e-06, "loss": 0.77920824, "num_input_tokens_seen": 112336125, "step": 5219, "time_per_iteration": 2.7014381885528564 }, { "auxiliary_loss_clip": 0.01282454, "auxiliary_loss_mlp": 0.01028451, "balance_loss_clip": 1.05009139, "balance_loss_mlp": 1.02093506, "epoch": 0.6276678891360549, "flos": 26320402817280.0, "grad_norm": 1.7785320595914071, "language_loss": 0.71172941, "learning_rate": 1.2861981559067592e-06, "loss": 0.73483849, "num_input_tokens_seen": 112356730, "step": 5220, "time_per_iteration": 2.7380990982055664 }, { "auxiliary_loss_clip": 0.01373036, "auxiliary_loss_mlp": 0.01024402, "balance_loss_clip": 1.04599416, "balance_loss_mlp": 1.01708889, "epoch": 0.6277881320266939, "flos": 13912512324480.0, "grad_norm": 1.9203878163776462, "language_loss": 0.80535662, "learning_rate": 1.2854705397441917e-06, "loss": 0.82933098, "num_input_tokens_seen": 112372270, "step": 5221, "time_per_iteration": 2.7552683353424072 }, { "auxiliary_loss_clip": 0.01321501, "auxiliary_loss_mlp": 0.01029642, "balance_loss_clip": 1.04384589, "balance_loss_mlp": 1.02182829, "epoch": 0.627908374917333, "flos": 27048922462080.0, "grad_norm": 2.2601976505738857, "language_loss": 0.77521902, "learning_rate": 1.2847430319751747e-06, "loss": 0.79873043, "num_input_tokens_seen": 112390365, "step": 5222, "time_per_iteration": 2.729901075363159 }, { "auxiliary_loss_clip": 0.01226782, "auxiliary_loss_mlp": 0.01030752, "balance_loss_clip": 1.051512, "balance_loss_mlp": 1.02342689, "epoch": 0.6280286178079721, "flos": 23769201386880.0, "grad_norm": 3.4046659177686522, "language_loss": 0.67519516, "learning_rate": 1.2840156327100712e-06, "loss": 0.69777054, "num_input_tokens_seen": 112407490, "step": 5223, "time_per_iteration": 2.6693599224090576 }, { "auxiliary_loss_clip": 0.01180726, "auxiliary_loss_mlp": 0.01026641, "balance_loss_clip": 1.05274868, "balance_loss_mlp": 1.01902986, "epoch": 0.6281488606986112, "flos": 26359150613760.0, "grad_norm": 1.7129232632631066, "language_loss": 0.72301424, "learning_rate": 1.2832883420592272e-06, "loss": 0.74508786, "num_input_tokens_seen": 112426385, "step": 5224, "time_per_iteration": 2.6220614910125732 }, { "auxiliary_loss_clip": 0.01276586, "auxiliary_loss_mlp": 0.01030435, "balance_loss_clip": 1.04826832, "balance_loss_mlp": 1.02274942, "epoch": 0.6282691035892503, "flos": 36137194848000.0, "grad_norm": 2.1736297850715633, "language_loss": 0.6416012, "learning_rate": 1.282561160132972e-06, "loss": 0.66467142, "num_input_tokens_seen": 112446905, "step": 5225, "time_per_iteration": 2.857266664505005 }, { "auxiliary_loss_clip": 0.01284876, "auxiliary_loss_mlp": 0.01029333, "balance_loss_clip": 1.04609144, "balance_loss_mlp": 1.02181077, "epoch": 0.6283893464798894, "flos": 26537231266560.0, "grad_norm": 2.444198818965886, "language_loss": 0.80891258, "learning_rate": 1.2818340870416186e-06, "loss": 0.83205473, "num_input_tokens_seen": 112468040, "step": 5226, "time_per_iteration": 2.72715163230896 }, { "auxiliary_loss_clip": 0.01337256, "auxiliary_loss_mlp": 0.01027544, "balance_loss_clip": 1.04593253, "balance_loss_mlp": 1.01992095, "epoch": 0.6285095893705285, "flos": 22237216369920.0, "grad_norm": 4.409693665180063, "language_loss": 0.75872684, "learning_rate": 1.2811071228954626e-06, "loss": 0.78237486, "num_input_tokens_seen": 112486675, "step": 5227, "time_per_iteration": 2.768714666366577 }, { "auxiliary_loss_clip": 0.01278262, "auxiliary_loss_mlp": 0.0102567, "balance_loss_clip": 1.04979944, "balance_loss_mlp": 1.01852012, "epoch": 0.6286298322611675, "flos": 26542259170560.0, "grad_norm": 2.0823425975974876, "language_loss": 0.80741644, "learning_rate": 1.2803802678047846e-06, "loss": 0.83045572, "num_input_tokens_seen": 112506825, "step": 5228, "time_per_iteration": 2.698317050933838 }, { "auxiliary_loss_clip": 0.01285698, "auxiliary_loss_mlp": 0.01032797, "balance_loss_clip": 1.0524826, "balance_loss_mlp": 1.02371299, "epoch": 0.6287500751518067, "flos": 21795227516160.0, "grad_norm": 1.7463572881287777, "language_loss": 0.74092978, "learning_rate": 1.279653521879848e-06, "loss": 0.76411474, "num_input_tokens_seen": 112526890, "step": 5229, "time_per_iteration": 2.6969852447509766 }, { "auxiliary_loss_clip": 0.01464817, "auxiliary_loss_mlp": 0.01027109, "balance_loss_clip": 1.03931236, "balance_loss_mlp": 1.01981306, "epoch": 0.6288703180424458, "flos": 20009605587840.0, "grad_norm": 2.20501053662743, "language_loss": 0.83993769, "learning_rate": 1.2789268852308997e-06, "loss": 0.86485696, "num_input_tokens_seen": 112542100, "step": 5230, "time_per_iteration": 2.7914962768554688 }, { "auxiliary_loss_clip": 0.01226974, "auxiliary_loss_mlp": 0.01033333, "balance_loss_clip": 1.04956698, "balance_loss_mlp": 1.02590632, "epoch": 0.6289905609330848, "flos": 22124923476480.0, "grad_norm": 2.021888885453244, "language_loss": 0.70989525, "learning_rate": 1.2782003579681688e-06, "loss": 0.73249829, "num_input_tokens_seen": 112561630, "step": 5231, "time_per_iteration": 3.607811450958252 }, { "auxiliary_loss_clip": 0.01183896, "auxiliary_loss_mlp": 0.01028841, "balance_loss_clip": 1.0522325, "balance_loss_mlp": 1.02156937, "epoch": 0.629110803823724, "flos": 25518481729920.0, "grad_norm": 1.8426764453027429, "language_loss": 0.7416178, "learning_rate": 1.2774739402018701e-06, "loss": 0.76374513, "num_input_tokens_seen": 112582465, "step": 5232, "time_per_iteration": 2.700361490249634 }, { "auxiliary_loss_clip": 0.0123854, "auxiliary_loss_mlp": 0.01028457, "balance_loss_clip": 1.05767632, "balance_loss_mlp": 1.02052402, "epoch": 0.629231046714363, "flos": 20886616056960.0, "grad_norm": 1.7823799680677446, "language_loss": 0.73080593, "learning_rate": 1.2767476320422002e-06, "loss": 0.7534759, "num_input_tokens_seen": 112602390, "step": 5233, "time_per_iteration": 3.55440092086792 }, { "auxiliary_loss_clip": 0.01234997, "auxiliary_loss_mlp": 0.01001097, "balance_loss_clip": 1.01045299, "balance_loss_mlp": 1.00007808, "epoch": 0.6293512896050021, "flos": 65050027908480.0, "grad_norm": 0.6837399785350393, "language_loss": 0.57146657, "learning_rate": 1.2760214335993392e-06, "loss": 0.59382749, "num_input_tokens_seen": 112669035, "step": 5234, "time_per_iteration": 3.35624098777771 }, { "auxiliary_loss_clip": 0.01227738, "auxiliary_loss_mlp": 0.01025654, "balance_loss_clip": 1.04887044, "balance_loss_mlp": 1.01877594, "epoch": 0.6294715324956413, "flos": 34677857088000.0, "grad_norm": 2.1483736888928004, "language_loss": 0.58242559, "learning_rate": 1.2752953449834514e-06, "loss": 0.60495949, "num_input_tokens_seen": 112691485, "step": 5235, "time_per_iteration": 3.6812658309936523 }, { "auxiliary_loss_clip": 0.01181427, "auxiliary_loss_mlp": 0.01026593, "balance_loss_clip": 1.052356, "balance_loss_mlp": 1.01957774, "epoch": 0.6295917753862803, "flos": 22784207656320.0, "grad_norm": 2.216772044019813, "language_loss": 0.80476153, "learning_rate": 1.2745693663046836e-06, "loss": 0.82684171, "num_input_tokens_seen": 112710555, "step": 5236, "time_per_iteration": 2.626579999923706 }, { "auxiliary_loss_clip": 0.01224565, "auxiliary_loss_mlp": 0.01024977, "balance_loss_clip": 1.05017364, "balance_loss_mlp": 1.01774669, "epoch": 0.6297120182769194, "flos": 20850454039680.0, "grad_norm": 2.6793175924225854, "language_loss": 0.81050617, "learning_rate": 1.2738434976731662e-06, "loss": 0.83300161, "num_input_tokens_seen": 112728740, "step": 5237, "time_per_iteration": 2.6783668994903564 }, { "auxiliary_loss_clip": 0.01278971, "auxiliary_loss_mlp": 0.01028019, "balance_loss_clip": 1.04943061, "balance_loss_mlp": 1.02059484, "epoch": 0.6298322611675584, "flos": 19497662997120.0, "grad_norm": 3.8363346495975077, "language_loss": 0.75039554, "learning_rate": 1.2731177391990125e-06, "loss": 0.77346539, "num_input_tokens_seen": 112748665, "step": 5238, "time_per_iteration": 2.6868369579315186 }, { "auxiliary_loss_clip": 0.01279937, "auxiliary_loss_mlp": 0.01025604, "balance_loss_clip": 1.04714429, "balance_loss_mlp": 1.01817465, "epoch": 0.6299525040581976, "flos": 12604466649600.0, "grad_norm": 1.9173752880746324, "language_loss": 0.81794739, "learning_rate": 1.2723920909923203e-06, "loss": 0.84100282, "num_input_tokens_seen": 112764410, "step": 5239, "time_per_iteration": 2.6843395233154297 }, { "auxiliary_loss_clip": 0.01067986, "auxiliary_loss_mlp": 0.01006305, "balance_loss_clip": 1.0113008, "balance_loss_mlp": 1.0052979, "epoch": 0.6300727469488366, "flos": 57725685636480.0, "grad_norm": 1.0004861188553915, "language_loss": 0.60396427, "learning_rate": 1.2716665531631688e-06, "loss": 0.62470716, "num_input_tokens_seen": 112818695, "step": 5240, "time_per_iteration": 3.974187135696411 }, { "auxiliary_loss_clip": 0.0113416, "auxiliary_loss_mlp": 0.01033068, "balance_loss_clip": 1.04911494, "balance_loss_mlp": 1.02533185, "epoch": 0.6301929898394757, "flos": 22527302607360.0, "grad_norm": 2.4079822232954613, "language_loss": 0.77429599, "learning_rate": 1.270941125821623e-06, "loss": 0.79596823, "num_input_tokens_seen": 112839120, "step": 5241, "time_per_iteration": 2.70611310005188 }, { "auxiliary_loss_clip": 0.01227386, "auxiliary_loss_mlp": 0.01025736, "balance_loss_clip": 1.04767108, "balance_loss_mlp": 1.01900923, "epoch": 0.6303132327301149, "flos": 28293550675200.0, "grad_norm": 1.818001164859896, "language_loss": 0.75621247, "learning_rate": 1.2702158090777278e-06, "loss": 0.77874368, "num_input_tokens_seen": 112860210, "step": 5242, "time_per_iteration": 2.7473702430725098 }, { "auxiliary_loss_clip": 0.01326662, "auxiliary_loss_mlp": 0.01032447, "balance_loss_clip": 1.04576719, "balance_loss_mlp": 1.02548838, "epoch": 0.6304334756207539, "flos": 25264521596160.0, "grad_norm": 2.813112057760493, "language_loss": 0.7515735, "learning_rate": 1.2694906030415148e-06, "loss": 0.7751646, "num_input_tokens_seen": 112877955, "step": 5243, "time_per_iteration": 2.7301151752471924 }, { "auxiliary_loss_clip": 0.01285846, "auxiliary_loss_mlp": 0.01029922, "balance_loss_clip": 1.04724777, "balance_loss_mlp": 1.0221796, "epoch": 0.630553718511393, "flos": 18033548728320.0, "grad_norm": 3.4664413119164865, "language_loss": 0.82258189, "learning_rate": 1.2687655078229958e-06, "loss": 0.8457396, "num_input_tokens_seen": 112892285, "step": 5244, "time_per_iteration": 2.717881679534912 }, { "auxiliary_loss_clip": 0.01277244, "auxiliary_loss_mlp": 0.01025758, "balance_loss_clip": 1.05146706, "balance_loss_mlp": 1.01854849, "epoch": 0.6306739614020321, "flos": 27304103658240.0, "grad_norm": 2.8456645586947897, "language_loss": 0.69128603, "learning_rate": 1.2680405235321678e-06, "loss": 0.71431607, "num_input_tokens_seen": 112913620, "step": 5245, "time_per_iteration": 2.7474567890167236 }, { "auxiliary_loss_clip": 0.01286453, "auxiliary_loss_mlp": 0.02570352, "balance_loss_clip": 1.05560398, "balance_loss_mlp": 0.99988598, "epoch": 0.6307942042926712, "flos": 15341434243200.0, "grad_norm": 2.0632607644560816, "language_loss": 0.78449005, "learning_rate": 1.267315650279011e-06, "loss": 0.82305807, "num_input_tokens_seen": 112932090, "step": 5246, "time_per_iteration": 2.6856634616851807 }, { "auxiliary_loss_clip": 0.01321156, "auxiliary_loss_mlp": 0.01031118, "balance_loss_clip": 1.04765201, "balance_loss_mlp": 1.02315438, "epoch": 0.6309144471833102, "flos": 19606400444160.0, "grad_norm": 1.9669207060561098, "language_loss": 0.73973566, "learning_rate": 1.2665908881734874e-06, "loss": 0.76325834, "num_input_tokens_seen": 112950925, "step": 5247, "time_per_iteration": 2.7034502029418945 }, { "auxiliary_loss_clip": 0.01227234, "auxiliary_loss_mlp": 0.010256, "balance_loss_clip": 1.04886651, "balance_loss_mlp": 1.01860225, "epoch": 0.6310346900739494, "flos": 17493345112320.0, "grad_norm": 2.255887720296965, "language_loss": 0.85421461, "learning_rate": 1.2658662373255432e-06, "loss": 0.87674296, "num_input_tokens_seen": 112969315, "step": 5248, "time_per_iteration": 2.6199700832366943 }, { "auxiliary_loss_clip": 0.0117256, "auxiliary_loss_mlp": 0.01006581, "balance_loss_clip": 1.01031876, "balance_loss_mlp": 1.00560975, "epoch": 0.6311549329645885, "flos": 55070164131840.0, "grad_norm": 0.7117239665389015, "language_loss": 0.52194118, "learning_rate": 1.2651416978451063e-06, "loss": 0.54373258, "num_input_tokens_seen": 113034700, "step": 5249, "time_per_iteration": 3.3058671951293945 }, { "auxiliary_loss_clip": 0.01186323, "auxiliary_loss_mlp": 0.01026321, "balance_loss_clip": 1.05386317, "balance_loss_mlp": 1.01845372, "epoch": 0.6312751758552275, "flos": 41902545075840.0, "grad_norm": 2.5629892148116684, "language_loss": 0.65270674, "learning_rate": 1.2644172698420903e-06, "loss": 0.67483318, "num_input_tokens_seen": 113056805, "step": 5250, "time_per_iteration": 2.7889795303344727 }, { "auxiliary_loss_clip": 0.01328652, "auxiliary_loss_mlp": 0.01034586, "balance_loss_clip": 1.04693031, "balance_loss_mlp": 1.02722144, "epoch": 0.6313954187458667, "flos": 19646800266240.0, "grad_norm": 1.9926496973763286, "language_loss": 0.85196859, "learning_rate": 1.2636929534263892e-06, "loss": 0.87560093, "num_input_tokens_seen": 113075790, "step": 5251, "time_per_iteration": 2.7114148139953613 }, { "auxiliary_loss_clip": 0.01331079, "auxiliary_loss_mlp": 0.0102676, "balance_loss_clip": 1.04313207, "balance_loss_mlp": 1.01963091, "epoch": 0.6315156616365057, "flos": 22894273906560.0, "grad_norm": 4.4396584599345, "language_loss": 0.77206576, "learning_rate": 1.2629687487078821e-06, "loss": 0.79564416, "num_input_tokens_seen": 113094600, "step": 5252, "time_per_iteration": 2.757223129272461 }, { "auxiliary_loss_clip": 0.01231865, "auxiliary_loss_mlp": 0.01030509, "balance_loss_clip": 1.04732025, "balance_loss_mlp": 1.02246547, "epoch": 0.6316359045271448, "flos": 23726251699200.0, "grad_norm": 2.389215926509584, "language_loss": 0.76536751, "learning_rate": 1.2622446557964293e-06, "loss": 0.78799129, "num_input_tokens_seen": 113112605, "step": 5253, "time_per_iteration": 2.68548583984375 }, { "auxiliary_loss_clip": 0.01278353, "auxiliary_loss_mlp": 0.01019873, "balance_loss_clip": 1.04542851, "balance_loss_mlp": 1.01240444, "epoch": 0.631756147417784, "flos": 33108417164160.0, "grad_norm": 2.136544840285888, "language_loss": 0.71831954, "learning_rate": 1.261520674801876e-06, "loss": 0.74130177, "num_input_tokens_seen": 113133200, "step": 5254, "time_per_iteration": 2.784088134765625 }, { "auxiliary_loss_clip": 0.01285631, "auxiliary_loss_mlp": 0.01035564, "balance_loss_clip": 1.05625749, "balance_loss_mlp": 1.02712381, "epoch": 0.631876390308423, "flos": 31248424126080.0, "grad_norm": 2.3634801171899755, "language_loss": 0.72594404, "learning_rate": 1.2607968058340488e-06, "loss": 0.749156, "num_input_tokens_seen": 113152895, "step": 5255, "time_per_iteration": 2.7550034523010254 }, { "auxiliary_loss_clip": 0.0127498, "auxiliary_loss_mlp": 0.01026884, "balance_loss_clip": 1.0464381, "balance_loss_mlp": 1.01962972, "epoch": 0.6319966331990621, "flos": 24681152810880.0, "grad_norm": 1.984211437842188, "language_loss": 0.73435342, "learning_rate": 1.2600730490027583e-06, "loss": 0.75737208, "num_input_tokens_seen": 113173135, "step": 5256, "time_per_iteration": 2.7152047157287598 }, { "auxiliary_loss_clip": 0.0132509, "auxiliary_loss_mlp": 0.01032785, "balance_loss_clip": 1.04585958, "balance_loss_mlp": 1.02549803, "epoch": 0.6321168760897012, "flos": 17491764913920.0, "grad_norm": 10.538864755527547, "language_loss": 0.80564594, "learning_rate": 1.2593494044177984e-06, "loss": 0.82922471, "num_input_tokens_seen": 113191440, "step": 5257, "time_per_iteration": 3.620846748352051 }, { "auxiliary_loss_clip": 0.0118185, "auxiliary_loss_mlp": 0.01025594, "balance_loss_clip": 1.04990959, "balance_loss_mlp": 1.01805401, "epoch": 0.6322371189803403, "flos": 18295373940480.0, "grad_norm": 2.67724874632327, "language_loss": 0.80525362, "learning_rate": 1.2586258721889448e-06, "loss": 0.82732809, "num_input_tokens_seen": 113208790, "step": 5258, "time_per_iteration": 2.5657198429107666 }, { "auxiliary_loss_clip": 0.01372077, "auxiliary_loss_mlp": 0.01027591, "balance_loss_clip": 1.04523015, "balance_loss_mlp": 1.0198487, "epoch": 0.6323573618709794, "flos": 20157270399360.0, "grad_norm": 2.124776939129925, "language_loss": 0.81699967, "learning_rate": 1.2579024524259573e-06, "loss": 0.84099633, "num_input_tokens_seen": 113225050, "step": 5259, "time_per_iteration": 3.63861083984375 }, { "auxiliary_loss_clip": 0.01274109, "auxiliary_loss_mlp": 0.01026397, "balance_loss_clip": 1.04296684, "balance_loss_mlp": 1.0190326, "epoch": 0.6324776047616185, "flos": 20042391726720.0, "grad_norm": 2.1155244453714763, "language_loss": 0.91460067, "learning_rate": 1.2571791452385768e-06, "loss": 0.93760574, "num_input_tokens_seen": 113242315, "step": 5260, "time_per_iteration": 2.6852428913116455 }, { "auxiliary_loss_clip": 0.01277485, "auxiliary_loss_mlp": 0.01029851, "balance_loss_clip": 1.04940939, "balance_loss_mlp": 1.02329493, "epoch": 0.6325978476522576, "flos": 30848235724800.0, "grad_norm": 1.6300720456763362, "language_loss": 0.77230424, "learning_rate": 1.2564559507365301e-06, "loss": 0.79537755, "num_input_tokens_seen": 113264720, "step": 5261, "time_per_iteration": 3.6515378952026367 }, { "auxiliary_loss_clip": 0.01283459, "auxiliary_loss_mlp": 0.0102467, "balance_loss_clip": 1.0506916, "balance_loss_mlp": 1.01700461, "epoch": 0.6327180905428966, "flos": 24535104111360.0, "grad_norm": 2.9184297903601872, "language_loss": 0.78998768, "learning_rate": 1.2557328690295244e-06, "loss": 0.81306899, "num_input_tokens_seen": 113282910, "step": 5262, "time_per_iteration": 2.6835968494415283 }, { "auxiliary_loss_clip": 0.012304, "auxiliary_loss_mlp": 0.01031439, "balance_loss_clip": 1.04773486, "balance_loss_mlp": 1.02426827, "epoch": 0.6328383334335358, "flos": 21575274583680.0, "grad_norm": 1.6763201392426013, "language_loss": 0.76232147, "learning_rate": 1.255009900227251e-06, "loss": 0.78493983, "num_input_tokens_seen": 113301935, "step": 5263, "time_per_iteration": 2.761054515838623 }, { "auxiliary_loss_clip": 0.01176434, "auxiliary_loss_mlp": 0.01028465, "balance_loss_clip": 1.05067718, "balance_loss_mlp": 1.02124047, "epoch": 0.6329585763241748, "flos": 22929861306240.0, "grad_norm": 2.2159413090715536, "language_loss": 0.79588884, "learning_rate": 1.254287044439383e-06, "loss": 0.81793785, "num_input_tokens_seen": 113321540, "step": 5264, "time_per_iteration": 2.635645866394043 }, { "auxiliary_loss_clip": 0.01065272, "auxiliary_loss_mlp": 0.01007707, "balance_loss_clip": 1.00872195, "balance_loss_mlp": 1.00680149, "epoch": 0.6330788192148139, "flos": 70936897847040.0, "grad_norm": 0.773971778910318, "language_loss": 0.54372203, "learning_rate": 1.2535643017755776e-06, "loss": 0.56445181, "num_input_tokens_seen": 113383730, "step": 5265, "time_per_iteration": 3.2790462970733643 }, { "auxiliary_loss_clip": 0.01225899, "auxiliary_loss_mlp": 0.01028717, "balance_loss_clip": 1.04361844, "balance_loss_mlp": 1.02129626, "epoch": 0.6331990621054531, "flos": 21244501215360.0, "grad_norm": 3.4492790560881, "language_loss": 0.72267789, "learning_rate": 1.2528416723454737e-06, "loss": 0.74522412, "num_input_tokens_seen": 113400400, "step": 5266, "time_per_iteration": 3.681333303451538 }, { "auxiliary_loss_clip": 0.01177162, "auxiliary_loss_mlp": 0.0102405, "balance_loss_clip": 1.05089021, "balance_loss_mlp": 1.01702619, "epoch": 0.6333193049960921, "flos": 34459412526720.0, "grad_norm": 1.933363118868206, "language_loss": 0.70972544, "learning_rate": 1.2521191562586945e-06, "loss": 0.73173755, "num_input_tokens_seen": 113424050, "step": 5267, "time_per_iteration": 2.729631185531616 }, { "auxiliary_loss_clip": 0.0117753, "auxiliary_loss_mlp": 0.02566665, "balance_loss_clip": 1.05020952, "balance_loss_mlp": 0.99990064, "epoch": 0.6334395478867312, "flos": 18329883932160.0, "grad_norm": 2.6562279168059817, "language_loss": 0.76717275, "learning_rate": 1.2513967536248445e-06, "loss": 0.80461466, "num_input_tokens_seen": 113440370, "step": 5268, "time_per_iteration": 2.6455459594726562 }, { "auxiliary_loss_clip": 0.01227248, "auxiliary_loss_mlp": 0.01033837, "balance_loss_clip": 1.05121887, "balance_loss_mlp": 1.02618384, "epoch": 0.6335597907773702, "flos": 23623152687360.0, "grad_norm": 1.684609454335387, "language_loss": 0.81184131, "learning_rate": 1.2506744645535117e-06, "loss": 0.83445215, "num_input_tokens_seen": 113460800, "step": 5269, "time_per_iteration": 2.646491289138794 }, { "auxiliary_loss_clip": 0.01275716, "auxiliary_loss_mlp": 0.0102365, "balance_loss_clip": 1.04309106, "balance_loss_mlp": 1.01624131, "epoch": 0.6336800336680094, "flos": 22710913954560.0, "grad_norm": 2.6779841249830016, "language_loss": 0.60509878, "learning_rate": 1.249952289154267e-06, "loss": 0.62809241, "num_input_tokens_seen": 113480840, "step": 5270, "time_per_iteration": 2.7346575260162354 }, { "auxiliary_loss_clip": 0.01418272, "auxiliary_loss_mlp": 0.01028678, "balance_loss_clip": 1.04132748, "balance_loss_mlp": 1.02145362, "epoch": 0.6338002765586485, "flos": 23622757637760.0, "grad_norm": 1.8383011465736623, "language_loss": 0.76768851, "learning_rate": 1.2492302275366635e-06, "loss": 0.79215801, "num_input_tokens_seen": 113500515, "step": 5271, "time_per_iteration": 2.8717143535614014 }, { "auxiliary_loss_clip": 0.01227777, "auxiliary_loss_mlp": 0.01025046, "balance_loss_clip": 1.04877019, "balance_loss_mlp": 1.01674342, "epoch": 0.6339205194492875, "flos": 26505450708480.0, "grad_norm": 1.8906100038454539, "language_loss": 0.65720475, "learning_rate": 1.2485082798102377e-06, "loss": 0.67973292, "num_input_tokens_seen": 113520930, "step": 5272, "time_per_iteration": 2.66744327545166 }, { "auxiliary_loss_clip": 0.01335672, "auxiliary_loss_mlp": 0.01025218, "balance_loss_clip": 1.04652488, "balance_loss_mlp": 1.01782715, "epoch": 0.6340407623399267, "flos": 18544306170240.0, "grad_norm": 4.401367770023357, "language_loss": 0.6867938, "learning_rate": 1.2477864460845084e-06, "loss": 0.71040267, "num_input_tokens_seen": 113537330, "step": 5273, "time_per_iteration": 2.7111310958862305 }, { "auxiliary_loss_clip": 0.01280128, "auxiliary_loss_mlp": 0.01029879, "balance_loss_clip": 1.04845524, "balance_loss_mlp": 1.02192223, "epoch": 0.6341610052305657, "flos": 17712579772800.0, "grad_norm": 2.9346408429888506, "language_loss": 0.74403274, "learning_rate": 1.2470647264689776e-06, "loss": 0.76713282, "num_input_tokens_seen": 113555810, "step": 5274, "time_per_iteration": 2.644277334213257 }, { "auxiliary_loss_clip": 0.01428488, "auxiliary_loss_mlp": 0.01026236, "balance_loss_clip": 1.04045057, "balance_loss_mlp": 1.01851177, "epoch": 0.6342812481212048, "flos": 23587026583680.0, "grad_norm": 2.25228408086617, "language_loss": 0.71544713, "learning_rate": 1.2463431210731282e-06, "loss": 0.73999435, "num_input_tokens_seen": 113575395, "step": 5275, "time_per_iteration": 2.777639627456665 }, { "auxiliary_loss_clip": 0.01283966, "auxiliary_loss_mlp": 0.01025368, "balance_loss_clip": 1.04167843, "balance_loss_mlp": 1.01837587, "epoch": 0.634401491011844, "flos": 17821927751040.0, "grad_norm": 2.605796107315974, "language_loss": 0.7661404, "learning_rate": 1.2456216300064289e-06, "loss": 0.78923368, "num_input_tokens_seen": 113592945, "step": 5276, "time_per_iteration": 2.7237155437469482 }, { "auxiliary_loss_clip": 0.01275743, "auxiliary_loss_mlp": 0.01024254, "balance_loss_clip": 1.04569292, "balance_loss_mlp": 1.01686025, "epoch": 0.634521733902483, "flos": 21358158825600.0, "grad_norm": 1.6593548572924075, "language_loss": 0.78519452, "learning_rate": 1.244900253378328e-06, "loss": 0.80819452, "num_input_tokens_seen": 113613000, "step": 5277, "time_per_iteration": 2.726276397705078 }, { "auxiliary_loss_clip": 0.01425948, "auxiliary_loss_mlp": 0.01028537, "balance_loss_clip": 1.04473138, "balance_loss_mlp": 1.02142572, "epoch": 0.6346419767931221, "flos": 16545052103040.0, "grad_norm": 3.4604724031503413, "language_loss": 0.69620162, "learning_rate": 1.2441789912982583e-06, "loss": 0.72074646, "num_input_tokens_seen": 113630085, "step": 5278, "time_per_iteration": 2.8789350986480713 }, { "auxiliary_loss_clip": 0.01231589, "auxiliary_loss_mlp": 0.01029578, "balance_loss_clip": 1.05105352, "balance_loss_mlp": 1.02156091, "epoch": 0.6347622196837612, "flos": 24350989973760.0, "grad_norm": 1.7433787369038627, "language_loss": 0.64892191, "learning_rate": 1.2434578438756346e-06, "loss": 0.67153358, "num_input_tokens_seen": 113650515, "step": 5279, "time_per_iteration": 2.785865068435669 }, { "auxiliary_loss_clip": 0.01230458, "auxiliary_loss_mlp": 0.01025166, "balance_loss_clip": 1.0477016, "balance_loss_mlp": 1.01814139, "epoch": 0.6348824625744003, "flos": 64523178195840.0, "grad_norm": 1.9244817901241218, "language_loss": 0.77934837, "learning_rate": 1.242736811219855e-06, "loss": 0.80190468, "num_input_tokens_seen": 113676475, "step": 5280, "time_per_iteration": 3.053905725479126 }, { "auxiliary_loss_clip": 0.01224829, "auxiliary_loss_mlp": 0.01026235, "balance_loss_clip": 1.04686999, "balance_loss_mlp": 1.0190227, "epoch": 0.6350027054650393, "flos": 28622133313920.0, "grad_norm": 1.9938468459570362, "language_loss": 0.81865847, "learning_rate": 1.2420158934402988e-06, "loss": 0.84116912, "num_input_tokens_seen": 113697090, "step": 5281, "time_per_iteration": 2.7481939792633057 }, { "auxiliary_loss_clip": 0.01317511, "auxiliary_loss_mlp": 0.01023225, "balance_loss_clip": 1.04174614, "balance_loss_mlp": 1.01545238, "epoch": 0.6351229483556785, "flos": 23002544476800.0, "grad_norm": 2.6564719437766575, "language_loss": 0.84740782, "learning_rate": 1.2412950906463286e-06, "loss": 0.87081516, "num_input_tokens_seen": 113714395, "step": 5282, "time_per_iteration": 2.8220255374908447 }, { "auxiliary_loss_clip": 0.01371923, "auxiliary_loss_mlp": 0.0102662, "balance_loss_clip": 1.04397786, "balance_loss_mlp": 1.01966739, "epoch": 0.6352431912463176, "flos": 21939300967680.0, "grad_norm": 2.225390811335096, "language_loss": 0.89883804, "learning_rate": 1.2405744029472902e-06, "loss": 0.92282355, "num_input_tokens_seen": 113733880, "step": 5283, "time_per_iteration": 2.7432243824005127 }, { "auxiliary_loss_clip": 0.01276179, "auxiliary_loss_mlp": 0.01025042, "balance_loss_clip": 1.04707134, "balance_loss_mlp": 1.01791632, "epoch": 0.6353634341369566, "flos": 13735257684480.0, "grad_norm": 1.9389407649095132, "language_loss": 0.7634353, "learning_rate": 1.2398538304525108e-06, "loss": 0.78644753, "num_input_tokens_seen": 113752505, "step": 5284, "time_per_iteration": 3.617347478866577 }, { "auxiliary_loss_clip": 0.01335617, "auxiliary_loss_mlp": 0.01032284, "balance_loss_clip": 1.04982948, "balance_loss_mlp": 1.0243988, "epoch": 0.6354836770275958, "flos": 19316170552320.0, "grad_norm": 2.5036315219263514, "language_loss": 0.75693536, "learning_rate": 1.2391333732713016e-06, "loss": 0.78061438, "num_input_tokens_seen": 113770310, "step": 5285, "time_per_iteration": 3.629286766052246 }, { "auxiliary_loss_clip": 0.01335075, "auxiliary_loss_mlp": 0.01027891, "balance_loss_clip": 1.04631329, "balance_loss_mlp": 1.02003467, "epoch": 0.6356039199182348, "flos": 21613375935360.0, "grad_norm": 3.783730187925394, "language_loss": 0.78629208, "learning_rate": 1.2384130315129543e-06, "loss": 0.80992174, "num_input_tokens_seen": 113788635, "step": 5286, "time_per_iteration": 3.609495162963867 }, { "auxiliary_loss_clip": 0.01577718, "auxiliary_loss_mlp": 0.01035982, "balance_loss_clip": 1.0377872, "balance_loss_mlp": 1.02857625, "epoch": 0.6357241628088739, "flos": 18111978074880.0, "grad_norm": 2.2767841656411343, "language_loss": 0.73352516, "learning_rate": 1.2376928052867447e-06, "loss": 0.75966215, "num_input_tokens_seen": 113807755, "step": 5287, "time_per_iteration": 3.0749175548553467 }, { "auxiliary_loss_clip": 0.01279211, "auxiliary_loss_mlp": 0.01027614, "balance_loss_clip": 1.05007708, "balance_loss_mlp": 1.02047062, "epoch": 0.6358444056995131, "flos": 24935256599040.0, "grad_norm": 4.622598863238778, "language_loss": 0.77545387, "learning_rate": 1.2369726947019299e-06, "loss": 0.79852211, "num_input_tokens_seen": 113828230, "step": 5288, "time_per_iteration": 3.867518663406372 }, { "auxiliary_loss_clip": 0.01228078, "auxiliary_loss_mlp": 0.01023677, "balance_loss_clip": 1.04720545, "balance_loss_mlp": 1.01660764, "epoch": 0.6359646485901521, "flos": 23293348986240.0, "grad_norm": 2.6290673734833123, "language_loss": 0.67141831, "learning_rate": 1.2362526998677511e-06, "loss": 0.69393587, "num_input_tokens_seen": 113844595, "step": 5289, "time_per_iteration": 2.738909959793091 }, { "auxiliary_loss_clip": 0.0128417, "auxiliary_loss_mlp": 0.01029213, "balance_loss_clip": 1.04931843, "balance_loss_mlp": 1.02213228, "epoch": 0.6360848914807912, "flos": 20887442069760.0, "grad_norm": 1.958280311406575, "language_loss": 0.84136766, "learning_rate": 1.2355328208934301e-06, "loss": 0.86450148, "num_input_tokens_seen": 113863470, "step": 5290, "time_per_iteration": 2.723400831222534 }, { "auxiliary_loss_clip": 0.01230447, "auxiliary_loss_mlp": 0.02566996, "balance_loss_clip": 1.04834008, "balance_loss_mlp": 0.9999457, "epoch": 0.6362051343714303, "flos": 18479775386880.0, "grad_norm": 1.7548591254531911, "language_loss": 0.72765803, "learning_rate": 1.2348130578881728e-06, "loss": 0.76563245, "num_input_tokens_seen": 113881690, "step": 5291, "time_per_iteration": 3.5798356533050537 }, { "auxiliary_loss_clip": 0.01183418, "auxiliary_loss_mlp": 0.01026681, "balance_loss_clip": 1.0526371, "balance_loss_mlp": 1.01898599, "epoch": 0.6363253772620694, "flos": 24389594115840.0, "grad_norm": 3.976229429780192, "language_loss": 0.76433283, "learning_rate": 1.2340934109611664e-06, "loss": 0.78643382, "num_input_tokens_seen": 113902450, "step": 5292, "time_per_iteration": 2.659642457962036 }, { "auxiliary_loss_clip": 0.01290766, "auxiliary_loss_mlp": 0.01031627, "balance_loss_clip": 1.05176747, "balance_loss_mlp": 1.0234251, "epoch": 0.6364456201527084, "flos": 25958243940480.0, "grad_norm": 2.6556450841890316, "language_loss": 0.6845867, "learning_rate": 1.2333738802215798e-06, "loss": 0.70781064, "num_input_tokens_seen": 113922670, "step": 5293, "time_per_iteration": 2.7764925956726074 }, { "auxiliary_loss_clip": 0.01374372, "auxiliary_loss_mlp": 0.0102664, "balance_loss_clip": 1.04250097, "balance_loss_mlp": 1.01977086, "epoch": 0.6365658630433476, "flos": 20740711011840.0, "grad_norm": 1.8053087493572912, "language_loss": 0.81016296, "learning_rate": 1.2326544657785668e-06, "loss": 0.83417308, "num_input_tokens_seen": 113942360, "step": 5294, "time_per_iteration": 2.763246774673462 }, { "auxiliary_loss_clip": 0.01326943, "auxiliary_loss_mlp": 0.01030708, "balance_loss_clip": 1.04406643, "balance_loss_mlp": 1.0235498, "epoch": 0.6366861059339867, "flos": 21434146047360.0, "grad_norm": 3.2903584290296863, "language_loss": 0.74745631, "learning_rate": 1.2319351677412608e-06, "loss": 0.77103281, "num_input_tokens_seen": 113959405, "step": 5295, "time_per_iteration": 2.754897356033325 }, { "auxiliary_loss_clip": 0.01339776, "auxiliary_loss_mlp": 0.01025539, "balance_loss_clip": 1.0502311, "balance_loss_mlp": 1.0178138, "epoch": 0.6368063488246257, "flos": 22267093507200.0, "grad_norm": 3.0228096761891092, "language_loss": 0.74169385, "learning_rate": 1.2312159862187796e-06, "loss": 0.765347, "num_input_tokens_seen": 113977815, "step": 5296, "time_per_iteration": 2.7006120681762695 }, { "auxiliary_loss_clip": 0.01183189, "auxiliary_loss_mlp": 0.01033933, "balance_loss_clip": 1.0526979, "balance_loss_mlp": 1.02651787, "epoch": 0.6369265917152649, "flos": 22420719976320.0, "grad_norm": 1.57614599811213, "language_loss": 0.76277381, "learning_rate": 1.2304969213202217e-06, "loss": 0.78494507, "num_input_tokens_seen": 113999075, "step": 5297, "time_per_iteration": 2.6557743549346924 }, { "auxiliary_loss_clip": 0.01277922, "auxiliary_loss_mlp": 0.01024208, "balance_loss_clip": 1.04710948, "balance_loss_mlp": 1.01710296, "epoch": 0.6370468346059039, "flos": 24718176754560.0, "grad_norm": 2.881974843776227, "language_loss": 0.79669458, "learning_rate": 1.2297779731546692e-06, "loss": 0.81971586, "num_input_tokens_seen": 114018170, "step": 5298, "time_per_iteration": 2.6683907508850098 }, { "auxiliary_loss_clip": 0.01275798, "auxiliary_loss_mlp": 0.01031109, "balance_loss_clip": 1.04994333, "balance_loss_mlp": 1.02375984, "epoch": 0.637167077496543, "flos": 25296589463040.0, "grad_norm": 3.6600555054667767, "language_loss": 0.78095341, "learning_rate": 1.2290591418311853e-06, "loss": 0.80402249, "num_input_tokens_seen": 114035565, "step": 5299, "time_per_iteration": 2.7273497581481934 }, { "auxiliary_loss_clip": 0.01236541, "auxiliary_loss_mlp": 0.01025339, "balance_loss_clip": 1.05429935, "balance_loss_mlp": 1.01797748, "epoch": 0.637287320387182, "flos": 27671110871040.0, "grad_norm": 1.7119532676464184, "language_loss": 0.72348732, "learning_rate": 1.2283404274588172e-06, "loss": 0.74610615, "num_input_tokens_seen": 114054510, "step": 5300, "time_per_iteration": 2.704105854034424 }, { "auxiliary_loss_clip": 0.01379952, "auxiliary_loss_mlp": 0.01003266, "balance_loss_clip": 1.01305842, "balance_loss_mlp": 1.00230014, "epoch": 0.6374075632778212, "flos": 63173406873600.0, "grad_norm": 0.7424864654086032, "language_loss": 0.52722347, "learning_rate": 1.227621830146592e-06, "loss": 0.55105567, "num_input_tokens_seen": 114109875, "step": 5301, "time_per_iteration": 3.32707142829895 }, { "auxiliary_loss_clip": 0.01340744, "auxiliary_loss_mlp": 0.0103116, "balance_loss_clip": 1.05211771, "balance_loss_mlp": 1.02355456, "epoch": 0.6375278061684603, "flos": 25558127366400.0, "grad_norm": 1.8781768836186468, "language_loss": 0.78994977, "learning_rate": 1.2269033500035217e-06, "loss": 0.81366879, "num_input_tokens_seen": 114130010, "step": 5302, "time_per_iteration": 2.9176735877990723 }, { "auxiliary_loss_clip": 0.01227879, "auxiliary_loss_mlp": 0.01029359, "balance_loss_clip": 1.04790187, "balance_loss_mlp": 1.02178335, "epoch": 0.6376480490590993, "flos": 25666362023040.0, "grad_norm": 7.050854867166453, "language_loss": 0.73587561, "learning_rate": 1.2261849871385988e-06, "loss": 0.75844795, "num_input_tokens_seen": 114151115, "step": 5303, "time_per_iteration": 2.6849355697631836 }, { "auxiliary_loss_clip": 0.0117909, "auxiliary_loss_mlp": 0.01024813, "balance_loss_clip": 1.04996634, "balance_loss_mlp": 1.01760101, "epoch": 0.6377682919497385, "flos": 31537684350720.0, "grad_norm": 2.375572084102643, "language_loss": 0.6293323, "learning_rate": 1.2254667416607972e-06, "loss": 0.65137136, "num_input_tokens_seen": 114172715, "step": 5304, "time_per_iteration": 2.7112810611724854 }, { "auxiliary_loss_clip": 0.01226877, "auxiliary_loss_mlp": 0.01029648, "balance_loss_clip": 1.04875851, "balance_loss_mlp": 1.02211964, "epoch": 0.6378885348403776, "flos": 23039209284480.0, "grad_norm": 2.4206745456629153, "language_loss": 0.83108926, "learning_rate": 1.2247486136790756e-06, "loss": 0.8536545, "num_input_tokens_seen": 114192195, "step": 5305, "time_per_iteration": 2.651848554611206 }, { "auxiliary_loss_clip": 0.01233971, "auxiliary_loss_mlp": 0.01029945, "balance_loss_clip": 1.05219877, "balance_loss_mlp": 1.02261353, "epoch": 0.6380087777310166, "flos": 18697070712960.0, "grad_norm": 2.3200955888049486, "language_loss": 0.80098975, "learning_rate": 1.2240306033023726e-06, "loss": 0.82362884, "num_input_tokens_seen": 114210020, "step": 5306, "time_per_iteration": 2.6729204654693604 }, { "auxiliary_loss_clip": 0.01328401, "auxiliary_loss_mlp": 0.01024998, "balance_loss_clip": 1.04139209, "balance_loss_mlp": 1.01733899, "epoch": 0.6381290206216558, "flos": 23331558078720.0, "grad_norm": 1.8287691826387416, "language_loss": 0.71887654, "learning_rate": 1.223312710639611e-06, "loss": 0.74241054, "num_input_tokens_seen": 114228740, "step": 5307, "time_per_iteration": 2.6990292072296143 }, { "auxiliary_loss_clip": 0.0128004, "auxiliary_loss_mlp": 0.01025798, "balance_loss_clip": 1.05002117, "balance_loss_mlp": 1.01839852, "epoch": 0.6382492635122948, "flos": 18880466578560.0, "grad_norm": 9.148277954893828, "language_loss": 0.86992294, "learning_rate": 1.2225949357996928e-06, "loss": 0.89298135, "num_input_tokens_seen": 114246865, "step": 5308, "time_per_iteration": 2.6827967166900635 }, { "auxiliary_loss_clip": 0.01224944, "auxiliary_loss_mlp": 0.0102385, "balance_loss_clip": 1.04951203, "balance_loss_mlp": 1.01681399, "epoch": 0.6383695064029339, "flos": 27819134818560.0, "grad_norm": 1.6276328091586416, "language_loss": 0.80488217, "learning_rate": 1.221877278891505e-06, "loss": 0.82737017, "num_input_tokens_seen": 114266120, "step": 5309, "time_per_iteration": 3.9019930362701416 }, { "auxiliary_loss_clip": 0.01136217, "auxiliary_loss_mlp": 0.01028107, "balance_loss_clip": 1.05216789, "balance_loss_mlp": 1.02027524, "epoch": 0.638489749293573, "flos": 26395635853440.0, "grad_norm": 2.1254113099500227, "language_loss": 0.71100497, "learning_rate": 1.221159740023915e-06, "loss": 0.73264819, "num_input_tokens_seen": 114285950, "step": 5310, "time_per_iteration": 3.627065658569336 }, { "auxiliary_loss_clip": 0.01340415, "auxiliary_loss_mlp": 0.02572309, "balance_loss_clip": 1.05035043, "balance_loss_mlp": 0.99994391, "epoch": 0.6386099921842121, "flos": 23988328306560.0, "grad_norm": 1.9645677604340959, "language_loss": 0.72886324, "learning_rate": 1.2204423193057735e-06, "loss": 0.76799047, "num_input_tokens_seen": 114304780, "step": 5311, "time_per_iteration": 2.745328187942505 }, { "auxiliary_loss_clip": 0.01171094, "auxiliary_loss_mlp": 0.01006057, "balance_loss_clip": 1.00769019, "balance_loss_mlp": 1.00510311, "epoch": 0.6387302350748512, "flos": 71731169337600.0, "grad_norm": 0.8453016799504164, "language_loss": 0.63358885, "learning_rate": 1.2197250168459122e-06, "loss": 0.65536034, "num_input_tokens_seen": 114361180, "step": 5312, "time_per_iteration": 4.310146808624268 }, { "auxiliary_loss_clip": 0.01230352, "auxiliary_loss_mlp": 0.01022881, "balance_loss_clip": 1.04808497, "balance_loss_mlp": 1.01598477, "epoch": 0.6388504779654903, "flos": 14535778141440.0, "grad_norm": 2.070153284371048, "language_loss": 0.74610418, "learning_rate": 1.2190078327531454e-06, "loss": 0.76863647, "num_input_tokens_seen": 114377425, "step": 5313, "time_per_iteration": 2.6409835815429688 }, { "auxiliary_loss_clip": 0.01230427, "auxiliary_loss_mlp": 0.01029252, "balance_loss_clip": 1.04962349, "balance_loss_mlp": 1.02264488, "epoch": 0.6389707208561294, "flos": 22346133384960.0, "grad_norm": 2.074891438118566, "language_loss": 0.7294749, "learning_rate": 1.2182907671362697e-06, "loss": 0.75207162, "num_input_tokens_seen": 114398120, "step": 5314, "time_per_iteration": 2.6993775367736816 }, { "auxiliary_loss_clip": 0.01224753, "auxiliary_loss_mlp": 0.01025753, "balance_loss_clip": 1.05029035, "balance_loss_mlp": 1.01813591, "epoch": 0.6390909637467684, "flos": 19426883247360.0, "grad_norm": 2.155721714967466, "language_loss": 0.78890157, "learning_rate": 1.2175738201040626e-06, "loss": 0.81140661, "num_input_tokens_seen": 114415160, "step": 5315, "time_per_iteration": 2.591069459915161 }, { "auxiliary_loss_clip": 0.01228544, "auxiliary_loss_mlp": 0.01034898, "balance_loss_clip": 1.05011809, "balance_loss_mlp": 1.02724779, "epoch": 0.6392112066374076, "flos": 24090852700800.0, "grad_norm": 1.8746230010381824, "language_loss": 0.78611332, "learning_rate": 1.2168569917652855e-06, "loss": 0.80874777, "num_input_tokens_seen": 114435015, "step": 5316, "time_per_iteration": 2.707594633102417 }, { "auxiliary_loss_clip": 0.01225242, "auxiliary_loss_mlp": 0.01029437, "balance_loss_clip": 1.04828978, "balance_loss_mlp": 1.02234375, "epoch": 0.6393314495280467, "flos": 26795141896320.0, "grad_norm": 9.432821893734403, "language_loss": 0.63907576, "learning_rate": 1.2161402822286797e-06, "loss": 0.66162258, "num_input_tokens_seen": 114455700, "step": 5317, "time_per_iteration": 3.5787999629974365 }, { "auxiliary_loss_clip": 0.01329658, "auxiliary_loss_mlp": 0.01023797, "balance_loss_clip": 1.0481993, "balance_loss_mlp": 1.017133, "epoch": 0.6394516924186857, "flos": 20260692633600.0, "grad_norm": 2.3702539679344388, "language_loss": 0.79382002, "learning_rate": 1.2154236916029703e-06, "loss": 0.81735456, "num_input_tokens_seen": 114473675, "step": 5318, "time_per_iteration": 2.683284044265747 }, { "auxiliary_loss_clip": 0.01376356, "auxiliary_loss_mlp": 0.01031012, "balance_loss_clip": 1.04133296, "balance_loss_mlp": 1.02366006, "epoch": 0.6395719353093249, "flos": 18368847210240.0, "grad_norm": 2.4637363562091554, "language_loss": 0.73552692, "learning_rate": 1.2147072199968627e-06, "loss": 0.75960052, "num_input_tokens_seen": 114492310, "step": 5319, "time_per_iteration": 2.7734501361846924 }, { "auxiliary_loss_clip": 0.01226509, "auxiliary_loss_mlp": 0.01029856, "balance_loss_clip": 1.04923844, "balance_loss_mlp": 1.02301908, "epoch": 0.6396921781999639, "flos": 17566315591680.0, "grad_norm": 1.862433203879789, "language_loss": 0.71987844, "learning_rate": 1.2139908675190454e-06, "loss": 0.74244213, "num_input_tokens_seen": 114511520, "step": 5320, "time_per_iteration": 2.5943119525909424 }, { "auxiliary_loss_clip": 0.01416625, "auxiliary_loss_mlp": 0.0102649, "balance_loss_clip": 1.04022503, "balance_loss_mlp": 1.01908147, "epoch": 0.639812421090603, "flos": 21251252972160.0, "grad_norm": 2.7623210186736347, "language_loss": 0.75253767, "learning_rate": 1.2132746342781883e-06, "loss": 0.77696884, "num_input_tokens_seen": 114532680, "step": 5321, "time_per_iteration": 2.8414394855499268 }, { "auxiliary_loss_clip": 0.01179229, "auxiliary_loss_mlp": 0.01027898, "balance_loss_clip": 1.05036378, "balance_loss_mlp": 1.02044797, "epoch": 0.6399326639812422, "flos": 11180967684480.0, "grad_norm": 3.047870338701369, "language_loss": 0.80004287, "learning_rate": 1.2125585203829442e-06, "loss": 0.82211411, "num_input_tokens_seen": 114548320, "step": 5322, "time_per_iteration": 2.550372838973999 }, { "auxiliary_loss_clip": 0.01316651, "auxiliary_loss_mlp": 0.01027944, "balance_loss_clip": 1.04529929, "balance_loss_mlp": 1.02055264, "epoch": 0.6400529068718812, "flos": 23911048195200.0, "grad_norm": 1.8283726804263651, "language_loss": 0.74109644, "learning_rate": 1.211842525941946e-06, "loss": 0.76454234, "num_input_tokens_seen": 114568115, "step": 5323, "time_per_iteration": 2.7672312259674072 }, { "auxiliary_loss_clip": 0.01369604, "auxiliary_loss_mlp": 0.01026705, "balance_loss_clip": 1.04580283, "balance_loss_mlp": 1.01953447, "epoch": 0.6401731497625203, "flos": 44018724890880.0, "grad_norm": 1.825817199706977, "language_loss": 0.79146278, "learning_rate": 1.2111266510638105e-06, "loss": 0.81542587, "num_input_tokens_seen": 114591040, "step": 5324, "time_per_iteration": 3.008723735809326 }, { "auxiliary_loss_clip": 0.01415866, "auxiliary_loss_mlp": 0.01026504, "balance_loss_clip": 1.0408721, "balance_loss_mlp": 1.01885414, "epoch": 0.6402933926531594, "flos": 20662209838080.0, "grad_norm": 2.140801100020516, "language_loss": 0.80098367, "learning_rate": 1.2104108958571346e-06, "loss": 0.82540739, "num_input_tokens_seen": 114609310, "step": 5325, "time_per_iteration": 2.79460072517395 }, { "auxiliary_loss_clip": 0.01223413, "auxiliary_loss_mlp": 0.01032883, "balance_loss_clip": 1.04979348, "balance_loss_mlp": 1.02576935, "epoch": 0.6404136355437985, "flos": 24863327614080.0, "grad_norm": 1.458621368437152, "language_loss": 0.75491369, "learning_rate": 1.2096952604304975e-06, "loss": 0.77747667, "num_input_tokens_seen": 114629740, "step": 5326, "time_per_iteration": 2.7047767639160156 }, { "auxiliary_loss_clip": 0.01229211, "auxiliary_loss_mlp": 0.01026812, "balance_loss_clip": 1.04833412, "balance_loss_mlp": 1.01932561, "epoch": 0.6405338784344375, "flos": 40479548901120.0, "grad_norm": 2.2676051287849788, "language_loss": 0.70444167, "learning_rate": 1.2089797448924616e-06, "loss": 0.72700179, "num_input_tokens_seen": 114653615, "step": 5327, "time_per_iteration": 2.7771782875061035 }, { "auxiliary_loss_clip": 0.0128388, "auxiliary_loss_mlp": 0.01032524, "balance_loss_clip": 1.04107738, "balance_loss_mlp": 1.02472186, "epoch": 0.6406541213250767, "flos": 20886041439360.0, "grad_norm": 6.254847309407252, "language_loss": 0.66257668, "learning_rate": 1.2082643493515692e-06, "loss": 0.68574071, "num_input_tokens_seen": 114671935, "step": 5328, "time_per_iteration": 2.7917044162750244 }, { "auxiliary_loss_clip": 0.01227695, "auxiliary_loss_mlp": 0.01024791, "balance_loss_clip": 1.05037832, "balance_loss_mlp": 1.01734066, "epoch": 0.6407743642157158, "flos": 23295970679040.0, "grad_norm": 2.0696929043392256, "language_loss": 0.82075852, "learning_rate": 1.207549073916346e-06, "loss": 0.84328336, "num_input_tokens_seen": 114692870, "step": 5329, "time_per_iteration": 2.6529016494750977 }, { "auxiliary_loss_clip": 0.01272538, "auxiliary_loss_mlp": 0.0102621, "balance_loss_clip": 1.04760504, "balance_loss_mlp": 1.01924539, "epoch": 0.6408946071063548, "flos": 15012636122880.0, "grad_norm": 2.1317507050475566, "language_loss": 0.78030324, "learning_rate": 1.2068339186952976e-06, "loss": 0.80329078, "num_input_tokens_seen": 114710410, "step": 5330, "time_per_iteration": 2.6093974113464355 }, { "auxiliary_loss_clip": 0.01231919, "auxiliary_loss_mlp": 0.01028015, "balance_loss_clip": 1.05038023, "balance_loss_mlp": 1.02081156, "epoch": 0.6410148499969939, "flos": 22528595496960.0, "grad_norm": 1.8263896813610085, "language_loss": 0.73330009, "learning_rate": 1.2061188837969136e-06, "loss": 0.75589937, "num_input_tokens_seen": 114730020, "step": 5331, "time_per_iteration": 2.645265579223633 }, { "auxiliary_loss_clip": 0.01322756, "auxiliary_loss_mlp": 0.01029349, "balance_loss_clip": 1.04172075, "balance_loss_mlp": 1.02195764, "epoch": 0.641135092887633, "flos": 12422004537600.0, "grad_norm": 3.9640093134470415, "language_loss": 0.84484422, "learning_rate": 1.2054039693296631e-06, "loss": 0.86836535, "num_input_tokens_seen": 114748015, "step": 5332, "time_per_iteration": 2.6965737342834473 }, { "auxiliary_loss_clip": 0.01330345, "auxiliary_loss_mlp": 0.01027537, "balance_loss_clip": 1.04777288, "balance_loss_mlp": 1.02029836, "epoch": 0.6412553357782721, "flos": 22127329687680.0, "grad_norm": 2.284399082318156, "language_loss": 0.81534171, "learning_rate": 1.2046891754019992e-06, "loss": 0.83892053, "num_input_tokens_seen": 114768625, "step": 5333, "time_per_iteration": 2.6886987686157227 }, { "auxiliary_loss_clip": 0.01232679, "auxiliary_loss_mlp": 0.01030663, "balance_loss_clip": 1.05097818, "balance_loss_mlp": 1.02349544, "epoch": 0.6413755786689112, "flos": 15888605097600.0, "grad_norm": 2.0349333454385454, "language_loss": 0.82725275, "learning_rate": 1.2039745021223548e-06, "loss": 0.84988606, "num_input_tokens_seen": 114786045, "step": 5334, "time_per_iteration": 2.65878963470459 }, { "auxiliary_loss_clip": 0.01270238, "auxiliary_loss_mlp": 0.01003842, "balance_loss_clip": 1.00811672, "balance_loss_mlp": 1.00296021, "epoch": 0.6414958215595503, "flos": 68039159955840.0, "grad_norm": 0.7910189274494626, "language_loss": 0.57009828, "learning_rate": 1.2032599495991456e-06, "loss": 0.59283912, "num_input_tokens_seen": 114850785, "step": 5335, "time_per_iteration": 4.429994344711304 }, { "auxiliary_loss_clip": 0.01225924, "auxiliary_loss_mlp": 0.01026514, "balance_loss_clip": 1.04986441, "balance_loss_mlp": 1.01916444, "epoch": 0.6416160644501894, "flos": 44091300320640.0, "grad_norm": 1.9210795253401722, "language_loss": 0.69570345, "learning_rate": 1.2025455179407685e-06, "loss": 0.71822786, "num_input_tokens_seen": 114871945, "step": 5336, "time_per_iteration": 3.7220423221588135 }, { "auxiliary_loss_clip": 0.01223077, "auxiliary_loss_mlp": 0.02567919, "balance_loss_clip": 1.04731226, "balance_loss_mlp": 0.99993193, "epoch": 0.6417363073408284, "flos": 20959837931520.0, "grad_norm": 3.1876058874053483, "language_loss": 0.73820984, "learning_rate": 1.2018312072556022e-06, "loss": 0.77611983, "num_input_tokens_seen": 114890445, "step": 5337, "time_per_iteration": 2.653197765350342 }, { "auxiliary_loss_clip": 0.01178015, "auxiliary_loss_mlp": 0.0256696, "balance_loss_clip": 1.05112588, "balance_loss_mlp": 0.99994761, "epoch": 0.6418565502314676, "flos": 22455122227200.0, "grad_norm": 2.95149149268209, "language_loss": 0.74899352, "learning_rate": 1.2011170176520077e-06, "loss": 0.78644323, "num_input_tokens_seen": 114911360, "step": 5338, "time_per_iteration": 3.463630199432373 }, { "auxiliary_loss_clip": 0.01409338, "auxiliary_loss_mlp": 0.01023384, "balance_loss_clip": 1.03995204, "balance_loss_mlp": 1.01663327, "epoch": 0.6419767931221066, "flos": 25045502417280.0, "grad_norm": 1.5414715759350448, "language_loss": 0.81390941, "learning_rate": 1.2004029492383256e-06, "loss": 0.83823669, "num_input_tokens_seen": 114932700, "step": 5339, "time_per_iteration": 2.8309359550476074 }, { "auxiliary_loss_clip": 0.01227818, "auxiliary_loss_mlp": 0.01026233, "balance_loss_clip": 1.05111265, "balance_loss_mlp": 1.01877379, "epoch": 0.6420970360127457, "flos": 19463691709440.0, "grad_norm": 2.0765680730243252, "language_loss": 0.73506361, "learning_rate": 1.1996890021228814e-06, "loss": 0.75760412, "num_input_tokens_seen": 114949475, "step": 5340, "time_per_iteration": 2.6715340614318848 }, { "auxiliary_loss_clip": 0.01273984, "auxiliary_loss_mlp": 0.0103054, "balance_loss_clip": 1.0447886, "balance_loss_mlp": 1.02360857, "epoch": 0.6422172789033849, "flos": 40406147458560.0, "grad_norm": 1.8266681848311934, "language_loss": 0.69944024, "learning_rate": 1.1989751764139785e-06, "loss": 0.72248548, "num_input_tokens_seen": 114973125, "step": 5341, "time_per_iteration": 2.9005038738250732 }, { "auxiliary_loss_clip": 0.01375219, "auxiliary_loss_mlp": 0.01028541, "balance_loss_clip": 1.03998959, "balance_loss_mlp": 1.02163231, "epoch": 0.6423375217940239, "flos": 27672870637440.0, "grad_norm": 1.7497912918768581, "language_loss": 0.83254623, "learning_rate": 1.1982614722199044e-06, "loss": 0.85658377, "num_input_tokens_seen": 114994300, "step": 5342, "time_per_iteration": 2.788990020751953 }, { "auxiliary_loss_clip": 0.01281391, "auxiliary_loss_mlp": 0.01027931, "balance_loss_clip": 1.04532671, "balance_loss_mlp": 1.02093959, "epoch": 0.642457764684663, "flos": 18369242259840.0, "grad_norm": 2.4343447924437114, "language_loss": 0.77818197, "learning_rate": 1.1975478896489276e-06, "loss": 0.80127519, "num_input_tokens_seen": 115012135, "step": 5343, "time_per_iteration": 3.593946695327759 }, { "auxiliary_loss_clip": 0.01176639, "auxiliary_loss_mlp": 0.01025318, "balance_loss_clip": 1.04941106, "balance_loss_mlp": 1.01855016, "epoch": 0.6425780075753021, "flos": 19750509809280.0, "grad_norm": 2.306817174436465, "language_loss": 0.76225352, "learning_rate": 1.1968344288092981e-06, "loss": 0.78427309, "num_input_tokens_seen": 115028715, "step": 5344, "time_per_iteration": 2.5889034271240234 }, { "auxiliary_loss_clip": 0.01226182, "auxiliary_loss_mlp": 0.02566624, "balance_loss_clip": 1.04989588, "balance_loss_mlp": 0.99998045, "epoch": 0.6426982504659412, "flos": 20558536208640.0, "grad_norm": 1.9843278051346342, "language_loss": 0.64798766, "learning_rate": 1.1961210898092468e-06, "loss": 0.68591571, "num_input_tokens_seen": 115047665, "step": 5345, "time_per_iteration": 2.669842481613159 }, { "auxiliary_loss_clip": 0.01283959, "auxiliary_loss_mlp": 0.01032671, "balance_loss_clip": 1.05046904, "balance_loss_mlp": 1.0252831, "epoch": 0.6428184933565803, "flos": 17851984456320.0, "grad_norm": 4.141509608895586, "language_loss": 0.79022843, "learning_rate": 1.1954078727569874e-06, "loss": 0.81339478, "num_input_tokens_seen": 115064965, "step": 5346, "time_per_iteration": 2.6981024742126465 }, { "auxiliary_loss_clip": 0.01332893, "auxiliary_loss_mlp": 0.02565682, "balance_loss_clip": 1.04620123, "balance_loss_mlp": 0.99994588, "epoch": 0.6429387362472194, "flos": 22456953820800.0, "grad_norm": 1.607009078671201, "language_loss": 0.78207588, "learning_rate": 1.1946947777607141e-06, "loss": 0.82106167, "num_input_tokens_seen": 115086100, "step": 5347, "time_per_iteration": 2.7820684909820557 }, { "auxiliary_loss_clip": 0.01370123, "auxiliary_loss_mlp": 0.01030775, "balance_loss_clip": 1.04205418, "balance_loss_mlp": 1.02306819, "epoch": 0.6430589791378585, "flos": 24752579005440.0, "grad_norm": 1.9588895379546838, "language_loss": 0.80246627, "learning_rate": 1.1939818049286024e-06, "loss": 0.82647526, "num_input_tokens_seen": 115104260, "step": 5348, "time_per_iteration": 2.8068578243255615 }, { "auxiliary_loss_clip": 0.01414496, "auxiliary_loss_mlp": 0.01029484, "balance_loss_clip": 1.04117012, "balance_loss_mlp": 1.02190185, "epoch": 0.6431792220284975, "flos": 24901249397760.0, "grad_norm": 1.671189822093965, "language_loss": 0.75499475, "learning_rate": 1.1932689543688101e-06, "loss": 0.77943456, "num_input_tokens_seen": 115125365, "step": 5349, "time_per_iteration": 2.8449716567993164 }, { "auxiliary_loss_clip": 0.01275471, "auxiliary_loss_mlp": 0.01025915, "balance_loss_clip": 1.0487442, "balance_loss_mlp": 1.01873875, "epoch": 0.6432994649191367, "flos": 21032305620480.0, "grad_norm": 1.9031050842654673, "language_loss": 0.72462022, "learning_rate": 1.1925562261894756e-06, "loss": 0.74763405, "num_input_tokens_seen": 115144445, "step": 5350, "time_per_iteration": 2.707843542098999 }, { "auxiliary_loss_clip": 0.01274651, "auxiliary_loss_mlp": 0.0102889, "balance_loss_clip": 1.04629564, "balance_loss_mlp": 1.02175248, "epoch": 0.6434197078097758, "flos": 30884433655680.0, "grad_norm": 1.8083097374239043, "language_loss": 0.77531129, "learning_rate": 1.1918436204987207e-06, "loss": 0.79834664, "num_input_tokens_seen": 115166305, "step": 5351, "time_per_iteration": 2.8311479091644287 }, { "auxiliary_loss_clip": 0.01231923, "auxiliary_loss_mlp": 0.01029401, "balance_loss_clip": 1.05327809, "balance_loss_mlp": 1.02210212, "epoch": 0.6435399507004148, "flos": 15012492468480.0, "grad_norm": 2.2759310312235175, "language_loss": 0.82223141, "learning_rate": 1.191131137404645e-06, "loss": 0.84484458, "num_input_tokens_seen": 115183045, "step": 5352, "time_per_iteration": 2.7492055892944336 }, { "auxiliary_loss_clip": 0.01321933, "auxiliary_loss_mlp": 0.01030043, "balance_loss_clip": 1.04541469, "balance_loss_mlp": 1.02266967, "epoch": 0.643660193591054, "flos": 19901981462400.0, "grad_norm": 1.8575178301040514, "language_loss": 0.77082479, "learning_rate": 1.190418777015333e-06, "loss": 0.79434454, "num_input_tokens_seen": 115201955, "step": 5353, "time_per_iteration": 2.666023015975952 }, { "auxiliary_loss_clip": 0.01276569, "auxiliary_loss_mlp": 0.01025903, "balance_loss_clip": 1.04735518, "balance_loss_mlp": 1.01902175, "epoch": 0.643780436481693, "flos": 24133622820480.0, "grad_norm": 2.02905636660288, "language_loss": 0.73628998, "learning_rate": 1.1897065394388487e-06, "loss": 0.75931472, "num_input_tokens_seen": 115222395, "step": 5354, "time_per_iteration": 2.817009449005127 }, { "auxiliary_loss_clip": 0.01278498, "auxiliary_loss_mlp": 0.01033406, "balance_loss_clip": 1.05293894, "balance_loss_mlp": 1.0261457, "epoch": 0.6439006793723321, "flos": 23148808657920.0, "grad_norm": 1.6293159810393822, "language_loss": 0.76637399, "learning_rate": 1.1889944247832385e-06, "loss": 0.78949296, "num_input_tokens_seen": 115242635, "step": 5355, "time_per_iteration": 2.6723968982696533 }, { "auxiliary_loss_clip": 0.01229441, "auxiliary_loss_mlp": 0.01029315, "balance_loss_clip": 1.04646063, "balance_loss_mlp": 1.02213001, "epoch": 0.6440209222629713, "flos": 23617909301760.0, "grad_norm": 2.088411334325203, "language_loss": 0.70783746, "learning_rate": 1.1882824331565283e-06, "loss": 0.730425, "num_input_tokens_seen": 115262095, "step": 5356, "time_per_iteration": 2.682340621948242 }, { "auxiliary_loss_clip": 0.01327074, "auxiliary_loss_mlp": 0.01026426, "balance_loss_clip": 1.04334235, "balance_loss_mlp": 1.01956844, "epoch": 0.6441411651536103, "flos": 16544872535040.0, "grad_norm": 2.2736883216079637, "language_loss": 0.88878089, "learning_rate": 1.1875705646667287e-06, "loss": 0.91231585, "num_input_tokens_seen": 115279985, "step": 5357, "time_per_iteration": 2.7104852199554443 }, { "auxiliary_loss_clip": 0.0122643, "auxiliary_loss_mlp": 0.01019169, "balance_loss_clip": 1.047521, "balance_loss_mlp": 1.0120461, "epoch": 0.6442614080442494, "flos": 25410965345280.0, "grad_norm": 2.2511954163330747, "language_loss": 0.7511307, "learning_rate": 1.1868588194218282e-06, "loss": 0.77358663, "num_input_tokens_seen": 115300365, "step": 5358, "time_per_iteration": 2.708192825317383 }, { "auxiliary_loss_clip": 0.01282283, "auxiliary_loss_mlp": 0.01034072, "balance_loss_clip": 1.04663801, "balance_loss_mlp": 1.02644265, "epoch": 0.6443816509348885, "flos": 28294017552000.0, "grad_norm": 1.6925277234657443, "language_loss": 0.73895746, "learning_rate": 1.1861471975297979e-06, "loss": 0.76212096, "num_input_tokens_seen": 115322060, "step": 5359, "time_per_iteration": 2.8025894165039062 }, { "auxiliary_loss_clip": 0.01321892, "auxiliary_loss_mlp": 0.01030421, "balance_loss_clip": 1.04891777, "balance_loss_mlp": 1.02312505, "epoch": 0.6445018938255276, "flos": 36690075964800.0, "grad_norm": 2.206159393892351, "language_loss": 0.71043539, "learning_rate": 1.185435699098591e-06, "loss": 0.7339586, "num_input_tokens_seen": 115348255, "step": 5360, "time_per_iteration": 2.8623263835906982 }, { "auxiliary_loss_clip": 0.01279517, "auxiliary_loss_mlp": 0.01027589, "balance_loss_clip": 1.04837704, "balance_loss_mlp": 1.02044201, "epoch": 0.6446221367161666, "flos": 14501411804160.0, "grad_norm": 2.2598051967358534, "language_loss": 0.78921866, "learning_rate": 1.1847243242361403e-06, "loss": 0.81228971, "num_input_tokens_seen": 115366845, "step": 5361, "time_per_iteration": 3.6350767612457275 }, { "auxiliary_loss_clip": 0.01286049, "auxiliary_loss_mlp": 0.0102714, "balance_loss_clip": 1.05044973, "balance_loss_mlp": 1.02001977, "epoch": 0.6447423796068057, "flos": 24609367480320.0, "grad_norm": 1.7891082558215068, "language_loss": 0.7755214, "learning_rate": 1.1840130730503624e-06, "loss": 0.7986533, "num_input_tokens_seen": 115388125, "step": 5362, "time_per_iteration": 3.5857667922973633 }, { "auxiliary_loss_clip": 0.01179367, "auxiliary_loss_mlp": 0.01028556, "balance_loss_clip": 1.05046284, "balance_loss_mlp": 1.02104914, "epoch": 0.6448626224974449, "flos": 25047298097280.0, "grad_norm": 1.7660727028670387, "language_loss": 0.74972528, "learning_rate": 1.1833019456491518e-06, "loss": 0.77180451, "num_input_tokens_seen": 115409655, "step": 5363, "time_per_iteration": 2.6265995502471924 }, { "auxiliary_loss_clip": 0.0122944, "auxiliary_loss_mlp": 0.0102619, "balance_loss_clip": 1.05025554, "balance_loss_mlp": 1.01930559, "epoch": 0.6449828653880839, "flos": 22530355263360.0, "grad_norm": 8.985661777713332, "language_loss": 0.78447819, "learning_rate": 1.1825909421403871e-06, "loss": 0.80703443, "num_input_tokens_seen": 115428750, "step": 5364, "time_per_iteration": 2.6992149353027344 }, { "auxiliary_loss_clip": 0.01230321, "auxiliary_loss_mlp": 0.01028907, "balance_loss_clip": 1.05063748, "balance_loss_mlp": 1.02196872, "epoch": 0.645103108278723, "flos": 25695736369920.0, "grad_norm": 6.476611330423354, "language_loss": 0.76526976, "learning_rate": 1.181880062631926e-06, "loss": 0.78786206, "num_input_tokens_seen": 115448085, "step": 5365, "time_per_iteration": 3.556283473968506 }, { "auxiliary_loss_clip": 0.01277299, "auxiliary_loss_mlp": 0.01027111, "balance_loss_clip": 1.04844809, "balance_loss_mlp": 1.01898694, "epoch": 0.6452233511693621, "flos": 27450331925760.0, "grad_norm": 2.531395611345167, "language_loss": 0.85035801, "learning_rate": 1.1811693072316093e-06, "loss": 0.87340206, "num_input_tokens_seen": 115465765, "step": 5366, "time_per_iteration": 2.6717875003814697 }, { "auxiliary_loss_clip": 0.01180461, "auxiliary_loss_mlp": 0.02568662, "balance_loss_clip": 1.05140781, "balance_loss_mlp": 0.99996006, "epoch": 0.6453435940600012, "flos": 19208618254080.0, "grad_norm": 2.688501113884709, "language_loss": 0.83988339, "learning_rate": 1.1804586760472574e-06, "loss": 0.87737465, "num_input_tokens_seen": 115482230, "step": 5367, "time_per_iteration": 2.558027744293213 }, { "auxiliary_loss_clip": 0.01325121, "auxiliary_loss_mlp": 0.01025032, "balance_loss_clip": 1.04642391, "balance_loss_mlp": 1.01818347, "epoch": 0.6454638369506402, "flos": 25737680476800.0, "grad_norm": 2.4442832692307532, "language_loss": 0.80593359, "learning_rate": 1.1797481691866736e-06, "loss": 0.82943511, "num_input_tokens_seen": 115499455, "step": 5368, "time_per_iteration": 2.8676185607910156 }, { "auxiliary_loss_clip": 0.01279141, "auxiliary_loss_mlp": 0.01025732, "balance_loss_clip": 1.05091083, "balance_loss_mlp": 1.01790047, "epoch": 0.6455840798412794, "flos": 20989176364800.0, "grad_norm": 2.312555763473352, "language_loss": 0.83032167, "learning_rate": 1.1790377867576393e-06, "loss": 0.85337043, "num_input_tokens_seen": 115517205, "step": 5369, "time_per_iteration": 3.9158096313476562 }, { "auxiliary_loss_clip": 0.0127847, "auxiliary_loss_mlp": 0.0103164, "balance_loss_clip": 1.04567003, "balance_loss_mlp": 1.02427578, "epoch": 0.6457043227319185, "flos": 26067556005120.0, "grad_norm": 2.675385620108355, "language_loss": 0.76739192, "learning_rate": 1.1783275288679203e-06, "loss": 0.79049301, "num_input_tokens_seen": 115534370, "step": 5370, "time_per_iteration": 2.6878902912139893 }, { "auxiliary_loss_clip": 0.01121286, "auxiliary_loss_mlp": 0.0100246, "balance_loss_clip": 1.00901008, "balance_loss_mlp": 1.00145233, "epoch": 0.6458245656225575, "flos": 60370831088640.0, "grad_norm": 0.8457571277618732, "language_loss": 0.57114094, "learning_rate": 1.177617395625262e-06, "loss": 0.59237844, "num_input_tokens_seen": 115592345, "step": 5371, "time_per_iteration": 3.1920483112335205 }, { "auxiliary_loss_clip": 0.01225855, "auxiliary_loss_mlp": 0.01029658, "balance_loss_clip": 1.04978991, "balance_loss_mlp": 1.02219522, "epoch": 0.6459448085131967, "flos": 23076771932160.0, "grad_norm": 1.9086016240199255, "language_loss": 0.75413489, "learning_rate": 1.1769073871373908e-06, "loss": 0.77669001, "num_input_tokens_seen": 115612550, "step": 5372, "time_per_iteration": 2.6086840629577637 }, { "auxiliary_loss_clip": 0.01327057, "auxiliary_loss_mlp": 0.01023506, "balance_loss_clip": 1.04460764, "balance_loss_mlp": 1.01577187, "epoch": 0.6460650514038357, "flos": 22598190097920.0, "grad_norm": 1.6636978835221703, "language_loss": 0.83597755, "learning_rate": 1.176197503512015e-06, "loss": 0.85948324, "num_input_tokens_seen": 115632265, "step": 5373, "time_per_iteration": 2.704448938369751 }, { "auxiliary_loss_clip": 0.01278112, "auxiliary_loss_mlp": 0.01027707, "balance_loss_clip": 1.0479672, "balance_loss_mlp": 1.0212276, "epoch": 0.6461852942944748, "flos": 20266726118400.0, "grad_norm": 2.013839878944226, "language_loss": 0.81944561, "learning_rate": 1.1754877448568223e-06, "loss": 0.84250379, "num_input_tokens_seen": 115651720, "step": 5374, "time_per_iteration": 2.67710018157959 }, { "auxiliary_loss_clip": 0.01276058, "auxiliary_loss_mlp": 0.01026299, "balance_loss_clip": 1.04772878, "balance_loss_mlp": 1.01907802, "epoch": 0.646305537185114, "flos": 23367109564800.0, "grad_norm": 2.3404292361603374, "language_loss": 0.90275306, "learning_rate": 1.1747781112794837e-06, "loss": 0.9257766, "num_input_tokens_seen": 115668215, "step": 5375, "time_per_iteration": 2.6170248985290527 }, { "auxiliary_loss_clip": 0.01327311, "auxiliary_loss_mlp": 0.01029558, "balance_loss_clip": 1.04671907, "balance_loss_mlp": 1.02237546, "epoch": 0.646425780075753, "flos": 24277480790400.0, "grad_norm": 1.6096175715666006, "language_loss": 0.83236104, "learning_rate": 1.1740686028876487e-06, "loss": 0.85592973, "num_input_tokens_seen": 115687080, "step": 5376, "time_per_iteration": 2.718742609024048 }, { "auxiliary_loss_clip": 0.01225902, "auxiliary_loss_mlp": 0.01022982, "balance_loss_clip": 1.05187333, "balance_loss_mlp": 1.01616359, "epoch": 0.6465460229663921, "flos": 20813968800000.0, "grad_norm": 2.8525027511356766, "language_loss": 0.74911296, "learning_rate": 1.1733592197889507e-06, "loss": 0.77160186, "num_input_tokens_seen": 115703990, "step": 5377, "time_per_iteration": 2.6101062297821045 }, { "auxiliary_loss_clip": 0.01222714, "auxiliary_loss_mlp": 0.01024353, "balance_loss_clip": 1.04948461, "balance_loss_mlp": 1.0180794, "epoch": 0.6466662658570312, "flos": 22853299466880.0, "grad_norm": 2.565692617394344, "language_loss": 0.72882169, "learning_rate": 1.1726499620910014e-06, "loss": 0.75129235, "num_input_tokens_seen": 115724270, "step": 5378, "time_per_iteration": 2.6990389823913574 }, { "auxiliary_loss_clip": 0.01225325, "auxiliary_loss_mlp": 0.01028289, "balance_loss_clip": 1.04901457, "balance_loss_mlp": 1.02061796, "epoch": 0.6467865087476703, "flos": 15304553953920.0, "grad_norm": 3.430765639134419, "language_loss": 0.78471041, "learning_rate": 1.1719408299013955e-06, "loss": 0.80724645, "num_input_tokens_seen": 115742995, "step": 5379, "time_per_iteration": 2.6492514610290527 }, { "auxiliary_loss_clip": 0.01179078, "auxiliary_loss_mlp": 0.01027233, "balance_loss_clip": 1.05277228, "balance_loss_mlp": 1.01989245, "epoch": 0.6469067516383094, "flos": 19573650218880.0, "grad_norm": 2.2659522421839844, "language_loss": 0.76389015, "learning_rate": 1.1712318233277067e-06, "loss": 0.78595328, "num_input_tokens_seen": 115762015, "step": 5380, "time_per_iteration": 2.6937906742095947 }, { "auxiliary_loss_clip": 0.0112144, "auxiliary_loss_mlp": 0.01003221, "balance_loss_clip": 1.01114058, "balance_loss_mlp": 1.00214827, "epoch": 0.6470269945289485, "flos": 65098002522240.0, "grad_norm": 0.75160815782135, "language_loss": 0.57840848, "learning_rate": 1.1705229424774916e-06, "loss": 0.59965515, "num_input_tokens_seen": 115816285, "step": 5381, "time_per_iteration": 3.045072317123413 }, { "auxiliary_loss_clip": 0.01277431, "auxiliary_loss_mlp": 0.01034649, "balance_loss_clip": 1.0471741, "balance_loss_mlp": 1.02754402, "epoch": 0.6471472374195876, "flos": 30696943639680.0, "grad_norm": 2.0202776617357094, "language_loss": 0.64344656, "learning_rate": 1.1698141874582867e-06, "loss": 0.66656744, "num_input_tokens_seen": 115837330, "step": 5382, "time_per_iteration": 2.7180795669555664 }, { "auxiliary_loss_clip": 0.01178821, "auxiliary_loss_mlp": 0.01028386, "balance_loss_clip": 1.05262566, "balance_loss_mlp": 1.02101922, "epoch": 0.6472674803102266, "flos": 20521835487360.0, "grad_norm": 2.034748749924749, "language_loss": 0.72390306, "learning_rate": 1.169105558377609e-06, "loss": 0.74597514, "num_input_tokens_seen": 115857420, "step": 5383, "time_per_iteration": 2.5870606899261475 }, { "auxiliary_loss_clip": 0.01278582, "auxiliary_loss_mlp": 0.02565853, "balance_loss_clip": 1.05117393, "balance_loss_mlp": 0.99992359, "epoch": 0.6473877232008658, "flos": 24715447320960.0, "grad_norm": 2.708943284296548, "language_loss": 0.78717041, "learning_rate": 1.1683970553429587e-06, "loss": 0.82561475, "num_input_tokens_seen": 115878875, "step": 5384, "time_per_iteration": 2.7570509910583496 }, { "auxiliary_loss_clip": 0.01329172, "auxiliary_loss_mlp": 0.01026639, "balance_loss_clip": 1.04736793, "balance_loss_mlp": 1.0194422, "epoch": 0.6475079660915048, "flos": 15885552441600.0, "grad_norm": 5.653869737408974, "language_loss": 0.82087165, "learning_rate": 1.1676886784618128e-06, "loss": 0.84442973, "num_input_tokens_seen": 115895540, "step": 5385, "time_per_iteration": 2.654784917831421 }, { "auxiliary_loss_clip": 0.01226996, "auxiliary_loss_mlp": 0.01029774, "balance_loss_clip": 1.04971743, "balance_loss_mlp": 1.02268362, "epoch": 0.6476282089821439, "flos": 17381590922880.0, "grad_norm": 2.5817260941130957, "language_loss": 0.83703548, "learning_rate": 1.1669804278416332e-06, "loss": 0.85960317, "num_input_tokens_seen": 115910265, "step": 5386, "time_per_iteration": 2.6135501861572266 }, { "auxiliary_loss_clip": 0.01280067, "auxiliary_loss_mlp": 0.01022618, "balance_loss_clip": 1.04822266, "balance_loss_mlp": 1.01509607, "epoch": 0.6477484518727831, "flos": 20194078861440.0, "grad_norm": 1.9821179943034815, "language_loss": 0.71588248, "learning_rate": 1.1662723035898602e-06, "loss": 0.73890936, "num_input_tokens_seen": 115930025, "step": 5387, "time_per_iteration": 3.600111484527588 }, { "auxiliary_loss_clip": 0.01227508, "auxiliary_loss_mlp": 0.01023685, "balance_loss_clip": 1.05032039, "balance_loss_mlp": 1.01644325, "epoch": 0.6478686947634221, "flos": 25410426641280.0, "grad_norm": 2.1140001741649836, "language_loss": 0.82215172, "learning_rate": 1.165564305813915e-06, "loss": 0.84466362, "num_input_tokens_seen": 115949025, "step": 5388, "time_per_iteration": 3.5675952434539795 }, { "auxiliary_loss_clip": 0.01225622, "auxiliary_loss_mlp": 0.01022907, "balance_loss_clip": 1.0497731, "balance_loss_mlp": 1.01607597, "epoch": 0.6479889376540612, "flos": 20083581648000.0, "grad_norm": 1.856256842358605, "language_loss": 0.8139317, "learning_rate": 1.1648564346212019e-06, "loss": 0.83641696, "num_input_tokens_seen": 115968145, "step": 5389, "time_per_iteration": 2.6461098194122314 }, { "auxiliary_loss_clip": 0.01222353, "auxiliary_loss_mlp": 0.0102845, "balance_loss_clip": 1.05021536, "balance_loss_mlp": 1.02160764, "epoch": 0.6481091805447003, "flos": 26758082039040.0, "grad_norm": 2.5120302109855177, "language_loss": 0.75994474, "learning_rate": 1.164148690119104e-06, "loss": 0.78245282, "num_input_tokens_seen": 115989425, "step": 5390, "time_per_iteration": 3.6221113204956055 }, { "auxiliary_loss_clip": 0.01175864, "auxiliary_loss_mlp": 0.01026678, "balance_loss_clip": 1.05009258, "balance_loss_mlp": 1.01981747, "epoch": 0.6482294234353394, "flos": 23952094462080.0, "grad_norm": 1.9658036618130383, "language_loss": 0.74014735, "learning_rate": 1.163441072414985e-06, "loss": 0.76217276, "num_input_tokens_seen": 116009630, "step": 5391, "time_per_iteration": 2.580493688583374 }, { "auxiliary_loss_clip": 0.01225376, "auxiliary_loss_mlp": 0.01025615, "balance_loss_clip": 1.05036151, "balance_loss_mlp": 1.01870072, "epoch": 0.6483496663259785, "flos": 26209833776640.0, "grad_norm": 1.7519620705742083, "language_loss": 0.70102823, "learning_rate": 1.16273358161619e-06, "loss": 0.72353816, "num_input_tokens_seen": 116029965, "step": 5392, "time_per_iteration": 2.697293758392334 }, { "auxiliary_loss_clip": 0.01188735, "auxiliary_loss_mlp": 0.01025642, "balance_loss_clip": 1.05256462, "balance_loss_mlp": 1.01875782, "epoch": 0.6484699092166175, "flos": 20922239370240.0, "grad_norm": 2.1035885663388534, "language_loss": 0.83482474, "learning_rate": 1.1620262178300446e-06, "loss": 0.85696846, "num_input_tokens_seen": 116048580, "step": 5393, "time_per_iteration": 2.6464905738830566 }, { "auxiliary_loss_clip": 0.01326949, "auxiliary_loss_mlp": 0.01021068, "balance_loss_clip": 1.04554951, "balance_loss_mlp": 1.01434433, "epoch": 0.6485901521072567, "flos": 33072865678080.0, "grad_norm": 1.860060521558231, "language_loss": 0.76033175, "learning_rate": 1.1613189811638563e-06, "loss": 0.78381193, "num_input_tokens_seen": 116070305, "step": 5394, "time_per_iteration": 2.825573444366455 }, { "auxiliary_loss_clip": 0.01229912, "auxiliary_loss_mlp": 0.010252, "balance_loss_clip": 1.05102873, "balance_loss_mlp": 1.0184319, "epoch": 0.6487103949978957, "flos": 22274060745600.0, "grad_norm": 1.9498922523056847, "language_loss": 0.7799421, "learning_rate": 1.1606118717249117e-06, "loss": 0.80249321, "num_input_tokens_seen": 116090405, "step": 5395, "time_per_iteration": 3.601853132247925 }, { "auxiliary_loss_clip": 0.01182433, "auxiliary_loss_mlp": 0.01026461, "balance_loss_clip": 1.05101943, "balance_loss_mlp": 1.0188849, "epoch": 0.6488306378885348, "flos": 22930400010240.0, "grad_norm": 1.9683529651889502, "language_loss": 0.67961425, "learning_rate": 1.1599048896204787e-06, "loss": 0.70170319, "num_input_tokens_seen": 116110285, "step": 5396, "time_per_iteration": 2.5810952186584473 }, { "auxiliary_loss_clip": 0.0133643, "auxiliary_loss_mlp": 0.01028174, "balance_loss_clip": 1.0488646, "balance_loss_mlp": 1.0209738, "epoch": 0.648950880779174, "flos": 20376110010240.0, "grad_norm": 2.952466828239897, "language_loss": 0.81185448, "learning_rate": 1.1591980349578061e-06, "loss": 0.83550048, "num_input_tokens_seen": 116128955, "step": 5397, "time_per_iteration": 2.733032464981079 }, { "auxiliary_loss_clip": 0.01224289, "auxiliary_loss_mlp": 0.01000193, "balance_loss_clip": 1.009619, "balance_loss_mlp": 0.99914378, "epoch": 0.649071123669813, "flos": 59930889310080.0, "grad_norm": 0.7276237160985991, "language_loss": 0.5426712, "learning_rate": 1.158491307844123e-06, "loss": 0.56491601, "num_input_tokens_seen": 116188875, "step": 5398, "time_per_iteration": 3.208956718444824 }, { "auxiliary_loss_clip": 0.01275602, "auxiliary_loss_mlp": 0.01027188, "balance_loss_clip": 1.04791427, "balance_loss_mlp": 1.02047694, "epoch": 0.6491913665604521, "flos": 20446566537600.0, "grad_norm": 1.594479712874919, "language_loss": 0.84181064, "learning_rate": 1.1577847083866387e-06, "loss": 0.86483854, "num_input_tokens_seen": 116207910, "step": 5399, "time_per_iteration": 2.755687713623047 }, { "auxiliary_loss_clip": 0.01270548, "auxiliary_loss_mlp": 0.01025655, "balance_loss_clip": 1.04645693, "balance_loss_mlp": 1.01830542, "epoch": 0.6493116094510912, "flos": 16946820702720.0, "grad_norm": 2.3257036348853446, "language_loss": 0.72403407, "learning_rate": 1.1570782366925453e-06, "loss": 0.74699616, "num_input_tokens_seen": 116226425, "step": 5400, "time_per_iteration": 2.64321231842041 }, { "auxiliary_loss_clip": 0.0127907, "auxiliary_loss_mlp": 0.01026415, "balance_loss_clip": 1.04522979, "balance_loss_mlp": 1.01923871, "epoch": 0.6494318523417303, "flos": 18802935072000.0, "grad_norm": 2.068440738899054, "language_loss": 0.75223321, "learning_rate": 1.1563718928690132e-06, "loss": 0.77528805, "num_input_tokens_seen": 116243860, "step": 5401, "time_per_iteration": 2.6680045127868652 }, { "auxiliary_loss_clip": 0.01323928, "auxiliary_loss_mlp": 0.01023995, "balance_loss_clip": 1.04893732, "balance_loss_mlp": 1.01659226, "epoch": 0.6495520952323693, "flos": 18982847318400.0, "grad_norm": 2.0755369378996487, "language_loss": 0.71829271, "learning_rate": 1.1556656770231942e-06, "loss": 0.74177194, "num_input_tokens_seen": 116260055, "step": 5402, "time_per_iteration": 2.7182188034057617 }, { "auxiliary_loss_clip": 0.01226846, "auxiliary_loss_mlp": 0.01028273, "balance_loss_clip": 1.04714775, "balance_loss_mlp": 1.02162766, "epoch": 0.6496723381230085, "flos": 22745388032640.0, "grad_norm": 1.4878543450274582, "language_loss": 0.75991094, "learning_rate": 1.1549595892622207e-06, "loss": 0.78246212, "num_input_tokens_seen": 116278825, "step": 5403, "time_per_iteration": 2.6820600032806396 }, { "auxiliary_loss_clip": 0.01270153, "auxiliary_loss_mlp": 0.01002272, "balance_loss_clip": 1.01197076, "balance_loss_mlp": 1.00128865, "epoch": 0.6497925810136476, "flos": 62145283887360.0, "grad_norm": 0.8843696729932652, "language_loss": 0.58970755, "learning_rate": 1.1542536296932047e-06, "loss": 0.61243176, "num_input_tokens_seen": 116342360, "step": 5404, "time_per_iteration": 3.232889413833618 }, { "auxiliary_loss_clip": 0.01332371, "auxiliary_loss_mlp": 0.01028273, "balance_loss_clip": 1.04594159, "balance_loss_mlp": 1.02045298, "epoch": 0.6499128239042866, "flos": 20156731695360.0, "grad_norm": 1.9754430256653954, "language_loss": 0.69986326, "learning_rate": 1.1535477984232414e-06, "loss": 0.72346967, "num_input_tokens_seen": 116362235, "step": 5405, "time_per_iteration": 2.721411943435669 }, { "auxiliary_loss_clip": 0.01275345, "auxiliary_loss_mlp": 0.01026069, "balance_loss_clip": 1.04001045, "balance_loss_mlp": 1.01903284, "epoch": 0.6500330667949258, "flos": 24462420940800.0, "grad_norm": 1.9615277044579729, "language_loss": 0.76846433, "learning_rate": 1.152842095559404e-06, "loss": 0.79147851, "num_input_tokens_seen": 116382895, "step": 5406, "time_per_iteration": 2.8004376888275146 }, { "auxiliary_loss_clip": 0.01278252, "auxiliary_loss_mlp": 0.01032687, "balance_loss_clip": 1.04437375, "balance_loss_mlp": 1.02621651, "epoch": 0.6501533096855648, "flos": 25477399549440.0, "grad_norm": 4.636066553665009, "language_loss": 0.76876181, "learning_rate": 1.1521365212087474e-06, "loss": 0.79187119, "num_input_tokens_seen": 116402880, "step": 5407, "time_per_iteration": 2.69944429397583 }, { "auxiliary_loss_clip": 0.0122621, "auxiliary_loss_mlp": 0.01026341, "balance_loss_clip": 1.04747748, "balance_loss_mlp": 1.01921797, "epoch": 0.6502735525762039, "flos": 44819245347840.0, "grad_norm": 1.6003861585554322, "language_loss": 0.70790243, "learning_rate": 1.1514310754783062e-06, "loss": 0.73042792, "num_input_tokens_seen": 116425830, "step": 5408, "time_per_iteration": 2.8331291675567627 }, { "auxiliary_loss_clip": 0.01279082, "auxiliary_loss_mlp": 0.01031352, "balance_loss_clip": 1.0480547, "balance_loss_mlp": 1.02453601, "epoch": 0.6503937954668431, "flos": 28658546726400.0, "grad_norm": 2.3551354867753505, "language_loss": 0.73427004, "learning_rate": 1.1507257584750964e-06, "loss": 0.75737441, "num_input_tokens_seen": 116446010, "step": 5409, "time_per_iteration": 2.690790891647339 }, { "auxiliary_loss_clip": 0.01179674, "auxiliary_loss_mlp": 0.01032601, "balance_loss_clip": 1.05183864, "balance_loss_mlp": 1.02451324, "epoch": 0.6505140383574821, "flos": 20922562592640.0, "grad_norm": 1.9458585331150509, "language_loss": 0.7731567, "learning_rate": 1.150020570306113e-06, "loss": 0.7952795, "num_input_tokens_seen": 116465150, "step": 5410, "time_per_iteration": 2.6193864345550537 }, { "auxiliary_loss_clip": 0.01272256, "auxiliary_loss_mlp": 0.01029034, "balance_loss_clip": 1.04225099, "balance_loss_mlp": 1.02168202, "epoch": 0.6506342812481212, "flos": 20595236929920.0, "grad_norm": 1.9165760361507775, "language_loss": 0.74948072, "learning_rate": 1.1493155110783338e-06, "loss": 0.77249366, "num_input_tokens_seen": 116483675, "step": 5411, "time_per_iteration": 2.650876045227051 }, { "auxiliary_loss_clip": 0.0122683, "auxiliary_loss_mlp": 0.01028527, "balance_loss_clip": 1.04938745, "balance_loss_mlp": 1.02172649, "epoch": 0.6507545241387603, "flos": 30226478279040.0, "grad_norm": 2.5517627452400564, "language_loss": 0.70543474, "learning_rate": 1.1486105808987155e-06, "loss": 0.7279883, "num_input_tokens_seen": 116505165, "step": 5412, "time_per_iteration": 2.6841177940368652 }, { "auxiliary_loss_clip": 0.01228533, "auxiliary_loss_mlp": 0.01028387, "balance_loss_clip": 1.05018401, "balance_loss_mlp": 1.02093041, "epoch": 0.6508747670293994, "flos": 17128241320320.0, "grad_norm": 1.928626003285675, "language_loss": 0.81366372, "learning_rate": 1.1479057798741947e-06, "loss": 0.8362329, "num_input_tokens_seen": 116523220, "step": 5413, "time_per_iteration": 3.6132447719573975 }, { "auxiliary_loss_clip": 0.01187007, "auxiliary_loss_mlp": 0.01001746, "balance_loss_clip": 1.02009058, "balance_loss_mlp": 1.00075102, "epoch": 0.6509950099200384, "flos": 68559826573440.0, "grad_norm": 0.7806073404423542, "language_loss": 0.53261048, "learning_rate": 1.14720110811169e-06, "loss": 0.55449796, "num_input_tokens_seen": 116580450, "step": 5414, "time_per_iteration": 4.147408723831177 }, { "auxiliary_loss_clip": 0.01229998, "auxiliary_loss_mlp": 0.01028585, "balance_loss_clip": 1.05064404, "balance_loss_mlp": 1.02112818, "epoch": 0.6511152528106776, "flos": 22347462188160.0, "grad_norm": 4.955999817011386, "language_loss": 0.76824355, "learning_rate": 1.146496565718098e-06, "loss": 0.79082942, "num_input_tokens_seen": 116601020, "step": 5415, "time_per_iteration": 2.655353546142578 }, { "auxiliary_loss_clip": 0.01285383, "auxiliary_loss_mlp": 0.01031146, "balance_loss_clip": 1.05400395, "balance_loss_mlp": 1.0241785, "epoch": 0.6512354957013167, "flos": 20522158709760.0, "grad_norm": 2.120412173402629, "language_loss": 0.75945699, "learning_rate": 1.1457921528002996e-06, "loss": 0.78262222, "num_input_tokens_seen": 116619455, "step": 5416, "time_per_iteration": 3.587359666824341 }, { "auxiliary_loss_clip": 0.01179805, "auxiliary_loss_mlp": 0.02566258, "balance_loss_clip": 1.05088329, "balance_loss_mlp": 0.99996865, "epoch": 0.6513557385919557, "flos": 32337342881280.0, "grad_norm": 2.6100917975246962, "language_loss": 0.72412491, "learning_rate": 1.1450878694651522e-06, "loss": 0.76158559, "num_input_tokens_seen": 116640020, "step": 5417, "time_per_iteration": 2.679577112197876 }, { "auxiliary_loss_clip": 0.01374735, "auxiliary_loss_mlp": 0.01025889, "balance_loss_clip": 1.04128397, "balance_loss_mlp": 1.01844466, "epoch": 0.6514759814825949, "flos": 12093206417280.0, "grad_norm": 2.4622043516333028, "language_loss": 0.63214946, "learning_rate": 1.1443837158194954e-06, "loss": 0.65615571, "num_input_tokens_seen": 116655165, "step": 5418, "time_per_iteration": 2.7450952529907227 }, { "auxiliary_loss_clip": 0.01332207, "auxiliary_loss_mlp": 0.01034439, "balance_loss_clip": 1.05174291, "balance_loss_mlp": 1.02708364, "epoch": 0.651596224373234, "flos": 22526907557760.0, "grad_norm": 1.7840738608253814, "language_loss": 0.74496222, "learning_rate": 1.1436796919701484e-06, "loss": 0.76862872, "num_input_tokens_seen": 116673880, "step": 5419, "time_per_iteration": 2.688143730163574 }, { "auxiliary_loss_clip": 0.01285005, "auxiliary_loss_mlp": 0.01032247, "balance_loss_clip": 1.05306101, "balance_loss_mlp": 1.0252049, "epoch": 0.651716467263873, "flos": 27818955250560.0, "grad_norm": 1.903405810962586, "language_loss": 0.62004596, "learning_rate": 1.1429757980239115e-06, "loss": 0.64321852, "num_input_tokens_seen": 116694305, "step": 5420, "time_per_iteration": 2.732534885406494 }, { "auxiliary_loss_clip": 0.01182413, "auxiliary_loss_mlp": 0.01028814, "balance_loss_clip": 1.05148578, "balance_loss_mlp": 1.02125621, "epoch": 0.6518367101545122, "flos": 24316300414080.0, "grad_norm": 6.480381621836686, "language_loss": 0.81825805, "learning_rate": 1.1422720340875636e-06, "loss": 0.8403703, "num_input_tokens_seen": 116713055, "step": 5421, "time_per_iteration": 3.5740420818328857 }, { "auxiliary_loss_clip": 0.0123635, "auxiliary_loss_mlp": 0.01027464, "balance_loss_clip": 1.05239463, "balance_loss_mlp": 1.0198878, "epoch": 0.6519569530451512, "flos": 20011939971840.0, "grad_norm": 14.699523670367348, "language_loss": 0.79339868, "learning_rate": 1.1415684002678671e-06, "loss": 0.81603682, "num_input_tokens_seen": 116731815, "step": 5422, "time_per_iteration": 2.6206202507019043 }, { "auxiliary_loss_clip": 0.01280842, "auxiliary_loss_mlp": 0.01024019, "balance_loss_clip": 1.04669034, "balance_loss_mlp": 1.0166043, "epoch": 0.6520771959357903, "flos": 21576064682880.0, "grad_norm": 2.326401052622643, "language_loss": 0.77597362, "learning_rate": 1.1408648966715617e-06, "loss": 0.7990222, "num_input_tokens_seen": 116749335, "step": 5423, "time_per_iteration": 2.6803457736968994 }, { "auxiliary_loss_clip": 0.01276693, "auxiliary_loss_mlp": 0.01029872, "balance_loss_clip": 1.04242778, "balance_loss_mlp": 1.02232325, "epoch": 0.6521974388264293, "flos": 22711021695360.0, "grad_norm": 2.854982404129945, "language_loss": 0.72517246, "learning_rate": 1.1401615234053683e-06, "loss": 0.74823809, "num_input_tokens_seen": 116768155, "step": 5424, "time_per_iteration": 2.6611742973327637 }, { "auxiliary_loss_clip": 0.01279635, "auxiliary_loss_mlp": 0.01023218, "balance_loss_clip": 1.04766488, "balance_loss_mlp": 1.01577938, "epoch": 0.6523176817170685, "flos": 23002939526400.0, "grad_norm": 1.8549520358833953, "language_loss": 0.75670898, "learning_rate": 1.1394582805759885e-06, "loss": 0.77973747, "num_input_tokens_seen": 116787435, "step": 5425, "time_per_iteration": 2.75006103515625 }, { "auxiliary_loss_clip": 0.01226582, "auxiliary_loss_mlp": 0.01027954, "balance_loss_clip": 1.05147076, "balance_loss_mlp": 1.02059639, "epoch": 0.6524379246077076, "flos": 21688249835520.0, "grad_norm": 1.7209484559770951, "language_loss": 0.75823462, "learning_rate": 1.1387551682901022e-06, "loss": 0.78078002, "num_input_tokens_seen": 116808040, "step": 5426, "time_per_iteration": 2.6242623329162598 }, { "auxiliary_loss_clip": 0.01323016, "auxiliary_loss_mlp": 0.01026447, "balance_loss_clip": 1.04593408, "balance_loss_mlp": 1.01950932, "epoch": 0.6525581674983466, "flos": 19390936711680.0, "grad_norm": 2.5557549654219467, "language_loss": 0.70803583, "learning_rate": 1.138052186654373e-06, "loss": 0.73153055, "num_input_tokens_seen": 116825510, "step": 5427, "time_per_iteration": 2.7227444648742676 }, { "auxiliary_loss_clip": 0.01281156, "auxiliary_loss_mlp": 0.01027443, "balance_loss_clip": 1.04778504, "balance_loss_mlp": 1.0198853, "epoch": 0.6526784103889858, "flos": 17165444832000.0, "grad_norm": 2.444798574118974, "language_loss": 0.87969351, "learning_rate": 1.1373493357754417e-06, "loss": 0.90277946, "num_input_tokens_seen": 116844415, "step": 5428, "time_per_iteration": 2.643268346786499 }, { "auxiliary_loss_clip": 0.01177988, "auxiliary_loss_mlp": 0.01025419, "balance_loss_clip": 1.04971027, "balance_loss_mlp": 1.01830816, "epoch": 0.6527986532796248, "flos": 18989168112000.0, "grad_norm": 1.8800330360825315, "language_loss": 0.77596676, "learning_rate": 1.1366466157599303e-06, "loss": 0.79800081, "num_input_tokens_seen": 116863690, "step": 5429, "time_per_iteration": 2.6175243854522705 }, { "auxiliary_loss_clip": 0.01373468, "auxiliary_loss_mlp": 0.02567064, "balance_loss_clip": 1.04483056, "balance_loss_mlp": 0.99996245, "epoch": 0.6529188961702639, "flos": 14238581011200.0, "grad_norm": 3.170309959805079, "language_loss": 0.7646805, "learning_rate": 1.1359440267144412e-06, "loss": 0.80408579, "num_input_tokens_seen": 116881145, "step": 5430, "time_per_iteration": 2.7027971744537354 }, { "auxiliary_loss_clip": 0.01230383, "auxiliary_loss_mlp": 0.0102524, "balance_loss_clip": 1.04906821, "balance_loss_mlp": 1.01809955, "epoch": 0.653039139060903, "flos": 36682929158400.0, "grad_norm": 3.0913279831993696, "language_loss": 0.74007869, "learning_rate": 1.1352415687455556e-06, "loss": 0.76263499, "num_input_tokens_seen": 116902405, "step": 5431, "time_per_iteration": 2.7686328887939453 }, { "auxiliary_loss_clip": 0.0122882, "auxiliary_loss_mlp": 0.01028993, "balance_loss_clip": 1.05229878, "balance_loss_mlp": 1.02137852, "epoch": 0.6531593819515421, "flos": 25376275785600.0, "grad_norm": 2.751591341834461, "language_loss": 0.63648117, "learning_rate": 1.1345392419598362e-06, "loss": 0.65905929, "num_input_tokens_seen": 116921285, "step": 5432, "time_per_iteration": 2.6842434406280518 }, { "auxiliary_loss_clip": 0.01225376, "auxiliary_loss_mlp": 0.0102624, "balance_loss_clip": 1.04724002, "balance_loss_mlp": 1.0192157, "epoch": 0.6532796248421812, "flos": 21178533888000.0, "grad_norm": 2.151936229129589, "language_loss": 0.71875679, "learning_rate": 1.1338370464638263e-06, "loss": 0.74127293, "num_input_tokens_seen": 116940685, "step": 5433, "time_per_iteration": 2.607154130935669 }, { "auxiliary_loss_clip": 0.01177789, "auxiliary_loss_mlp": 0.01023164, "balance_loss_clip": 1.0500828, "balance_loss_mlp": 1.01604104, "epoch": 0.6533998677328203, "flos": 17675950878720.0, "grad_norm": 2.502683706857755, "language_loss": 0.63423592, "learning_rate": 1.1331349823640474e-06, "loss": 0.65624541, "num_input_tokens_seen": 116958115, "step": 5434, "time_per_iteration": 2.5923633575439453 }, { "auxiliary_loss_clip": 0.01230486, "auxiliary_loss_mlp": 0.025613, "balance_loss_clip": 1.05113053, "balance_loss_mlp": 0.99998116, "epoch": 0.6535201106234594, "flos": 28400384701440.0, "grad_norm": 2.306020775860141, "language_loss": 0.78107834, "learning_rate": 1.132433049767003e-06, "loss": 0.81899619, "num_input_tokens_seen": 116976030, "step": 5435, "time_per_iteration": 2.7102584838867188 }, { "auxiliary_loss_clip": 0.01273893, "auxiliary_loss_mlp": 0.01027516, "balance_loss_clip": 1.0480907, "balance_loss_mlp": 1.02074742, "epoch": 0.6536403535140984, "flos": 23586667447680.0, "grad_norm": 1.5403289652308572, "language_loss": 0.81058347, "learning_rate": 1.1317312487791748e-06, "loss": 0.83359754, "num_input_tokens_seen": 116997680, "step": 5436, "time_per_iteration": 2.6879794597625732 }, { "auxiliary_loss_clip": 0.01225619, "auxiliary_loss_mlp": 0.01023324, "balance_loss_clip": 1.04639959, "balance_loss_mlp": 1.01612377, "epoch": 0.6537605964047376, "flos": 21579476474880.0, "grad_norm": 1.851989820708844, "language_loss": 0.73142278, "learning_rate": 1.1310295795070253e-06, "loss": 0.75391221, "num_input_tokens_seen": 117017620, "step": 5437, "time_per_iteration": 2.5850019454956055 }, { "auxiliary_loss_clip": 0.01281326, "auxiliary_loss_mlp": 0.01022766, "balance_loss_clip": 1.04324019, "balance_loss_mlp": 1.01566982, "epoch": 0.6538808392953767, "flos": 26833997433600.0, "grad_norm": 1.7869915383695245, "language_loss": 0.81111264, "learning_rate": 1.1303280420569982e-06, "loss": 0.83415353, "num_input_tokens_seen": 117039505, "step": 5438, "time_per_iteration": 2.7777106761932373 }, { "auxiliary_loss_clip": 0.01225659, "auxiliary_loss_mlp": 0.01023986, "balance_loss_clip": 1.04874146, "balance_loss_mlp": 1.01674366, "epoch": 0.6540010821860157, "flos": 30738241301760.0, "grad_norm": 1.885830325799197, "language_loss": 0.77422762, "learning_rate": 1.1296266365355158e-06, "loss": 0.79672402, "num_input_tokens_seen": 117062890, "step": 5439, "time_per_iteration": 3.52317214012146 }, { "auxiliary_loss_clip": 0.01330105, "auxiliary_loss_mlp": 0.01030467, "balance_loss_clip": 1.0482465, "balance_loss_mlp": 1.02273607, "epoch": 0.6541213250766549, "flos": 26907147480960.0, "grad_norm": 2.0073581796269155, "language_loss": 0.73465717, "learning_rate": 1.1289253630489806e-06, "loss": 0.75826287, "num_input_tokens_seen": 117083940, "step": 5440, "time_per_iteration": 3.6282200813293457 }, { "auxiliary_loss_clip": 0.01233877, "auxiliary_loss_mlp": 0.01032998, "balance_loss_clip": 1.04880667, "balance_loss_mlp": 1.0247848, "epoch": 0.6542415679672939, "flos": 19172384409600.0, "grad_norm": 10.493789351798704, "language_loss": 0.72855526, "learning_rate": 1.1282242217037753e-06, "loss": 0.75122404, "num_input_tokens_seen": 117101440, "step": 5441, "time_per_iteration": 2.6482295989990234 }, { "auxiliary_loss_clip": 0.01374916, "auxiliary_loss_mlp": 0.01030344, "balance_loss_clip": 1.04128003, "balance_loss_mlp": 1.02286947, "epoch": 0.654361810857933, "flos": 48173517100800.0, "grad_norm": 1.7070326702621832, "language_loss": 0.62038243, "learning_rate": 1.127523212606262e-06, "loss": 0.64443505, "num_input_tokens_seen": 117124265, "step": 5442, "time_per_iteration": 3.8822784423828125 }, { "auxiliary_loss_clip": 0.01223403, "auxiliary_loss_mlp": 0.01029346, "balance_loss_clip": 1.04732311, "balance_loss_mlp": 1.02217007, "epoch": 0.6544820537485722, "flos": 26943165843840.0, "grad_norm": 2.2415187869201776, "language_loss": 0.73452568, "learning_rate": 1.1268223358627835e-06, "loss": 0.75705314, "num_input_tokens_seen": 117146755, "step": 5443, "time_per_iteration": 2.6701555252075195 }, { "auxiliary_loss_clip": 0.01177097, "auxiliary_loss_mlp": 0.01023173, "balance_loss_clip": 1.04918349, "balance_loss_mlp": 1.01571047, "epoch": 0.6546022966392112, "flos": 20886328748160.0, "grad_norm": 2.043749796469184, "language_loss": 0.7232517, "learning_rate": 1.126121591579663e-06, "loss": 0.7452544, "num_input_tokens_seen": 117165960, "step": 5444, "time_per_iteration": 2.587313652038574 }, { "auxiliary_loss_clip": 0.01224303, "auxiliary_loss_mlp": 0.01028213, "balance_loss_clip": 1.05015612, "balance_loss_mlp": 1.02135575, "epoch": 0.6547225395298503, "flos": 24936693143040.0, "grad_norm": 1.8017052151299981, "language_loss": 0.6930272, "learning_rate": 1.1254209798632018e-06, "loss": 0.71555233, "num_input_tokens_seen": 117186980, "step": 5445, "time_per_iteration": 2.7308852672576904 }, { "auxiliary_loss_clip": 0.01417698, "auxiliary_loss_mlp": 0.0102673, "balance_loss_clip": 1.04107296, "balance_loss_mlp": 1.01925516, "epoch": 0.6548427824204894, "flos": 22565942663040.0, "grad_norm": 2.0190456340500558, "language_loss": 0.84617686, "learning_rate": 1.124720500819683e-06, "loss": 0.87062114, "num_input_tokens_seen": 117205135, "step": 5446, "time_per_iteration": 2.887286901473999 }, { "auxiliary_loss_clip": 0.01180559, "auxiliary_loss_mlp": 0.01028793, "balance_loss_clip": 1.05230165, "balance_loss_mlp": 1.02150881, "epoch": 0.6549630253111285, "flos": 18442500048000.0, "grad_norm": 1.8159591016737833, "language_loss": 0.82501167, "learning_rate": 1.1240201545553682e-06, "loss": 0.84710521, "num_input_tokens_seen": 117222935, "step": 5447, "time_per_iteration": 3.6231441497802734 }, { "auxiliary_loss_clip": 0.01325051, "auxiliary_loss_mlp": 0.01032113, "balance_loss_clip": 1.04756856, "balance_loss_mlp": 1.02486467, "epoch": 0.6550832682017675, "flos": 25187313312000.0, "grad_norm": 1.7698258579747173, "language_loss": 0.7321533, "learning_rate": 1.1233199411764987e-06, "loss": 0.75572491, "num_input_tokens_seen": 117242370, "step": 5448, "time_per_iteration": 2.684396505355835 }, { "auxiliary_loss_clip": 0.01318192, "auxiliary_loss_mlp": 0.01023339, "balance_loss_clip": 1.04289639, "balance_loss_mlp": 1.0164963, "epoch": 0.6552035110924067, "flos": 22748153379840.0, "grad_norm": 2.2471085273305813, "language_loss": 0.68732458, "learning_rate": 1.1226198607892978e-06, "loss": 0.71073991, "num_input_tokens_seen": 117262930, "step": 5449, "time_per_iteration": 2.666257619857788 }, { "auxiliary_loss_clip": 0.01374545, "auxiliary_loss_mlp": 0.01023908, "balance_loss_clip": 1.04552627, "balance_loss_mlp": 1.01763463, "epoch": 0.6553237539830458, "flos": 21799178012160.0, "grad_norm": 1.8457519126083552, "language_loss": 0.80269736, "learning_rate": 1.1219199134999664e-06, "loss": 0.82668191, "num_input_tokens_seen": 117281430, "step": 5450, "time_per_iteration": 2.742067337036133 }, { "auxiliary_loss_clip": 0.01180523, "auxiliary_loss_mlp": 0.0103023, "balance_loss_clip": 1.0485214, "balance_loss_mlp": 1.02252293, "epoch": 0.6554439968736848, "flos": 20887226588160.0, "grad_norm": 5.357322027637243, "language_loss": 0.78694856, "learning_rate": 1.1212200994146863e-06, "loss": 0.80905604, "num_input_tokens_seen": 117299185, "step": 5451, "time_per_iteration": 2.6347031593322754 }, { "auxiliary_loss_clip": 0.0132486, "auxiliary_loss_mlp": 0.01026742, "balance_loss_clip": 1.0401001, "balance_loss_mlp": 1.01977706, "epoch": 0.655564239764324, "flos": 16139045698560.0, "grad_norm": 1.7916631768620825, "language_loss": 0.75622189, "learning_rate": 1.120520418639618e-06, "loss": 0.77973795, "num_input_tokens_seen": 117317720, "step": 5452, "time_per_iteration": 2.6369264125823975 }, { "auxiliary_loss_clip": 0.01227682, "auxiliary_loss_mlp": 0.01025483, "balance_loss_clip": 1.05075884, "balance_loss_mlp": 1.01843476, "epoch": 0.655684482654963, "flos": 29570354496000.0, "grad_norm": 2.365254205156692, "language_loss": 0.8375107, "learning_rate": 1.119820871280903e-06, "loss": 0.86004245, "num_input_tokens_seen": 117338795, "step": 5453, "time_per_iteration": 2.6866512298583984 }, { "auxiliary_loss_clip": 0.01226729, "auxiliary_loss_mlp": 0.01027573, "balance_loss_clip": 1.04771328, "balance_loss_mlp": 1.02041721, "epoch": 0.6558047255456021, "flos": 29789409588480.0, "grad_norm": 2.7070796565877036, "language_loss": 0.73645258, "learning_rate": 1.1191214574446614e-06, "loss": 0.75899559, "num_input_tokens_seen": 117359040, "step": 5454, "time_per_iteration": 2.633678674697876 }, { "auxiliary_loss_clip": 0.01270436, "auxiliary_loss_mlp": 0.01026973, "balance_loss_clip": 1.04465902, "balance_loss_mlp": 1.01957035, "epoch": 0.6559249684362413, "flos": 29059166090880.0, "grad_norm": 2.774114463800792, "language_loss": 0.80145884, "learning_rate": 1.118422177236995e-06, "loss": 0.82443297, "num_input_tokens_seen": 117380865, "step": 5455, "time_per_iteration": 2.719125270843506 }, { "auxiliary_loss_clip": 0.01276532, "auxiliary_loss_mlp": 0.01026127, "balance_loss_clip": 1.04644585, "balance_loss_mlp": 1.01869476, "epoch": 0.6560452113268803, "flos": 20225464369920.0, "grad_norm": 26.609557876332538, "language_loss": 0.85800976, "learning_rate": 1.1177230307639835e-06, "loss": 0.8810364, "num_input_tokens_seen": 117398405, "step": 5456, "time_per_iteration": 2.6332247257232666 }, { "auxiliary_loss_clip": 0.01321591, "auxiliary_loss_mlp": 0.01033101, "balance_loss_clip": 1.04380322, "balance_loss_mlp": 1.0264256, "epoch": 0.6561654542175194, "flos": 25045538330880.0, "grad_norm": 1.6583573679959318, "language_loss": 0.78998101, "learning_rate": 1.1170240181316865e-06, "loss": 0.81352794, "num_input_tokens_seen": 117419850, "step": 5457, "time_per_iteration": 2.744224786758423 }, { "auxiliary_loss_clip": 0.01322232, "auxiliary_loss_mlp": 0.01026955, "balance_loss_clip": 1.04162991, "balance_loss_mlp": 1.01956964, "epoch": 0.6562856971081584, "flos": 22856711258880.0, "grad_norm": 2.1062334415376616, "language_loss": 0.78848195, "learning_rate": 1.1163251394461442e-06, "loss": 0.81197387, "num_input_tokens_seen": 117438330, "step": 5458, "time_per_iteration": 2.6836535930633545 }, { "auxiliary_loss_clip": 0.01224048, "auxiliary_loss_mlp": 0.01028055, "balance_loss_clip": 1.04674816, "balance_loss_mlp": 1.02003241, "epoch": 0.6564059399987976, "flos": 18872565586560.0, "grad_norm": 1.9093226040754003, "language_loss": 0.82462692, "learning_rate": 1.1156263948133746e-06, "loss": 0.84714794, "num_input_tokens_seen": 117454985, "step": 5459, "time_per_iteration": 2.6298000812530518 }, { "auxiliary_loss_clip": 0.01367695, "auxiliary_loss_mlp": 0.02566053, "balance_loss_clip": 1.04330742, "balance_loss_mlp": 0.99995399, "epoch": 0.6565261828894366, "flos": 25484187219840.0, "grad_norm": 1.8022589529770474, "language_loss": 0.77475584, "learning_rate": 1.1149277843393787e-06, "loss": 0.81409329, "num_input_tokens_seen": 117476145, "step": 5460, "time_per_iteration": 2.7474470138549805 }, { "auxiliary_loss_clip": 0.01326672, "auxiliary_loss_mlp": 0.02565902, "balance_loss_clip": 1.03876173, "balance_loss_mlp": 0.99999362, "epoch": 0.6566464257800757, "flos": 19683500987520.0, "grad_norm": 2.576168140481698, "language_loss": 0.63363242, "learning_rate": 1.1142293081301342e-06, "loss": 0.67255813, "num_input_tokens_seen": 117494025, "step": 5461, "time_per_iteration": 2.8075101375579834 }, { "auxiliary_loss_clip": 0.0127173, "auxiliary_loss_mlp": 0.01023152, "balance_loss_clip": 1.045928, "balance_loss_mlp": 1.01616311, "epoch": 0.6567666686707149, "flos": 23514127931520.0, "grad_norm": 1.9498911895038764, "language_loss": 0.6799826, "learning_rate": 1.1135309662915995e-06, "loss": 0.7029314, "num_input_tokens_seen": 117514190, "step": 5462, "time_per_iteration": 2.6712820529937744 }, { "auxiliary_loss_clip": 0.01276379, "auxiliary_loss_mlp": 0.0102691, "balance_loss_clip": 1.04165447, "balance_loss_mlp": 1.01971591, "epoch": 0.6568869115613539, "flos": 32781342896640.0, "grad_norm": 2.986990336406726, "language_loss": 0.60419726, "learning_rate": 1.112832758929712e-06, "loss": 0.62723017, "num_input_tokens_seen": 117536800, "step": 5463, "time_per_iteration": 2.812462568283081 }, { "auxiliary_loss_clip": 0.01224842, "auxiliary_loss_mlp": 0.01030544, "balance_loss_clip": 1.04855466, "balance_loss_mlp": 1.02333212, "epoch": 0.657007154451993, "flos": 18442428220800.0, "grad_norm": 1.9068719320345653, "language_loss": 0.74978077, "learning_rate": 1.11213468615039e-06, "loss": 0.77233469, "num_input_tokens_seen": 117556230, "step": 5464, "time_per_iteration": 2.6571929454803467 }, { "auxiliary_loss_clip": 0.01322208, "auxiliary_loss_mlp": 0.01026333, "balance_loss_clip": 1.04054785, "balance_loss_mlp": 1.01951385, "epoch": 0.6571273973426321, "flos": 25156717902720.0, "grad_norm": 2.1302425531170845, "language_loss": 0.75519741, "learning_rate": 1.1114367480595292e-06, "loss": 0.77868283, "num_input_tokens_seen": 117577310, "step": 5465, "time_per_iteration": 3.842916250228882 }, { "auxiliary_loss_clip": 0.01426748, "auxiliary_loss_mlp": 0.01028197, "balance_loss_clip": 1.04470587, "balance_loss_mlp": 1.02020454, "epoch": 0.6572476402332712, "flos": 17529830352000.0, "grad_norm": 2.91766769542174, "language_loss": 0.81288111, "learning_rate": 1.1107389447630086e-06, "loss": 0.8374306, "num_input_tokens_seen": 117596010, "step": 5466, "time_per_iteration": 2.8885416984558105 }, { "auxiliary_loss_clip": 0.01276524, "auxiliary_loss_mlp": 0.02562854, "balance_loss_clip": 1.04521227, "balance_loss_mlp": 1.00003326, "epoch": 0.6573678831239103, "flos": 17014260487680.0, "grad_norm": 2.601936796310565, "language_loss": 0.78409708, "learning_rate": 1.1100412763666818e-06, "loss": 0.82249081, "num_input_tokens_seen": 117611270, "step": 5467, "time_per_iteration": 2.6148431301116943 }, { "auxiliary_loss_clip": 0.01278702, "auxiliary_loss_mlp": 0.01025205, "balance_loss_clip": 1.0487386, "balance_loss_mlp": 1.01793957, "epoch": 0.6574881260145494, "flos": 23910078528000.0, "grad_norm": 1.4414680272125278, "language_loss": 0.8012858, "learning_rate": 1.1093437429763865e-06, "loss": 0.82432485, "num_input_tokens_seen": 117631535, "step": 5468, "time_per_iteration": 3.6090495586395264 }, { "auxiliary_loss_clip": 0.01223864, "auxiliary_loss_mlp": 0.01028132, "balance_loss_clip": 1.04882431, "balance_loss_mlp": 1.02141118, "epoch": 0.6576083689051885, "flos": 11218458504960.0, "grad_norm": 18.990648333320188, "language_loss": 0.73066866, "learning_rate": 1.1086463446979361e-06, "loss": 0.75318861, "num_input_tokens_seen": 117649885, "step": 5469, "time_per_iteration": 2.6246602535247803 }, { "auxiliary_loss_clip": 0.01232251, "auxiliary_loss_mlp": 0.01028751, "balance_loss_clip": 1.05377316, "balance_loss_mlp": 1.020818, "epoch": 0.6577286117958275, "flos": 22455553190400.0, "grad_norm": 6.855671173346874, "language_loss": 0.77268797, "learning_rate": 1.1079490816371277e-06, "loss": 0.79529798, "num_input_tokens_seen": 117669650, "step": 5470, "time_per_iteration": 2.589524507522583 }, { "auxiliary_loss_clip": 0.01227835, "auxiliary_loss_mlp": 0.02565632, "balance_loss_clip": 1.04872978, "balance_loss_mlp": 1.00001144, "epoch": 0.6578488546864667, "flos": 21872184405120.0, "grad_norm": 1.961221999880228, "language_loss": 0.74683893, "learning_rate": 1.1072519538997352e-06, "loss": 0.78477353, "num_input_tokens_seen": 117688790, "step": 5471, "time_per_iteration": 2.620088577270508 }, { "auxiliary_loss_clip": 0.01275959, "auxiliary_loss_mlp": 0.0102178, "balance_loss_clip": 1.04392087, "balance_loss_mlp": 1.01488721, "epoch": 0.6579690975771058, "flos": 23543753673600.0, "grad_norm": 1.9026035522808549, "language_loss": 0.82588035, "learning_rate": 1.1065549615915095e-06, "loss": 0.84885776, "num_input_tokens_seen": 117708620, "step": 5472, "time_per_iteration": 2.7519476413726807 }, { "auxiliary_loss_clip": 0.01228487, "auxiliary_loss_mlp": 0.01026018, "balance_loss_clip": 1.0528034, "balance_loss_mlp": 1.01852584, "epoch": 0.6580893404677448, "flos": 32743995730560.0, "grad_norm": 2.6792391778299716, "language_loss": 0.78506422, "learning_rate": 1.105858104818187e-06, "loss": 0.80760932, "num_input_tokens_seen": 117729775, "step": 5473, "time_per_iteration": 3.55314302444458 }, { "auxiliary_loss_clip": 0.01232849, "auxiliary_loss_mlp": 0.01028539, "balance_loss_clip": 1.05039191, "balance_loss_mlp": 1.02083206, "epoch": 0.658209583358384, "flos": 15888138220800.0, "grad_norm": 2.6030154700700203, "language_loss": 0.75067854, "learning_rate": 1.105161383685478e-06, "loss": 0.77329242, "num_input_tokens_seen": 117746160, "step": 5474, "time_per_iteration": 2.6238620281219482 }, { "auxiliary_loss_clip": 0.01218791, "auxiliary_loss_mlp": 0.00999148, "balance_loss_clip": 1.00924635, "balance_loss_mlp": 0.9982416, "epoch": 0.658329826249023, "flos": 62695902447360.0, "grad_norm": 0.7256989419024298, "language_loss": 0.56357932, "learning_rate": 1.1044647982990771e-06, "loss": 0.58575869, "num_input_tokens_seen": 117808045, "step": 5475, "time_per_iteration": 3.1921029090881348 }, { "auxiliary_loss_clip": 0.01274808, "auxiliary_loss_mlp": 0.01027845, "balance_loss_clip": 1.04844105, "balance_loss_mlp": 1.02065921, "epoch": 0.6584500691396621, "flos": 31722624501120.0, "grad_norm": 2.4250154459510114, "language_loss": 0.64347148, "learning_rate": 1.1037683487646536e-06, "loss": 0.66649806, "num_input_tokens_seen": 117828330, "step": 5476, "time_per_iteration": 2.759403705596924 }, { "auxiliary_loss_clip": 0.01273766, "auxiliary_loss_mlp": 0.02564117, "balance_loss_clip": 1.04998446, "balance_loss_mlp": 0.99995828, "epoch": 0.6585703120303013, "flos": 18406086635520.0, "grad_norm": 2.1250114595909015, "language_loss": 0.77019048, "learning_rate": 1.1030720351878583e-06, "loss": 0.80856931, "num_input_tokens_seen": 117846450, "step": 5477, "time_per_iteration": 2.625041961669922 }, { "auxiliary_loss_clip": 0.01173574, "auxiliary_loss_mlp": 0.01001554, "balance_loss_clip": 1.00968719, "balance_loss_mlp": 1.00053501, "epoch": 0.6586905549209403, "flos": 58309880434560.0, "grad_norm": 0.8066425328485106, "language_loss": 0.57590562, "learning_rate": 1.102375857674323e-06, "loss": 0.59765691, "num_input_tokens_seen": 117908365, "step": 5478, "time_per_iteration": 3.2424581050872803 }, { "auxiliary_loss_clip": 0.01276464, "auxiliary_loss_mlp": 0.01027496, "balance_loss_clip": 1.04453194, "balance_loss_mlp": 1.02042687, "epoch": 0.6588107978115794, "flos": 22782627457920.0, "grad_norm": 1.770267910107722, "language_loss": 0.89952385, "learning_rate": 1.1016798163296561e-06, "loss": 0.92256343, "num_input_tokens_seen": 117927565, "step": 5479, "time_per_iteration": 2.643224000930786 }, { "auxiliary_loss_clip": 0.01129812, "auxiliary_loss_mlp": 0.01031, "balance_loss_clip": 1.05000925, "balance_loss_mlp": 1.02401137, "epoch": 0.6589310407022185, "flos": 20667525050880.0, "grad_norm": 5.388688971263541, "language_loss": 0.66506004, "learning_rate": 1.1009839112594471e-06, "loss": 0.68666822, "num_input_tokens_seen": 117945590, "step": 5480, "time_per_iteration": 2.630459785461426 }, { "auxiliary_loss_clip": 0.01229078, "auxiliary_loss_mlp": 0.01023776, "balance_loss_clip": 1.0497328, "balance_loss_mlp": 1.01674581, "epoch": 0.6590512835928576, "flos": 25630595055360.0, "grad_norm": 2.296207521511213, "language_loss": 0.72355533, "learning_rate": 1.1002881425692638e-06, "loss": 0.74608386, "num_input_tokens_seen": 117966020, "step": 5481, "time_per_iteration": 2.6454477310180664 }, { "auxiliary_loss_clip": 0.0122495, "auxiliary_loss_mlp": 0.01028061, "balance_loss_clip": 1.04627693, "balance_loss_mlp": 1.02073002, "epoch": 0.6591715264834966, "flos": 23726108044800.0, "grad_norm": 1.7255171801342932, "language_loss": 0.7519381, "learning_rate": 1.0995925103646532e-06, "loss": 0.7744683, "num_input_tokens_seen": 117984620, "step": 5482, "time_per_iteration": 2.635377883911133 }, { "auxiliary_loss_clip": 0.01320378, "auxiliary_loss_mlp": 0.01026638, "balance_loss_clip": 1.04604435, "balance_loss_mlp": 1.01972997, "epoch": 0.6592917693741358, "flos": 35773850822400.0, "grad_norm": 1.488041581069915, "language_loss": 0.67109591, "learning_rate": 1.0988970147511437e-06, "loss": 0.69456607, "num_input_tokens_seen": 118006500, "step": 5483, "time_per_iteration": 2.8348073959350586 }, { "auxiliary_loss_clip": 0.01274273, "auxiliary_loss_mlp": 0.01023971, "balance_loss_clip": 1.0486474, "balance_loss_mlp": 1.01665187, "epoch": 0.6594120122647749, "flos": 21396834794880.0, "grad_norm": 2.7070497665171604, "language_loss": 0.80232364, "learning_rate": 1.0982016558342405e-06, "loss": 0.82530606, "num_input_tokens_seen": 118025470, "step": 5484, "time_per_iteration": 2.6559014320373535 }, { "auxiliary_loss_clip": 0.01179223, "auxiliary_loss_mlp": 0.01027246, "balance_loss_clip": 1.0526526, "balance_loss_mlp": 1.02045727, "epoch": 0.6595322551554139, "flos": 19351829779200.0, "grad_norm": 7.648347431674363, "language_loss": 0.71658325, "learning_rate": 1.0975064337194291e-06, "loss": 0.73864794, "num_input_tokens_seen": 118043515, "step": 5485, "time_per_iteration": 2.6056840419769287 }, { "auxiliary_loss_clip": 0.01329776, "auxiliary_loss_mlp": 0.01027732, "balance_loss_clip": 1.04834938, "balance_loss_mlp": 1.02073455, "epoch": 0.6596524980460531, "flos": 16837113588480.0, "grad_norm": 2.185468450648023, "language_loss": 0.7033155, "learning_rate": 1.0968113485121743e-06, "loss": 0.72689056, "num_input_tokens_seen": 118063105, "step": 5486, "time_per_iteration": 2.7169063091278076 }, { "auxiliary_loss_clip": 0.01229027, "auxiliary_loss_mlp": 0.02567182, "balance_loss_clip": 1.04760838, "balance_loss_mlp": 0.99997717, "epoch": 0.6597727409366921, "flos": 21798567480960.0, "grad_norm": 2.7532837228679226, "language_loss": 0.80023277, "learning_rate": 1.0961164003179185e-06, "loss": 0.83819485, "num_input_tokens_seen": 118081615, "step": 5487, "time_per_iteration": 2.6531145572662354 }, { "auxiliary_loss_clip": 0.01328394, "auxiliary_loss_mlp": 0.01024856, "balance_loss_clip": 1.04504502, "balance_loss_mlp": 1.01790035, "epoch": 0.6598929838273312, "flos": 23730704985600.0, "grad_norm": 2.357185210308578, "language_loss": 0.84051317, "learning_rate": 1.0954215892420884e-06, "loss": 0.86404562, "num_input_tokens_seen": 118102315, "step": 5488, "time_per_iteration": 2.698723793029785 }, { "auxiliary_loss_clip": 0.01329301, "auxiliary_loss_mlp": 0.01024688, "balance_loss_clip": 1.04846406, "balance_loss_mlp": 1.01740456, "epoch": 0.6600132267179702, "flos": 19974520978560.0, "grad_norm": 1.6778929052993212, "language_loss": 0.70393747, "learning_rate": 1.094726915390082e-06, "loss": 0.72747737, "num_input_tokens_seen": 118120650, "step": 5489, "time_per_iteration": 2.712451934814453 }, { "auxiliary_loss_clip": 0.01229477, "auxiliary_loss_mlp": 0.01021589, "balance_loss_clip": 1.05102742, "balance_loss_mlp": 1.01424527, "epoch": 0.6601334696086094, "flos": 22342649765760.0, "grad_norm": 2.135844974541871, "language_loss": 0.69788098, "learning_rate": 1.0940323788672836e-06, "loss": 0.72039163, "num_input_tokens_seen": 118139825, "step": 5490, "time_per_iteration": 2.5877645015716553 }, { "auxiliary_loss_clip": 0.01222006, "auxiliary_loss_mlp": 0.01025963, "balance_loss_clip": 1.04878855, "balance_loss_mlp": 1.01886737, "epoch": 0.6602537124992485, "flos": 25703098657920.0, "grad_norm": 1.9826925355280898, "language_loss": 0.74273932, "learning_rate": 1.0933379797790522e-06, "loss": 0.76521903, "num_input_tokens_seen": 118159240, "step": 5491, "time_per_iteration": 3.5598385334014893 }, { "auxiliary_loss_clip": 0.01179774, "auxiliary_loss_mlp": 0.01029124, "balance_loss_clip": 1.05208576, "balance_loss_mlp": 1.02213883, "epoch": 0.6603739553898875, "flos": 25848572739840.0, "grad_norm": 2.6703478945513166, "language_loss": 0.7169143, "learning_rate": 1.0926437182307293e-06, "loss": 0.7390033, "num_input_tokens_seen": 118178050, "step": 5492, "time_per_iteration": 3.4581408500671387 }, { "auxiliary_loss_clip": 0.01281549, "auxiliary_loss_mlp": 0.01023135, "balance_loss_clip": 1.04670238, "balance_loss_mlp": 1.01582122, "epoch": 0.6604941982805267, "flos": 24570296461440.0, "grad_norm": 2.8959503496679098, "language_loss": 0.78372419, "learning_rate": 1.0919495943276338e-06, "loss": 0.80677098, "num_input_tokens_seen": 118199070, "step": 5493, "time_per_iteration": 2.670224666595459 }, { "auxiliary_loss_clip": 0.01333953, "auxiliary_loss_mlp": 0.0102414, "balance_loss_clip": 1.04446101, "balance_loss_mlp": 1.01663613, "epoch": 0.6606144411711657, "flos": 13261775581440.0, "grad_norm": 2.2995361233913547, "language_loss": 0.76168799, "learning_rate": 1.0912556081750611e-06, "loss": 0.78526896, "num_input_tokens_seen": 118217000, "step": 5494, "time_per_iteration": 3.584895610809326 }, { "auxiliary_loss_clip": 0.01272862, "auxiliary_loss_mlp": 0.01023395, "balance_loss_clip": 1.05008221, "balance_loss_mlp": 1.01638544, "epoch": 0.6607346840618048, "flos": 25155281358720.0, "grad_norm": 1.9931413128580504, "language_loss": 0.76469827, "learning_rate": 1.0905617598782909e-06, "loss": 0.78766078, "num_input_tokens_seen": 118237205, "step": 5495, "time_per_iteration": 2.673684597015381 }, { "auxiliary_loss_clip": 0.01370342, "auxiliary_loss_mlp": 0.01025245, "balance_loss_clip": 1.04328668, "balance_loss_mlp": 1.01815832, "epoch": 0.660854926952444, "flos": 17638029095040.0, "grad_norm": 2.3076713966539475, "language_loss": 0.81521726, "learning_rate": 1.0898680495425775e-06, "loss": 0.83917308, "num_input_tokens_seen": 118255495, "step": 5496, "time_per_iteration": 2.740471839904785 }, { "auxiliary_loss_clip": 0.012788, "auxiliary_loss_mlp": 0.01029848, "balance_loss_clip": 1.04889059, "balance_loss_mlp": 1.02271962, "epoch": 0.660975169843083, "flos": 16836000266880.0, "grad_norm": 1.9092998098466674, "language_loss": 0.80177587, "learning_rate": 1.0891744772731594e-06, "loss": 0.82486242, "num_input_tokens_seen": 118273310, "step": 5497, "time_per_iteration": 2.6357572078704834 }, { "auxiliary_loss_clip": 0.01227825, "auxiliary_loss_mlp": 0.01024599, "balance_loss_clip": 1.04784918, "balance_loss_mlp": 1.0174973, "epoch": 0.6610954127337221, "flos": 26870410846080.0, "grad_norm": 1.8444955740469628, "language_loss": 0.66068739, "learning_rate": 1.088481043175248e-06, "loss": 0.68321162, "num_input_tokens_seen": 118293880, "step": 5498, "time_per_iteration": 2.638779640197754 }, { "auxiliary_loss_clip": 0.01270777, "auxiliary_loss_mlp": 0.01024819, "balance_loss_clip": 1.04412353, "balance_loss_mlp": 1.01800358, "epoch": 0.6612156556243612, "flos": 26465697331200.0, "grad_norm": 1.6515689148666428, "language_loss": 0.75555182, "learning_rate": 1.0877877473540368e-06, "loss": 0.77850771, "num_input_tokens_seen": 118314465, "step": 5499, "time_per_iteration": 3.691736936569214 }, { "auxiliary_loss_clip": 0.01179351, "auxiliary_loss_mlp": 0.01029363, "balance_loss_clip": 1.05028105, "balance_loss_mlp": 1.02136374, "epoch": 0.6613358985150003, "flos": 19791915212160.0, "grad_norm": 2.255625154525312, "language_loss": 0.72829193, "learning_rate": 1.0870945899147002e-06, "loss": 0.75037909, "num_input_tokens_seen": 118331110, "step": 5500, "time_per_iteration": 2.580784797668457 }, { "auxiliary_loss_clip": 0.01225428, "auxiliary_loss_mlp": 0.01024548, "balance_loss_clip": 1.0504142, "balance_loss_mlp": 1.01780689, "epoch": 0.6614561414056394, "flos": 26831627136000.0, "grad_norm": 2.2863270141282968, "language_loss": 0.76088089, "learning_rate": 1.0864015709623879e-06, "loss": 0.78338063, "num_input_tokens_seen": 118351980, "step": 5501, "time_per_iteration": 2.7074501514434814 }, { "auxiliary_loss_clip": 0.0123166, "auxiliary_loss_mlp": 0.01026342, "balance_loss_clip": 1.04933703, "balance_loss_mlp": 1.01963377, "epoch": 0.6615763842962785, "flos": 22894597128960.0, "grad_norm": 2.8381154113330678, "language_loss": 0.80326664, "learning_rate": 1.0857086906022313e-06, "loss": 0.82584667, "num_input_tokens_seen": 118370315, "step": 5502, "time_per_iteration": 2.5841357707977295 }, { "auxiliary_loss_clip": 0.01417241, "auxiliary_loss_mlp": 0.01029806, "balance_loss_clip": 1.04485607, "balance_loss_mlp": 1.02254653, "epoch": 0.6616966271869176, "flos": 24790321221120.0, "grad_norm": 2.347379000619189, "language_loss": 0.72956133, "learning_rate": 1.0850159489393388e-06, "loss": 0.75403178, "num_input_tokens_seen": 118389575, "step": 5503, "time_per_iteration": 2.7797205448150635 }, { "auxiliary_loss_clip": 0.01320392, "auxiliary_loss_mlp": 0.01027647, "balance_loss_clip": 1.04032254, "balance_loss_mlp": 1.02050626, "epoch": 0.6618168700775566, "flos": 17202109639680.0, "grad_norm": 2.006315264312654, "language_loss": 0.82087106, "learning_rate": 1.0843233460787992e-06, "loss": 0.84435141, "num_input_tokens_seen": 118406790, "step": 5504, "time_per_iteration": 2.6603031158447266 }, { "auxiliary_loss_clip": 0.01321734, "auxiliary_loss_mlp": 0.01027714, "balance_loss_clip": 1.04639351, "balance_loss_mlp": 1.02053118, "epoch": 0.6619371129681958, "flos": 25447091448960.0, "grad_norm": 2.4510862808091605, "language_loss": 0.78084755, "learning_rate": 1.0836308821256805e-06, "loss": 0.80434203, "num_input_tokens_seen": 118427590, "step": 5505, "time_per_iteration": 2.700047492980957 }, { "auxiliary_loss_clip": 0.01220502, "auxiliary_loss_mlp": 0.01026015, "balance_loss_clip": 1.04764557, "balance_loss_mlp": 1.01966715, "epoch": 0.6620573558588349, "flos": 18040444139520.0, "grad_norm": 2.0593991503372617, "language_loss": 0.78070229, "learning_rate": 1.0829385571850282e-06, "loss": 0.80316746, "num_input_tokens_seen": 118444570, "step": 5506, "time_per_iteration": 2.656554698944092 }, { "auxiliary_loss_clip": 0.0118317, "auxiliary_loss_mlp": 0.0103498, "balance_loss_clip": 1.05109811, "balance_loss_mlp": 1.0268681, "epoch": 0.6621775987494739, "flos": 17785586165760.0, "grad_norm": 2.612083129248404, "language_loss": 0.83797657, "learning_rate": 1.0822463713618679e-06, "loss": 0.86015803, "num_input_tokens_seen": 118461425, "step": 5507, "time_per_iteration": 2.5681161880493164 }, { "auxiliary_loss_clip": 0.01325236, "auxiliary_loss_mlp": 0.01024572, "balance_loss_clip": 1.04393506, "balance_loss_mlp": 1.01727319, "epoch": 0.6622978416401131, "flos": 17492590926720.0, "grad_norm": 2.49346246405378, "language_loss": 0.85181266, "learning_rate": 1.0815543247612034e-06, "loss": 0.87531078, "num_input_tokens_seen": 118478495, "step": 5508, "time_per_iteration": 2.688002109527588 }, { "auxiliary_loss_clip": 0.01275377, "auxiliary_loss_mlp": 0.01025928, "balance_loss_clip": 1.04253531, "balance_loss_mlp": 1.01900148, "epoch": 0.6624180845307521, "flos": 21648352803840.0, "grad_norm": 1.5044380476087564, "language_loss": 0.82883453, "learning_rate": 1.0808624174880168e-06, "loss": 0.85184753, "num_input_tokens_seen": 118499145, "step": 5509, "time_per_iteration": 2.6334853172302246 }, { "auxiliary_loss_clip": 0.01174651, "auxiliary_loss_mlp": 0.01024821, "balance_loss_clip": 1.050318, "balance_loss_mlp": 1.0181812, "epoch": 0.6625383274213912, "flos": 23805902108160.0, "grad_norm": 2.0399557502494265, "language_loss": 0.79930413, "learning_rate": 1.080170649647272e-06, "loss": 0.8212989, "num_input_tokens_seen": 118518950, "step": 5510, "time_per_iteration": 2.6194918155670166 }, { "auxiliary_loss_clip": 0.01176297, "auxiliary_loss_mlp": 0.01025117, "balance_loss_clip": 1.04893792, "balance_loss_mlp": 1.01804471, "epoch": 0.6626585703120303, "flos": 33262941473280.0, "grad_norm": 1.9981682594140222, "language_loss": 0.6725536, "learning_rate": 1.0794790213439068e-06, "loss": 0.69456774, "num_input_tokens_seen": 118545850, "step": 5511, "time_per_iteration": 2.726300001144409 }, { "auxiliary_loss_clip": 0.01376859, "auxiliary_loss_mlp": 0.01024984, "balance_loss_clip": 1.04499555, "balance_loss_mlp": 1.01737821, "epoch": 0.6627788132026694, "flos": 22085780630400.0, "grad_norm": 1.7662786169204687, "language_loss": 0.78302264, "learning_rate": 1.078787532682843e-06, "loss": 0.80704105, "num_input_tokens_seen": 118563325, "step": 5512, "time_per_iteration": 2.7515039443969727 }, { "auxiliary_loss_clip": 0.0122491, "auxiliary_loss_mlp": 0.01027203, "balance_loss_clip": 1.04914057, "balance_loss_mlp": 1.01969922, "epoch": 0.6628990560933085, "flos": 36173608260480.0, "grad_norm": 3.219044728636975, "language_loss": 0.75697464, "learning_rate": 1.0780961837689773e-06, "loss": 0.77949584, "num_input_tokens_seen": 118582835, "step": 5513, "time_per_iteration": 2.7394988536834717 }, { "auxiliary_loss_clip": 0.01272368, "auxiliary_loss_mlp": 0.01024523, "balance_loss_clip": 1.04762757, "balance_loss_mlp": 1.017555, "epoch": 0.6630192989839476, "flos": 18513567106560.0, "grad_norm": 2.2799869220126636, "language_loss": 0.70258355, "learning_rate": 1.0774049747071883e-06, "loss": 0.72555244, "num_input_tokens_seen": 118600715, "step": 5514, "time_per_iteration": 2.60937237739563 }, { "auxiliary_loss_clip": 0.01371696, "auxiliary_loss_mlp": 0.01027394, "balance_loss_clip": 1.04684401, "balance_loss_mlp": 1.0208286, "epoch": 0.6631395418745867, "flos": 35809510049280.0, "grad_norm": 1.9064255369633842, "language_loss": 0.68207538, "learning_rate": 1.076713905602332e-06, "loss": 0.70606625, "num_input_tokens_seen": 118621290, "step": 5515, "time_per_iteration": 2.8440396785736084 }, { "auxiliary_loss_clip": 0.01234325, "auxiliary_loss_mlp": 0.01028294, "balance_loss_clip": 1.0533545, "balance_loss_mlp": 1.02090907, "epoch": 0.6632597847652257, "flos": 20047742853120.0, "grad_norm": 3.0687568022116025, "language_loss": 0.81071752, "learning_rate": 1.07602297655924e-06, "loss": 0.83334374, "num_input_tokens_seen": 118639610, "step": 5516, "time_per_iteration": 2.6223411560058594 }, { "auxiliary_loss_clip": 0.01178618, "auxiliary_loss_mlp": 0.0102494, "balance_loss_clip": 1.05178285, "balance_loss_mlp": 1.01811564, "epoch": 0.6633800276558649, "flos": 21214480423680.0, "grad_norm": 2.023862389264744, "language_loss": 0.81168312, "learning_rate": 1.0753321876827292e-06, "loss": 0.83371872, "num_input_tokens_seen": 118658895, "step": 5517, "time_per_iteration": 2.549839973449707 }, { "auxiliary_loss_clip": 0.01178751, "auxiliary_loss_mlp": 0.01025636, "balance_loss_clip": 1.05024993, "balance_loss_mlp": 1.01843023, "epoch": 0.663500270546504, "flos": 23987753688960.0, "grad_norm": 2.1797808943871977, "language_loss": 0.74202359, "learning_rate": 1.0746415390775893e-06, "loss": 0.76406741, "num_input_tokens_seen": 118677025, "step": 5518, "time_per_iteration": 4.583606958389282 }, { "auxiliary_loss_clip": 0.01178786, "auxiliary_loss_mlp": 0.01027857, "balance_loss_clip": 1.05213583, "balance_loss_mlp": 1.0211755, "epoch": 0.663620513437143, "flos": 17932389050880.0, "grad_norm": 1.8883854072974364, "language_loss": 0.76686215, "learning_rate": 1.0739510308485939e-06, "loss": 0.78892863, "num_input_tokens_seen": 118694240, "step": 5519, "time_per_iteration": 2.5728931427001953 }, { "auxiliary_loss_clip": 0.01131292, "auxiliary_loss_mlp": 0.0099829, "balance_loss_clip": 1.01195109, "balance_loss_mlp": 0.99728882, "epoch": 0.6637407563277821, "flos": 57840241086720.0, "grad_norm": 0.8043689979672518, "language_loss": 0.62547088, "learning_rate": 1.07326066310049e-06, "loss": 0.64676666, "num_input_tokens_seen": 118758365, "step": 5520, "time_per_iteration": 4.192986249923706 }, { "auxiliary_loss_clip": 0.01324277, "auxiliary_loss_mlp": 0.01030192, "balance_loss_clip": 1.04570401, "balance_loss_mlp": 1.02253246, "epoch": 0.6638609992184212, "flos": 27306007079040.0, "grad_norm": 1.903197794320203, "language_loss": 0.79663008, "learning_rate": 1.0725704359380059e-06, "loss": 0.82017475, "num_input_tokens_seen": 118778220, "step": 5521, "time_per_iteration": 2.712439775466919 }, { "auxiliary_loss_clip": 0.0117782, "auxiliary_loss_mlp": 0.01027334, "balance_loss_clip": 1.05021977, "balance_loss_mlp": 1.02034211, "epoch": 0.6639812421090603, "flos": 18624854419200.0, "grad_norm": 2.2423425979476344, "language_loss": 0.72472632, "learning_rate": 1.0718803494658497e-06, "loss": 0.74677783, "num_input_tokens_seen": 118797110, "step": 5522, "time_per_iteration": 2.5610995292663574 }, { "auxiliary_loss_clip": 0.01422733, "auxiliary_loss_mlp": 0.01029435, "balance_loss_clip": 1.03904581, "balance_loss_mlp": 1.02210701, "epoch": 0.6641014849996993, "flos": 15924479806080.0, "grad_norm": 8.348746104356044, "language_loss": 0.83815414, "learning_rate": 1.071190403788707e-06, "loss": 0.86267585, "num_input_tokens_seen": 118812415, "step": 5523, "time_per_iteration": 2.932133197784424 }, { "auxiliary_loss_clip": 0.01331102, "auxiliary_loss_mlp": 0.01027954, "balance_loss_clip": 1.04895937, "balance_loss_mlp": 1.02069676, "epoch": 0.6642217278903385, "flos": 26505486622080.0, "grad_norm": 2.98664321500518, "language_loss": 0.75411922, "learning_rate": 1.0705005990112415e-06, "loss": 0.77770978, "num_input_tokens_seen": 118832195, "step": 5524, "time_per_iteration": 2.880600690841675 }, { "auxiliary_loss_clip": 0.01365062, "auxiliary_loss_mlp": 0.01029394, "balance_loss_clip": 1.04501748, "balance_loss_mlp": 1.02225053, "epoch": 0.6643419707809776, "flos": 15377308951680.0, "grad_norm": 2.334941119652741, "language_loss": 0.74703753, "learning_rate": 1.0698109352380957e-06, "loss": 0.77098209, "num_input_tokens_seen": 118849795, "step": 5525, "time_per_iteration": 3.6938283443450928 }, { "auxiliary_loss_clip": 0.01175247, "auxiliary_loss_mlp": 0.01028287, "balance_loss_clip": 1.04964948, "balance_loss_mlp": 1.02113771, "epoch": 0.6644622136716166, "flos": 25117610970240.0, "grad_norm": 2.591865875067558, "language_loss": 0.78055155, "learning_rate": 1.0691214125738909e-06, "loss": 0.80258685, "num_input_tokens_seen": 118870000, "step": 5526, "time_per_iteration": 2.6220219135284424 }, { "auxiliary_loss_clip": 0.01066642, "auxiliary_loss_mlp": 0.01003513, "balance_loss_clip": 1.01042819, "balance_loss_mlp": 1.00256503, "epoch": 0.6645824565622558, "flos": 66201717680640.0, "grad_norm": 0.8084229792955522, "language_loss": 0.57486117, "learning_rate": 1.0684320311232287e-06, "loss": 0.59556276, "num_input_tokens_seen": 118932905, "step": 5527, "time_per_iteration": 3.2511510848999023 }, { "auxiliary_loss_clip": 0.01272728, "auxiliary_loss_mlp": 0.01027439, "balance_loss_clip": 1.04571784, "balance_loss_mlp": 1.02007151, "epoch": 0.6647026994528948, "flos": 25082131311360.0, "grad_norm": 2.008985292055761, "language_loss": 0.81515557, "learning_rate": 1.0677427909906865e-06, "loss": 0.83815718, "num_input_tokens_seen": 118953355, "step": 5528, "time_per_iteration": 2.6179802417755127 }, { "auxiliary_loss_clip": 0.01183645, "auxiliary_loss_mlp": 0.01025845, "balance_loss_clip": 1.05388629, "balance_loss_mlp": 1.01849258, "epoch": 0.6648229423435339, "flos": 18222187979520.0, "grad_norm": 2.398812717152142, "language_loss": 0.72352827, "learning_rate": 1.0670536922808216e-06, "loss": 0.74562323, "num_input_tokens_seen": 118973480, "step": 5529, "time_per_iteration": 2.5919346809387207 }, { "auxiliary_loss_clip": 0.01276075, "auxiliary_loss_mlp": 0.01025917, "balance_loss_clip": 1.04831862, "balance_loss_mlp": 1.01962852, "epoch": 0.6649431852341731, "flos": 18296882311680.0, "grad_norm": 2.052258998668079, "language_loss": 0.72162092, "learning_rate": 1.06636473509817e-06, "loss": 0.74464083, "num_input_tokens_seen": 118989860, "step": 5530, "time_per_iteration": 2.6255650520324707 }, { "auxiliary_loss_clip": 0.01274494, "auxiliary_loss_mlp": 0.02565725, "balance_loss_clip": 1.0466038, "balance_loss_mlp": 0.99994272, "epoch": 0.6650634281248121, "flos": 17019575700480.0, "grad_norm": 2.113997359802001, "language_loss": 0.80841625, "learning_rate": 1.0656759195472447e-06, "loss": 0.84681839, "num_input_tokens_seen": 119007150, "step": 5531, "time_per_iteration": 2.6718270778656006 }, { "auxiliary_loss_clip": 0.01169459, "auxiliary_loss_mlp": 0.0099683, "balance_loss_clip": 1.0113368, "balance_loss_mlp": 0.99582225, "epoch": 0.6651836710154512, "flos": 69294810666240.0, "grad_norm": 0.7883594392411588, "language_loss": 0.59722459, "learning_rate": 1.0649872457325414e-06, "loss": 0.61888748, "num_input_tokens_seen": 119068435, "step": 5532, "time_per_iteration": 3.17917799949646 }, { "auxiliary_loss_clip": 0.01121179, "auxiliary_loss_mlp": 0.01003787, "balance_loss_clip": 1.0092597, "balance_loss_mlp": 1.00277984, "epoch": 0.6653039139060903, "flos": 66883444882560.0, "grad_norm": 0.8466411745687178, "language_loss": 0.55090559, "learning_rate": 1.0642987137585278e-06, "loss": 0.5721553, "num_input_tokens_seen": 119127960, "step": 5533, "time_per_iteration": 3.1537904739379883 }, { "auxiliary_loss_clip": 0.01273932, "auxiliary_loss_mlp": 0.01021104, "balance_loss_clip": 1.04671216, "balance_loss_mlp": 1.01356375, "epoch": 0.6654241567967294, "flos": 21470056669440.0, "grad_norm": 1.8683810331111077, "language_loss": 0.82866395, "learning_rate": 1.0636103237296561e-06, "loss": 0.8516143, "num_input_tokens_seen": 119146885, "step": 5534, "time_per_iteration": 2.640620231628418 }, { "auxiliary_loss_clip": 0.01224165, "auxiliary_loss_mlp": 0.0102564, "balance_loss_clip": 1.0510869, "balance_loss_mlp": 1.01927757, "epoch": 0.6655443996873684, "flos": 25119514391040.0, "grad_norm": 1.7774221543824762, "language_loss": 0.8445704, "learning_rate": 1.062922075750353e-06, "loss": 0.86706847, "num_input_tokens_seen": 119166900, "step": 5535, "time_per_iteration": 2.661778688430786 }, { "auxiliary_loss_clip": 0.01327753, "auxiliary_loss_mlp": 0.0102494, "balance_loss_clip": 1.04720831, "balance_loss_mlp": 1.01794553, "epoch": 0.6656646425780076, "flos": 17457326749440.0, "grad_norm": 1.9483775966890535, "language_loss": 0.72257769, "learning_rate": 1.0622339699250267e-06, "loss": 0.74610466, "num_input_tokens_seen": 119184820, "step": 5536, "time_per_iteration": 2.723595142364502 }, { "auxiliary_loss_clip": 0.01324737, "auxiliary_loss_mlp": 0.01025561, "balance_loss_clip": 1.0451479, "balance_loss_mlp": 1.01884055, "epoch": 0.6657848854686467, "flos": 23434190213760.0, "grad_norm": 1.722800401274858, "language_loss": 0.79735029, "learning_rate": 1.0615460063580624e-06, "loss": 0.82085329, "num_input_tokens_seen": 119203295, "step": 5537, "time_per_iteration": 2.7291383743286133 }, { "auxiliary_loss_clip": 0.01276992, "auxiliary_loss_mlp": 0.01025844, "balance_loss_clip": 1.04717243, "balance_loss_mlp": 1.01932037, "epoch": 0.6659051283592857, "flos": 11509909459200.0, "grad_norm": 2.1523246954035047, "language_loss": 0.72678202, "learning_rate": 1.060858185153821e-06, "loss": 0.74981034, "num_input_tokens_seen": 119221395, "step": 5538, "time_per_iteration": 2.669538974761963 }, { "auxiliary_loss_clip": 0.01281353, "auxiliary_loss_mlp": 0.01030399, "balance_loss_clip": 1.04924297, "balance_loss_mlp": 1.02296913, "epoch": 0.6660253712499249, "flos": 20594554571520.0, "grad_norm": 2.1864471893660546, "language_loss": 0.76109844, "learning_rate": 1.0601705064166474e-06, "loss": 0.78421599, "num_input_tokens_seen": 119239790, "step": 5539, "time_per_iteration": 2.6237549781799316 }, { "auxiliary_loss_clip": 0.01276743, "auxiliary_loss_mlp": 0.01024175, "balance_loss_clip": 1.05150342, "balance_loss_mlp": 1.01754379, "epoch": 0.666145614140564, "flos": 21251504367360.0, "grad_norm": 3.130919801878578, "language_loss": 0.73383951, "learning_rate": 1.0594829702508596e-06, "loss": 0.75684869, "num_input_tokens_seen": 119257505, "step": 5540, "time_per_iteration": 2.6599974632263184 }, { "auxiliary_loss_clip": 0.01327166, "auxiliary_loss_mlp": 0.01025753, "balance_loss_clip": 1.04713476, "balance_loss_mlp": 1.01868415, "epoch": 0.666265857031203, "flos": 33726188200320.0, "grad_norm": 1.8279601845131723, "language_loss": 0.55078709, "learning_rate": 1.0587955767607592e-06, "loss": 0.57431626, "num_input_tokens_seen": 119279365, "step": 5541, "time_per_iteration": 2.8139560222625732 }, { "auxiliary_loss_clip": 0.01177368, "auxiliary_loss_mlp": 0.01027086, "balance_loss_clip": 1.04971218, "balance_loss_mlp": 1.02031147, "epoch": 0.6663860999218422, "flos": 17456644391040.0, "grad_norm": 2.510514760034638, "language_loss": 0.77106804, "learning_rate": 1.0581083260506206e-06, "loss": 0.79311258, "num_input_tokens_seen": 119296150, "step": 5542, "time_per_iteration": 2.5572571754455566 }, { "auxiliary_loss_clip": 0.01273635, "auxiliary_loss_mlp": 0.01023985, "balance_loss_clip": 1.04684281, "balance_loss_mlp": 1.01693034, "epoch": 0.6665063428124812, "flos": 17676740977920.0, "grad_norm": 2.6335819576816757, "language_loss": 0.7632792, "learning_rate": 1.0574212182246993e-06, "loss": 0.78625536, "num_input_tokens_seen": 119314845, "step": 5543, "time_per_iteration": 3.7482664585113525 }, { "auxiliary_loss_clip": 0.01280778, "auxiliary_loss_mlp": 0.01025855, "balance_loss_clip": 1.04659724, "balance_loss_mlp": 1.01864624, "epoch": 0.6666265857031203, "flos": 27673265687040.0, "grad_norm": 3.0837146452480693, "language_loss": 0.76162905, "learning_rate": 1.0567342533872303e-06, "loss": 0.78469539, "num_input_tokens_seen": 119334875, "step": 5544, "time_per_iteration": 3.6421444416046143 }, { "auxiliary_loss_clip": 0.01275673, "auxiliary_loss_mlp": 0.01031027, "balance_loss_clip": 1.04853272, "balance_loss_mlp": 1.02417862, "epoch": 0.6667468285937594, "flos": 25046831220480.0, "grad_norm": 2.4443624744621304, "language_loss": 0.81156909, "learning_rate": 1.0560474316424255e-06, "loss": 0.83463609, "num_input_tokens_seen": 119354635, "step": 5545, "time_per_iteration": 2.681339979171753 }, { "auxiliary_loss_clip": 0.0127855, "auxiliary_loss_mlp": 0.01029447, "balance_loss_clip": 1.04562879, "balance_loss_mlp": 1.0217762, "epoch": 0.6668670714843985, "flos": 22780472641920.0, "grad_norm": 2.6227229720593113, "language_loss": 0.73554456, "learning_rate": 1.0553607530944746e-06, "loss": 0.75862455, "num_input_tokens_seen": 119372690, "step": 5546, "time_per_iteration": 3.623143196105957 }, { "auxiliary_loss_clip": 0.01325814, "auxiliary_loss_mlp": 0.01030162, "balance_loss_clip": 1.04409099, "balance_loss_mlp": 1.02282798, "epoch": 0.6669873143750376, "flos": 22163886754560.0, "grad_norm": 2.0973838001175666, "language_loss": 0.8974511, "learning_rate": 1.0546742178475463e-06, "loss": 0.92101085, "num_input_tokens_seen": 119391685, "step": 5547, "time_per_iteration": 2.674889326095581 }, { "auxiliary_loss_clip": 0.01277383, "auxiliary_loss_mlp": 0.01027517, "balance_loss_clip": 1.04486775, "balance_loss_mlp": 1.02108574, "epoch": 0.6671075572656767, "flos": 20514832335360.0, "grad_norm": 2.388717979447034, "language_loss": 0.86931252, "learning_rate": 1.0539878260057868e-06, "loss": 0.89236152, "num_input_tokens_seen": 119410725, "step": 5548, "time_per_iteration": 2.722992181777954 }, { "auxiliary_loss_clip": 0.01238063, "auxiliary_loss_mlp": 0.01028942, "balance_loss_clip": 1.05536795, "balance_loss_mlp": 1.02108622, "epoch": 0.6672278001563158, "flos": 17931203902080.0, "grad_norm": 2.7735877869205994, "language_loss": 0.68935311, "learning_rate": 1.0533015776733226e-06, "loss": 0.71202314, "num_input_tokens_seen": 119426875, "step": 5549, "time_per_iteration": 2.5754356384277344 }, { "auxiliary_loss_clip": 0.01271621, "auxiliary_loss_mlp": 0.01026386, "balance_loss_clip": 1.04880214, "balance_loss_mlp": 1.01917958, "epoch": 0.6673480430469548, "flos": 22342146975360.0, "grad_norm": 2.351925661412378, "language_loss": 0.78394914, "learning_rate": 1.0526154729542566e-06, "loss": 0.80692923, "num_input_tokens_seen": 119446935, "step": 5550, "time_per_iteration": 2.6515743732452393 }, { "auxiliary_loss_clip": 0.01327793, "auxiliary_loss_mlp": 0.01029967, "balance_loss_clip": 1.04934239, "balance_loss_mlp": 1.02236724, "epoch": 0.6674682859375939, "flos": 20703830722560.0, "grad_norm": 2.858074486793648, "language_loss": 0.80555499, "learning_rate": 1.0519295119526699e-06, "loss": 0.82913256, "num_input_tokens_seen": 119463240, "step": 5551, "time_per_iteration": 3.6462242603302 }, { "auxiliary_loss_clip": 0.01279909, "auxiliary_loss_mlp": 0.01027592, "balance_loss_clip": 1.04876125, "balance_loss_mlp": 1.02047837, "epoch": 0.667588528828233, "flos": 26206673379840.0, "grad_norm": 1.6096276633267281, "language_loss": 0.83412254, "learning_rate": 1.0512436947726227e-06, "loss": 0.85719752, "num_input_tokens_seen": 119484655, "step": 5552, "time_per_iteration": 2.6560628414154053 }, { "auxiliary_loss_clip": 0.01325365, "auxiliary_loss_mlp": 0.01023563, "balance_loss_clip": 1.04454136, "balance_loss_mlp": 1.01667285, "epoch": 0.6677087717188721, "flos": 23071025756160.0, "grad_norm": 2.5964858622827887, "language_loss": 0.65482211, "learning_rate": 1.0505580215181517e-06, "loss": 0.67831135, "num_input_tokens_seen": 119502895, "step": 5553, "time_per_iteration": 2.7225420475006104 }, { "auxiliary_loss_clip": 0.01212946, "auxiliary_loss_mlp": 0.01003318, "balance_loss_clip": 1.01132274, "balance_loss_mlp": 1.0023644, "epoch": 0.6678290146095112, "flos": 70941315219840.0, "grad_norm": 0.7829112256429517, "language_loss": 0.56639367, "learning_rate": 1.0498724922932753e-06, "loss": 0.58855635, "num_input_tokens_seen": 119561010, "step": 5554, "time_per_iteration": 3.1799941062927246 }, { "auxiliary_loss_clip": 0.01184596, "auxiliary_loss_mlp": 0.01030035, "balance_loss_clip": 1.05464649, "balance_loss_mlp": 1.02238512, "epoch": 0.6679492575001503, "flos": 18661088263680.0, "grad_norm": 3.6486238724692686, "language_loss": 0.86665249, "learning_rate": 1.0491871072019851e-06, "loss": 0.88879883, "num_input_tokens_seen": 119578900, "step": 5555, "time_per_iteration": 2.597923517227173 }, { "auxiliary_loss_clip": 0.01331104, "auxiliary_loss_mlp": 0.01026313, "balance_loss_clip": 1.04425752, "balance_loss_mlp": 1.01942253, "epoch": 0.6680695003907894, "flos": 29711985822720.0, "grad_norm": 2.128235629692362, "language_loss": 0.63705784, "learning_rate": 1.0485018663482555e-06, "loss": 0.66063201, "num_input_tokens_seen": 119598920, "step": 5556, "time_per_iteration": 2.7478344440460205 }, { "auxiliary_loss_clip": 0.01226778, "auxiliary_loss_mlp": 0.01027552, "balance_loss_clip": 1.04914689, "balance_loss_mlp": 1.02032185, "epoch": 0.6681897432814284, "flos": 28218964083840.0, "grad_norm": 2.615700651940876, "language_loss": 0.70484126, "learning_rate": 1.0478167698360354e-06, "loss": 0.72738457, "num_input_tokens_seen": 119618220, "step": 5557, "time_per_iteration": 2.661102533340454 }, { "auxiliary_loss_clip": 0.01222377, "auxiliary_loss_mlp": 0.01023672, "balance_loss_clip": 1.04593968, "balance_loss_mlp": 1.01639438, "epoch": 0.6683099861720676, "flos": 25046543911680.0, "grad_norm": 1.7652814679903237, "language_loss": 0.70249152, "learning_rate": 1.0471318177692556e-06, "loss": 0.72495198, "num_input_tokens_seen": 119638520, "step": 5558, "time_per_iteration": 2.651160717010498 }, { "auxiliary_loss_clip": 0.01375629, "auxiliary_loss_mlp": 0.01029814, "balance_loss_clip": 1.0444783, "balance_loss_mlp": 1.02212477, "epoch": 0.6684302290627067, "flos": 22996977868800.0, "grad_norm": 2.1554176479253457, "language_loss": 0.75217497, "learning_rate": 1.046447010251821e-06, "loss": 0.77622938, "num_input_tokens_seen": 119655850, "step": 5559, "time_per_iteration": 2.7954230308532715 }, { "auxiliary_loss_clip": 0.01275726, "auxiliary_loss_mlp": 0.01022536, "balance_loss_clip": 1.04981112, "balance_loss_mlp": 1.01578867, "epoch": 0.6685504719533457, "flos": 26573824247040.0, "grad_norm": 1.796774397085072, "language_loss": 0.75664574, "learning_rate": 1.0457623473876157e-06, "loss": 0.7796284, "num_input_tokens_seen": 119675355, "step": 5560, "time_per_iteration": 2.7273337841033936 }, { "auxiliary_loss_clip": 0.0117708, "auxiliary_loss_mlp": 0.0102695, "balance_loss_clip": 1.05125856, "balance_loss_mlp": 1.02017879, "epoch": 0.6686707148439849, "flos": 28986087870720.0, "grad_norm": 2.1517089540393233, "language_loss": 0.7120266, "learning_rate": 1.0450778292805046e-06, "loss": 0.73406696, "num_input_tokens_seen": 119695340, "step": 5561, "time_per_iteration": 2.64261794090271 }, { "auxiliary_loss_clip": 0.01230049, "auxiliary_loss_mlp": 0.0102738, "balance_loss_clip": 1.04799962, "balance_loss_mlp": 1.02036476, "epoch": 0.6687909577346239, "flos": 23623152687360.0, "grad_norm": 6.008140864238958, "language_loss": 0.78930402, "learning_rate": 1.0443934560343267e-06, "loss": 0.81187832, "num_input_tokens_seen": 119716750, "step": 5562, "time_per_iteration": 2.6808807849884033 }, { "auxiliary_loss_clip": 0.01315182, "auxiliary_loss_mlp": 0.01023942, "balance_loss_clip": 1.04464149, "balance_loss_mlp": 1.01682234, "epoch": 0.668911200625263, "flos": 23148593176320.0, "grad_norm": 2.064621233969995, "language_loss": 0.77729344, "learning_rate": 1.0437092277529034e-06, "loss": 0.80068469, "num_input_tokens_seen": 119736005, "step": 5563, "time_per_iteration": 2.7106070518493652 }, { "auxiliary_loss_clip": 0.01272593, "auxiliary_loss_mlp": 0.01023047, "balance_loss_clip": 1.04684246, "balance_loss_mlp": 1.01655638, "epoch": 0.6690314435159022, "flos": 18551919853440.0, "grad_norm": 1.9842519419870406, "language_loss": 0.73648596, "learning_rate": 1.0430251445400292e-06, "loss": 0.75944233, "num_input_tokens_seen": 119754050, "step": 5564, "time_per_iteration": 2.7483930587768555 }, { "auxiliary_loss_clip": 0.01522951, "auxiliary_loss_mlp": 0.0102525, "balance_loss_clip": 1.04345012, "balance_loss_mlp": 1.01825285, "epoch": 0.6691516864065412, "flos": 31759540704000.0, "grad_norm": 1.9054118322551583, "language_loss": 0.62414324, "learning_rate": 1.0423412064994787e-06, "loss": 0.6496253, "num_input_tokens_seen": 119774820, "step": 5565, "time_per_iteration": 3.014051914215088 }, { "auxiliary_loss_clip": 0.01325896, "auxiliary_loss_mlp": 0.01025314, "balance_loss_clip": 1.04370022, "balance_loss_mlp": 1.01877856, "epoch": 0.6692719292971803, "flos": 34933864296960.0, "grad_norm": 2.2393843611594098, "language_loss": 0.73869038, "learning_rate": 1.0416574137350064e-06, "loss": 0.7622025, "num_input_tokens_seen": 119795525, "step": 5566, "time_per_iteration": 3.0932059288024902 }, { "auxiliary_loss_clip": 0.01225872, "auxiliary_loss_mlp": 0.01026016, "balance_loss_clip": 1.04980755, "balance_loss_mlp": 1.0180831, "epoch": 0.6693921721878194, "flos": 20449188230400.0, "grad_norm": 2.296372225022629, "language_loss": 0.81224287, "learning_rate": 1.0409737663503428e-06, "loss": 0.8347618, "num_input_tokens_seen": 119813905, "step": 5567, "time_per_iteration": 2.6090338230133057 }, { "auxiliary_loss_clip": 0.01225604, "auxiliary_loss_mlp": 0.01030428, "balance_loss_clip": 1.04541075, "balance_loss_mlp": 1.02276254, "epoch": 0.6695124150784585, "flos": 16614538963200.0, "grad_norm": 1.861960652148183, "language_loss": 0.82584631, "learning_rate": 1.040290264449196e-06, "loss": 0.84840667, "num_input_tokens_seen": 119832010, "step": 5568, "time_per_iteration": 2.677114963531494 }, { "auxiliary_loss_clip": 0.01220377, "auxiliary_loss_mlp": 0.01024262, "balance_loss_clip": 1.04956675, "balance_loss_mlp": 1.01766706, "epoch": 0.6696326579690975, "flos": 26652145852800.0, "grad_norm": 3.1854411183147255, "language_loss": 0.64382654, "learning_rate": 1.0396069081352532e-06, "loss": 0.66627288, "num_input_tokens_seen": 119851165, "step": 5569, "time_per_iteration": 3.8532094955444336 }, { "auxiliary_loss_clip": 0.01066056, "auxiliary_loss_mlp": 0.01003242, "balance_loss_clip": 1.00974667, "balance_loss_mlp": 1.00232458, "epoch": 0.6697529008597367, "flos": 66964603662720.0, "grad_norm": 0.7734666089265403, "language_loss": 0.55993795, "learning_rate": 1.0389236975121782e-06, "loss": 0.5806309, "num_input_tokens_seen": 119906015, "step": 5570, "time_per_iteration": 4.4609832763671875 }, { "auxiliary_loss_clip": 0.01179282, "auxiliary_loss_mlp": 0.0102583, "balance_loss_clip": 1.05053806, "balance_loss_mlp": 1.01807499, "epoch": 0.6698731437503758, "flos": 20886939279360.0, "grad_norm": 4.122730893076424, "language_loss": 0.70992804, "learning_rate": 1.0382406326836147e-06, "loss": 0.73197919, "num_input_tokens_seen": 119925160, "step": 5571, "time_per_iteration": 2.596496820449829 }, { "auxiliary_loss_clip": 0.01235238, "auxiliary_loss_mlp": 0.01030334, "balance_loss_clip": 1.05233073, "balance_loss_mlp": 1.02226305, "epoch": 0.6699933866410148, "flos": 20409470766720.0, "grad_norm": 3.022771400080622, "language_loss": 0.76064414, "learning_rate": 1.0375577137531828e-06, "loss": 0.7832998, "num_input_tokens_seen": 119943720, "step": 5572, "time_per_iteration": 2.6609222888946533 }, { "auxiliary_loss_clip": 0.01278393, "auxiliary_loss_mlp": 0.01027496, "balance_loss_clip": 1.04822826, "balance_loss_mlp": 1.01980066, "epoch": 0.670113629531654, "flos": 29023075900800.0, "grad_norm": 6.904218405916863, "language_loss": 0.72251236, "learning_rate": 1.0368749408244802e-06, "loss": 0.74557132, "num_input_tokens_seen": 119966640, "step": 5573, "time_per_iteration": 3.5561344623565674 }, { "auxiliary_loss_clip": 0.01223187, "auxiliary_loss_mlp": 0.01026281, "balance_loss_clip": 1.04827476, "balance_loss_mlp": 1.0196799, "epoch": 0.670233872422293, "flos": 19791699730560.0, "grad_norm": 1.714761703428131, "language_loss": 0.78663015, "learning_rate": 1.0361923140010836e-06, "loss": 0.80912483, "num_input_tokens_seen": 119985125, "step": 5574, "time_per_iteration": 2.6306159496307373 }, { "auxiliary_loss_clip": 0.01233798, "auxiliary_loss_mlp": 0.01021436, "balance_loss_clip": 1.05007315, "balance_loss_mlp": 1.01463497, "epoch": 0.6703541153129321, "flos": 24243689070720.0, "grad_norm": 2.295365125679275, "language_loss": 0.63622963, "learning_rate": 1.0355098333865455e-06, "loss": 0.65878201, "num_input_tokens_seen": 120004355, "step": 5575, "time_per_iteration": 2.6698997020721436 }, { "auxiliary_loss_clip": 0.01232107, "auxiliary_loss_mlp": 0.01028763, "balance_loss_clip": 1.05641091, "balance_loss_mlp": 1.022156, "epoch": 0.6704743582035713, "flos": 26688523351680.0, "grad_norm": 1.9906968029082088, "language_loss": 0.69355941, "learning_rate": 1.0348274990844006e-06, "loss": 0.71616817, "num_input_tokens_seen": 120027115, "step": 5576, "time_per_iteration": 2.756074905395508 }, { "auxiliary_loss_clip": 0.01223182, "auxiliary_loss_mlp": 0.01027315, "balance_loss_clip": 1.04929638, "balance_loss_mlp": 1.01949441, "epoch": 0.6705946010942103, "flos": 23514379326720.0, "grad_norm": 1.8122443430414128, "language_loss": 0.72340071, "learning_rate": 1.034145311198155e-06, "loss": 0.74590576, "num_input_tokens_seen": 120047130, "step": 5577, "time_per_iteration": 3.597355842590332 }, { "auxiliary_loss_clip": 0.01173612, "auxiliary_loss_mlp": 0.01027546, "balance_loss_clip": 1.04840899, "balance_loss_mlp": 1.0212307, "epoch": 0.6707148439848494, "flos": 24061011477120.0, "grad_norm": 1.8948795596469417, "language_loss": 0.6380927, "learning_rate": 1.0334632698312989e-06, "loss": 0.66010427, "num_input_tokens_seen": 120067925, "step": 5578, "time_per_iteration": 2.6826212406158447 }, { "auxiliary_loss_clip": 0.01273213, "auxiliary_loss_mlp": 0.01030101, "balance_loss_clip": 1.04654813, "balance_loss_mlp": 1.02271295, "epoch": 0.6708350868754885, "flos": 22528667324160.0, "grad_norm": 1.9051873128306942, "language_loss": 0.75533199, "learning_rate": 1.032781375087295e-06, "loss": 0.77836514, "num_input_tokens_seen": 120087825, "step": 5579, "time_per_iteration": 2.7347004413604736 }, { "auxiliary_loss_clip": 0.0127781, "auxiliary_loss_mlp": 0.0102338, "balance_loss_clip": 1.05029786, "balance_loss_mlp": 1.01643598, "epoch": 0.6709553297661276, "flos": 25227749047680.0, "grad_norm": 1.5516214510527269, "language_loss": 0.67288536, "learning_rate": 1.0320996270695891e-06, "loss": 0.69589722, "num_input_tokens_seen": 120108895, "step": 5580, "time_per_iteration": 2.7064363956451416 }, { "auxiliary_loss_clip": 0.01328047, "auxiliary_loss_mlp": 0.01032903, "balance_loss_clip": 1.04432952, "balance_loss_mlp": 1.02541661, "epoch": 0.6710755726567667, "flos": 20448757267200.0, "grad_norm": 2.102990234178898, "language_loss": 0.73181462, "learning_rate": 1.0314180258815998e-06, "loss": 0.75542414, "num_input_tokens_seen": 120127535, "step": 5581, "time_per_iteration": 2.747220516204834 }, { "auxiliary_loss_clip": 0.01318778, "auxiliary_loss_mlp": 0.01022927, "balance_loss_clip": 1.04337263, "balance_loss_mlp": 1.01619792, "epoch": 0.6711958155474057, "flos": 25995411538560.0, "grad_norm": 2.1947724924685907, "language_loss": 0.74419987, "learning_rate": 1.0307365716267247e-06, "loss": 0.76761693, "num_input_tokens_seen": 120147980, "step": 5582, "time_per_iteration": 2.9099223613739014 }, { "auxiliary_loss_clip": 0.01226163, "auxiliary_loss_mlp": 0.01027972, "balance_loss_clip": 1.05068135, "balance_loss_mlp": 1.02036619, "epoch": 0.6713160584380449, "flos": 19937712516480.0, "grad_norm": 2.346225919379869, "language_loss": 0.78521931, "learning_rate": 1.0300552644083423e-06, "loss": 0.80776066, "num_input_tokens_seen": 120166905, "step": 5583, "time_per_iteration": 2.5653109550476074 }, { "auxiliary_loss_clip": 0.01334007, "auxiliary_loss_mlp": 0.01025776, "balance_loss_clip": 1.05062056, "balance_loss_mlp": 1.01811099, "epoch": 0.6714363013286839, "flos": 18223373128320.0, "grad_norm": 15.11653414489998, "language_loss": 0.7247566, "learning_rate": 1.0293741043298036e-06, "loss": 0.74835443, "num_input_tokens_seen": 120185255, "step": 5584, "time_per_iteration": 2.723883867263794 }, { "auxiliary_loss_clip": 0.01340579, "auxiliary_loss_mlp": 0.01027178, "balance_loss_clip": 1.0558517, "balance_loss_mlp": 1.01941144, "epoch": 0.671556544219323, "flos": 25812374808960.0, "grad_norm": 2.027377042860145, "language_loss": 0.71608829, "learning_rate": 1.0286930914944436e-06, "loss": 0.73976588, "num_input_tokens_seen": 120205070, "step": 5585, "time_per_iteration": 2.7461986541748047 }, { "auxiliary_loss_clip": 0.0117707, "auxiliary_loss_mlp": 0.01024231, "balance_loss_clip": 1.04761124, "balance_loss_mlp": 1.01716161, "epoch": 0.6716767871099621, "flos": 15850431918720.0, "grad_norm": 2.509918754138467, "language_loss": 0.76958847, "learning_rate": 1.0280122260055684e-06, "loss": 0.79160154, "num_input_tokens_seen": 120220780, "step": 5586, "time_per_iteration": 2.6147093772888184 }, { "auxiliary_loss_clip": 0.01180183, "auxiliary_loss_mlp": 0.01028313, "balance_loss_clip": 1.05247164, "balance_loss_mlp": 1.02063608, "epoch": 0.6717970300006012, "flos": 19756112330880.0, "grad_norm": 2.004717764097082, "language_loss": 0.82126486, "learning_rate": 1.0273315079664652e-06, "loss": 0.84334981, "num_input_tokens_seen": 120238735, "step": 5587, "time_per_iteration": 2.606456995010376 }, { "auxiliary_loss_clip": 0.01229864, "auxiliary_loss_mlp": 0.01027951, "balance_loss_clip": 1.04959869, "balance_loss_mlp": 1.02096796, "epoch": 0.6719172728912403, "flos": 25485049146240.0, "grad_norm": 2.6876708252991612, "language_loss": 0.74212104, "learning_rate": 1.0266509374803992e-06, "loss": 0.76469922, "num_input_tokens_seen": 120259895, "step": 5588, "time_per_iteration": 2.6926281452178955 }, { "auxiliary_loss_clip": 0.01179414, "auxiliary_loss_mlp": 0.02567841, "balance_loss_clip": 1.05041337, "balance_loss_mlp": 0.99993938, "epoch": 0.6720375157818794, "flos": 15880344969600.0, "grad_norm": 2.564801395423503, "language_loss": 0.84542668, "learning_rate": 1.0259705146506123e-06, "loss": 0.88289928, "num_input_tokens_seen": 120274790, "step": 5589, "time_per_iteration": 2.5299551486968994 }, { "auxiliary_loss_clip": 0.01230123, "auxiliary_loss_mlp": 0.01025992, "balance_loss_clip": 1.04950929, "balance_loss_mlp": 1.01913154, "epoch": 0.6721577586725185, "flos": 32010843231360.0, "grad_norm": 2.1434743146822317, "language_loss": 0.7743656, "learning_rate": 1.025290239580324e-06, "loss": 0.79692674, "num_input_tokens_seen": 120295460, "step": 5590, "time_per_iteration": 2.7838857173919678 }, { "auxiliary_loss_clip": 0.01375993, "auxiliary_loss_mlp": 0.01026652, "balance_loss_clip": 1.04323542, "balance_loss_mlp": 1.01918411, "epoch": 0.6722780015631575, "flos": 20737873837440.0, "grad_norm": 1.9392342272601666, "language_loss": 0.75401914, "learning_rate": 1.0246101123727313e-06, "loss": 0.77804559, "num_input_tokens_seen": 120314440, "step": 5591, "time_per_iteration": 2.735318660736084 }, { "auxiliary_loss_clip": 0.01227746, "auxiliary_loss_mlp": 0.01029972, "balance_loss_clip": 1.0481714, "balance_loss_mlp": 1.02301013, "epoch": 0.6723982444537967, "flos": 16909617191040.0, "grad_norm": 2.4746191766339725, "language_loss": 0.7863903, "learning_rate": 1.0239301331310085e-06, "loss": 0.80896747, "num_input_tokens_seen": 120332060, "step": 5592, "time_per_iteration": 2.672780752182007 }, { "auxiliary_loss_clip": 0.01224586, "auxiliary_loss_mlp": 0.01023218, "balance_loss_clip": 1.04942572, "balance_loss_mlp": 1.01671219, "epoch": 0.6725184873444358, "flos": 20667812359680.0, "grad_norm": 1.7520118090416217, "language_loss": 0.88552541, "learning_rate": 1.0232503019583088e-06, "loss": 0.90800345, "num_input_tokens_seen": 120351670, "step": 5593, "time_per_iteration": 2.62329363822937 }, { "auxiliary_loss_clip": 0.01223181, "auxiliary_loss_mlp": 0.01024288, "balance_loss_clip": 1.04909801, "balance_loss_mlp": 1.01689446, "epoch": 0.6726387302350748, "flos": 23727616416000.0, "grad_norm": 2.0464782714459666, "language_loss": 0.70052189, "learning_rate": 1.0225706189577619e-06, "loss": 0.72299659, "num_input_tokens_seen": 120370195, "step": 5594, "time_per_iteration": 2.6584572792053223 }, { "auxiliary_loss_clip": 0.01228497, "auxiliary_loss_mlp": 0.01027714, "balance_loss_clip": 1.05119705, "balance_loss_mlp": 1.02037644, "epoch": 0.672758973125714, "flos": 15188274650880.0, "grad_norm": 2.052416763784095, "language_loss": 0.74639338, "learning_rate": 1.021891084232475e-06, "loss": 0.76895553, "num_input_tokens_seen": 120388130, "step": 5595, "time_per_iteration": 2.5460262298583984 }, { "auxiliary_loss_clip": 0.01228017, "auxiliary_loss_mlp": 0.01025519, "balance_loss_clip": 1.04835296, "balance_loss_mlp": 1.01822317, "epoch": 0.672879216016353, "flos": 18077252601600.0, "grad_norm": 2.3847622003549027, "language_loss": 0.79696798, "learning_rate": 1.0212116978855325e-06, "loss": 0.81950331, "num_input_tokens_seen": 120406145, "step": 5596, "time_per_iteration": 4.397632837295532 }, { "auxiliary_loss_clip": 0.01320389, "auxiliary_loss_mlp": 0.01024907, "balance_loss_clip": 1.04672766, "balance_loss_mlp": 1.0175159, "epoch": 0.6729994589069921, "flos": 23476349802240.0, "grad_norm": 2.2502161199821424, "language_loss": 0.79025054, "learning_rate": 1.020532460019997e-06, "loss": 0.81370354, "num_input_tokens_seen": 120425395, "step": 5597, "time_per_iteration": 2.7380754947662354 }, { "auxiliary_loss_clip": 0.01476237, "auxiliary_loss_mlp": 0.01027082, "balance_loss_clip": 1.04276514, "balance_loss_mlp": 1.01968551, "epoch": 0.6731197017976313, "flos": 26322018929280.0, "grad_norm": 3.031885451138773, "language_loss": 0.71204442, "learning_rate": 1.0198533707389096e-06, "loss": 0.73707759, "num_input_tokens_seen": 120446270, "step": 5598, "time_per_iteration": 3.0769906044006348 }, { "auxiliary_loss_clip": 0.01225405, "auxiliary_loss_mlp": 0.0256745, "balance_loss_clip": 1.04951501, "balance_loss_mlp": 0.99996507, "epoch": 0.6732399446882703, "flos": 21616428591360.0, "grad_norm": 1.7364839209956702, "language_loss": 0.73253977, "learning_rate": 1.0191744301452853e-06, "loss": 0.77046829, "num_input_tokens_seen": 120465570, "step": 5599, "time_per_iteration": 3.9378957748413086 }, { "auxiliary_loss_clip": 0.01174603, "auxiliary_loss_mlp": 0.0102681, "balance_loss_clip": 1.04758406, "balance_loss_mlp": 1.01979458, "epoch": 0.6733601875789094, "flos": 25880173729920.0, "grad_norm": 2.0239720144109308, "language_loss": 0.70333827, "learning_rate": 1.0184956383421208e-06, "loss": 0.72535235, "num_input_tokens_seen": 120484220, "step": 5600, "time_per_iteration": 2.7296152114868164 }, { "auxiliary_loss_clip": 0.01231567, "auxiliary_loss_mlp": 0.01024986, "balance_loss_clip": 1.0507617, "balance_loss_mlp": 1.01793528, "epoch": 0.6734804304695485, "flos": 22929573997440.0, "grad_norm": 2.4059797444957733, "language_loss": 0.65669161, "learning_rate": 1.017816995432387e-06, "loss": 0.67925715, "num_input_tokens_seen": 120503320, "step": 5601, "time_per_iteration": 2.6747119426727295 }, { "auxiliary_loss_clip": 0.01274238, "auxiliary_loss_mlp": 0.01024037, "balance_loss_clip": 1.0468328, "balance_loss_mlp": 1.01752543, "epoch": 0.6736006733601876, "flos": 18697968552960.0, "grad_norm": 7.723861690522186, "language_loss": 0.74046087, "learning_rate": 1.0171385015190353e-06, "loss": 0.76344359, "num_input_tokens_seen": 120523180, "step": 5602, "time_per_iteration": 2.6880056858062744 }, { "auxiliary_loss_clip": 0.01276467, "auxiliary_loss_mlp": 0.02565142, "balance_loss_clip": 1.05141437, "balance_loss_mlp": 0.99991649, "epoch": 0.6737209162508266, "flos": 19427745173760.0, "grad_norm": 1.9573512777289763, "language_loss": 0.73793697, "learning_rate": 1.0164601567049908e-06, "loss": 0.77635306, "num_input_tokens_seen": 120541710, "step": 5603, "time_per_iteration": 3.854444742202759 }, { "auxiliary_loss_clip": 0.01280607, "auxiliary_loss_mlp": 0.01029308, "balance_loss_clip": 1.04870772, "balance_loss_mlp": 1.0218606, "epoch": 0.6738411591414658, "flos": 20158060498560.0, "grad_norm": 1.9243560925337178, "language_loss": 0.803868, "learning_rate": 1.015781961093158e-06, "loss": 0.82696712, "num_input_tokens_seen": 120561030, "step": 5604, "time_per_iteration": 2.6697568893432617 }, { "auxiliary_loss_clip": 0.01277983, "auxiliary_loss_mlp": 0.01027483, "balance_loss_clip": 1.04496837, "balance_loss_mlp": 1.02022314, "epoch": 0.6739614020321049, "flos": 21653847584640.0, "grad_norm": 1.5910099515217886, "language_loss": 0.77140808, "learning_rate": 1.0151039147864197e-06, "loss": 0.79446274, "num_input_tokens_seen": 120581005, "step": 5605, "time_per_iteration": 2.6987903118133545 }, { "auxiliary_loss_clip": 0.01474731, "auxiliary_loss_mlp": 0.01028423, "balance_loss_clip": 1.05039334, "balance_loss_mlp": 1.02088857, "epoch": 0.6740816449227439, "flos": 19171702051200.0, "grad_norm": 2.615116744511832, "language_loss": 0.66014588, "learning_rate": 1.0144260178876336e-06, "loss": 0.68517739, "num_input_tokens_seen": 120600350, "step": 5606, "time_per_iteration": 2.930556058883667 }, { "auxiliary_loss_clip": 0.01283006, "auxiliary_loss_mlp": 0.01022493, "balance_loss_clip": 1.04785609, "balance_loss_mlp": 1.01601434, "epoch": 0.6742018878133831, "flos": 21097015971840.0, "grad_norm": 3.218628284791756, "language_loss": 0.67821574, "learning_rate": 1.0137482704996388e-06, "loss": 0.70127076, "num_input_tokens_seen": 120614700, "step": 5607, "time_per_iteration": 2.769575595855713 }, { "auxiliary_loss_clip": 0.01329213, "auxiliary_loss_mlp": 0.01031838, "balance_loss_clip": 1.04827583, "balance_loss_mlp": 1.02452493, "epoch": 0.6743221307040221, "flos": 23549966726400.0, "grad_norm": 3.547734277126329, "language_loss": 0.78719258, "learning_rate": 1.0130706727252461e-06, "loss": 0.81080312, "num_input_tokens_seen": 120631755, "step": 5608, "time_per_iteration": 2.696565866470337 }, { "auxiliary_loss_clip": 0.01326529, "auxiliary_loss_mlp": 0.01024883, "balance_loss_clip": 1.04661989, "balance_loss_mlp": 1.01802552, "epoch": 0.6744423735946612, "flos": 16249542912000.0, "grad_norm": 3.109336959465384, "language_loss": 0.67726475, "learning_rate": 1.0123932246672468e-06, "loss": 0.70077884, "num_input_tokens_seen": 120645900, "step": 5609, "time_per_iteration": 2.7819058895111084 }, { "auxiliary_loss_clip": 0.0127217, "auxiliary_loss_mlp": 0.02507057, "balance_loss_clip": 1.00906444, "balance_loss_mlp": 0.99983454, "epoch": 0.6745626164853004, "flos": 57843257829120.0, "grad_norm": 0.7448829340269733, "language_loss": 0.55764842, "learning_rate": 1.0117159264284114e-06, "loss": 0.59544063, "num_input_tokens_seen": 120709070, "step": 5610, "time_per_iteration": 3.3621935844421387 }, { "auxiliary_loss_clip": 0.01277974, "auxiliary_loss_mlp": 0.01024892, "balance_loss_clip": 1.04860783, "balance_loss_mlp": 1.01790357, "epoch": 0.6746828593759394, "flos": 20485027025280.0, "grad_norm": 1.9434949557435146, "language_loss": 0.76957762, "learning_rate": 1.0110387781114837e-06, "loss": 0.79260629, "num_input_tokens_seen": 120727685, "step": 5611, "time_per_iteration": 2.843716621398926 }, { "auxiliary_loss_clip": 0.01175358, "auxiliary_loss_mlp": 0.01023586, "balance_loss_clip": 1.04949069, "balance_loss_mlp": 1.01631379, "epoch": 0.6748031022665785, "flos": 19208223204480.0, "grad_norm": 2.227642783321111, "language_loss": 0.77120113, "learning_rate": 1.0103617798191872e-06, "loss": 0.7931906, "num_input_tokens_seen": 120747160, "step": 5612, "time_per_iteration": 2.613525629043579 }, { "auxiliary_loss_clip": 0.01278128, "auxiliary_loss_mlp": 0.01023834, "balance_loss_clip": 1.05003786, "balance_loss_mlp": 1.01656246, "epoch": 0.6749233451572175, "flos": 15195026407680.0, "grad_norm": 2.898200117055019, "language_loss": 0.82500875, "learning_rate": 1.0096849316542217e-06, "loss": 0.84802842, "num_input_tokens_seen": 120763710, "step": 5613, "time_per_iteration": 2.6423797607421875 }, { "auxiliary_loss_clip": 0.01459834, "auxiliary_loss_mlp": 0.01024178, "balance_loss_clip": 1.03789258, "balance_loss_mlp": 1.01683736, "epoch": 0.6750435880478567, "flos": 26499489050880.0, "grad_norm": 2.0650612665310444, "language_loss": 0.753106, "learning_rate": 1.0090082337192643e-06, "loss": 0.77794605, "num_input_tokens_seen": 120783355, "step": 5614, "time_per_iteration": 2.9286766052246094 }, { "auxiliary_loss_clip": 0.01417499, "auxiliary_loss_mlp": 0.01022234, "balance_loss_clip": 1.03688312, "balance_loss_mlp": 1.01535594, "epoch": 0.6751638309384957, "flos": 23404313076480.0, "grad_norm": 2.133943121228115, "language_loss": 0.78412151, "learning_rate": 1.0083316861169705e-06, "loss": 0.80851883, "num_input_tokens_seen": 120802090, "step": 5615, "time_per_iteration": 3.498469114303589 }, { "auxiliary_loss_clip": 0.01332115, "auxiliary_loss_mlp": 0.01025963, "balance_loss_clip": 1.04570651, "balance_loss_mlp": 1.0181365, "epoch": 0.6752840738291348, "flos": 23441408847360.0, "grad_norm": 2.1151483113056484, "language_loss": 0.72089994, "learning_rate": 1.0076552889499713e-06, "loss": 0.74448073, "num_input_tokens_seen": 120822855, "step": 5616, "time_per_iteration": 2.7501678466796875 }, { "auxiliary_loss_clip": 0.01225192, "auxiliary_loss_mlp": 0.01023255, "balance_loss_clip": 1.05169594, "balance_loss_mlp": 1.01650739, "epoch": 0.675404316719774, "flos": 30335826257280.0, "grad_norm": 2.186101669168665, "language_loss": 0.73795855, "learning_rate": 1.006979042320876e-06, "loss": 0.76044297, "num_input_tokens_seen": 120843070, "step": 5617, "time_per_iteration": 2.6787331104278564 }, { "auxiliary_loss_clip": 0.01275787, "auxiliary_loss_mlp": 0.01024987, "balance_loss_clip": 1.04541945, "balance_loss_mlp": 1.01772761, "epoch": 0.675524559610413, "flos": 23622613983360.0, "grad_norm": 2.509908302990683, "language_loss": 0.63004053, "learning_rate": 1.0063029463322702e-06, "loss": 0.65304828, "num_input_tokens_seen": 120863345, "step": 5618, "time_per_iteration": 2.682905912399292 }, { "auxiliary_loss_clip": 0.01369078, "auxiliary_loss_mlp": 0.0256628, "balance_loss_clip": 1.04132068, "balance_loss_mlp": 0.99996185, "epoch": 0.6756448025010521, "flos": 21248631279360.0, "grad_norm": 2.0714597204467617, "language_loss": 0.75825465, "learning_rate": 1.0056270010867164e-06, "loss": 0.79760826, "num_input_tokens_seen": 120880915, "step": 5619, "time_per_iteration": 2.796499490737915 }, { "auxiliary_loss_clip": 0.01277678, "auxiliary_loss_mlp": 0.01021467, "balance_loss_clip": 1.0435183, "balance_loss_mlp": 1.01470494, "epoch": 0.6757650453916912, "flos": 21646521210240.0, "grad_norm": 3.814683059950269, "language_loss": 0.78349447, "learning_rate": 1.004951206686758e-06, "loss": 0.80648589, "num_input_tokens_seen": 120899190, "step": 5620, "time_per_iteration": 2.6802000999450684 }, { "auxiliary_loss_clip": 0.01226477, "auxiliary_loss_mlp": 0.01027312, "balance_loss_clip": 1.04898965, "balance_loss_mlp": 1.02073753, "epoch": 0.6758852882823303, "flos": 21795658479360.0, "grad_norm": 2.912257298731784, "language_loss": 0.71543908, "learning_rate": 1.0042755632349087e-06, "loss": 0.73797697, "num_input_tokens_seen": 120916080, "step": 5621, "time_per_iteration": 4.4816811084747314 }, { "auxiliary_loss_clip": 0.01324768, "auxiliary_loss_mlp": 0.01028947, "balance_loss_clip": 1.0461812, "balance_loss_mlp": 1.02183652, "epoch": 0.6760055311729694, "flos": 27088783580160.0, "grad_norm": 2.2349884272639815, "language_loss": 0.6306833, "learning_rate": 1.0036000708336653e-06, "loss": 0.65422046, "num_input_tokens_seen": 120935210, "step": 5622, "time_per_iteration": 2.777637481689453 }, { "auxiliary_loss_clip": 0.01279661, "auxiliary_loss_mlp": 0.01028025, "balance_loss_clip": 1.04988158, "balance_loss_mlp": 1.0207864, "epoch": 0.6761257740636085, "flos": 17999792922240.0, "grad_norm": 1.9246670159472827, "language_loss": 0.79580134, "learning_rate": 1.0029247295854984e-06, "loss": 0.81887817, "num_input_tokens_seen": 120951830, "step": 5623, "time_per_iteration": 2.700270891189575 }, { "auxiliary_loss_clip": 0.01334032, "auxiliary_loss_mlp": 0.01027735, "balance_loss_clip": 1.04906058, "balance_loss_mlp": 1.02080023, "epoch": 0.6762460169542476, "flos": 15121912273920.0, "grad_norm": 1.9346962504688938, "language_loss": 0.71888411, "learning_rate": 1.0022495395928588e-06, "loss": 0.74250174, "num_input_tokens_seen": 120970310, "step": 5624, "time_per_iteration": 2.7219483852386475 }, { "auxiliary_loss_clip": 0.01065605, "auxiliary_loss_mlp": 0.01001422, "balance_loss_clip": 1.00949347, "balance_loss_mlp": 1.00043857, "epoch": 0.6763662598448866, "flos": 67886970030720.0, "grad_norm": 0.7983165056331405, "language_loss": 0.62319434, "learning_rate": 1.0015745009581697e-06, "loss": 0.64386463, "num_input_tokens_seen": 121031915, "step": 5625, "time_per_iteration": 4.08286452293396 }, { "auxiliary_loss_clip": 0.01224823, "auxiliary_loss_mlp": 0.01026451, "balance_loss_clip": 1.04964983, "balance_loss_mlp": 1.01943576, "epoch": 0.6764865027355258, "flos": 20631829910400.0, "grad_norm": 2.9605406573615287, "language_loss": 0.67123133, "learning_rate": 1.0008996137838343e-06, "loss": 0.69374406, "num_input_tokens_seen": 121050890, "step": 5626, "time_per_iteration": 2.6052463054656982 }, { "auxiliary_loss_clip": 0.0118384, "auxiliary_loss_mlp": 0.01022908, "balance_loss_clip": 1.0523386, "balance_loss_mlp": 1.01479602, "epoch": 0.6766067456261649, "flos": 21215809226880.0, "grad_norm": 2.2697094566510785, "language_loss": 0.79923022, "learning_rate": 1.000224878172234e-06, "loss": 0.82129771, "num_input_tokens_seen": 121070015, "step": 5627, "time_per_iteration": 2.7874865531921387 }, { "auxiliary_loss_clip": 0.01128138, "auxiliary_loss_mlp": 0.01027418, "balance_loss_clip": 1.04912496, "balance_loss_mlp": 1.02066469, "epoch": 0.6767269885168039, "flos": 19938251220480.0, "grad_norm": 2.189889270254064, "language_loss": 0.72885346, "learning_rate": 9.99550294225724e-07, "loss": 0.75040901, "num_input_tokens_seen": 121089170, "step": 5628, "time_per_iteration": 2.679964065551758 }, { "auxiliary_loss_clip": 0.01383823, "auxiliary_loss_mlp": 0.0103051, "balance_loss_clip": 1.04408622, "balance_loss_mlp": 1.02337551, "epoch": 0.6768472314074431, "flos": 20814076540800.0, "grad_norm": 1.8561657310003972, "language_loss": 0.72422582, "learning_rate": 9.988758620466402e-07, "loss": 0.74836922, "num_input_tokens_seen": 121108040, "step": 5629, "time_per_iteration": 3.690739393234253 }, { "auxiliary_loss_clip": 0.01328637, "auxiliary_loss_mlp": 0.01023201, "balance_loss_clip": 1.04321611, "balance_loss_mlp": 1.01676643, "epoch": 0.6769674742980821, "flos": 23186012169600.0, "grad_norm": 1.8599517955722473, "language_loss": 0.76334089, "learning_rate": 9.982015817372917e-07, "loss": 0.78685933, "num_input_tokens_seen": 121128480, "step": 5630, "time_per_iteration": 2.914457082748413 }, { "auxiliary_loss_clip": 0.01375633, "auxiliary_loss_mlp": 0.01026304, "balance_loss_clip": 1.04180789, "balance_loss_mlp": 1.01882696, "epoch": 0.6770877171887212, "flos": 24242934885120.0, "grad_norm": 3.5349904504390555, "language_loss": 0.81821579, "learning_rate": 9.975274533999657e-07, "loss": 0.84223515, "num_input_tokens_seen": 121148010, "step": 5631, "time_per_iteration": 2.9193122386932373 }, { "auxiliary_loss_clip": 0.01176885, "auxiliary_loss_mlp": 0.01028246, "balance_loss_clip": 1.04884768, "balance_loss_mlp": 1.02013934, "epoch": 0.6772079600793603, "flos": 18141567903360.0, "grad_norm": 3.35708127981289, "language_loss": 0.84093606, "learning_rate": 9.96853477136929e-07, "loss": 0.86298728, "num_input_tokens_seen": 121162755, "step": 5632, "time_per_iteration": 2.672593116760254 }, { "auxiliary_loss_clip": 0.01317071, "auxiliary_loss_mlp": 0.01026348, "balance_loss_clip": 1.04186964, "balance_loss_mlp": 1.01933861, "epoch": 0.6773282029699994, "flos": 22452069571200.0, "grad_norm": 11.112346455058685, "language_loss": 0.75633907, "learning_rate": 9.96179653050422e-07, "loss": 0.77977324, "num_input_tokens_seen": 121182915, "step": 5633, "time_per_iteration": 2.721611976623535 }, { "auxiliary_loss_clip": 0.01323726, "auxiliary_loss_mlp": 0.01032415, "balance_loss_clip": 1.04776001, "balance_loss_mlp": 1.02509546, "epoch": 0.6774484458606385, "flos": 18693730748160.0, "grad_norm": 2.0443028294634558, "language_loss": 0.74201095, "learning_rate": 9.955059812426635e-07, "loss": 0.76557231, "num_input_tokens_seen": 121200445, "step": 5634, "time_per_iteration": 2.74883770942688 }, { "auxiliary_loss_clip": 0.01178223, "auxiliary_loss_mlp": 0.01027675, "balance_loss_clip": 1.05222487, "balance_loss_mlp": 1.02070117, "epoch": 0.6775686887512776, "flos": 25994046821760.0, "grad_norm": 2.3131419210518436, "language_loss": 0.82775629, "learning_rate": 9.948324618158493e-07, "loss": 0.84981531, "num_input_tokens_seen": 121220785, "step": 5635, "time_per_iteration": 2.7032361030578613 }, { "auxiliary_loss_clip": 0.01228238, "auxiliary_loss_mlp": 0.01025846, "balance_loss_clip": 1.04805148, "balance_loss_mlp": 1.01843131, "epoch": 0.6776889316419167, "flos": 13587987922560.0, "grad_norm": 2.2275301147943387, "language_loss": 0.77279955, "learning_rate": 9.941590948721502e-07, "loss": 0.79534042, "num_input_tokens_seen": 121237985, "step": 5636, "time_per_iteration": 2.668509006500244 }, { "auxiliary_loss_clip": 0.01269051, "auxiliary_loss_mlp": 0.01024663, "balance_loss_clip": 1.04831707, "balance_loss_mlp": 1.01824069, "epoch": 0.6778091745325557, "flos": 27601121220480.0, "grad_norm": 1.84171945740778, "language_loss": 0.76568866, "learning_rate": 9.934858805137188e-07, "loss": 0.78862584, "num_input_tokens_seen": 121258635, "step": 5637, "time_per_iteration": 2.705293893814087 }, { "auxiliary_loss_clip": 0.01220406, "auxiliary_loss_mlp": 0.01023388, "balance_loss_clip": 1.04871011, "balance_loss_mlp": 1.01654518, "epoch": 0.6779294174231949, "flos": 18734058743040.0, "grad_norm": 1.610733008742841, "language_loss": 0.80667698, "learning_rate": 9.92812818842677e-07, "loss": 0.82911497, "num_input_tokens_seen": 121277810, "step": 5638, "time_per_iteration": 2.633744239807129 }, { "auxiliary_loss_clip": 0.01228849, "auxiliary_loss_mlp": 0.01024362, "balance_loss_clip": 1.05001235, "balance_loss_mlp": 1.0171771, "epoch": 0.678049660313834, "flos": 45873797765760.0, "grad_norm": 1.9040382446368882, "language_loss": 0.64129335, "learning_rate": 9.921399099611306e-07, "loss": 0.66382551, "num_input_tokens_seen": 121298975, "step": 5639, "time_per_iteration": 2.857239246368408 }, { "auxiliary_loss_clip": 0.01274494, "auxiliary_loss_mlp": 0.01022252, "balance_loss_clip": 1.04551971, "balance_loss_mlp": 1.01572275, "epoch": 0.678169903204473, "flos": 19974556892160.0, "grad_norm": 1.964877109951649, "language_loss": 0.68859744, "learning_rate": 9.914671539711588e-07, "loss": 0.7115649, "num_input_tokens_seen": 121318495, "step": 5640, "time_per_iteration": 2.7055819034576416 }, { "auxiliary_loss_clip": 0.01523702, "auxiliary_loss_mlp": 0.02568455, "balance_loss_clip": 1.04381382, "balance_loss_mlp": 0.99991, "epoch": 0.6782901460951122, "flos": 21395613732480.0, "grad_norm": 2.6449471832844065, "language_loss": 0.78465658, "learning_rate": 9.90794550974817e-07, "loss": 0.82557821, "num_input_tokens_seen": 121338890, "step": 5641, "time_per_iteration": 2.949815511703491 }, { "auxiliary_loss_clip": 0.01324366, "auxiliary_loss_mlp": 0.01027033, "balance_loss_clip": 1.04647088, "balance_loss_mlp": 1.01979744, "epoch": 0.6784103889857512, "flos": 21434002392960.0, "grad_norm": 2.1310128730629283, "language_loss": 0.81480026, "learning_rate": 9.901221010741407e-07, "loss": 0.83831418, "num_input_tokens_seen": 121358210, "step": 5642, "time_per_iteration": 3.3170065879821777 }, { "auxiliary_loss_clip": 0.0122927, "auxiliary_loss_mlp": 0.01028583, "balance_loss_clip": 1.04843175, "balance_loss_mlp": 1.02156138, "epoch": 0.6785306318763903, "flos": 32671923091200.0, "grad_norm": 2.5321141212847595, "language_loss": 0.74980623, "learning_rate": 9.894498043711375e-07, "loss": 0.77238476, "num_input_tokens_seen": 121379955, "step": 5643, "time_per_iteration": 2.800082206726074 }, { "auxiliary_loss_clip": 0.01271461, "auxiliary_loss_mlp": 0.0102624, "balance_loss_clip": 1.04501092, "balance_loss_mlp": 1.0191294, "epoch": 0.6786508747670293, "flos": 25632139340160.0, "grad_norm": 2.6479819599431167, "language_loss": 0.69502234, "learning_rate": 9.887776609677962e-07, "loss": 0.71799934, "num_input_tokens_seen": 121401325, "step": 5644, "time_per_iteration": 2.7338764667510986 }, { "auxiliary_loss_clip": 0.01318775, "auxiliary_loss_mlp": 0.01028091, "balance_loss_clip": 1.04099846, "balance_loss_mlp": 1.0211556, "epoch": 0.6787711176576685, "flos": 19171881619200.0, "grad_norm": 2.1180849562701987, "language_loss": 0.72642684, "learning_rate": 9.88105670966079e-07, "loss": 0.74989545, "num_input_tokens_seen": 121419785, "step": 5645, "time_per_iteration": 2.7269327640533447 }, { "auxiliary_loss_clip": 0.01364505, "auxiliary_loss_mlp": 0.01027941, "balance_loss_clip": 1.04433072, "balance_loss_mlp": 1.02062774, "epoch": 0.6788913605483076, "flos": 13985159581440.0, "grad_norm": 2.6571087061554075, "language_loss": 0.79125094, "learning_rate": 9.874338344679283e-07, "loss": 0.81517541, "num_input_tokens_seen": 121435630, "step": 5646, "time_per_iteration": 2.727947235107422 }, { "auxiliary_loss_clip": 0.01173758, "auxiliary_loss_mlp": 0.01023384, "balance_loss_clip": 1.0502317, "balance_loss_mlp": 1.01692033, "epoch": 0.6790116034389466, "flos": 22017586659840.0, "grad_norm": 1.8081323050048514, "language_loss": 0.74201095, "learning_rate": 9.86762151575259e-07, "loss": 0.76398236, "num_input_tokens_seen": 121455625, "step": 5647, "time_per_iteration": 4.671745538711548 }, { "auxiliary_loss_clip": 0.01272844, "auxiliary_loss_mlp": 0.02561721, "balance_loss_clip": 1.04667222, "balance_loss_mlp": 0.99993587, "epoch": 0.6791318463295858, "flos": 20922454851840.0, "grad_norm": 1.7486181817525948, "language_loss": 0.80319262, "learning_rate": 9.860906223899651e-07, "loss": 0.84153819, "num_input_tokens_seen": 121475020, "step": 5648, "time_per_iteration": 2.794055700302124 }, { "auxiliary_loss_clip": 0.01282177, "auxiliary_loss_mlp": 0.01029555, "balance_loss_clip": 1.04848504, "balance_loss_mlp": 1.02245617, "epoch": 0.6792520892202248, "flos": 28512749422080.0, "grad_norm": 1.708280942138804, "language_loss": 0.75775123, "learning_rate": 9.854192470139184e-07, "loss": 0.78086853, "num_input_tokens_seen": 121496500, "step": 5649, "time_per_iteration": 2.7461514472961426 }, { "auxiliary_loss_clip": 0.01276285, "auxiliary_loss_mlp": 0.01022114, "balance_loss_clip": 1.05200863, "balance_loss_mlp": 1.01523542, "epoch": 0.6793723321108639, "flos": 20011904058240.0, "grad_norm": 2.6300929516984333, "language_loss": 0.71276814, "learning_rate": 9.847480255489645e-07, "loss": 0.73575211, "num_input_tokens_seen": 121515525, "step": 5650, "time_per_iteration": 3.5870859622955322 }, { "auxiliary_loss_clip": 0.01280027, "auxiliary_loss_mlp": 0.01027225, "balance_loss_clip": 1.04835701, "balance_loss_mlp": 1.02000403, "epoch": 0.6794925750015031, "flos": 26649488246400.0, "grad_norm": 1.8327972801879975, "language_loss": 0.69112813, "learning_rate": 9.840769580969295e-07, "loss": 0.71420068, "num_input_tokens_seen": 121535965, "step": 5651, "time_per_iteration": 2.724898338317871 }, { "auxiliary_loss_clip": 0.01220624, "auxiliary_loss_mlp": 0.01028632, "balance_loss_clip": 1.04625058, "balance_loss_mlp": 1.02137184, "epoch": 0.6796128178921421, "flos": 21580374314880.0, "grad_norm": 2.2736757935076013, "language_loss": 0.7986744, "learning_rate": 9.834060447596114e-07, "loss": 0.82116693, "num_input_tokens_seen": 121555235, "step": 5652, "time_per_iteration": 2.593921661376953 }, { "auxiliary_loss_clip": 0.01225336, "auxiliary_loss_mlp": 0.01029119, "balance_loss_clip": 1.04667902, "balance_loss_mlp": 1.02171004, "epoch": 0.6797330607827812, "flos": 22492002516480.0, "grad_norm": 2.0713616863105737, "language_loss": 0.78383338, "learning_rate": 9.827352856387868e-07, "loss": 0.80637795, "num_input_tokens_seen": 121574945, "step": 5653, "time_per_iteration": 2.7752583026885986 }, { "auxiliary_loss_clip": 0.01277022, "auxiliary_loss_mlp": 0.0099963, "balance_loss_clip": 1.012429, "balance_loss_mlp": 0.99873585, "epoch": 0.6798533036734203, "flos": 66306648286080.0, "grad_norm": 0.7781990583417483, "language_loss": 0.64292991, "learning_rate": 9.820646808362118e-07, "loss": 0.66569638, "num_input_tokens_seen": 121641200, "step": 5654, "time_per_iteration": 4.481834411621094 }, { "auxiliary_loss_clip": 0.01271924, "auxiliary_loss_mlp": 0.0102282, "balance_loss_clip": 1.04949772, "balance_loss_mlp": 1.01579881, "epoch": 0.6799735465640594, "flos": 16180163792640.0, "grad_norm": 2.0720199815409446, "language_loss": 0.72724295, "learning_rate": 9.813942304536154e-07, "loss": 0.75019032, "num_input_tokens_seen": 121659170, "step": 5655, "time_per_iteration": 2.808762788772583 }, { "auxiliary_loss_clip": 0.01277536, "auxiliary_loss_mlp": 0.01024071, "balance_loss_clip": 1.04740393, "balance_loss_mlp": 1.01691198, "epoch": 0.6800937894546984, "flos": 22125749489280.0, "grad_norm": 1.8692674359696668, "language_loss": 0.63610029, "learning_rate": 9.807239345927043e-07, "loss": 0.65911639, "num_input_tokens_seen": 121679180, "step": 5656, "time_per_iteration": 2.737557888031006 }, { "auxiliary_loss_clip": 0.0117944, "auxiliary_loss_mlp": 0.01024556, "balance_loss_clip": 1.04568911, "balance_loss_mlp": 1.01722813, "epoch": 0.6802140323453376, "flos": 31612953300480.0, "grad_norm": 2.333451316877656, "language_loss": 0.72252059, "learning_rate": 9.80053793355162e-07, "loss": 0.7445606, "num_input_tokens_seen": 121697875, "step": 5657, "time_per_iteration": 2.7595248222351074 }, { "auxiliary_loss_clip": 0.01379949, "auxiliary_loss_mlp": 0.01024768, "balance_loss_clip": 1.04919052, "balance_loss_mlp": 1.01768684, "epoch": 0.6803342752359767, "flos": 17712938908800.0, "grad_norm": 2.0869753363975905, "language_loss": 0.74988353, "learning_rate": 9.793838068426472e-07, "loss": 0.77393073, "num_input_tokens_seen": 121715570, "step": 5658, "time_per_iteration": 2.788546562194824 }, { "auxiliary_loss_clip": 0.01177691, "auxiliary_loss_mlp": 0.01029408, "balance_loss_clip": 1.05215764, "balance_loss_mlp": 1.02197492, "epoch": 0.6804545181266157, "flos": 11326800902400.0, "grad_norm": 2.029726971046754, "language_loss": 0.61124218, "learning_rate": 9.78713975156799e-07, "loss": 0.63331318, "num_input_tokens_seen": 121731435, "step": 5659, "time_per_iteration": 2.571359872817993 }, { "auxiliary_loss_clip": 0.01332293, "auxiliary_loss_mlp": 0.01026743, "balance_loss_clip": 1.05124009, "balance_loss_mlp": 1.01934659, "epoch": 0.6805747610172549, "flos": 29350976181120.0, "grad_norm": 1.7939877098624293, "language_loss": 0.71689057, "learning_rate": 9.780442983992273e-07, "loss": 0.74048102, "num_input_tokens_seen": 121749950, "step": 5660, "time_per_iteration": 2.75433611869812 }, { "auxiliary_loss_clip": 0.01270659, "auxiliary_loss_mlp": 0.0102819, "balance_loss_clip": 1.04613066, "balance_loss_mlp": 1.02109742, "epoch": 0.680695003907894, "flos": 37631868612480.0, "grad_norm": 1.934409976476223, "language_loss": 0.71808815, "learning_rate": 9.773747766715238e-07, "loss": 0.74107659, "num_input_tokens_seen": 121770770, "step": 5661, "time_per_iteration": 2.8214199542999268 }, { "auxiliary_loss_clip": 0.01274635, "auxiliary_loss_mlp": 0.01023822, "balance_loss_clip": 1.04496086, "balance_loss_mlp": 1.01665783, "epoch": 0.680815246798533, "flos": 22127365601280.0, "grad_norm": 1.6886575844507006, "language_loss": 0.80234563, "learning_rate": 9.767054100752536e-07, "loss": 0.82533026, "num_input_tokens_seen": 121790720, "step": 5662, "time_per_iteration": 2.612154960632324 }, { "auxiliary_loss_clip": 0.01226437, "auxiliary_loss_mlp": 0.0102362, "balance_loss_clip": 1.04737556, "balance_loss_mlp": 1.016271, "epoch": 0.6809354896891722, "flos": 17201822330880.0, "grad_norm": 2.3747885577257892, "language_loss": 0.81571734, "learning_rate": 9.760361987119584e-07, "loss": 0.83821785, "num_input_tokens_seen": 121808455, "step": 5663, "time_per_iteration": 2.676050901412964 }, { "auxiliary_loss_clip": 0.01272944, "auxiliary_loss_mlp": 0.01024654, "balance_loss_clip": 1.04907846, "balance_loss_mlp": 1.01740575, "epoch": 0.6810557325798112, "flos": 12458166554880.0, "grad_norm": 3.538853926142097, "language_loss": 0.67850333, "learning_rate": 9.753671426831592e-07, "loss": 0.70147932, "num_input_tokens_seen": 121824470, "step": 5664, "time_per_iteration": 2.608029842376709 }, { "auxiliary_loss_clip": 0.01220018, "auxiliary_loss_mlp": 0.01024268, "balance_loss_clip": 1.04518378, "balance_loss_mlp": 1.01690078, "epoch": 0.6811759754704503, "flos": 22156165330560.0, "grad_norm": 2.051530452164032, "language_loss": 0.79804707, "learning_rate": 9.746982420903483e-07, "loss": 0.82049, "num_input_tokens_seen": 121842665, "step": 5665, "time_per_iteration": 2.666740894317627 }, { "auxiliary_loss_clip": 0.01228484, "auxiliary_loss_mlp": 0.01022784, "balance_loss_clip": 1.05262399, "balance_loss_mlp": 1.01579571, "epoch": 0.6812962183610894, "flos": 17525377065600.0, "grad_norm": 2.0549577069544878, "language_loss": 0.74642146, "learning_rate": 9.740294970349993e-07, "loss": 0.76893413, "num_input_tokens_seen": 121859080, "step": 5666, "time_per_iteration": 2.5737645626068115 }, { "auxiliary_loss_clip": 0.01169732, "auxiliary_loss_mlp": 0.00999722, "balance_loss_clip": 1.0086422, "balance_loss_mlp": 0.99876803, "epoch": 0.6814164612517285, "flos": 60274480855680.0, "grad_norm": 0.9031821921517337, "language_loss": 0.60845721, "learning_rate": 9.733609076185594e-07, "loss": 0.63015163, "num_input_tokens_seen": 121915485, "step": 5667, "time_per_iteration": 3.115020751953125 }, { "auxiliary_loss_clip": 0.01226088, "auxiliary_loss_mlp": 0.0102764, "balance_loss_clip": 1.04991984, "balance_loss_mlp": 1.02066934, "epoch": 0.6815367041423676, "flos": 19317750750720.0, "grad_norm": 2.588583308675847, "language_loss": 0.84142435, "learning_rate": 9.72692473942455e-07, "loss": 0.86396158, "num_input_tokens_seen": 121932710, "step": 5668, "time_per_iteration": 2.6086370944976807 }, { "auxiliary_loss_clip": 0.01384823, "auxiliary_loss_mlp": 0.01029237, "balance_loss_clip": 1.05037212, "balance_loss_mlp": 1.02186394, "epoch": 0.6816569470330067, "flos": 22161696024960.0, "grad_norm": 1.735544058324277, "language_loss": 0.77616847, "learning_rate": 9.720241961080849e-07, "loss": 0.80030912, "num_input_tokens_seen": 121952025, "step": 5669, "time_per_iteration": 2.7292754650115967 }, { "auxiliary_loss_clip": 0.01176123, "auxiliary_loss_mlp": 0.01024447, "balance_loss_clip": 1.04936028, "balance_loss_mlp": 1.01748478, "epoch": 0.6817771899236458, "flos": 41463501137280.0, "grad_norm": 2.010274318268284, "language_loss": 0.72936106, "learning_rate": 9.713560742168259e-07, "loss": 0.75136673, "num_input_tokens_seen": 121974650, "step": 5670, "time_per_iteration": 2.7083382606506348 }, { "auxiliary_loss_clip": 0.01225128, "auxiliary_loss_mlp": 0.01025326, "balance_loss_clip": 1.04628086, "balance_loss_mlp": 1.01812017, "epoch": 0.6818974328142848, "flos": 21106138026240.0, "grad_norm": 4.246766151782642, "language_loss": 0.71378565, "learning_rate": 9.706881083700333e-07, "loss": 0.7362901, "num_input_tokens_seen": 121994335, "step": 5671, "time_per_iteration": 2.6749114990234375 }, { "auxiliary_loss_clip": 0.01425416, "auxiliary_loss_mlp": 0.01024422, "balance_loss_clip": 1.04872787, "balance_loss_mlp": 1.01716506, "epoch": 0.682017675704924, "flos": 20441897769600.0, "grad_norm": 2.2108689901389864, "language_loss": 0.82539642, "learning_rate": 9.700202986690357e-07, "loss": 0.84989482, "num_input_tokens_seen": 122012635, "step": 5672, "time_per_iteration": 2.8656537532806396 }, { "auxiliary_loss_clip": 0.01222395, "auxiliary_loss_mlp": 0.02568969, "balance_loss_clip": 1.04724967, "balance_loss_mlp": 0.99990785, "epoch": 0.682137918595563, "flos": 20044438801920.0, "grad_norm": 2.2904661231886148, "language_loss": 0.66750574, "learning_rate": 9.693526452151413e-07, "loss": 0.70541942, "num_input_tokens_seen": 122031685, "step": 5673, "time_per_iteration": 4.341189622879028 }, { "auxiliary_loss_clip": 0.01333591, "auxiliary_loss_mlp": 0.01027269, "balance_loss_clip": 1.04561198, "balance_loss_mlp": 1.01896048, "epoch": 0.6822581614862021, "flos": 31684559063040.0, "grad_norm": 1.8363575606245934, "language_loss": 0.75870782, "learning_rate": 9.686851481096305e-07, "loss": 0.78231645, "num_input_tokens_seen": 122052995, "step": 5674, "time_per_iteration": 2.7774531841278076 }, { "auxiliary_loss_clip": 0.01325364, "auxiliary_loss_mlp": 0.0102618, "balance_loss_clip": 1.04450834, "balance_loss_mlp": 1.01869321, "epoch": 0.6823784043768413, "flos": 23477570864640.0, "grad_norm": 2.2081260206024287, "language_loss": 0.71577311, "learning_rate": 9.68017807453762e-07, "loss": 0.73928851, "num_input_tokens_seen": 122071740, "step": 5675, "time_per_iteration": 2.7937445640563965 }, { "auxiliary_loss_clip": 0.01277966, "auxiliary_loss_mlp": 0.02565002, "balance_loss_clip": 1.0500989, "balance_loss_mlp": 0.99986863, "epoch": 0.6824986472674803, "flos": 14137134024960.0, "grad_norm": 1.7752346636873464, "language_loss": 0.73133993, "learning_rate": 9.673506233487721e-07, "loss": 0.76976967, "num_input_tokens_seen": 122089705, "step": 5676, "time_per_iteration": 3.5410282611846924 }, { "auxiliary_loss_clip": 0.01273209, "auxiliary_loss_mlp": 0.02560392, "balance_loss_clip": 1.0461396, "balance_loss_mlp": 0.99990594, "epoch": 0.6826188901581194, "flos": 21504997624320.0, "grad_norm": 5.035075896186554, "language_loss": 0.86107206, "learning_rate": 9.666835958958717e-07, "loss": 0.8994081, "num_input_tokens_seen": 122109025, "step": 5677, "time_per_iteration": 2.707176923751831 }, { "auxiliary_loss_clip": 0.01175205, "auxiliary_loss_mlp": 0.01024164, "balance_loss_clip": 1.05051231, "balance_loss_mlp": 1.01727343, "epoch": 0.6827391330487584, "flos": 20810126044800.0, "grad_norm": 2.1289830886127863, "language_loss": 0.80723929, "learning_rate": 9.660167251962484e-07, "loss": 0.82923299, "num_input_tokens_seen": 122127385, "step": 5678, "time_per_iteration": 2.616446018218994 }, { "auxiliary_loss_clip": 0.01326993, "auxiliary_loss_mlp": 0.01025687, "balance_loss_clip": 1.04552627, "balance_loss_mlp": 1.01872838, "epoch": 0.6828593759393976, "flos": 21688788539520.0, "grad_norm": 1.6678969836363005, "language_loss": 0.77877629, "learning_rate": 9.653500113510654e-07, "loss": 0.80230308, "num_input_tokens_seen": 122146500, "step": 5679, "time_per_iteration": 2.7183127403259277 }, { "auxiliary_loss_clip": 0.01273302, "auxiliary_loss_mlp": 0.01025706, "balance_loss_clip": 1.04613006, "balance_loss_mlp": 1.01830554, "epoch": 0.6829796188300367, "flos": 25337707557120.0, "grad_norm": 2.7828764047272916, "language_loss": 0.66872585, "learning_rate": 9.646834544614627e-07, "loss": 0.69171596, "num_input_tokens_seen": 122167000, "step": 5680, "time_per_iteration": 3.8200297355651855 }, { "auxiliary_loss_clip": 0.01270601, "auxiliary_loss_mlp": 0.01022447, "balance_loss_clip": 1.04685318, "balance_loss_mlp": 1.01564562, "epoch": 0.6830998617206757, "flos": 20704800389760.0, "grad_norm": 2.0113858275556336, "language_loss": 0.76183271, "learning_rate": 9.64017054628558e-07, "loss": 0.78476322, "num_input_tokens_seen": 122185825, "step": 5681, "time_per_iteration": 2.654698133468628 }, { "auxiliary_loss_clip": 0.01373429, "auxiliary_loss_mlp": 0.01027854, "balance_loss_clip": 1.04248595, "balance_loss_mlp": 1.0209372, "epoch": 0.6832201046113149, "flos": 21726638496000.0, "grad_norm": 2.3052481490638224, "language_loss": 0.78986406, "learning_rate": 9.63350811953441e-07, "loss": 0.81387687, "num_input_tokens_seen": 122206200, "step": 5682, "time_per_iteration": 2.7597062587738037 }, { "auxiliary_loss_clip": 0.01322535, "auxiliary_loss_mlp": 0.01023296, "balance_loss_clip": 1.04491901, "balance_loss_mlp": 1.01651609, "epoch": 0.6833403475019539, "flos": 19536554448000.0, "grad_norm": 2.205399457845408, "language_loss": 0.70806134, "learning_rate": 9.626847265371826e-07, "loss": 0.73151964, "num_input_tokens_seen": 122225520, "step": 5683, "time_per_iteration": 2.6784133911132812 }, { "auxiliary_loss_clip": 0.01268935, "auxiliary_loss_mlp": 0.01020399, "balance_loss_clip": 1.04487109, "balance_loss_mlp": 1.01352644, "epoch": 0.683460590392593, "flos": 19352153001600.0, "grad_norm": 2.383593930829791, "language_loss": 0.78942871, "learning_rate": 9.620187984808262e-07, "loss": 0.81232202, "num_input_tokens_seen": 122244320, "step": 5684, "time_per_iteration": 2.661470651626587 }, { "auxiliary_loss_clip": 0.01276095, "auxiliary_loss_mlp": 0.02565048, "balance_loss_clip": 1.04953218, "balance_loss_mlp": 0.9998818, "epoch": 0.6835808332832322, "flos": 23288500650240.0, "grad_norm": 2.083440043596199, "language_loss": 0.86249906, "learning_rate": 9.613530278853919e-07, "loss": 0.9009105, "num_input_tokens_seen": 122264295, "step": 5685, "time_per_iteration": 2.6823244094848633 }, { "auxiliary_loss_clip": 0.01228775, "auxiliary_loss_mlp": 0.01029829, "balance_loss_clip": 1.05117726, "balance_loss_mlp": 1.0223664, "epoch": 0.6837010761738712, "flos": 21653416621440.0, "grad_norm": 2.3208347545979264, "language_loss": 0.74395192, "learning_rate": 9.60687414851879e-07, "loss": 0.76653796, "num_input_tokens_seen": 122285300, "step": 5686, "time_per_iteration": 2.736229658126831 }, { "auxiliary_loss_clip": 0.01176537, "auxiliary_loss_mlp": 0.01029722, "balance_loss_clip": 1.04839945, "balance_loss_mlp": 1.02181292, "epoch": 0.6838213190645103, "flos": 17566387418880.0, "grad_norm": 2.8525911160431114, "language_loss": 0.78008795, "learning_rate": 9.600219594812575e-07, "loss": 0.80215055, "num_input_tokens_seen": 122303240, "step": 5687, "time_per_iteration": 2.6897788047790527 }, { "auxiliary_loss_clip": 0.0117279, "auxiliary_loss_mlp": 0.0102467, "balance_loss_clip": 1.04910231, "balance_loss_mlp": 1.01783633, "epoch": 0.6839415619551494, "flos": 23112538899840.0, "grad_norm": 2.940916159499002, "language_loss": 0.72783267, "learning_rate": 9.593566618744786e-07, "loss": 0.74980724, "num_input_tokens_seen": 122323390, "step": 5688, "time_per_iteration": 2.5747857093811035 }, { "auxiliary_loss_clip": 0.0117699, "auxiliary_loss_mlp": 0.01025186, "balance_loss_clip": 1.05086064, "balance_loss_mlp": 1.01790786, "epoch": 0.6840618048457885, "flos": 22127868391680.0, "grad_norm": 1.9213602091676687, "language_loss": 0.74121422, "learning_rate": 9.58691522132466e-07, "loss": 0.76323599, "num_input_tokens_seen": 122342200, "step": 5689, "time_per_iteration": 2.5558040142059326 }, { "auxiliary_loss_clip": 0.01275476, "auxiliary_loss_mlp": 0.01026663, "balance_loss_clip": 1.0491178, "balance_loss_mlp": 1.01917052, "epoch": 0.6841820477364275, "flos": 22015898720640.0, "grad_norm": 2.0714248252195535, "language_loss": 0.850766, "learning_rate": 9.58026540356123e-07, "loss": 0.8737874, "num_input_tokens_seen": 122360465, "step": 5690, "time_per_iteration": 2.678323268890381 }, { "auxiliary_loss_clip": 0.01229198, "auxiliary_loss_mlp": 0.01029029, "balance_loss_clip": 1.04907966, "balance_loss_mlp": 1.02096474, "epoch": 0.6843022906270667, "flos": 24900531125760.0, "grad_norm": 1.5680000359587742, "language_loss": 0.86724508, "learning_rate": 9.573617166463246e-07, "loss": 0.88982737, "num_input_tokens_seen": 122381680, "step": 5691, "time_per_iteration": 2.6601693630218506 }, { "auxiliary_loss_clip": 0.01275506, "auxiliary_loss_mlp": 0.01025735, "balance_loss_clip": 1.0446105, "balance_loss_mlp": 1.01847529, "epoch": 0.6844225335177058, "flos": 19969924037760.0, "grad_norm": 1.9940667182706293, "language_loss": 0.60289884, "learning_rate": 9.56697051103924e-07, "loss": 0.62591124, "num_input_tokens_seen": 122399120, "step": 5692, "time_per_iteration": 2.6190919876098633 }, { "auxiliary_loss_clip": 0.01272387, "auxiliary_loss_mlp": 0.0102445, "balance_loss_clip": 1.04605651, "balance_loss_mlp": 1.01813793, "epoch": 0.6845427764083448, "flos": 25883334126720.0, "grad_norm": 2.1276339359785696, "language_loss": 0.81294072, "learning_rate": 9.560325438297522e-07, "loss": 0.83590907, "num_input_tokens_seen": 122417430, "step": 5693, "time_per_iteration": 2.7365951538085938 }, { "auxiliary_loss_clip": 0.01281813, "auxiliary_loss_mlp": 0.01030866, "balance_loss_clip": 1.05187249, "balance_loss_mlp": 1.02365112, "epoch": 0.684663019298984, "flos": 18880143356160.0, "grad_norm": 2.0131217826635215, "language_loss": 0.86571622, "learning_rate": 9.553681949246127e-07, "loss": 0.88884306, "num_input_tokens_seen": 122435055, "step": 5694, "time_per_iteration": 2.6193203926086426 }, { "auxiliary_loss_clip": 0.01333971, "auxiliary_loss_mlp": 0.01025638, "balance_loss_clip": 1.04902065, "balance_loss_mlp": 1.01869404, "epoch": 0.684783262189623, "flos": 54193725302400.0, "grad_norm": 1.7837101792795929, "language_loss": 0.75307715, "learning_rate": 9.547040044892886e-07, "loss": 0.7766732, "num_input_tokens_seen": 122462570, "step": 5695, "time_per_iteration": 2.988497495651245 }, { "auxiliary_loss_clip": 0.01122943, "auxiliary_loss_mlp": 0.0100201, "balance_loss_clip": 1.01030159, "balance_loss_mlp": 1.00109768, "epoch": 0.6849035050802621, "flos": 63970264143360.0, "grad_norm": 0.8539573509703516, "language_loss": 0.60055006, "learning_rate": 9.540399726245354e-07, "loss": 0.62179959, "num_input_tokens_seen": 122519275, "step": 5696, "time_per_iteration": 3.068936347961426 }, { "auxiliary_loss_clip": 0.01273194, "auxiliary_loss_mlp": 0.01025494, "balance_loss_clip": 1.04526818, "balance_loss_mlp": 1.01784086, "epoch": 0.6850237479709013, "flos": 25224121774080.0, "grad_norm": 1.803645209650667, "language_loss": 0.69132644, "learning_rate": 9.533760994310859e-07, "loss": 0.71431327, "num_input_tokens_seen": 122539675, "step": 5697, "time_per_iteration": 2.7191457748413086 }, { "auxiliary_loss_clip": 0.01179696, "auxiliary_loss_mlp": 0.01027568, "balance_loss_clip": 1.0523752, "balance_loss_mlp": 1.02014709, "epoch": 0.6851439908615403, "flos": 19354128249600.0, "grad_norm": 1.9510714830900895, "language_loss": 0.75362349, "learning_rate": 9.527123850096508e-07, "loss": 0.77569616, "num_input_tokens_seen": 122558035, "step": 5698, "time_per_iteration": 2.544518232345581 }, { "auxiliary_loss_clip": 0.01128966, "auxiliary_loss_mlp": 0.01027735, "balance_loss_clip": 1.04858065, "balance_loss_mlp": 1.02077055, "epoch": 0.6852642337521794, "flos": 23182133500800.0, "grad_norm": 2.051018936946761, "language_loss": 0.72021872, "learning_rate": 9.520488294609142e-07, "loss": 0.74178571, "num_input_tokens_seen": 122576815, "step": 5699, "time_per_iteration": 3.7030558586120605 }, { "auxiliary_loss_clip": 0.0127279, "auxiliary_loss_mlp": 0.00998161, "balance_loss_clip": 1.01056111, "balance_loss_mlp": 0.99715942, "epoch": 0.6853844766428185, "flos": 62647206583680.0, "grad_norm": 0.7363687255039579, "language_loss": 0.53787374, "learning_rate": 9.513854328855368e-07, "loss": 0.56058329, "num_input_tokens_seen": 122634690, "step": 5700, "time_per_iteration": 4.061933755874634 }, { "auxiliary_loss_clip": 0.01172113, "auxiliary_loss_mlp": 0.01025175, "balance_loss_clip": 1.048733, "balance_loss_mlp": 1.01872921, "epoch": 0.6855047195334576, "flos": 23437242869760.0, "grad_norm": 4.559240214138372, "language_loss": 0.81215578, "learning_rate": 9.507221953841558e-07, "loss": 0.83412874, "num_input_tokens_seen": 122652320, "step": 5701, "time_per_iteration": 2.5772759914398193 }, { "auxiliary_loss_clip": 0.01226575, "auxiliary_loss_mlp": 0.01027547, "balance_loss_clip": 1.05020499, "balance_loss_mlp": 1.02023399, "epoch": 0.6856249624240967, "flos": 20664831530880.0, "grad_norm": 1.5580689648306694, "language_loss": 0.77861047, "learning_rate": 9.500591170573824e-07, "loss": 0.80115169, "num_input_tokens_seen": 122672340, "step": 5702, "time_per_iteration": 3.487673282623291 }, { "auxiliary_loss_clip": 0.01366323, "auxiliary_loss_mlp": 0.01023686, "balance_loss_clip": 1.04440951, "balance_loss_mlp": 1.01678085, "epoch": 0.6857452053147358, "flos": 17087302794240.0, "grad_norm": 1.9473316161229444, "language_loss": 0.74156773, "learning_rate": 9.493961980058078e-07, "loss": 0.76546782, "num_input_tokens_seen": 122689935, "step": 5703, "time_per_iteration": 2.691288948059082 }, { "auxiliary_loss_clip": 0.01412827, "auxiliary_loss_mlp": 0.01026087, "balance_loss_clip": 1.03941441, "balance_loss_mlp": 1.01934576, "epoch": 0.6858654482053749, "flos": 30847266057600.0, "grad_norm": 1.7984888693962273, "language_loss": 0.67054278, "learning_rate": 9.48733438329993e-07, "loss": 0.69493192, "num_input_tokens_seen": 122710200, "step": 5704, "time_per_iteration": 2.84193754196167 }, { "auxiliary_loss_clip": 0.01173491, "auxiliary_loss_mlp": 0.02565207, "balance_loss_clip": 1.05013168, "balance_loss_mlp": 0.9998731, "epoch": 0.6859856910960139, "flos": 28877314510080.0, "grad_norm": 1.6352658294007683, "language_loss": 0.74609184, "learning_rate": 9.480708381304807e-07, "loss": 0.78347886, "num_input_tokens_seen": 122731495, "step": 5705, "time_per_iteration": 2.6821491718292236 }, { "auxiliary_loss_clip": 0.01366056, "auxiliary_loss_mlp": 0.01032536, "balance_loss_clip": 1.04624176, "balance_loss_mlp": 1.02481771, "epoch": 0.6861059339866531, "flos": 19354523299200.0, "grad_norm": 2.105814549809959, "language_loss": 0.83511019, "learning_rate": 9.474083975077858e-07, "loss": 0.85909611, "num_input_tokens_seen": 122748620, "step": 5706, "time_per_iteration": 3.641396999359131 }, { "auxiliary_loss_clip": 0.01222068, "auxiliary_loss_mlp": 0.01030754, "balance_loss_clip": 1.0471164, "balance_loss_mlp": 1.02338135, "epoch": 0.6862261768772921, "flos": 22199976944640.0, "grad_norm": 2.792949141537391, "language_loss": 0.80726814, "learning_rate": 9.467461165623994e-07, "loss": 0.82979637, "num_input_tokens_seen": 122767670, "step": 5707, "time_per_iteration": 2.6267364025115967 }, { "auxiliary_loss_clip": 0.01225909, "auxiliary_loss_mlp": 0.01027481, "balance_loss_clip": 1.04682183, "balance_loss_mlp": 1.0207994, "epoch": 0.6863464197679312, "flos": 26285677344000.0, "grad_norm": 2.329033601036418, "language_loss": 0.79694533, "learning_rate": 9.46083995394791e-07, "loss": 0.81947923, "num_input_tokens_seen": 122785480, "step": 5708, "time_per_iteration": 2.687666416168213 }, { "auxiliary_loss_clip": 0.01225084, "auxiliary_loss_mlp": 0.02562198, "balance_loss_clip": 1.04887009, "balance_loss_mlp": 0.99988246, "epoch": 0.6864666626585703, "flos": 37815228564480.0, "grad_norm": 1.9641969576923344, "language_loss": 0.63212156, "learning_rate": 9.454220341054012e-07, "loss": 0.66999435, "num_input_tokens_seen": 122810265, "step": 5709, "time_per_iteration": 2.78898286819458 }, { "auxiliary_loss_clip": 0.01319669, "auxiliary_loss_mlp": 0.01023767, "balance_loss_clip": 1.04450226, "balance_loss_mlp": 1.01689744, "epoch": 0.6865869055492094, "flos": 19391152193280.0, "grad_norm": 5.227472392339175, "language_loss": 0.80553579, "learning_rate": 9.447602327946512e-07, "loss": 0.82897019, "num_input_tokens_seen": 122828905, "step": 5710, "time_per_iteration": 2.7231011390686035 }, { "auxiliary_loss_clip": 0.0127723, "auxiliary_loss_mlp": 0.01026585, "balance_loss_clip": 1.04530954, "balance_loss_mlp": 1.01899755, "epoch": 0.6867071484398485, "flos": 20375966355840.0, "grad_norm": 1.8317816552749706, "language_loss": 0.76462305, "learning_rate": 9.440985915629338e-07, "loss": 0.78766114, "num_input_tokens_seen": 122846235, "step": 5711, "time_per_iteration": 2.6180999279022217 }, { "auxiliary_loss_clip": 0.01175071, "auxiliary_loss_mlp": 0.0103209, "balance_loss_clip": 1.05173755, "balance_loss_mlp": 1.02522087, "epoch": 0.6868273913304875, "flos": 15889143801600.0, "grad_norm": 2.2354838289724506, "language_loss": 0.73151171, "learning_rate": 9.434371105106223e-07, "loss": 0.75358331, "num_input_tokens_seen": 122863835, "step": 5712, "time_per_iteration": 2.5748751163482666 }, { "auxiliary_loss_clip": 0.01325881, "auxiliary_loss_mlp": 0.01030242, "balance_loss_clip": 1.04608881, "balance_loss_mlp": 1.02268088, "epoch": 0.6869476342211267, "flos": 24462492768000.0, "grad_norm": 2.797659046559257, "language_loss": 0.70602477, "learning_rate": 9.427757897380602e-07, "loss": 0.72958601, "num_input_tokens_seen": 122883235, "step": 5713, "time_per_iteration": 2.7238268852233887 }, { "auxiliary_loss_clip": 0.01315089, "auxiliary_loss_mlp": 0.01028024, "balance_loss_clip": 1.04656994, "balance_loss_mlp": 1.02043056, "epoch": 0.6870678771117658, "flos": 18442571875200.0, "grad_norm": 2.4805680228557794, "language_loss": 0.85022008, "learning_rate": 9.421146293455695e-07, "loss": 0.87365127, "num_input_tokens_seen": 122898975, "step": 5714, "time_per_iteration": 2.6656785011291504 }, { "auxiliary_loss_clip": 0.01275414, "auxiliary_loss_mlp": 0.01027551, "balance_loss_clip": 1.04613006, "balance_loss_mlp": 1.02049136, "epoch": 0.6871881200024048, "flos": 22200371994240.0, "grad_norm": 1.9077841215875244, "language_loss": 0.68432462, "learning_rate": 9.414536294334489e-07, "loss": 0.70735425, "num_input_tokens_seen": 122918995, "step": 5715, "time_per_iteration": 2.69136905670166 }, { "auxiliary_loss_clip": 0.01277298, "auxiliary_loss_mlp": 0.01025327, "balance_loss_clip": 1.04454398, "balance_loss_mlp": 1.01808846, "epoch": 0.687308362893044, "flos": 22127724737280.0, "grad_norm": 2.0196915890800535, "language_loss": 0.6991452, "learning_rate": 9.407927901019708e-07, "loss": 0.72217143, "num_input_tokens_seen": 122938125, "step": 5716, "time_per_iteration": 2.65014386177063 }, { "auxiliary_loss_clip": 0.01226173, "auxiliary_loss_mlp": 0.0102271, "balance_loss_clip": 1.04919529, "balance_loss_mlp": 1.01562572, "epoch": 0.687428605783683, "flos": 25040546340480.0, "grad_norm": 2.0653014308526454, "language_loss": 0.76750237, "learning_rate": 9.401321114513854e-07, "loss": 0.7899912, "num_input_tokens_seen": 122957020, "step": 5717, "time_per_iteration": 2.604227304458618 }, { "auxiliary_loss_clip": 0.01177054, "auxiliary_loss_mlp": 0.01024415, "balance_loss_clip": 1.05129433, "balance_loss_mlp": 1.01718211, "epoch": 0.6875488486743221, "flos": 23770063313280.0, "grad_norm": 2.4708513584558056, "language_loss": 0.75512773, "learning_rate": 9.394715935819155e-07, "loss": 0.77714241, "num_input_tokens_seen": 122977410, "step": 5718, "time_per_iteration": 2.6273937225341797 }, { "auxiliary_loss_clip": 0.01229285, "auxiliary_loss_mlp": 0.01028354, "balance_loss_clip": 1.04810572, "balance_loss_mlp": 1.02129936, "epoch": 0.6876690915649613, "flos": 25516937445120.0, "grad_norm": 2.1028429462361538, "language_loss": 0.62164557, "learning_rate": 9.388112365937608e-07, "loss": 0.6442219, "num_input_tokens_seen": 122996875, "step": 5719, "time_per_iteration": 2.6580827236175537 }, { "auxiliary_loss_clip": 0.01328323, "auxiliary_loss_mlp": 0.01029767, "balance_loss_clip": 1.04733849, "balance_loss_mlp": 1.02275181, "epoch": 0.6877893344556003, "flos": 19427996568960.0, "grad_norm": 2.1766364624415795, "language_loss": 0.82616538, "learning_rate": 9.381510405870985e-07, "loss": 0.84974623, "num_input_tokens_seen": 123015890, "step": 5720, "time_per_iteration": 2.7399609088897705 }, { "auxiliary_loss_clip": 0.01226628, "auxiliary_loss_mlp": 0.01032057, "balance_loss_clip": 1.04673696, "balance_loss_mlp": 1.02454662, "epoch": 0.6879095773462394, "flos": 18661303745280.0, "grad_norm": 2.6044894746204257, "language_loss": 0.77330565, "learning_rate": 9.374910056620791e-07, "loss": 0.79589248, "num_input_tokens_seen": 123034955, "step": 5721, "time_per_iteration": 2.673384666442871 }, { "auxiliary_loss_clip": 0.01224441, "auxiliary_loss_mlp": 0.01027222, "balance_loss_clip": 1.04847407, "balance_loss_mlp": 1.01949155, "epoch": 0.6880298202368785, "flos": 20883132437760.0, "grad_norm": 1.970407040502931, "language_loss": 0.81097031, "learning_rate": 9.368311319188293e-07, "loss": 0.83348691, "num_input_tokens_seen": 123052770, "step": 5722, "time_per_iteration": 2.5886852741241455 }, { "auxiliary_loss_clip": 0.01319887, "auxiliary_loss_mlp": 0.01030966, "balance_loss_clip": 1.04333854, "balance_loss_mlp": 1.0242157, "epoch": 0.6881500631275176, "flos": 30153292318080.0, "grad_norm": 1.7505810711663898, "language_loss": 0.79286361, "learning_rate": 9.361714194574515e-07, "loss": 0.81637216, "num_input_tokens_seen": 123075105, "step": 5723, "time_per_iteration": 2.8434510231018066 }, { "auxiliary_loss_clip": 0.01067686, "auxiliary_loss_mlp": 0.0100321, "balance_loss_clip": 1.0118407, "balance_loss_mlp": 1.00222683, "epoch": 0.6882703060181566, "flos": 66181537215360.0, "grad_norm": 0.7354427154682996, "language_loss": 0.58275026, "learning_rate": 9.355118683780228e-07, "loss": 0.60345924, "num_input_tokens_seen": 123145175, "step": 5724, "time_per_iteration": 4.211063861846924 }, { "auxiliary_loss_clip": 0.01174732, "auxiliary_loss_mlp": 0.01025767, "balance_loss_clip": 1.04820657, "balance_loss_mlp": 1.01842928, "epoch": 0.6883905489087958, "flos": 18214646123520.0, "grad_norm": 2.298206653585592, "language_loss": 0.79184395, "learning_rate": 9.348524787805987e-07, "loss": 0.81384891, "num_input_tokens_seen": 123160365, "step": 5725, "time_per_iteration": 2.5176753997802734 }, { "auxiliary_loss_clip": 0.01324894, "auxiliary_loss_mlp": 0.01023451, "balance_loss_clip": 1.04112267, "balance_loss_mlp": 1.01612234, "epoch": 0.6885107917994349, "flos": 14056262553600.0, "grad_norm": 2.387149131902642, "language_loss": 0.84804809, "learning_rate": 9.341932507652053e-07, "loss": 0.87153161, "num_input_tokens_seen": 123174855, "step": 5726, "time_per_iteration": 3.563671827316284 }, { "auxiliary_loss_clip": 0.01276044, "auxiliary_loss_mlp": 0.01026817, "balance_loss_clip": 1.04338956, "balance_loss_mlp": 1.01976919, "epoch": 0.6886310346900739, "flos": 28690722334080.0, "grad_norm": 1.7221766865107224, "language_loss": 0.78495193, "learning_rate": 9.335341844318489e-07, "loss": 0.80798054, "num_input_tokens_seen": 123194995, "step": 5727, "time_per_iteration": 2.6723580360412598 }, { "auxiliary_loss_clip": 0.01269657, "auxiliary_loss_mlp": 0.01023646, "balance_loss_clip": 1.04691947, "balance_loss_mlp": 1.01659465, "epoch": 0.6887512775807131, "flos": 24535319592960.0, "grad_norm": 1.8229815459948806, "language_loss": 0.73574519, "learning_rate": 9.328752798805091e-07, "loss": 0.75867826, "num_input_tokens_seen": 123213465, "step": 5728, "time_per_iteration": 3.570526123046875 }, { "auxiliary_loss_clip": 0.01226201, "auxiliary_loss_mlp": 0.0102581, "balance_loss_clip": 1.04951692, "balance_loss_mlp": 1.01834476, "epoch": 0.6888715204713521, "flos": 22414363269120.0, "grad_norm": 3.0406886795921513, "language_loss": 0.75993991, "learning_rate": 9.322165372111399e-07, "loss": 0.78245997, "num_input_tokens_seen": 123231610, "step": 5729, "time_per_iteration": 2.62888765335083 }, { "auxiliary_loss_clip": 0.01315241, "auxiliary_loss_mlp": 0.01025607, "balance_loss_clip": 1.04677022, "balance_loss_mlp": 1.01921439, "epoch": 0.6889917633619912, "flos": 22054323294720.0, "grad_norm": 2.2651621390276326, "language_loss": 0.76290661, "learning_rate": 9.315579565236747e-07, "loss": 0.78631508, "num_input_tokens_seen": 123250715, "step": 5730, "time_per_iteration": 2.7067768573760986 }, { "auxiliary_loss_clip": 0.01276477, "auxiliary_loss_mlp": 0.01025967, "balance_loss_clip": 1.05184245, "balance_loss_mlp": 1.01864123, "epoch": 0.6891120062526304, "flos": 23949724164480.0, "grad_norm": 1.7954511455153153, "language_loss": 0.74206942, "learning_rate": 9.308995379180162e-07, "loss": 0.76509386, "num_input_tokens_seen": 123270270, "step": 5731, "time_per_iteration": 2.66896390914917 }, { "auxiliary_loss_clip": 0.01122403, "auxiliary_loss_mlp": 0.01007456, "balance_loss_clip": 1.01085567, "balance_loss_mlp": 1.00649595, "epoch": 0.6892322491432694, "flos": 64117354337280.0, "grad_norm": 0.7386078759834017, "language_loss": 0.5948649, "learning_rate": 9.302412814940488e-07, "loss": 0.61616349, "num_input_tokens_seen": 123333045, "step": 5732, "time_per_iteration": 4.155320644378662 }, { "auxiliary_loss_clip": 0.01274517, "auxiliary_loss_mlp": 0.01026956, "balance_loss_clip": 1.04469228, "balance_loss_mlp": 1.01982772, "epoch": 0.6893524920339085, "flos": 23002436736000.0, "grad_norm": 2.1534541563585874, "language_loss": 0.7081399, "learning_rate": 9.295831873516276e-07, "loss": 0.73115456, "num_input_tokens_seen": 123352320, "step": 5733, "time_per_iteration": 2.6761391162872314 }, { "auxiliary_loss_clip": 0.01175246, "auxiliary_loss_mlp": 0.01029526, "balance_loss_clip": 1.05158114, "balance_loss_mlp": 1.02226353, "epoch": 0.6894727349245476, "flos": 21396260177280.0, "grad_norm": 1.8666401424307602, "language_loss": 0.76105094, "learning_rate": 9.289252555905873e-07, "loss": 0.7830987, "num_input_tokens_seen": 123372400, "step": 5734, "time_per_iteration": 2.5758371353149414 }, { "auxiliary_loss_clip": 0.01228029, "auxiliary_loss_mlp": 0.01028568, "balance_loss_clip": 1.05195653, "balance_loss_mlp": 1.02129936, "epoch": 0.6895929778151867, "flos": 19865316654720.0, "grad_norm": 2.4974901719934066, "language_loss": 0.75881422, "learning_rate": 9.282674863107334e-07, "loss": 0.78138018, "num_input_tokens_seen": 123390215, "step": 5735, "time_per_iteration": 2.6942427158355713 }, { "auxiliary_loss_clip": 0.01220484, "auxiliary_loss_mlp": 0.01024196, "balance_loss_clip": 1.04798841, "balance_loss_mlp": 1.01751757, "epoch": 0.6897132207058257, "flos": 18179166464640.0, "grad_norm": 2.2904406945105626, "language_loss": 0.76139116, "learning_rate": 9.276098796118488e-07, "loss": 0.78383791, "num_input_tokens_seen": 123406700, "step": 5736, "time_per_iteration": 2.6272382736206055 }, { "auxiliary_loss_clip": 0.01270393, "auxiliary_loss_mlp": 0.01024993, "balance_loss_clip": 1.04684424, "balance_loss_mlp": 1.01805472, "epoch": 0.6898334635964649, "flos": 32561641359360.0, "grad_norm": 2.2076571718658156, "language_loss": 0.65964866, "learning_rate": 9.269524355936938e-07, "loss": 0.68260247, "num_input_tokens_seen": 123429880, "step": 5737, "time_per_iteration": 2.7755043506622314 }, { "auxiliary_loss_clip": 0.01268469, "auxiliary_loss_mlp": 0.01028929, "balance_loss_clip": 1.04303598, "balance_loss_mlp": 1.02185738, "epoch": 0.689953706487104, "flos": 22819004956800.0, "grad_norm": 1.7776846696691546, "language_loss": 0.84649372, "learning_rate": 9.262951543560002e-07, "loss": 0.86946774, "num_input_tokens_seen": 123449105, "step": 5738, "time_per_iteration": 2.8290388584136963 }, { "auxiliary_loss_clip": 0.0128042, "auxiliary_loss_mlp": 0.01025811, "balance_loss_clip": 1.05380237, "balance_loss_mlp": 1.0179137, "epoch": 0.690073949377743, "flos": 18515362786560.0, "grad_norm": 2.475076819173484, "language_loss": 0.85925663, "learning_rate": 9.256380359984795e-07, "loss": 0.88231885, "num_input_tokens_seen": 123466215, "step": 5739, "time_per_iteration": 2.6272664070129395 }, { "auxiliary_loss_clip": 0.01380177, "auxiliary_loss_mlp": 0.01022572, "balance_loss_clip": 1.03934109, "balance_loss_mlp": 1.01549387, "epoch": 0.6901941922683821, "flos": 34857194716800.0, "grad_norm": 2.184700708538197, "language_loss": 0.74386454, "learning_rate": 9.249810806208139e-07, "loss": 0.767892, "num_input_tokens_seen": 123485480, "step": 5740, "time_per_iteration": 2.8792734146118164 }, { "auxiliary_loss_clip": 0.01370667, "auxiliary_loss_mlp": 0.02563464, "balance_loss_clip": 1.03880596, "balance_loss_mlp": 0.99984843, "epoch": 0.6903144351590212, "flos": 16253672976000.0, "grad_norm": 1.9202367721768152, "language_loss": 0.79793119, "learning_rate": 9.243242883226627e-07, "loss": 0.83727252, "num_input_tokens_seen": 123504575, "step": 5741, "time_per_iteration": 2.7153124809265137 }, { "auxiliary_loss_clip": 0.01227361, "auxiliary_loss_mlp": 0.01028746, "balance_loss_clip": 1.0450778, "balance_loss_mlp": 1.021474, "epoch": 0.6904346780496603, "flos": 28035137255040.0, "grad_norm": 2.0021021892468993, "language_loss": 0.69746673, "learning_rate": 9.236676592036628e-07, "loss": 0.7200278, "num_input_tokens_seen": 123524250, "step": 5742, "time_per_iteration": 2.714186191558838 }, { "auxiliary_loss_clip": 0.01269372, "auxiliary_loss_mlp": 0.01029554, "balance_loss_clip": 1.05026281, "balance_loss_mlp": 1.02250552, "epoch": 0.6905549209402994, "flos": 23624266008960.0, "grad_norm": 1.990467399179294, "language_loss": 0.73813164, "learning_rate": 9.230111933634228e-07, "loss": 0.76112092, "num_input_tokens_seen": 123545845, "step": 5743, "time_per_iteration": 2.7219197750091553 }, { "auxiliary_loss_clip": 0.01224368, "auxiliary_loss_mlp": 0.01036428, "balance_loss_clip": 1.04896605, "balance_loss_mlp": 1.02957654, "epoch": 0.6906751638309385, "flos": 23114945111040.0, "grad_norm": 1.5977270717397627, "language_loss": 0.80874312, "learning_rate": 9.223548909015288e-07, "loss": 0.83135104, "num_input_tokens_seen": 123567535, "step": 5744, "time_per_iteration": 2.6736762523651123 }, { "auxiliary_loss_clip": 0.01364655, "auxiliary_loss_mlp": 0.01021407, "balance_loss_clip": 1.04238665, "balance_loss_mlp": 1.01446295, "epoch": 0.6907954067215776, "flos": 27305468375040.0, "grad_norm": 1.9193185631664957, "language_loss": 0.72465229, "learning_rate": 9.216987519175407e-07, "loss": 0.74851286, "num_input_tokens_seen": 123587710, "step": 5745, "time_per_iteration": 2.832808494567871 }, { "auxiliary_loss_clip": 0.0121973, "auxiliary_loss_mlp": 0.01027233, "balance_loss_clip": 1.04877746, "balance_loss_mlp": 1.0199523, "epoch": 0.6909156496122166, "flos": 21689399070720.0, "grad_norm": 1.708292947728412, "language_loss": 0.68864667, "learning_rate": 9.210427765109942e-07, "loss": 0.71111625, "num_input_tokens_seen": 123607385, "step": 5746, "time_per_iteration": 2.6491191387176514 }, { "auxiliary_loss_clip": 0.01276814, "auxiliary_loss_mlp": 0.0103295, "balance_loss_clip": 1.046103, "balance_loss_mlp": 1.02447772, "epoch": 0.6910358925028558, "flos": 22561453463040.0, "grad_norm": 2.090376798382181, "language_loss": 0.81079096, "learning_rate": 9.20386964781402e-07, "loss": 0.83388853, "num_input_tokens_seen": 123625405, "step": 5747, "time_per_iteration": 2.6853320598602295 }, { "auxiliary_loss_clip": 0.01272165, "auxiliary_loss_mlp": 0.01024942, "balance_loss_clip": 1.04550433, "balance_loss_mlp": 1.01822162, "epoch": 0.6911561353934949, "flos": 22054107813120.0, "grad_norm": 1.8876306575425497, "language_loss": 0.84590769, "learning_rate": 9.197313168282472e-07, "loss": 0.86887872, "num_input_tokens_seen": 123642850, "step": 5748, "time_per_iteration": 2.666729688644409 }, { "auxiliary_loss_clip": 0.01220658, "auxiliary_loss_mlp": 0.01026427, "balance_loss_clip": 1.0452714, "balance_loss_mlp": 1.01902759, "epoch": 0.6912763782841339, "flos": 24206557386240.0, "grad_norm": 2.2784311512602797, "language_loss": 0.72617632, "learning_rate": 9.190758327509935e-07, "loss": 0.74864709, "num_input_tokens_seen": 123661595, "step": 5749, "time_per_iteration": 2.6466825008392334 }, { "auxiliary_loss_clip": 0.01188266, "auxiliary_loss_mlp": 0.02506638, "balance_loss_clip": 1.01379108, "balance_loss_mlp": 0.99972993, "epoch": 0.6913966211747731, "flos": 52329641091840.0, "grad_norm": 0.9319355066481709, "language_loss": 0.64456022, "learning_rate": 9.184205126490767e-07, "loss": 0.68150926, "num_input_tokens_seen": 123710490, "step": 5750, "time_per_iteration": 4.051300764083862 }, { "auxiliary_loss_clip": 0.01226656, "auxiliary_loss_mlp": 0.02507436, "balance_loss_clip": 1.01177478, "balance_loss_mlp": 0.99977225, "epoch": 0.6915168640654121, "flos": 66741274851840.0, "grad_norm": 1.1077367510146883, "language_loss": 0.59615487, "learning_rate": 9.177653566219075e-07, "loss": 0.63349581, "num_input_tokens_seen": 123765215, "step": 5751, "time_per_iteration": 4.09380316734314 }, { "auxiliary_loss_clip": 0.01327331, "auxiliary_loss_mlp": 0.01022502, "balance_loss_clip": 1.04347146, "balance_loss_mlp": 1.01584089, "epoch": 0.6916371069560512, "flos": 18296523175680.0, "grad_norm": 2.580845649908458, "language_loss": 0.76540005, "learning_rate": 9.171103647688744e-07, "loss": 0.78889835, "num_input_tokens_seen": 123783955, "step": 5752, "time_per_iteration": 2.715479612350464 }, { "auxiliary_loss_clip": 0.01464297, "auxiliary_loss_mlp": 0.01027145, "balance_loss_clip": 1.0383867, "balance_loss_mlp": 1.02074051, "epoch": 0.6917573498466904, "flos": 19645794685440.0, "grad_norm": 3.179459065167304, "language_loss": 0.6915217, "learning_rate": 9.164555371893367e-07, "loss": 0.71643615, "num_input_tokens_seen": 123803885, "step": 5753, "time_per_iteration": 3.0214009284973145 }, { "auxiliary_loss_clip": 0.01222808, "auxiliary_loss_mlp": 0.02563963, "balance_loss_clip": 1.04837573, "balance_loss_mlp": 0.99989271, "epoch": 0.6918775927373294, "flos": 14210319985920.0, "grad_norm": 2.0638989344165766, "language_loss": 0.75377417, "learning_rate": 9.158008739826333e-07, "loss": 0.79164183, "num_input_tokens_seen": 123821485, "step": 5754, "time_per_iteration": 3.611166477203369 }, { "auxiliary_loss_clip": 0.01269143, "auxiliary_loss_mlp": 0.01030817, "balance_loss_clip": 1.04699111, "balance_loss_mlp": 1.02374244, "epoch": 0.6919978356279685, "flos": 23985455218560.0, "grad_norm": 1.7688947053935131, "language_loss": 0.86761785, "learning_rate": 9.151463752480744e-07, "loss": 0.89061749, "num_input_tokens_seen": 123840215, "step": 5755, "time_per_iteration": 2.698784589767456 }, { "auxiliary_loss_clip": 0.01314809, "auxiliary_loss_mlp": 0.01026369, "balance_loss_clip": 1.04235172, "balance_loss_mlp": 1.01934803, "epoch": 0.6921180785186076, "flos": 23622937205760.0, "grad_norm": 1.8372690556565452, "language_loss": 0.80221313, "learning_rate": 9.144920410849493e-07, "loss": 0.82562494, "num_input_tokens_seen": 123861450, "step": 5756, "time_per_iteration": 2.6950252056121826 }, { "auxiliary_loss_clip": 0.01279356, "auxiliary_loss_mlp": 0.01023397, "balance_loss_clip": 1.04832864, "balance_loss_mlp": 1.01678705, "epoch": 0.6922383214092467, "flos": 21142623265920.0, "grad_norm": 1.7332417315961068, "language_loss": 0.80472088, "learning_rate": 9.138378715925176e-07, "loss": 0.82774836, "num_input_tokens_seen": 123880545, "step": 5757, "time_per_iteration": 2.6598691940307617 }, { "auxiliary_loss_clip": 0.01264155, "auxiliary_loss_mlp": 0.01026977, "balance_loss_clip": 1.04396677, "balance_loss_mlp": 1.01997364, "epoch": 0.6923585642998857, "flos": 21470667200640.0, "grad_norm": 1.8416223329557695, "language_loss": 0.80859488, "learning_rate": 9.131838668700167e-07, "loss": 0.83150625, "num_input_tokens_seen": 123900615, "step": 5758, "time_per_iteration": 4.138831377029419 }, { "auxiliary_loss_clip": 0.01328809, "auxiliary_loss_mlp": 0.01029787, "balance_loss_clip": 1.04461551, "balance_loss_mlp": 1.02253604, "epoch": 0.6924788071905249, "flos": 21105204272640.0, "grad_norm": 1.9793759597275544, "language_loss": 0.86679941, "learning_rate": 9.125300270166598e-07, "loss": 0.89038539, "num_input_tokens_seen": 123921220, "step": 5759, "time_per_iteration": 2.771765947341919 }, { "auxiliary_loss_clip": 0.01227297, "auxiliary_loss_mlp": 0.01024966, "balance_loss_clip": 1.04322529, "balance_loss_mlp": 1.01815879, "epoch": 0.692599050081164, "flos": 26250018117120.0, "grad_norm": 1.6944333321573566, "language_loss": 0.85789722, "learning_rate": 9.118763521316324e-07, "loss": 0.88041985, "num_input_tokens_seen": 123941795, "step": 5760, "time_per_iteration": 2.7644693851470947 }, { "auxiliary_loss_clip": 0.01173402, "auxiliary_loss_mlp": 0.02564862, "balance_loss_clip": 1.04792702, "balance_loss_mlp": 0.99986935, "epoch": 0.692719292971803, "flos": 20885215426560.0, "grad_norm": 2.036379971974148, "language_loss": 0.7601198, "learning_rate": 9.112228423140987e-07, "loss": 0.7975024, "num_input_tokens_seen": 123960715, "step": 5761, "time_per_iteration": 2.575529098510742 }, { "auxiliary_loss_clip": 0.01277473, "auxiliary_loss_mlp": 0.01025081, "balance_loss_clip": 1.04738939, "balance_loss_mlp": 1.01791096, "epoch": 0.6928395358624422, "flos": 25921938268800.0, "grad_norm": 2.5434413975282206, "language_loss": 0.86881232, "learning_rate": 9.105694976631932e-07, "loss": 0.89183789, "num_input_tokens_seen": 123978625, "step": 5762, "time_per_iteration": 2.817011833190918 }, { "auxiliary_loss_clip": 0.01225808, "auxiliary_loss_mlp": 0.01021775, "balance_loss_clip": 1.05078506, "balance_loss_mlp": 1.01472998, "epoch": 0.6929597787530812, "flos": 23586559706880.0, "grad_norm": 3.1176795627321634, "language_loss": 0.7261737, "learning_rate": 9.099163182780283e-07, "loss": 0.74864948, "num_input_tokens_seen": 123996780, "step": 5763, "time_per_iteration": 2.646911382675171 }, { "auxiliary_loss_clip": 0.01270213, "auxiliary_loss_mlp": 0.01024289, "balance_loss_clip": 1.04795706, "balance_loss_mlp": 1.01721704, "epoch": 0.6930800216437203, "flos": 18255656476800.0, "grad_norm": 9.561806247417692, "language_loss": 0.49623895, "learning_rate": 9.092633042576916e-07, "loss": 0.51918399, "num_input_tokens_seen": 124014045, "step": 5764, "time_per_iteration": 2.716426372528076 }, { "auxiliary_loss_clip": 0.01264792, "auxiliary_loss_mlp": 0.01023668, "balance_loss_clip": 1.04686868, "balance_loss_mlp": 1.01703668, "epoch": 0.6932002645343595, "flos": 29168621809920.0, "grad_norm": 1.8979134024347013, "language_loss": 0.56385636, "learning_rate": 9.086104557012446e-07, "loss": 0.58674103, "num_input_tokens_seen": 124034615, "step": 5765, "time_per_iteration": 2.7018938064575195 }, { "auxiliary_loss_clip": 0.01219691, "auxiliary_loss_mlp": 0.0102387, "balance_loss_clip": 1.04665208, "balance_loss_mlp": 1.01712263, "epoch": 0.6933205074249985, "flos": 23842746483840.0, "grad_norm": 2.2755272907052544, "language_loss": 0.65773249, "learning_rate": 9.079577727077239e-07, "loss": 0.68016815, "num_input_tokens_seen": 124053445, "step": 5766, "time_per_iteration": 2.6521592140197754 }, { "auxiliary_loss_clip": 0.01226515, "auxiliary_loss_mlp": 0.0102558, "balance_loss_clip": 1.05043721, "balance_loss_mlp": 1.01827824, "epoch": 0.6934407503156376, "flos": 24166696268160.0, "grad_norm": 7.454943879645916, "language_loss": 0.72225535, "learning_rate": 9.073052553761404e-07, "loss": 0.74477637, "num_input_tokens_seen": 124072810, "step": 5767, "time_per_iteration": 2.608692169189453 }, { "auxiliary_loss_clip": 0.01371753, "auxiliary_loss_mlp": 0.01028706, "balance_loss_clip": 1.04295695, "balance_loss_mlp": 1.02120757, "epoch": 0.6935609932062767, "flos": 20631327120000.0, "grad_norm": 1.7849821638605738, "language_loss": 0.78236842, "learning_rate": 9.066529038054805e-07, "loss": 0.806373, "num_input_tokens_seen": 124092875, "step": 5768, "time_per_iteration": 2.7513322830200195 }, { "auxiliary_loss_clip": 0.01273377, "auxiliary_loss_mlp": 0.01027389, "balance_loss_clip": 1.04641879, "balance_loss_mlp": 1.02075505, "epoch": 0.6936812360969158, "flos": 18254184019200.0, "grad_norm": 1.803670710177161, "language_loss": 0.74201149, "learning_rate": 9.060007180947071e-07, "loss": 0.76501918, "num_input_tokens_seen": 124110930, "step": 5769, "time_per_iteration": 2.619276523590088 }, { "auxiliary_loss_clip": 0.01275214, "auxiliary_loss_mlp": 0.0103144, "balance_loss_clip": 1.04236782, "balance_loss_mlp": 1.02349472, "epoch": 0.6938014789875548, "flos": 31317336368640.0, "grad_norm": 2.938524968408278, "language_loss": 0.72987187, "learning_rate": 9.053486983427534e-07, "loss": 0.75293839, "num_input_tokens_seen": 124132180, "step": 5770, "time_per_iteration": 2.818345785140991 }, { "auxiliary_loss_clip": 0.01277229, "auxiliary_loss_mlp": 0.01028719, "balance_loss_clip": 1.04521799, "balance_loss_mlp": 1.02187634, "epoch": 0.6939217218781939, "flos": 17528429721600.0, "grad_norm": 1.9155228687975974, "language_loss": 0.71127832, "learning_rate": 9.046968446485326e-07, "loss": 0.73433775, "num_input_tokens_seen": 124150585, "step": 5771, "time_per_iteration": 2.693904161453247 }, { "auxiliary_loss_clip": 0.0122539, "auxiliary_loss_mlp": 0.01026463, "balance_loss_clip": 1.04961479, "balance_loss_mlp": 1.01889324, "epoch": 0.6940419647688331, "flos": 18551776199040.0, "grad_norm": 2.1803383682428494, "language_loss": 0.7060945, "learning_rate": 9.040451571109295e-07, "loss": 0.72861302, "num_input_tokens_seen": 124166205, "step": 5772, "time_per_iteration": 2.557554006576538 }, { "auxiliary_loss_clip": 0.01239787, "auxiliary_loss_mlp": 0.00998778, "balance_loss_clip": 1.0205394, "balance_loss_mlp": 0.99771065, "epoch": 0.6941622076594721, "flos": 66926286829440.0, "grad_norm": 0.8281672444191933, "language_loss": 0.60397536, "learning_rate": 9.033936358288042e-07, "loss": 0.62636101, "num_input_tokens_seen": 124219940, "step": 5773, "time_per_iteration": 3.203559160232544 }, { "auxiliary_loss_clip": 0.01176076, "auxiliary_loss_mlp": 0.01024968, "balance_loss_clip": 1.04920089, "balance_loss_mlp": 1.01739264, "epoch": 0.6942824505501112, "flos": 26578062051840.0, "grad_norm": 2.3791559168784544, "language_loss": 0.82323587, "learning_rate": 9.027422809009937e-07, "loss": 0.84524632, "num_input_tokens_seen": 124239885, "step": 5774, "time_per_iteration": 2.6352412700653076 }, { "auxiliary_loss_clip": 0.01227001, "auxiliary_loss_mlp": 0.01023492, "balance_loss_clip": 1.04749107, "balance_loss_mlp": 1.01657462, "epoch": 0.6944026934407503, "flos": 21248308056960.0, "grad_norm": 1.749950572012988, "language_loss": 0.83178169, "learning_rate": 9.020910924263054e-07, "loss": 0.85428667, "num_input_tokens_seen": 124258410, "step": 5775, "time_per_iteration": 2.693902015686035 }, { "auxiliary_loss_clip": 0.01238048, "auxiliary_loss_mlp": 0.00999291, "balance_loss_clip": 1.02091932, "balance_loss_mlp": 0.99828339, "epoch": 0.6945229363313894, "flos": 70677191537280.0, "grad_norm": 0.8179131408682453, "language_loss": 0.58109522, "learning_rate": 9.014400705035261e-07, "loss": 0.6034686, "num_input_tokens_seen": 124315315, "step": 5776, "time_per_iteration": 4.260440826416016 }, { "auxiliary_loss_clip": 0.01176376, "auxiliary_loss_mlp": 0.01025675, "balance_loss_clip": 1.05203319, "balance_loss_mlp": 1.01865983, "epoch": 0.6946431792220285, "flos": 18952934267520.0, "grad_norm": 1.861198904612545, "language_loss": 0.7668494, "learning_rate": 9.00789215231414e-07, "loss": 0.78886986, "num_input_tokens_seen": 124333710, "step": 5777, "time_per_iteration": 3.626063823699951 }, { "auxiliary_loss_clip": 0.01327085, "auxiliary_loss_mlp": 0.02568952, "balance_loss_clip": 1.0421989, "balance_loss_mlp": 0.99988544, "epoch": 0.6947634221126676, "flos": 20338834671360.0, "grad_norm": 1.8225606397056584, "language_loss": 0.81981361, "learning_rate": 9.001385267087056e-07, "loss": 0.85877395, "num_input_tokens_seen": 124352855, "step": 5778, "time_per_iteration": 2.708718776702881 }, { "auxiliary_loss_clip": 0.01228678, "auxiliary_loss_mlp": 0.01024011, "balance_loss_clip": 1.04962897, "balance_loss_mlp": 1.0171957, "epoch": 0.6948836650033067, "flos": 21833723917440.0, "grad_norm": 1.672500466228321, "language_loss": 0.70606899, "learning_rate": 8.994880050341072e-07, "loss": 0.72859585, "num_input_tokens_seen": 124372960, "step": 5779, "time_per_iteration": 2.6294257640838623 }, { "auxiliary_loss_clip": 0.01273854, "auxiliary_loss_mlp": 0.01028608, "balance_loss_clip": 1.04737341, "balance_loss_mlp": 1.02169991, "epoch": 0.6950039078939457, "flos": 23657519024640.0, "grad_norm": 1.8927650005237129, "language_loss": 0.77766377, "learning_rate": 8.988376503063026e-07, "loss": 0.80068845, "num_input_tokens_seen": 124394220, "step": 5780, "time_per_iteration": 3.5834269523620605 }, { "auxiliary_loss_clip": 0.01383419, "auxiliary_loss_mlp": 0.01024342, "balance_loss_clip": 1.04663134, "balance_loss_mlp": 1.01679897, "epoch": 0.6951241507845849, "flos": 21792462168960.0, "grad_norm": 2.8118698563458273, "language_loss": 0.81509489, "learning_rate": 8.981874626239521e-07, "loss": 0.83917248, "num_input_tokens_seen": 124412795, "step": 5781, "time_per_iteration": 2.765078067779541 }, { "auxiliary_loss_clip": 0.01227229, "auxiliary_loss_mlp": 0.01029765, "balance_loss_clip": 1.05100536, "balance_loss_mlp": 1.02259445, "epoch": 0.695244393675224, "flos": 14647568244480.0, "grad_norm": 1.9915587309888167, "language_loss": 0.88834333, "learning_rate": 8.975374420856872e-07, "loss": 0.91091329, "num_input_tokens_seen": 124429690, "step": 5782, "time_per_iteration": 2.587526321411133 }, { "auxiliary_loss_clip": 0.01314925, "auxiliary_loss_mlp": 0.01022709, "balance_loss_clip": 1.04124033, "balance_loss_mlp": 1.01567602, "epoch": 0.695364636565863, "flos": 16873203778560.0, "grad_norm": 2.3017899439013396, "language_loss": 0.72313994, "learning_rate": 8.968875887901157e-07, "loss": 0.74651635, "num_input_tokens_seen": 124447070, "step": 5783, "time_per_iteration": 2.697949171066284 }, { "auxiliary_loss_clip": 0.01275088, "auxiliary_loss_mlp": 0.01025149, "balance_loss_clip": 1.04474878, "balance_loss_mlp": 1.01793945, "epoch": 0.6954848794565022, "flos": 19354523299200.0, "grad_norm": 2.1781912979946365, "language_loss": 0.62822652, "learning_rate": 8.9623790283582e-07, "loss": 0.6512289, "num_input_tokens_seen": 124464950, "step": 5784, "time_per_iteration": 3.5774013996124268 }, { "auxiliary_loss_clip": 0.01324705, "auxiliary_loss_mlp": 0.01029802, "balance_loss_clip": 1.04553533, "balance_loss_mlp": 1.02238691, "epoch": 0.6956051223471412, "flos": 18990209606400.0, "grad_norm": 2.3854345495506752, "language_loss": 0.76477683, "learning_rate": 8.955883843213561e-07, "loss": 0.78832191, "num_input_tokens_seen": 124483965, "step": 5785, "time_per_iteration": 2.7479870319366455 }, { "auxiliary_loss_clip": 0.01230392, "auxiliary_loss_mlp": 0.01030797, "balance_loss_clip": 1.04839182, "balance_loss_mlp": 1.02341461, "epoch": 0.6957253652377803, "flos": 16107229226880.0, "grad_norm": 1.7727296439869569, "language_loss": 0.86973792, "learning_rate": 8.949390333452569e-07, "loss": 0.89234984, "num_input_tokens_seen": 124501910, "step": 5786, "time_per_iteration": 2.7972569465637207 }, { "auxiliary_loss_clip": 0.01176032, "auxiliary_loss_mlp": 0.01031304, "balance_loss_clip": 1.05188107, "balance_loss_mlp": 1.02444363, "epoch": 0.6958456081284194, "flos": 29388646569600.0, "grad_norm": 2.569776164111555, "language_loss": 0.67827952, "learning_rate": 8.942898500060279e-07, "loss": 0.70035291, "num_input_tokens_seen": 124521625, "step": 5787, "time_per_iteration": 2.667105197906494 }, { "auxiliary_loss_clip": 0.01280622, "auxiliary_loss_mlp": 0.01028728, "balance_loss_clip": 1.04785264, "balance_loss_mlp": 1.0214386, "epoch": 0.6959658510190585, "flos": 25154850395520.0, "grad_norm": 2.6127303585525103, "language_loss": 0.721937, "learning_rate": 8.936408344021493e-07, "loss": 0.74503052, "num_input_tokens_seen": 124538540, "step": 5788, "time_per_iteration": 2.823032855987549 }, { "auxiliary_loss_clip": 0.01183195, "auxiliary_loss_mlp": 0.01030633, "balance_loss_clip": 1.05083334, "balance_loss_mlp": 1.02269983, "epoch": 0.6960860939096976, "flos": 42814388759040.0, "grad_norm": 2.4493277172156636, "language_loss": 0.71133149, "learning_rate": 8.929919866320765e-07, "loss": 0.73346972, "num_input_tokens_seen": 124559355, "step": 5789, "time_per_iteration": 2.853386640548706 }, { "auxiliary_loss_clip": 0.01332753, "auxiliary_loss_mlp": 0.02568753, "balance_loss_clip": 1.0477643, "balance_loss_mlp": 0.99986953, "epoch": 0.6962063368003367, "flos": 17566566986880.0, "grad_norm": 2.263201015166706, "language_loss": 0.81476259, "learning_rate": 8.923433067942385e-07, "loss": 0.85377765, "num_input_tokens_seen": 124577920, "step": 5790, "time_per_iteration": 2.811659336090088 }, { "auxiliary_loss_clip": 0.01232587, "auxiliary_loss_mlp": 0.01026697, "balance_loss_clip": 1.04846334, "balance_loss_mlp": 1.02024174, "epoch": 0.6963265796909758, "flos": 21251648021760.0, "grad_norm": 1.9813255822872426, "language_loss": 0.68725955, "learning_rate": 8.916947949870417e-07, "loss": 0.7098524, "num_input_tokens_seen": 124597585, "step": 5791, "time_per_iteration": 2.7138617038726807 }, { "auxiliary_loss_clip": 0.01123914, "auxiliary_loss_mlp": 0.01003355, "balance_loss_clip": 1.01201606, "balance_loss_mlp": 1.00238895, "epoch": 0.6964468225816148, "flos": 68828295801600.0, "grad_norm": 0.7356206913647154, "language_loss": 0.58054137, "learning_rate": 8.910464513088615e-07, "loss": 0.60181415, "num_input_tokens_seen": 124661625, "step": 5792, "time_per_iteration": 3.2817447185516357 }, { "auxiliary_loss_clip": 0.01270675, "auxiliary_loss_mlp": 0.0102809, "balance_loss_clip": 1.045784, "balance_loss_mlp": 1.02093482, "epoch": 0.696567065472254, "flos": 18950887192320.0, "grad_norm": 2.2370995792112995, "language_loss": 0.78759843, "learning_rate": 8.903982758580542e-07, "loss": 0.81058609, "num_input_tokens_seen": 124680565, "step": 5793, "time_per_iteration": 2.6674845218658447 }, { "auxiliary_loss_clip": 0.01279808, "auxiliary_loss_mlp": 0.01024297, "balance_loss_clip": 1.04929948, "balance_loss_mlp": 1.01688254, "epoch": 0.696687308362893, "flos": 22856675345280.0, "grad_norm": 2.1772907346935186, "language_loss": 0.80069852, "learning_rate": 8.897502687329457e-07, "loss": 0.82373959, "num_input_tokens_seen": 124700365, "step": 5794, "time_per_iteration": 2.6819489002227783 }, { "auxiliary_loss_clip": 0.01320613, "auxiliary_loss_mlp": 0.01024917, "balance_loss_clip": 1.04337978, "balance_loss_mlp": 1.01780045, "epoch": 0.6968075512535321, "flos": 24972926987520.0, "grad_norm": 1.9145156329558919, "language_loss": 0.79746091, "learning_rate": 8.891024300318382e-07, "loss": 0.82091624, "num_input_tokens_seen": 124718935, "step": 5795, "time_per_iteration": 2.7757420539855957 }, { "auxiliary_loss_clip": 0.01317049, "auxiliary_loss_mlp": 0.01022083, "balance_loss_clip": 1.04129076, "balance_loss_mlp": 1.01540995, "epoch": 0.6969277941441713, "flos": 21030438113280.0, "grad_norm": 2.7125023247527147, "language_loss": 0.7592212, "learning_rate": 8.884547598530103e-07, "loss": 0.78261256, "num_input_tokens_seen": 124739505, "step": 5796, "time_per_iteration": 2.7578392028808594 }, { "auxiliary_loss_clip": 0.01465185, "auxiliary_loss_mlp": 0.01032793, "balance_loss_clip": 1.03893018, "balance_loss_mlp": 1.02570033, "epoch": 0.6970480370348103, "flos": 21579404647680.0, "grad_norm": 1.985877231942512, "language_loss": 0.75336051, "learning_rate": 8.8780725829471e-07, "loss": 0.77834034, "num_input_tokens_seen": 124757410, "step": 5797, "time_per_iteration": 3.012160062789917 }, { "auxiliary_loss_clip": 0.01175217, "auxiliary_loss_mlp": 0.0103361, "balance_loss_clip": 1.04906058, "balance_loss_mlp": 1.02639198, "epoch": 0.6971682799254494, "flos": 22419175691520.0, "grad_norm": 4.9022555669031, "language_loss": 0.78180516, "learning_rate": 8.87159925455165e-07, "loss": 0.80389345, "num_input_tokens_seen": 124777240, "step": 5798, "time_per_iteration": 2.8051111698150635 }, { "auxiliary_loss_clip": 0.01324944, "auxiliary_loss_mlp": 0.01029597, "balance_loss_clip": 1.04757321, "balance_loss_mlp": 1.02265954, "epoch": 0.6972885228160886, "flos": 20005834659840.0, "grad_norm": 1.797610456467201, "language_loss": 0.73438358, "learning_rate": 8.865127614325738e-07, "loss": 0.75792897, "num_input_tokens_seen": 124795670, "step": 5799, "time_per_iteration": 2.7182815074920654 }, { "auxiliary_loss_clip": 0.01273132, "auxiliary_loss_mlp": 0.0103201, "balance_loss_clip": 1.04509115, "balance_loss_mlp": 1.02437782, "epoch": 0.6974087657067276, "flos": 37853437656960.0, "grad_norm": 2.1695924492897736, "language_loss": 0.67172706, "learning_rate": 8.85865766325113e-07, "loss": 0.69477844, "num_input_tokens_seen": 124819600, "step": 5800, "time_per_iteration": 2.767507791519165 }, { "auxiliary_loss_clip": 0.01273929, "auxiliary_loss_mlp": 0.01026212, "balance_loss_clip": 1.04632258, "balance_loss_mlp": 1.01925921, "epoch": 0.6975290085973667, "flos": 29489267543040.0, "grad_norm": 2.40810108641054, "language_loss": 0.72541326, "learning_rate": 8.852189402309287e-07, "loss": 0.74841464, "num_input_tokens_seen": 124838785, "step": 5801, "time_per_iteration": 2.7830047607421875 }, { "auxiliary_loss_clip": 0.01224848, "auxiliary_loss_mlp": 0.01029825, "balance_loss_clip": 1.05020213, "balance_loss_mlp": 1.02251124, "epoch": 0.6976492514880057, "flos": 12895630295040.0, "grad_norm": 9.111159253241118, "language_loss": 0.74489874, "learning_rate": 8.845722832481441e-07, "loss": 0.76744545, "num_input_tokens_seen": 124854215, "step": 5802, "time_per_iteration": 3.54976487159729 }, { "auxiliary_loss_clip": 0.01226334, "auxiliary_loss_mlp": 0.01027625, "balance_loss_clip": 1.05032349, "balance_loss_mlp": 1.02032983, "epoch": 0.6977694943786449, "flos": 24352929308160.0, "grad_norm": 2.3041092505730614, "language_loss": 0.77724707, "learning_rate": 8.83925795474858e-07, "loss": 0.79978669, "num_input_tokens_seen": 124874340, "step": 5803, "time_per_iteration": 3.53317928314209 }, { "auxiliary_loss_clip": 0.01318959, "auxiliary_loss_mlp": 0.01030499, "balance_loss_clip": 1.04699755, "balance_loss_mlp": 1.02247381, "epoch": 0.6978897372692839, "flos": 29898470257920.0, "grad_norm": 2.7402578972927008, "language_loss": 0.59354579, "learning_rate": 8.832794770091414e-07, "loss": 0.6170404, "num_input_tokens_seen": 124895175, "step": 5804, "time_per_iteration": 2.7984459400177 }, { "auxiliary_loss_clip": 0.01280988, "auxiliary_loss_mlp": 0.01025988, "balance_loss_clip": 1.04769421, "balance_loss_mlp": 1.01897883, "epoch": 0.698009980159923, "flos": 21761579450880.0, "grad_norm": 2.954649139979604, "language_loss": 0.82629001, "learning_rate": 8.826333279490401e-07, "loss": 0.84935975, "num_input_tokens_seen": 124915810, "step": 5805, "time_per_iteration": 2.7883288860321045 }, { "auxiliary_loss_clip": 0.01278159, "auxiliary_loss_mlp": 0.01024242, "balance_loss_clip": 1.05097437, "balance_loss_mlp": 1.01718795, "epoch": 0.6981302230505622, "flos": 19857164267520.0, "grad_norm": 2.5411725332919697, "language_loss": 0.67729867, "learning_rate": 8.819873483925748e-07, "loss": 0.70032269, "num_input_tokens_seen": 124932930, "step": 5806, "time_per_iteration": 3.462571859359741 }, { "auxiliary_loss_clip": 0.01330317, "auxiliary_loss_mlp": 0.02564827, "balance_loss_clip": 1.04842615, "balance_loss_mlp": 0.99989778, "epoch": 0.6982504659412012, "flos": 22198648141440.0, "grad_norm": 2.4327074179248878, "language_loss": 0.74841928, "learning_rate": 8.81341538437739e-07, "loss": 0.78737068, "num_input_tokens_seen": 124951220, "step": 5807, "time_per_iteration": 2.7744383811950684 }, { "auxiliary_loss_clip": 0.012776, "auxiliary_loss_mlp": 0.01023597, "balance_loss_clip": 1.04509497, "balance_loss_mlp": 1.01658416, "epoch": 0.6983707088318403, "flos": 35588479708800.0, "grad_norm": 1.7295664741324293, "language_loss": 0.67966688, "learning_rate": 8.80695898182503e-07, "loss": 0.70267886, "num_input_tokens_seen": 124972200, "step": 5808, "time_per_iteration": 2.7920539379119873 }, { "auxiliary_loss_clip": 0.01129753, "auxiliary_loss_mlp": 0.01003226, "balance_loss_clip": 1.02112424, "balance_loss_mlp": 1.00223053, "epoch": 0.6984909517224794, "flos": 65440052760960.0, "grad_norm": 0.9095171919704939, "language_loss": 0.65056229, "learning_rate": 8.800504277248093e-07, "loss": 0.67189205, "num_input_tokens_seen": 125036950, "step": 5809, "time_per_iteration": 3.228753089904785 }, { "auxiliary_loss_clip": 0.01327372, "auxiliary_loss_mlp": 0.02565784, "balance_loss_clip": 1.05268168, "balance_loss_mlp": 0.99991202, "epoch": 0.6986111946131185, "flos": 18546927863040.0, "grad_norm": 2.3173141626109297, "language_loss": 0.75180709, "learning_rate": 8.794051271625753e-07, "loss": 0.79073864, "num_input_tokens_seen": 125054585, "step": 5810, "time_per_iteration": 3.606156349182129 }, { "auxiliary_loss_clip": 0.01271719, "auxiliary_loss_mlp": 0.01027832, "balance_loss_clip": 1.04639482, "balance_loss_mlp": 1.02078927, "epoch": 0.6987314375037575, "flos": 23039173370880.0, "grad_norm": 3.0905194465099948, "language_loss": 0.83319986, "learning_rate": 8.787599965936925e-07, "loss": 0.85619533, "num_input_tokens_seen": 125075515, "step": 5811, "time_per_iteration": 2.680300235748291 }, { "auxiliary_loss_clip": 0.01318539, "auxiliary_loss_mlp": 0.01032604, "balance_loss_clip": 1.0461334, "balance_loss_mlp": 1.02550197, "epoch": 0.6988516803943967, "flos": 38400393029760.0, "grad_norm": 1.6725548823844794, "language_loss": 0.7232275, "learning_rate": 8.781150361160261e-07, "loss": 0.74673891, "num_input_tokens_seen": 125097425, "step": 5812, "time_per_iteration": 2.878025531768799 }, { "auxiliary_loss_clip": 0.0122796, "auxiliary_loss_mlp": 0.01033205, "balance_loss_clip": 1.04790401, "balance_loss_mlp": 1.02590942, "epoch": 0.6989719232850358, "flos": 24096993926400.0, "grad_norm": 2.5772296291025056, "language_loss": 0.73710942, "learning_rate": 8.774702458274181e-07, "loss": 0.75972104, "num_input_tokens_seen": 125117830, "step": 5813, "time_per_iteration": 2.727648973464966 }, { "auxiliary_loss_clip": 0.01230696, "auxiliary_loss_mlp": 0.01024859, "balance_loss_clip": 1.0521946, "balance_loss_mlp": 1.01742649, "epoch": 0.6990921661756748, "flos": 14866838818560.0, "grad_norm": 2.148759666128042, "language_loss": 0.71332014, "learning_rate": 8.768256258256799e-07, "loss": 0.73587561, "num_input_tokens_seen": 125134455, "step": 5814, "time_per_iteration": 2.6070117950439453 }, { "auxiliary_loss_clip": 0.01227696, "auxiliary_loss_mlp": 0.01026317, "balance_loss_clip": 1.04848146, "balance_loss_mlp": 1.01946282, "epoch": 0.699212409066314, "flos": 20193719725440.0, "grad_norm": 1.8452299685138527, "language_loss": 0.73955524, "learning_rate": 8.76181176208602e-07, "loss": 0.76209539, "num_input_tokens_seen": 125152555, "step": 5815, "time_per_iteration": 2.6505184173583984 }, { "auxiliary_loss_clip": 0.01373401, "auxiliary_loss_mlp": 0.01031422, "balance_loss_clip": 1.04304183, "balance_loss_mlp": 1.02309489, "epoch": 0.699332651956953, "flos": 19427888828160.0, "grad_norm": 1.8933104179098255, "language_loss": 0.73378682, "learning_rate": 8.755368970739461e-07, "loss": 0.75783503, "num_input_tokens_seen": 125171915, "step": 5816, "time_per_iteration": 2.7742233276367188 }, { "auxiliary_loss_clip": 0.01332288, "auxiliary_loss_mlp": 0.01025441, "balance_loss_clip": 1.0467931, "balance_loss_mlp": 1.01793039, "epoch": 0.6994528948475921, "flos": 16143714466560.0, "grad_norm": 2.734759639855676, "language_loss": 0.61735755, "learning_rate": 8.748927885194479e-07, "loss": 0.64093482, "num_input_tokens_seen": 125190220, "step": 5817, "time_per_iteration": 2.697021484375 }, { "auxiliary_loss_clip": 0.01215241, "auxiliary_loss_mlp": 0.01003869, "balance_loss_clip": 1.01029813, "balance_loss_mlp": 1.00289142, "epoch": 0.6995731377382313, "flos": 64952420699520.0, "grad_norm": 0.7865130506160781, "language_loss": 0.57381237, "learning_rate": 8.742488506428209e-07, "loss": 0.59600341, "num_input_tokens_seen": 125249310, "step": 5818, "time_per_iteration": 3.2413151264190674 }, { "auxiliary_loss_clip": 0.01277902, "auxiliary_loss_mlp": 0.02564764, "balance_loss_clip": 1.04789424, "balance_loss_mlp": 0.99985957, "epoch": 0.6996933806288703, "flos": 24900136076160.0, "grad_norm": 2.0636774477627196, "language_loss": 0.78692579, "learning_rate": 8.736050835417466e-07, "loss": 0.82535249, "num_input_tokens_seen": 125269350, "step": 5819, "time_per_iteration": 2.7432892322540283 }, { "auxiliary_loss_clip": 0.01230485, "auxiliary_loss_mlp": 0.01029431, "balance_loss_clip": 1.04976249, "balance_loss_mlp": 1.02270186, "epoch": 0.6998136235195094, "flos": 20777806782720.0, "grad_norm": 1.9379183069207455, "language_loss": 0.61553961, "learning_rate": 8.729614873138862e-07, "loss": 0.63813877, "num_input_tokens_seen": 125286985, "step": 5820, "time_per_iteration": 2.6084043979644775 }, { "auxiliary_loss_clip": 0.01377412, "auxiliary_loss_mlp": 0.01027017, "balance_loss_clip": 1.04702473, "balance_loss_mlp": 1.01967406, "epoch": 0.6999338664101485, "flos": 23733470332800.0, "grad_norm": 2.756183243150909, "language_loss": 0.77806294, "learning_rate": 8.723180620568716e-07, "loss": 0.80210727, "num_input_tokens_seen": 125306240, "step": 5821, "time_per_iteration": 2.7775988578796387 }, { "auxiliary_loss_clip": 0.01178605, "auxiliary_loss_mlp": 0.01026021, "balance_loss_clip": 1.04764771, "balance_loss_mlp": 1.01903486, "epoch": 0.7000541093007876, "flos": 19864598382720.0, "grad_norm": 1.8764377377043409, "language_loss": 0.84991288, "learning_rate": 8.716748078683116e-07, "loss": 0.87195909, "num_input_tokens_seen": 125323015, "step": 5822, "time_per_iteration": 2.645879030227661 }, { "auxiliary_loss_clip": 0.01466331, "auxiliary_loss_mlp": 0.01025146, "balance_loss_clip": 1.03963256, "balance_loss_mlp": 1.01802921, "epoch": 0.7001743521914267, "flos": 29679056029440.0, "grad_norm": 6.126701491483206, "language_loss": 0.68969589, "learning_rate": 8.710317248457855e-07, "loss": 0.71461058, "num_input_tokens_seen": 125342630, "step": 5823, "time_per_iteration": 2.9832546710968018 }, { "auxiliary_loss_clip": 0.01270291, "auxiliary_loss_mlp": 0.01026264, "balance_loss_clip": 1.04893577, "balance_loss_mlp": 1.0193646, "epoch": 0.7002945950820658, "flos": 27489762080640.0, "grad_norm": 2.03196424882431, "language_loss": 0.72150552, "learning_rate": 8.703888130868482e-07, "loss": 0.74447107, "num_input_tokens_seen": 125364480, "step": 5824, "time_per_iteration": 2.9289066791534424 }, { "auxiliary_loss_clip": 0.0132414, "auxiliary_loss_mlp": 0.0103274, "balance_loss_clip": 1.04652596, "balance_loss_mlp": 1.02515805, "epoch": 0.7004148379727049, "flos": 22158463800960.0, "grad_norm": 2.912047673445961, "language_loss": 0.82068974, "learning_rate": 8.697460726890307e-07, "loss": 0.84425855, "num_input_tokens_seen": 125381625, "step": 5825, "time_per_iteration": 2.8161470890045166 }, { "auxiliary_loss_clip": 0.01325322, "auxiliary_loss_mlp": 0.02568508, "balance_loss_clip": 1.04228449, "balance_loss_mlp": 0.99985254, "epoch": 0.7005350808633439, "flos": 19423758764160.0, "grad_norm": 2.8863477176638184, "language_loss": 0.9051851, "learning_rate": 8.691035037498354e-07, "loss": 0.94412339, "num_input_tokens_seen": 125397615, "step": 5826, "time_per_iteration": 2.738332509994507 }, { "auxiliary_loss_clip": 0.01273697, "auxiliary_loss_mlp": 0.01021856, "balance_loss_clip": 1.04404545, "balance_loss_mlp": 1.01519787, "epoch": 0.7006553237539831, "flos": 23476708938240.0, "grad_norm": 1.6930051593025202, "language_loss": 0.72533596, "learning_rate": 8.684611063667391e-07, "loss": 0.74829149, "num_input_tokens_seen": 125418080, "step": 5827, "time_per_iteration": 2.6811234951019287 }, { "auxiliary_loss_clip": 0.01225578, "auxiliary_loss_mlp": 0.01022821, "balance_loss_clip": 1.04703009, "balance_loss_mlp": 1.0158329, "epoch": 0.7007755666446221, "flos": 31212872640000.0, "grad_norm": 8.924215925267575, "language_loss": 0.76759487, "learning_rate": 8.678188806371935e-07, "loss": 0.79007882, "num_input_tokens_seen": 125440115, "step": 5828, "time_per_iteration": 2.7661795616149902 }, { "auxiliary_loss_clip": 0.01222941, "auxiliary_loss_mlp": 0.01022918, "balance_loss_clip": 1.04635191, "balance_loss_mlp": 1.0161469, "epoch": 0.7008958095352612, "flos": 18149899858560.0, "grad_norm": 4.644818167254538, "language_loss": 0.85371614, "learning_rate": 8.671768266586228e-07, "loss": 0.87617475, "num_input_tokens_seen": 125458240, "step": 5829, "time_per_iteration": 4.507591485977173 }, { "auxiliary_loss_clip": 0.01322516, "auxiliary_loss_mlp": 0.01029864, "balance_loss_clip": 1.04527092, "balance_loss_mlp": 1.02315867, "epoch": 0.7010160524259004, "flos": 27452307173760.0, "grad_norm": 1.7334660006742766, "language_loss": 0.78183752, "learning_rate": 8.665349445284275e-07, "loss": 0.80536127, "num_input_tokens_seen": 125477980, "step": 5830, "time_per_iteration": 2.771275520324707 }, { "auxiliary_loss_clip": 0.01326066, "auxiliary_loss_mlp": 0.01027281, "balance_loss_clip": 1.04833698, "balance_loss_mlp": 1.01993823, "epoch": 0.7011362953165394, "flos": 23842064125440.0, "grad_norm": 1.5061386227351443, "language_loss": 0.80914295, "learning_rate": 8.658932343439799e-07, "loss": 0.83267647, "num_input_tokens_seen": 125497765, "step": 5831, "time_per_iteration": 3.6157174110412598 }, { "auxiliary_loss_clip": 0.01178661, "auxiliary_loss_mlp": 0.0102918, "balance_loss_clip": 1.05232799, "balance_loss_mlp": 1.021837, "epoch": 0.7012565382071785, "flos": 24823430582400.0, "grad_norm": 2.091485899682168, "language_loss": 0.77733493, "learning_rate": 8.65251696202627e-07, "loss": 0.79941332, "num_input_tokens_seen": 125514145, "step": 5832, "time_per_iteration": 2.6113028526306152 }, { "auxiliary_loss_clip": 0.01224487, "auxiliary_loss_mlp": 0.0102882, "balance_loss_clip": 1.04647875, "balance_loss_mlp": 1.02137589, "epoch": 0.7013767810978175, "flos": 21397445326080.0, "grad_norm": 2.2499485188242576, "language_loss": 0.87401867, "learning_rate": 8.646103302016896e-07, "loss": 0.89655173, "num_input_tokens_seen": 125533115, "step": 5833, "time_per_iteration": 2.6925222873687744 }, { "auxiliary_loss_clip": 0.01383361, "auxiliary_loss_mlp": 0.01026562, "balance_loss_clip": 1.04424441, "balance_loss_mlp": 1.01885509, "epoch": 0.7014970239884567, "flos": 16687150306560.0, "grad_norm": 2.7300553500435187, "language_loss": 0.88217622, "learning_rate": 8.639691364384614e-07, "loss": 0.90627539, "num_input_tokens_seen": 125550740, "step": 5834, "time_per_iteration": 2.673872470855713 }, { "auxiliary_loss_clip": 0.0127722, "auxiliary_loss_mlp": 0.01030574, "balance_loss_clip": 1.04841197, "balance_loss_mlp": 1.023368, "epoch": 0.7016172668790958, "flos": 12568268718720.0, "grad_norm": 2.515700738086435, "language_loss": 0.72301596, "learning_rate": 8.633281150102136e-07, "loss": 0.74609387, "num_input_tokens_seen": 125567590, "step": 5835, "time_per_iteration": 2.7368955612182617 }, { "auxiliary_loss_clip": 0.01271882, "auxiliary_loss_mlp": 0.01021388, "balance_loss_clip": 1.04716158, "balance_loss_mlp": 1.01435518, "epoch": 0.7017375097697348, "flos": 17452729808640.0, "grad_norm": 3.3486200728699425, "language_loss": 0.67551512, "learning_rate": 8.626872660141855e-07, "loss": 0.69844782, "num_input_tokens_seen": 125585500, "step": 5836, "time_per_iteration": 3.5676157474517822 }, { "auxiliary_loss_clip": 0.01372814, "auxiliary_loss_mlp": 0.01033274, "balance_loss_clip": 1.04675174, "balance_loss_mlp": 1.02611542, "epoch": 0.701857752660374, "flos": 18513028402560.0, "grad_norm": 3.5928782627405673, "language_loss": 0.74956286, "learning_rate": 8.620465895475957e-07, "loss": 0.7736237, "num_input_tokens_seen": 125603720, "step": 5837, "time_per_iteration": 2.645437479019165 }, { "auxiliary_loss_clip": 0.013704, "auxiliary_loss_mlp": 0.01031762, "balance_loss_clip": 1.04538417, "balance_loss_mlp": 1.02466893, "epoch": 0.701977995551013, "flos": 24425971614720.0, "grad_norm": 1.4458550645792665, "language_loss": 0.7539798, "learning_rate": 8.614060857076333e-07, "loss": 0.77800143, "num_input_tokens_seen": 125624390, "step": 5838, "time_per_iteration": 2.665581226348877 }, { "auxiliary_loss_clip": 0.01270931, "auxiliary_loss_mlp": 0.01028675, "balance_loss_clip": 1.04628468, "balance_loss_mlp": 1.02142119, "epoch": 0.7020982384416521, "flos": 23002759958400.0, "grad_norm": 2.537891675864813, "language_loss": 0.75265181, "learning_rate": 8.60765754591462e-07, "loss": 0.77564788, "num_input_tokens_seen": 125644085, "step": 5839, "time_per_iteration": 2.5448806285858154 }, { "auxiliary_loss_clip": 0.01175151, "auxiliary_loss_mlp": 0.01021948, "balance_loss_clip": 1.04925156, "balance_loss_mlp": 1.01445544, "epoch": 0.7022184813322913, "flos": 20449080489600.0, "grad_norm": 2.3361447180321306, "language_loss": 0.72705787, "learning_rate": 8.601255962962211e-07, "loss": 0.7490288, "num_input_tokens_seen": 125663095, "step": 5840, "time_per_iteration": 2.5022740364074707 }, { "auxiliary_loss_clip": 0.01235639, "auxiliary_loss_mlp": 0.01030932, "balance_loss_clip": 1.05236316, "balance_loss_mlp": 1.02279603, "epoch": 0.7023387242229303, "flos": 19790514581760.0, "grad_norm": 2.342792630063301, "language_loss": 0.72028518, "learning_rate": 8.594856109190194e-07, "loss": 0.74295092, "num_input_tokens_seen": 125680125, "step": 5841, "time_per_iteration": 2.6082193851470947 }, { "auxiliary_loss_clip": 0.01175711, "auxiliary_loss_mlp": 0.01022886, "balance_loss_clip": 1.05000961, "balance_loss_mlp": 1.01583481, "epoch": 0.7024589671135694, "flos": 33259278286080.0, "grad_norm": 1.7383305081462501, "language_loss": 0.69289207, "learning_rate": 8.588457985569446e-07, "loss": 0.71487808, "num_input_tokens_seen": 125703035, "step": 5842, "time_per_iteration": 2.672624111175537 }, { "auxiliary_loss_clip": 0.0117785, "auxiliary_loss_mlp": 0.01022049, "balance_loss_clip": 1.05060935, "balance_loss_mlp": 1.01484299, "epoch": 0.7025792100042085, "flos": 19098982967040.0, "grad_norm": 2.0507915880196377, "language_loss": 0.71899569, "learning_rate": 8.582061593070542e-07, "loss": 0.74099463, "num_input_tokens_seen": 125723765, "step": 5843, "time_per_iteration": 2.6070573329925537 }, { "auxiliary_loss_clip": 0.01178074, "auxiliary_loss_mlp": 0.02568107, "balance_loss_clip": 1.05160713, "balance_loss_mlp": 0.99989367, "epoch": 0.7026994528948476, "flos": 18952611045120.0, "grad_norm": 2.9068587662098295, "language_loss": 0.77310216, "learning_rate": 8.57566693266383e-07, "loss": 0.81056398, "num_input_tokens_seen": 125741455, "step": 5844, "time_per_iteration": 2.590712785720825 }, { "auxiliary_loss_clip": 0.01280028, "auxiliary_loss_mlp": 0.02567463, "balance_loss_clip": 1.0468905, "balance_loss_mlp": 0.99987864, "epoch": 0.7028196957854866, "flos": 19536662188800.0, "grad_norm": 2.2356630541363756, "language_loss": 0.69568866, "learning_rate": 8.569274005319354e-07, "loss": 0.73416358, "num_input_tokens_seen": 125759855, "step": 5845, "time_per_iteration": 2.6465060710906982 }, { "auxiliary_loss_clip": 0.01223054, "auxiliary_loss_mlp": 0.01029243, "balance_loss_clip": 1.04733467, "balance_loss_mlp": 1.02223361, "epoch": 0.7029399386761258, "flos": 20845318394880.0, "grad_norm": 1.766331721677987, "language_loss": 0.7950592, "learning_rate": 8.562882812006913e-07, "loss": 0.81758213, "num_input_tokens_seen": 125777345, "step": 5846, "time_per_iteration": 2.6434860229492188 }, { "auxiliary_loss_clip": 0.01174772, "auxiliary_loss_mlp": 0.01025516, "balance_loss_clip": 1.04964089, "balance_loss_mlp": 1.01831865, "epoch": 0.7030601815667649, "flos": 22055005653120.0, "grad_norm": 2.1766117444183997, "language_loss": 0.77882189, "learning_rate": 8.556493353696066e-07, "loss": 0.80082476, "num_input_tokens_seen": 125796345, "step": 5847, "time_per_iteration": 2.575765609741211 }, { "auxiliary_loss_clip": 0.01229444, "auxiliary_loss_mlp": 0.02571439, "balance_loss_clip": 1.05065453, "balance_loss_mlp": 0.99989539, "epoch": 0.7031804244574039, "flos": 27198742089600.0, "grad_norm": 57.7171915874426, "language_loss": 0.68232363, "learning_rate": 8.550105631356077e-07, "loss": 0.72033238, "num_input_tokens_seen": 125816070, "step": 5848, "time_per_iteration": 2.6934595108032227 }, { "auxiliary_loss_clip": 0.01320834, "auxiliary_loss_mlp": 0.01027999, "balance_loss_clip": 1.04310489, "balance_loss_mlp": 1.02038169, "epoch": 0.7033006673480431, "flos": 22379853277440.0, "grad_norm": 3.079164227919468, "language_loss": 0.77285826, "learning_rate": 8.543719645955961e-07, "loss": 0.79634666, "num_input_tokens_seen": 125834400, "step": 5849, "time_per_iteration": 2.714679718017578 }, { "auxiliary_loss_clip": 0.01276106, "auxiliary_loss_mlp": 0.01024861, "balance_loss_clip": 1.04695451, "balance_loss_mlp": 1.01817656, "epoch": 0.7034209102386821, "flos": 24715986024960.0, "grad_norm": 1.701145044297368, "language_loss": 0.74486208, "learning_rate": 8.537335398464467e-07, "loss": 0.76787174, "num_input_tokens_seen": 125854720, "step": 5850, "time_per_iteration": 2.6793060302734375 }, { "auxiliary_loss_clip": 0.01276289, "auxiliary_loss_mlp": 0.0102752, "balance_loss_clip": 1.04513073, "balance_loss_mlp": 1.02022147, "epoch": 0.7035411531293212, "flos": 22556174163840.0, "grad_norm": 2.7951797320621354, "language_loss": 0.85315126, "learning_rate": 8.53095288985007e-07, "loss": 0.87618935, "num_input_tokens_seen": 125868455, "step": 5851, "time_per_iteration": 2.693242073059082 }, { "auxiliary_loss_clip": 0.01173585, "auxiliary_loss_mlp": 0.01021946, "balance_loss_clip": 1.05021632, "balance_loss_mlp": 1.01555955, "epoch": 0.7036613960199604, "flos": 22674967418880.0, "grad_norm": 2.3081902934757017, "language_loss": 0.82448292, "learning_rate": 8.524572121081009e-07, "loss": 0.84643829, "num_input_tokens_seen": 125888555, "step": 5852, "time_per_iteration": 2.592787742614746 }, { "auxiliary_loss_clip": 0.01228457, "auxiliary_loss_mlp": 0.01025121, "balance_loss_clip": 1.04767096, "balance_loss_mlp": 1.01793325, "epoch": 0.7037816389105994, "flos": 22492146170880.0, "grad_norm": 4.074057104036073, "language_loss": 0.62265337, "learning_rate": 8.518193093125232e-07, "loss": 0.64518917, "num_input_tokens_seen": 125907610, "step": 5853, "time_per_iteration": 2.6831910610198975 }, { "auxiliary_loss_clip": 0.01280946, "auxiliary_loss_mlp": 0.01026854, "balance_loss_clip": 1.04924107, "balance_loss_mlp": 1.02014804, "epoch": 0.7039018818012385, "flos": 27087490690560.0, "grad_norm": 2.1546275681548996, "language_loss": 0.8091526, "learning_rate": 8.511815806950436e-07, "loss": 0.83223057, "num_input_tokens_seen": 125928640, "step": 5854, "time_per_iteration": 2.683828115463257 }, { "auxiliary_loss_clip": 0.01219837, "auxiliary_loss_mlp": 0.01024687, "balance_loss_clip": 1.0453577, "balance_loss_mlp": 1.01766872, "epoch": 0.7040221246918776, "flos": 17749819198080.0, "grad_norm": 2.2492706098012127, "language_loss": 0.77795136, "learning_rate": 8.505440263524044e-07, "loss": 0.80039656, "num_input_tokens_seen": 125947485, "step": 5855, "time_per_iteration": 4.396458864212036 }, { "auxiliary_loss_clip": 0.01227481, "auxiliary_loss_mlp": 0.01030333, "balance_loss_clip": 1.04681945, "balance_loss_mlp": 1.02315938, "epoch": 0.7041423675825167, "flos": 16279851012480.0, "grad_norm": 3.834735511742285, "language_loss": 0.88285607, "learning_rate": 8.49906646381322e-07, "loss": 0.90543419, "num_input_tokens_seen": 125960320, "step": 5856, "time_per_iteration": 2.604480266571045 }, { "auxiliary_loss_clip": 0.01224927, "auxiliary_loss_mlp": 0.01026677, "balance_loss_clip": 1.04611909, "balance_loss_mlp": 1.0193758, "epoch": 0.7042626104731557, "flos": 25483181639040.0, "grad_norm": 2.30531638708349, "language_loss": 0.72750664, "learning_rate": 8.492694408784884e-07, "loss": 0.75002265, "num_input_tokens_seen": 125980575, "step": 5857, "time_per_iteration": 3.790470838546753 }, { "auxiliary_loss_clip": 0.01231006, "auxiliary_loss_mlp": 0.01027191, "balance_loss_clip": 1.04977155, "balance_loss_mlp": 1.02017248, "epoch": 0.7043828533637949, "flos": 17857622891520.0, "grad_norm": 2.9448212336801287, "language_loss": 0.62701344, "learning_rate": 8.486324099405642e-07, "loss": 0.64959538, "num_input_tokens_seen": 125997420, "step": 5858, "time_per_iteration": 2.621645927429199 }, { "auxiliary_loss_clip": 0.01223374, "auxiliary_loss_mlp": 0.01020538, "balance_loss_clip": 1.04770017, "balance_loss_mlp": 1.01385331, "epoch": 0.704503096254434, "flos": 29494259533440.0, "grad_norm": 1.7128199747647768, "language_loss": 0.74780846, "learning_rate": 8.479955536641887e-07, "loss": 0.77024764, "num_input_tokens_seen": 126018915, "step": 5859, "time_per_iteration": 2.6963984966278076 }, { "auxiliary_loss_clip": 0.01267917, "auxiliary_loss_mlp": 0.01030276, "balance_loss_clip": 1.04182363, "balance_loss_mlp": 1.02308452, "epoch": 0.704623339145073, "flos": 30920739327360.0, "grad_norm": 1.9732411190955865, "language_loss": 0.66182995, "learning_rate": 8.473588721459716e-07, "loss": 0.68481183, "num_input_tokens_seen": 126038825, "step": 5860, "time_per_iteration": 2.753777027130127 }, { "auxiliary_loss_clip": 0.0122683, "auxiliary_loss_mlp": 0.01028046, "balance_loss_clip": 1.05186796, "balance_loss_mlp": 1.01961207, "epoch": 0.7047435820357122, "flos": 23914747296000.0, "grad_norm": 2.2992100514095672, "language_loss": 0.70615333, "learning_rate": 8.467223654824967e-07, "loss": 0.72870207, "num_input_tokens_seen": 126058280, "step": 5861, "time_per_iteration": 2.8392271995544434 }, { "auxiliary_loss_clip": 0.01223292, "auxiliary_loss_mlp": 0.01023924, "balance_loss_clip": 1.04760635, "balance_loss_mlp": 1.01674128, "epoch": 0.7048638249263512, "flos": 46494010926720.0, "grad_norm": 2.84416271939699, "language_loss": 0.62650132, "learning_rate": 8.460860337703233e-07, "loss": 0.64897346, "num_input_tokens_seen": 126078885, "step": 5862, "time_per_iteration": 3.730315685272217 }, { "auxiliary_loss_clip": 0.01323844, "auxiliary_loss_mlp": 0.01026049, "balance_loss_clip": 1.04630995, "balance_loss_mlp": 1.01817203, "epoch": 0.7049840678169903, "flos": 21689219502720.0, "grad_norm": 2.446307049655094, "language_loss": 0.70672852, "learning_rate": 8.454498771059797e-07, "loss": 0.73022747, "num_input_tokens_seen": 126098260, "step": 5863, "time_per_iteration": 2.7182745933532715 }, { "auxiliary_loss_clip": 0.01367319, "auxiliary_loss_mlp": 0.01027917, "balance_loss_clip": 1.04310918, "balance_loss_mlp": 1.02109194, "epoch": 0.7051043107076294, "flos": 18405081054720.0, "grad_norm": 2.041046344920763, "language_loss": 0.83455539, "learning_rate": 8.448138955859725e-07, "loss": 0.85850775, "num_input_tokens_seen": 126114845, "step": 5864, "time_per_iteration": 2.6875553131103516 }, { "auxiliary_loss_clip": 0.01274644, "auxiliary_loss_mlp": 0.01022743, "balance_loss_clip": 1.04812896, "balance_loss_mlp": 1.01557839, "epoch": 0.7052245535982685, "flos": 19319043640320.0, "grad_norm": 1.9993237736629212, "language_loss": 0.90248072, "learning_rate": 8.44178089306778e-07, "loss": 0.92545462, "num_input_tokens_seen": 126132780, "step": 5865, "time_per_iteration": 2.703164577484131 }, { "auxiliary_loss_clip": 0.01175714, "auxiliary_loss_mlp": 0.01023868, "balance_loss_clip": 1.05058491, "balance_loss_mlp": 1.01682258, "epoch": 0.7053447964889076, "flos": 19062138591360.0, "grad_norm": 1.8382054431002621, "language_loss": 0.76951301, "learning_rate": 8.4354245836485e-07, "loss": 0.79150879, "num_input_tokens_seen": 126151225, "step": 5866, "time_per_iteration": 2.5712578296661377 }, { "auxiliary_loss_clip": 0.01326026, "auxiliary_loss_mlp": 0.01029526, "balance_loss_clip": 1.0467732, "balance_loss_mlp": 1.02187026, "epoch": 0.7054650393795466, "flos": 27379228953600.0, "grad_norm": 1.5918280349820018, "language_loss": 0.72974515, "learning_rate": 8.429070028566108e-07, "loss": 0.75330067, "num_input_tokens_seen": 126172535, "step": 5867, "time_per_iteration": 2.743687152862549 }, { "auxiliary_loss_clip": 0.0122314, "auxiliary_loss_mlp": 0.01030092, "balance_loss_clip": 1.04869914, "balance_loss_mlp": 1.02332389, "epoch": 0.7055852822701858, "flos": 16102201322880.0, "grad_norm": 2.086282197646638, "language_loss": 0.75142312, "learning_rate": 8.422717228784586e-07, "loss": 0.77395546, "num_input_tokens_seen": 126189410, "step": 5868, "time_per_iteration": 2.631183624267578 }, { "auxiliary_loss_clip": 0.01371949, "auxiliary_loss_mlp": 0.01024167, "balance_loss_clip": 1.04703808, "balance_loss_mlp": 1.01665688, "epoch": 0.7057055251608249, "flos": 11692299744000.0, "grad_norm": 5.189896558309082, "language_loss": 0.69133031, "learning_rate": 8.416366185267663e-07, "loss": 0.71529144, "num_input_tokens_seen": 126206910, "step": 5869, "time_per_iteration": 2.6890931129455566 }, { "auxiliary_loss_clip": 0.01223921, "auxiliary_loss_mlp": 0.01028378, "balance_loss_clip": 1.04742956, "balance_loss_mlp": 1.02152395, "epoch": 0.7058257680514639, "flos": 22711560399360.0, "grad_norm": 7.449210107110314, "language_loss": 0.77938664, "learning_rate": 8.410016898978778e-07, "loss": 0.80190969, "num_input_tokens_seen": 126224385, "step": 5870, "time_per_iteration": 2.6699061393737793 }, { "auxiliary_loss_clip": 0.01273191, "auxiliary_loss_mlp": 0.01026559, "balance_loss_clip": 1.04776239, "balance_loss_mlp": 1.01990986, "epoch": 0.7059460109421031, "flos": 17529543043200.0, "grad_norm": 2.274506071059542, "language_loss": 0.79139251, "learning_rate": 8.403669370881115e-07, "loss": 0.81439006, "num_input_tokens_seen": 126243120, "step": 5871, "time_per_iteration": 2.7218527793884277 }, { "auxiliary_loss_clip": 0.01175611, "auxiliary_loss_mlp": 0.01022894, "balance_loss_clip": 1.05041575, "balance_loss_mlp": 1.01616812, "epoch": 0.7060662538327421, "flos": 23544687427200.0, "grad_norm": 1.5785796481828511, "language_loss": 0.7837435, "learning_rate": 8.397323601937587e-07, "loss": 0.80572855, "num_input_tokens_seen": 126263020, "step": 5872, "time_per_iteration": 2.636770486831665 }, { "auxiliary_loss_clip": 0.01318464, "auxiliary_loss_mlp": 0.01024114, "balance_loss_clip": 1.04476452, "balance_loss_mlp": 1.0172627, "epoch": 0.7061864967233812, "flos": 30260736875520.0, "grad_norm": 2.1171987976223634, "language_loss": 0.77131003, "learning_rate": 8.390979593110838e-07, "loss": 0.79473579, "num_input_tokens_seen": 126285150, "step": 5873, "time_per_iteration": 2.7573959827423096 }, { "auxiliary_loss_clip": 0.01278955, "auxiliary_loss_mlp": 0.01028027, "balance_loss_clip": 1.05010748, "balance_loss_mlp": 1.0204128, "epoch": 0.7063067396140204, "flos": 20701460424960.0, "grad_norm": 1.8432152140169413, "language_loss": 0.81607658, "learning_rate": 8.384637345363262e-07, "loss": 0.83914638, "num_input_tokens_seen": 126304340, "step": 5874, "time_per_iteration": 2.715832233428955 }, { "auxiliary_loss_clip": 0.01272082, "auxiliary_loss_mlp": 0.01029259, "balance_loss_clip": 1.04439831, "balance_loss_mlp": 1.02211809, "epoch": 0.7064269825046594, "flos": 32266168081920.0, "grad_norm": 2.511799582199191, "language_loss": 0.76527077, "learning_rate": 8.378296859656964e-07, "loss": 0.78828418, "num_input_tokens_seen": 126325495, "step": 5875, "time_per_iteration": 2.721820116043091 }, { "auxiliary_loss_clip": 0.01274683, "auxiliary_loss_mlp": 0.01027814, "balance_loss_clip": 1.04686618, "balance_loss_mlp": 1.02093577, "epoch": 0.7065472253952985, "flos": 30227124723840.0, "grad_norm": 2.733046752509775, "language_loss": 0.68440831, "learning_rate": 8.371958136953792e-07, "loss": 0.70743328, "num_input_tokens_seen": 126345525, "step": 5876, "time_per_iteration": 2.7167656421661377 }, { "auxiliary_loss_clip": 0.0132718, "auxiliary_loss_mlp": 0.01028547, "balance_loss_clip": 1.0450573, "balance_loss_mlp": 1.02032161, "epoch": 0.7066674682859376, "flos": 16216720859520.0, "grad_norm": 2.994247269083327, "language_loss": 0.65748644, "learning_rate": 8.365621178215326e-07, "loss": 0.68104374, "num_input_tokens_seen": 126361995, "step": 5877, "time_per_iteration": 2.669482707977295 }, { "auxiliary_loss_clip": 0.01220962, "auxiliary_loss_mlp": 0.01028303, "balance_loss_clip": 1.04704142, "balance_loss_mlp": 1.02087605, "epoch": 0.7067877111765767, "flos": 14830461319680.0, "grad_norm": 2.1957228564987, "language_loss": 0.75391889, "learning_rate": 8.359285984402871e-07, "loss": 0.77641153, "num_input_tokens_seen": 126379260, "step": 5878, "time_per_iteration": 2.6058285236358643 }, { "auxiliary_loss_clip": 0.01268244, "auxiliary_loss_mlp": 0.01029884, "balance_loss_clip": 1.04709935, "balance_loss_mlp": 1.0231396, "epoch": 0.7069079540672157, "flos": 25440196037760.0, "grad_norm": 1.9607184593279743, "language_loss": 0.74008048, "learning_rate": 8.352952556477489e-07, "loss": 0.76306176, "num_input_tokens_seen": 126397170, "step": 5879, "time_per_iteration": 2.6542465686798096 }, { "auxiliary_loss_clip": 0.01221751, "auxiliary_loss_mlp": 0.01032044, "balance_loss_clip": 1.04806232, "balance_loss_mlp": 1.02465892, "epoch": 0.7070281969578549, "flos": 24607751368320.0, "grad_norm": 2.495515794061675, "language_loss": 0.76503319, "learning_rate": 8.34662089539993e-07, "loss": 0.78757107, "num_input_tokens_seen": 126416680, "step": 5880, "time_per_iteration": 3.6216371059417725 }, { "auxiliary_loss_clip": 0.01175538, "auxiliary_loss_mlp": 0.01028813, "balance_loss_clip": 1.05124855, "balance_loss_mlp": 1.02275753, "epoch": 0.707148439848494, "flos": 26724469887360.0, "grad_norm": 1.9170247842489587, "language_loss": 0.79557258, "learning_rate": 8.340291002130722e-07, "loss": 0.81761611, "num_input_tokens_seen": 126435870, "step": 5881, "time_per_iteration": 3.557610511779785 }, { "auxiliary_loss_clip": 0.01180563, "auxiliary_loss_mlp": 0.01025388, "balance_loss_clip": 1.05248594, "balance_loss_mlp": 1.01821804, "epoch": 0.707268682739133, "flos": 15085750256640.0, "grad_norm": 5.4960986216603676, "language_loss": 0.79354644, "learning_rate": 8.3339628776301e-07, "loss": 0.81560594, "num_input_tokens_seen": 126454010, "step": 5882, "time_per_iteration": 2.569021224975586 }, { "auxiliary_loss_clip": 0.01176654, "auxiliary_loss_mlp": 0.01024067, "balance_loss_clip": 1.05007148, "balance_loss_mlp": 1.01716816, "epoch": 0.7073889256297722, "flos": 34313148345600.0, "grad_norm": 1.9074211640316827, "language_loss": 0.57167542, "learning_rate": 8.327636522858033e-07, "loss": 0.59368265, "num_input_tokens_seen": 126473615, "step": 5883, "time_per_iteration": 3.603419303894043 }, { "auxiliary_loss_clip": 0.01273633, "auxiliary_loss_mlp": 0.01025753, "balance_loss_clip": 1.04752493, "balance_loss_mlp": 1.01898539, "epoch": 0.7075091685204112, "flos": 20083940784000.0, "grad_norm": 1.7886341529385117, "language_loss": 0.77226913, "learning_rate": 8.321311938774225e-07, "loss": 0.79526299, "num_input_tokens_seen": 126492705, "step": 5884, "time_per_iteration": 2.7550671100616455 }, { "auxiliary_loss_clip": 0.01178825, "auxiliary_loss_mlp": 0.01033714, "balance_loss_clip": 1.05036688, "balance_loss_mlp": 1.02613842, "epoch": 0.7076294114110503, "flos": 20777124424320.0, "grad_norm": 2.1234646838317515, "language_loss": 0.7906186, "learning_rate": 8.314989126338104e-07, "loss": 0.81274402, "num_input_tokens_seen": 126512715, "step": 5885, "time_per_iteration": 2.645000696182251 }, { "auxiliary_loss_clip": 0.01225524, "auxiliary_loss_mlp": 0.01029877, "balance_loss_clip": 1.0466609, "balance_loss_mlp": 1.02273655, "epoch": 0.7077496543016895, "flos": 17967689141760.0, "grad_norm": 2.461786785762009, "language_loss": 0.84605008, "learning_rate": 8.308668086508847e-07, "loss": 0.86860406, "num_input_tokens_seen": 126530795, "step": 5886, "time_per_iteration": 2.6028425693511963 }, { "auxiliary_loss_clip": 0.01326716, "auxiliary_loss_mlp": 0.01024855, "balance_loss_clip": 1.0421418, "balance_loss_mlp": 1.01815224, "epoch": 0.7078698971923285, "flos": 45478098564480.0, "grad_norm": 2.147887423885243, "language_loss": 0.7381261, "learning_rate": 8.302348820245342e-07, "loss": 0.7616418, "num_input_tokens_seen": 126553360, "step": 5887, "time_per_iteration": 2.8925933837890625 }, { "auxiliary_loss_clip": 0.01229053, "auxiliary_loss_mlp": 0.01027962, "balance_loss_clip": 1.04652786, "balance_loss_mlp": 1.01994538, "epoch": 0.7079901400829676, "flos": 26943704547840.0, "grad_norm": 6.1058056464656225, "language_loss": 0.69578564, "learning_rate": 8.296031328506232e-07, "loss": 0.71835577, "num_input_tokens_seen": 126573110, "step": 5888, "time_per_iteration": 3.645230293273926 }, { "auxiliary_loss_clip": 0.01273987, "auxiliary_loss_mlp": 0.01024281, "balance_loss_clip": 1.04616189, "balance_loss_mlp": 1.01713407, "epoch": 0.7081103829736067, "flos": 24423206267520.0, "grad_norm": 2.4432586919608883, "language_loss": 0.75670993, "learning_rate": 8.289715612249857e-07, "loss": 0.77969265, "num_input_tokens_seen": 126593725, "step": 5889, "time_per_iteration": 2.7276535034179688 }, { "auxiliary_loss_clip": 0.01273725, "auxiliary_loss_mlp": 0.0102477, "balance_loss_clip": 1.04748881, "balance_loss_mlp": 1.01736689, "epoch": 0.7082306258642458, "flos": 18543300589440.0, "grad_norm": 2.5833044574865274, "language_loss": 0.77817464, "learning_rate": 8.283401672434305e-07, "loss": 0.80115962, "num_input_tokens_seen": 126608950, "step": 5890, "time_per_iteration": 2.6112139225006104 }, { "auxiliary_loss_clip": 0.01270694, "auxiliary_loss_mlp": 0.01023496, "balance_loss_clip": 1.04887295, "balance_loss_mlp": 1.01700854, "epoch": 0.7083508687548848, "flos": 23477534951040.0, "grad_norm": 2.388482628775998, "language_loss": 0.70531482, "learning_rate": 8.277089510017412e-07, "loss": 0.7282567, "num_input_tokens_seen": 126629755, "step": 5891, "time_per_iteration": 2.712289810180664 }, { "auxiliary_loss_clip": 0.01273063, "auxiliary_loss_mlp": 0.01026024, "balance_loss_clip": 1.0498575, "balance_loss_mlp": 1.01950622, "epoch": 0.708471111645524, "flos": 22419463000320.0, "grad_norm": 1.8967734549157547, "language_loss": 0.82041168, "learning_rate": 8.270779125956719e-07, "loss": 0.84340256, "num_input_tokens_seen": 126650135, "step": 5892, "time_per_iteration": 2.673396110534668 }, { "auxiliary_loss_clip": 0.01367978, "auxiliary_loss_mlp": 0.0102359, "balance_loss_clip": 1.04325545, "balance_loss_mlp": 1.01644325, "epoch": 0.7085913545361631, "flos": 20922885815040.0, "grad_norm": 2.0027457090004575, "language_loss": 0.80273306, "learning_rate": 8.264470521209505e-07, "loss": 0.82664871, "num_input_tokens_seen": 126668500, "step": 5893, "time_per_iteration": 2.7325599193573 }, { "auxiliary_loss_clip": 0.01221528, "auxiliary_loss_mlp": 0.01027714, "balance_loss_clip": 1.0463903, "balance_loss_mlp": 1.0203408, "epoch": 0.7087115974268021, "flos": 15012384727680.0, "grad_norm": 5.798270690099815, "language_loss": 0.76736474, "learning_rate": 8.258163696732785e-07, "loss": 0.78985715, "num_input_tokens_seen": 126686090, "step": 5894, "time_per_iteration": 2.5857813358306885 }, { "auxiliary_loss_clip": 0.0122371, "auxiliary_loss_mlp": 0.01027614, "balance_loss_clip": 1.04723239, "balance_loss_mlp": 1.020172, "epoch": 0.7088318403174413, "flos": 21539040739200.0, "grad_norm": 2.2768191306622976, "language_loss": 0.76868582, "learning_rate": 8.251858653483288e-07, "loss": 0.79119909, "num_input_tokens_seen": 126704255, "step": 5895, "time_per_iteration": 2.6750447750091553 }, { "auxiliary_loss_clip": 0.01226464, "auxiliary_loss_mlp": 0.01029117, "balance_loss_clip": 1.05092287, "balance_loss_mlp": 1.02195835, "epoch": 0.7089520832080803, "flos": 15516785462400.0, "grad_norm": 2.8884882926127857, "language_loss": 0.85776639, "learning_rate": 8.245555392417501e-07, "loss": 0.88032222, "num_input_tokens_seen": 126718910, "step": 5896, "time_per_iteration": 2.549610137939453 }, { "auxiliary_loss_clip": 0.01362454, "auxiliary_loss_mlp": 0.01023924, "balance_loss_clip": 1.04020262, "balance_loss_mlp": 1.01706934, "epoch": 0.7090723260987194, "flos": 20412667077120.0, "grad_norm": 1.7035248770415534, "language_loss": 0.78921044, "learning_rate": 8.239253914491613e-07, "loss": 0.81307423, "num_input_tokens_seen": 126737235, "step": 5897, "time_per_iteration": 2.748521327972412 }, { "auxiliary_loss_clip": 0.01321131, "auxiliary_loss_mlp": 0.01028659, "balance_loss_clip": 1.04676545, "balance_loss_mlp": 1.02150393, "epoch": 0.7091925689893585, "flos": 25668337271040.0, "grad_norm": 2.4760273439137537, "language_loss": 0.75181961, "learning_rate": 8.232954220661556e-07, "loss": 0.77531755, "num_input_tokens_seen": 126759970, "step": 5898, "time_per_iteration": 2.7554194927215576 }, { "auxiliary_loss_clip": 0.01175608, "auxiliary_loss_mlp": 0.01026709, "balance_loss_clip": 1.05211294, "balance_loss_mlp": 1.02007484, "epoch": 0.7093128118799976, "flos": 24206629213440.0, "grad_norm": 2.6680784225991703, "language_loss": 0.70345795, "learning_rate": 8.226656311882989e-07, "loss": 0.72548115, "num_input_tokens_seen": 126779280, "step": 5899, "time_per_iteration": 2.631424903869629 }, { "auxiliary_loss_clip": 0.01222183, "auxiliary_loss_mlp": 0.01023305, "balance_loss_clip": 1.04944372, "balance_loss_mlp": 1.01686752, "epoch": 0.7094330547706367, "flos": 16646786398080.0, "grad_norm": 2.16701423217405, "language_loss": 0.76651001, "learning_rate": 8.22036018911129e-07, "loss": 0.78896493, "num_input_tokens_seen": 126797310, "step": 5900, "time_per_iteration": 2.5704567432403564 }, { "auxiliary_loss_clip": 0.01178927, "auxiliary_loss_mlp": 0.01027286, "balance_loss_clip": 1.05063879, "balance_loss_mlp": 1.01991308, "epoch": 0.7095532976612757, "flos": 16283370545280.0, "grad_norm": 2.1959944450809328, "language_loss": 0.80742484, "learning_rate": 8.214065853301599e-07, "loss": 0.82948697, "num_input_tokens_seen": 126812840, "step": 5901, "time_per_iteration": 2.639282703399658 }, { "auxiliary_loss_clip": 0.01125995, "auxiliary_loss_mlp": 0.0100049, "balance_loss_clip": 1.01273787, "balance_loss_mlp": 0.99947053, "epoch": 0.7096735405519149, "flos": 70722080559360.0, "grad_norm": 0.8148449386016225, "language_loss": 0.58199197, "learning_rate": 8.207773305408734e-07, "loss": 0.60325682, "num_input_tokens_seen": 126880060, "step": 5902, "time_per_iteration": 3.3316452503204346 }, { "auxiliary_loss_clip": 0.01379356, "auxiliary_loss_mlp": 0.01025292, "balance_loss_clip": 1.04325664, "balance_loss_mlp": 1.01788342, "epoch": 0.709793783442554, "flos": 23621500661760.0, "grad_norm": 3.0499334291563676, "language_loss": 0.80142498, "learning_rate": 8.201482546387288e-07, "loss": 0.82547146, "num_input_tokens_seen": 126899535, "step": 5903, "time_per_iteration": 2.7531797885894775 }, { "auxiliary_loss_clip": 0.01222996, "auxiliary_loss_mlp": 0.01030552, "balance_loss_clip": 1.04856241, "balance_loss_mlp": 1.02336979, "epoch": 0.709914026333193, "flos": 25993472204160.0, "grad_norm": 1.6403406910434282, "language_loss": 0.91694701, "learning_rate": 8.195193577191553e-07, "loss": 0.93948245, "num_input_tokens_seen": 126921365, "step": 5904, "time_per_iteration": 2.697624683380127 }, { "auxiliary_loss_clip": 0.0118215, "auxiliary_loss_mlp": 0.02566059, "balance_loss_clip": 1.04808342, "balance_loss_mlp": 0.99988061, "epoch": 0.7100342692238322, "flos": 24861531934080.0, "grad_norm": 1.7941677500031643, "language_loss": 0.84521824, "learning_rate": 8.188906398775579e-07, "loss": 0.88270032, "num_input_tokens_seen": 126941910, "step": 5905, "time_per_iteration": 2.6936471462249756 }, { "auxiliary_loss_clip": 0.0117527, "auxiliary_loss_mlp": 0.0256729, "balance_loss_clip": 1.04900765, "balance_loss_mlp": 0.99988139, "epoch": 0.7101545121144712, "flos": 24932203943040.0, "grad_norm": 2.1381997046122954, "language_loss": 0.68752319, "learning_rate": 8.18262101209311e-07, "loss": 0.72494876, "num_input_tokens_seen": 126961120, "step": 5906, "time_per_iteration": 3.6303818225860596 }, { "auxiliary_loss_clip": 0.01126573, "auxiliary_loss_mlp": 0.01020979, "balance_loss_clip": 1.04737425, "balance_loss_mlp": 1.01419616, "epoch": 0.7102747550051103, "flos": 23768842250880.0, "grad_norm": 1.9899901738622872, "language_loss": 0.7042762, "learning_rate": 8.176337418097626e-07, "loss": 0.72575176, "num_input_tokens_seen": 126981590, "step": 5907, "time_per_iteration": 3.547966957092285 }, { "auxiliary_loss_clip": 0.01226282, "auxiliary_loss_mlp": 0.02565341, "balance_loss_clip": 1.05224133, "balance_loss_mlp": 0.9998672, "epoch": 0.7103949978957494, "flos": 15303907509120.0, "grad_norm": 5.25090864534407, "language_loss": 0.79998672, "learning_rate": 8.170055617742364e-07, "loss": 0.83790296, "num_input_tokens_seen": 126998870, "step": 5908, "time_per_iteration": 2.618321418762207 }, { "auxiliary_loss_clip": 0.01268442, "auxiliary_loss_mlp": 0.01031833, "balance_loss_clip": 1.04528034, "balance_loss_mlp": 1.02428675, "epoch": 0.7105152407863885, "flos": 22638805401600.0, "grad_norm": 1.8235677249896851, "language_loss": 0.70527565, "learning_rate": 8.163775611980252e-07, "loss": 0.72827834, "num_input_tokens_seen": 127017980, "step": 5909, "time_per_iteration": 3.5883419513702393 }, { "auxiliary_loss_clip": 0.01273073, "auxiliary_loss_mlp": 0.01031095, "balance_loss_clip": 1.04906571, "balance_loss_mlp": 1.0242672, "epoch": 0.7106354836770276, "flos": 17238594879360.0, "grad_norm": 1.7990477578416386, "language_loss": 0.78999615, "learning_rate": 8.157497401763982e-07, "loss": 0.81303775, "num_input_tokens_seen": 127035645, "step": 5910, "time_per_iteration": 2.632392644882202 }, { "auxiliary_loss_clip": 0.01224995, "auxiliary_loss_mlp": 0.01029815, "balance_loss_clip": 1.05040765, "balance_loss_mlp": 1.02298141, "epoch": 0.7107557265676667, "flos": 20193647898240.0, "grad_norm": 1.8613478662150384, "language_loss": 0.77923644, "learning_rate": 8.151220988045935e-07, "loss": 0.80178452, "num_input_tokens_seen": 127054900, "step": 5911, "time_per_iteration": 2.6504716873168945 }, { "auxiliary_loss_clip": 0.01221208, "auxiliary_loss_mlp": 0.01028424, "balance_loss_clip": 1.04804432, "balance_loss_mlp": 1.02152193, "epoch": 0.7108759694583058, "flos": 21507080613120.0, "grad_norm": 2.6379825550999363, "language_loss": 0.82656986, "learning_rate": 8.144946371778234e-07, "loss": 0.8490662, "num_input_tokens_seen": 127075010, "step": 5912, "time_per_iteration": 2.660689353942871 }, { "auxiliary_loss_clip": 0.01273767, "auxiliary_loss_mlp": 0.02568841, "balance_loss_clip": 1.04858577, "balance_loss_mlp": 0.99990314, "epoch": 0.7109962123489448, "flos": 24061909317120.0, "grad_norm": 2.154621122456865, "language_loss": 0.78737092, "learning_rate": 8.138673553912751e-07, "loss": 0.82579702, "num_input_tokens_seen": 127095570, "step": 5913, "time_per_iteration": 2.658308982849121 }, { "auxiliary_loss_clip": 0.01274583, "auxiliary_loss_mlp": 0.01026522, "balance_loss_clip": 1.04372883, "balance_loss_mlp": 1.01913667, "epoch": 0.711116455239584, "flos": 30480474326400.0, "grad_norm": 7.315498804949607, "language_loss": 0.56623459, "learning_rate": 8.132402535401059e-07, "loss": 0.58924556, "num_input_tokens_seen": 127116825, "step": 5914, "time_per_iteration": 3.7115705013275146 }, { "auxiliary_loss_clip": 0.01230048, "auxiliary_loss_mlp": 0.01023406, "balance_loss_clip": 1.05254102, "balance_loss_mlp": 1.01690912, "epoch": 0.711236698130223, "flos": 25045610158080.0, "grad_norm": 1.9145228280617346, "language_loss": 0.74254203, "learning_rate": 8.126133317194465e-07, "loss": 0.76507658, "num_input_tokens_seen": 127137015, "step": 5915, "time_per_iteration": 2.6091763973236084 }, { "auxiliary_loss_clip": 0.0133845, "auxiliary_loss_mlp": 0.0102819, "balance_loss_clip": 1.04357541, "balance_loss_mlp": 1.02029204, "epoch": 0.7113569410208621, "flos": 24206701040640.0, "grad_norm": 4.569618945962764, "language_loss": 0.74755365, "learning_rate": 8.11986590024401e-07, "loss": 0.77122009, "num_input_tokens_seen": 127156755, "step": 5916, "time_per_iteration": 2.9392287731170654 }, { "auxiliary_loss_clip": 0.01283215, "auxiliary_loss_mlp": 0.0102648, "balance_loss_clip": 1.05006301, "balance_loss_mlp": 1.01854038, "epoch": 0.7114771839115013, "flos": 35439306526080.0, "grad_norm": 1.9407709516981124, "language_loss": 0.68813109, "learning_rate": 8.113600285500442e-07, "loss": 0.71122801, "num_input_tokens_seen": 127176965, "step": 5917, "time_per_iteration": 2.961409568786621 }, { "auxiliary_loss_clip": 0.01174812, "auxiliary_loss_mlp": 0.01025056, "balance_loss_clip": 1.04866672, "balance_loss_mlp": 1.01853275, "epoch": 0.7115974268021403, "flos": 21099458096640.0, "grad_norm": 1.9417182611035864, "language_loss": 0.74660933, "learning_rate": 8.107336473914268e-07, "loss": 0.76860809, "num_input_tokens_seen": 127195595, "step": 5918, "time_per_iteration": 2.6059730052948 }, { "auxiliary_loss_clip": 0.01176667, "auxiliary_loss_mlp": 0.01001252, "balance_loss_clip": 1.01200366, "balance_loss_mlp": 1.00016689, "epoch": 0.7117176696927794, "flos": 56752866616320.0, "grad_norm": 0.7722970597643075, "language_loss": 0.55763364, "learning_rate": 8.101074466435694e-07, "loss": 0.57941282, "num_input_tokens_seen": 127255070, "step": 5919, "time_per_iteration": 3.177396059036255 }, { "auxiliary_loss_clip": 0.01220526, "auxiliary_loss_mlp": 0.01028536, "balance_loss_clip": 1.04651916, "balance_loss_mlp": 1.02131152, "epoch": 0.7118379125834186, "flos": 15925269905280.0, "grad_norm": 4.256589370798534, "language_loss": 0.67999923, "learning_rate": 8.094814264014662e-07, "loss": 0.70248979, "num_input_tokens_seen": 127273825, "step": 5920, "time_per_iteration": 2.5858206748962402 }, { "auxiliary_loss_clip": 0.01178132, "auxiliary_loss_mlp": 0.01026883, "balance_loss_clip": 1.04989433, "balance_loss_mlp": 1.01943254, "epoch": 0.7119581554740576, "flos": 20193360589440.0, "grad_norm": 3.9833627989596208, "language_loss": 0.81369948, "learning_rate": 8.088555867600844e-07, "loss": 0.83574963, "num_input_tokens_seen": 127289990, "step": 5921, "time_per_iteration": 2.554027557373047 }, { "auxiliary_loss_clip": 0.01322248, "auxiliary_loss_mlp": 0.01025149, "balance_loss_clip": 1.04514027, "balance_loss_mlp": 1.01838994, "epoch": 0.7120783983646967, "flos": 34715383822080.0, "grad_norm": 1.8675214754439786, "language_loss": 0.60127515, "learning_rate": 8.08229927814362e-07, "loss": 0.62474906, "num_input_tokens_seen": 127312880, "step": 5922, "time_per_iteration": 2.793527841567993 }, { "auxiliary_loss_clip": 0.01320509, "auxiliary_loss_mlp": 0.01028457, "balance_loss_clip": 1.0450412, "balance_loss_mlp": 1.02128959, "epoch": 0.7121986412553358, "flos": 26359114700160.0, "grad_norm": 1.9603917490672167, "language_loss": 0.65150213, "learning_rate": 8.076044496592134e-07, "loss": 0.67499185, "num_input_tokens_seen": 127334730, "step": 5923, "time_per_iteration": 2.7760169506073 }, { "auxiliary_loss_clip": 0.01269474, "auxiliary_loss_mlp": 0.01027901, "balance_loss_clip": 1.04820657, "balance_loss_mlp": 1.02071881, "epoch": 0.7123188841459749, "flos": 11145344371200.0, "grad_norm": 2.2401373840914354, "language_loss": 0.77907121, "learning_rate": 8.069791523895204e-07, "loss": 0.80204493, "num_input_tokens_seen": 127351180, "step": 5924, "time_per_iteration": 2.631726026535034 }, { "auxiliary_loss_clip": 0.01317402, "auxiliary_loss_mlp": 0.01021681, "balance_loss_clip": 1.04248452, "balance_loss_mlp": 1.01441252, "epoch": 0.7124391270366139, "flos": 20811670329600.0, "grad_norm": 3.2375368002383085, "language_loss": 0.77423716, "learning_rate": 8.063540361001422e-07, "loss": 0.79762793, "num_input_tokens_seen": 127369750, "step": 5925, "time_per_iteration": 2.699613571166992 }, { "auxiliary_loss_clip": 0.01322594, "auxiliary_loss_mlp": 0.01029894, "balance_loss_clip": 1.04606938, "balance_loss_mlp": 1.02278566, "epoch": 0.7125593699272531, "flos": 17603734584960.0, "grad_norm": 2.2425855591580346, "language_loss": 0.79385531, "learning_rate": 8.057291008859069e-07, "loss": 0.81738019, "num_input_tokens_seen": 127387910, "step": 5926, "time_per_iteration": 2.678907632827759 }, { "auxiliary_loss_clip": 0.01222615, "auxiliary_loss_mlp": 0.0102522, "balance_loss_clip": 1.04708195, "balance_loss_mlp": 1.01852298, "epoch": 0.7126796128178922, "flos": 28654057526400.0, "grad_norm": 2.125759287721424, "language_loss": 0.67951298, "learning_rate": 8.051043468416187e-07, "loss": 0.70199132, "num_input_tokens_seen": 127409160, "step": 5927, "time_per_iteration": 2.702498197555542 }, { "auxiliary_loss_clip": 0.01174179, "auxiliary_loss_mlp": 0.01025043, "balance_loss_clip": 1.0511322, "balance_loss_mlp": 1.0185883, "epoch": 0.7127998557085312, "flos": 16034438315520.0, "grad_norm": 2.107843607315672, "language_loss": 0.82290876, "learning_rate": 8.044797740620506e-07, "loss": 0.84490097, "num_input_tokens_seen": 127427765, "step": 5928, "time_per_iteration": 2.580397605895996 }, { "auxiliary_loss_clip": 0.01366863, "auxiliary_loss_mlp": 0.0102363, "balance_loss_clip": 1.04454243, "balance_loss_mlp": 1.01689208, "epoch": 0.7129200985991703, "flos": 23403271582080.0, "grad_norm": 2.8427138173251563, "language_loss": 0.7874676, "learning_rate": 8.038553826419494e-07, "loss": 0.81137258, "num_input_tokens_seen": 127446475, "step": 5929, "time_per_iteration": 2.8006014823913574 }, { "auxiliary_loss_clip": 0.01172723, "auxiliary_loss_mlp": 0.0102589, "balance_loss_clip": 1.04830813, "balance_loss_mlp": 1.01907134, "epoch": 0.7130403414898094, "flos": 21397445326080.0, "grad_norm": 1.6287573381659903, "language_loss": 0.81183672, "learning_rate": 8.032311726760364e-07, "loss": 0.83382285, "num_input_tokens_seen": 127467695, "step": 5930, "time_per_iteration": 2.6343538761138916 }, { "auxiliary_loss_clip": 0.01317516, "auxiliary_loss_mlp": 0.01027348, "balance_loss_clip": 1.0440352, "balance_loss_mlp": 1.02027631, "epoch": 0.7131605843804485, "flos": 74739045306240.0, "grad_norm": 2.3890919377286144, "language_loss": 0.69454497, "learning_rate": 8.026071442590022e-07, "loss": 0.71799362, "num_input_tokens_seen": 127494590, "step": 5931, "time_per_iteration": 3.1084680557250977 }, { "auxiliary_loss_clip": 0.01224561, "auxiliary_loss_mlp": 0.0102357, "balance_loss_clip": 1.05068874, "balance_loss_mlp": 1.01695347, "epoch": 0.7132808272710875, "flos": 18368739469440.0, "grad_norm": 1.9680143893574589, "language_loss": 0.80761534, "learning_rate": 8.019832974855134e-07, "loss": 0.83009666, "num_input_tokens_seen": 127512550, "step": 5932, "time_per_iteration": 3.658043384552002 }, { "auxiliary_loss_clip": 0.01321111, "auxiliary_loss_mlp": 0.01024156, "balance_loss_clip": 1.04479337, "balance_loss_mlp": 1.01709855, "epoch": 0.7134010701617267, "flos": 23253380127360.0, "grad_norm": 2.4775047421512704, "language_loss": 0.82444888, "learning_rate": 8.013596324502052e-07, "loss": 0.84790158, "num_input_tokens_seen": 127531015, "step": 5933, "time_per_iteration": 3.603930950164795 }, { "auxiliary_loss_clip": 0.01218262, "auxiliary_loss_mlp": 0.01024499, "balance_loss_clip": 1.04994452, "balance_loss_mlp": 1.01798987, "epoch": 0.7135213130523658, "flos": 23653137565440.0, "grad_norm": 1.8347346256581167, "language_loss": 0.78748327, "learning_rate": 8.007361492476872e-07, "loss": 0.80991089, "num_input_tokens_seen": 127550340, "step": 5934, "time_per_iteration": 2.631948947906494 }, { "auxiliary_loss_clip": 0.01331773, "auxiliary_loss_mlp": 0.01030467, "balance_loss_clip": 1.04617083, "balance_loss_mlp": 1.02304339, "epoch": 0.7136415559430048, "flos": 24790644443520.0, "grad_norm": 1.597879719582589, "language_loss": 0.78872013, "learning_rate": 8.001128479725426e-07, "loss": 0.81234258, "num_input_tokens_seen": 127572245, "step": 5935, "time_per_iteration": 3.6226654052734375 }, { "auxiliary_loss_clip": 0.01367498, "auxiliary_loss_mlp": 0.0102685, "balance_loss_clip": 1.04169726, "balance_loss_mlp": 1.01960802, "epoch": 0.713761798833644, "flos": 18296954138880.0, "grad_norm": 1.650509980079547, "language_loss": 0.80912769, "learning_rate": 7.994897287193248e-07, "loss": 0.83307111, "num_input_tokens_seen": 127591625, "step": 5936, "time_per_iteration": 2.732728958129883 }, { "auxiliary_loss_clip": 0.01227211, "auxiliary_loss_mlp": 0.01023762, "balance_loss_clip": 1.04800332, "balance_loss_mlp": 1.01657391, "epoch": 0.713882041724283, "flos": 15558262692480.0, "grad_norm": 2.6454628759522256, "language_loss": 0.83761537, "learning_rate": 7.988667915825605e-07, "loss": 0.86012506, "num_input_tokens_seen": 127608690, "step": 5937, "time_per_iteration": 2.610414981842041 }, { "auxiliary_loss_clip": 0.01276656, "auxiliary_loss_mlp": 0.01021173, "balance_loss_clip": 1.04766917, "balance_loss_mlp": 1.01397276, "epoch": 0.7140022846149221, "flos": 24061011477120.0, "grad_norm": 2.0748590171738988, "language_loss": 0.7551223, "learning_rate": 7.982440366567491e-07, "loss": 0.77810055, "num_input_tokens_seen": 127627180, "step": 5938, "time_per_iteration": 2.7322845458984375 }, { "auxiliary_loss_clip": 0.01218808, "auxiliary_loss_mlp": 0.01022556, "balance_loss_clip": 1.04523039, "balance_loss_mlp": 1.01621675, "epoch": 0.7141225275055613, "flos": 27891710248320.0, "grad_norm": 1.5944601146856607, "language_loss": 0.7532748, "learning_rate": 7.97621464036361e-07, "loss": 0.77568847, "num_input_tokens_seen": 127648940, "step": 5939, "time_per_iteration": 2.694451093673706 }, { "auxiliary_loss_clip": 0.01225487, "auxiliary_loss_mlp": 0.0102502, "balance_loss_clip": 1.04733813, "balance_loss_mlp": 1.01823735, "epoch": 0.7142427703962003, "flos": 19682603147520.0, "grad_norm": 2.416100178911936, "language_loss": 0.68209934, "learning_rate": 7.969990738158417e-07, "loss": 0.70460451, "num_input_tokens_seen": 127667350, "step": 5940, "time_per_iteration": 3.570748805999756 }, { "auxiliary_loss_clip": 0.01228686, "auxiliary_loss_mlp": 0.01022474, "balance_loss_clip": 1.05257058, "balance_loss_mlp": 1.01564074, "epoch": 0.7143630132868394, "flos": 21032377447680.0, "grad_norm": 2.274189338541866, "language_loss": 0.85477501, "learning_rate": 7.963768660896062e-07, "loss": 0.87728661, "num_input_tokens_seen": 127685760, "step": 5941, "time_per_iteration": 2.610940456390381 }, { "auxiliary_loss_clip": 0.01227419, "auxiliary_loss_mlp": 0.01023453, "balance_loss_clip": 1.04749537, "balance_loss_mlp": 1.01642513, "epoch": 0.7144832561774785, "flos": 24129923719680.0, "grad_norm": 1.9399512176288194, "language_loss": 0.82316935, "learning_rate": 7.957548409520432e-07, "loss": 0.84567809, "num_input_tokens_seen": 127704985, "step": 5942, "time_per_iteration": 2.6751492023468018 }, { "auxiliary_loss_clip": 0.01320948, "auxiliary_loss_mlp": 0.01029083, "balance_loss_clip": 1.04495943, "balance_loss_mlp": 1.02237487, "epoch": 0.7146034990681176, "flos": 16325817442560.0, "grad_norm": 2.079056778379464, "language_loss": 0.84201419, "learning_rate": 7.951329984975135e-07, "loss": 0.86551452, "num_input_tokens_seen": 127721925, "step": 5943, "time_per_iteration": 2.6579058170318604 }, { "auxiliary_loss_clip": 0.01134359, "auxiliary_loss_mlp": 0.00999721, "balance_loss_clip": 1.01277077, "balance_loss_mlp": 0.9988147, "epoch": 0.7147237419587567, "flos": 69627164232960.0, "grad_norm": 0.717422328040634, "language_loss": 0.54193568, "learning_rate": 7.94511338820349e-07, "loss": 0.56327653, "num_input_tokens_seen": 127784230, "step": 5944, "time_per_iteration": 3.234156370162964 }, { "auxiliary_loss_clip": 0.01272179, "auxiliary_loss_mlp": 0.02569328, "balance_loss_clip": 1.04697108, "balance_loss_mlp": 0.99989027, "epoch": 0.7148439848493958, "flos": 22266806198400.0, "grad_norm": 2.7422190069208683, "language_loss": 0.78537619, "learning_rate": 7.938898620148575e-07, "loss": 0.82379121, "num_input_tokens_seen": 127801990, "step": 5945, "time_per_iteration": 2.749943733215332 }, { "auxiliary_loss_clip": 0.01271604, "auxiliary_loss_mlp": 0.01028664, "balance_loss_clip": 1.04662585, "balance_loss_mlp": 1.02141011, "epoch": 0.7149642277400349, "flos": 17931383470080.0, "grad_norm": 2.000003783717063, "language_loss": 0.70726603, "learning_rate": 7.932685681753135e-07, "loss": 0.73026872, "num_input_tokens_seen": 127819270, "step": 5946, "time_per_iteration": 2.6436381340026855 }, { "auxiliary_loss_clip": 0.01170381, "auxiliary_loss_mlp": 0.01021628, "balance_loss_clip": 1.0485363, "balance_loss_mlp": 1.01484156, "epoch": 0.7150844706306739, "flos": 31681937370240.0, "grad_norm": 5.647116235347149, "language_loss": 0.62561107, "learning_rate": 7.92647457395969e-07, "loss": 0.64753115, "num_input_tokens_seen": 127841095, "step": 5947, "time_per_iteration": 2.7194204330444336 }, { "auxiliary_loss_clip": 0.0143181, "auxiliary_loss_mlp": 0.01028328, "balance_loss_clip": 1.03915191, "balance_loss_mlp": 1.02127671, "epoch": 0.7152047135213131, "flos": 10926217451520.0, "grad_norm": 2.9939644464040103, "language_loss": 0.74414921, "learning_rate": 7.920265297710444e-07, "loss": 0.76875067, "num_input_tokens_seen": 127858485, "step": 5948, "time_per_iteration": 2.7600209712982178 }, { "auxiliary_loss_clip": 0.01227786, "auxiliary_loss_mlp": 0.01023669, "balance_loss_clip": 1.05048299, "balance_loss_mlp": 1.01753879, "epoch": 0.7153249564119522, "flos": 20995640812800.0, "grad_norm": 1.8328910417134827, "language_loss": 0.73509967, "learning_rate": 7.914057853947363e-07, "loss": 0.75761425, "num_input_tokens_seen": 127877665, "step": 5949, "time_per_iteration": 2.6229407787323 }, { "auxiliary_loss_clip": 0.01329563, "auxiliary_loss_mlp": 0.01031669, "balance_loss_clip": 1.04683292, "balance_loss_mlp": 1.02397084, "epoch": 0.7154451993025912, "flos": 24243114453120.0, "grad_norm": 2.127765499753154, "language_loss": 0.62666494, "learning_rate": 7.907852243612089e-07, "loss": 0.65027726, "num_input_tokens_seen": 127898070, "step": 5950, "time_per_iteration": 2.7219955921173096 }, { "auxiliary_loss_clip": 0.01270592, "auxiliary_loss_mlp": 0.01024027, "balance_loss_clip": 1.0448885, "balance_loss_mlp": 1.01705027, "epoch": 0.7155654421932304, "flos": 23330947547520.0, "grad_norm": 2.27453548970221, "language_loss": 0.7218321, "learning_rate": 7.901648467646009e-07, "loss": 0.74477828, "num_input_tokens_seen": 127917010, "step": 5951, "time_per_iteration": 2.675102949142456 }, { "auxiliary_loss_clip": 0.01177647, "auxiliary_loss_mlp": 0.01021786, "balance_loss_clip": 1.0509057, "balance_loss_mlp": 1.01514268, "epoch": 0.7156856850838694, "flos": 22711883621760.0, "grad_norm": 2.818612924499078, "language_loss": 0.724944, "learning_rate": 7.895446526990244e-07, "loss": 0.74693835, "num_input_tokens_seen": 127937025, "step": 5952, "time_per_iteration": 2.5880191326141357 }, { "auxiliary_loss_clip": 0.01380695, "auxiliary_loss_mlp": 0.01024068, "balance_loss_clip": 1.04362071, "balance_loss_mlp": 1.01715994, "epoch": 0.7158059279745085, "flos": 19865424395520.0, "grad_norm": 1.9441009464373546, "language_loss": 0.75626659, "learning_rate": 7.889246422585609e-07, "loss": 0.78031421, "num_input_tokens_seen": 127956410, "step": 5953, "time_per_iteration": 2.7732906341552734 }, { "auxiliary_loss_clip": 0.01176891, "auxiliary_loss_mlp": 0.01021799, "balance_loss_clip": 1.05130053, "balance_loss_mlp": 1.01493573, "epoch": 0.7159261708651476, "flos": 24134772055680.0, "grad_norm": 1.7459553495804907, "language_loss": 0.73586494, "learning_rate": 7.883048155372675e-07, "loss": 0.75785184, "num_input_tokens_seen": 127974925, "step": 5954, "time_per_iteration": 2.577468156814575 }, { "auxiliary_loss_clip": 0.01280607, "auxiliary_loss_mlp": 0.01025438, "balance_loss_clip": 1.04929709, "balance_loss_mlp": 1.01852083, "epoch": 0.7160464137557867, "flos": 16983198201600.0, "grad_norm": 2.5687882504027466, "language_loss": 0.70974058, "learning_rate": 7.876851726291698e-07, "loss": 0.73280108, "num_input_tokens_seen": 127993225, "step": 5955, "time_per_iteration": 2.675692319869995 }, { "auxiliary_loss_clip": 0.01327052, "auxiliary_loss_mlp": 0.01021276, "balance_loss_clip": 1.04401278, "balance_loss_mlp": 1.01423073, "epoch": 0.7161666566464258, "flos": 25228251838080.0, "grad_norm": 2.4573865318422836, "language_loss": 0.78457737, "learning_rate": 7.870657136282666e-07, "loss": 0.80806065, "num_input_tokens_seen": 128012085, "step": 5956, "time_per_iteration": 2.6881115436553955 }, { "auxiliary_loss_clip": 0.01219711, "auxiliary_loss_mlp": 0.0103229, "balance_loss_clip": 1.04607081, "balance_loss_mlp": 1.02546823, "epoch": 0.7162868995370649, "flos": 26468390851200.0, "grad_norm": 1.9421507389846253, "language_loss": 0.82052428, "learning_rate": 7.86446438628531e-07, "loss": 0.84304428, "num_input_tokens_seen": 128033155, "step": 5957, "time_per_iteration": 2.663004159927368 }, { "auxiliary_loss_clip": 0.01068074, "auxiliary_loss_mlp": 0.009995, "balance_loss_clip": 1.01181316, "balance_loss_mlp": 0.99860042, "epoch": 0.716407142427704, "flos": 69998912040960.0, "grad_norm": 0.7594398194094182, "language_loss": 0.56817383, "learning_rate": 7.858273477239059e-07, "loss": 0.58884954, "num_input_tokens_seen": 128101575, "step": 5958, "time_per_iteration": 4.166310548782349 }, { "auxiliary_loss_clip": 0.01363642, "auxiliary_loss_mlp": 0.01026713, "balance_loss_clip": 1.04090095, "balance_loss_mlp": 1.01991534, "epoch": 0.716527385318343, "flos": 20740459616640.0, "grad_norm": 1.7392622404063502, "language_loss": 0.71447396, "learning_rate": 7.852084410083067e-07, "loss": 0.73837757, "num_input_tokens_seen": 128120395, "step": 5959, "time_per_iteration": 3.636399269104004 }, { "auxiliary_loss_clip": 0.01267221, "auxiliary_loss_mlp": 0.01025144, "balance_loss_clip": 1.04709852, "balance_loss_mlp": 1.01870108, "epoch": 0.7166476282089821, "flos": 25371966153600.0, "grad_norm": 1.6261757017706762, "language_loss": 0.63558489, "learning_rate": 7.84589718575621e-07, "loss": 0.65850854, "num_input_tokens_seen": 128140840, "step": 5960, "time_per_iteration": 3.6102259159088135 }, { "auxiliary_loss_clip": 0.01274228, "auxiliary_loss_mlp": 0.01023984, "balance_loss_clip": 1.04291427, "balance_loss_mlp": 1.01705503, "epoch": 0.7167678710996213, "flos": 24133730561280.0, "grad_norm": 2.2149782862988254, "language_loss": 0.69107431, "learning_rate": 7.83971180519708e-07, "loss": 0.71405637, "num_input_tokens_seen": 128159695, "step": 5961, "time_per_iteration": 2.7111892700195312 }, { "auxiliary_loss_clip": 0.01176694, "auxiliary_loss_mlp": 0.01023589, "balance_loss_clip": 1.05069208, "balance_loss_mlp": 1.01636744, "epoch": 0.7168881139902603, "flos": 30226586019840.0, "grad_norm": 2.0389951113073392, "language_loss": 0.75469208, "learning_rate": 7.833528269344008e-07, "loss": 0.77669489, "num_input_tokens_seen": 128179600, "step": 5962, "time_per_iteration": 2.6802830696105957 }, { "auxiliary_loss_clip": 0.01329029, "auxiliary_loss_mlp": 0.01030978, "balance_loss_clip": 1.04858136, "balance_loss_mlp": 1.02349138, "epoch": 0.7170083568808994, "flos": 14606414236800.0, "grad_norm": 2.2426438022568203, "language_loss": 0.77649409, "learning_rate": 7.827346579135023e-07, "loss": 0.80009419, "num_input_tokens_seen": 128196940, "step": 5963, "time_per_iteration": 2.667869806289673 }, { "auxiliary_loss_clip": 0.01271297, "auxiliary_loss_mlp": 0.01022303, "balance_loss_clip": 1.04659581, "balance_loss_mlp": 1.01481962, "epoch": 0.7171285997715385, "flos": 23331091201920.0, "grad_norm": 2.0002164004919387, "language_loss": 0.8365016, "learning_rate": 7.821166735507885e-07, "loss": 0.85943758, "num_input_tokens_seen": 128215970, "step": 5964, "time_per_iteration": 2.6534228324890137 }, { "auxiliary_loss_clip": 0.01173329, "auxiliary_loss_mlp": 0.01025969, "balance_loss_clip": 1.05089998, "balance_loss_mlp": 1.01967788, "epoch": 0.7172488426621776, "flos": 16543543731840.0, "grad_norm": 4.303914518065851, "language_loss": 0.68709868, "learning_rate": 7.81498873940007e-07, "loss": 0.70909166, "num_input_tokens_seen": 128233185, "step": 5965, "time_per_iteration": 2.5598297119140625 }, { "auxiliary_loss_clip": 0.01230347, "auxiliary_loss_mlp": 0.01027479, "balance_loss_clip": 1.04748404, "balance_loss_mlp": 1.02030015, "epoch": 0.7173690855528166, "flos": 26541612725760.0, "grad_norm": 2.2033915052461066, "language_loss": 0.77378464, "learning_rate": 7.808812591748768e-07, "loss": 0.79636288, "num_input_tokens_seen": 128253565, "step": 5966, "time_per_iteration": 3.6012301445007324 }, { "auxiliary_loss_clip": 0.01322007, "auxiliary_loss_mlp": 0.01033613, "balance_loss_clip": 1.04530609, "balance_loss_mlp": 1.02600098, "epoch": 0.7174893284434558, "flos": 22784099915520.0, "grad_norm": 2.418817509813332, "language_loss": 0.64861202, "learning_rate": 7.802638293490915e-07, "loss": 0.6721682, "num_input_tokens_seen": 128273210, "step": 5967, "time_per_iteration": 2.6807761192321777 }, { "auxiliary_loss_clip": 0.01278496, "auxiliary_loss_mlp": 0.01024808, "balance_loss_clip": 1.04761791, "balance_loss_mlp": 1.017658, "epoch": 0.7176095713340949, "flos": 23293564467840.0, "grad_norm": 1.6252609366954602, "language_loss": 0.76927984, "learning_rate": 7.796465845563123e-07, "loss": 0.79231286, "num_input_tokens_seen": 128292085, "step": 5968, "time_per_iteration": 2.7058422565460205 }, { "auxiliary_loss_clip": 0.01269733, "auxiliary_loss_mlp": 0.02564834, "balance_loss_clip": 1.04584503, "balance_loss_mlp": 0.99988723, "epoch": 0.7177298142247339, "flos": 25591631777280.0, "grad_norm": 3.209063205505246, "language_loss": 0.79618001, "learning_rate": 7.790295248901766e-07, "loss": 0.83452564, "num_input_tokens_seen": 128313215, "step": 5969, "time_per_iteration": 2.6915714740753174 }, { "auxiliary_loss_clip": 0.01226691, "auxiliary_loss_mlp": 0.0102845, "balance_loss_clip": 1.04946518, "balance_loss_mlp": 1.02152109, "epoch": 0.7178500571153731, "flos": 31652778504960.0, "grad_norm": 1.635887264156226, "language_loss": 0.62343508, "learning_rate": 7.784126504442902e-07, "loss": 0.6459865, "num_input_tokens_seen": 128336445, "step": 5970, "time_per_iteration": 2.7545413970947266 }, { "auxiliary_loss_clip": 0.01318576, "auxiliary_loss_mlp": 0.01023689, "balance_loss_clip": 1.04623652, "balance_loss_mlp": 1.01661706, "epoch": 0.7179703000060121, "flos": 19427242383360.0, "grad_norm": 1.4323684682182403, "language_loss": 0.67919981, "learning_rate": 7.777959613122351e-07, "loss": 0.70262241, "num_input_tokens_seen": 128356270, "step": 5971, "time_per_iteration": 2.6851065158843994 }, { "auxiliary_loss_clip": 0.01269065, "auxiliary_loss_mlp": 0.01026389, "balance_loss_clip": 1.04899311, "balance_loss_mlp": 1.0194478, "epoch": 0.7180905428966512, "flos": 28839249072000.0, "grad_norm": 1.873458220411954, "language_loss": 0.77786887, "learning_rate": 7.771794575875604e-07, "loss": 0.80082345, "num_input_tokens_seen": 128378140, "step": 5972, "time_per_iteration": 2.728510618209839 }, { "auxiliary_loss_clip": 0.01232679, "auxiliary_loss_mlp": 0.01024181, "balance_loss_clip": 1.05356789, "balance_loss_mlp": 1.01699543, "epoch": 0.7182107857872904, "flos": 20047563285120.0, "grad_norm": 2.4366105762533796, "language_loss": 0.7756511, "learning_rate": 7.765631393637888e-07, "loss": 0.79821968, "num_input_tokens_seen": 128396335, "step": 5973, "time_per_iteration": 2.729074239730835 }, { "auxiliary_loss_clip": 0.01225077, "auxiliary_loss_mlp": 0.01027087, "balance_loss_clip": 1.04607749, "balance_loss_mlp": 1.01949608, "epoch": 0.7183310286779294, "flos": 22747686503040.0, "grad_norm": 3.1222308665172536, "language_loss": 0.48938826, "learning_rate": 7.75947006734417e-07, "loss": 0.5119099, "num_input_tokens_seen": 128414115, "step": 5974, "time_per_iteration": 2.6543447971343994 }, { "auxiliary_loss_clip": 0.0117362, "auxiliary_loss_mlp": 0.01027333, "balance_loss_clip": 1.04781795, "balance_loss_mlp": 1.02039206, "epoch": 0.7184512715685685, "flos": 17158262112000.0, "grad_norm": 2.125103450868867, "language_loss": 0.83101332, "learning_rate": 7.753310597929101e-07, "loss": 0.85302281, "num_input_tokens_seen": 128430755, "step": 5975, "time_per_iteration": 2.4853689670562744 }, { "auxiliary_loss_clip": 0.01068327, "auxiliary_loss_mlp": 0.01001124, "balance_loss_clip": 1.01197541, "balance_loss_mlp": 1.00019383, "epoch": 0.7185715144592076, "flos": 65509611448320.0, "grad_norm": 0.7511514293808181, "language_loss": 0.55066073, "learning_rate": 7.747152986327095e-07, "loss": 0.57135528, "num_input_tokens_seen": 128491300, "step": 5976, "time_per_iteration": 3.073885440826416 }, { "auxiliary_loss_clip": 0.01375974, "auxiliary_loss_mlp": 0.01026834, "balance_loss_clip": 1.04368091, "balance_loss_mlp": 1.02061725, "epoch": 0.7186917573498467, "flos": 16180522928640.0, "grad_norm": 2.5194239381993757, "language_loss": 0.68383974, "learning_rate": 7.740997233472228e-07, "loss": 0.7078678, "num_input_tokens_seen": 128508920, "step": 5977, "time_per_iteration": 2.6963229179382324 }, { "auxiliary_loss_clip": 0.01273082, "auxiliary_loss_mlp": 0.0102926, "balance_loss_clip": 1.04577327, "balance_loss_mlp": 1.02241182, "epoch": 0.7188120002404857, "flos": 29242274647680.0, "grad_norm": 3.119503678804544, "language_loss": 0.70388269, "learning_rate": 7.734843340298329e-07, "loss": 0.72690618, "num_input_tokens_seen": 128528745, "step": 5978, "time_per_iteration": 2.694007396697998 }, { "auxiliary_loss_clip": 0.01278156, "auxiliary_loss_mlp": 0.01026387, "balance_loss_clip": 1.04594862, "balance_loss_mlp": 1.0187459, "epoch": 0.7189322431311249, "flos": 33401161008000.0, "grad_norm": 2.686605695577765, "language_loss": 0.75281823, "learning_rate": 7.72869130773895e-07, "loss": 0.77586371, "num_input_tokens_seen": 128549345, "step": 5979, "time_per_iteration": 2.7841813564300537 }, { "auxiliary_loss_clip": 0.0112408, "auxiliary_loss_mlp": 0.00999788, "balance_loss_clip": 1.01234317, "balance_loss_mlp": 0.99887657, "epoch": 0.719052486021764, "flos": 61351263792000.0, "grad_norm": 0.8025263532660742, "language_loss": 0.59317547, "learning_rate": 7.722541136727343e-07, "loss": 0.61441416, "num_input_tokens_seen": 128605360, "step": 5980, "time_per_iteration": 3.0865020751953125 }, { "auxiliary_loss_clip": 0.01223695, "auxiliary_loss_mlp": 0.01023719, "balance_loss_clip": 1.04882455, "balance_loss_mlp": 1.01680827, "epoch": 0.719172728912403, "flos": 15596795007360.0, "grad_norm": 2.0197958005920498, "language_loss": 0.80981112, "learning_rate": 7.716392828196483e-07, "loss": 0.83228528, "num_input_tokens_seen": 128623160, "step": 5981, "time_per_iteration": 2.595790147781372 }, { "auxiliary_loss_clip": 0.01227018, "auxiliary_loss_mlp": 0.01027411, "balance_loss_clip": 1.05071926, "balance_loss_mlp": 1.02009773, "epoch": 0.7192929718030422, "flos": 15553162961280.0, "grad_norm": 3.4109628413949395, "language_loss": 0.77507067, "learning_rate": 7.710246383079064e-07, "loss": 0.79761493, "num_input_tokens_seen": 128638545, "step": 5982, "time_per_iteration": 2.560617685317993 }, { "auxiliary_loss_clip": 0.0127563, "auxiliary_loss_mlp": 0.01023361, "balance_loss_clip": 1.0441258, "balance_loss_mlp": 1.01679802, "epoch": 0.7194132146936812, "flos": 21862487733120.0, "grad_norm": 2.7817768370409546, "language_loss": 0.91923529, "learning_rate": 7.704101802307492e-07, "loss": 0.94222522, "num_input_tokens_seen": 128650845, "step": 5983, "time_per_iteration": 3.6022841930389404 }, { "auxiliary_loss_clip": 0.01319316, "auxiliary_loss_mlp": 0.01024316, "balance_loss_clip": 1.04485703, "balance_loss_mlp": 1.01715744, "epoch": 0.7195334575843203, "flos": 27338900958720.0, "grad_norm": 2.588311236508762, "language_loss": 0.86954236, "learning_rate": 7.697959086813912e-07, "loss": 0.89297867, "num_input_tokens_seen": 128667010, "step": 5984, "time_per_iteration": 3.655559778213501 }, { "auxiliary_loss_clip": 0.01318985, "auxiliary_loss_mlp": 0.01025872, "balance_loss_clip": 1.04271889, "balance_loss_mlp": 1.01845074, "epoch": 0.7196537004749595, "flos": 18770615809920.0, "grad_norm": 1.612382373294819, "language_loss": 0.80168939, "learning_rate": 7.691818237530145e-07, "loss": 0.82513797, "num_input_tokens_seen": 128685870, "step": 5985, "time_per_iteration": 2.7146339416503906 }, { "auxiliary_loss_clip": 0.01384317, "auxiliary_loss_mlp": 0.01025723, "balance_loss_clip": 1.04402483, "balance_loss_mlp": 1.01862121, "epoch": 0.7197739433655985, "flos": 24531009960960.0, "grad_norm": 2.0522967178307403, "language_loss": 0.77399039, "learning_rate": 7.685679255387774e-07, "loss": 0.79809082, "num_input_tokens_seen": 128704185, "step": 5986, "time_per_iteration": 3.6164064407348633 }, { "auxiliary_loss_clip": 0.01271968, "auxiliary_loss_mlp": 0.01023247, "balance_loss_clip": 1.04774714, "balance_loss_mlp": 1.01636243, "epoch": 0.7198941862562376, "flos": 18040587793920.0, "grad_norm": 2.0546021639441916, "language_loss": 0.76674616, "learning_rate": 7.679542141318065e-07, "loss": 0.78969836, "num_input_tokens_seen": 128721290, "step": 5987, "time_per_iteration": 2.681095600128174 }, { "auxiliary_loss_clip": 0.01266916, "auxiliary_loss_mlp": 0.01030718, "balance_loss_clip": 1.04370379, "balance_loss_mlp": 1.023983, "epoch": 0.7200144291468767, "flos": 29022393542400.0, "grad_norm": 2.040784169741203, "language_loss": 0.75908172, "learning_rate": 7.673406896252013e-07, "loss": 0.78205812, "num_input_tokens_seen": 128742665, "step": 5988, "time_per_iteration": 2.6941769123077393 }, { "auxiliary_loss_clip": 0.01321212, "auxiliary_loss_mlp": 0.01028832, "balance_loss_clip": 1.04246581, "balance_loss_mlp": 1.02165854, "epoch": 0.7201346720375158, "flos": 25374264624000.0, "grad_norm": 1.71505792754764, "language_loss": 0.7861672, "learning_rate": 7.667273521120347e-07, "loss": 0.80966765, "num_input_tokens_seen": 128762225, "step": 5989, "time_per_iteration": 2.8818881511688232 }, { "auxiliary_loss_clip": 0.01322396, "auxiliary_loss_mlp": 0.01028611, "balance_loss_clip": 1.04449081, "balance_loss_mlp": 1.02152395, "epoch": 0.7202549149281549, "flos": 14355614499840.0, "grad_norm": 1.9483856827888342, "language_loss": 0.79475021, "learning_rate": 7.661142016853468e-07, "loss": 0.81826031, "num_input_tokens_seen": 128779585, "step": 5990, "time_per_iteration": 2.6534714698791504 }, { "auxiliary_loss_clip": 0.013717, "auxiliary_loss_mlp": 0.01026877, "balance_loss_clip": 1.04172254, "balance_loss_mlp": 1.01973653, "epoch": 0.7203751578187939, "flos": 23001682550400.0, "grad_norm": 1.7545426527856822, "language_loss": 0.74638903, "learning_rate": 7.655012384381543e-07, "loss": 0.77037477, "num_input_tokens_seen": 128799070, "step": 5991, "time_per_iteration": 2.736276865005493 }, { "auxiliary_loss_clip": 0.01271345, "auxiliary_loss_mlp": 0.01032041, "balance_loss_clip": 1.04905128, "balance_loss_mlp": 1.02491796, "epoch": 0.7204954007094331, "flos": 23692424065920.0, "grad_norm": 3.018069927766298, "language_loss": 0.81884313, "learning_rate": 7.648884624634415e-07, "loss": 0.84187692, "num_input_tokens_seen": 128817620, "step": 5992, "time_per_iteration": 3.615183115005493 }, { "auxiliary_loss_clip": 0.01220923, "auxiliary_loss_mlp": 0.01024041, "balance_loss_clip": 1.04861903, "balance_loss_mlp": 1.01735616, "epoch": 0.7206156436000721, "flos": 16253026531200.0, "grad_norm": 2.18952584536538, "language_loss": 0.89065409, "learning_rate": 7.642758738541683e-07, "loss": 0.9131037, "num_input_tokens_seen": 128834200, "step": 5993, "time_per_iteration": 2.601533889770508 }, { "auxiliary_loss_clip": 0.01122425, "auxiliary_loss_mlp": 0.0100395, "balance_loss_clip": 1.01312172, "balance_loss_mlp": 1.00302052, "epoch": 0.7207358864907112, "flos": 54377806504320.0, "grad_norm": 0.7683450160840272, "language_loss": 0.60732484, "learning_rate": 7.636634727032621e-07, "loss": 0.62858862, "num_input_tokens_seen": 128891305, "step": 5994, "time_per_iteration": 3.0593743324279785 }, { "auxiliary_loss_clip": 0.01329825, "auxiliary_loss_mlp": 0.01024858, "balance_loss_clip": 1.04311728, "balance_loss_mlp": 1.01724052, "epoch": 0.7208561293813504, "flos": 19135540033920.0, "grad_norm": 2.182834546857687, "language_loss": 0.78824651, "learning_rate": 7.630512591036231e-07, "loss": 0.81179339, "num_input_tokens_seen": 128910615, "step": 5995, "time_per_iteration": 2.7113571166992188 }, { "auxiliary_loss_clip": 0.01226195, "auxiliary_loss_mlp": 0.01023453, "balance_loss_clip": 1.05057502, "balance_loss_mlp": 1.0163331, "epoch": 0.7209763722719894, "flos": 17748526308480.0, "grad_norm": 2.788572976245571, "language_loss": 0.64824045, "learning_rate": 7.624392331481255e-07, "loss": 0.67073697, "num_input_tokens_seen": 128928270, "step": 5996, "time_per_iteration": 2.6183338165283203 }, { "auxiliary_loss_clip": 0.01119019, "auxiliary_loss_mlp": 0.0100283, "balance_loss_clip": 1.0115447, "balance_loss_mlp": 1.00183511, "epoch": 0.7210966151626285, "flos": 66819488716800.0, "grad_norm": 0.8185116723051613, "language_loss": 0.51847219, "learning_rate": 7.618273949296115e-07, "loss": 0.53969067, "num_input_tokens_seen": 128987780, "step": 5997, "time_per_iteration": 3.102205514907837 }, { "auxiliary_loss_clip": 0.01268307, "auxiliary_loss_mlp": 0.01025312, "balance_loss_clip": 1.04410684, "balance_loss_mlp": 1.01812053, "epoch": 0.7212168580532676, "flos": 21141869080320.0, "grad_norm": 3.406882180543949, "language_loss": 0.68548411, "learning_rate": 7.612157445408987e-07, "loss": 0.70842028, "num_input_tokens_seen": 129005590, "step": 5998, "time_per_iteration": 2.658825635910034 }, { "auxiliary_loss_clip": 0.01284839, "auxiliary_loss_mlp": 0.01031131, "balance_loss_clip": 1.05201733, "balance_loss_mlp": 1.02347803, "epoch": 0.7213371009439067, "flos": 22345738335360.0, "grad_norm": 2.083570441025999, "language_loss": 0.74188197, "learning_rate": 7.606042820747716e-07, "loss": 0.76504165, "num_input_tokens_seen": 129021995, "step": 5999, "time_per_iteration": 2.6026697158813477 }, { "auxiliary_loss_clip": 0.01284498, "auxiliary_loss_mlp": 0.01025318, "balance_loss_clip": 1.05320549, "balance_loss_mlp": 1.01795053, "epoch": 0.7214573438345457, "flos": 18515901490560.0, "grad_norm": 1.932398921831613, "language_loss": 0.85328257, "learning_rate": 7.599930076239889e-07, "loss": 0.87638068, "num_input_tokens_seen": 129039280, "step": 6000, "time_per_iteration": 2.6824817657470703 }, { "auxiliary_loss_clip": 0.01379413, "auxiliary_loss_mlp": 0.02566404, "balance_loss_clip": 1.04788232, "balance_loss_mlp": 0.99989331, "epoch": 0.7215775867251849, "flos": 35736108606720.0, "grad_norm": 1.9424135754922212, "language_loss": 0.70870423, "learning_rate": 7.593819212812818e-07, "loss": 0.74816239, "num_input_tokens_seen": 129060860, "step": 6001, "time_per_iteration": 2.86793851852417 }, { "auxiliary_loss_clip": 0.01222765, "auxiliary_loss_mlp": 0.01028587, "balance_loss_clip": 1.04885435, "balance_loss_mlp": 1.02157736, "epoch": 0.721697829615824, "flos": 20372410909440.0, "grad_norm": 2.0600584061111133, "language_loss": 0.71810937, "learning_rate": 7.587710231393508e-07, "loss": 0.74062288, "num_input_tokens_seen": 129079215, "step": 6002, "time_per_iteration": 2.6317508220672607 }, { "auxiliary_loss_clip": 0.01465584, "auxiliary_loss_mlp": 0.01026829, "balance_loss_clip": 1.04039013, "balance_loss_mlp": 1.0198729, "epoch": 0.721818072506463, "flos": 20229809915520.0, "grad_norm": 2.105942035367066, "language_loss": 0.8374818, "learning_rate": 7.581603132908685e-07, "loss": 0.8624059, "num_input_tokens_seen": 129097185, "step": 6003, "time_per_iteration": 2.8666701316833496 }, { "auxiliary_loss_clip": 0.01315322, "auxiliary_loss_mlp": 0.01025533, "balance_loss_clip": 1.0434618, "balance_loss_mlp": 1.01835704, "epoch": 0.7219383153971022, "flos": 18186887888640.0, "grad_norm": 2.018201871612926, "language_loss": 0.78438532, "learning_rate": 7.575497918284795e-07, "loss": 0.80779386, "num_input_tokens_seen": 129114730, "step": 6004, "time_per_iteration": 2.901970386505127 }, { "auxiliary_loss_clip": 0.01179523, "auxiliary_loss_mlp": 0.01032478, "balance_loss_clip": 1.04993534, "balance_loss_mlp": 1.0253253, "epoch": 0.7220585582877412, "flos": 17342124854400.0, "grad_norm": 2.3742287445896864, "language_loss": 0.74533558, "learning_rate": 7.569394588447984e-07, "loss": 0.76745558, "num_input_tokens_seen": 129131745, "step": 6005, "time_per_iteration": 2.5633316040039062 }, { "auxiliary_loss_clip": 0.01218745, "auxiliary_loss_mlp": 0.01024781, "balance_loss_clip": 1.04548168, "balance_loss_mlp": 1.01794767, "epoch": 0.7221788011783803, "flos": 16976338704000.0, "grad_norm": 2.5642413737107606, "language_loss": 0.77805853, "learning_rate": 7.563293144324146e-07, "loss": 0.80049372, "num_input_tokens_seen": 129147295, "step": 6006, "time_per_iteration": 2.5446665287017822 }, { "auxiliary_loss_clip": 0.01173818, "auxiliary_loss_mlp": 0.01025625, "balance_loss_clip": 1.05016828, "balance_loss_mlp": 1.01859772, "epoch": 0.7222990440690195, "flos": 26286359702400.0, "grad_norm": 2.5671355103698255, "language_loss": 0.80264354, "learning_rate": 7.557193586838834e-07, "loss": 0.82463795, "num_input_tokens_seen": 129162660, "step": 6007, "time_per_iteration": 2.6515095233917236 }, { "auxiliary_loss_clip": 0.01175493, "auxiliary_loss_mlp": 0.01024028, "balance_loss_clip": 1.04585195, "balance_loss_mlp": 1.0167737, "epoch": 0.7224192869596585, "flos": 17601687509760.0, "grad_norm": 2.2895862739696797, "language_loss": 0.70753241, "learning_rate": 7.551095916917371e-07, "loss": 0.72952753, "num_input_tokens_seen": 129179990, "step": 6008, "time_per_iteration": 2.640380620956421 }, { "auxiliary_loss_clip": 0.01233493, "auxiliary_loss_mlp": 0.01026905, "balance_loss_clip": 1.0453763, "balance_loss_mlp": 1.01903152, "epoch": 0.7225395298502976, "flos": 12932331016320.0, "grad_norm": 3.589529598952646, "language_loss": 0.66543555, "learning_rate": 7.545000135484758e-07, "loss": 0.68803954, "num_input_tokens_seen": 129197425, "step": 6009, "time_per_iteration": 3.741142749786377 }, { "auxiliary_loss_clip": 0.01176298, "auxiliary_loss_mlp": 0.02568139, "balance_loss_clip": 1.04978848, "balance_loss_mlp": 0.99988008, "epoch": 0.7226597727409367, "flos": 29643899592960.0, "grad_norm": 4.216731882571455, "language_loss": 0.6276564, "learning_rate": 7.538906243465714e-07, "loss": 0.66510081, "num_input_tokens_seen": 129217560, "step": 6010, "time_per_iteration": 3.5479612350463867 }, { "auxiliary_loss_clip": 0.011763, "auxiliary_loss_mlp": 0.01024516, "balance_loss_clip": 1.05130959, "balance_loss_mlp": 1.01737881, "epoch": 0.7227800156315758, "flos": 13771635183360.0, "grad_norm": 3.3152192774045237, "language_loss": 0.7844466, "learning_rate": 7.5328142417847e-07, "loss": 0.80645472, "num_input_tokens_seen": 129234325, "step": 6011, "time_per_iteration": 2.519859790802002 }, { "auxiliary_loss_clip": 0.012222, "auxiliary_loss_mlp": 0.01026774, "balance_loss_clip": 1.04552698, "balance_loss_mlp": 1.01996684, "epoch": 0.7229002585222148, "flos": 20301882554880.0, "grad_norm": 1.743762986854525, "language_loss": 0.69180226, "learning_rate": 7.526724131365838e-07, "loss": 0.71429193, "num_input_tokens_seen": 129255280, "step": 6012, "time_per_iteration": 3.4501099586486816 }, { "auxiliary_loss_clip": 0.01273776, "auxiliary_loss_mlp": 0.01027784, "balance_loss_clip": 1.0502224, "balance_loss_mlp": 1.02027988, "epoch": 0.723020501412854, "flos": 16581250033920.0, "grad_norm": 2.08814031914499, "language_loss": 0.70636976, "learning_rate": 7.520635913133017e-07, "loss": 0.72938538, "num_input_tokens_seen": 129273910, "step": 6013, "time_per_iteration": 2.6019370555877686 }, { "auxiliary_loss_clip": 0.01230729, "auxiliary_loss_mlp": 0.01022611, "balance_loss_clip": 1.04996765, "balance_loss_mlp": 1.01517189, "epoch": 0.7231407443034931, "flos": 28548300908160.0, "grad_norm": 1.9240050333631207, "language_loss": 0.82361829, "learning_rate": 7.514549588009798e-07, "loss": 0.84615165, "num_input_tokens_seen": 129294785, "step": 6014, "time_per_iteration": 2.6869900226593018 }, { "auxiliary_loss_clip": 0.01277929, "auxiliary_loss_mlp": 0.01025342, "balance_loss_clip": 1.04975665, "balance_loss_mlp": 1.01755762, "epoch": 0.7232609871941321, "flos": 30008536508160.0, "grad_norm": 2.228321915394622, "language_loss": 0.70842957, "learning_rate": 7.508465156919492e-07, "loss": 0.73146236, "num_input_tokens_seen": 129318295, "step": 6015, "time_per_iteration": 2.7669787406921387 }, { "auxiliary_loss_clip": 0.01272888, "auxiliary_loss_mlp": 0.01030041, "balance_loss_clip": 1.04453194, "balance_loss_mlp": 1.02278697, "epoch": 0.7233812300847713, "flos": 16654005031680.0, "grad_norm": 3.6572921041609017, "language_loss": 0.61109471, "learning_rate": 7.502382620785083e-07, "loss": 0.63412404, "num_input_tokens_seen": 129334845, "step": 6016, "time_per_iteration": 2.5856759548187256 }, { "auxiliary_loss_clip": 0.01219768, "auxiliary_loss_mlp": 0.01005721, "balance_loss_clip": 1.01037312, "balance_loss_mlp": 1.00475502, "epoch": 0.7235014729754103, "flos": 67258784050560.0, "grad_norm": 0.8076483368732894, "language_loss": 0.62521088, "learning_rate": 7.496301980529289e-07, "loss": 0.64746571, "num_input_tokens_seen": 129398055, "step": 6017, "time_per_iteration": 3.274460554122925 }, { "auxiliary_loss_clip": 0.01178152, "auxiliary_loss_mlp": 0.01027558, "balance_loss_clip": 1.0507828, "balance_loss_mlp": 1.02026832, "epoch": 0.7236217158660494, "flos": 26943237671040.0, "grad_norm": 2.8296425511068293, "language_loss": 0.74300975, "learning_rate": 7.490223237074547e-07, "loss": 0.76506686, "num_input_tokens_seen": 129417765, "step": 6018, "time_per_iteration": 3.7193727493286133 }, { "auxiliary_loss_clip": 0.01324859, "auxiliary_loss_mlp": 0.01023664, "balance_loss_clip": 1.04272532, "balance_loss_mlp": 1.01655316, "epoch": 0.7237419587566886, "flos": 29423372042880.0, "grad_norm": 1.8841978972129265, "language_loss": 0.66067874, "learning_rate": 7.484146391342989e-07, "loss": 0.68416399, "num_input_tokens_seen": 129437560, "step": 6019, "time_per_iteration": 2.8049747943878174 }, { "auxiliary_loss_clip": 0.01268938, "auxiliary_loss_mlp": 0.0102513, "balance_loss_clip": 1.04500711, "balance_loss_mlp": 1.01851726, "epoch": 0.7238622016473276, "flos": 17821496787840.0, "grad_norm": 2.203243027028387, "language_loss": 0.5709548, "learning_rate": 7.478071444256484e-07, "loss": 0.59389544, "num_input_tokens_seen": 129455320, "step": 6020, "time_per_iteration": 2.6695635318756104 }, { "auxiliary_loss_clip": 0.01234216, "auxiliary_loss_mlp": 0.01028021, "balance_loss_clip": 1.04678428, "balance_loss_mlp": 1.02076685, "epoch": 0.7239824445379667, "flos": 25739117020800.0, "grad_norm": 2.0552200859045913, "language_loss": 0.79293263, "learning_rate": 7.471998396736579e-07, "loss": 0.81555498, "num_input_tokens_seen": 129475700, "step": 6021, "time_per_iteration": 2.776374340057373 }, { "auxiliary_loss_clip": 0.01323269, "auxiliary_loss_mlp": 0.01029329, "balance_loss_clip": 1.04617357, "balance_loss_mlp": 1.02239156, "epoch": 0.7241026874286057, "flos": 23148916398720.0, "grad_norm": 1.8393289774064632, "language_loss": 0.7626003, "learning_rate": 7.465927249704549e-07, "loss": 0.78612626, "num_input_tokens_seen": 129493585, "step": 6022, "time_per_iteration": 2.8894524574279785 }, { "auxiliary_loss_clip": 0.01222066, "auxiliary_loss_mlp": 0.01024331, "balance_loss_clip": 1.04796731, "balance_loss_mlp": 1.01711321, "epoch": 0.7242229303192449, "flos": 20266905686400.0, "grad_norm": 1.969001359642246, "language_loss": 0.77522135, "learning_rate": 7.459858004081398e-07, "loss": 0.79768533, "num_input_tokens_seen": 129511555, "step": 6023, "time_per_iteration": 2.6393566131591797 }, { "auxiliary_loss_clip": 0.01214257, "auxiliary_loss_mlp": 0.01003755, "balance_loss_clip": 1.01071763, "balance_loss_mlp": 1.00283706, "epoch": 0.724343173209884, "flos": 62311659684480.0, "grad_norm": 0.7100737493187583, "language_loss": 0.57990015, "learning_rate": 7.453790660787815e-07, "loss": 0.60208023, "num_input_tokens_seen": 129579650, "step": 6024, "time_per_iteration": 3.3846049308776855 }, { "auxiliary_loss_clip": 0.01277013, "auxiliary_loss_mlp": 0.01029589, "balance_loss_clip": 1.04818344, "balance_loss_mlp": 1.02229369, "epoch": 0.724463416100523, "flos": 35006403813120.0, "grad_norm": 4.645017133926816, "language_loss": 0.63636297, "learning_rate": 7.447725220744214e-07, "loss": 0.65942895, "num_input_tokens_seen": 129601895, "step": 6025, "time_per_iteration": 2.7954699993133545 }, { "auxiliary_loss_clip": 0.01176822, "auxiliary_loss_mlp": 0.01024098, "balance_loss_clip": 1.0497613, "balance_loss_mlp": 1.01681995, "epoch": 0.7245836589911622, "flos": 21871968923520.0, "grad_norm": 2.7617900828654802, "language_loss": 0.770872, "learning_rate": 7.441661684870717e-07, "loss": 0.79288119, "num_input_tokens_seen": 129622150, "step": 6026, "time_per_iteration": 2.6312646865844727 }, { "auxiliary_loss_clip": 0.0117683, "auxiliary_loss_mlp": 0.01023243, "balance_loss_clip": 1.05002451, "balance_loss_mlp": 1.01619172, "epoch": 0.7247039018818012, "flos": 23006494972800.0, "grad_norm": 1.776870145160391, "language_loss": 0.81805676, "learning_rate": 7.435600054087152e-07, "loss": 0.84005749, "num_input_tokens_seen": 129644315, "step": 6027, "time_per_iteration": 2.673257827758789 }, { "auxiliary_loss_clip": 0.01180004, "auxiliary_loss_mlp": 0.01030249, "balance_loss_clip": 1.05203664, "balance_loss_mlp": 1.02282834, "epoch": 0.7248241447724403, "flos": 31722588587520.0, "grad_norm": 2.2890636628303245, "language_loss": 0.74550152, "learning_rate": 7.42954032931308e-07, "loss": 0.76760399, "num_input_tokens_seen": 129665355, "step": 6028, "time_per_iteration": 2.6606762409210205 }, { "auxiliary_loss_clip": 0.01274393, "auxiliary_loss_mlp": 0.01029746, "balance_loss_clip": 1.04751992, "balance_loss_mlp": 1.02263784, "epoch": 0.7249443876630794, "flos": 34896984007680.0, "grad_norm": 1.9485468480348205, "language_loss": 0.74740916, "learning_rate": 7.423482511467733e-07, "loss": 0.77045059, "num_input_tokens_seen": 129686125, "step": 6029, "time_per_iteration": 2.7783994674682617 }, { "auxiliary_loss_clip": 0.01414465, "auxiliary_loss_mlp": 0.01028055, "balance_loss_clip": 1.0415225, "balance_loss_mlp": 1.02125168, "epoch": 0.7250646305537185, "flos": 26359294268160.0, "grad_norm": 2.8948561856401813, "language_loss": 0.64608759, "learning_rate": 7.417426601470099e-07, "loss": 0.6705128, "num_input_tokens_seen": 129706485, "step": 6030, "time_per_iteration": 2.8125898838043213 }, { "auxiliary_loss_clip": 0.01227646, "auxiliary_loss_mlp": 0.01028766, "balance_loss_clip": 1.0490936, "balance_loss_mlp": 1.02114904, "epoch": 0.7251848734443576, "flos": 30081614728320.0, "grad_norm": 2.367959889933403, "language_loss": 0.78555286, "learning_rate": 7.411372600238841e-07, "loss": 0.80811691, "num_input_tokens_seen": 129727100, "step": 6031, "time_per_iteration": 2.6708152294158936 }, { "auxiliary_loss_clip": 0.01177006, "auxiliary_loss_mlp": 0.01023671, "balance_loss_clip": 1.05052638, "balance_loss_mlp": 1.0163343, "epoch": 0.7253051163349967, "flos": 17785262943360.0, "grad_norm": 2.3394326850840343, "language_loss": 0.74282992, "learning_rate": 7.405320508692346e-07, "loss": 0.76483673, "num_input_tokens_seen": 129745840, "step": 6032, "time_per_iteration": 2.5553367137908936 }, { "auxiliary_loss_clip": 0.01172131, "auxiliary_loss_mlp": 0.01024861, "balance_loss_clip": 1.04982066, "balance_loss_mlp": 1.01875174, "epoch": 0.7254253592256358, "flos": 12641346938880.0, "grad_norm": 2.3085958953795482, "language_loss": 0.75458527, "learning_rate": 7.399270327748727e-07, "loss": 0.77655524, "num_input_tokens_seen": 129763500, "step": 6033, "time_per_iteration": 2.5766327381134033 }, { "auxiliary_loss_clip": 0.01326428, "auxiliary_loss_mlp": 0.02561479, "balance_loss_clip": 1.04559541, "balance_loss_mlp": 0.99990588, "epoch": 0.7255456021162748, "flos": 27199208966400.0, "grad_norm": 1.9469647934556997, "language_loss": 0.74356413, "learning_rate": 7.39322205832577e-07, "loss": 0.78244328, "num_input_tokens_seen": 129784390, "step": 6034, "time_per_iteration": 2.756756544113159 }, { "auxiliary_loss_clip": 0.01270306, "auxiliary_loss_mlp": 0.01025541, "balance_loss_clip": 1.04632676, "balance_loss_mlp": 1.01801264, "epoch": 0.725665845006914, "flos": 21288205088640.0, "grad_norm": 1.9013777018159936, "language_loss": 0.81249762, "learning_rate": 7.387175701341009e-07, "loss": 0.83545607, "num_input_tokens_seen": 129803060, "step": 6035, "time_per_iteration": 3.646077871322632 }, { "auxiliary_loss_clip": 0.01222947, "auxiliary_loss_mlp": 0.01026751, "balance_loss_clip": 1.04731011, "balance_loss_mlp": 1.01921153, "epoch": 0.7257860878975531, "flos": 16033684129920.0, "grad_norm": 2.497714497225054, "language_loss": 0.72009808, "learning_rate": 7.381131257711659e-07, "loss": 0.74259508, "num_input_tokens_seen": 129820165, "step": 6036, "time_per_iteration": 3.536708354949951 }, { "auxiliary_loss_clip": 0.01274358, "auxiliary_loss_mlp": 0.01026182, "balance_loss_clip": 1.0527879, "balance_loss_mlp": 1.01954508, "epoch": 0.7259063307881921, "flos": 12129943052160.0, "grad_norm": 1.9999774317361363, "language_loss": 0.83673066, "learning_rate": 7.375088728354677e-07, "loss": 0.85973603, "num_input_tokens_seen": 129835195, "step": 6037, "time_per_iteration": 2.6322882175445557 }, { "auxiliary_loss_clip": 0.01325233, "auxiliary_loss_mlp": 0.01027665, "balance_loss_clip": 1.04525828, "balance_loss_mlp": 1.02110517, "epoch": 0.7260265736788313, "flos": 30443845432320.0, "grad_norm": 2.1291199128661042, "language_loss": 0.67728299, "learning_rate": 7.369048114186691e-07, "loss": 0.70081198, "num_input_tokens_seen": 129856240, "step": 6038, "time_per_iteration": 3.8121414184570312 }, { "auxiliary_loss_clip": 0.01231447, "auxiliary_loss_mlp": 0.02562033, "balance_loss_clip": 1.04790294, "balance_loss_mlp": 0.9998973, "epoch": 0.7261468165694703, "flos": 21142264129920.0, "grad_norm": 1.6935255311534207, "language_loss": 0.82821482, "learning_rate": 7.363009416124055e-07, "loss": 0.8661496, "num_input_tokens_seen": 129875565, "step": 6039, "time_per_iteration": 2.7420573234558105 }, { "auxiliary_loss_clip": 0.01324836, "auxiliary_loss_mlp": 0.0102896, "balance_loss_clip": 1.04666328, "balance_loss_mlp": 1.02119982, "epoch": 0.7262670594601094, "flos": 22306308180480.0, "grad_norm": 5.160769033336852, "language_loss": 0.62841344, "learning_rate": 7.356972635082852e-07, "loss": 0.65195137, "num_input_tokens_seen": 129894420, "step": 6040, "time_per_iteration": 2.695060968399048 }, { "auxiliary_loss_clip": 0.01374301, "auxiliary_loss_mlp": 0.01027574, "balance_loss_clip": 1.04854059, "balance_loss_mlp": 1.02003396, "epoch": 0.7263873023507486, "flos": 25335049950720.0, "grad_norm": 2.1063358523534204, "language_loss": 0.75660747, "learning_rate": 7.35093777197884e-07, "loss": 0.78062618, "num_input_tokens_seen": 129914490, "step": 6041, "time_per_iteration": 2.8083248138427734 }, { "auxiliary_loss_clip": 0.01269719, "auxiliary_loss_mlp": 0.01023565, "balance_loss_clip": 1.04561424, "balance_loss_mlp": 1.01702082, "epoch": 0.7265075452413876, "flos": 23878621192320.0, "grad_norm": 2.374038268639117, "language_loss": 0.85221308, "learning_rate": 7.344904827727525e-07, "loss": 0.87514591, "num_input_tokens_seen": 129931670, "step": 6042, "time_per_iteration": 2.8112728595733643 }, { "auxiliary_loss_clip": 0.01326096, "auxiliary_loss_mlp": 0.01030345, "balance_loss_clip": 1.04388356, "balance_loss_mlp": 1.02391088, "epoch": 0.7266277881320267, "flos": 28724549967360.0, "grad_norm": 3.285100262430146, "language_loss": 0.73433149, "learning_rate": 7.338873803244076e-07, "loss": 0.75789589, "num_input_tokens_seen": 129946905, "step": 6043, "time_per_iteration": 2.778938055038452 }, { "auxiliary_loss_clip": 0.01269828, "auxiliary_loss_mlp": 0.01023842, "balance_loss_clip": 1.04657543, "balance_loss_mlp": 1.01750946, "epoch": 0.7267480310226658, "flos": 24863507182080.0, "grad_norm": 1.860551875855276, "language_loss": 0.80908394, "learning_rate": 7.332844699443401e-07, "loss": 0.83202058, "num_input_tokens_seen": 129965505, "step": 6044, "time_per_iteration": 3.6199100017547607 }, { "auxiliary_loss_clip": 0.01365885, "auxiliary_loss_mlp": 0.01028275, "balance_loss_clip": 1.0419488, "balance_loss_mlp": 1.02186108, "epoch": 0.7268682739133049, "flos": 27198490694400.0, "grad_norm": 1.888853830692175, "language_loss": 0.75661415, "learning_rate": 7.326817517240121e-07, "loss": 0.78055573, "num_input_tokens_seen": 129987210, "step": 6045, "time_per_iteration": 2.7752795219421387 }, { "auxiliary_loss_clip": 0.01225033, "auxiliary_loss_mlp": 0.02560579, "balance_loss_clip": 1.04745162, "balance_loss_mlp": 0.99993235, "epoch": 0.7269885168039439, "flos": 33508138688640.0, "grad_norm": 2.175521404754996, "language_loss": 0.83389729, "learning_rate": 7.320792257548545e-07, "loss": 0.87175333, "num_input_tokens_seen": 130008385, "step": 6046, "time_per_iteration": 2.761828660964966 }, { "auxiliary_loss_clip": 0.01280014, "auxiliary_loss_mlp": 0.01026806, "balance_loss_clip": 1.04905832, "balance_loss_mlp": 1.01921844, "epoch": 0.7271087596945831, "flos": 24313750548480.0, "grad_norm": 2.100964466446376, "language_loss": 0.76432472, "learning_rate": 7.314768921282704e-07, "loss": 0.78739297, "num_input_tokens_seen": 130029040, "step": 6047, "time_per_iteration": 2.7057058811187744 }, { "auxiliary_loss_clip": 0.01227317, "auxiliary_loss_mlp": 0.01025928, "balance_loss_clip": 1.04792595, "balance_loss_mlp": 1.01884103, "epoch": 0.7272290025852222, "flos": 23805147922560.0, "grad_norm": 2.547771716483514, "language_loss": 0.71906888, "learning_rate": 7.30874750935633e-07, "loss": 0.74160135, "num_input_tokens_seen": 130048725, "step": 6048, "time_per_iteration": 2.6897242069244385 }, { "auxiliary_loss_clip": 0.01319828, "auxiliary_loss_mlp": 0.0102482, "balance_loss_clip": 1.04632413, "balance_loss_mlp": 1.01842105, "epoch": 0.7273492454758612, "flos": 16720367408640.0, "grad_norm": 2.1382923134064886, "language_loss": 0.79569548, "learning_rate": 7.30272802268286e-07, "loss": 0.81914198, "num_input_tokens_seen": 130065720, "step": 6049, "time_per_iteration": 2.6750850677490234 }, { "auxiliary_loss_clip": 0.01402176, "auxiliary_loss_mlp": 0.01022459, "balance_loss_clip": 1.03946352, "balance_loss_mlp": 1.01582456, "epoch": 0.7274694883665004, "flos": 28031330413440.0, "grad_norm": 2.1637325769644358, "language_loss": 0.76336777, "learning_rate": 7.29671046217547e-07, "loss": 0.78761411, "num_input_tokens_seen": 130084830, "step": 6050, "time_per_iteration": 2.8447535037994385 }, { "auxiliary_loss_clip": 0.01324602, "auxiliary_loss_mlp": 0.01025042, "balance_loss_clip": 1.04561853, "balance_loss_mlp": 1.01866436, "epoch": 0.7275897312571394, "flos": 30372706546560.0, "grad_norm": 1.8869149947985737, "language_loss": 0.82212669, "learning_rate": 7.290694828746988e-07, "loss": 0.84562314, "num_input_tokens_seen": 130104495, "step": 6051, "time_per_iteration": 2.7554259300231934 }, { "auxiliary_loss_clip": 0.01329418, "auxiliary_loss_mlp": 0.01023508, "balance_loss_clip": 1.04505849, "balance_loss_mlp": 1.01681781, "epoch": 0.7277099741477785, "flos": 19204775498880.0, "grad_norm": 1.8887329862158206, "language_loss": 0.85652339, "learning_rate": 7.284681123310004e-07, "loss": 0.88005269, "num_input_tokens_seen": 130123210, "step": 6052, "time_per_iteration": 2.72348690032959 }, { "auxiliary_loss_clip": 0.01223805, "auxiliary_loss_mlp": 0.01027125, "balance_loss_clip": 1.04851794, "balance_loss_mlp": 1.02003825, "epoch": 0.7278302170384175, "flos": 20667884186880.0, "grad_norm": 1.8365552841508, "language_loss": 0.7941044, "learning_rate": 7.27866934677678e-07, "loss": 0.81661367, "num_input_tokens_seen": 130142880, "step": 6053, "time_per_iteration": 2.669830322265625 }, { "auxiliary_loss_clip": 0.01368854, "auxiliary_loss_mlp": 0.0102488, "balance_loss_clip": 1.04311633, "balance_loss_mlp": 1.01790023, "epoch": 0.7279504599290567, "flos": 19093200877440.0, "grad_norm": 1.6213937548599522, "language_loss": 0.78908932, "learning_rate": 7.272659500059297e-07, "loss": 0.81302667, "num_input_tokens_seen": 130160220, "step": 6054, "time_per_iteration": 2.7290589809417725 }, { "auxiliary_loss_clip": 0.01223166, "auxiliary_loss_mlp": 0.01027572, "balance_loss_clip": 1.04865456, "balance_loss_mlp": 1.02065206, "epoch": 0.7280707028196958, "flos": 19062174504960.0, "grad_norm": 3.0731355216415124, "language_loss": 0.80492854, "learning_rate": 7.266651584069264e-07, "loss": 0.82743585, "num_input_tokens_seen": 130177885, "step": 6055, "time_per_iteration": 2.6831986904144287 }, { "auxiliary_loss_clip": 0.01229132, "auxiliary_loss_mlp": 0.01028047, "balance_loss_clip": 1.05161881, "balance_loss_mlp": 1.02157354, "epoch": 0.7281909457103348, "flos": 37196308293120.0, "grad_norm": 1.6289412538524368, "language_loss": 0.56646335, "learning_rate": 7.260645599718045e-07, "loss": 0.58903515, "num_input_tokens_seen": 130204240, "step": 6056, "time_per_iteration": 2.732125997543335 }, { "auxiliary_loss_clip": 0.01275372, "auxiliary_loss_mlp": 0.01027451, "balance_loss_clip": 1.04860258, "balance_loss_mlp": 1.02026868, "epoch": 0.728311188600974, "flos": 20667094087680.0, "grad_norm": 6.29477714699912, "language_loss": 0.67384863, "learning_rate": 7.254641547916767e-07, "loss": 0.69687688, "num_input_tokens_seen": 130221735, "step": 6057, "time_per_iteration": 2.688161849975586 }, { "auxiliary_loss_clip": 0.01176766, "auxiliary_loss_mlp": 0.01026846, "balance_loss_clip": 1.05136514, "balance_loss_mlp": 1.01979232, "epoch": 0.728431431491613, "flos": 28840685616000.0, "grad_norm": 1.9558641211295063, "language_loss": 0.6934371, "learning_rate": 7.248639429576226e-07, "loss": 0.71547323, "num_input_tokens_seen": 130241190, "step": 6058, "time_per_iteration": 2.630911350250244 }, { "auxiliary_loss_clip": 0.0122862, "auxiliary_loss_mlp": 0.01026436, "balance_loss_clip": 1.05123734, "balance_loss_mlp": 1.01940513, "epoch": 0.7285516743822521, "flos": 25991856092160.0, "grad_norm": 1.7234552126907416, "language_loss": 0.72158515, "learning_rate": 7.242639245606959e-07, "loss": 0.74413568, "num_input_tokens_seen": 130260980, "step": 6059, "time_per_iteration": 2.688734531402588 }, { "auxiliary_loss_clip": 0.01278964, "auxiliary_loss_mlp": 0.01028396, "balance_loss_clip": 1.04653096, "balance_loss_mlp": 1.02140749, "epoch": 0.7286719172728913, "flos": 16399721675520.0, "grad_norm": 2.0887998958915763, "language_loss": 0.82269895, "learning_rate": 7.236640996919168e-07, "loss": 0.84577262, "num_input_tokens_seen": 130280025, "step": 6060, "time_per_iteration": 2.642817497253418 }, { "auxiliary_loss_clip": 0.01228416, "auxiliary_loss_mlp": 0.01026582, "balance_loss_clip": 1.05040717, "balance_loss_mlp": 1.02017164, "epoch": 0.7287921601635303, "flos": 22018161277440.0, "grad_norm": 1.8166457875773183, "language_loss": 0.70732379, "learning_rate": 7.230644684422782e-07, "loss": 0.72987384, "num_input_tokens_seen": 130300255, "step": 6061, "time_per_iteration": 3.5984373092651367 }, { "auxiliary_loss_clip": 0.01319558, "auxiliary_loss_mlp": 0.01022363, "balance_loss_clip": 1.04512572, "balance_loss_mlp": 1.01501369, "epoch": 0.7289124030541694, "flos": 24600927784320.0, "grad_norm": 1.9019326046748992, "language_loss": 0.81377208, "learning_rate": 7.224650309027451e-07, "loss": 0.83719122, "num_input_tokens_seen": 130320005, "step": 6062, "time_per_iteration": 3.599729061126709 }, { "auxiliary_loss_clip": 0.01231772, "auxiliary_loss_mlp": 0.01022018, "balance_loss_clip": 1.05296063, "balance_loss_mlp": 1.01525331, "epoch": 0.7290326459448085, "flos": 21393638484480.0, "grad_norm": 2.3157866561199065, "language_loss": 0.68881989, "learning_rate": 7.218657871642506e-07, "loss": 0.71135777, "num_input_tokens_seen": 130338810, "step": 6063, "time_per_iteration": 3.503795623779297 }, { "auxiliary_loss_clip": 0.01180925, "auxiliary_loss_mlp": 0.01023012, "balance_loss_clip": 1.05302691, "balance_loss_mlp": 1.01583004, "epoch": 0.7291528888354476, "flos": 18587686821120.0, "grad_norm": 4.389339114217605, "language_loss": 0.62300622, "learning_rate": 7.212667373177012e-07, "loss": 0.64504564, "num_input_tokens_seen": 130353805, "step": 6064, "time_per_iteration": 2.670264959335327 }, { "auxiliary_loss_clip": 0.01325612, "auxiliary_loss_mlp": 0.01027656, "balance_loss_clip": 1.0440259, "balance_loss_mlp": 1.0207715, "epoch": 0.7292731317260867, "flos": 18951066760320.0, "grad_norm": 1.648301898834153, "language_loss": 0.75135446, "learning_rate": 7.206678814539704e-07, "loss": 0.77488714, "num_input_tokens_seen": 130372105, "step": 6065, "time_per_iteration": 2.693706512451172 }, { "auxiliary_loss_clip": 0.01382571, "auxiliary_loss_mlp": 0.01026315, "balance_loss_clip": 1.04518259, "balance_loss_mlp": 1.01965117, "epoch": 0.7293933746167258, "flos": 21067569797760.0, "grad_norm": 1.4884004947086507, "language_loss": 0.72700655, "learning_rate": 7.20069219663904e-07, "loss": 0.75109541, "num_input_tokens_seen": 130391990, "step": 6066, "time_per_iteration": 2.7409887313842773 }, { "auxiliary_loss_clip": 0.01228943, "auxiliary_loss_mlp": 0.01028938, "balance_loss_clip": 1.04799676, "balance_loss_mlp": 1.0217433, "epoch": 0.7295136175073649, "flos": 22453326547200.0, "grad_norm": 2.957682660791064, "language_loss": 0.8010956, "learning_rate": 7.1947075203832e-07, "loss": 0.82367438, "num_input_tokens_seen": 130411970, "step": 6067, "time_per_iteration": 2.650104284286499 }, { "auxiliary_loss_clip": 0.0106684, "auxiliary_loss_mlp": 0.01002938, "balance_loss_clip": 1.01102829, "balance_loss_mlp": 1.00201452, "epoch": 0.7296338603980039, "flos": 56125506648960.0, "grad_norm": 0.8565103802203478, "language_loss": 0.60123616, "learning_rate": 7.188724786680049e-07, "loss": 0.62193388, "num_input_tokens_seen": 130472440, "step": 6068, "time_per_iteration": 3.177157402038574 }, { "auxiliary_loss_clip": 0.01273964, "auxiliary_loss_mlp": 0.01020879, "balance_loss_clip": 1.04631984, "balance_loss_mlp": 1.01395547, "epoch": 0.7297541032886431, "flos": 25228287751680.0, "grad_norm": 1.6676783057016382, "language_loss": 0.75658214, "learning_rate": 7.182743996437162e-07, "loss": 0.77953064, "num_input_tokens_seen": 130491975, "step": 6069, "time_per_iteration": 2.733860969543457 }, { "auxiliary_loss_clip": 0.01330014, "auxiliary_loss_mlp": 0.01027853, "balance_loss_clip": 1.04605508, "balance_loss_mlp": 1.01968694, "epoch": 0.7298743461792822, "flos": 26467600752000.0, "grad_norm": 32.662405792892315, "language_loss": 0.68684548, "learning_rate": 7.176765150561819e-07, "loss": 0.71042418, "num_input_tokens_seen": 130510580, "step": 6070, "time_per_iteration": 3.6851398944854736 }, { "auxiliary_loss_clip": 0.01178449, "auxiliary_loss_mlp": 0.01026431, "balance_loss_clip": 1.05037045, "balance_loss_mlp": 1.01953804, "epoch": 0.7299945890699212, "flos": 19569053278080.0, "grad_norm": 2.308865361980118, "language_loss": 0.80080682, "learning_rate": 7.170788249961002e-07, "loss": 0.82285565, "num_input_tokens_seen": 130529090, "step": 6071, "time_per_iteration": 2.596703052520752 }, { "auxiliary_loss_clip": 0.01172809, "auxiliary_loss_mlp": 0.0102317, "balance_loss_clip": 1.04885352, "balance_loss_mlp": 1.01605916, "epoch": 0.7301148319605604, "flos": 22928963466240.0, "grad_norm": 2.0132957587283777, "language_loss": 0.88146675, "learning_rate": 7.164813295541418e-07, "loss": 0.90342659, "num_input_tokens_seen": 130548655, "step": 6072, "time_per_iteration": 2.5773067474365234 }, { "auxiliary_loss_clip": 0.01275283, "auxiliary_loss_mlp": 0.01029119, "balance_loss_clip": 1.04653287, "balance_loss_mlp": 1.02178717, "epoch": 0.7302350748511994, "flos": 25369703596800.0, "grad_norm": 1.9642663707625094, "language_loss": 0.70314252, "learning_rate": 7.15884028820944e-07, "loss": 0.72618651, "num_input_tokens_seen": 130567710, "step": 6073, "time_per_iteration": 2.7155799865722656 }, { "auxiliary_loss_clip": 0.01322762, "auxiliary_loss_mlp": 0.01023001, "balance_loss_clip": 1.04403639, "balance_loss_mlp": 1.0156126, "epoch": 0.7303553177418385, "flos": 27819170732160.0, "grad_norm": 3.1884172961305746, "language_loss": 0.60488945, "learning_rate": 7.152869228871185e-07, "loss": 0.6283471, "num_input_tokens_seen": 130590195, "step": 6074, "time_per_iteration": 2.7576606273651123 }, { "auxiliary_loss_clip": 0.01275984, "auxiliary_loss_mlp": 0.01028902, "balance_loss_clip": 1.04777622, "balance_loss_mlp": 1.02135026, "epoch": 0.7304755606324776, "flos": 24426510318720.0, "grad_norm": 1.8341588104007065, "language_loss": 0.72652757, "learning_rate": 7.146900118432457e-07, "loss": 0.74957639, "num_input_tokens_seen": 130609940, "step": 6075, "time_per_iteration": 2.7106711864471436 }, { "auxiliary_loss_clip": 0.01470868, "auxiliary_loss_mlp": 0.01023792, "balance_loss_clip": 1.03616476, "balance_loss_mlp": 1.01692581, "epoch": 0.7305958035231167, "flos": 23840483927040.0, "grad_norm": 2.0556127086186464, "language_loss": 0.86049652, "learning_rate": 7.140932957798753e-07, "loss": 0.88544309, "num_input_tokens_seen": 130628380, "step": 6076, "time_per_iteration": 2.918816089630127 }, { "auxiliary_loss_clip": 0.01278701, "auxiliary_loss_mlp": 0.01021454, "balance_loss_clip": 1.04515505, "balance_loss_mlp": 1.01476312, "epoch": 0.7307160464137558, "flos": 16726939597440.0, "grad_norm": 7.246585463338025, "language_loss": 0.71078533, "learning_rate": 7.134967747875309e-07, "loss": 0.73378688, "num_input_tokens_seen": 130646590, "step": 6077, "time_per_iteration": 2.7619755268096924 }, { "auxiliary_loss_clip": 0.01222633, "auxiliary_loss_mlp": 0.01024435, "balance_loss_clip": 1.04733551, "balance_loss_mlp": 1.01702642, "epoch": 0.7308362893043949, "flos": 21798280172160.0, "grad_norm": 2.4698891656860185, "language_loss": 0.81987423, "learning_rate": 7.129004489567014e-07, "loss": 0.84234494, "num_input_tokens_seen": 130664070, "step": 6078, "time_per_iteration": 2.6458818912506104 }, { "auxiliary_loss_clip": 0.01323125, "auxiliary_loss_mlp": 0.01022317, "balance_loss_clip": 1.04402924, "balance_loss_mlp": 1.01572812, "epoch": 0.730956532195034, "flos": 10707377840640.0, "grad_norm": 2.722893409409163, "language_loss": 0.78036922, "learning_rate": 7.123043183778512e-07, "loss": 0.80382359, "num_input_tokens_seen": 130681400, "step": 6079, "time_per_iteration": 2.669877052307129 }, { "auxiliary_loss_clip": 0.01327269, "auxiliary_loss_mlp": 0.01027377, "balance_loss_clip": 1.04793644, "balance_loss_mlp": 1.02027774, "epoch": 0.731076775085673, "flos": 19791987039360.0, "grad_norm": 1.5520759635931385, "language_loss": 0.65096492, "learning_rate": 7.117083831414114e-07, "loss": 0.67451143, "num_input_tokens_seen": 130700675, "step": 6080, "time_per_iteration": 2.673048973083496 }, { "auxiliary_loss_clip": 0.0117525, "auxiliary_loss_mlp": 0.01027401, "balance_loss_clip": 1.05097771, "balance_loss_mlp": 1.02068996, "epoch": 0.7311970179763122, "flos": 20447033414400.0, "grad_norm": 1.8958256068990107, "language_loss": 0.69969064, "learning_rate": 7.11112643337787e-07, "loss": 0.72171712, "num_input_tokens_seen": 130719720, "step": 6081, "time_per_iteration": 2.58666729927063 }, { "auxiliary_loss_clip": 0.01282313, "auxiliary_loss_mlp": 0.01029219, "balance_loss_clip": 1.05257154, "balance_loss_mlp": 1.02167308, "epoch": 0.7313172608669513, "flos": 18513818501760.0, "grad_norm": 2.516180017852946, "language_loss": 0.77061242, "learning_rate": 7.10517099057349e-07, "loss": 0.79372776, "num_input_tokens_seen": 130736670, "step": 6082, "time_per_iteration": 2.6469478607177734 }, { "auxiliary_loss_clip": 0.01278278, "auxiliary_loss_mlp": 0.01023195, "balance_loss_clip": 1.04827762, "balance_loss_mlp": 1.01568449, "epoch": 0.7314375037575903, "flos": 16180738410240.0, "grad_norm": 2.327576560898737, "language_loss": 0.61489546, "learning_rate": 7.099217503904411e-07, "loss": 0.63791013, "num_input_tokens_seen": 130754525, "step": 6083, "time_per_iteration": 2.720355987548828 }, { "auxiliary_loss_clip": 0.0127907, "auxiliary_loss_mlp": 0.01028652, "balance_loss_clip": 1.0488534, "balance_loss_mlp": 1.02208018, "epoch": 0.7315577466482295, "flos": 17967940536960.0, "grad_norm": 1.8149189464498579, "language_loss": 0.90384626, "learning_rate": 7.093265974273788e-07, "loss": 0.92692345, "num_input_tokens_seen": 130772420, "step": 6084, "time_per_iteration": 2.738978624343872 }, { "auxiliary_loss_clip": 0.01227413, "auxiliary_loss_mlp": 0.01025311, "balance_loss_clip": 1.04810977, "balance_loss_mlp": 1.01898146, "epoch": 0.7316779895388685, "flos": 18405440190720.0, "grad_norm": 2.1762594004810882, "language_loss": 0.71535742, "learning_rate": 7.087316402584447e-07, "loss": 0.7378847, "num_input_tokens_seen": 130791245, "step": 6085, "time_per_iteration": 2.631948471069336 }, { "auxiliary_loss_clip": 0.01174775, "auxiliary_loss_mlp": 0.01022724, "balance_loss_clip": 1.04970574, "balance_loss_mlp": 1.01567912, "epoch": 0.7317982324295076, "flos": 17928294900480.0, "grad_norm": 2.1121681794781404, "language_loss": 0.86156547, "learning_rate": 7.081368789738953e-07, "loss": 0.88354051, "num_input_tokens_seen": 130808445, "step": 6086, "time_per_iteration": 2.5991873741149902 }, { "auxiliary_loss_clip": 0.01271706, "auxiliary_loss_mlp": 0.01029749, "balance_loss_clip": 1.04373991, "balance_loss_mlp": 1.02268589, "epoch": 0.7319184753201466, "flos": 27229840289280.0, "grad_norm": 2.2037119070427376, "language_loss": 0.77738649, "learning_rate": 7.075423136639537e-07, "loss": 0.80040103, "num_input_tokens_seen": 130827700, "step": 6087, "time_per_iteration": 3.6870412826538086 }, { "auxiliary_loss_clip": 0.01320229, "auxiliary_loss_mlp": 0.01024685, "balance_loss_clip": 1.04422855, "balance_loss_mlp": 1.01733625, "epoch": 0.7320387182107858, "flos": 37448544574080.0, "grad_norm": 1.8700745029310044, "language_loss": 0.74674839, "learning_rate": 7.069479444188149e-07, "loss": 0.77019751, "num_input_tokens_seen": 130848290, "step": 6088, "time_per_iteration": 2.822258472442627 }, { "auxiliary_loss_clip": 0.01270936, "auxiliary_loss_mlp": 0.01026035, "balance_loss_clip": 1.04613662, "balance_loss_mlp": 1.01809537, "epoch": 0.7321589611014249, "flos": 17859023521920.0, "grad_norm": 2.1323902100371854, "language_loss": 0.82255948, "learning_rate": 7.063537713286453e-07, "loss": 0.8455292, "num_input_tokens_seen": 130865970, "step": 6089, "time_per_iteration": 3.5812249183654785 }, { "auxiliary_loss_clip": 0.01278367, "auxiliary_loss_mlp": 0.0102397, "balance_loss_clip": 1.04631495, "balance_loss_mlp": 1.01673126, "epoch": 0.7322792039920639, "flos": 26100593539200.0, "grad_norm": 2.027506922174194, "language_loss": 0.8074525, "learning_rate": 7.057597944835803e-07, "loss": 0.83047587, "num_input_tokens_seen": 130885245, "step": 6090, "time_per_iteration": 2.7118425369262695 }, { "auxiliary_loss_clip": 0.01329436, "auxiliary_loss_mlp": 0.01025787, "balance_loss_clip": 1.04522896, "balance_loss_mlp": 1.01882815, "epoch": 0.7323994468827031, "flos": 25369093065600.0, "grad_norm": 2.3347493402586617, "language_loss": 0.74779356, "learning_rate": 7.051660139737253e-07, "loss": 0.77134579, "num_input_tokens_seen": 130903465, "step": 6091, "time_per_iteration": 2.711240291595459 }, { "auxiliary_loss_clip": 0.01221996, "auxiliary_loss_mlp": 0.02565838, "balance_loss_clip": 1.04886866, "balance_loss_mlp": 0.99990833, "epoch": 0.7325196897733421, "flos": 26907075653760.0, "grad_norm": 2.8628177884795565, "language_loss": 0.76765448, "learning_rate": 7.045724298891565e-07, "loss": 0.80553281, "num_input_tokens_seen": 130922935, "step": 6092, "time_per_iteration": 2.6162407398223877 }, { "auxiliary_loss_clip": 0.01223872, "auxiliary_loss_mlp": 0.01027557, "balance_loss_clip": 1.05012047, "balance_loss_mlp": 1.02001715, "epoch": 0.7326399326639812, "flos": 25775781828480.0, "grad_norm": 2.019446308261088, "language_loss": 0.69060481, "learning_rate": 7.039790423199192e-07, "loss": 0.71311903, "num_input_tokens_seen": 130942575, "step": 6093, "time_per_iteration": 2.6447596549987793 }, { "auxiliary_loss_clip": 0.01278059, "auxiliary_loss_mlp": 0.01027068, "balance_loss_clip": 1.04885745, "balance_loss_mlp": 1.01990104, "epoch": 0.7327601755546204, "flos": 21032269706880.0, "grad_norm": 2.174853608286618, "language_loss": 0.78094131, "learning_rate": 7.033858513560322e-07, "loss": 0.80399263, "num_input_tokens_seen": 130958870, "step": 6094, "time_per_iteration": 2.642805337905884 }, { "auxiliary_loss_clip": 0.0122768, "auxiliary_loss_mlp": 0.01025413, "balance_loss_clip": 1.05220413, "balance_loss_mlp": 1.01860952, "epoch": 0.7328804184452594, "flos": 16289224462080.0, "grad_norm": 2.787957809094547, "language_loss": 0.76285326, "learning_rate": 7.027928570874794e-07, "loss": 0.78538418, "num_input_tokens_seen": 130977060, "step": 6095, "time_per_iteration": 2.593320608139038 }, { "auxiliary_loss_clip": 0.01172889, "auxiliary_loss_mlp": 0.01026848, "balance_loss_clip": 1.04906642, "balance_loss_mlp": 1.0197041, "epoch": 0.7330006613358985, "flos": 17858233422720.0, "grad_norm": 1.811052333871359, "language_loss": 0.85599881, "learning_rate": 7.022000596042194e-07, "loss": 0.87799621, "num_input_tokens_seen": 130994160, "step": 6096, "time_per_iteration": 3.5031301975250244 }, { "auxiliary_loss_clip": 0.01322148, "auxiliary_loss_mlp": 0.0102571, "balance_loss_clip": 1.04207587, "balance_loss_mlp": 1.01894152, "epoch": 0.7331209042265376, "flos": 22492074343680.0, "grad_norm": 2.4535247860022062, "language_loss": 0.81815481, "learning_rate": 7.016074589961784e-07, "loss": 0.84163332, "num_input_tokens_seen": 131012725, "step": 6097, "time_per_iteration": 2.785069465637207 }, { "auxiliary_loss_clip": 0.01277051, "auxiliary_loss_mlp": 0.01027493, "balance_loss_clip": 1.05009389, "balance_loss_mlp": 1.02079356, "epoch": 0.7332411471171767, "flos": 33072757937280.0, "grad_norm": 2.116362364164424, "language_loss": 0.66988432, "learning_rate": 7.01015055353253e-07, "loss": 0.6929298, "num_input_tokens_seen": 131035150, "step": 6098, "time_per_iteration": 2.7459042072296143 }, { "auxiliary_loss_clip": 0.01372963, "auxiliary_loss_mlp": 0.01024253, "balance_loss_clip": 1.04598081, "balance_loss_mlp": 1.01686513, "epoch": 0.7333613900078157, "flos": 22743017735040.0, "grad_norm": 1.9279618784701693, "language_loss": 0.78223872, "learning_rate": 7.004228487653123e-07, "loss": 0.80621088, "num_input_tokens_seen": 131055955, "step": 6099, "time_per_iteration": 2.740433931350708 }, { "auxiliary_loss_clip": 0.01324243, "auxiliary_loss_mlp": 0.0102509, "balance_loss_clip": 1.04072952, "balance_loss_mlp": 1.01789904, "epoch": 0.7334816328984549, "flos": 22346133384960.0, "grad_norm": 2.0401832219985208, "language_loss": 0.78049815, "learning_rate": 6.998308393221906e-07, "loss": 0.80399156, "num_input_tokens_seen": 131074360, "step": 6100, "time_per_iteration": 2.6887688636779785 }, { "auxiliary_loss_clip": 0.01327203, "auxiliary_loss_mlp": 0.01024613, "balance_loss_clip": 1.04663336, "balance_loss_mlp": 1.01839066, "epoch": 0.733601875789094, "flos": 20736149984640.0, "grad_norm": 2.9770352403663543, "language_loss": 0.71003383, "learning_rate": 6.992390271136977e-07, "loss": 0.73355204, "num_input_tokens_seen": 131090070, "step": 6101, "time_per_iteration": 2.72068190574646 }, { "auxiliary_loss_clip": 0.01220094, "auxiliary_loss_mlp": 0.01023536, "balance_loss_clip": 1.04524732, "balance_loss_mlp": 1.0166043, "epoch": 0.733722118679733, "flos": 22564362464640.0, "grad_norm": 3.667229653719235, "language_loss": 0.85862792, "learning_rate": 6.986474122296094e-07, "loss": 0.88106424, "num_input_tokens_seen": 131109185, "step": 6102, "time_per_iteration": 2.629157543182373 }, { "auxiliary_loss_clip": 0.01181607, "auxiliary_loss_mlp": 0.01024184, "balance_loss_clip": 1.05328441, "balance_loss_mlp": 1.01678681, "epoch": 0.7338423615703722, "flos": 20084192179200.0, "grad_norm": 1.8044303836480957, "language_loss": 0.72291887, "learning_rate": 6.980559947596751e-07, "loss": 0.7449767, "num_input_tokens_seen": 131127725, "step": 6103, "time_per_iteration": 2.5883724689483643 }, { "auxiliary_loss_clip": 0.01372175, "auxiliary_loss_mlp": 0.01026448, "balance_loss_clip": 1.04364038, "balance_loss_mlp": 1.01927781, "epoch": 0.7339626044610112, "flos": 21687675217920.0, "grad_norm": 1.9667315582819909, "language_loss": 0.75952697, "learning_rate": 6.974647747936109e-07, "loss": 0.78351319, "num_input_tokens_seen": 131146110, "step": 6104, "time_per_iteration": 2.695183515548706 }, { "auxiliary_loss_clip": 0.01177517, "auxiliary_loss_mlp": 0.0256486, "balance_loss_clip": 1.05141926, "balance_loss_mlp": 0.99995828, "epoch": 0.7340828473516503, "flos": 15268248282240.0, "grad_norm": 1.9252103848600302, "language_loss": 0.82430947, "learning_rate": 6.968737524211039e-07, "loss": 0.86173326, "num_input_tokens_seen": 131162920, "step": 6105, "time_per_iteration": 2.6466257572174072 }, { "auxiliary_loss_clip": 0.01224073, "auxiliary_loss_mlp": 0.01022828, "balance_loss_clip": 1.04873538, "balance_loss_mlp": 1.01584864, "epoch": 0.7342030902422895, "flos": 22930112701440.0, "grad_norm": 2.4481391884213464, "language_loss": 0.80456436, "learning_rate": 6.962829277318132e-07, "loss": 0.8270334, "num_input_tokens_seen": 131182515, "step": 6106, "time_per_iteration": 2.6095941066741943 }, { "auxiliary_loss_clip": 0.01228356, "auxiliary_loss_mlp": 0.01024278, "balance_loss_clip": 1.05202174, "balance_loss_mlp": 1.01760256, "epoch": 0.7343233331329285, "flos": 25847890381440.0, "grad_norm": 1.8470694917955117, "language_loss": 0.83731669, "learning_rate": 6.956923008153652e-07, "loss": 0.85984302, "num_input_tokens_seen": 131202280, "step": 6107, "time_per_iteration": 2.654435873031616 }, { "auxiliary_loss_clip": 0.01226942, "auxiliary_loss_mlp": 0.0102823, "balance_loss_clip": 1.04845238, "balance_loss_mlp": 1.02201343, "epoch": 0.7344435760235676, "flos": 18478985287680.0, "grad_norm": 2.3335057693210257, "language_loss": 0.84187281, "learning_rate": 6.951018717613593e-07, "loss": 0.86442459, "num_input_tokens_seen": 131221295, "step": 6108, "time_per_iteration": 2.6262354850769043 }, { "auxiliary_loss_clip": 0.01223613, "auxiliary_loss_mlp": 0.01024023, "balance_loss_clip": 1.04918277, "balance_loss_mlp": 1.0169481, "epoch": 0.7345638189142067, "flos": 17640040256640.0, "grad_norm": 2.5129570874396028, "language_loss": 0.78405905, "learning_rate": 6.945116406593614e-07, "loss": 0.80653536, "num_input_tokens_seen": 131240150, "step": 6109, "time_per_iteration": 2.615175724029541 }, { "auxiliary_loss_clip": 0.01376926, "auxiliary_loss_mlp": 0.01028621, "balance_loss_clip": 1.04672718, "balance_loss_mlp": 1.02201653, "epoch": 0.7346840618048458, "flos": 20260225756800.0, "grad_norm": 2.239542761454538, "language_loss": 0.74537098, "learning_rate": 6.939216075989089e-07, "loss": 0.76942647, "num_input_tokens_seen": 131258080, "step": 6110, "time_per_iteration": 2.7062175273895264 }, { "auxiliary_loss_clip": 0.01271007, "auxiliary_loss_mlp": 0.01025705, "balance_loss_clip": 1.04669464, "balance_loss_mlp": 1.01806927, "epoch": 0.7348043046954849, "flos": 29023183641600.0, "grad_norm": 2.2948383265944914, "language_loss": 0.65881139, "learning_rate": 6.933317726695109e-07, "loss": 0.68177849, "num_input_tokens_seen": 131279310, "step": 6111, "time_per_iteration": 2.7970571517944336 }, { "auxiliary_loss_clip": 0.01327712, "auxiliary_loss_mlp": 0.01027204, "balance_loss_clip": 1.05179, "balance_loss_mlp": 1.02050722, "epoch": 0.734924547586124, "flos": 17931203902080.0, "grad_norm": 2.5268833536747697, "language_loss": 0.80018288, "learning_rate": 6.92742135960644e-07, "loss": 0.82373202, "num_input_tokens_seen": 131297010, "step": 6112, "time_per_iteration": 2.6629130840301514 }, { "auxiliary_loss_clip": 0.01119622, "auxiliary_loss_mlp": 0.0099942, "balance_loss_clip": 1.01078415, "balance_loss_mlp": 0.99845451, "epoch": 0.7350447904767631, "flos": 63588319850880.0, "grad_norm": 0.8256064458886008, "language_loss": 0.55619931, "learning_rate": 6.921526975617556e-07, "loss": 0.57738972, "num_input_tokens_seen": 131356470, "step": 6113, "time_per_iteration": 4.128159761428833 }, { "auxiliary_loss_clip": 0.01178755, "auxiliary_loss_mlp": 0.01028502, "balance_loss_clip": 1.04647827, "balance_loss_mlp": 1.02100945, "epoch": 0.7351650333674021, "flos": 21580015178880.0, "grad_norm": 1.985047746779728, "language_loss": 0.75243199, "learning_rate": 6.915634575622631e-07, "loss": 0.77450454, "num_input_tokens_seen": 131374985, "step": 6114, "time_per_iteration": 3.7653064727783203 }, { "auxiliary_loss_clip": 0.01174468, "auxiliary_loss_mlp": 0.01023339, "balance_loss_clip": 1.04912758, "balance_loss_mlp": 1.01662791, "epoch": 0.7352852762580413, "flos": 18186349184640.0, "grad_norm": 2.1457197717855427, "language_loss": 0.71091366, "learning_rate": 6.909744160515532e-07, "loss": 0.73289168, "num_input_tokens_seen": 131393125, "step": 6115, "time_per_iteration": 3.530716896057129 }, { "auxiliary_loss_clip": 0.01269581, "auxiliary_loss_mlp": 0.01020017, "balance_loss_clip": 1.04787481, "balance_loss_mlp": 1.01295733, "epoch": 0.7354055191486804, "flos": 38910073063680.0, "grad_norm": 3.968505615926375, "language_loss": 0.69527054, "learning_rate": 6.903855731189849e-07, "loss": 0.71816653, "num_input_tokens_seen": 131415760, "step": 6116, "time_per_iteration": 2.8107709884643555 }, { "auxiliary_loss_clip": 0.01283337, "auxiliary_loss_mlp": 0.01023916, "balance_loss_clip": 1.0494802, "balance_loss_mlp": 1.01621521, "epoch": 0.7355257620393194, "flos": 16289978647680.0, "grad_norm": 3.879971792912543, "language_loss": 0.81662273, "learning_rate": 6.897969288538825e-07, "loss": 0.83969527, "num_input_tokens_seen": 131433705, "step": 6117, "time_per_iteration": 2.6271846294403076 }, { "auxiliary_loss_clip": 0.01265299, "auxiliary_loss_mlp": 0.01029147, "balance_loss_clip": 1.04555368, "balance_loss_mlp": 1.02176213, "epoch": 0.7356460049299585, "flos": 18114240631680.0, "grad_norm": 1.866329697517354, "language_loss": 0.81485051, "learning_rate": 6.892084833455452e-07, "loss": 0.8377949, "num_input_tokens_seen": 131453275, "step": 6118, "time_per_iteration": 2.6525349617004395 }, { "auxiliary_loss_clip": 0.01221977, "auxiliary_loss_mlp": 0.01024791, "balance_loss_clip": 1.04922473, "balance_loss_mlp": 1.01863956, "epoch": 0.7357662478205976, "flos": 21325193118720.0, "grad_norm": 1.7284763893768438, "language_loss": 0.84059083, "learning_rate": 6.886202366832384e-07, "loss": 0.86305851, "num_input_tokens_seen": 131474960, "step": 6119, "time_per_iteration": 2.6830334663391113 }, { "auxiliary_loss_clip": 0.01378105, "auxiliary_loss_mlp": 0.01026429, "balance_loss_clip": 1.04927826, "balance_loss_mlp": 1.01900506, "epoch": 0.7358864907112367, "flos": 14246841139200.0, "grad_norm": 1.9383319453275663, "language_loss": 0.73681742, "learning_rate": 6.880321889561987e-07, "loss": 0.76086277, "num_input_tokens_seen": 131492935, "step": 6120, "time_per_iteration": 2.730783700942993 }, { "auxiliary_loss_clip": 0.01319631, "auxiliary_loss_mlp": 0.01029257, "balance_loss_clip": 1.04500675, "balance_loss_mlp": 1.02147913, "epoch": 0.7360067336018757, "flos": 22309684058880.0, "grad_norm": 2.358475084418294, "language_loss": 0.65585488, "learning_rate": 6.874443402536338e-07, "loss": 0.67934382, "num_input_tokens_seen": 131512025, "step": 6121, "time_per_iteration": 2.7054529190063477 }, { "auxiliary_loss_clip": 0.01279056, "auxiliary_loss_mlp": 0.01026559, "balance_loss_clip": 1.05000377, "balance_loss_mlp": 1.01945138, "epoch": 0.7361269764925149, "flos": 25554607833600.0, "grad_norm": 25.312146166150775, "language_loss": 0.80305982, "learning_rate": 6.868566906647177e-07, "loss": 0.82611597, "num_input_tokens_seen": 131532975, "step": 6122, "time_per_iteration": 3.609323501586914 }, { "auxiliary_loss_clip": 0.01226144, "auxiliary_loss_mlp": 0.0102318, "balance_loss_clip": 1.04821718, "balance_loss_mlp": 1.01566935, "epoch": 0.736247219383154, "flos": 20376505059840.0, "grad_norm": 2.7021712317982556, "language_loss": 0.83626795, "learning_rate": 6.862692402785984e-07, "loss": 0.85876119, "num_input_tokens_seen": 131553225, "step": 6123, "time_per_iteration": 2.6267597675323486 }, { "auxiliary_loss_clip": 0.0123308, "auxiliary_loss_mlp": 0.01002204, "balance_loss_clip": 1.02036858, "balance_loss_mlp": 1.0011847, "epoch": 0.736367462273793, "flos": 70339525735680.0, "grad_norm": 0.6848392950552733, "language_loss": 0.49549723, "learning_rate": 6.856819891843899e-07, "loss": 0.5178501, "num_input_tokens_seen": 131617930, "step": 6124, "time_per_iteration": 3.350579023361206 }, { "auxiliary_loss_clip": 0.0142323, "auxiliary_loss_mlp": 0.01024805, "balance_loss_clip": 1.04595232, "balance_loss_mlp": 1.0174377, "epoch": 0.7364877051644322, "flos": 22412711243520.0, "grad_norm": 2.095752608200626, "language_loss": 0.72533894, "learning_rate": 6.8509493747118e-07, "loss": 0.74981928, "num_input_tokens_seen": 131636740, "step": 6125, "time_per_iteration": 2.7668466567993164 }, { "auxiliary_loss_clip": 0.01177003, "auxiliary_loss_mlp": 0.0102517, "balance_loss_clip": 1.05254745, "balance_loss_mlp": 1.01802933, "epoch": 0.7366079480550712, "flos": 12130266274560.0, "grad_norm": 2.954277180450541, "language_loss": 0.87989116, "learning_rate": 6.845080852280221e-07, "loss": 0.90191287, "num_input_tokens_seen": 131653810, "step": 6126, "time_per_iteration": 2.5901620388031006 }, { "auxiliary_loss_clip": 0.01324265, "auxiliary_loss_mlp": 0.01028125, "balance_loss_clip": 1.0460248, "balance_loss_mlp": 1.02151775, "epoch": 0.7367281909457103, "flos": 15049336844160.0, "grad_norm": 1.5991712429626839, "language_loss": 0.74560392, "learning_rate": 6.839214325439409e-07, "loss": 0.76912785, "num_input_tokens_seen": 131671505, "step": 6127, "time_per_iteration": 2.752765417098999 }, { "auxiliary_loss_clip": 0.01267799, "auxiliary_loss_mlp": 0.01027209, "balance_loss_clip": 1.04946923, "balance_loss_mlp": 1.02024078, "epoch": 0.7368484338363495, "flos": 23510752053120.0, "grad_norm": 1.632968537987518, "language_loss": 0.72088242, "learning_rate": 6.833349795079327e-07, "loss": 0.74383247, "num_input_tokens_seen": 131690615, "step": 6128, "time_per_iteration": 2.6660990715026855 }, { "auxiliary_loss_clip": 0.01321052, "auxiliary_loss_mlp": 0.01023147, "balance_loss_clip": 1.04718876, "balance_loss_mlp": 1.01648927, "epoch": 0.7369686767269885, "flos": 27417833095680.0, "grad_norm": 2.20454923405599, "language_loss": 0.68388879, "learning_rate": 6.827487262089613e-07, "loss": 0.70733082, "num_input_tokens_seen": 131711120, "step": 6129, "time_per_iteration": 2.759269952774048 }, { "auxiliary_loss_clip": 0.01165985, "auxiliary_loss_mlp": 0.01003654, "balance_loss_clip": 1.0110333, "balance_loss_mlp": 1.00266445, "epoch": 0.7370889196176276, "flos": 70293343824000.0, "grad_norm": 0.9394468893660747, "language_loss": 0.56737435, "learning_rate": 6.821626727359606e-07, "loss": 0.5890708, "num_input_tokens_seen": 131776680, "step": 6130, "time_per_iteration": 3.287353277206421 }, { "auxiliary_loss_clip": 0.01280802, "auxiliary_loss_mlp": 0.01026374, "balance_loss_clip": 1.05293918, "balance_loss_mlp": 1.01822042, "epoch": 0.7372091625082667, "flos": 18040839189120.0, "grad_norm": 2.6969548329094777, "language_loss": 0.77351099, "learning_rate": 6.815768191778348e-07, "loss": 0.79658282, "num_input_tokens_seen": 131794760, "step": 6131, "time_per_iteration": 2.645496368408203 }, { "auxiliary_loss_clip": 0.01222434, "auxiliary_loss_mlp": 0.01028366, "balance_loss_clip": 1.04686737, "balance_loss_mlp": 1.02166677, "epoch": 0.7373294053989058, "flos": 33726331854720.0, "grad_norm": 1.823929737774238, "language_loss": 0.73134887, "learning_rate": 6.809911656234569e-07, "loss": 0.75385684, "num_input_tokens_seen": 131816735, "step": 6132, "time_per_iteration": 2.7448415756225586 }, { "auxiliary_loss_clip": 0.01326862, "auxiliary_loss_mlp": 0.01023964, "balance_loss_clip": 1.04410803, "balance_loss_mlp": 1.01721334, "epoch": 0.7374496482895448, "flos": 21506326427520.0, "grad_norm": 2.0918267042129792, "language_loss": 0.78255361, "learning_rate": 6.804057121616707e-07, "loss": 0.80606186, "num_input_tokens_seen": 131834940, "step": 6133, "time_per_iteration": 2.726217269897461 }, { "auxiliary_loss_clip": 0.0122596, "auxiliary_loss_mlp": 0.01026774, "balance_loss_clip": 1.04891813, "balance_loss_mlp": 1.02001143, "epoch": 0.737569891180184, "flos": 24936908624640.0, "grad_norm": 2.4874079412583274, "language_loss": 0.72425199, "learning_rate": 6.798204588812888e-07, "loss": 0.74677932, "num_input_tokens_seen": 131854355, "step": 6134, "time_per_iteration": 2.6369259357452393 }, { "auxiliary_loss_clip": 0.01414175, "auxiliary_loss_mlp": 0.02565524, "balance_loss_clip": 1.03984928, "balance_loss_mlp": 0.9999311, "epoch": 0.7376901340708231, "flos": 20664544222080.0, "grad_norm": 1.6955628475015843, "language_loss": 0.75494182, "learning_rate": 6.792354058710937e-07, "loss": 0.79473877, "num_input_tokens_seen": 131871825, "step": 6135, "time_per_iteration": 2.7794926166534424 }, { "auxiliary_loss_clip": 0.0116923, "auxiliary_loss_mlp": 0.01023126, "balance_loss_clip": 1.04806852, "balance_loss_mlp": 1.01662588, "epoch": 0.7378103769614621, "flos": 23805794367360.0, "grad_norm": 1.794153033073207, "language_loss": 0.65202695, "learning_rate": 6.786505532198374e-07, "loss": 0.67395049, "num_input_tokens_seen": 131890770, "step": 6136, "time_per_iteration": 2.639497756958008 }, { "auxiliary_loss_clip": 0.01176191, "auxiliary_loss_mlp": 0.01028068, "balance_loss_clip": 1.05068636, "balance_loss_mlp": 1.02039409, "epoch": 0.7379306198521013, "flos": 22237216369920.0, "grad_norm": 2.0862137506075764, "language_loss": 0.85393012, "learning_rate": 6.780659010162411e-07, "loss": 0.87597269, "num_input_tokens_seen": 131909720, "step": 6137, "time_per_iteration": 2.6010000705718994 }, { "auxiliary_loss_clip": 0.01326651, "auxiliary_loss_mlp": 0.01031084, "balance_loss_clip": 1.04839587, "balance_loss_mlp": 1.0244801, "epoch": 0.7380508627427403, "flos": 14903108576640.0, "grad_norm": 1.8507523683339795, "language_loss": 0.8311801, "learning_rate": 6.774814493489975e-07, "loss": 0.85475749, "num_input_tokens_seen": 131927395, "step": 6138, "time_per_iteration": 2.7583494186401367 }, { "auxiliary_loss_clip": 0.01221729, "auxiliary_loss_mlp": 0.01021796, "balance_loss_clip": 1.04988909, "balance_loss_mlp": 1.01497722, "epoch": 0.7381711056333794, "flos": 21685843624320.0, "grad_norm": 3.3866301240210044, "language_loss": 0.6611588, "learning_rate": 6.768971983067655e-07, "loss": 0.68359405, "num_input_tokens_seen": 131947725, "step": 6139, "time_per_iteration": 4.480640888214111 }, { "auxiliary_loss_clip": 0.01066131, "auxiliary_loss_mlp": 0.01003366, "balance_loss_clip": 1.01061654, "balance_loss_mlp": 1.00240076, "epoch": 0.7382913485240186, "flos": 52404263596800.0, "grad_norm": 1.0109881313846756, "language_loss": 0.67714351, "learning_rate": 6.763131479781772e-07, "loss": 0.69783849, "num_input_tokens_seen": 131997485, "step": 6140, "time_per_iteration": 3.008915424346924 }, { "auxiliary_loss_clip": 0.01265829, "auxiliary_loss_mlp": 0.01027469, "balance_loss_clip": 1.04726958, "balance_loss_mlp": 1.02111566, "epoch": 0.7384115914146576, "flos": 21798818876160.0, "grad_norm": 1.823764479746317, "language_loss": 0.76238334, "learning_rate": 6.757292984518316e-07, "loss": 0.78531623, "num_input_tokens_seen": 132016885, "step": 6141, "time_per_iteration": 3.5446746349334717 }, { "auxiliary_loss_clip": 0.01119399, "auxiliary_loss_mlp": 0.01001863, "balance_loss_clip": 1.01069343, "balance_loss_mlp": 1.00093961, "epoch": 0.7385318343052967, "flos": 61494331662720.0, "grad_norm": 0.7458236799814033, "language_loss": 0.56339931, "learning_rate": 6.751456498162981e-07, "loss": 0.58461195, "num_input_tokens_seen": 132075920, "step": 6142, "time_per_iteration": 3.0842173099517822 }, { "auxiliary_loss_clip": 0.01222305, "auxiliary_loss_mlp": 0.01024382, "balance_loss_clip": 1.04613316, "balance_loss_mlp": 1.01771784, "epoch": 0.7386520771959358, "flos": 17013757697280.0, "grad_norm": 4.5428663112970815, "language_loss": 0.84983462, "learning_rate": 6.745622021601174e-07, "loss": 0.87230146, "num_input_tokens_seen": 132092945, "step": 6143, "time_per_iteration": 2.6339495182037354 }, { "auxiliary_loss_clip": 0.01323608, "auxiliary_loss_mlp": 0.01021201, "balance_loss_clip": 1.04553175, "balance_loss_mlp": 1.01483846, "epoch": 0.7387723200865749, "flos": 18770759464320.0, "grad_norm": 1.783568945529018, "language_loss": 0.699049, "learning_rate": 6.739789555717954e-07, "loss": 0.72249705, "num_input_tokens_seen": 132109920, "step": 6144, "time_per_iteration": 2.667623281478882 }, { "auxiliary_loss_clip": 0.01171896, "auxiliary_loss_mlp": 0.01021236, "balance_loss_clip": 1.04770923, "balance_loss_mlp": 1.01480806, "epoch": 0.738892562977214, "flos": 22525542840960.0, "grad_norm": 1.980593886583483, "language_loss": 0.77783662, "learning_rate": 6.733959101398124e-07, "loss": 0.79976797, "num_input_tokens_seen": 132128050, "step": 6145, "time_per_iteration": 2.5931572914123535 }, { "auxiliary_loss_clip": 0.01275832, "auxiliary_loss_mlp": 0.0102805, "balance_loss_clip": 1.04601026, "balance_loss_mlp": 1.02125537, "epoch": 0.7390128058678531, "flos": 21501478091520.0, "grad_norm": 1.7005202565465418, "language_loss": 0.81334823, "learning_rate": 6.728130659526143e-07, "loss": 0.83638704, "num_input_tokens_seen": 132145860, "step": 6146, "time_per_iteration": 2.685086488723755 }, { "auxiliary_loss_clip": 0.0127521, "auxiliary_loss_mlp": 0.01023083, "balance_loss_clip": 1.04734564, "balance_loss_mlp": 1.0163691, "epoch": 0.7391330487584922, "flos": 25776176878080.0, "grad_norm": 2.444263799134435, "language_loss": 0.71669495, "learning_rate": 6.7223042309862e-07, "loss": 0.73967791, "num_input_tokens_seen": 132166060, "step": 6147, "time_per_iteration": 2.664506196975708 }, { "auxiliary_loss_clip": 0.01220653, "auxiliary_loss_mlp": 0.0102123, "balance_loss_clip": 1.04533958, "balance_loss_mlp": 1.01474237, "epoch": 0.7392532916491312, "flos": 28366736636160.0, "grad_norm": 3.587018191390677, "language_loss": 0.73699939, "learning_rate": 6.716479816662144e-07, "loss": 0.75941813, "num_input_tokens_seen": 132187790, "step": 6148, "time_per_iteration": 3.632021903991699 }, { "auxiliary_loss_clip": 0.01279813, "auxiliary_loss_mlp": 0.01031738, "balance_loss_clip": 1.04808724, "balance_loss_mlp": 1.02503824, "epoch": 0.7393735345397703, "flos": 23585877348480.0, "grad_norm": 4.074505306889431, "language_loss": 0.73427629, "learning_rate": 6.710657417437531e-07, "loss": 0.75739181, "num_input_tokens_seen": 132207495, "step": 6149, "time_per_iteration": 2.6814775466918945 }, { "auxiliary_loss_clip": 0.01274436, "auxiliary_loss_mlp": 0.01019802, "balance_loss_clip": 1.04700732, "balance_loss_mlp": 1.01320386, "epoch": 0.7394937774304094, "flos": 19974772373760.0, "grad_norm": 2.4567580256234525, "language_loss": 0.80310583, "learning_rate": 6.704837034195628e-07, "loss": 0.82604825, "num_input_tokens_seen": 132225960, "step": 6150, "time_per_iteration": 2.658297061920166 }, { "auxiliary_loss_clip": 0.01221862, "auxiliary_loss_mlp": 0.01026905, "balance_loss_clip": 1.04824162, "balance_loss_mlp": 1.02011299, "epoch": 0.7396140203210485, "flos": 23478037741440.0, "grad_norm": 2.001429538461752, "language_loss": 0.85305643, "learning_rate": 6.699018667819376e-07, "loss": 0.87554407, "num_input_tokens_seen": 132245360, "step": 6151, "time_per_iteration": 2.6916420459747314 }, { "auxiliary_loss_clip": 0.01223539, "auxiliary_loss_mlp": 0.01022, "balance_loss_clip": 1.04714322, "balance_loss_mlp": 1.0143702, "epoch": 0.7397342632116876, "flos": 25555433846400.0, "grad_norm": 1.7110665086086185, "language_loss": 0.72714448, "learning_rate": 6.693202319191415e-07, "loss": 0.74959981, "num_input_tokens_seen": 132267095, "step": 6152, "time_per_iteration": 2.6567742824554443 }, { "auxiliary_loss_clip": 0.01179154, "auxiliary_loss_mlp": 0.01031034, "balance_loss_clip": 1.05524516, "balance_loss_mlp": 1.02428341, "epoch": 0.7398545061023267, "flos": 24755021130240.0, "grad_norm": 1.7872100585442539, "language_loss": 0.74980086, "learning_rate": 6.687387989194084e-07, "loss": 0.77190274, "num_input_tokens_seen": 132286610, "step": 6153, "time_per_iteration": 2.6413331031799316 }, { "auxiliary_loss_clip": 0.01273732, "auxiliary_loss_mlp": 0.01028221, "balance_loss_clip": 1.04993224, "balance_loss_mlp": 1.02125371, "epoch": 0.7399747489929658, "flos": 16508602776960.0, "grad_norm": 2.3448350562895706, "language_loss": 0.7966342, "learning_rate": 6.681575678709404e-07, "loss": 0.81965369, "num_input_tokens_seen": 132305300, "step": 6154, "time_per_iteration": 2.641674518585205 }, { "auxiliary_loss_clip": 0.0122192, "auxiliary_loss_mlp": 0.0102609, "balance_loss_clip": 1.04843533, "balance_loss_mlp": 1.01905346, "epoch": 0.7400949918836048, "flos": 24097065753600.0, "grad_norm": 3.474892345327748, "language_loss": 0.70847154, "learning_rate": 6.67576538861911e-07, "loss": 0.73095161, "num_input_tokens_seen": 132323875, "step": 6155, "time_per_iteration": 2.671633243560791 }, { "auxiliary_loss_clip": 0.01271057, "auxiliary_loss_mlp": 0.0102409, "balance_loss_clip": 1.04686117, "balance_loss_mlp": 1.01781034, "epoch": 0.740215234774244, "flos": 21802517976960.0, "grad_norm": 3.2653861480658777, "language_loss": 0.82232809, "learning_rate": 6.669957119804612e-07, "loss": 0.84527951, "num_input_tokens_seen": 132345510, "step": 6156, "time_per_iteration": 2.7344629764556885 }, { "auxiliary_loss_clip": 0.01281902, "auxiliary_loss_mlp": 0.01024813, "balance_loss_clip": 1.04795408, "balance_loss_mlp": 1.01729083, "epoch": 0.7403354776648831, "flos": 18733196816640.0, "grad_norm": 5.119572515413475, "language_loss": 0.72654814, "learning_rate": 6.66415087314702e-07, "loss": 0.74961525, "num_input_tokens_seen": 132360465, "step": 6157, "time_per_iteration": 2.669663190841675 }, { "auxiliary_loss_clip": 0.01276483, "auxiliary_loss_mlp": 0.01029142, "balance_loss_clip": 1.04763985, "balance_loss_mlp": 1.02284491, "epoch": 0.7404557205555221, "flos": 16909581277440.0, "grad_norm": 2.56401227761443, "language_loss": 0.73168218, "learning_rate": 6.65834664952714e-07, "loss": 0.75473845, "num_input_tokens_seen": 132377915, "step": 6158, "time_per_iteration": 2.672999620437622 }, { "auxiliary_loss_clip": 0.01325712, "auxiliary_loss_mlp": 0.01022391, "balance_loss_clip": 1.04617643, "balance_loss_mlp": 1.0152626, "epoch": 0.7405759634461613, "flos": 21214408596480.0, "grad_norm": 1.596677173055138, "language_loss": 0.76052791, "learning_rate": 6.652544449825457e-07, "loss": 0.78400892, "num_input_tokens_seen": 132398170, "step": 6159, "time_per_iteration": 2.7332968711853027 }, { "auxiliary_loss_clip": 0.01180264, "auxiliary_loss_mlp": 0.01031656, "balance_loss_clip": 1.0454495, "balance_loss_mlp": 1.02429533, "epoch": 0.7406962063368003, "flos": 20480106862080.0, "grad_norm": 1.8921460118562172, "language_loss": 0.76471478, "learning_rate": 6.646744274922182e-07, "loss": 0.78683394, "num_input_tokens_seen": 132416615, "step": 6160, "time_per_iteration": 2.6834092140197754 }, { "auxiliary_loss_clip": 0.01274084, "auxiliary_loss_mlp": 0.01020489, "balance_loss_clip": 1.0456152, "balance_loss_mlp": 1.01423669, "epoch": 0.7408164492274394, "flos": 19791915212160.0, "grad_norm": 2.830884886145923, "language_loss": 0.75762028, "learning_rate": 6.640946125697171e-07, "loss": 0.78056604, "num_input_tokens_seen": 132434145, "step": 6161, "time_per_iteration": 2.6767778396606445 }, { "auxiliary_loss_clip": 0.01222864, "auxiliary_loss_mlp": 0.0102351, "balance_loss_clip": 1.04636574, "balance_loss_mlp": 1.01629817, "epoch": 0.7409366921180786, "flos": 29204855654400.0, "grad_norm": 2.048382047129729, "language_loss": 0.75823843, "learning_rate": 6.635150003030017e-07, "loss": 0.78070211, "num_input_tokens_seen": 132452670, "step": 6162, "time_per_iteration": 2.6648457050323486 }, { "auxiliary_loss_clip": 0.0127116, "auxiliary_loss_mlp": 0.01020217, "balance_loss_clip": 1.04038954, "balance_loss_mlp": 1.01361597, "epoch": 0.7410569350087176, "flos": 22930004960640.0, "grad_norm": 2.3936254852588013, "language_loss": 0.8580609, "learning_rate": 6.629355907799981e-07, "loss": 0.88097465, "num_input_tokens_seen": 132472475, "step": 6163, "time_per_iteration": 2.7692723274230957 }, { "auxiliary_loss_clip": 0.01225629, "auxiliary_loss_mlp": 0.01023832, "balance_loss_clip": 1.04740405, "balance_loss_mlp": 1.01701021, "epoch": 0.7411771778993567, "flos": 30440397726720.0, "grad_norm": 1.8190722498444787, "language_loss": 0.69331247, "learning_rate": 6.623563840886015e-07, "loss": 0.71580708, "num_input_tokens_seen": 132493400, "step": 6164, "time_per_iteration": 2.676262855529785 }, { "auxiliary_loss_clip": 0.01219354, "auxiliary_loss_mlp": 0.01020503, "balance_loss_clip": 1.04502845, "balance_loss_mlp": 1.01417649, "epoch": 0.7412974207899958, "flos": 20522050968960.0, "grad_norm": 2.1389580174744296, "language_loss": 0.69568503, "learning_rate": 6.617773803166795e-07, "loss": 0.71808362, "num_input_tokens_seen": 132511725, "step": 6165, "time_per_iteration": 4.522045135498047 }, { "auxiliary_loss_clip": 0.01278159, "auxiliary_loss_mlp": 0.02569993, "balance_loss_clip": 1.04896283, "balance_loss_mlp": 0.99993038, "epoch": 0.7414176636806349, "flos": 22090700793600.0, "grad_norm": 2.2186821490042443, "language_loss": 0.82033646, "learning_rate": 6.611985795520634e-07, "loss": 0.85881799, "num_input_tokens_seen": 132530270, "step": 6166, "time_per_iteration": 2.7216055393218994 }, { "auxiliary_loss_clip": 0.01331449, "auxiliary_loss_mlp": 0.01021186, "balance_loss_clip": 1.04783034, "balance_loss_mlp": 1.01433754, "epoch": 0.7415379065712739, "flos": 25155245445120.0, "grad_norm": 2.2711157169518144, "language_loss": 0.77351958, "learning_rate": 6.606199818825588e-07, "loss": 0.79704595, "num_input_tokens_seen": 132550725, "step": 6167, "time_per_iteration": 3.6228413581848145 }, { "auxiliary_loss_clip": 0.01277644, "auxiliary_loss_mlp": 0.01021742, "balance_loss_clip": 1.04461598, "balance_loss_mlp": 1.01485229, "epoch": 0.7416581494619131, "flos": 16871731320960.0, "grad_norm": 2.038272196802235, "language_loss": 0.81994104, "learning_rate": 6.600415873959377e-07, "loss": 0.84293497, "num_input_tokens_seen": 132568600, "step": 6168, "time_per_iteration": 2.6064140796661377 }, { "auxiliary_loss_clip": 0.01426614, "auxiliary_loss_mlp": 0.0255972, "balance_loss_clip": 1.04249763, "balance_loss_mlp": 0.99995553, "epoch": 0.7417783923525522, "flos": 28438881102720.0, "grad_norm": 2.2479224658344052, "language_loss": 0.64751232, "learning_rate": 6.594633961799437e-07, "loss": 0.68737561, "num_input_tokens_seen": 132587640, "step": 6169, "time_per_iteration": 2.8245465755462646 }, { "auxiliary_loss_clip": 0.01331665, "auxiliary_loss_mlp": 0.01027962, "balance_loss_clip": 1.04672706, "balance_loss_mlp": 1.02078271, "epoch": 0.7418986352431912, "flos": 20084299920000.0, "grad_norm": 2.0757582409588324, "language_loss": 0.8150543, "learning_rate": 6.588854083222857e-07, "loss": 0.83865058, "num_input_tokens_seen": 132607075, "step": 6170, "time_per_iteration": 2.6961748600006104 }, { "auxiliary_loss_clip": 0.01279114, "auxiliary_loss_mlp": 0.01027381, "balance_loss_clip": 1.04889083, "balance_loss_mlp": 1.02037752, "epoch": 0.7420188781338304, "flos": 18259571059200.0, "grad_norm": 2.2072623741189608, "language_loss": 0.81112421, "learning_rate": 6.583076239106444e-07, "loss": 0.83418918, "num_input_tokens_seen": 132625580, "step": 6171, "time_per_iteration": 2.652164936065674 }, { "auxiliary_loss_clip": 0.0127874, "auxiliary_loss_mlp": 0.01025844, "balance_loss_clip": 1.04799271, "balance_loss_mlp": 1.01853609, "epoch": 0.7421391210244694, "flos": 13771994319360.0, "grad_norm": 2.477547769258423, "language_loss": 0.75620103, "learning_rate": 6.577300430326707e-07, "loss": 0.77924693, "num_input_tokens_seen": 132640525, "step": 6172, "time_per_iteration": 2.6244242191314697 }, { "auxiliary_loss_clip": 0.01319017, "auxiliary_loss_mlp": 0.01025506, "balance_loss_clip": 1.04579306, "balance_loss_mlp": 1.01855576, "epoch": 0.7422593639151085, "flos": 15961683317760.0, "grad_norm": 2.263507599012854, "language_loss": 0.72378159, "learning_rate": 6.571526657759821e-07, "loss": 0.74722683, "num_input_tokens_seen": 132656265, "step": 6173, "time_per_iteration": 2.6899800300598145 }, { "auxiliary_loss_clip": 0.01219657, "auxiliary_loss_mlp": 0.01028081, "balance_loss_clip": 1.04526389, "balance_loss_mlp": 1.02087772, "epoch": 0.7423796068057477, "flos": 30114400867200.0, "grad_norm": 1.6654101620058839, "language_loss": 0.70658493, "learning_rate": 6.565754922281663e-07, "loss": 0.72906232, "num_input_tokens_seen": 132678510, "step": 6174, "time_per_iteration": 3.57859206199646 }, { "auxiliary_loss_clip": 0.01277505, "auxiliary_loss_mlp": 0.01024643, "balance_loss_clip": 1.04678655, "balance_loss_mlp": 1.0176127, "epoch": 0.7424998496963867, "flos": 20521907314560.0, "grad_norm": 3.495495032736239, "language_loss": 0.78585285, "learning_rate": 6.559985224767801e-07, "loss": 0.80887431, "num_input_tokens_seen": 132696385, "step": 6175, "time_per_iteration": 2.6502761840820312 }, { "auxiliary_loss_clip": 0.01228208, "auxiliary_loss_mlp": 0.01031397, "balance_loss_clip": 1.04811358, "balance_loss_mlp": 1.02429497, "epoch": 0.7426200925870258, "flos": 21871573873920.0, "grad_norm": 2.831937810618245, "language_loss": 0.7576409, "learning_rate": 6.55421756609349e-07, "loss": 0.7802369, "num_input_tokens_seen": 132714640, "step": 6176, "time_per_iteration": 2.7221741676330566 }, { "auxiliary_loss_clip": 0.01228938, "auxiliary_loss_mlp": 0.0102843, "balance_loss_clip": 1.05363166, "balance_loss_mlp": 1.02160835, "epoch": 0.7427403354776649, "flos": 26432049265920.0, "grad_norm": 2.3554204455130003, "language_loss": 0.78962564, "learning_rate": 6.54845194713369e-07, "loss": 0.81219935, "num_input_tokens_seen": 132735590, "step": 6177, "time_per_iteration": 2.602095603942871 }, { "auxiliary_loss_clip": 0.01223014, "auxiliary_loss_mlp": 0.01026549, "balance_loss_clip": 1.048437, "balance_loss_mlp": 1.01971197, "epoch": 0.742860578368304, "flos": 19898390102400.0, "grad_norm": 2.5574737756543824, "language_loss": 0.79952013, "learning_rate": 6.542688368763034e-07, "loss": 0.82201576, "num_input_tokens_seen": 132753995, "step": 6178, "time_per_iteration": 2.6224911212921143 }, { "auxiliary_loss_clip": 0.01221978, "auxiliary_loss_mlp": 0.0102498, "balance_loss_clip": 1.05015481, "balance_loss_mlp": 1.01833093, "epoch": 0.742980821258943, "flos": 24827201510400.0, "grad_norm": 2.5280376847795893, "language_loss": 0.76975441, "learning_rate": 6.536926831855854e-07, "loss": 0.79222393, "num_input_tokens_seen": 132773160, "step": 6179, "time_per_iteration": 2.6224327087402344 }, { "auxiliary_loss_clip": 0.01266488, "auxiliary_loss_mlp": 0.01025161, "balance_loss_clip": 1.04732084, "balance_loss_mlp": 1.01818109, "epoch": 0.7431010641495821, "flos": 25228646887680.0, "grad_norm": 2.3107245435179453, "language_loss": 0.73248649, "learning_rate": 6.531167337286165e-07, "loss": 0.75540292, "num_input_tokens_seen": 132793180, "step": 6180, "time_per_iteration": 2.6970276832580566 }, { "auxiliary_loss_clip": 0.01268142, "auxiliary_loss_mlp": 0.01025082, "balance_loss_clip": 1.04822397, "balance_loss_mlp": 1.01883268, "epoch": 0.7432213070402213, "flos": 21762369550080.0, "grad_norm": 1.7152318984043387, "language_loss": 0.79823136, "learning_rate": 6.52540988592768e-07, "loss": 0.82116365, "num_input_tokens_seen": 132814200, "step": 6181, "time_per_iteration": 2.668212890625 }, { "auxiliary_loss_clip": 0.01274834, "auxiliary_loss_mlp": 0.01021977, "balance_loss_clip": 1.04740286, "balance_loss_mlp": 1.01509535, "epoch": 0.7433415499308603, "flos": 14793832425600.0, "grad_norm": 2.0547363880467113, "language_loss": 0.83476758, "learning_rate": 6.519654478653814e-07, "loss": 0.85773569, "num_input_tokens_seen": 132832565, "step": 6182, "time_per_iteration": 2.739183187484741 }, { "auxiliary_loss_clip": 0.01176791, "auxiliary_loss_mlp": 0.00999544, "balance_loss_clip": 1.01084042, "balance_loss_mlp": 0.99863797, "epoch": 0.7434617928214994, "flos": 67155577297920.0, "grad_norm": 0.7461348572111697, "language_loss": 0.56071663, "learning_rate": 6.51390111633763e-07, "loss": 0.58247995, "num_input_tokens_seen": 132897840, "step": 6183, "time_per_iteration": 3.2973554134368896 }, { "auxiliary_loss_clip": 0.01425856, "auxiliary_loss_mlp": 0.01027034, "balance_loss_clip": 1.0408442, "balance_loss_mlp": 1.02012599, "epoch": 0.7435820357121385, "flos": 27377576928000.0, "grad_norm": 1.658433612952696, "language_loss": 0.76211667, "learning_rate": 6.508149799851932e-07, "loss": 0.78664559, "num_input_tokens_seen": 132919505, "step": 6184, "time_per_iteration": 2.840946674346924 }, { "auxiliary_loss_clip": 0.01268423, "auxiliary_loss_mlp": 0.01023271, "balance_loss_clip": 1.04721832, "balance_loss_mlp": 1.01697361, "epoch": 0.7437022786027776, "flos": 23987645948160.0, "grad_norm": 2.7391642297892673, "language_loss": 0.61581111, "learning_rate": 6.502400530069183e-07, "loss": 0.63872808, "num_input_tokens_seen": 132939390, "step": 6185, "time_per_iteration": 2.7304325103759766 }, { "auxiliary_loss_clip": 0.01323687, "auxiliary_loss_mlp": 0.01027723, "balance_loss_clip": 1.04771972, "balance_loss_mlp": 1.01987922, "epoch": 0.7438225214934167, "flos": 21866761451520.0, "grad_norm": 1.9796226764208247, "language_loss": 0.68382335, "learning_rate": 6.496653307861535e-07, "loss": 0.70733738, "num_input_tokens_seen": 132960060, "step": 6186, "time_per_iteration": 2.732464075088501 }, { "auxiliary_loss_clip": 0.01230798, "auxiliary_loss_mlp": 0.01029067, "balance_loss_clip": 1.04888511, "balance_loss_mlp": 1.02232289, "epoch": 0.7439427643840558, "flos": 20230097224320.0, "grad_norm": 1.8643467178595983, "language_loss": 0.66104394, "learning_rate": 6.490908134100857e-07, "loss": 0.68364263, "num_input_tokens_seen": 132978525, "step": 6187, "time_per_iteration": 2.6495859622955322 }, { "auxiliary_loss_clip": 0.01229755, "auxiliary_loss_mlp": 0.01026075, "balance_loss_clip": 1.04971886, "balance_loss_mlp": 1.0188632, "epoch": 0.7440630072746949, "flos": 20849915335680.0, "grad_norm": 4.223440423536163, "language_loss": 0.69228286, "learning_rate": 6.48516500965866e-07, "loss": 0.71484113, "num_input_tokens_seen": 132998460, "step": 6188, "time_per_iteration": 2.6401450634002686 }, { "auxiliary_loss_clip": 0.01228842, "auxiliary_loss_mlp": 0.01027292, "balance_loss_clip": 1.04731894, "balance_loss_mlp": 1.01982021, "epoch": 0.7441832501653339, "flos": 26503762769280.0, "grad_norm": 2.154401496009429, "language_loss": 0.81692988, "learning_rate": 6.479423935406192e-07, "loss": 0.83949125, "num_input_tokens_seen": 133018445, "step": 6189, "time_per_iteration": 2.6666197776794434 }, { "auxiliary_loss_clip": 0.01165353, "auxiliary_loss_mlp": 0.0100587, "balance_loss_clip": 1.0128324, "balance_loss_mlp": 1.00504708, "epoch": 0.7443034930559731, "flos": 68602848088320.0, "grad_norm": 0.8110917338304526, "language_loss": 0.61974454, "learning_rate": 6.473684912214357e-07, "loss": 0.64145678, "num_input_tokens_seen": 133082005, "step": 6190, "time_per_iteration": 3.3562440872192383 }, { "auxiliary_loss_clip": 0.01223233, "auxiliary_loss_mlp": 0.01028389, "balance_loss_clip": 1.04996681, "balance_loss_mlp": 1.02177286, "epoch": 0.7444237359466122, "flos": 18654982951680.0, "grad_norm": 2.4405602781206595, "language_loss": 0.69138944, "learning_rate": 6.467947940953778e-07, "loss": 0.71390563, "num_input_tokens_seen": 133100530, "step": 6191, "time_per_iteration": 4.377862215042114 }, { "auxiliary_loss_clip": 0.01269473, "auxiliary_loss_mlp": 0.01023145, "balance_loss_clip": 1.04581237, "balance_loss_mlp": 1.01718783, "epoch": 0.7445439788372512, "flos": 22817604326400.0, "grad_norm": 1.9023915620191663, "language_loss": 0.72610456, "learning_rate": 6.462213022494732e-07, "loss": 0.74903071, "num_input_tokens_seen": 133119775, "step": 6192, "time_per_iteration": 2.7209227085113525 }, { "auxiliary_loss_clip": 0.01122267, "auxiliary_loss_mlp": 0.01001529, "balance_loss_clip": 1.01050997, "balance_loss_mlp": 1.00065255, "epoch": 0.7446642217278904, "flos": 67045690615680.0, "grad_norm": 0.7687749830276606, "language_loss": 0.60979927, "learning_rate": 6.456480157707201e-07, "loss": 0.63103724, "num_input_tokens_seen": 133184550, "step": 6193, "time_per_iteration": 4.0394651889801025 }, { "auxiliary_loss_clip": 0.01318939, "auxiliary_loss_mlp": 0.01024032, "balance_loss_clip": 1.04300511, "balance_loss_mlp": 1.01739502, "epoch": 0.7447844646185294, "flos": 17417465631360.0, "grad_norm": 2.1666616921281534, "language_loss": 0.85046244, "learning_rate": 6.450749347460866e-07, "loss": 0.87389207, "num_input_tokens_seen": 133201525, "step": 6194, "time_per_iteration": 2.685598373413086 }, { "auxiliary_loss_clip": 0.01176473, "auxiliary_loss_mlp": 0.01025259, "balance_loss_clip": 1.05102038, "balance_loss_mlp": 1.01822519, "epoch": 0.7449047075091685, "flos": 26615876094720.0, "grad_norm": 1.934405758446165, "language_loss": 0.78885877, "learning_rate": 6.445020592625083e-07, "loss": 0.81087613, "num_input_tokens_seen": 133222175, "step": 6195, "time_per_iteration": 2.6710145473480225 }, { "auxiliary_loss_clip": 0.01172448, "auxiliary_loss_mlp": 0.01025239, "balance_loss_clip": 1.04809999, "balance_loss_mlp": 1.01871502, "epoch": 0.7450249503998077, "flos": 14170458867840.0, "grad_norm": 9.12028382521391, "language_loss": 0.80821812, "learning_rate": 6.4392938940689e-07, "loss": 0.83019495, "num_input_tokens_seen": 133237590, "step": 6196, "time_per_iteration": 2.5523126125335693 }, { "auxiliary_loss_clip": 0.01373751, "auxiliary_loss_mlp": 0.02566026, "balance_loss_clip": 1.04592419, "balance_loss_mlp": 0.99990934, "epoch": 0.7451451932904467, "flos": 19606687752960.0, "grad_norm": 2.4037299847384603, "language_loss": 0.71420819, "learning_rate": 6.433569252661049e-07, "loss": 0.75360596, "num_input_tokens_seen": 133255590, "step": 6197, "time_per_iteration": 2.817814588546753 }, { "auxiliary_loss_clip": 0.01315797, "auxiliary_loss_mlp": 0.01025841, "balance_loss_clip": 1.04520679, "balance_loss_mlp": 1.01973116, "epoch": 0.7452654361810858, "flos": 12495405980160.0, "grad_norm": 3.4111673806220972, "language_loss": 0.71398056, "learning_rate": 6.427846669269952e-07, "loss": 0.73739696, "num_input_tokens_seen": 133273210, "step": 6198, "time_per_iteration": 2.643432855606079 }, { "auxiliary_loss_clip": 0.01176859, "auxiliary_loss_mlp": 0.01028777, "balance_loss_clip": 1.05252433, "balance_loss_mlp": 1.02236319, "epoch": 0.7453856790717249, "flos": 22127329687680.0, "grad_norm": 2.040041871240816, "language_loss": 0.82699132, "learning_rate": 6.422126144763729e-07, "loss": 0.84904772, "num_input_tokens_seen": 133292600, "step": 6199, "time_per_iteration": 2.6025140285491943 }, { "auxiliary_loss_clip": 0.01318755, "auxiliary_loss_mlp": 0.02566607, "balance_loss_clip": 1.04193974, "balance_loss_mlp": 0.99990523, "epoch": 0.745505921962364, "flos": 20010682995840.0, "grad_norm": 2.0326685134013767, "language_loss": 0.76746434, "learning_rate": 6.416407680010174e-07, "loss": 0.80631804, "num_input_tokens_seen": 133306960, "step": 6200, "time_per_iteration": 3.626586437225342 }, { "auxiliary_loss_clip": 0.01289514, "auxiliary_loss_mlp": 0.01022746, "balance_loss_clip": 1.04990673, "balance_loss_mlp": 1.01586151, "epoch": 0.745626164853003, "flos": 24677884673280.0, "grad_norm": 2.2274175018283318, "language_loss": 0.8124699, "learning_rate": 6.410691275876774e-07, "loss": 0.83559251, "num_input_tokens_seen": 133326380, "step": 6201, "time_per_iteration": 2.7540652751922607 }, { "auxiliary_loss_clip": 0.01179649, "auxiliary_loss_mlp": 0.01033344, "balance_loss_clip": 1.04711366, "balance_loss_mlp": 1.02627456, "epoch": 0.7457464077436422, "flos": 14538830797440.0, "grad_norm": 2.547374993624888, "language_loss": 0.77227032, "learning_rate": 6.404976933230704e-07, "loss": 0.79440022, "num_input_tokens_seen": 133342900, "step": 6202, "time_per_iteration": 2.7227747440338135 }, { "auxiliary_loss_clip": 0.01279294, "auxiliary_loss_mlp": 0.01027422, "balance_loss_clip": 1.04722381, "balance_loss_mlp": 1.01978052, "epoch": 0.7458666506342813, "flos": 34021194600960.0, "grad_norm": 2.55403645182253, "language_loss": 0.72164202, "learning_rate": 6.399264652938813e-07, "loss": 0.74470925, "num_input_tokens_seen": 133363805, "step": 6203, "time_per_iteration": 2.7635414600372314 }, { "auxiliary_loss_clip": 0.01270774, "auxiliary_loss_mlp": 0.01026735, "balance_loss_clip": 1.04588246, "balance_loss_mlp": 1.01977944, "epoch": 0.7459868935249203, "flos": 24279025075200.0, "grad_norm": 1.843109752483123, "language_loss": 0.74374574, "learning_rate": 6.393554435867679e-07, "loss": 0.76672077, "num_input_tokens_seen": 133384655, "step": 6204, "time_per_iteration": 2.721463918685913 }, { "auxiliary_loss_clip": 0.01322944, "auxiliary_loss_mlp": 0.01025205, "balance_loss_clip": 1.04497194, "balance_loss_mlp": 1.01861262, "epoch": 0.7461071364155595, "flos": 21908777385600.0, "grad_norm": 2.00377586977192, "language_loss": 0.83668971, "learning_rate": 6.387846282883502e-07, "loss": 0.86017114, "num_input_tokens_seen": 133401185, "step": 6205, "time_per_iteration": 2.713993549346924 }, { "auxiliary_loss_clip": 0.01173231, "auxiliary_loss_mlp": 0.01025747, "balance_loss_clip": 1.0502615, "balance_loss_mlp": 1.01873708, "epoch": 0.7462273793061985, "flos": 22889712879360.0, "grad_norm": 2.9782315327146214, "language_loss": 0.76677418, "learning_rate": 6.38214019485223e-07, "loss": 0.78876394, "num_input_tokens_seen": 133420010, "step": 6206, "time_per_iteration": 2.649962902069092 }, { "auxiliary_loss_clip": 0.01415195, "auxiliary_loss_mlp": 0.01028604, "balance_loss_clip": 1.04042697, "balance_loss_mlp": 1.02190185, "epoch": 0.7463476221968376, "flos": 19968451580160.0, "grad_norm": 2.0737401019353467, "language_loss": 0.71448481, "learning_rate": 6.376436172639461e-07, "loss": 0.73892283, "num_input_tokens_seen": 133437855, "step": 6207, "time_per_iteration": 2.7121047973632812 }, { "auxiliary_loss_clip": 0.01383559, "auxiliary_loss_mlp": 0.01029899, "balance_loss_clip": 1.04437697, "balance_loss_mlp": 1.02215087, "epoch": 0.7464678650874768, "flos": 16836610798080.0, "grad_norm": 2.2295708699522834, "language_loss": 0.65512085, "learning_rate": 6.370734217110487e-07, "loss": 0.67925537, "num_input_tokens_seen": 133456600, "step": 6208, "time_per_iteration": 2.8439457416534424 }, { "auxiliary_loss_clip": 0.01280709, "auxiliary_loss_mlp": 0.01027385, "balance_loss_clip": 1.05317569, "balance_loss_mlp": 1.02000284, "epoch": 0.7465881079781158, "flos": 48100869843840.0, "grad_norm": 2.978742089231542, "language_loss": 0.64454889, "learning_rate": 6.36503432913031e-07, "loss": 0.66762984, "num_input_tokens_seen": 133479745, "step": 6209, "time_per_iteration": 3.1692657470703125 }, { "auxiliary_loss_clip": 0.01221919, "auxiliary_loss_mlp": 0.01026769, "balance_loss_clip": 1.04853606, "balance_loss_mlp": 1.01969099, "epoch": 0.7467083508687549, "flos": 19677359761920.0, "grad_norm": 2.2618764912246974, "language_loss": 0.69214255, "learning_rate": 6.359336509563569e-07, "loss": 0.71462941, "num_input_tokens_seen": 133495765, "step": 6210, "time_per_iteration": 2.6195778846740723 }, { "auxiliary_loss_clip": 0.01314187, "auxiliary_loss_mlp": 0.01022804, "balance_loss_clip": 1.04413903, "balance_loss_mlp": 1.01583028, "epoch": 0.7468285937593939, "flos": 17895436934400.0, "grad_norm": 1.8559249344465174, "language_loss": 0.80747175, "learning_rate": 6.353640759274641e-07, "loss": 0.83084166, "num_input_tokens_seen": 133514655, "step": 6211, "time_per_iteration": 2.701646566390991 }, { "auxiliary_loss_clip": 0.01223505, "auxiliary_loss_mlp": 0.01027107, "balance_loss_clip": 1.04617643, "balance_loss_mlp": 1.02055383, "epoch": 0.7469488366500331, "flos": 23141446369920.0, "grad_norm": 2.686094586760584, "language_loss": 0.74481332, "learning_rate": 6.347947079127556e-07, "loss": 0.7673195, "num_input_tokens_seen": 133532555, "step": 6212, "time_per_iteration": 2.592686176300049 }, { "auxiliary_loss_clip": 0.01272837, "auxiliary_loss_mlp": 0.01022111, "balance_loss_clip": 1.04712272, "balance_loss_mlp": 1.01527786, "epoch": 0.7470690795406721, "flos": 16690849407360.0, "grad_norm": 2.3397020194551086, "language_loss": 0.77213418, "learning_rate": 6.342255469986053e-07, "loss": 0.7950837, "num_input_tokens_seen": 133551300, "step": 6213, "time_per_iteration": 2.6553595066070557 }, { "auxiliary_loss_clip": 0.01173144, "auxiliary_loss_mlp": 0.0101773, "balance_loss_clip": 1.04971766, "balance_loss_mlp": 1.01112878, "epoch": 0.7471893224313112, "flos": 25192700352000.0, "grad_norm": 2.1720141077511697, "language_loss": 0.76385111, "learning_rate": 6.336565932713533e-07, "loss": 0.78575993, "num_input_tokens_seen": 133570725, "step": 6214, "time_per_iteration": 2.613816499710083 }, { "auxiliary_loss_clip": 0.01277915, "auxiliary_loss_mlp": 0.01027089, "balance_loss_clip": 1.05145383, "balance_loss_mlp": 1.01995778, "epoch": 0.7473095653219504, "flos": 22526225199360.0, "grad_norm": 2.290258192399156, "language_loss": 0.779181, "learning_rate": 6.330878468173088e-07, "loss": 0.80223107, "num_input_tokens_seen": 133590790, "step": 6215, "time_per_iteration": 2.736750364303589 }, { "auxiliary_loss_clip": 0.01219737, "auxiliary_loss_mlp": 0.0102382, "balance_loss_clip": 1.04544711, "balance_loss_mlp": 1.01721263, "epoch": 0.7474298082125894, "flos": 18113989236480.0, "grad_norm": 1.6761854892236625, "language_loss": 0.72697723, "learning_rate": 6.32519307722752e-07, "loss": 0.74941278, "num_input_tokens_seen": 133608685, "step": 6216, "time_per_iteration": 2.5814549922943115 }, { "auxiliary_loss_clip": 0.01235172, "auxiliary_loss_mlp": 0.00998633, "balance_loss_clip": 1.02028131, "balance_loss_mlp": 0.99757808, "epoch": 0.7475500511032285, "flos": 62086535193600.0, "grad_norm": 0.8358916795645344, "language_loss": 0.5498358, "learning_rate": 6.31950976073929e-07, "loss": 0.57217383, "num_input_tokens_seen": 133662775, "step": 6217, "time_per_iteration": 5.030264377593994 }, { "auxiliary_loss_clip": 0.01374665, "auxiliary_loss_mlp": 0.01025132, "balance_loss_clip": 1.04694247, "balance_loss_mlp": 1.01833427, "epoch": 0.7476702939938676, "flos": 17785586165760.0, "grad_norm": 1.9598036488623054, "language_loss": 0.80569386, "learning_rate": 6.31382851957055e-07, "loss": 0.82969183, "num_input_tokens_seen": 133679595, "step": 6218, "time_per_iteration": 2.736459970474243 }, { "auxiliary_loss_clip": 0.01321318, "auxiliary_loss_mlp": 0.02563228, "balance_loss_clip": 1.0465548, "balance_loss_mlp": 0.9999243, "epoch": 0.7477905368845067, "flos": 27927944092800.0, "grad_norm": 3.085368888046895, "language_loss": 0.71805328, "learning_rate": 6.308149354583143e-07, "loss": 0.75689876, "num_input_tokens_seen": 133699000, "step": 6219, "time_per_iteration": 3.8512353897094727 }, { "auxiliary_loss_clip": 0.01230993, "auxiliary_loss_mlp": 0.01030365, "balance_loss_clip": 1.05086279, "balance_loss_mlp": 1.02306962, "epoch": 0.7479107797751458, "flos": 26870374932480.0, "grad_norm": 1.9029299382187905, "language_loss": 0.8191613, "learning_rate": 6.302472266638586e-07, "loss": 0.84177494, "num_input_tokens_seen": 133719540, "step": 6220, "time_per_iteration": 2.6258554458618164 }, { "auxiliary_loss_clip": 0.01181791, "auxiliary_loss_mlp": 0.01024153, "balance_loss_clip": 1.05221343, "balance_loss_mlp": 1.01670218, "epoch": 0.7480310226657849, "flos": 33943375785600.0, "grad_norm": 2.144105035960463, "language_loss": 0.70089531, "learning_rate": 6.296797256598101e-07, "loss": 0.72295475, "num_input_tokens_seen": 133741020, "step": 6221, "time_per_iteration": 2.6991336345672607 }, { "auxiliary_loss_clip": 0.01318181, "auxiliary_loss_mlp": 0.01019232, "balance_loss_clip": 1.04513574, "balance_loss_mlp": 1.01267266, "epoch": 0.748151265556424, "flos": 24826555065600.0, "grad_norm": 1.8202269419668702, "language_loss": 0.81000996, "learning_rate": 6.291124325322576e-07, "loss": 0.8333841, "num_input_tokens_seen": 133761145, "step": 6222, "time_per_iteration": 2.705414056777954 }, { "auxiliary_loss_clip": 0.0127938, "auxiliary_loss_mlp": 0.01022674, "balance_loss_clip": 1.04844022, "balance_loss_mlp": 1.01553345, "epoch": 0.748271508447063, "flos": 38399351535360.0, "grad_norm": 1.668046434183786, "language_loss": 0.62507302, "learning_rate": 6.285453473672595e-07, "loss": 0.64809346, "num_input_tokens_seen": 133783715, "step": 6223, "time_per_iteration": 2.8758492469787598 }, { "auxiliary_loss_clip": 0.01174277, "auxiliary_loss_mlp": 0.01024879, "balance_loss_clip": 1.05016756, "balance_loss_mlp": 1.01866841, "epoch": 0.7483917513377022, "flos": 21541842000000.0, "grad_norm": 2.921265657843424, "language_loss": 0.75667942, "learning_rate": 6.279784702508415e-07, "loss": 0.77867103, "num_input_tokens_seen": 133804465, "step": 6224, "time_per_iteration": 2.575937509536743 }, { "auxiliary_loss_clip": 0.01130837, "auxiliary_loss_mlp": 0.00999833, "balance_loss_clip": 1.01036882, "balance_loss_mlp": 0.99889755, "epoch": 0.7485119942283412, "flos": 62314532772480.0, "grad_norm": 0.7787333355883057, "language_loss": 0.58578098, "learning_rate": 6.274118012689979e-07, "loss": 0.60708761, "num_input_tokens_seen": 133866365, "step": 6225, "time_per_iteration": 3.37434720993042 }, { "auxiliary_loss_clip": 0.01268728, "auxiliary_loss_mlp": 0.01026449, "balance_loss_clip": 1.04405403, "balance_loss_mlp": 1.01940656, "epoch": 0.7486322371189803, "flos": 29937613104000.0, "grad_norm": 1.5689219261703458, "language_loss": 0.68369991, "learning_rate": 6.268453405076943e-07, "loss": 0.70665169, "num_input_tokens_seen": 133888760, "step": 6226, "time_per_iteration": 3.6518826484680176 }, { "auxiliary_loss_clip": 0.01273185, "auxiliary_loss_mlp": 0.01028611, "balance_loss_clip": 1.04598916, "balance_loss_mlp": 1.02175951, "epoch": 0.7487524800096195, "flos": 18949414734720.0, "grad_norm": 2.9302514136478424, "language_loss": 0.8218137, "learning_rate": 6.262790880528592e-07, "loss": 0.84483171, "num_input_tokens_seen": 133906380, "step": 6227, "time_per_iteration": 2.647641658782959 }, { "auxiliary_loss_clip": 0.01334067, "auxiliary_loss_mlp": 0.01029813, "balance_loss_clip": 1.04370785, "balance_loss_mlp": 1.0224489, "epoch": 0.7488727229002585, "flos": 18697393935360.0, "grad_norm": 2.8017058103746266, "language_loss": 0.79848087, "learning_rate": 6.257130439903951e-07, "loss": 0.82211965, "num_input_tokens_seen": 133922875, "step": 6228, "time_per_iteration": 2.692953109741211 }, { "auxiliary_loss_clip": 0.01178465, "auxiliary_loss_mlp": 0.01030738, "balance_loss_clip": 1.05205083, "balance_loss_mlp": 1.02367747, "epoch": 0.7489929657908976, "flos": 23623368168960.0, "grad_norm": 1.939514489147152, "language_loss": 0.80967134, "learning_rate": 6.251472084061695e-07, "loss": 0.83176339, "num_input_tokens_seen": 133941795, "step": 6229, "time_per_iteration": 2.6251463890075684 }, { "auxiliary_loss_clip": 0.01221813, "auxiliary_loss_mlp": 0.01027573, "balance_loss_clip": 1.04842687, "balance_loss_mlp": 1.02093065, "epoch": 0.7491132086815367, "flos": 20551533056640.0, "grad_norm": 3.5311608987528147, "language_loss": 0.88843358, "learning_rate": 6.245815813860191e-07, "loss": 0.91092736, "num_input_tokens_seen": 133957305, "step": 6230, "time_per_iteration": 2.6296768188476562 }, { "auxiliary_loss_clip": 0.01178394, "auxiliary_loss_mlp": 0.0102967, "balance_loss_clip": 1.05039096, "balance_loss_mlp": 1.02219892, "epoch": 0.7492334515721758, "flos": 23003011353600.0, "grad_norm": 2.038011386610414, "language_loss": 0.70361817, "learning_rate": 6.240161630157495e-07, "loss": 0.72569883, "num_input_tokens_seen": 133976660, "step": 6231, "time_per_iteration": 2.573375940322876 }, { "auxiliary_loss_clip": 0.01179251, "auxiliary_loss_mlp": 0.01025243, "balance_loss_clip": 1.05209529, "balance_loss_mlp": 1.01823938, "epoch": 0.7493536944628149, "flos": 16398823835520.0, "grad_norm": 3.051608268786277, "language_loss": 0.70308137, "learning_rate": 6.23450953381133e-07, "loss": 0.72512627, "num_input_tokens_seen": 133994750, "step": 6232, "time_per_iteration": 2.606398582458496 }, { "auxiliary_loss_clip": 0.01271659, "auxiliary_loss_mlp": 0.01023047, "balance_loss_clip": 1.04646218, "balance_loss_mlp": 1.01669645, "epoch": 0.749473937353454, "flos": 15338561155200.0, "grad_norm": 1.8318306828413755, "language_loss": 0.67886597, "learning_rate": 6.228859525679131e-07, "loss": 0.70181304, "num_input_tokens_seen": 134009165, "step": 6233, "time_per_iteration": 2.583494186401367 }, { "auxiliary_loss_clip": 0.01224338, "auxiliary_loss_mlp": 0.01023102, "balance_loss_clip": 1.04972148, "balance_loss_mlp": 1.01627707, "epoch": 0.7495941802440931, "flos": 18951138587520.0, "grad_norm": 2.537618473656424, "language_loss": 0.79484689, "learning_rate": 6.223211606617986e-07, "loss": 0.81732124, "num_input_tokens_seen": 134027585, "step": 6234, "time_per_iteration": 2.588749885559082 }, { "auxiliary_loss_clip": 0.01222354, "auxiliary_loss_mlp": 0.01028197, "balance_loss_clip": 1.05190086, "balance_loss_mlp": 1.02248955, "epoch": 0.7497144231347321, "flos": 22492469393280.0, "grad_norm": 1.776608233216633, "language_loss": 0.8395915, "learning_rate": 6.217565777484701e-07, "loss": 0.86209702, "num_input_tokens_seen": 134046680, "step": 6235, "time_per_iteration": 2.612347364425659 }, { "auxiliary_loss_clip": 0.01270485, "auxiliary_loss_mlp": 0.02565191, "balance_loss_clip": 1.04759645, "balance_loss_mlp": 0.99991363, "epoch": 0.7498346660253713, "flos": 24243509502720.0, "grad_norm": 2.027173292677425, "language_loss": 0.80173403, "learning_rate": 6.211922039135722e-07, "loss": 0.84009075, "num_input_tokens_seen": 134066825, "step": 6236, "time_per_iteration": 2.682100534439087 }, { "auxiliary_loss_clip": 0.0117618, "auxiliary_loss_mlp": 0.01022571, "balance_loss_clip": 1.05200696, "balance_loss_mlp": 1.01615167, "epoch": 0.7499549089160104, "flos": 24387080163840.0, "grad_norm": 2.9193839104557964, "language_loss": 0.81114149, "learning_rate": 6.206280392427201e-07, "loss": 0.83312899, "num_input_tokens_seen": 134086410, "step": 6237, "time_per_iteration": 2.5994954109191895 }, { "auxiliary_loss_clip": 0.01217989, "auxiliary_loss_mlp": 0.01028827, "balance_loss_clip": 1.04558873, "balance_loss_mlp": 1.02164483, "epoch": 0.7500751518066494, "flos": 34057320704640.0, "grad_norm": 1.8135812013524546, "language_loss": 0.73869145, "learning_rate": 6.200640838214983e-07, "loss": 0.76115966, "num_input_tokens_seen": 134109185, "step": 6238, "time_per_iteration": 2.7624800205230713 }, { "auxiliary_loss_clip": 0.01174668, "auxiliary_loss_mlp": 0.01022057, "balance_loss_clip": 1.0504818, "balance_loss_mlp": 1.01586163, "epoch": 0.7501953946972886, "flos": 18843586289280.0, "grad_norm": 1.7825597858636124, "language_loss": 0.6684745, "learning_rate": 6.195003377354578e-07, "loss": 0.69044173, "num_input_tokens_seen": 134128455, "step": 6239, "time_per_iteration": 2.5867245197296143 }, { "auxiliary_loss_clip": 0.01223774, "auxiliary_loss_mlp": 0.01028707, "balance_loss_clip": 1.04585421, "balance_loss_mlp": 1.02184677, "epoch": 0.7503156375879276, "flos": 20257675891200.0, "grad_norm": 2.7195984159482283, "language_loss": 0.73467451, "learning_rate": 6.189368010701183e-07, "loss": 0.75719929, "num_input_tokens_seen": 134145515, "step": 6240, "time_per_iteration": 2.5812289714813232 }, { "auxiliary_loss_clip": 0.01228966, "auxiliary_loss_mlp": 0.01023705, "balance_loss_clip": 1.0470438, "balance_loss_mlp": 1.01669526, "epoch": 0.7504358804785667, "flos": 13480040574720.0, "grad_norm": 1.8197526702011737, "language_loss": 0.7643103, "learning_rate": 6.183734739109683e-07, "loss": 0.78683704, "num_input_tokens_seen": 134163335, "step": 6241, "time_per_iteration": 2.624143600463867 }, { "auxiliary_loss_clip": 0.01134333, "auxiliary_loss_mlp": 0.01027118, "balance_loss_clip": 1.0527761, "balance_loss_mlp": 1.01990557, "epoch": 0.7505561233692057, "flos": 29461042431360.0, "grad_norm": 3.2100509338241063, "language_loss": 0.69131732, "learning_rate": 6.178103563434629e-07, "loss": 0.71293181, "num_input_tokens_seen": 134182335, "step": 6242, "time_per_iteration": 2.6519241333007812 }, { "auxiliary_loss_clip": 0.01176883, "auxiliary_loss_mlp": 0.0102866, "balance_loss_clip": 1.05202699, "balance_loss_mlp": 1.02196944, "epoch": 0.7506763662598449, "flos": 20302457172480.0, "grad_norm": 1.9156463045510121, "language_loss": 0.841717, "learning_rate": 6.172474484530283e-07, "loss": 0.86377245, "num_input_tokens_seen": 134201070, "step": 6243, "time_per_iteration": 3.567047357559204 }, { "auxiliary_loss_clip": 0.01268137, "auxiliary_loss_mlp": 0.01025152, "balance_loss_clip": 1.04271722, "balance_loss_mlp": 1.01824677, "epoch": 0.750796609150484, "flos": 37230961939200.0, "grad_norm": 2.521099386594235, "language_loss": 0.76004755, "learning_rate": 6.166847503250563e-07, "loss": 0.78298044, "num_input_tokens_seen": 134223310, "step": 6244, "time_per_iteration": 2.7720963954925537 }, { "auxiliary_loss_clip": 0.01270525, "auxiliary_loss_mlp": 0.01024116, "balance_loss_clip": 1.04617739, "balance_loss_mlp": 1.01702905, "epoch": 0.750916852041123, "flos": 19609417186560.0, "grad_norm": 3.3943868643665773, "language_loss": 0.78649306, "learning_rate": 6.161222620449078e-07, "loss": 0.80943954, "num_input_tokens_seen": 134242085, "step": 6245, "time_per_iteration": 3.6157946586608887 }, { "auxiliary_loss_clip": 0.01322665, "auxiliary_loss_mlp": 0.01026297, "balance_loss_clip": 1.04667711, "balance_loss_mlp": 1.01914132, "epoch": 0.7510370949317622, "flos": 25112690807040.0, "grad_norm": 2.5406326941022233, "language_loss": 0.80043435, "learning_rate": 6.155599836979117e-07, "loss": 0.82392395, "num_input_tokens_seen": 134260770, "step": 6246, "time_per_iteration": 2.7174274921417236 }, { "auxiliary_loss_clip": 0.01370364, "auxiliary_loss_mlp": 0.0102392, "balance_loss_clip": 1.04249167, "balance_loss_mlp": 1.0166508, "epoch": 0.7511573378224012, "flos": 19062282245760.0, "grad_norm": 3.084013146784845, "language_loss": 0.8181107, "learning_rate": 6.149979153693649e-07, "loss": 0.84205353, "num_input_tokens_seen": 134278025, "step": 6247, "time_per_iteration": 2.719174385070801 }, { "auxiliary_loss_clip": 0.01226308, "auxiliary_loss_mlp": 0.01031818, "balance_loss_clip": 1.04770601, "balance_loss_mlp": 1.02451611, "epoch": 0.7512775807130403, "flos": 19937676602880.0, "grad_norm": 1.9938376380588003, "language_loss": 0.7703532, "learning_rate": 6.144360571445343e-07, "loss": 0.79293442, "num_input_tokens_seen": 134297170, "step": 6248, "time_per_iteration": 2.6908886432647705 }, { "auxiliary_loss_clip": 0.01223247, "auxiliary_loss_mlp": 0.01023889, "balance_loss_clip": 1.04994309, "balance_loss_mlp": 1.01701367, "epoch": 0.7513978236036795, "flos": 20739920912640.0, "grad_norm": 1.9310994199712697, "language_loss": 0.80133092, "learning_rate": 6.138744091086509e-07, "loss": 0.82380223, "num_input_tokens_seen": 134316755, "step": 6249, "time_per_iteration": 2.603250741958618 }, { "auxiliary_loss_clip": 0.01322344, "auxiliary_loss_mlp": 0.01022311, "balance_loss_clip": 1.04664874, "balance_loss_mlp": 1.01587117, "epoch": 0.7515180664943185, "flos": 27563163523200.0, "grad_norm": 3.8479032605367802, "language_loss": 0.72406077, "learning_rate": 6.133129713469183e-07, "loss": 0.74750733, "num_input_tokens_seen": 134335960, "step": 6250, "time_per_iteration": 2.755819082260132 }, { "auxiliary_loss_clip": 0.01332158, "auxiliary_loss_mlp": 0.01028016, "balance_loss_clip": 1.04423451, "balance_loss_mlp": 1.02100682, "epoch": 0.7516383093849576, "flos": 33803181002880.0, "grad_norm": 1.8741366260917156, "language_loss": 0.63984865, "learning_rate": 6.127517439445053e-07, "loss": 0.66345042, "num_input_tokens_seen": 134356805, "step": 6251, "time_per_iteration": 2.759401798248291 }, { "auxiliary_loss_clip": 0.0126955, "auxiliary_loss_mlp": 0.01027071, "balance_loss_clip": 1.04549956, "balance_loss_mlp": 1.0212028, "epoch": 0.7517585522755967, "flos": 29746172592000.0, "grad_norm": 2.802083561051667, "language_loss": 0.81585163, "learning_rate": 6.121907269865498e-07, "loss": 0.83881783, "num_input_tokens_seen": 134376295, "step": 6252, "time_per_iteration": 3.6661221981048584 }, { "auxiliary_loss_clip": 0.01221089, "auxiliary_loss_mlp": 0.01001475, "balance_loss_clip": 1.01131225, "balance_loss_mlp": 1.00041437, "epoch": 0.7518787951662358, "flos": 69807974319360.0, "grad_norm": 0.9328799528121419, "language_loss": 0.67185277, "learning_rate": 6.116299205581577e-07, "loss": 0.69407839, "num_input_tokens_seen": 134431125, "step": 6253, "time_per_iteration": 3.2214555740356445 }, { "auxiliary_loss_clip": 0.01179544, "auxiliary_loss_mlp": 0.01028893, "balance_loss_clip": 1.05040479, "balance_loss_mlp": 1.021222, "epoch": 0.7519990380568748, "flos": 34203225749760.0, "grad_norm": 1.8298276358854912, "language_loss": 0.68432462, "learning_rate": 6.110693247444018e-07, "loss": 0.70640898, "num_input_tokens_seen": 134452960, "step": 6254, "time_per_iteration": 2.672105550765991 }, { "auxiliary_loss_clip": 0.01314855, "auxiliary_loss_mlp": 0.01025157, "balance_loss_clip": 1.0438205, "balance_loss_mlp": 1.01845491, "epoch": 0.752119280947514, "flos": 21725704742400.0, "grad_norm": 2.1678966264859705, "language_loss": 0.82797599, "learning_rate": 6.105089396303258e-07, "loss": 0.85137606, "num_input_tokens_seen": 134471350, "step": 6255, "time_per_iteration": 2.705355167388916 }, { "auxiliary_loss_clip": 0.01275617, "auxiliary_loss_mlp": 0.01027226, "balance_loss_clip": 1.04781389, "balance_loss_mlp": 1.02011561, "epoch": 0.7522395238381531, "flos": 32742774668160.0, "grad_norm": 2.0891232547884395, "language_loss": 0.76118004, "learning_rate": 6.099487653009383e-07, "loss": 0.78420854, "num_input_tokens_seen": 134490695, "step": 6256, "time_per_iteration": 2.726875066757202 }, { "auxiliary_loss_clip": 0.01224304, "auxiliary_loss_mlp": 0.01022846, "balance_loss_clip": 1.04951191, "balance_loss_mlp": 1.01701093, "epoch": 0.7523597667287921, "flos": 23476026579840.0, "grad_norm": 1.9855627546290837, "language_loss": 0.83022672, "learning_rate": 6.093888018412192e-07, "loss": 0.85269821, "num_input_tokens_seen": 134506885, "step": 6257, "time_per_iteration": 2.64103364944458 }, { "auxiliary_loss_clip": 0.01123053, "auxiliary_loss_mlp": 0.01001162, "balance_loss_clip": 1.01119924, "balance_loss_mlp": 1.00024974, "epoch": 0.7524800096194313, "flos": 67346730501120.0, "grad_norm": 0.7008956402672702, "language_loss": 0.5461086, "learning_rate": 6.088290493361125e-07, "loss": 0.5673508, "num_input_tokens_seen": 134571770, "step": 6258, "time_per_iteration": 3.3366308212280273 }, { "auxiliary_loss_clip": 0.01373236, "auxiliary_loss_mlp": 0.01028967, "balance_loss_clip": 1.04501438, "balance_loss_mlp": 1.02143943, "epoch": 0.7526002525100703, "flos": 13006055681280.0, "grad_norm": 1.9921749948058005, "language_loss": 0.71548057, "learning_rate": 6.082695078705322e-07, "loss": 0.73950255, "num_input_tokens_seen": 134589250, "step": 6259, "time_per_iteration": 2.682206392288208 }, { "auxiliary_loss_clip": 0.01223934, "auxiliary_loss_mlp": 0.01024744, "balance_loss_clip": 1.04981256, "balance_loss_mlp": 1.01772869, "epoch": 0.7527204954007094, "flos": 21397229844480.0, "grad_norm": 4.574554554113987, "language_loss": 0.69066495, "learning_rate": 6.077101775293618e-07, "loss": 0.71315169, "num_input_tokens_seen": 134608075, "step": 6260, "time_per_iteration": 2.600038528442383 }, { "auxiliary_loss_clip": 0.01223394, "auxiliary_loss_mlp": 0.01025654, "balance_loss_clip": 1.04752994, "balance_loss_mlp": 1.01834893, "epoch": 0.7528407382913486, "flos": 18947188091520.0, "grad_norm": 3.8497875426713195, "language_loss": 0.82372731, "learning_rate": 6.071510583974504e-07, "loss": 0.84621775, "num_input_tokens_seen": 134623260, "step": 6261, "time_per_iteration": 2.5657777786254883 }, { "auxiliary_loss_clip": 0.01177624, "auxiliary_loss_mlp": 0.01027698, "balance_loss_clip": 1.05196762, "balance_loss_mlp": 1.02098024, "epoch": 0.7529609811819876, "flos": 15231798956160.0, "grad_norm": 3.0805514366232933, "language_loss": 0.7196995, "learning_rate": 6.065921505596161e-07, "loss": 0.74175268, "num_input_tokens_seen": 134641540, "step": 6262, "time_per_iteration": 2.5643513202667236 }, { "auxiliary_loss_clip": 0.01319997, "auxiliary_loss_mlp": 0.01021567, "balance_loss_clip": 1.04676604, "balance_loss_mlp": 1.01474476, "epoch": 0.7530812240726267, "flos": 19354487385600.0, "grad_norm": 6.289571373869322, "language_loss": 0.77271402, "learning_rate": 6.060334541006445e-07, "loss": 0.7961297, "num_input_tokens_seen": 134660035, "step": 6263, "time_per_iteration": 2.649203300476074 }, { "auxiliary_loss_clip": 0.0132539, "auxiliary_loss_mlp": 0.01026579, "balance_loss_clip": 1.04169118, "balance_loss_mlp": 1.01961696, "epoch": 0.7532014669632658, "flos": 27748247328000.0, "grad_norm": 1.8405067803854804, "language_loss": 0.69333887, "learning_rate": 6.05474969105289e-07, "loss": 0.71685857, "num_input_tokens_seen": 134683025, "step": 6264, "time_per_iteration": 2.781316041946411 }, { "auxiliary_loss_clip": 0.01223417, "auxiliary_loss_mlp": 0.01024279, "balance_loss_clip": 1.04838991, "balance_loss_mlp": 1.01684952, "epoch": 0.7533217098539049, "flos": 14137421333760.0, "grad_norm": 2.949498321056778, "language_loss": 0.73749995, "learning_rate": 6.049166956582725e-07, "loss": 0.75997692, "num_input_tokens_seen": 134701290, "step": 6265, "time_per_iteration": 2.554251194000244 }, { "auxiliary_loss_clip": 0.01222111, "auxiliary_loss_mlp": 0.01021944, "balance_loss_clip": 1.04775143, "balance_loss_mlp": 1.01556373, "epoch": 0.753441952744544, "flos": 26429068437120.0, "grad_norm": 1.9957709299986253, "language_loss": 0.87481868, "learning_rate": 6.043586338442841e-07, "loss": 0.89725924, "num_input_tokens_seen": 134720345, "step": 6266, "time_per_iteration": 2.6925723552703857 }, { "auxiliary_loss_clip": 0.01169788, "auxiliary_loss_mlp": 0.01021659, "balance_loss_clip": 1.04854262, "balance_loss_mlp": 1.01507592, "epoch": 0.7535621956351831, "flos": 23878621192320.0, "grad_norm": 1.4306156614819727, "language_loss": 0.73073763, "learning_rate": 6.038007837479815e-07, "loss": 0.75265205, "num_input_tokens_seen": 134741450, "step": 6267, "time_per_iteration": 2.609532117843628 }, { "auxiliary_loss_clip": 0.0122165, "auxiliary_loss_mlp": 0.01023116, "balance_loss_clip": 1.0486536, "balance_loss_mlp": 1.01639581, "epoch": 0.7536824385258222, "flos": 21795873960960.0, "grad_norm": 2.0300690379358004, "language_loss": 0.64324123, "learning_rate": 6.032431454539897e-07, "loss": 0.66568887, "num_input_tokens_seen": 134760295, "step": 6268, "time_per_iteration": 2.631594657897949 }, { "auxiliary_loss_clip": 0.013198, "auxiliary_loss_mlp": 0.01025593, "balance_loss_clip": 1.04515493, "balance_loss_mlp": 1.01883435, "epoch": 0.7538026814164612, "flos": 28911644933760.0, "grad_norm": 1.8814573378597863, "language_loss": 0.8152746, "learning_rate": 6.026857190469014e-07, "loss": 0.83872855, "num_input_tokens_seen": 134782050, "step": 6269, "time_per_iteration": 4.791935205459595 }, { "auxiliary_loss_clip": 0.01273827, "auxiliary_loss_mlp": 0.01023651, "balance_loss_clip": 1.04696703, "balance_loss_mlp": 1.01668024, "epoch": 0.7539229243071004, "flos": 21104701482240.0, "grad_norm": 1.8238713453225737, "language_loss": 0.73882818, "learning_rate": 6.0212850461128e-07, "loss": 0.76180297, "num_input_tokens_seen": 134801170, "step": 6270, "time_per_iteration": 2.66326642036438 }, { "auxiliary_loss_clip": 0.01271893, "auxiliary_loss_mlp": 0.0102441, "balance_loss_clip": 1.04417872, "balance_loss_mlp": 1.01721573, "epoch": 0.7540431671977395, "flos": 15158469340800.0, "grad_norm": 2.653528689598395, "language_loss": 0.74502492, "learning_rate": 6.015715022316516e-07, "loss": 0.76798797, "num_input_tokens_seen": 134819150, "step": 6271, "time_per_iteration": 3.689634323120117 }, { "auxiliary_loss_clip": 0.01376685, "auxiliary_loss_mlp": 0.01028293, "balance_loss_clip": 1.04177952, "balance_loss_mlp": 1.02105963, "epoch": 0.7541634100883785, "flos": 18770579896320.0, "grad_norm": 4.369677647015697, "language_loss": 0.77833241, "learning_rate": 6.010147119925154e-07, "loss": 0.80238223, "num_input_tokens_seen": 134836905, "step": 6272, "time_per_iteration": 2.7420337200164795 }, { "auxiliary_loss_clip": 0.01314751, "auxiliary_loss_mlp": 0.01021336, "balance_loss_clip": 1.04379892, "balance_loss_mlp": 1.01445198, "epoch": 0.7542836529790176, "flos": 20594770053120.0, "grad_norm": 2.6684610039873693, "language_loss": 0.6669631, "learning_rate": 6.004581339783348e-07, "loss": 0.69032395, "num_input_tokens_seen": 134855225, "step": 6273, "time_per_iteration": 2.7293553352355957 }, { "auxiliary_loss_clip": 0.01229491, "auxiliary_loss_mlp": 0.01028046, "balance_loss_clip": 1.04997909, "balance_loss_mlp": 1.02047026, "epoch": 0.7544038958696567, "flos": 19095104298240.0, "grad_norm": 3.290869433383632, "language_loss": 0.68799067, "learning_rate": 5.999017682735425e-07, "loss": 0.71056604, "num_input_tokens_seen": 134871615, "step": 6274, "time_per_iteration": 2.6805953979492188 }, { "auxiliary_loss_clip": 0.01428354, "auxiliary_loss_mlp": 0.01024257, "balance_loss_clip": 1.04302549, "balance_loss_mlp": 1.0172534, "epoch": 0.7545241387602958, "flos": 31723306859520.0, "grad_norm": 22.064945291353514, "language_loss": 0.66457927, "learning_rate": 5.993456149625387e-07, "loss": 0.68910539, "num_input_tokens_seen": 134892765, "step": 6275, "time_per_iteration": 2.8822181224823 }, { "auxiliary_loss_clip": 0.01317606, "auxiliary_loss_mlp": 0.01022213, "balance_loss_clip": 1.04473674, "balance_loss_mlp": 1.01593041, "epoch": 0.7546443816509348, "flos": 20296495514880.0, "grad_norm": 1.8309137063280165, "language_loss": 0.8250488, "learning_rate": 5.987896741296909e-07, "loss": 0.84844697, "num_input_tokens_seen": 134910505, "step": 6276, "time_per_iteration": 2.7634711265563965 }, { "auxiliary_loss_clip": 0.01273413, "auxiliary_loss_mlp": 0.01027492, "balance_loss_clip": 1.04876471, "balance_loss_mlp": 1.02089381, "epoch": 0.754764624541574, "flos": 23696159080320.0, "grad_norm": 2.0566261733172446, "language_loss": 0.77988744, "learning_rate": 5.982339458593361e-07, "loss": 0.80289644, "num_input_tokens_seen": 134930445, "step": 6277, "time_per_iteration": 2.6642098426818848 }, { "auxiliary_loss_clip": 0.01220083, "auxiliary_loss_mlp": 0.02564001, "balance_loss_clip": 1.0493958, "balance_loss_mlp": 0.99992168, "epoch": 0.7548848674322131, "flos": 25337204766720.0, "grad_norm": 1.6161488112966702, "language_loss": 0.84175545, "learning_rate": 5.976784302357767e-07, "loss": 0.87959623, "num_input_tokens_seen": 134951010, "step": 6278, "time_per_iteration": 3.5672101974487305 }, { "auxiliary_loss_clip": 0.01226644, "auxiliary_loss_mlp": 0.0102714, "balance_loss_clip": 1.04818463, "balance_loss_mlp": 1.02029407, "epoch": 0.7550051103228521, "flos": 19573147428480.0, "grad_norm": 3.8817193326729016, "language_loss": 0.73654789, "learning_rate": 5.971231273432855e-07, "loss": 0.75908571, "num_input_tokens_seen": 134970495, "step": 6279, "time_per_iteration": 2.6422367095947266 }, { "auxiliary_loss_clip": 0.01120451, "auxiliary_loss_mlp": 0.01001742, "balance_loss_clip": 1.01108479, "balance_loss_mlp": 1.00080037, "epoch": 0.7551253532134913, "flos": 64150068648960.0, "grad_norm": 0.813998951886596, "language_loss": 0.54529709, "learning_rate": 5.965680372661e-07, "loss": 0.56651902, "num_input_tokens_seen": 135028060, "step": 6280, "time_per_iteration": 3.1143436431884766 }, { "auxiliary_loss_clip": 0.01274505, "auxiliary_loss_mlp": 0.01017435, "balance_loss_clip": 1.04897571, "balance_loss_mlp": 1.01130819, "epoch": 0.7552455961041303, "flos": 26067986968320.0, "grad_norm": 1.7482367631342113, "language_loss": 0.56402946, "learning_rate": 5.960131600884266e-07, "loss": 0.58694887, "num_input_tokens_seen": 135047330, "step": 6281, "time_per_iteration": 2.764019012451172 }, { "auxiliary_loss_clip": 0.01321806, "auxiliary_loss_mlp": 0.0102634, "balance_loss_clip": 1.04389215, "balance_loss_mlp": 1.019521, "epoch": 0.7553658389947694, "flos": 24498223822080.0, "grad_norm": 1.96281666475847, "language_loss": 0.76179385, "learning_rate": 5.954584958944413e-07, "loss": 0.78527528, "num_input_tokens_seen": 135065995, "step": 6282, "time_per_iteration": 2.7051656246185303 }, { "auxiliary_loss_clip": 0.01323402, "auxiliary_loss_mlp": 0.02563886, "balance_loss_clip": 1.0432179, "balance_loss_mlp": 0.99990463, "epoch": 0.7554860818854086, "flos": 21799465320960.0, "grad_norm": 2.0073899310533156, "language_loss": 0.81491971, "learning_rate": 5.949040447682854e-07, "loss": 0.85379261, "num_input_tokens_seen": 135085820, "step": 6283, "time_per_iteration": 2.692431926727295 }, { "auxiliary_loss_clip": 0.01278832, "auxiliary_loss_mlp": 0.01022138, "balance_loss_clip": 1.04695368, "balance_loss_mlp": 1.01523912, "epoch": 0.7556063247760476, "flos": 16362123114240.0, "grad_norm": 4.66048957171087, "language_loss": 0.68318677, "learning_rate": 5.943498067940686e-07, "loss": 0.70619643, "num_input_tokens_seen": 135102845, "step": 6284, "time_per_iteration": 2.710841178894043 }, { "auxiliary_loss_clip": 0.01271801, "auxiliary_loss_mlp": 0.01025555, "balance_loss_clip": 1.05137372, "balance_loss_mlp": 1.01922154, "epoch": 0.7557265676666867, "flos": 27235155502080.0, "grad_norm": 2.0390895924136627, "language_loss": 0.81308281, "learning_rate": 5.937957820558686e-07, "loss": 0.83605635, "num_input_tokens_seen": 135122190, "step": 6285, "time_per_iteration": 2.694814443588257 }, { "auxiliary_loss_clip": 0.01075194, "auxiliary_loss_mlp": 0.0100244, "balance_loss_clip": 1.01026452, "balance_loss_mlp": 1.00152254, "epoch": 0.7558468105573258, "flos": 62189131415040.0, "grad_norm": 0.8514658805479722, "language_loss": 0.65306121, "learning_rate": 5.932419706377296e-07, "loss": 0.6738376, "num_input_tokens_seen": 135180495, "step": 6286, "time_per_iteration": 3.1912496089935303 }, { "auxiliary_loss_clip": 0.0131789, "auxiliary_loss_mlp": 0.01023935, "balance_loss_clip": 1.04715323, "balance_loss_mlp": 1.01741719, "epoch": 0.7559670534479649, "flos": 33249078823680.0, "grad_norm": 1.893077587079921, "language_loss": 0.74232876, "learning_rate": 5.92688372623666e-07, "loss": 0.76574695, "num_input_tokens_seen": 135199200, "step": 6287, "time_per_iteration": 2.7779858112335205 }, { "auxiliary_loss_clip": 0.01221202, "auxiliary_loss_mlp": 0.01026001, "balance_loss_clip": 1.04601073, "balance_loss_mlp": 1.01860392, "epoch": 0.7560872963386039, "flos": 14064379027200.0, "grad_norm": 2.1995240343211635, "language_loss": 0.74274671, "learning_rate": 5.921349880976574e-07, "loss": 0.76521879, "num_input_tokens_seen": 135217035, "step": 6288, "time_per_iteration": 2.580608367919922 }, { "auxiliary_loss_clip": 0.01277544, "auxiliary_loss_mlp": 0.02565571, "balance_loss_clip": 1.04594088, "balance_loss_mlp": 0.99994129, "epoch": 0.7562075392292431, "flos": 20412307941120.0, "grad_norm": 2.235081666993195, "language_loss": 0.82289761, "learning_rate": 5.915818171436515e-07, "loss": 0.86132872, "num_input_tokens_seen": 135236370, "step": 6289, "time_per_iteration": 2.718810796737671 }, { "auxiliary_loss_clip": 0.01274077, "auxiliary_loss_mlp": 0.01027239, "balance_loss_clip": 1.04341531, "balance_loss_mlp": 1.01959157, "epoch": 0.7563277821198822, "flos": 20376792368640.0, "grad_norm": 2.917010381467531, "language_loss": 0.74279988, "learning_rate": 5.910288598455642e-07, "loss": 0.76581311, "num_input_tokens_seen": 135255720, "step": 6290, "time_per_iteration": 2.6561546325683594 }, { "auxiliary_loss_clip": 0.01231, "auxiliary_loss_mlp": 0.01024979, "balance_loss_clip": 1.04875398, "balance_loss_mlp": 1.01757026, "epoch": 0.7564480250105212, "flos": 18588261438720.0, "grad_norm": 2.308221615623373, "language_loss": 0.74260187, "learning_rate": 5.90476116287278e-07, "loss": 0.76516169, "num_input_tokens_seen": 135273320, "step": 6291, "time_per_iteration": 2.55393648147583 }, { "auxiliary_loss_clip": 0.01271809, "auxiliary_loss_mlp": 0.01021194, "balance_loss_clip": 1.04820812, "balance_loss_mlp": 1.01431274, "epoch": 0.7565682679011604, "flos": 21215521918080.0, "grad_norm": 2.1447554004780645, "language_loss": 0.67844522, "learning_rate": 5.899235865526456e-07, "loss": 0.70137525, "num_input_tokens_seen": 135292615, "step": 6292, "time_per_iteration": 2.6653871536254883 }, { "auxiliary_loss_clip": 0.01320404, "auxiliary_loss_mlp": 0.01029463, "balance_loss_clip": 1.04528737, "balance_loss_mlp": 1.02301085, "epoch": 0.7566885107917994, "flos": 20449008662400.0, "grad_norm": 1.721989366609964, "language_loss": 0.82171917, "learning_rate": 5.893712707254825e-07, "loss": 0.84521782, "num_input_tokens_seen": 135310075, "step": 6293, "time_per_iteration": 2.684800863265991 }, { "auxiliary_loss_clip": 0.01366835, "auxiliary_loss_mlp": 0.0103097, "balance_loss_clip": 1.03994823, "balance_loss_mlp": 1.02318883, "epoch": 0.7568087536824385, "flos": 19025832919680.0, "grad_norm": 3.5436703336880444, "language_loss": 0.66022241, "learning_rate": 5.888191688895769e-07, "loss": 0.68420041, "num_input_tokens_seen": 135327335, "step": 6294, "time_per_iteration": 2.782712697982788 }, { "auxiliary_loss_clip": 0.01175734, "auxiliary_loss_mlp": 0.0102284, "balance_loss_clip": 1.04834056, "balance_loss_mlp": 1.0150615, "epoch": 0.7569289965730777, "flos": 15225442248960.0, "grad_norm": 3.282848727190275, "language_loss": 0.6220715, "learning_rate": 5.882672811286813e-07, "loss": 0.64405727, "num_input_tokens_seen": 135343615, "step": 6295, "time_per_iteration": 3.5912246704101562 }, { "auxiliary_loss_clip": 0.01175296, "auxiliary_loss_mlp": 0.01027953, "balance_loss_clip": 1.04903674, "balance_loss_mlp": 1.02077389, "epoch": 0.7570492394637167, "flos": 20769367086720.0, "grad_norm": 2.156063962687213, "language_loss": 0.68907344, "learning_rate": 5.877156075265166e-07, "loss": 0.71110588, "num_input_tokens_seen": 135359880, "step": 6296, "time_per_iteration": 2.5461061000823975 }, { "auxiliary_loss_clip": 0.01274632, "auxiliary_loss_mlp": 0.01023233, "balance_loss_clip": 1.04341245, "balance_loss_mlp": 1.01580048, "epoch": 0.7571694823543558, "flos": 15664091137920.0, "grad_norm": 2.4504963543430796, "language_loss": 0.69621205, "learning_rate": 5.871641481667715e-07, "loss": 0.71919072, "num_input_tokens_seen": 135374325, "step": 6297, "time_per_iteration": 3.5113821029663086 }, { "auxiliary_loss_clip": 0.01376286, "auxiliary_loss_mlp": 0.01026634, "balance_loss_clip": 1.04567409, "balance_loss_mlp": 1.01938057, "epoch": 0.7572897252449949, "flos": 25409241492480.0, "grad_norm": 2.941750633353927, "language_loss": 0.84015191, "learning_rate": 5.866129031331011e-07, "loss": 0.86418116, "num_input_tokens_seen": 135393980, "step": 6298, "time_per_iteration": 2.736659049987793 }, { "auxiliary_loss_clip": 0.0127059, "auxiliary_loss_mlp": 0.01024419, "balance_loss_clip": 1.04409361, "balance_loss_mlp": 1.01707578, "epoch": 0.757409968135634, "flos": 24279348297600.0, "grad_norm": 2.350313271549552, "language_loss": 0.83365321, "learning_rate": 5.8606187250913e-07, "loss": 0.85660338, "num_input_tokens_seen": 135412030, "step": 6299, "time_per_iteration": 2.6694650650024414 }, { "auxiliary_loss_clip": 0.01224555, "auxiliary_loss_mlp": 0.02567071, "balance_loss_clip": 1.05040801, "balance_loss_mlp": 0.99997067, "epoch": 0.757530211026273, "flos": 24133766474880.0, "grad_norm": 2.9636995698056188, "language_loss": 0.8420198, "learning_rate": 5.855110563784482e-07, "loss": 0.8799361, "num_input_tokens_seen": 135430565, "step": 6300, "time_per_iteration": 2.6513469219207764 }, { "auxiliary_loss_clip": 0.01221381, "auxiliary_loss_mlp": 0.02563043, "balance_loss_clip": 1.04640102, "balance_loss_mlp": 0.99991596, "epoch": 0.7576504539169122, "flos": 23951807153280.0, "grad_norm": 1.9613089379509383, "language_loss": 0.64034998, "learning_rate": 5.849604548246156e-07, "loss": 0.67819422, "num_input_tokens_seen": 135451675, "step": 6301, "time_per_iteration": 2.713392734527588 }, { "auxiliary_loss_clip": 0.01281171, "auxiliary_loss_mlp": 0.02565415, "balance_loss_clip": 1.05061126, "balance_loss_mlp": 0.99993885, "epoch": 0.7577706968075513, "flos": 21251360712960.0, "grad_norm": 2.1878244644099167, "language_loss": 0.80402201, "learning_rate": 5.844100679311565e-07, "loss": 0.84248787, "num_input_tokens_seen": 135470635, "step": 6302, "time_per_iteration": 2.6313488483428955 }, { "auxiliary_loss_clip": 0.01270402, "auxiliary_loss_mlp": 0.01026308, "balance_loss_clip": 1.04786468, "balance_loss_mlp": 1.01910794, "epoch": 0.7578909396981903, "flos": 18296595002880.0, "grad_norm": 2.737486751958453, "language_loss": 0.76284558, "learning_rate": 5.838598957815637e-07, "loss": 0.78581274, "num_input_tokens_seen": 135487865, "step": 6303, "time_per_iteration": 2.654484272003174 }, { "auxiliary_loss_clip": 0.01265717, "auxiliary_loss_mlp": 0.01024312, "balance_loss_clip": 1.04579544, "balance_loss_mlp": 1.01775897, "epoch": 0.7580111825888295, "flos": 25373869574400.0, "grad_norm": 1.5855610806726423, "language_loss": 0.85596973, "learning_rate": 5.833099384592996e-07, "loss": 0.87887001, "num_input_tokens_seen": 135508440, "step": 6304, "time_per_iteration": 3.601959466934204 }, { "auxiliary_loss_clip": 0.01269132, "auxiliary_loss_mlp": 0.0102687, "balance_loss_clip": 1.04854047, "balance_loss_mlp": 1.01973557, "epoch": 0.7581314254794685, "flos": 23768662682880.0, "grad_norm": 2.3207012740589867, "language_loss": 0.71775103, "learning_rate": 5.827601960477913e-07, "loss": 0.74071097, "num_input_tokens_seen": 135526365, "step": 6305, "time_per_iteration": 2.6713671684265137 }, { "auxiliary_loss_clip": 0.01223189, "auxiliary_loss_mlp": 0.01019657, "balance_loss_clip": 1.0470103, "balance_loss_mlp": 1.01352072, "epoch": 0.7582516683701076, "flos": 22054610603520.0, "grad_norm": 2.1965720626103886, "language_loss": 0.70598632, "learning_rate": 5.822106686304344e-07, "loss": 0.72841477, "num_input_tokens_seen": 135545655, "step": 6306, "time_per_iteration": 2.6362462043762207 }, { "auxiliary_loss_clip": 0.01325121, "auxiliary_loss_mlp": 0.01025832, "balance_loss_clip": 1.04603994, "balance_loss_mlp": 1.01888454, "epoch": 0.7583719112607467, "flos": 31649725848960.0, "grad_norm": 1.7572402948020396, "language_loss": 0.57980227, "learning_rate": 5.816613562905919e-07, "loss": 0.60331178, "num_input_tokens_seen": 135566840, "step": 6307, "time_per_iteration": 2.7409708499908447 }, { "auxiliary_loss_clip": 0.01320691, "auxiliary_loss_mlp": 0.01025145, "balance_loss_clip": 1.04910409, "balance_loss_mlp": 1.0187223, "epoch": 0.7584921541513858, "flos": 33068376478080.0, "grad_norm": 1.8794735660386386, "language_loss": 0.69802773, "learning_rate": 5.811122591115933e-07, "loss": 0.72148609, "num_input_tokens_seen": 135587825, "step": 6308, "time_per_iteration": 2.7690701484680176 }, { "auxiliary_loss_clip": 0.01328259, "auxiliary_loss_mlp": 0.01025397, "balance_loss_clip": 1.0527153, "balance_loss_mlp": 1.01850712, "epoch": 0.7586123970420249, "flos": 23326350606720.0, "grad_norm": 2.659974698452859, "language_loss": 0.71408081, "learning_rate": 5.805633771767376e-07, "loss": 0.73761737, "num_input_tokens_seen": 135605220, "step": 6309, "time_per_iteration": 2.7120754718780518 }, { "auxiliary_loss_clip": 0.01176747, "auxiliary_loss_mlp": 0.01025633, "balance_loss_clip": 1.0493536, "balance_loss_mlp": 1.01890969, "epoch": 0.7587326399326639, "flos": 18334229477760.0, "grad_norm": 2.043086645890323, "language_loss": 0.77759093, "learning_rate": 5.800147105692888e-07, "loss": 0.79961473, "num_input_tokens_seen": 135624795, "step": 6310, "time_per_iteration": 2.655167818069458 }, { "auxiliary_loss_clip": 0.01225822, "auxiliary_loss_mlp": 0.01025627, "balance_loss_clip": 1.04727018, "balance_loss_mlp": 1.01860595, "epoch": 0.7588528828233031, "flos": 17275080119040.0, "grad_norm": 1.7686406187708583, "language_loss": 0.79022503, "learning_rate": 5.794662593724795e-07, "loss": 0.81273949, "num_input_tokens_seen": 135643800, "step": 6311, "time_per_iteration": 2.6210312843322754 }, { "auxiliary_loss_clip": 0.01175468, "auxiliary_loss_mlp": 0.01030169, "balance_loss_clip": 1.05062735, "balance_loss_mlp": 1.02282858, "epoch": 0.7589731257139422, "flos": 17713621267200.0, "grad_norm": 2.1037384509263823, "language_loss": 0.74852085, "learning_rate": 5.789180236695091e-07, "loss": 0.77057719, "num_input_tokens_seen": 135660655, "step": 6312, "time_per_iteration": 2.5613694190979004 }, { "auxiliary_loss_clip": 0.01219169, "auxiliary_loss_mlp": 0.01022028, "balance_loss_clip": 1.04855406, "balance_loss_mlp": 1.01559961, "epoch": 0.7590933686045812, "flos": 15961072786560.0, "grad_norm": 1.9089030422057378, "language_loss": 0.84846056, "learning_rate": 5.78370003543544e-07, "loss": 0.8708725, "num_input_tokens_seen": 135679410, "step": 6313, "time_per_iteration": 2.6367571353912354 }, { "auxiliary_loss_clip": 0.01224438, "auxiliary_loss_mlp": 0.02567938, "balance_loss_clip": 1.04904437, "balance_loss_mlp": 0.99995041, "epoch": 0.7592136114952204, "flos": 21068072588160.0, "grad_norm": 2.3371609562120734, "language_loss": 0.83881831, "learning_rate": 5.778221990777203e-07, "loss": 0.87674212, "num_input_tokens_seen": 135697150, "step": 6314, "time_per_iteration": 2.6249451637268066 }, { "auxiliary_loss_clip": 0.01268852, "auxiliary_loss_mlp": 0.01019934, "balance_loss_clip": 1.04680169, "balance_loss_mlp": 1.01307023, "epoch": 0.7593338543858594, "flos": 25297666871040.0, "grad_norm": 3.9714871188246548, "language_loss": 0.82512867, "learning_rate": 5.772746103551372e-07, "loss": 0.84801656, "num_input_tokens_seen": 135712545, "step": 6315, "time_per_iteration": 2.7070794105529785 }, { "auxiliary_loss_clip": 0.01269259, "auxiliary_loss_mlp": 0.01024439, "balance_loss_clip": 1.04659724, "balance_loss_mlp": 1.01782036, "epoch": 0.7594540972764985, "flos": 31832367528960.0, "grad_norm": 3.4868142947686223, "language_loss": 0.71883452, "learning_rate": 5.767272374588648e-07, "loss": 0.74177152, "num_input_tokens_seen": 135733950, "step": 6316, "time_per_iteration": 2.7219178676605225 }, { "auxiliary_loss_clip": 0.01224059, "auxiliary_loss_mlp": 0.0102804, "balance_loss_clip": 1.05035067, "balance_loss_mlp": 1.02167463, "epoch": 0.7595743401671377, "flos": 37597250880000.0, "grad_norm": 1.7738038317054508, "language_loss": 0.78135568, "learning_rate": 5.76180080471939e-07, "loss": 0.80387664, "num_input_tokens_seen": 135757120, "step": 6317, "time_per_iteration": 2.7574949264526367 }, { "auxiliary_loss_clip": 0.0117836, "auxiliary_loss_mlp": 0.01026387, "balance_loss_clip": 1.05115378, "balance_loss_mlp": 1.01859963, "epoch": 0.7596945830577767, "flos": 18287724343680.0, "grad_norm": 2.1640104135084526, "language_loss": 0.72216511, "learning_rate": 5.756331394773631e-07, "loss": 0.74421257, "num_input_tokens_seen": 135773335, "step": 6318, "time_per_iteration": 2.5598721504211426 }, { "auxiliary_loss_clip": 0.01425441, "auxiliary_loss_mlp": 0.02567548, "balance_loss_clip": 1.04423225, "balance_loss_mlp": 0.99990153, "epoch": 0.7598148259484158, "flos": 22233122219520.0, "grad_norm": 1.801745504469567, "language_loss": 0.76260245, "learning_rate": 5.750864145581071e-07, "loss": 0.80253237, "num_input_tokens_seen": 135792555, "step": 6319, "time_per_iteration": 2.7734079360961914 }, { "auxiliary_loss_clip": 0.0117547, "auxiliary_loss_mlp": 0.01024143, "balance_loss_clip": 1.05153537, "balance_loss_mlp": 1.01761317, "epoch": 0.7599350688390549, "flos": 27161718145920.0, "grad_norm": 5.017659384972666, "language_loss": 0.85896051, "learning_rate": 5.745399057971085e-07, "loss": 0.88095665, "num_input_tokens_seen": 135813690, "step": 6320, "time_per_iteration": 2.5953924655914307 }, { "auxiliary_loss_clip": 0.01228437, "auxiliary_loss_mlp": 0.01030777, "balance_loss_clip": 1.0496521, "balance_loss_mlp": 1.02360415, "epoch": 0.760055311729694, "flos": 15560704817280.0, "grad_norm": 2.4110012736293074, "language_loss": 0.75431013, "learning_rate": 5.739936132772738e-07, "loss": 0.77690226, "num_input_tokens_seen": 135832255, "step": 6321, "time_per_iteration": 4.4660797119140625 }, { "auxiliary_loss_clip": 0.01173786, "auxiliary_loss_mlp": 0.01027001, "balance_loss_clip": 1.04876161, "balance_loss_mlp": 1.01979208, "epoch": 0.760175554620333, "flos": 25155496840320.0, "grad_norm": 2.163901674112299, "language_loss": 0.7431438, "learning_rate": 5.734475370814733e-07, "loss": 0.76515168, "num_input_tokens_seen": 135851935, "step": 6322, "time_per_iteration": 2.6504836082458496 }, { "auxiliary_loss_clip": 0.012271, "auxiliary_loss_mlp": 0.0102697, "balance_loss_clip": 1.0474087, "balance_loss_mlp": 1.01982927, "epoch": 0.7602957975109722, "flos": 24353791234560.0, "grad_norm": 2.2552159738939435, "language_loss": 0.78434741, "learning_rate": 5.729016772925483e-07, "loss": 0.8068881, "num_input_tokens_seen": 135873510, "step": 6323, "time_per_iteration": 3.5863378047943115 }, { "auxiliary_loss_clip": 0.01373978, "auxiliary_loss_mlp": 0.01023478, "balance_loss_clip": 1.04815626, "balance_loss_mlp": 1.01609874, "epoch": 0.7604160404016113, "flos": 25192664438400.0, "grad_norm": 2.056327465273519, "language_loss": 0.70586479, "learning_rate": 5.723560339933038e-07, "loss": 0.72983938, "num_input_tokens_seen": 135893845, "step": 6324, "time_per_iteration": 2.733764410018921 }, { "auxiliary_loss_clip": 0.0122517, "auxiliary_loss_mlp": 0.02566143, "balance_loss_clip": 1.04702926, "balance_loss_mlp": 0.99991584, "epoch": 0.7605362832922503, "flos": 29861841363840.0, "grad_norm": 2.4589438924532505, "language_loss": 0.65583867, "learning_rate": 5.71810607266513e-07, "loss": 0.69375181, "num_input_tokens_seen": 135912430, "step": 6325, "time_per_iteration": 2.7091054916381836 }, { "auxiliary_loss_clip": 0.01227258, "auxiliary_loss_mlp": 0.01025903, "balance_loss_clip": 1.04927301, "balance_loss_mlp": 1.01906085, "epoch": 0.7606565261828895, "flos": 13917935278080.0, "grad_norm": 1.9661942443155997, "language_loss": 0.60616797, "learning_rate": 5.712653971949184e-07, "loss": 0.6286996, "num_input_tokens_seen": 135930550, "step": 6326, "time_per_iteration": 2.6045713424682617 }, { "auxiliary_loss_clip": 0.01222048, "auxiliary_loss_mlp": 0.01031052, "balance_loss_clip": 1.04735112, "balance_loss_mlp": 1.02384615, "epoch": 0.7607767690735285, "flos": 18551273408640.0, "grad_norm": 2.8599694545261123, "language_loss": 0.7518695, "learning_rate": 5.707204038612268e-07, "loss": 0.77440053, "num_input_tokens_seen": 135947980, "step": 6327, "time_per_iteration": 2.596513271331787 }, { "auxiliary_loss_clip": 0.01278217, "auxiliary_loss_mlp": 0.01027975, "balance_loss_clip": 1.05033767, "balance_loss_mlp": 1.02024388, "epoch": 0.7608970119641676, "flos": 20922993555840.0, "grad_norm": 2.4130629400980483, "language_loss": 0.73829138, "learning_rate": 5.701756273481138e-07, "loss": 0.76135325, "num_input_tokens_seen": 135965400, "step": 6328, "time_per_iteration": 2.648343801498413 }, { "auxiliary_loss_clip": 0.01177614, "auxiliary_loss_mlp": 0.01027919, "balance_loss_clip": 1.04599237, "balance_loss_mlp": 1.02138603, "epoch": 0.7610172548548068, "flos": 23807302738560.0, "grad_norm": 1.7607551957522287, "language_loss": 0.73910797, "learning_rate": 5.696310677382212e-07, "loss": 0.76116329, "num_input_tokens_seen": 135986795, "step": 6329, "time_per_iteration": 2.6665496826171875 }, { "auxiliary_loss_clip": 0.01222953, "auxiliary_loss_mlp": 0.01000971, "balance_loss_clip": 1.01022935, "balance_loss_mlp": 1.00001717, "epoch": 0.7611374977454458, "flos": 66496580426880.0, "grad_norm": 0.8843038822479503, "language_loss": 0.61684626, "learning_rate": 5.690867251141576e-07, "loss": 0.63908547, "num_input_tokens_seen": 136053450, "step": 6330, "time_per_iteration": 4.3029069900512695 }, { "auxiliary_loss_clip": 0.01130883, "auxiliary_loss_mlp": 0.01025795, "balance_loss_clip": 1.04916692, "balance_loss_mlp": 1.0178256, "epoch": 0.7612577406360849, "flos": 15633136592640.0, "grad_norm": 4.07425225914332, "language_loss": 0.91491729, "learning_rate": 5.685425995585013e-07, "loss": 0.9364841, "num_input_tokens_seen": 136071375, "step": 6331, "time_per_iteration": 2.6235241889953613 }, { "auxiliary_loss_clip": 0.01179187, "auxiliary_loss_mlp": 0.01001127, "balance_loss_clip": 1.01063776, "balance_loss_mlp": 1.00017965, "epoch": 0.761377983526724, "flos": 60526253237760.0, "grad_norm": 0.7538096601953213, "language_loss": 0.58897656, "learning_rate": 5.679986911537935e-07, "loss": 0.6107797, "num_input_tokens_seen": 136138905, "step": 6332, "time_per_iteration": 3.3508191108703613 }, { "auxiliary_loss_clip": 0.01370651, "auxiliary_loss_mlp": 0.01022495, "balance_loss_clip": 1.04670835, "balance_loss_mlp": 1.01552379, "epoch": 0.7614982264173631, "flos": 35772522019200.0, "grad_norm": 1.7718832810101364, "language_loss": 0.67448497, "learning_rate": 5.674549999825462e-07, "loss": 0.69841647, "num_input_tokens_seen": 136161720, "step": 6333, "time_per_iteration": 2.825464963912964 }, { "auxiliary_loss_clip": 0.01121706, "auxiliary_loss_mlp": 0.0100214, "balance_loss_clip": 1.01036334, "balance_loss_mlp": 1.00125742, "epoch": 0.7616184693080021, "flos": 67925502345600.0, "grad_norm": 0.9173419316114866, "language_loss": 0.71399301, "learning_rate": 5.669115261272363e-07, "loss": 0.73523146, "num_input_tokens_seen": 136222040, "step": 6334, "time_per_iteration": 3.2390894889831543 }, { "auxiliary_loss_clip": 0.01226915, "auxiliary_loss_mlp": 0.01022829, "balance_loss_clip": 1.0509963, "balance_loss_mlp": 1.01552129, "epoch": 0.7617387121986413, "flos": 20521979141760.0, "grad_norm": 2.35391806916075, "language_loss": 0.73081839, "learning_rate": 5.663682696703081e-07, "loss": 0.75331581, "num_input_tokens_seen": 136240305, "step": 6335, "time_per_iteration": 2.696017026901245 }, { "auxiliary_loss_clip": 0.01173942, "auxiliary_loss_mlp": 0.01023404, "balance_loss_clip": 1.04988444, "balance_loss_mlp": 1.01684475, "epoch": 0.7618589550892804, "flos": 18624495283200.0, "grad_norm": 1.9165617859443422, "language_loss": 0.82076812, "learning_rate": 5.658252306941746e-07, "loss": 0.84274161, "num_input_tokens_seen": 136259625, "step": 6336, "time_per_iteration": 2.659330129623413 }, { "auxiliary_loss_clip": 0.01371443, "auxiliary_loss_mlp": 0.01026341, "balance_loss_clip": 1.04677474, "balance_loss_mlp": 1.01924527, "epoch": 0.7619791979799194, "flos": 17453735389440.0, "grad_norm": 2.5944498324984284, "language_loss": 0.75686073, "learning_rate": 5.65282409281212e-07, "loss": 0.78083861, "num_input_tokens_seen": 136277090, "step": 6337, "time_per_iteration": 2.8042211532592773 }, { "auxiliary_loss_clip": 0.01275369, "auxiliary_loss_mlp": 0.01024685, "balance_loss_clip": 1.04788733, "balance_loss_mlp": 1.01755071, "epoch": 0.7620994408705585, "flos": 14137421333760.0, "grad_norm": 2.8596609427690036, "language_loss": 0.69944173, "learning_rate": 5.64739805513768e-07, "loss": 0.72244227, "num_input_tokens_seen": 136294635, "step": 6338, "time_per_iteration": 2.642709732055664 }, { "auxiliary_loss_clip": 0.01115529, "auxiliary_loss_mlp": 0.02504816, "balance_loss_clip": 1.00970125, "balance_loss_mlp": 0.99989492, "epoch": 0.7622196837611976, "flos": 70708792527360.0, "grad_norm": 0.7927551439912602, "language_loss": 0.55635941, "learning_rate": 5.641974194741541e-07, "loss": 0.59256285, "num_input_tokens_seen": 136350320, "step": 6339, "time_per_iteration": 3.13092303276062 }, { "auxiliary_loss_clip": 0.01188189, "auxiliary_loss_mlp": 0.01017347, "balance_loss_clip": 1.03452992, "balance_loss_mlp": 1.01614845, "epoch": 0.7623399266518367, "flos": 60684150447360.0, "grad_norm": 0.7963464942061677, "language_loss": 0.63733768, "learning_rate": 5.636552512446502e-07, "loss": 0.65939295, "num_input_tokens_seen": 136411375, "step": 6340, "time_per_iteration": 3.1864728927612305 }, { "auxiliary_loss_clip": 0.01219218, "auxiliary_loss_mlp": 0.01024805, "balance_loss_clip": 1.04654527, "balance_loss_mlp": 1.01775098, "epoch": 0.7624601695424758, "flos": 26468893641600.0, "grad_norm": 1.6580298578660668, "language_loss": 0.77818835, "learning_rate": 5.631133009075027e-07, "loss": 0.80062854, "num_input_tokens_seen": 136430560, "step": 6341, "time_per_iteration": 2.6504998207092285 }, { "auxiliary_loss_clip": 0.01224499, "auxiliary_loss_mlp": 0.02561144, "balance_loss_clip": 1.04954362, "balance_loss_mlp": 0.99991286, "epoch": 0.7625804124331149, "flos": 19135755515520.0, "grad_norm": 2.4658359782947614, "language_loss": 0.69065249, "learning_rate": 5.625715685449242e-07, "loss": 0.72850895, "num_input_tokens_seen": 136448665, "step": 6342, "time_per_iteration": 2.60783314704895 }, { "auxiliary_loss_clip": 0.0122709, "auxiliary_loss_mlp": 0.01024359, "balance_loss_clip": 1.05282497, "balance_loss_mlp": 1.0180707, "epoch": 0.762700655323754, "flos": 26213101914240.0, "grad_norm": 2.6884939738693467, "language_loss": 0.71522743, "learning_rate": 5.620300542390966e-07, "loss": 0.73774189, "num_input_tokens_seen": 136469710, "step": 6343, "time_per_iteration": 2.7197155952453613 }, { "auxiliary_loss_clip": 0.01171777, "auxiliary_loss_mlp": 0.01025715, "balance_loss_clip": 1.04606342, "balance_loss_mlp": 1.01931369, "epoch": 0.762820898214393, "flos": 22382582711040.0, "grad_norm": 3.309560362732711, "language_loss": 0.85277712, "learning_rate": 5.614887580721659e-07, "loss": 0.87475204, "num_input_tokens_seen": 136489855, "step": 6344, "time_per_iteration": 2.661543846130371 }, { "auxiliary_loss_clip": 0.01313096, "auxiliary_loss_mlp": 0.01021874, "balance_loss_clip": 1.04735947, "balance_loss_mlp": 1.01470685, "epoch": 0.7629411411050322, "flos": 15700504550400.0, "grad_norm": 2.8495513758690354, "language_loss": 0.73645842, "learning_rate": 5.609476801262481e-07, "loss": 0.75980806, "num_input_tokens_seen": 136504715, "step": 6345, "time_per_iteration": 2.6161885261535645 }, { "auxiliary_loss_clip": 0.01326244, "auxiliary_loss_mlp": 0.01022683, "balance_loss_clip": 1.04943061, "balance_loss_mlp": 1.01581013, "epoch": 0.7630613839956712, "flos": 13770342293760.0, "grad_norm": 6.430575865007234, "language_loss": 0.64449358, "learning_rate": 5.604068204834223e-07, "loss": 0.66798282, "num_input_tokens_seen": 136521610, "step": 6346, "time_per_iteration": 2.637721061706543 }, { "auxiliary_loss_clip": 0.01374492, "auxiliary_loss_mlp": 0.02567432, "balance_loss_clip": 1.04749155, "balance_loss_mlp": 0.99991024, "epoch": 0.7631816268863103, "flos": 14569569861120.0, "grad_norm": 2.5277643688928833, "language_loss": 0.76891434, "learning_rate": 5.598661792257367e-07, "loss": 0.80833358, "num_input_tokens_seen": 136538655, "step": 6347, "time_per_iteration": 3.619931697845459 }, { "auxiliary_loss_clip": 0.01223553, "auxiliary_loss_mlp": 0.01024753, "balance_loss_clip": 1.04696774, "balance_loss_mlp": 1.01808953, "epoch": 0.7633018697769495, "flos": 19062210418560.0, "grad_norm": 2.341477210487527, "language_loss": 0.76124513, "learning_rate": 5.593257564352071e-07, "loss": 0.78372824, "num_input_tokens_seen": 136557095, "step": 6348, "time_per_iteration": 2.6094188690185547 }, { "auxiliary_loss_clip": 0.01220583, "auxiliary_loss_mlp": 0.01023088, "balance_loss_clip": 1.04860198, "balance_loss_mlp": 1.01607263, "epoch": 0.7634221126675885, "flos": 22052958577920.0, "grad_norm": 2.443947916934433, "language_loss": 0.75707972, "learning_rate": 5.58785552193815e-07, "loss": 0.77951634, "num_input_tokens_seen": 136577340, "step": 6349, "time_per_iteration": 3.5850815773010254 }, { "auxiliary_loss_clip": 0.01174079, "auxiliary_loss_mlp": 0.01027564, "balance_loss_clip": 1.04942429, "balance_loss_mlp": 1.02108836, "epoch": 0.7635423555582276, "flos": 29382720825600.0, "grad_norm": 1.8056016106256578, "language_loss": 0.75427949, "learning_rate": 5.582455665835086e-07, "loss": 0.77629602, "num_input_tokens_seen": 136597635, "step": 6350, "time_per_iteration": 2.643488645553589 }, { "auxiliary_loss_clip": 0.01283333, "auxiliary_loss_mlp": 0.01031208, "balance_loss_clip": 1.0464797, "balance_loss_mlp": 1.02378118, "epoch": 0.7636625984488667, "flos": 17784903807360.0, "grad_norm": 3.0254814372482897, "language_loss": 0.72408408, "learning_rate": 5.577057996862036e-07, "loss": 0.74722946, "num_input_tokens_seen": 136615260, "step": 6351, "time_per_iteration": 2.5774903297424316 }, { "auxiliary_loss_clip": 0.01172487, "auxiliary_loss_mlp": 0.01021254, "balance_loss_clip": 1.05016351, "balance_loss_mlp": 1.01469135, "epoch": 0.7637828413395058, "flos": 23734583654400.0, "grad_norm": 1.6908601970475912, "language_loss": 0.7604562, "learning_rate": 5.571662515837814e-07, "loss": 0.78239357, "num_input_tokens_seen": 136637220, "step": 6352, "time_per_iteration": 2.5868937969207764 }, { "auxiliary_loss_clip": 0.01271499, "auxiliary_loss_mlp": 0.01020889, "balance_loss_clip": 1.04817951, "balance_loss_mlp": 1.01441264, "epoch": 0.7639030842301449, "flos": 36283279461120.0, "grad_norm": 1.8418662885736523, "language_loss": 0.83807063, "learning_rate": 5.566269223580926e-07, "loss": 0.86099446, "num_input_tokens_seen": 136658930, "step": 6353, "time_per_iteration": 2.758990526199341 }, { "auxiliary_loss_clip": 0.01226634, "auxiliary_loss_mlp": 0.01027837, "balance_loss_clip": 1.04864311, "balance_loss_mlp": 1.02092028, "epoch": 0.764023327120784, "flos": 28878104609280.0, "grad_norm": 1.7873936643941488, "language_loss": 0.75297332, "learning_rate": 5.560878120909511e-07, "loss": 0.775518, "num_input_tokens_seen": 136681530, "step": 6354, "time_per_iteration": 2.6205759048461914 }, { "auxiliary_loss_clip": 0.01121981, "auxiliary_loss_mlp": 0.00999613, "balance_loss_clip": 1.01050353, "balance_loss_mlp": 0.99871314, "epoch": 0.7641435700114231, "flos": 64789711067520.0, "grad_norm": 0.8573362773396029, "language_loss": 0.58535588, "learning_rate": 5.55548920864141e-07, "loss": 0.60657179, "num_input_tokens_seen": 136742185, "step": 6355, "time_per_iteration": 3.1821839809417725 }, { "auxiliary_loss_clip": 0.01224474, "auxiliary_loss_mlp": 0.01032981, "balance_loss_clip": 1.05105042, "balance_loss_mlp": 1.02609658, "epoch": 0.7642638129020621, "flos": 16835784785280.0, "grad_norm": 1.8194907174813417, "language_loss": 0.77817619, "learning_rate": 5.550102487594113e-07, "loss": 0.80075073, "num_input_tokens_seen": 136760855, "step": 6356, "time_per_iteration": 3.4887900352478027 }, { "auxiliary_loss_clip": 0.01377825, "auxiliary_loss_mlp": 0.0256309, "balance_loss_clip": 1.04417968, "balance_loss_mlp": 0.99988943, "epoch": 0.7643840557927013, "flos": 30408940391040.0, "grad_norm": 1.7314207399295598, "language_loss": 0.71496189, "learning_rate": 5.54471795858477e-07, "loss": 0.75437099, "num_input_tokens_seen": 136780925, "step": 6357, "time_per_iteration": 2.779519557952881 }, { "auxiliary_loss_clip": 0.01324233, "auxiliary_loss_mlp": 0.01021687, "balance_loss_clip": 1.04210413, "balance_loss_mlp": 1.01513052, "epoch": 0.7645042986833404, "flos": 16983234115200.0, "grad_norm": 2.055811255149393, "language_loss": 0.82904482, "learning_rate": 5.539335622430235e-07, "loss": 0.85250401, "num_input_tokens_seen": 136799545, "step": 6358, "time_per_iteration": 2.6748807430267334 }, { "auxiliary_loss_clip": 0.01223544, "auxiliary_loss_mlp": 0.01026291, "balance_loss_clip": 1.04703712, "balance_loss_mlp": 1.01953149, "epoch": 0.7646245415739794, "flos": 17311493531520.0, "grad_norm": 2.343368377835373, "language_loss": 0.74940574, "learning_rate": 5.533955479946975e-07, "loss": 0.77190411, "num_input_tokens_seen": 136818325, "step": 6359, "time_per_iteration": 2.6151602268218994 }, { "auxiliary_loss_clip": 0.01140598, "auxiliary_loss_mlp": 0.02505905, "balance_loss_clip": 1.03381109, "balance_loss_mlp": 0.999856, "epoch": 0.7647447844646186, "flos": 70402332666240.0, "grad_norm": 0.8599922338209752, "language_loss": 0.65691185, "learning_rate": 5.528577531951173e-07, "loss": 0.6933769, "num_input_tokens_seen": 136878730, "step": 6360, "time_per_iteration": 3.262866735458374 }, { "auxiliary_loss_clip": 0.01275734, "auxiliary_loss_mlp": 0.01021492, "balance_loss_clip": 1.04701352, "balance_loss_mlp": 1.01489711, "epoch": 0.7648650273552576, "flos": 17675914965120.0, "grad_norm": 3.287680098640903, "language_loss": 0.74110419, "learning_rate": 5.523201779258653e-07, "loss": 0.76407647, "num_input_tokens_seen": 136897705, "step": 6361, "time_per_iteration": 2.6236345767974854 }, { "auxiliary_loss_clip": 0.0117419, "auxiliary_loss_mlp": 0.01032654, "balance_loss_clip": 1.04879963, "balance_loss_mlp": 1.02531099, "epoch": 0.7649852702458967, "flos": 22162019247360.0, "grad_norm": 1.9881197200048353, "language_loss": 0.84320873, "learning_rate": 5.517828222684912e-07, "loss": 0.86527717, "num_input_tokens_seen": 136918360, "step": 6362, "time_per_iteration": 2.61331844329834 }, { "auxiliary_loss_clip": 0.01170778, "auxiliary_loss_mlp": 0.00999132, "balance_loss_clip": 1.01129103, "balance_loss_mlp": 0.99816054, "epoch": 0.7651055131365359, "flos": 69848338227840.0, "grad_norm": 0.7688963970796907, "language_loss": 0.58973449, "learning_rate": 5.512456863045117e-07, "loss": 0.61143363, "num_input_tokens_seen": 136979050, "step": 6363, "time_per_iteration": 3.2237744331359863 }, { "auxiliary_loss_clip": 0.01175035, "auxiliary_loss_mlp": 0.0102478, "balance_loss_clip": 1.04878044, "balance_loss_mlp": 1.01746953, "epoch": 0.7652257560271749, "flos": 19464014931840.0, "grad_norm": 2.0772423071249055, "language_loss": 0.74018776, "learning_rate": 5.507087701154089e-07, "loss": 0.76218593, "num_input_tokens_seen": 136998970, "step": 6364, "time_per_iteration": 2.638357400894165 }, { "auxiliary_loss_clip": 0.01376864, "auxiliary_loss_mlp": 0.01024895, "balance_loss_clip": 1.04576755, "balance_loss_mlp": 1.01817417, "epoch": 0.765345998917814, "flos": 15961108700160.0, "grad_norm": 1.8510273383329774, "language_loss": 0.75490171, "learning_rate": 5.50172073782634e-07, "loss": 0.77891934, "num_input_tokens_seen": 137016950, "step": 6365, "time_per_iteration": 2.7240495681762695 }, { "auxiliary_loss_clip": 0.01321697, "auxiliary_loss_mlp": 0.01025424, "balance_loss_clip": 1.04949307, "balance_loss_mlp": 1.018888, "epoch": 0.7654662418084531, "flos": 23659853408640.0, "grad_norm": 1.78594469456083, "language_loss": 0.87850195, "learning_rate": 5.496355973876023e-07, "loss": 0.90197319, "num_input_tokens_seen": 137036205, "step": 6366, "time_per_iteration": 2.702868700027466 }, { "auxiliary_loss_clip": 0.01322706, "auxiliary_loss_mlp": 0.02568305, "balance_loss_clip": 1.04327083, "balance_loss_mlp": 0.99990547, "epoch": 0.7655864846990922, "flos": 41463608878080.0, "grad_norm": 6.089656377979419, "language_loss": 0.71362203, "learning_rate": 5.490993410116984e-07, "loss": 0.75253212, "num_input_tokens_seen": 137059195, "step": 6367, "time_per_iteration": 2.882114887237549 }, { "auxiliary_loss_clip": 0.01322722, "auxiliary_loss_mlp": 0.01024867, "balance_loss_clip": 1.04580677, "balance_loss_mlp": 1.01786053, "epoch": 0.7657067275897312, "flos": 43142684088960.0, "grad_norm": 1.6118983368832827, "language_loss": 0.69863456, "learning_rate": 5.485633047362704e-07, "loss": 0.72211051, "num_input_tokens_seen": 137081200, "step": 6368, "time_per_iteration": 2.859867572784424 }, { "auxiliary_loss_clip": 0.01180139, "auxiliary_loss_mlp": 0.01026924, "balance_loss_clip": 1.05332935, "balance_loss_mlp": 1.01993489, "epoch": 0.7658269704803703, "flos": 17311780840320.0, "grad_norm": 2.773695799726802, "language_loss": 0.78935415, "learning_rate": 5.480274886426341e-07, "loss": 0.81142473, "num_input_tokens_seen": 137097840, "step": 6369, "time_per_iteration": 2.6259210109710693 }, { "auxiliary_loss_clip": 0.0122593, "auxiliary_loss_mlp": 0.01021996, "balance_loss_clip": 1.0520364, "balance_loss_mlp": 1.01543057, "epoch": 0.7659472133710095, "flos": 12568160977920.0, "grad_norm": 2.2402579583670037, "language_loss": 0.77890122, "learning_rate": 5.474918928120744e-07, "loss": 0.80138052, "num_input_tokens_seen": 137114335, "step": 6370, "time_per_iteration": 2.5898306369781494 }, { "auxiliary_loss_clip": 0.01219138, "auxiliary_loss_mlp": 0.01029296, "balance_loss_clip": 1.04704833, "balance_loss_mlp": 1.02246869, "epoch": 0.7660674562616485, "flos": 22707430335360.0, "grad_norm": 1.941264987992297, "language_loss": 0.87429225, "learning_rate": 5.469565173258392e-07, "loss": 0.89677668, "num_input_tokens_seen": 137132850, "step": 6371, "time_per_iteration": 2.6447532176971436 }, { "auxiliary_loss_clip": 0.01178463, "auxiliary_loss_mlp": 0.01022354, "balance_loss_clip": 1.05030262, "balance_loss_mlp": 1.01484966, "epoch": 0.7661876991522876, "flos": 17056455989760.0, "grad_norm": 1.8012704733051066, "language_loss": 0.64102608, "learning_rate": 5.464213622651454e-07, "loss": 0.6630342, "num_input_tokens_seen": 137150665, "step": 6372, "time_per_iteration": 3.413572072982788 }, { "auxiliary_loss_clip": 0.01326648, "auxiliary_loss_mlp": 0.01021989, "balance_loss_clip": 1.04793692, "balance_loss_mlp": 1.01521456, "epoch": 0.7663079420429267, "flos": 20084228092800.0, "grad_norm": 2.073583220253837, "language_loss": 0.84306288, "learning_rate": 5.458864277111753e-07, "loss": 0.86654925, "num_input_tokens_seen": 137168500, "step": 6373, "time_per_iteration": 3.618757486343384 }, { "auxiliary_loss_clip": 0.01269321, "auxiliary_loss_mlp": 0.02561494, "balance_loss_clip": 1.04632068, "balance_loss_mlp": 0.99990529, "epoch": 0.7664281849335658, "flos": 12677473042560.0, "grad_norm": 2.60596640838214, "language_loss": 0.68989909, "learning_rate": 5.453517137450769e-07, "loss": 0.72820723, "num_input_tokens_seen": 137185075, "step": 6374, "time_per_iteration": 2.651176691055298 }, { "auxiliary_loss_clip": 0.0122373, "auxiliary_loss_mlp": 0.0102204, "balance_loss_clip": 1.0505569, "balance_loss_mlp": 1.01551378, "epoch": 0.7665484278242048, "flos": 22345271458560.0, "grad_norm": 1.6386470170136365, "language_loss": 0.76115036, "learning_rate": 5.448172204479684e-07, "loss": 0.78360808, "num_input_tokens_seen": 137204355, "step": 6375, "time_per_iteration": 3.4458539485931396 }, { "auxiliary_loss_clip": 0.01173222, "auxiliary_loss_mlp": 0.01028817, "balance_loss_clip": 1.04908621, "balance_loss_mlp": 1.02150702, "epoch": 0.766668670714844, "flos": 23617909301760.0, "grad_norm": 1.7971847543412012, "language_loss": 0.74655199, "learning_rate": 5.442829479009294e-07, "loss": 0.76857239, "num_input_tokens_seen": 137223135, "step": 6376, "time_per_iteration": 2.624239921569824 }, { "auxiliary_loss_clip": 0.01130667, "auxiliary_loss_mlp": 0.01026729, "balance_loss_clip": 1.04805923, "balance_loss_mlp": 1.01958799, "epoch": 0.7667889136054831, "flos": 19427134642560.0, "grad_norm": 1.8226818016947173, "language_loss": 0.71791887, "learning_rate": 5.437488961850103e-07, "loss": 0.73949277, "num_input_tokens_seen": 137242935, "step": 6377, "time_per_iteration": 2.619051694869995 }, { "auxiliary_loss_clip": 0.01364906, "auxiliary_loss_mlp": 0.01023564, "balance_loss_clip": 1.04463446, "balance_loss_mlp": 1.01752353, "epoch": 0.7669091564961221, "flos": 26866352609280.0, "grad_norm": 1.9187707125703009, "language_loss": 0.75435752, "learning_rate": 5.432150653812258e-07, "loss": 0.77824223, "num_input_tokens_seen": 137262970, "step": 6378, "time_per_iteration": 2.756120204925537 }, { "auxiliary_loss_clip": 0.01222419, "auxiliary_loss_mlp": 0.01026785, "balance_loss_clip": 1.05013442, "balance_loss_mlp": 1.02001381, "epoch": 0.7670293993867613, "flos": 12385303816320.0, "grad_norm": 2.0807618059489443, "language_loss": 0.8236661, "learning_rate": 5.42681455570557e-07, "loss": 0.84615821, "num_input_tokens_seen": 137279500, "step": 6379, "time_per_iteration": 2.58658766746521 }, { "auxiliary_loss_clip": 0.01173466, "auxiliary_loss_mlp": 0.01023039, "balance_loss_clip": 1.05024743, "balance_loss_mlp": 1.01629198, "epoch": 0.7671496422774003, "flos": 21762944167680.0, "grad_norm": 2.8160949659949943, "language_loss": 0.64803928, "learning_rate": 5.42148066833954e-07, "loss": 0.67000437, "num_input_tokens_seen": 137298745, "step": 6380, "time_per_iteration": 2.614374876022339 }, { "auxiliary_loss_clip": 0.01173919, "auxiliary_loss_mlp": 0.0102105, "balance_loss_clip": 1.05061543, "balance_loss_mlp": 1.01480937, "epoch": 0.7672698851680394, "flos": 21069221823360.0, "grad_norm": 2.285416712495025, "language_loss": 0.75487685, "learning_rate": 5.416148992523289e-07, "loss": 0.77682656, "num_input_tokens_seen": 137317320, "step": 6381, "time_per_iteration": 2.654759407043457 }, { "auxiliary_loss_clip": 0.01470306, "auxiliary_loss_mlp": 0.01027333, "balance_loss_clip": 1.04086769, "balance_loss_mlp": 1.02083659, "epoch": 0.7673901280586786, "flos": 16976697840000.0, "grad_norm": 1.9714192319747155, "language_loss": 0.78554267, "learning_rate": 5.410819529065644e-07, "loss": 0.81051898, "num_input_tokens_seen": 137335275, "step": 6382, "time_per_iteration": 3.767184019088745 }, { "auxiliary_loss_clip": 0.01367621, "auxiliary_loss_mlp": 0.01023891, "balance_loss_clip": 1.04155076, "balance_loss_mlp": 1.01735795, "epoch": 0.7675103709493176, "flos": 29242669697280.0, "grad_norm": 2.0327745553595844, "language_loss": 0.6518271, "learning_rate": 5.405492278775079e-07, "loss": 0.67574227, "num_input_tokens_seen": 137355055, "step": 6383, "time_per_iteration": 2.9145278930664062 }, { "auxiliary_loss_clip": 0.01277852, "auxiliary_loss_mlp": 0.01026105, "balance_loss_clip": 1.04433346, "balance_loss_mlp": 1.01865721, "epoch": 0.7676306138399567, "flos": 29023004073600.0, "grad_norm": 2.335101918689425, "language_loss": 0.79958642, "learning_rate": 5.400167242459732e-07, "loss": 0.82262605, "num_input_tokens_seen": 137374015, "step": 6384, "time_per_iteration": 2.7245540618896484 }, { "auxiliary_loss_clip": 0.01221424, "auxiliary_loss_mlp": 0.0102341, "balance_loss_clip": 1.04753029, "balance_loss_mlp": 1.01717246, "epoch": 0.7677508567305958, "flos": 22565116650240.0, "grad_norm": 10.442047718374237, "language_loss": 0.80464089, "learning_rate": 5.394844420927405e-07, "loss": 0.82708919, "num_input_tokens_seen": 137393625, "step": 6385, "time_per_iteration": 2.644745111465454 }, { "auxiliary_loss_clip": 0.01172842, "auxiliary_loss_mlp": 0.0102461, "balance_loss_clip": 1.04945946, "balance_loss_mlp": 1.01809525, "epoch": 0.7678710996212349, "flos": 25411432222080.0, "grad_norm": 2.4062462420811586, "language_loss": 0.73258698, "learning_rate": 5.389523814985562e-07, "loss": 0.75456154, "num_input_tokens_seen": 137413045, "step": 6386, "time_per_iteration": 2.624577045440674 }, { "auxiliary_loss_clip": 0.01376322, "auxiliary_loss_mlp": 0.01033042, "balance_loss_clip": 1.04382646, "balance_loss_mlp": 1.02608562, "epoch": 0.767991342511874, "flos": 26756825063040.0, "grad_norm": 1.7679749184341063, "language_loss": 0.76124573, "learning_rate": 5.384205425441344e-07, "loss": 0.78533942, "num_input_tokens_seen": 137433955, "step": 6387, "time_per_iteration": 2.8141911029815674 }, { "auxiliary_loss_clip": 0.0126843, "auxiliary_loss_mlp": 0.01024478, "balance_loss_clip": 1.04369438, "balance_loss_mlp": 1.01784706, "epoch": 0.7681115854025131, "flos": 26359509749760.0, "grad_norm": 1.6216784590860933, "language_loss": 0.84336627, "learning_rate": 5.378889253101537e-07, "loss": 0.86629534, "num_input_tokens_seen": 137454510, "step": 6388, "time_per_iteration": 2.710787057876587 }, { "auxiliary_loss_clip": 0.01222594, "auxiliary_loss_mlp": 0.01021211, "balance_loss_clip": 1.04518962, "balance_loss_mlp": 1.01478601, "epoch": 0.7682318282931522, "flos": 23257043314560.0, "grad_norm": 1.6922651094101255, "language_loss": 0.81228715, "learning_rate": 5.373575298772617e-07, "loss": 0.83472514, "num_input_tokens_seen": 137473630, "step": 6389, "time_per_iteration": 2.6289193630218506 }, { "auxiliary_loss_clip": 0.01125612, "auxiliary_loss_mlp": 0.01000475, "balance_loss_clip": 1.01079929, "balance_loss_mlp": 0.99944353, "epoch": 0.7683520711837912, "flos": 70072457137920.0, "grad_norm": 0.7563001926213312, "language_loss": 0.61227423, "learning_rate": 5.368263563260689e-07, "loss": 0.63353509, "num_input_tokens_seen": 137538765, "step": 6390, "time_per_iteration": 3.297755241394043 }, { "auxiliary_loss_clip": 0.01223949, "auxiliary_loss_mlp": 0.01024931, "balance_loss_clip": 1.04659617, "balance_loss_mlp": 1.0181365, "epoch": 0.7684723140744304, "flos": 18624890332800.0, "grad_norm": 2.177679176549905, "language_loss": 0.64145339, "learning_rate": 5.362954047371537e-07, "loss": 0.6639421, "num_input_tokens_seen": 137557875, "step": 6391, "time_per_iteration": 2.6172263622283936 }, { "auxiliary_loss_clip": 0.01324643, "auxiliary_loss_mlp": 0.01024143, "balance_loss_clip": 1.05180931, "balance_loss_mlp": 1.01674032, "epoch": 0.7685925569650695, "flos": 27452989532160.0, "grad_norm": 1.8243167702416283, "language_loss": 0.72368777, "learning_rate": 5.357646751910627e-07, "loss": 0.74717557, "num_input_tokens_seen": 137579055, "step": 6392, "time_per_iteration": 2.713099479675293 }, { "auxiliary_loss_clip": 0.01271372, "auxiliary_loss_mlp": 0.01028193, "balance_loss_clip": 1.04685283, "balance_loss_mlp": 1.02054918, "epoch": 0.7687127998557085, "flos": 24535714642560.0, "grad_norm": 2.523214208722722, "language_loss": 0.80119216, "learning_rate": 5.352341677683061e-07, "loss": 0.82418787, "num_input_tokens_seen": 137600355, "step": 6393, "time_per_iteration": 2.7305908203125 }, { "auxiliary_loss_clip": 0.01329323, "auxiliary_loss_mlp": 0.0102644, "balance_loss_clip": 1.04631996, "balance_loss_mlp": 1.01935017, "epoch": 0.7688330427463477, "flos": 25155963717120.0, "grad_norm": 1.790487169270661, "language_loss": 0.79074955, "learning_rate": 5.347038825493617e-07, "loss": 0.81430709, "num_input_tokens_seen": 137621885, "step": 6394, "time_per_iteration": 2.7279727458953857 }, { "auxiliary_loss_clip": 0.01267545, "auxiliary_loss_mlp": 0.01029851, "balance_loss_clip": 1.04838228, "balance_loss_mlp": 1.02330637, "epoch": 0.7689532856369867, "flos": 21211284113280.0, "grad_norm": 2.726584023331789, "language_loss": 0.68433857, "learning_rate": 5.341738196146732e-07, "loss": 0.70731258, "num_input_tokens_seen": 137640230, "step": 6395, "time_per_iteration": 2.6466407775878906 }, { "auxiliary_loss_clip": 0.01222439, "auxiliary_loss_mlp": 0.01029972, "balance_loss_clip": 1.0463109, "balance_loss_mlp": 1.02347231, "epoch": 0.7690735285276258, "flos": 25119083427840.0, "grad_norm": 2.693422032812867, "language_loss": 0.73623395, "learning_rate": 5.336439790446503e-07, "loss": 0.75875807, "num_input_tokens_seen": 137659330, "step": 6396, "time_per_iteration": 2.655944347381592 }, { "auxiliary_loss_clip": 0.01323747, "auxiliary_loss_mlp": 0.01023373, "balance_loss_clip": 1.04305947, "balance_loss_mlp": 1.01595211, "epoch": 0.769193771418265, "flos": 54744020640000.0, "grad_norm": 2.6714074853121517, "language_loss": 0.61972147, "learning_rate": 5.331143609196711e-07, "loss": 0.64319265, "num_input_tokens_seen": 137683145, "step": 6397, "time_per_iteration": 3.0158629417419434 }, { "auxiliary_loss_clip": 0.01221345, "auxiliary_loss_mlp": 0.01025488, "balance_loss_clip": 1.04836178, "balance_loss_mlp": 1.01855576, "epoch": 0.769314014308904, "flos": 37341890115840.0, "grad_norm": 1.7257421082034516, "language_loss": 0.77462626, "learning_rate": 5.325849653200758e-07, "loss": 0.79709464, "num_input_tokens_seen": 137707095, "step": 6398, "time_per_iteration": 3.8367178440093994 }, { "auxiliary_loss_clip": 0.01175549, "auxiliary_loss_mlp": 0.01021969, "balance_loss_clip": 1.05140424, "balance_loss_mlp": 1.01524889, "epoch": 0.7694342571995431, "flos": 20631686256000.0, "grad_norm": 1.6760277400774874, "language_loss": 0.76595366, "learning_rate": 5.32055792326175e-07, "loss": 0.78792882, "num_input_tokens_seen": 137725520, "step": 6399, "time_per_iteration": 3.521735191345215 }, { "auxiliary_loss_clip": 0.01274843, "auxiliary_loss_mlp": 0.01025059, "balance_loss_clip": 1.04898024, "balance_loss_mlp": 1.01784933, "epoch": 0.7695545000901821, "flos": 24207706621440.0, "grad_norm": 2.2236331244873386, "language_loss": 0.72574556, "learning_rate": 5.315268420182437e-07, "loss": 0.74874461, "num_input_tokens_seen": 137744195, "step": 6400, "time_per_iteration": 2.651752233505249 }, { "auxiliary_loss_clip": 0.01327906, "auxiliary_loss_mlp": 0.02564643, "balance_loss_clip": 1.04676604, "balance_loss_mlp": 0.99991435, "epoch": 0.7696747429808213, "flos": 28001273708160.0, "grad_norm": 2.127844734913713, "language_loss": 0.76421332, "learning_rate": 5.309981144765221e-07, "loss": 0.80313885, "num_input_tokens_seen": 137764340, "step": 6401, "time_per_iteration": 3.5868334770202637 }, { "auxiliary_loss_clip": 0.01376839, "auxiliary_loss_mlp": 0.01023819, "balance_loss_clip": 1.04418647, "balance_loss_mlp": 1.01694918, "epoch": 0.7697949858714603, "flos": 11509550323200.0, "grad_norm": 2.4744340979915402, "language_loss": 0.75721788, "learning_rate": 5.304696097812196e-07, "loss": 0.78122449, "num_input_tokens_seen": 137780940, "step": 6402, "time_per_iteration": 2.712481737136841 }, { "auxiliary_loss_clip": 0.01270338, "auxiliary_loss_mlp": 0.0102368, "balance_loss_clip": 1.04413271, "balance_loss_mlp": 1.01615834, "epoch": 0.7699152287620994, "flos": 26688271956480.0, "grad_norm": 3.598583548951113, "language_loss": 0.61135828, "learning_rate": 5.299413280125078e-07, "loss": 0.6342985, "num_input_tokens_seen": 137799250, "step": 6403, "time_per_iteration": 2.6653027534484863 }, { "auxiliary_loss_clip": 0.01273017, "auxiliary_loss_mlp": 0.01033394, "balance_loss_clip": 1.04522967, "balance_loss_mlp": 1.02622354, "epoch": 0.7700354716527386, "flos": 16544944362240.0, "grad_norm": 2.0702568109659674, "language_loss": 0.72861302, "learning_rate": 5.294132692505284e-07, "loss": 0.75167716, "num_input_tokens_seen": 137817660, "step": 6404, "time_per_iteration": 2.688532590866089 }, { "auxiliary_loss_clip": 0.01364776, "auxiliary_loss_mlp": 0.01026767, "balance_loss_clip": 1.04010117, "balance_loss_mlp": 1.01990318, "epoch": 0.7701557145433776, "flos": 19242733196160.0, "grad_norm": 2.102711346714565, "language_loss": 0.79165745, "learning_rate": 5.288854335753861e-07, "loss": 0.81557286, "num_input_tokens_seen": 137835920, "step": 6405, "time_per_iteration": 2.74121356010437 }, { "auxiliary_loss_clip": 0.01226329, "auxiliary_loss_mlp": 0.01022316, "balance_loss_clip": 1.04867327, "balance_loss_mlp": 1.01549423, "epoch": 0.7702759574340167, "flos": 31685744211840.0, "grad_norm": 1.5784211072343581, "language_loss": 0.75720203, "learning_rate": 5.283578210671551e-07, "loss": 0.77968848, "num_input_tokens_seen": 137858160, "step": 6406, "time_per_iteration": 2.734612226486206 }, { "auxiliary_loss_clip": 0.01276237, "auxiliary_loss_mlp": 0.0102306, "balance_loss_clip": 1.04723489, "balance_loss_mlp": 1.01643467, "epoch": 0.7703962003246558, "flos": 16800089644800.0, "grad_norm": 3.050296277591386, "language_loss": 0.76949406, "learning_rate": 5.278304318058719e-07, "loss": 0.79248703, "num_input_tokens_seen": 137876015, "step": 6407, "time_per_iteration": 2.715555429458618 }, { "auxiliary_loss_clip": 0.01424308, "auxiliary_loss_mlp": 0.01029028, "balance_loss_clip": 1.04459691, "balance_loss_mlp": 1.02217662, "epoch": 0.7705164432152949, "flos": 35736072693120.0, "grad_norm": 1.7206763388753927, "language_loss": 0.79287958, "learning_rate": 5.273032658715411e-07, "loss": 0.81741285, "num_input_tokens_seen": 137898825, "step": 6408, "time_per_iteration": 3.7544682025909424 }, { "auxiliary_loss_clip": 0.01367897, "auxiliary_loss_mlp": 0.010273, "balance_loss_clip": 1.04206109, "balance_loss_mlp": 1.0202632, "epoch": 0.7706366861059339, "flos": 23365960329600.0, "grad_norm": 2.1179562836836423, "language_loss": 0.76555061, "learning_rate": 5.267763233441347e-07, "loss": 0.78950262, "num_input_tokens_seen": 137919455, "step": 6409, "time_per_iteration": 2.7582266330718994 }, { "auxiliary_loss_clip": 0.01226328, "auxiliary_loss_mlp": 0.01033633, "balance_loss_clip": 1.04942012, "balance_loss_mlp": 1.02657604, "epoch": 0.7707569289965731, "flos": 22929897219840.0, "grad_norm": 3.4484466487327077, "language_loss": 0.69967902, "learning_rate": 5.26249604303588e-07, "loss": 0.72227865, "num_input_tokens_seen": 137937960, "step": 6410, "time_per_iteration": 2.641677141189575 }, { "auxiliary_loss_clip": 0.01174421, "auxiliary_loss_mlp": 0.01021831, "balance_loss_clip": 1.04992557, "balance_loss_mlp": 1.0151794, "epoch": 0.7708771718872122, "flos": 17420661941760.0, "grad_norm": 2.138439361576134, "language_loss": 0.78342903, "learning_rate": 5.257231088298057e-07, "loss": 0.80539155, "num_input_tokens_seen": 137956370, "step": 6411, "time_per_iteration": 2.5665149688720703 }, { "auxiliary_loss_clip": 0.01220209, "auxiliary_loss_mlp": 0.00999196, "balance_loss_clip": 1.0115912, "balance_loss_mlp": 0.99829596, "epoch": 0.7709974147778512, "flos": 72241316248320.0, "grad_norm": 0.7985990304701299, "language_loss": 0.53899252, "learning_rate": 5.25196837002655e-07, "loss": 0.56118655, "num_input_tokens_seen": 138016080, "step": 6412, "time_per_iteration": 3.30989670753479 }, { "auxiliary_loss_clip": 0.0126958, "auxiliary_loss_mlp": 0.01023811, "balance_loss_clip": 1.0456866, "balance_loss_mlp": 1.0168556, "epoch": 0.7711176576684904, "flos": 39859694876160.0, "grad_norm": 6.163278091915123, "language_loss": 0.68266058, "learning_rate": 5.24670788901971e-07, "loss": 0.70559454, "num_input_tokens_seen": 138039170, "step": 6413, "time_per_iteration": 2.78220796585083 }, { "auxiliary_loss_clip": 0.0127679, "auxiliary_loss_mlp": 0.01030582, "balance_loss_clip": 1.04852724, "balance_loss_mlp": 1.02241671, "epoch": 0.7712379005591294, "flos": 36976391274240.0, "grad_norm": 3.013585676152656, "language_loss": 0.69064665, "learning_rate": 5.241449646075557e-07, "loss": 0.71372038, "num_input_tokens_seen": 138062395, "step": 6414, "time_per_iteration": 2.8258821964263916 }, { "auxiliary_loss_clip": 0.01229859, "auxiliary_loss_mlp": 0.01025935, "balance_loss_clip": 1.04779184, "balance_loss_mlp": 1.01912832, "epoch": 0.7713581434497685, "flos": 22776773541120.0, "grad_norm": 2.0126002298655776, "language_loss": 0.72369552, "learning_rate": 5.236193641991762e-07, "loss": 0.74625349, "num_input_tokens_seen": 138080325, "step": 6415, "time_per_iteration": 2.626279592514038 }, { "auxiliary_loss_clip": 0.01265638, "auxiliary_loss_mlp": 0.01023538, "balance_loss_clip": 1.04645908, "balance_loss_mlp": 1.01712751, "epoch": 0.7714783863404077, "flos": 24097460803200.0, "grad_norm": 2.0401778793619094, "language_loss": 0.69792545, "learning_rate": 5.23093987756565e-07, "loss": 0.72081721, "num_input_tokens_seen": 138099020, "step": 6416, "time_per_iteration": 2.692166566848755 }, { "auxiliary_loss_clip": 0.01330639, "auxiliary_loss_mlp": 0.01024113, "balance_loss_clip": 1.04472113, "balance_loss_mlp": 1.01678193, "epoch": 0.7715986292310467, "flos": 21063655215360.0, "grad_norm": 2.0965703144657626, "language_loss": 0.75387222, "learning_rate": 5.225688353594217e-07, "loss": 0.77741981, "num_input_tokens_seen": 138118650, "step": 6417, "time_per_iteration": 2.745495080947876 }, { "auxiliary_loss_clip": 0.01279066, "auxiliary_loss_mlp": 0.02563431, "balance_loss_clip": 1.04935801, "balance_loss_mlp": 0.99993861, "epoch": 0.7717188721216858, "flos": 20594877793920.0, "grad_norm": 2.197535586187941, "language_loss": 0.77938187, "learning_rate": 5.220439070874108e-07, "loss": 0.81780684, "num_input_tokens_seen": 138137890, "step": 6418, "time_per_iteration": 2.6982784271240234 }, { "auxiliary_loss_clip": 0.01222065, "auxiliary_loss_mlp": 0.01031215, "balance_loss_clip": 1.04935265, "balance_loss_mlp": 1.02453029, "epoch": 0.7718391150123249, "flos": 26250951870720.0, "grad_norm": 1.6842850282506423, "language_loss": 0.70859224, "learning_rate": 5.215192030201652e-07, "loss": 0.73112506, "num_input_tokens_seen": 138158880, "step": 6419, "time_per_iteration": 2.7129921913146973 }, { "auxiliary_loss_clip": 0.01310627, "auxiliary_loss_mlp": 0.01025155, "balance_loss_clip": 1.03911924, "balance_loss_mlp": 1.01796639, "epoch": 0.771959357902964, "flos": 22049762267520.0, "grad_norm": 1.85707756961009, "language_loss": 0.86287105, "learning_rate": 5.209947232372798e-07, "loss": 0.8862288, "num_input_tokens_seen": 138176370, "step": 6420, "time_per_iteration": 2.7136452198028564 }, { "auxiliary_loss_clip": 0.01226259, "auxiliary_loss_mlp": 0.02565691, "balance_loss_clip": 1.04714346, "balance_loss_mlp": 0.99993813, "epoch": 0.772079600793603, "flos": 30446000248320.0, "grad_norm": 2.0496735105689643, "language_loss": 0.80952334, "learning_rate": 5.204704678183196e-07, "loss": 0.84744287, "num_input_tokens_seen": 138195105, "step": 6421, "time_per_iteration": 2.767970085144043 }, { "auxiliary_loss_clip": 0.01175639, "auxiliary_loss_mlp": 0.01029449, "balance_loss_clip": 1.05108261, "balance_loss_mlp": 1.02243972, "epoch": 0.7721998436842422, "flos": 12969857750400.0, "grad_norm": 2.1006012463167583, "language_loss": 0.85202432, "learning_rate": 5.19946436842813e-07, "loss": 0.87407517, "num_input_tokens_seen": 138212235, "step": 6422, "time_per_iteration": 2.5519728660583496 }, { "auxiliary_loss_clip": 0.01328239, "auxiliary_loss_mlp": 0.01023478, "balance_loss_clip": 1.04884684, "balance_loss_mlp": 1.01663566, "epoch": 0.7723200865748813, "flos": 32635509678720.0, "grad_norm": 1.7415384055499747, "language_loss": 0.68658686, "learning_rate": 5.194226303902546e-07, "loss": 0.71010411, "num_input_tokens_seen": 138231970, "step": 6423, "time_per_iteration": 2.7906036376953125 }, { "auxiliary_loss_clip": 0.01270003, "auxiliary_loss_mlp": 0.01031465, "balance_loss_clip": 1.04536963, "balance_loss_mlp": 1.02434492, "epoch": 0.7724403294655203, "flos": 21105707063040.0, "grad_norm": 1.8047932794349952, "language_loss": 0.7113446, "learning_rate": 5.188990485401072e-07, "loss": 0.73435926, "num_input_tokens_seen": 138251175, "step": 6424, "time_per_iteration": 3.556642770767212 }, { "auxiliary_loss_clip": 0.01224907, "auxiliary_loss_mlp": 0.01029786, "balance_loss_clip": 1.04912043, "balance_loss_mlp": 1.02246666, "epoch": 0.7725605723561595, "flos": 22090736707200.0, "grad_norm": 2.25428871343897, "language_loss": 0.86459869, "learning_rate": 5.183756913717954e-07, "loss": 0.88714564, "num_input_tokens_seen": 138270950, "step": 6425, "time_per_iteration": 3.5687780380249023 }, { "auxiliary_loss_clip": 0.01269013, "auxiliary_loss_mlp": 0.01024313, "balance_loss_clip": 1.04732037, "balance_loss_mlp": 1.01746416, "epoch": 0.7726808152467985, "flos": 34495610457600.0, "grad_norm": 2.270852678915111, "language_loss": 0.73507631, "learning_rate": 5.178525589647136e-07, "loss": 0.75800955, "num_input_tokens_seen": 138292590, "step": 6426, "time_per_iteration": 2.8162386417388916 }, { "auxiliary_loss_clip": 0.01279962, "auxiliary_loss_mlp": 0.01024724, "balance_loss_clip": 1.04778659, "balance_loss_mlp": 1.01809573, "epoch": 0.7728010581374376, "flos": 22306344094080.0, "grad_norm": 1.8748520035517338, "language_loss": 0.78602397, "learning_rate": 5.173296513982197e-07, "loss": 0.80907083, "num_input_tokens_seen": 138311115, "step": 6427, "time_per_iteration": 3.540137529373169 }, { "auxiliary_loss_clip": 0.01333901, "auxiliary_loss_mlp": 0.01026818, "balance_loss_clip": 1.04814291, "balance_loss_mlp": 1.01972163, "epoch": 0.7729213010280768, "flos": 27126453968640.0, "grad_norm": 3.731314244208566, "language_loss": 0.65029931, "learning_rate": 5.168069687516398e-07, "loss": 0.67390645, "num_input_tokens_seen": 138330885, "step": 6428, "time_per_iteration": 2.761063814163208 }, { "auxiliary_loss_clip": 0.01272299, "auxiliary_loss_mlp": 0.01032362, "balance_loss_clip": 1.04906189, "balance_loss_mlp": 1.02530456, "epoch": 0.7730415439187158, "flos": 18150223080960.0, "grad_norm": 1.9176804301240105, "language_loss": 0.72029316, "learning_rate": 5.16284511104263e-07, "loss": 0.74333978, "num_input_tokens_seen": 138350020, "step": 6429, "time_per_iteration": 2.646824359893799 }, { "auxiliary_loss_clip": 0.01268937, "auxiliary_loss_mlp": 0.01024624, "balance_loss_clip": 1.04684663, "balance_loss_mlp": 1.01767099, "epoch": 0.7731617868093549, "flos": 11947480940160.0, "grad_norm": 3.0275037319196563, "language_loss": 0.80771017, "learning_rate": 5.157622785353457e-07, "loss": 0.8306458, "num_input_tokens_seen": 138368135, "step": 6430, "time_per_iteration": 2.6584599018096924 }, { "auxiliary_loss_clip": 0.0112022, "auxiliary_loss_mlp": 0.01001513, "balance_loss_clip": 1.00989723, "balance_loss_mlp": 1.00054729, "epoch": 0.7732820296999939, "flos": 64201027069440.0, "grad_norm": 0.6542652116491539, "language_loss": 0.60298318, "learning_rate": 5.152402711241113e-07, "loss": 0.62420052, "num_input_tokens_seen": 138436040, "step": 6431, "time_per_iteration": 3.2888240814208984 }, { "auxiliary_loss_clip": 0.01317832, "auxiliary_loss_mlp": 0.01025067, "balance_loss_clip": 1.04142511, "balance_loss_mlp": 1.01832926, "epoch": 0.7734022725906331, "flos": 25302191984640.0, "grad_norm": 1.8409270634393495, "language_loss": 0.83216739, "learning_rate": 5.147184889497465e-07, "loss": 0.85559642, "num_input_tokens_seen": 138455510, "step": 6432, "time_per_iteration": 2.6844303607940674 }, { "auxiliary_loss_clip": 0.01310625, "auxiliary_loss_mlp": 0.01027448, "balance_loss_clip": 1.04255605, "balance_loss_mlp": 1.0200634, "epoch": 0.7735225154812722, "flos": 17347440067200.0, "grad_norm": 2.666724583133917, "language_loss": 0.79889798, "learning_rate": 5.141969320914072e-07, "loss": 0.82227874, "num_input_tokens_seen": 138473015, "step": 6433, "time_per_iteration": 2.6946356296539307 }, { "auxiliary_loss_clip": 0.01177975, "auxiliary_loss_mlp": 0.01031768, "balance_loss_clip": 1.05015993, "balance_loss_mlp": 1.02409649, "epoch": 0.7736427583719112, "flos": 32630086725120.0, "grad_norm": 3.472897733931272, "language_loss": 0.62583441, "learning_rate": 5.136756006282113e-07, "loss": 0.64793181, "num_input_tokens_seen": 138491680, "step": 6434, "time_per_iteration": 3.609440803527832 }, { "auxiliary_loss_clip": 0.01176772, "auxiliary_loss_mlp": 0.01026344, "balance_loss_clip": 1.05089784, "balance_loss_mlp": 1.01919174, "epoch": 0.7737630012625504, "flos": 19860073269120.0, "grad_norm": 3.194374980593014, "language_loss": 0.84765595, "learning_rate": 5.131544946392446e-07, "loss": 0.86968714, "num_input_tokens_seen": 138506960, "step": 6435, "time_per_iteration": 2.6465771198272705 }, { "auxiliary_loss_clip": 0.01277435, "auxiliary_loss_mlp": 0.01035299, "balance_loss_clip": 1.05196881, "balance_loss_mlp": 1.02768457, "epoch": 0.7738832441531894, "flos": 36022639397760.0, "grad_norm": 2.4628590757949316, "language_loss": 0.64373851, "learning_rate": 5.126336142035592e-07, "loss": 0.66686583, "num_input_tokens_seen": 138526995, "step": 6436, "time_per_iteration": 2.787498950958252 }, { "auxiliary_loss_clip": 0.01274902, "auxiliary_loss_mlp": 0.01028761, "balance_loss_clip": 1.04596579, "balance_loss_mlp": 1.02168882, "epoch": 0.7740034870438285, "flos": 13405274415360.0, "grad_norm": 4.4384433375599395, "language_loss": 0.72224635, "learning_rate": 5.121129594001721e-07, "loss": 0.74528301, "num_input_tokens_seen": 138541260, "step": 6437, "time_per_iteration": 2.75559401512146 }, { "auxiliary_loss_clip": 0.01219733, "auxiliary_loss_mlp": 0.01029838, "balance_loss_clip": 1.04952884, "balance_loss_mlp": 1.0228287, "epoch": 0.7741237299344677, "flos": 22086714384000.0, "grad_norm": 1.6150242945474271, "language_loss": 0.81413436, "learning_rate": 5.115925303080661e-07, "loss": 0.83663011, "num_input_tokens_seen": 138560970, "step": 6438, "time_per_iteration": 2.658479928970337 }, { "auxiliary_loss_clip": 0.01174538, "auxiliary_loss_mlp": 0.01021412, "balance_loss_clip": 1.04801238, "balance_loss_mlp": 1.0150938, "epoch": 0.7742439728251067, "flos": 19864777950720.0, "grad_norm": 1.9785420487531769, "language_loss": 0.79639161, "learning_rate": 5.110723270061899e-07, "loss": 0.81835109, "num_input_tokens_seen": 138577460, "step": 6439, "time_per_iteration": 2.623114824295044 }, { "auxiliary_loss_clip": 0.01172746, "auxiliary_loss_mlp": 0.01023642, "balance_loss_clip": 1.04965138, "balance_loss_mlp": 1.01706171, "epoch": 0.7743642157157458, "flos": 16690167048960.0, "grad_norm": 3.277821782427305, "language_loss": 0.79557353, "learning_rate": 5.105523495734572e-07, "loss": 0.81753743, "num_input_tokens_seen": 138594860, "step": 6440, "time_per_iteration": 2.59525728225708 }, { "auxiliary_loss_clip": 0.01176431, "auxiliary_loss_mlp": 0.01023795, "balance_loss_clip": 1.04912162, "balance_loss_mlp": 1.01681495, "epoch": 0.7744844586063849, "flos": 20304360593280.0, "grad_norm": 1.990722522488836, "language_loss": 0.7540406, "learning_rate": 5.100325980887499e-07, "loss": 0.77604282, "num_input_tokens_seen": 138614785, "step": 6441, "time_per_iteration": 2.6053881645202637 }, { "auxiliary_loss_clip": 0.01181241, "auxiliary_loss_mlp": 0.01027218, "balance_loss_clip": 1.04847503, "balance_loss_mlp": 1.02050984, "epoch": 0.774604701497024, "flos": 22966705681920.0, "grad_norm": 2.056177005806305, "language_loss": 0.83324683, "learning_rate": 5.095130726309116e-07, "loss": 0.85533142, "num_input_tokens_seen": 138634960, "step": 6442, "time_per_iteration": 2.6991353034973145 }, { "auxiliary_loss_clip": 0.0106446, "auxiliary_loss_mlp": 0.01003224, "balance_loss_clip": 1.00902939, "balance_loss_mlp": 1.0022881, "epoch": 0.774724944387663, "flos": 60288523073280.0, "grad_norm": 0.7975465898478871, "language_loss": 0.59002739, "learning_rate": 5.089937732787559e-07, "loss": 0.61070424, "num_input_tokens_seen": 138699520, "step": 6443, "time_per_iteration": 3.203735113143921 }, { "auxiliary_loss_clip": 0.01322855, "auxiliary_loss_mlp": 0.01024348, "balance_loss_clip": 1.04358697, "balance_loss_mlp": 1.01702809, "epoch": 0.7748451872783022, "flos": 26761026954240.0, "grad_norm": 2.29623264409304, "language_loss": 0.67012691, "learning_rate": 5.084747001110592e-07, "loss": 0.69359899, "num_input_tokens_seen": 138719145, "step": 6444, "time_per_iteration": 2.753668785095215 }, { "auxiliary_loss_clip": 0.01227373, "auxiliary_loss_mlp": 0.02564501, "balance_loss_clip": 1.05399227, "balance_loss_mlp": 0.99989426, "epoch": 0.7749654301689413, "flos": 30338627518080.0, "grad_norm": 2.3876572169900405, "language_loss": 0.70295572, "learning_rate": 5.07955853206564e-07, "loss": 0.74087447, "num_input_tokens_seen": 138743850, "step": 6445, "time_per_iteration": 2.717557668685913 }, { "auxiliary_loss_clip": 0.01227576, "auxiliary_loss_mlp": 0.01024068, "balance_loss_clip": 1.04928303, "balance_loss_mlp": 1.01692724, "epoch": 0.7750856730595803, "flos": 43179851687040.0, "grad_norm": 2.0764614967293795, "language_loss": 0.7101016, "learning_rate": 5.074372326439807e-07, "loss": 0.73261803, "num_input_tokens_seen": 138766860, "step": 6446, "time_per_iteration": 2.799731731414795 }, { "auxiliary_loss_clip": 0.01319585, "auxiliary_loss_mlp": 0.01020654, "balance_loss_clip": 1.04331064, "balance_loss_mlp": 1.01301837, "epoch": 0.7752059159502195, "flos": 17640040256640.0, "grad_norm": 4.158024971019772, "language_loss": 0.73843294, "learning_rate": 5.069188385019814e-07, "loss": 0.76183534, "num_input_tokens_seen": 138784560, "step": 6447, "time_per_iteration": 2.6515884399414062 }, { "auxiliary_loss_clip": 0.01377742, "auxiliary_loss_mlp": 0.01031319, "balance_loss_clip": 1.04215133, "balance_loss_mlp": 1.02419019, "epoch": 0.7753261588408585, "flos": 12677688524160.0, "grad_norm": 3.1845236178073075, "language_loss": 0.60891461, "learning_rate": 5.064006708592077e-07, "loss": 0.63300514, "num_input_tokens_seen": 138800805, "step": 6448, "time_per_iteration": 2.760885715484619 }, { "auxiliary_loss_clip": 0.01263172, "auxiliary_loss_mlp": 0.01022989, "balance_loss_clip": 1.04614997, "balance_loss_mlp": 1.01647758, "epoch": 0.7754464017314976, "flos": 16690741666560.0, "grad_norm": 2.430598373594709, "language_loss": 0.75401688, "learning_rate": 5.058827297942641e-07, "loss": 0.77687848, "num_input_tokens_seen": 138815910, "step": 6449, "time_per_iteration": 2.6097042560577393 }, { "auxiliary_loss_clip": 0.01179606, "auxiliary_loss_mlp": 0.01027068, "balance_loss_clip": 1.0480051, "balance_loss_mlp": 1.02015662, "epoch": 0.7755666446221368, "flos": 19718944732800.0, "grad_norm": 1.9902076202986638, "language_loss": 0.75137115, "learning_rate": 5.053650153857237e-07, "loss": 0.77343786, "num_input_tokens_seen": 138834920, "step": 6450, "time_per_iteration": 3.5692994594573975 }, { "auxiliary_loss_clip": 0.0121995, "auxiliary_loss_mlp": 0.01024257, "balance_loss_clip": 1.04793429, "balance_loss_mlp": 1.01755738, "epoch": 0.7756868875127758, "flos": 18693623007360.0, "grad_norm": 3.196147529921042, "language_loss": 0.69734943, "learning_rate": 5.048475277121214e-07, "loss": 0.71979153, "num_input_tokens_seen": 138852135, "step": 6451, "time_per_iteration": 3.5589799880981445 }, { "auxiliary_loss_clip": 0.01223568, "auxiliary_loss_mlp": 0.01024878, "balance_loss_clip": 1.04815912, "balance_loss_mlp": 1.01786852, "epoch": 0.7758071304034149, "flos": 28404191543040.0, "grad_norm": 2.070193694666023, "language_loss": 0.77256083, "learning_rate": 5.043302668519598e-07, "loss": 0.79504532, "num_input_tokens_seen": 138871470, "step": 6452, "time_per_iteration": 2.707768201828003 }, { "auxiliary_loss_clip": 0.01226123, "auxiliary_loss_mlp": 0.0102831, "balance_loss_clip": 1.04702163, "balance_loss_mlp": 1.0213629, "epoch": 0.775927373294054, "flos": 20595344670720.0, "grad_norm": 1.8049542897624458, "language_loss": 0.72059834, "learning_rate": 5.038132328837079e-07, "loss": 0.74314266, "num_input_tokens_seen": 138889860, "step": 6453, "time_per_iteration": 3.634740114212036 }, { "auxiliary_loss_clip": 0.01224745, "auxiliary_loss_mlp": 0.0102546, "balance_loss_clip": 1.04889369, "balance_loss_mlp": 1.01871872, "epoch": 0.7760476161846931, "flos": 22526368853760.0, "grad_norm": 2.75350975775893, "language_loss": 0.74589634, "learning_rate": 5.032964258857993e-07, "loss": 0.7683984, "num_input_tokens_seen": 138909955, "step": 6454, "time_per_iteration": 2.6494932174682617 }, { "auxiliary_loss_clip": 0.01221299, "auxiliary_loss_mlp": 0.01030457, "balance_loss_clip": 1.04453373, "balance_loss_mlp": 1.02312887, "epoch": 0.7761678590753321, "flos": 48651488403840.0, "grad_norm": 1.6042215221610898, "language_loss": 0.68235731, "learning_rate": 5.027798459366329e-07, "loss": 0.70487487, "num_input_tokens_seen": 138935320, "step": 6455, "time_per_iteration": 2.85683274269104 }, { "auxiliary_loss_clip": 0.01227883, "auxiliary_loss_mlp": 0.0102594, "balance_loss_clip": 1.04809773, "balance_loss_mlp": 1.01882946, "epoch": 0.7762881019659713, "flos": 26177047637760.0, "grad_norm": 2.7670064408032955, "language_loss": 0.63608736, "learning_rate": 5.02263493114573e-07, "loss": 0.6586256, "num_input_tokens_seen": 138957115, "step": 6456, "time_per_iteration": 2.6154611110687256 }, { "auxiliary_loss_clip": 0.01174669, "auxiliary_loss_mlp": 0.01025748, "balance_loss_clip": 1.04982972, "balance_loss_mlp": 1.01900673, "epoch": 0.7764083448566104, "flos": 20588341518720.0, "grad_norm": 2.4522007970313338, "language_loss": 0.7663309, "learning_rate": 5.017473674979502e-07, "loss": 0.78833508, "num_input_tokens_seen": 138973140, "step": 6457, "time_per_iteration": 2.5260229110717773 }, { "auxiliary_loss_clip": 0.01215865, "auxiliary_loss_mlp": 0.01003852, "balance_loss_clip": 1.01145029, "balance_loss_mlp": 1.00294614, "epoch": 0.7765285877472494, "flos": 67293078560640.0, "grad_norm": 0.7428098261801143, "language_loss": 0.58259106, "learning_rate": 5.01231469165061e-07, "loss": 0.60478818, "num_input_tokens_seen": 139028965, "step": 6458, "time_per_iteration": 3.1412134170532227 }, { "auxiliary_loss_clip": 0.01118528, "auxiliary_loss_mlp": 0.01001337, "balance_loss_clip": 1.00945067, "balance_loss_mlp": 1.0003953, "epoch": 0.7766488306378886, "flos": 61344476121600.0, "grad_norm": 0.8245731656388763, "language_loss": 0.56806552, "learning_rate": 5.007157981941663e-07, "loss": 0.58926415, "num_input_tokens_seen": 139094325, "step": 6459, "time_per_iteration": 4.163167953491211 }, { "auxiliary_loss_clip": 0.01173262, "auxiliary_loss_mlp": 0.00999935, "balance_loss_clip": 1.00989461, "balance_loss_mlp": 0.99898165, "epoch": 0.7767690735285276, "flos": 62946199393920.0, "grad_norm": 0.8799733442717024, "language_loss": 0.67400622, "learning_rate": 5.002003546634928e-07, "loss": 0.6957382, "num_input_tokens_seen": 139150425, "step": 6460, "time_per_iteration": 3.159785270690918 }, { "auxiliary_loss_clip": 0.01375949, "auxiliary_loss_mlp": 0.01031197, "balance_loss_clip": 1.04915714, "balance_loss_mlp": 1.02475989, "epoch": 0.7768893164191667, "flos": 20886400575360.0, "grad_norm": 1.607777177927785, "language_loss": 0.76212275, "learning_rate": 4.996851386512331e-07, "loss": 0.78619421, "num_input_tokens_seen": 139169130, "step": 6461, "time_per_iteration": 2.8033576011657715 }, { "auxiliary_loss_clip": 0.01270171, "auxiliary_loss_mlp": 0.01023566, "balance_loss_clip": 1.0474782, "balance_loss_mlp": 1.01610947, "epoch": 0.7770095593098058, "flos": 20704584908160.0, "grad_norm": 1.7376615256629229, "language_loss": 0.83223486, "learning_rate": 4.991701502355444e-07, "loss": 0.85517216, "num_input_tokens_seen": 139189595, "step": 6462, "time_per_iteration": 2.6751768589019775 }, { "auxiliary_loss_clip": 0.01224389, "auxiliary_loss_mlp": 0.01020236, "balance_loss_clip": 1.04592741, "balance_loss_mlp": 1.01327443, "epoch": 0.7771298022004449, "flos": 24717709877760.0, "grad_norm": 1.498019520588296, "language_loss": 0.75955403, "learning_rate": 4.986553894945518e-07, "loss": 0.78200024, "num_input_tokens_seen": 139210805, "step": 6463, "time_per_iteration": 2.6463098526000977 }, { "auxiliary_loss_clip": 0.01272312, "auxiliary_loss_mlp": 0.01019195, "balance_loss_clip": 1.04311156, "balance_loss_mlp": 1.01293707, "epoch": 0.777250045091084, "flos": 25009232659200.0, "grad_norm": 2.0974022312506473, "language_loss": 0.86706406, "learning_rate": 4.981408565063416e-07, "loss": 0.88997912, "num_input_tokens_seen": 139230750, "step": 6464, "time_per_iteration": 2.779273748397827 }, { "auxiliary_loss_clip": 0.01177203, "auxiliary_loss_mlp": 0.01028947, "balance_loss_clip": 1.05059814, "balance_loss_mlp": 1.0221132, "epoch": 0.777370287981723, "flos": 20119887319680.0, "grad_norm": 2.3149598159024714, "language_loss": 0.75997168, "learning_rate": 4.976265513489701e-07, "loss": 0.78203309, "num_input_tokens_seen": 139250720, "step": 6465, "time_per_iteration": 2.5846400260925293 }, { "auxiliary_loss_clip": 0.01227199, "auxiliary_loss_mlp": 0.01032455, "balance_loss_clip": 1.04867876, "balance_loss_mlp": 1.02552629, "epoch": 0.7774905308723622, "flos": 21718809331200.0, "grad_norm": 1.9482149599151688, "language_loss": 0.80549604, "learning_rate": 4.971124741004562e-07, "loss": 0.82809258, "num_input_tokens_seen": 139269720, "step": 6466, "time_per_iteration": 2.617812156677246 }, { "auxiliary_loss_clip": 0.01223409, "auxiliary_loss_mlp": 0.01025549, "balance_loss_clip": 1.04788828, "balance_loss_mlp": 1.01880491, "epoch": 0.7776107737630013, "flos": 16034115093120.0, "grad_norm": 2.2892643036339555, "language_loss": 0.76374394, "learning_rate": 4.965986248387846e-07, "loss": 0.78623354, "num_input_tokens_seen": 139288035, "step": 6467, "time_per_iteration": 2.588573932647705 }, { "auxiliary_loss_clip": 0.0127712, "auxiliary_loss_mlp": 0.01019828, "balance_loss_clip": 1.04688787, "balance_loss_mlp": 1.01327467, "epoch": 0.7777310166536403, "flos": 24790895838720.0, "grad_norm": 2.2221988408818687, "language_loss": 0.77285337, "learning_rate": 4.960850036419073e-07, "loss": 0.79582286, "num_input_tokens_seen": 139307135, "step": 6468, "time_per_iteration": 2.690772771835327 }, { "auxiliary_loss_clip": 0.0126847, "auxiliary_loss_mlp": 0.01021819, "balance_loss_clip": 1.04723501, "balance_loss_mlp": 1.01479709, "epoch": 0.7778512595442795, "flos": 17272530253440.0, "grad_norm": 2.3001990075796535, "language_loss": 0.78836775, "learning_rate": 4.955716105877378e-07, "loss": 0.81127059, "num_input_tokens_seen": 139325905, "step": 6469, "time_per_iteration": 2.6428310871124268 }, { "auxiliary_loss_clip": 0.01226844, "auxiliary_loss_mlp": 0.02567025, "balance_loss_clip": 1.04805231, "balance_loss_mlp": 0.99992454, "epoch": 0.7779715024349185, "flos": 17748418567680.0, "grad_norm": 3.07632692434293, "language_loss": 0.83201629, "learning_rate": 4.950584457541598e-07, "loss": 0.86995494, "num_input_tokens_seen": 139344370, "step": 6470, "time_per_iteration": 2.635831594467163 }, { "auxiliary_loss_clip": 0.01228064, "auxiliary_loss_mlp": 0.01026219, "balance_loss_clip": 1.04878712, "balance_loss_mlp": 1.01933169, "epoch": 0.7780917453255576, "flos": 24316875031680.0, "grad_norm": 1.4033947656505945, "language_loss": 0.81982315, "learning_rate": 4.945455092190183e-07, "loss": 0.84236598, "num_input_tokens_seen": 139365625, "step": 6471, "time_per_iteration": 2.66037917137146 }, { "auxiliary_loss_clip": 0.01064568, "auxiliary_loss_mlp": 0.01001575, "balance_loss_clip": 1.00917745, "balance_loss_mlp": 1.00063956, "epoch": 0.7782119882161967, "flos": 56364601530240.0, "grad_norm": 0.6876914192085256, "language_loss": 0.5594691, "learning_rate": 4.940328010601271e-07, "loss": 0.58013046, "num_input_tokens_seen": 139430540, "step": 6472, "time_per_iteration": 3.19032883644104 }, { "auxiliary_loss_clip": 0.01181722, "auxiliary_loss_mlp": 0.01030548, "balance_loss_clip": 1.05313277, "balance_loss_mlp": 1.02341962, "epoch": 0.7783322311068358, "flos": 46789986994560.0, "grad_norm": 1.8945726642786342, "language_loss": 0.76822388, "learning_rate": 4.935203213552621e-07, "loss": 0.79034662, "num_input_tokens_seen": 139454280, "step": 6473, "time_per_iteration": 2.893869638442993 }, { "auxiliary_loss_clip": 0.01282066, "auxiliary_loss_mlp": 0.01021276, "balance_loss_clip": 1.05097568, "balance_loss_mlp": 1.01409078, "epoch": 0.7784524739974749, "flos": 19057864872960.0, "grad_norm": 2.414888850228351, "language_loss": 0.6688379, "learning_rate": 4.930080701821662e-07, "loss": 0.69187135, "num_input_tokens_seen": 139471745, "step": 6474, "time_per_iteration": 2.623652696609497 }, { "auxiliary_loss_clip": 0.01272031, "auxiliary_loss_mlp": 0.01025042, "balance_loss_clip": 1.04503214, "balance_loss_mlp": 1.01811266, "epoch": 0.778572716888114, "flos": 24791111320320.0, "grad_norm": 1.9575521625263195, "language_loss": 0.77844357, "learning_rate": 4.92496047618548e-07, "loss": 0.80141425, "num_input_tokens_seen": 139491505, "step": 6475, "time_per_iteration": 2.69826340675354 }, { "auxiliary_loss_clip": 0.01221216, "auxiliary_loss_mlp": 0.01020587, "balance_loss_clip": 1.0495702, "balance_loss_mlp": 1.01364017, "epoch": 0.7786929597787531, "flos": 20078086867200.0, "grad_norm": 2.044372343355825, "language_loss": 0.77946788, "learning_rate": 4.919842537420811e-07, "loss": 0.80188584, "num_input_tokens_seen": 139508620, "step": 6476, "time_per_iteration": 3.5619394779205322 }, { "auxiliary_loss_clip": 0.0127607, "auxiliary_loss_mlp": 0.01025712, "balance_loss_clip": 1.05034077, "balance_loss_mlp": 1.01866388, "epoch": 0.7788132026693921, "flos": 21872220318720.0, "grad_norm": 1.7180282701948317, "language_loss": 0.79426229, "learning_rate": 4.91472688630404e-07, "loss": 0.81728005, "num_input_tokens_seen": 139529360, "step": 6477, "time_per_iteration": 3.631579637527466 }, { "auxiliary_loss_clip": 0.01173111, "auxiliary_loss_mlp": 0.0102628, "balance_loss_clip": 1.04987586, "balance_loss_mlp": 1.01975965, "epoch": 0.7789334455600313, "flos": 11181937351680.0, "grad_norm": 1.833003560173933, "language_loss": 0.73914224, "learning_rate": 4.909613523611202e-07, "loss": 0.76113617, "num_input_tokens_seen": 139546240, "step": 6478, "time_per_iteration": 2.588418960571289 }, { "auxiliary_loss_clip": 0.01272005, "auxiliary_loss_mlp": 0.02568414, "balance_loss_clip": 1.04193997, "balance_loss_mlp": 0.99991769, "epoch": 0.7790536884506704, "flos": 28695427015680.0, "grad_norm": 1.8714568499781241, "language_loss": 0.7508505, "learning_rate": 4.904502450117991e-07, "loss": 0.78925467, "num_input_tokens_seen": 139567200, "step": 6479, "time_per_iteration": 3.651624917984009 }, { "auxiliary_loss_clip": 0.01276015, "auxiliary_loss_mlp": 0.01029246, "balance_loss_clip": 1.05138147, "balance_loss_mlp": 1.02160144, "epoch": 0.7791739313413094, "flos": 11072302064640.0, "grad_norm": 2.5817427306238776, "language_loss": 0.71887398, "learning_rate": 4.899393666599762e-07, "loss": 0.74192655, "num_input_tokens_seen": 139583775, "step": 6480, "time_per_iteration": 2.6607015132904053 }, { "auxiliary_loss_clip": 0.01172722, "auxiliary_loss_mlp": 0.01023133, "balance_loss_clip": 1.04736018, "balance_loss_mlp": 1.01621032, "epoch": 0.7792941742319486, "flos": 14679276975360.0, "grad_norm": 2.484903616691076, "language_loss": 0.72778475, "learning_rate": 4.894287173831506e-07, "loss": 0.74974328, "num_input_tokens_seen": 139599735, "step": 6481, "time_per_iteration": 2.552361011505127 }, { "auxiliary_loss_clip": 0.01274017, "auxiliary_loss_mlp": 0.01021576, "balance_loss_clip": 1.04490197, "balance_loss_mlp": 1.01431024, "epoch": 0.7794144171225876, "flos": 23258874908160.0, "grad_norm": 2.6493626537712758, "language_loss": 0.84431577, "learning_rate": 4.889182972587877e-07, "loss": 0.86727166, "num_input_tokens_seen": 139619030, "step": 6482, "time_per_iteration": 2.695523262023926 }, { "auxiliary_loss_clip": 0.01331952, "auxiliary_loss_mlp": 0.01027917, "balance_loss_clip": 1.04743624, "balance_loss_mlp": 1.02106202, "epoch": 0.7795346600132267, "flos": 21507080613120.0, "grad_norm": 1.959656193006223, "language_loss": 0.66469014, "learning_rate": 4.884081063643177e-07, "loss": 0.68828887, "num_input_tokens_seen": 139637690, "step": 6483, "time_per_iteration": 2.6938936710357666 }, { "auxiliary_loss_clip": 0.01163048, "auxiliary_loss_mlp": 0.01007602, "balance_loss_clip": 1.0092603, "balance_loss_mlp": 1.00673771, "epoch": 0.7796549029038659, "flos": 70052273694720.0, "grad_norm": 0.8777529746337558, "language_loss": 0.52499074, "learning_rate": 4.878981447771353e-07, "loss": 0.54669726, "num_input_tokens_seen": 139692070, "step": 6484, "time_per_iteration": 3.2099342346191406 }, { "auxiliary_loss_clip": 0.01319965, "auxiliary_loss_mlp": 0.01026008, "balance_loss_clip": 1.04691708, "balance_loss_mlp": 1.01875663, "epoch": 0.7797751457945049, "flos": 23989405714560.0, "grad_norm": 2.230046136607681, "language_loss": 0.73240423, "learning_rate": 4.873884125746035e-07, "loss": 0.7558639, "num_input_tokens_seen": 139713745, "step": 6485, "time_per_iteration": 2.7443389892578125 }, { "auxiliary_loss_clip": 0.01272953, "auxiliary_loss_mlp": 0.01023572, "balance_loss_clip": 1.04679275, "balance_loss_mlp": 1.01681602, "epoch": 0.779895388685144, "flos": 22674751937280.0, "grad_norm": 2.7412080053329917, "language_loss": 0.72702354, "learning_rate": 4.868789098340456e-07, "loss": 0.74998879, "num_input_tokens_seen": 139731650, "step": 6486, "time_per_iteration": 3.5360422134399414 }, { "auxiliary_loss_clip": 0.0132129, "auxiliary_loss_mlp": 0.0103125, "balance_loss_clip": 1.04552531, "balance_loss_mlp": 1.02444351, "epoch": 0.7800156315757831, "flos": 23768698596480.0, "grad_norm": 2.3110555966957205, "language_loss": 0.73463035, "learning_rate": 4.863696366327543e-07, "loss": 0.75815576, "num_input_tokens_seen": 139750820, "step": 6487, "time_per_iteration": 2.755352020263672 }, { "auxiliary_loss_clip": 0.01225529, "auxiliary_loss_mlp": 0.0102301, "balance_loss_clip": 1.04662442, "balance_loss_mlp": 1.01594687, "epoch": 0.7801358744664222, "flos": 26429714881920.0, "grad_norm": 1.8805190263244949, "language_loss": 0.78058797, "learning_rate": 4.85860593047986e-07, "loss": 0.80307335, "num_input_tokens_seen": 139770885, "step": 6488, "time_per_iteration": 2.669504404067993 }, { "auxiliary_loss_clip": 0.01315997, "auxiliary_loss_mlp": 0.0102644, "balance_loss_clip": 1.03969193, "balance_loss_mlp": 1.01981199, "epoch": 0.7802561173570612, "flos": 26322162583680.0, "grad_norm": 1.703508273156949, "language_loss": 0.74509203, "learning_rate": 4.853517791569613e-07, "loss": 0.76851642, "num_input_tokens_seen": 139793065, "step": 6489, "time_per_iteration": 2.7530031204223633 }, { "auxiliary_loss_clip": 0.01276551, "auxiliary_loss_mlp": 0.02567899, "balance_loss_clip": 1.04543793, "balance_loss_mlp": 0.99991739, "epoch": 0.7803763602477004, "flos": 40333751596800.0, "grad_norm": 6.874110933277564, "language_loss": 0.66368943, "learning_rate": 4.848431950368684e-07, "loss": 0.70213401, "num_input_tokens_seen": 139815625, "step": 6490, "time_per_iteration": 2.8528971672058105 }, { "auxiliary_loss_clip": 0.01063894, "auxiliary_loss_mlp": 0.02505062, "balance_loss_clip": 1.00878537, "balance_loss_mlp": 0.99988699, "epoch": 0.7804966031383395, "flos": 67001448038400.0, "grad_norm": 0.7021159589794077, "language_loss": 0.55662942, "learning_rate": 4.843348407648569e-07, "loss": 0.59231901, "num_input_tokens_seen": 139876905, "step": 6491, "time_per_iteration": 3.1406171321868896 }, { "auxiliary_loss_clip": 0.01226358, "auxiliary_loss_mlp": 0.01024498, "balance_loss_clip": 1.04472244, "balance_loss_mlp": 1.01737499, "epoch": 0.7806168460289785, "flos": 17740733057280.0, "grad_norm": 2.998797796599976, "language_loss": 0.83317745, "learning_rate": 4.838267164180457e-07, "loss": 0.85568607, "num_input_tokens_seen": 139892575, "step": 6492, "time_per_iteration": 2.5639567375183105 }, { "auxiliary_loss_clip": 0.0117687, "auxiliary_loss_mlp": 0.01024711, "balance_loss_clip": 1.05033064, "balance_loss_mlp": 1.0173409, "epoch": 0.7807370889196176, "flos": 23946240545280.0, "grad_norm": 2.415225329714158, "language_loss": 0.8331672, "learning_rate": 4.833188220735156e-07, "loss": 0.85518295, "num_input_tokens_seen": 139912245, "step": 6493, "time_per_iteration": 2.5723211765289307 }, { "auxiliary_loss_clip": 0.01220381, "auxiliary_loss_mlp": 0.0102227, "balance_loss_clip": 1.04702878, "balance_loss_mlp": 1.01501, "epoch": 0.7808573318102567, "flos": 18989024457600.0, "grad_norm": 2.081089953671375, "language_loss": 0.75005138, "learning_rate": 4.828111578083152e-07, "loss": 0.77247787, "num_input_tokens_seen": 139929150, "step": 6494, "time_per_iteration": 2.5768826007843018 }, { "auxiliary_loss_clip": 0.0127155, "auxiliary_loss_mlp": 0.01027003, "balance_loss_clip": 1.0489645, "balance_loss_mlp": 1.02012122, "epoch": 0.7809775747008958, "flos": 23980750536960.0, "grad_norm": 2.9985034761231324, "language_loss": 0.81150115, "learning_rate": 4.823037236994556e-07, "loss": 0.83448672, "num_input_tokens_seen": 139947315, "step": 6495, "time_per_iteration": 2.6143763065338135 }, { "auxiliary_loss_clip": 0.01119839, "auxiliary_loss_mlp": 0.01001164, "balance_loss_clip": 1.00918424, "balance_loss_mlp": 1.00021636, "epoch": 0.7810978175915348, "flos": 68535875180160.0, "grad_norm": 0.7195142640987876, "language_loss": 0.56294608, "learning_rate": 4.817965198239136e-07, "loss": 0.58415616, "num_input_tokens_seen": 140013775, "step": 6496, "time_per_iteration": 3.165374994277954 }, { "auxiliary_loss_clip": 0.01323868, "auxiliary_loss_mlp": 0.01028105, "balance_loss_clip": 1.04354525, "balance_loss_mlp": 1.02057099, "epoch": 0.781218060482174, "flos": 19642131498240.0, "grad_norm": 2.115374394406984, "language_loss": 0.74555838, "learning_rate": 4.812895462586331e-07, "loss": 0.76907814, "num_input_tokens_seen": 140031600, "step": 6497, "time_per_iteration": 2.738694667816162 }, { "auxiliary_loss_clip": 0.01323168, "auxiliary_loss_mlp": 0.01022391, "balance_loss_clip": 1.0465889, "balance_loss_mlp": 1.01580167, "epoch": 0.7813383033728131, "flos": 25627865621760.0, "grad_norm": 1.8215165768130004, "language_loss": 0.8196103, "learning_rate": 4.807828030805207e-07, "loss": 0.84306592, "num_input_tokens_seen": 140050590, "step": 6498, "time_per_iteration": 2.7651188373565674 }, { "auxiliary_loss_clip": 0.01225958, "auxiliary_loss_mlp": 0.010299, "balance_loss_clip": 1.05024862, "balance_loss_mlp": 1.02311981, "epoch": 0.7814585462634521, "flos": 20485924865280.0, "grad_norm": 1.9320889928785228, "language_loss": 0.67888898, "learning_rate": 4.802762903664495e-07, "loss": 0.70144749, "num_input_tokens_seen": 140069770, "step": 6499, "time_per_iteration": 2.6465797424316406 }, { "auxiliary_loss_clip": 0.01279663, "auxiliary_loss_mlp": 0.01031456, "balance_loss_clip": 1.05115604, "balance_loss_mlp": 1.02409482, "epoch": 0.7815787891540913, "flos": 22304297018880.0, "grad_norm": 2.2583347341653206, "language_loss": 0.73552895, "learning_rate": 4.797700081932565e-07, "loss": 0.75864017, "num_input_tokens_seen": 140087635, "step": 6500, "time_per_iteration": 2.6871252059936523 }, { "auxiliary_loss_clip": 0.01414127, "auxiliary_loss_mlp": 0.01024132, "balance_loss_clip": 1.03847969, "balance_loss_mlp": 1.01752234, "epoch": 0.7816990320447303, "flos": 22600668136320.0, "grad_norm": 2.9132445340393964, "language_loss": 0.81931704, "learning_rate": 4.792639566377442e-07, "loss": 0.84369969, "num_input_tokens_seen": 140105045, "step": 6501, "time_per_iteration": 2.7843241691589355 }, { "auxiliary_loss_clip": 0.01222279, "auxiliary_loss_mlp": 0.01035445, "balance_loss_clip": 1.04581738, "balance_loss_mlp": 1.02821159, "epoch": 0.7818192749353694, "flos": 24935974871040.0, "grad_norm": 1.9153667362298525, "language_loss": 0.7765044, "learning_rate": 4.78758135776681e-07, "loss": 0.79908168, "num_input_tokens_seen": 140124900, "step": 6502, "time_per_iteration": 3.611147165298462 }, { "auxiliary_loss_clip": 0.01272259, "auxiliary_loss_mlp": 0.01025846, "balance_loss_clip": 1.04670751, "balance_loss_mlp": 1.01920331, "epoch": 0.7819395178260086, "flos": 23733039369600.0, "grad_norm": 3.788532105645625, "language_loss": 0.78860915, "learning_rate": 4.782525456867989e-07, "loss": 0.81159019, "num_input_tokens_seen": 140143755, "step": 6503, "time_per_iteration": 3.6410610675811768 }, { "auxiliary_loss_clip": 0.01225239, "auxiliary_loss_mlp": 0.01027033, "balance_loss_clip": 1.04748225, "balance_loss_mlp": 1.01946592, "epoch": 0.7820597607166476, "flos": 23221671396480.0, "grad_norm": 2.9099280109282897, "language_loss": 0.83213824, "learning_rate": 4.777471864447959e-07, "loss": 0.85466099, "num_input_tokens_seen": 140164495, "step": 6504, "time_per_iteration": 2.6986584663391113 }, { "auxiliary_loss_clip": 0.01275424, "auxiliary_loss_mlp": 0.01026083, "balance_loss_clip": 1.0448519, "balance_loss_mlp": 1.01878381, "epoch": 0.7821800036072867, "flos": 22309540404480.0, "grad_norm": 1.9411003608144712, "language_loss": 0.8076731, "learning_rate": 4.772420581273344e-07, "loss": 0.83068812, "num_input_tokens_seen": 140181980, "step": 6505, "time_per_iteration": 2.6299822330474854 }, { "auxiliary_loss_clip": 0.01221283, "auxiliary_loss_mlp": 0.01024458, "balance_loss_clip": 1.04955554, "balance_loss_mlp": 1.01750255, "epoch": 0.7823002464979258, "flos": 21544176384000.0, "grad_norm": 2.790792270535227, "language_loss": 0.76814222, "learning_rate": 4.7673716081104134e-07, "loss": 0.79059964, "num_input_tokens_seen": 140202155, "step": 6506, "time_per_iteration": 3.4956796169281006 }, { "auxiliary_loss_clip": 0.01222659, "auxiliary_loss_mlp": 0.01029651, "balance_loss_clip": 1.04912543, "balance_loss_mlp": 1.02231336, "epoch": 0.7824204893885649, "flos": 24535642815360.0, "grad_norm": 1.8980746584879422, "language_loss": 0.84159493, "learning_rate": 4.762324945725109e-07, "loss": 0.86411798, "num_input_tokens_seen": 140221600, "step": 6507, "time_per_iteration": 2.6636815071105957 }, { "auxiliary_loss_clip": 0.01269062, "auxiliary_loss_mlp": 0.01032264, "balance_loss_clip": 1.04886365, "balance_loss_mlp": 1.02517724, "epoch": 0.782540732279204, "flos": 27415211402880.0, "grad_norm": 1.686740503529404, "language_loss": 0.76004821, "learning_rate": 4.7572805948829844e-07, "loss": 0.7830615, "num_input_tokens_seen": 140241860, "step": 6508, "time_per_iteration": 2.6994757652282715 }, { "auxiliary_loss_clip": 0.01375595, "auxiliary_loss_mlp": 0.01022186, "balance_loss_clip": 1.04403043, "balance_loss_mlp": 1.01514637, "epoch": 0.7826609751698431, "flos": 24353216616960.0, "grad_norm": 2.2047525621016373, "language_loss": 0.70883805, "learning_rate": 4.7522385563492795e-07, "loss": 0.73281586, "num_input_tokens_seen": 140262160, "step": 6509, "time_per_iteration": 2.740216016769409 }, { "auxiliary_loss_clip": 0.01326834, "auxiliary_loss_mlp": 0.01022975, "balance_loss_clip": 1.0467453, "balance_loss_mlp": 1.01590252, "epoch": 0.7827812180604822, "flos": 23988543788160.0, "grad_norm": 1.921535164390875, "language_loss": 0.70276666, "learning_rate": 4.747198830888863e-07, "loss": 0.72626472, "num_input_tokens_seen": 140282030, "step": 6510, "time_per_iteration": 2.726646900177002 }, { "auxiliary_loss_clip": 0.0126988, "auxiliary_loss_mlp": 0.01024365, "balance_loss_clip": 1.04672372, "balance_loss_mlp": 1.0173645, "epoch": 0.7829014609511212, "flos": 27454318335360.0, "grad_norm": 4.945016399591349, "language_loss": 0.68527162, "learning_rate": 4.742161419266251e-07, "loss": 0.70821404, "num_input_tokens_seen": 140301190, "step": 6511, "time_per_iteration": 3.4950335025787354 }, { "auxiliary_loss_clip": 0.01227819, "auxiliary_loss_mlp": 0.01028263, "balance_loss_clip": 1.04884744, "balance_loss_mlp": 1.02154279, "epoch": 0.7830217038417604, "flos": 29204532432000.0, "grad_norm": 5.728748363918008, "language_loss": 0.65273094, "learning_rate": 4.7371263222456304e-07, "loss": 0.67529178, "num_input_tokens_seen": 140318510, "step": 6512, "time_per_iteration": 2.5953304767608643 }, { "auxiliary_loss_clip": 0.01113296, "auxiliary_loss_mlp": 0.00999149, "balance_loss_clip": 1.00818264, "balance_loss_mlp": 0.99813557, "epoch": 0.7831419467323995, "flos": 60950895822720.0, "grad_norm": 0.8092152268437073, "language_loss": 0.61319923, "learning_rate": 4.7320935405908004e-07, "loss": 0.63432366, "num_input_tokens_seen": 140379380, "step": 6513, "time_per_iteration": 3.093642234802246 }, { "auxiliary_loss_clip": 0.01178159, "auxiliary_loss_mlp": 0.01025181, "balance_loss_clip": 1.04931998, "balance_loss_mlp": 1.01735234, "epoch": 0.7832621896230385, "flos": 19682531320320.0, "grad_norm": 2.1468910131142334, "language_loss": 0.84444451, "learning_rate": 4.7270630750652475e-07, "loss": 0.86647791, "num_input_tokens_seen": 140395335, "step": 6514, "time_per_iteration": 2.4771690368652344 }, { "auxiliary_loss_clip": 0.01217305, "auxiliary_loss_mlp": 0.01027242, "balance_loss_clip": 1.0458889, "balance_loss_mlp": 1.02069128, "epoch": 0.7833824325136777, "flos": 25009232659200.0, "grad_norm": 1.8197014898160606, "language_loss": 0.8055495, "learning_rate": 4.7220349264320746e-07, "loss": 0.82799494, "num_input_tokens_seen": 140414420, "step": 6515, "time_per_iteration": 2.5470292568206787 }, { "auxiliary_loss_clip": 0.0111667, "auxiliary_loss_mlp": 0.01000683, "balance_loss_clip": 1.00914431, "balance_loss_mlp": 0.99974674, "epoch": 0.7835026754043167, "flos": 68800142517120.0, "grad_norm": 0.7312329210006615, "language_loss": 0.54900229, "learning_rate": 4.71700909545407e-07, "loss": 0.57017577, "num_input_tokens_seen": 140477365, "step": 6516, "time_per_iteration": 3.133159875869751 }, { "auxiliary_loss_clip": 0.01225425, "auxiliary_loss_mlp": 0.01024281, "balance_loss_clip": 1.04728293, "balance_loss_mlp": 1.01734638, "epoch": 0.7836229182949558, "flos": 19864598382720.0, "grad_norm": 1.8680053352191384, "language_loss": 0.7700007, "learning_rate": 4.711985582893627e-07, "loss": 0.79249775, "num_input_tokens_seen": 140495885, "step": 6517, "time_per_iteration": 2.6112756729125977 }, { "auxiliary_loss_clip": 0.01372482, "auxiliary_loss_mlp": 0.01030471, "balance_loss_clip": 1.04100037, "balance_loss_mlp": 1.02367282, "epoch": 0.783743161185595, "flos": 22965843755520.0, "grad_norm": 1.6758813754421278, "language_loss": 0.71969861, "learning_rate": 4.706964389512811e-07, "loss": 0.74372816, "num_input_tokens_seen": 140515920, "step": 6518, "time_per_iteration": 2.691852331161499 }, { "auxiliary_loss_clip": 0.01173922, "auxiliary_loss_mlp": 0.01022963, "balance_loss_clip": 1.05054009, "balance_loss_mlp": 1.01626647, "epoch": 0.783863404076234, "flos": 12458489777280.0, "grad_norm": 2.0569095019056713, "language_loss": 0.87586951, "learning_rate": 4.701945516073345e-07, "loss": 0.89783841, "num_input_tokens_seen": 140533395, "step": 6519, "time_per_iteration": 2.587524652481079 }, { "auxiliary_loss_clip": 0.0131759, "auxiliary_loss_mlp": 0.01031053, "balance_loss_clip": 1.0467416, "balance_loss_mlp": 1.02454436, "epoch": 0.7839836469668731, "flos": 24243940465920.0, "grad_norm": 1.8018258922804602, "language_loss": 0.75212526, "learning_rate": 4.696928963336577e-07, "loss": 0.77561164, "num_input_tokens_seen": 140552825, "step": 6520, "time_per_iteration": 2.731614828109741 }, { "auxiliary_loss_clip": 0.01113335, "auxiliary_loss_mlp": 0.00999142, "balance_loss_clip": 1.0081768, "balance_loss_mlp": 0.99818224, "epoch": 0.7841038898575122, "flos": 62121978938880.0, "grad_norm": 0.8541793200850706, "language_loss": 0.60930467, "learning_rate": 4.6919147320635224e-07, "loss": 0.63042939, "num_input_tokens_seen": 140615535, "step": 6521, "time_per_iteration": 3.1227447986602783 }, { "auxiliary_loss_clip": 0.01226334, "auxiliary_loss_mlp": 0.01025587, "balance_loss_clip": 1.04809046, "balance_loss_mlp": 1.01862526, "epoch": 0.7842241327481513, "flos": 20193899293440.0, "grad_norm": 2.171653314047841, "language_loss": 0.73438227, "learning_rate": 4.6869028230148286e-07, "loss": 0.75690144, "num_input_tokens_seen": 140633330, "step": 6522, "time_per_iteration": 2.590559959411621 }, { "auxiliary_loss_clip": 0.01319404, "auxiliary_loss_mlp": 0.01028343, "balance_loss_clip": 1.04184985, "balance_loss_mlp": 1.02154517, "epoch": 0.7843443756387903, "flos": 28074531496320.0, "grad_norm": 2.415744720248586, "language_loss": 0.5953331, "learning_rate": 4.6818932369507957e-07, "loss": 0.61881059, "num_input_tokens_seen": 140652830, "step": 6523, "time_per_iteration": 2.698227882385254 }, { "auxiliary_loss_clip": 0.0122451, "auxiliary_loss_mlp": 0.01028436, "balance_loss_clip": 1.05072117, "balance_loss_mlp": 1.02159071, "epoch": 0.7844646185294295, "flos": 21323397438720.0, "grad_norm": 2.6063122595264017, "language_loss": 0.89035887, "learning_rate": 4.676885974631386e-07, "loss": 0.91288829, "num_input_tokens_seen": 140671190, "step": 6524, "time_per_iteration": 2.6628284454345703 }, { "auxiliary_loss_clip": 0.0122321, "auxiliary_loss_mlp": 0.01027494, "balance_loss_clip": 1.04951406, "balance_loss_mlp": 1.02068758, "epoch": 0.7845848614200686, "flos": 23656585271040.0, "grad_norm": 1.9378268111176016, "language_loss": 0.81132579, "learning_rate": 4.67188103681619e-07, "loss": 0.83383286, "num_input_tokens_seen": 140690975, "step": 6525, "time_per_iteration": 2.6460907459259033 }, { "auxiliary_loss_clip": 0.01227133, "auxiliary_loss_mlp": 0.02564861, "balance_loss_clip": 1.05278683, "balance_loss_mlp": 0.9999544, "epoch": 0.7847051043107076, "flos": 23402194174080.0, "grad_norm": 2.0261819456330876, "language_loss": 0.69400871, "learning_rate": 4.666878424264453e-07, "loss": 0.73192871, "num_input_tokens_seen": 140710930, "step": 6526, "time_per_iteration": 2.6044821739196777 }, { "auxiliary_loss_clip": 0.0127015, "auxiliary_loss_mlp": 0.0102641, "balance_loss_clip": 1.04676938, "balance_loss_mlp": 1.02044034, "epoch": 0.7848253472013467, "flos": 19022277473280.0, "grad_norm": 1.9073458010830713, "language_loss": 0.73604548, "learning_rate": 4.661878137735069e-07, "loss": 0.75901109, "num_input_tokens_seen": 140729120, "step": 6527, "time_per_iteration": 2.6349282264709473 }, { "auxiliary_loss_clip": 0.01270448, "auxiliary_loss_mlp": 0.01026884, "balance_loss_clip": 1.04699183, "balance_loss_mlp": 1.02029717, "epoch": 0.7849455900919858, "flos": 21179180332800.0, "grad_norm": 1.949754506054307, "language_loss": 0.74986196, "learning_rate": 4.656880177986571e-07, "loss": 0.77283525, "num_input_tokens_seen": 140747665, "step": 6528, "time_per_iteration": 3.4811906814575195 }, { "auxiliary_loss_clip": 0.0127723, "auxiliary_loss_mlp": 0.01022937, "balance_loss_clip": 1.04515779, "balance_loss_mlp": 1.01575494, "epoch": 0.7850658329826249, "flos": 19536482620800.0, "grad_norm": 2.5153828254878214, "language_loss": 0.81900287, "learning_rate": 4.6518845457771607e-07, "loss": 0.84200454, "num_input_tokens_seen": 140766525, "step": 6529, "time_per_iteration": 3.5845248699188232 }, { "auxiliary_loss_clip": 0.01220865, "auxiliary_loss_mlp": 0.02564224, "balance_loss_clip": 1.04643357, "balance_loss_mlp": 0.99992585, "epoch": 0.7851860758732639, "flos": 12495334152960.0, "grad_norm": 1.8929920592932543, "language_loss": 0.78963864, "learning_rate": 4.646891241864652e-07, "loss": 0.82748955, "num_input_tokens_seen": 140785090, "step": 6530, "time_per_iteration": 2.6062588691711426 }, { "auxiliary_loss_clip": 0.01225835, "auxiliary_loss_mlp": 0.01027182, "balance_loss_clip": 1.04734898, "balance_loss_mlp": 1.01982653, "epoch": 0.7853063187639031, "flos": 22960959505920.0, "grad_norm": 2.103291067480908, "language_loss": 0.72877944, "learning_rate": 4.6419002670065397e-07, "loss": 0.75130963, "num_input_tokens_seen": 140804670, "step": 6531, "time_per_iteration": 2.6322989463806152 }, { "auxiliary_loss_clip": 0.01325271, "auxiliary_loss_mlp": 0.0102596, "balance_loss_clip": 1.04848957, "balance_loss_mlp": 1.01867962, "epoch": 0.7854265616545422, "flos": 17347260499200.0, "grad_norm": 2.473553960984657, "language_loss": 0.86686575, "learning_rate": 4.6369116219599445e-07, "loss": 0.89037806, "num_input_tokens_seen": 140820655, "step": 6532, "time_per_iteration": 3.56000018119812 }, { "auxiliary_loss_clip": 0.01319971, "auxiliary_loss_mlp": 0.01024515, "balance_loss_clip": 1.04363596, "balance_loss_mlp": 1.01778269, "epoch": 0.7855468045451812, "flos": 23838293197440.0, "grad_norm": 1.6565436374006373, "language_loss": 0.79698825, "learning_rate": 4.631925307481637e-07, "loss": 0.82043314, "num_input_tokens_seen": 140840470, "step": 6533, "time_per_iteration": 2.682694435119629 }, { "auxiliary_loss_clip": 0.0127331, "auxiliary_loss_mlp": 0.01020688, "balance_loss_clip": 1.0482533, "balance_loss_mlp": 1.01419151, "epoch": 0.7856670474358204, "flos": 25666792986240.0, "grad_norm": 2.6653858342167527, "language_loss": 0.75798607, "learning_rate": 4.6269413243280533e-07, "loss": 0.78092605, "num_input_tokens_seen": 140859890, "step": 6534, "time_per_iteration": 2.662677764892578 }, { "auxiliary_loss_clip": 0.01286295, "auxiliary_loss_mlp": 0.01025778, "balance_loss_clip": 1.0529896, "balance_loss_mlp": 1.0179162, "epoch": 0.7857872903264594, "flos": 18144656472960.0, "grad_norm": 3.3616167101560883, "language_loss": 0.74311936, "learning_rate": 4.621959673255236e-07, "loss": 0.76624012, "num_input_tokens_seen": 140876190, "step": 6535, "time_per_iteration": 2.6561100482940674 }, { "auxiliary_loss_clip": 0.01368702, "auxiliary_loss_mlp": 0.01024812, "balance_loss_clip": 1.04315031, "balance_loss_mlp": 1.01817465, "epoch": 0.7859075332170985, "flos": 14386138081920.0, "grad_norm": 2.4215235256187, "language_loss": 0.90665025, "learning_rate": 4.6169803550189135e-07, "loss": 0.93058538, "num_input_tokens_seen": 140891885, "step": 6536, "time_per_iteration": 2.724778890609741 }, { "auxiliary_loss_clip": 0.01373038, "auxiliary_loss_mlp": 0.01027096, "balance_loss_clip": 1.04952478, "balance_loss_mlp": 1.0192939, "epoch": 0.7860277761077377, "flos": 19864059678720.0, "grad_norm": 1.888501626962539, "language_loss": 0.7735858, "learning_rate": 4.6120033703744355e-07, "loss": 0.7975871, "num_input_tokens_seen": 140910780, "step": 6537, "time_per_iteration": 3.6518588066101074 }, { "auxiliary_loss_clip": 0.01265589, "auxiliary_loss_mlp": 0.01019448, "balance_loss_clip": 1.04520309, "balance_loss_mlp": 1.01262593, "epoch": 0.7861480189983767, "flos": 26396174557440.0, "grad_norm": 1.9245845689171825, "language_loss": 0.7821722, "learning_rate": 4.607028720076822e-07, "loss": 0.8050226, "num_input_tokens_seen": 140927460, "step": 6538, "time_per_iteration": 2.692481279373169 }, { "auxiliary_loss_clip": 0.01220901, "auxiliary_loss_mlp": 0.01025817, "balance_loss_clip": 1.04786813, "balance_loss_mlp": 1.01899791, "epoch": 0.7862682618890158, "flos": 24236578177920.0, "grad_norm": 2.5329052603248443, "language_loss": 0.73435932, "learning_rate": 4.6020564048807074e-07, "loss": 0.75682652, "num_input_tokens_seen": 140945135, "step": 6539, "time_per_iteration": 2.61336350440979 }, { "auxiliary_loss_clip": 0.01224176, "auxiliary_loss_mlp": 0.01024695, "balance_loss_clip": 1.04821372, "balance_loss_mlp": 1.01805782, "epoch": 0.7863885047796549, "flos": 47551508259840.0, "grad_norm": 1.9306213332539468, "language_loss": 0.72018588, "learning_rate": 4.5970864255403883e-07, "loss": 0.74267459, "num_input_tokens_seen": 140966660, "step": 6540, "time_per_iteration": 2.8369219303131104 }, { "auxiliary_loss_clip": 0.01218113, "auxiliary_loss_mlp": 0.01023684, "balance_loss_clip": 1.04696774, "balance_loss_mlp": 1.01684391, "epoch": 0.786508747670294, "flos": 24389234979840.0, "grad_norm": 1.9104086502464483, "language_loss": 0.82192457, "learning_rate": 4.59211878280982e-07, "loss": 0.84434247, "num_input_tokens_seen": 140986175, "step": 6541, "time_per_iteration": 2.6960506439208984 }, { "auxiliary_loss_clip": 0.0127239, "auxiliary_loss_mlp": 0.01029075, "balance_loss_clip": 1.04670548, "balance_loss_mlp": 1.02114749, "epoch": 0.786628990560933, "flos": 18041234238720.0, "grad_norm": 2.169398331492266, "language_loss": 0.7014268, "learning_rate": 4.587153477442578e-07, "loss": 0.72444147, "num_input_tokens_seen": 141002490, "step": 6542, "time_per_iteration": 2.630357027053833 }, { "auxiliary_loss_clip": 0.01179397, "auxiliary_loss_mlp": 0.0102572, "balance_loss_clip": 1.05172324, "balance_loss_mlp": 1.01805472, "epoch": 0.7867492334515722, "flos": 25848860048640.0, "grad_norm": 5.026102854688579, "language_loss": 0.81694633, "learning_rate": 4.582190510191899e-07, "loss": 0.83899748, "num_input_tokens_seen": 141021150, "step": 6543, "time_per_iteration": 2.6463370323181152 }, { "auxiliary_loss_clip": 0.0131783, "auxiliary_loss_mlp": 0.01022471, "balance_loss_clip": 1.0467819, "balance_loss_mlp": 1.01596236, "epoch": 0.7868694763422113, "flos": 16580819070720.0, "grad_norm": 2.2695689820513048, "language_loss": 0.87532228, "learning_rate": 4.5772298818106625e-07, "loss": 0.89872533, "num_input_tokens_seen": 141036940, "step": 6544, "time_per_iteration": 2.7277286052703857 }, { "auxiliary_loss_clip": 0.01235727, "auxiliary_loss_mlp": 0.01022588, "balance_loss_clip": 1.05089355, "balance_loss_mlp": 1.01531577, "epoch": 0.7869897192328503, "flos": 29386276272000.0, "grad_norm": 3.0202176119537802, "language_loss": 0.7216152, "learning_rate": 4.572271593051384e-07, "loss": 0.74419832, "num_input_tokens_seen": 141054295, "step": 6545, "time_per_iteration": 2.7787137031555176 }, { "auxiliary_loss_clip": 0.01362943, "auxiliary_loss_mlp": 0.010223, "balance_loss_clip": 1.04328704, "balance_loss_mlp": 1.01518941, "epoch": 0.7871099621234895, "flos": 17128923678720.0, "grad_norm": 1.7192610089894722, "language_loss": 0.78298163, "learning_rate": 4.567315644666245e-07, "loss": 0.8068341, "num_input_tokens_seen": 141073090, "step": 6546, "time_per_iteration": 2.723257541656494 }, { "auxiliary_loss_clip": 0.01319643, "auxiliary_loss_mlp": 0.01027869, "balance_loss_clip": 1.04568601, "balance_loss_mlp": 1.02151167, "epoch": 0.7872302050141285, "flos": 23440187784960.0, "grad_norm": 2.323116678312739, "language_loss": 0.85045779, "learning_rate": 4.5623620374070507e-07, "loss": 0.87393296, "num_input_tokens_seen": 141092405, "step": 6547, "time_per_iteration": 2.7258362770080566 }, { "auxiliary_loss_clip": 0.01220599, "auxiliary_loss_mlp": 0.01002043, "balance_loss_clip": 1.00878024, "balance_loss_mlp": 1.00107181, "epoch": 0.7873504479047676, "flos": 65959752689280.0, "grad_norm": 0.7559601810709718, "language_loss": 0.58327967, "learning_rate": 4.557410772025263e-07, "loss": 0.60550612, "num_input_tokens_seen": 141154355, "step": 6548, "time_per_iteration": 3.3300929069519043 }, { "auxiliary_loss_clip": 0.01271804, "auxiliary_loss_mlp": 0.01021655, "balance_loss_clip": 1.04634678, "balance_loss_mlp": 1.01510751, "epoch": 0.7874706907954068, "flos": 23258336204160.0, "grad_norm": 1.899966942458922, "language_loss": 0.6680854, "learning_rate": 4.5524618492719803e-07, "loss": 0.69102001, "num_input_tokens_seen": 141173575, "step": 6549, "time_per_iteration": 2.6873486042022705 }, { "auxiliary_loss_clip": 0.01224026, "auxiliary_loss_mlp": 0.01022273, "balance_loss_clip": 1.04808295, "balance_loss_mlp": 1.01598775, "epoch": 0.7875909336860458, "flos": 28767786963840.0, "grad_norm": 1.8875013092346509, "language_loss": 0.79027241, "learning_rate": 4.54751526989795e-07, "loss": 0.81273544, "num_input_tokens_seen": 141195415, "step": 6550, "time_per_iteration": 2.6911325454711914 }, { "auxiliary_loss_clip": 0.01226423, "auxiliary_loss_mlp": 0.01026774, "balance_loss_clip": 1.04728067, "balance_loss_mlp": 1.0202471, "epoch": 0.7877111765766849, "flos": 18697286194560.0, "grad_norm": 2.1652214368593006, "language_loss": 0.7923677, "learning_rate": 4.5425710346535775e-07, "loss": 0.81489968, "num_input_tokens_seen": 141213360, "step": 6551, "time_per_iteration": 2.5745956897735596 }, { "auxiliary_loss_clip": 0.0122601, "auxiliary_loss_mlp": 0.01025624, "balance_loss_clip": 1.04893005, "balance_loss_mlp": 1.01848602, "epoch": 0.787831419467324, "flos": 27592968833280.0, "grad_norm": 2.4441902949608743, "language_loss": 0.81690192, "learning_rate": 4.537629144288877e-07, "loss": 0.83941823, "num_input_tokens_seen": 141230815, "step": 6552, "time_per_iteration": 2.670820474624634 }, { "auxiliary_loss_clip": 0.01278724, "auxiliary_loss_mlp": 0.01029603, "balance_loss_clip": 1.04200053, "balance_loss_mlp": 1.02261996, "epoch": 0.7879516623579631, "flos": 18150187167360.0, "grad_norm": 2.197972977301138, "language_loss": 0.75359613, "learning_rate": 4.5326895995535477e-07, "loss": 0.7766794, "num_input_tokens_seen": 141249715, "step": 6553, "time_per_iteration": 2.7191784381866455 }, { "auxiliary_loss_clip": 0.01219613, "auxiliary_loss_mlp": 0.01024537, "balance_loss_clip": 1.04722321, "balance_loss_mlp": 1.01767039, "epoch": 0.7880719052486022, "flos": 20339193807360.0, "grad_norm": 2.6202734800381675, "language_loss": 0.84668213, "learning_rate": 4.527752401196907e-07, "loss": 0.86912364, "num_input_tokens_seen": 141267730, "step": 6554, "time_per_iteration": 3.551793336868286 }, { "auxiliary_loss_clip": 0.01271035, "auxiliary_loss_mlp": 0.01024502, "balance_loss_clip": 1.04619229, "balance_loss_mlp": 1.01772809, "epoch": 0.7881921481392413, "flos": 21653237053440.0, "grad_norm": 1.7718158848671413, "language_loss": 0.66590428, "learning_rate": 4.5228175499679254e-07, "loss": 0.68885958, "num_input_tokens_seen": 141287315, "step": 6555, "time_per_iteration": 3.655639410018921 }, { "auxiliary_loss_clip": 0.01116508, "auxiliary_loss_mlp": 0.01002794, "balance_loss_clip": 1.0082618, "balance_loss_mlp": 1.00185192, "epoch": 0.7883123910298804, "flos": 68565860058240.0, "grad_norm": 1.0163173109035322, "language_loss": 0.54510981, "learning_rate": 4.5178850466152174e-07, "loss": 0.56630284, "num_input_tokens_seen": 141346145, "step": 6556, "time_per_iteration": 3.2305045127868652 }, { "auxiliary_loss_clip": 0.0127024, "auxiliary_loss_mlp": 0.01026661, "balance_loss_clip": 1.04411578, "balance_loss_mlp": 1.02039337, "epoch": 0.7884326339205194, "flos": 19318217627520.0, "grad_norm": 1.871006743321473, "language_loss": 0.81453586, "learning_rate": 4.512954891887031e-07, "loss": 0.8375048, "num_input_tokens_seen": 141364445, "step": 6557, "time_per_iteration": 2.6806845664978027 }, { "auxiliary_loss_clip": 0.01277433, "auxiliary_loss_mlp": 0.01028058, "balance_loss_clip": 1.04870057, "balance_loss_mlp": 1.02095318, "epoch": 0.7885528768111585, "flos": 17784903807360.0, "grad_norm": 2.519839607036761, "language_loss": 0.83551192, "learning_rate": 4.5080270865312806e-07, "loss": 0.85856682, "num_input_tokens_seen": 141381640, "step": 6558, "time_per_iteration": 3.5463266372680664 }, { "auxiliary_loss_clip": 0.01221912, "auxiliary_loss_mlp": 0.01027582, "balance_loss_clip": 1.04746532, "balance_loss_mlp": 1.02094197, "epoch": 0.7886731197017977, "flos": 18807639753600.0, "grad_norm": 1.9870924670401287, "language_loss": 0.71021163, "learning_rate": 4.5031016312954985e-07, "loss": 0.73270655, "num_input_tokens_seen": 141399955, "step": 6559, "time_per_iteration": 2.5864317417144775 }, { "auxiliary_loss_clip": 0.01232076, "auxiliary_loss_mlp": 0.01023, "balance_loss_clip": 1.05068541, "balance_loss_mlp": 1.01588917, "epoch": 0.7887933625924367, "flos": 33365358126720.0, "grad_norm": 2.3549746822793924, "language_loss": 0.74523985, "learning_rate": 4.498178526926886e-07, "loss": 0.76779062, "num_input_tokens_seen": 141420820, "step": 6560, "time_per_iteration": 2.7205474376678467 }, { "auxiliary_loss_clip": 0.01175165, "auxiliary_loss_mlp": 0.0102337, "balance_loss_clip": 1.05079067, "balance_loss_mlp": 1.01633668, "epoch": 0.7889136054830758, "flos": 17019360218880.0, "grad_norm": 2.3393432463937476, "language_loss": 0.72427732, "learning_rate": 4.4932577741722635e-07, "loss": 0.74626267, "num_input_tokens_seen": 141439350, "step": 6561, "time_per_iteration": 2.587312698364258 }, { "auxiliary_loss_clip": 0.01273138, "auxiliary_loss_mlp": 0.01022423, "balance_loss_clip": 1.04576802, "balance_loss_mlp": 1.01527345, "epoch": 0.7890338483737149, "flos": 29424629018880.0, "grad_norm": 2.514820878396196, "language_loss": 0.74473941, "learning_rate": 4.4883393737780985e-07, "loss": 0.76769501, "num_input_tokens_seen": 141460300, "step": 6562, "time_per_iteration": 2.735563278198242 }, { "auxiliary_loss_clip": 0.01219341, "auxiliary_loss_mlp": 0.01027567, "balance_loss_clip": 1.04485178, "balance_loss_mlp": 1.02026534, "epoch": 0.789154091264354, "flos": 19971576063360.0, "grad_norm": 2.3393367916886563, "language_loss": 0.78063256, "learning_rate": 4.4834233264905254e-07, "loss": 0.8031016, "num_input_tokens_seen": 141477315, "step": 6563, "time_per_iteration": 3.583075761795044 }, { "auxiliary_loss_clip": 0.01316983, "auxiliary_loss_mlp": 0.01027156, "balance_loss_clip": 1.04424763, "balance_loss_mlp": 1.01992011, "epoch": 0.789274334154993, "flos": 14537825216640.0, "grad_norm": 2.501822297182343, "language_loss": 0.70896089, "learning_rate": 4.478509633055294e-07, "loss": 0.73240232, "num_input_tokens_seen": 141495025, "step": 6564, "time_per_iteration": 2.676697254180908 }, { "auxiliary_loss_clip": 0.01276271, "auxiliary_loss_mlp": 0.01031774, "balance_loss_clip": 1.04662037, "balance_loss_mlp": 1.02379346, "epoch": 0.7893945770456322, "flos": 21827403123840.0, "grad_norm": 2.901985186213801, "language_loss": 0.80219144, "learning_rate": 4.473598294217813e-07, "loss": 0.8252719, "num_input_tokens_seen": 141510450, "step": 6565, "time_per_iteration": 2.6537797451019287 }, { "auxiliary_loss_clip": 0.01219215, "auxiliary_loss_mlp": 0.01031594, "balance_loss_clip": 1.04898143, "balance_loss_mlp": 1.02476907, "epoch": 0.7895148199362713, "flos": 20740639184640.0, "grad_norm": 2.3829946053577706, "language_loss": 0.71878541, "learning_rate": 4.468689310723124e-07, "loss": 0.74129355, "num_input_tokens_seen": 141528265, "step": 6566, "time_per_iteration": 2.641378164291382 }, { "auxiliary_loss_clip": 0.01327062, "auxiliary_loss_mlp": 0.01030145, "balance_loss_clip": 1.04481721, "balance_loss_mlp": 1.02334714, "epoch": 0.7896350628269103, "flos": 16690669839360.0, "grad_norm": 1.8757681927146213, "language_loss": 0.78758919, "learning_rate": 4.463782683315913e-07, "loss": 0.81116128, "num_input_tokens_seen": 141547270, "step": 6567, "time_per_iteration": 2.647749185562134 }, { "auxiliary_loss_clip": 0.01171895, "auxiliary_loss_mlp": 0.01024048, "balance_loss_clip": 1.0497092, "balance_loss_mlp": 1.01732135, "epoch": 0.7897553057175495, "flos": 22638374438400.0, "grad_norm": 2.0194899763242202, "language_loss": 0.7369054, "learning_rate": 4.458878412740523e-07, "loss": 0.75886482, "num_input_tokens_seen": 141566050, "step": 6568, "time_per_iteration": 2.6744232177734375 }, { "auxiliary_loss_clip": 0.01221316, "auxiliary_loss_mlp": 0.01025144, "balance_loss_clip": 1.04856908, "balance_loss_mlp": 1.01836681, "epoch": 0.7898755486081885, "flos": 14537573821440.0, "grad_norm": 2.702286605860869, "language_loss": 0.78073782, "learning_rate": 4.453976499740919e-07, "loss": 0.80320239, "num_input_tokens_seen": 141583695, "step": 6569, "time_per_iteration": 2.5739753246307373 }, { "auxiliary_loss_clip": 0.01222218, "auxiliary_loss_mlp": 0.01027554, "balance_loss_clip": 1.05005813, "balance_loss_mlp": 1.02047634, "epoch": 0.7899957914988276, "flos": 17238487138560.0, "grad_norm": 2.8089052471547147, "language_loss": 0.77851856, "learning_rate": 4.4490769450607215e-07, "loss": 0.80101621, "num_input_tokens_seen": 141601320, "step": 6570, "time_per_iteration": 2.5707685947418213 }, { "auxiliary_loss_clip": 0.01323714, "auxiliary_loss_mlp": 0.01029636, "balance_loss_clip": 1.04111528, "balance_loss_mlp": 1.02192891, "epoch": 0.7901160343894668, "flos": 41279351086080.0, "grad_norm": 1.9194913857101397, "language_loss": 0.72523445, "learning_rate": 4.4441797494431845e-07, "loss": 0.74876785, "num_input_tokens_seen": 141623125, "step": 6571, "time_per_iteration": 2.8937056064605713 }, { "auxiliary_loss_clip": 0.012209, "auxiliary_loss_mlp": 0.01025152, "balance_loss_clip": 1.04988635, "balance_loss_mlp": 1.0181483, "epoch": 0.7902362772801058, "flos": 16837005847680.0, "grad_norm": 2.24350718216248, "language_loss": 0.78224379, "learning_rate": 4.439284913631207e-07, "loss": 0.80470431, "num_input_tokens_seen": 141640335, "step": 6572, "time_per_iteration": 2.568086624145508 }, { "auxiliary_loss_clip": 0.01223577, "auxiliary_loss_mlp": 0.01022859, "balance_loss_clip": 1.04921508, "balance_loss_mlp": 1.01571202, "epoch": 0.7903565201707449, "flos": 27125987091840.0, "grad_norm": 1.875796914280907, "language_loss": 0.83897436, "learning_rate": 4.434392438367347e-07, "loss": 0.86143875, "num_input_tokens_seen": 141659760, "step": 6573, "time_per_iteration": 2.724194288253784 }, { "auxiliary_loss_clip": 0.01128133, "auxiliary_loss_mlp": 0.01027271, "balance_loss_clip": 1.04831183, "balance_loss_mlp": 1.0199579, "epoch": 0.790476763061384, "flos": 31025167142400.0, "grad_norm": 1.7514729726947278, "language_loss": 0.74095935, "learning_rate": 4.4295023243937677e-07, "loss": 0.7625134, "num_input_tokens_seen": 141679965, "step": 6574, "time_per_iteration": 2.7108240127563477 }, { "auxiliary_loss_clip": 0.0123077, "auxiliary_loss_mlp": 0.01028827, "balance_loss_clip": 1.05241346, "balance_loss_mlp": 1.02151036, "epoch": 0.7905970059520231, "flos": 22089084681600.0, "grad_norm": 2.021415957806739, "language_loss": 0.8039242, "learning_rate": 4.4246145724523123e-07, "loss": 0.82652014, "num_input_tokens_seen": 141697710, "step": 6575, "time_per_iteration": 2.6468353271484375 }, { "auxiliary_loss_clip": 0.01319479, "auxiliary_loss_mlp": 0.01025821, "balance_loss_clip": 1.04677498, "balance_loss_mlp": 1.01930034, "epoch": 0.7907172488426621, "flos": 20558141159040.0, "grad_norm": 2.4463129809514124, "language_loss": 0.77295578, "learning_rate": 4.41972918328444e-07, "loss": 0.79640877, "num_input_tokens_seen": 141715145, "step": 6576, "time_per_iteration": 2.7070324420928955 }, { "auxiliary_loss_clip": 0.01220968, "auxiliary_loss_mlp": 0.01028382, "balance_loss_clip": 1.04925108, "balance_loss_mlp": 1.02110708, "epoch": 0.7908374917333013, "flos": 30081542901120.0, "grad_norm": 7.952828303959024, "language_loss": 0.77514976, "learning_rate": 4.4148461576312646e-07, "loss": 0.7976433, "num_input_tokens_seen": 141734810, "step": 6577, "time_per_iteration": 2.7014849185943604 }, { "auxiliary_loss_clip": 0.01224463, "auxiliary_loss_mlp": 0.01022349, "balance_loss_clip": 1.05074596, "balance_loss_mlp": 1.01613808, "epoch": 0.7909577346239404, "flos": 20996359084800.0, "grad_norm": 1.4677296907421407, "language_loss": 0.74690473, "learning_rate": 4.4099654962335343e-07, "loss": 0.76937282, "num_input_tokens_seen": 141755260, "step": 6578, "time_per_iteration": 2.668623685836792 }, { "auxiliary_loss_clip": 0.01279319, "auxiliary_loss_mlp": 0.01030637, "balance_loss_clip": 1.04808271, "balance_loss_mlp": 1.02345145, "epoch": 0.7910779775145794, "flos": 26247935128320.0, "grad_norm": 2.867462476669237, "language_loss": 0.75120568, "learning_rate": 4.405087199831636e-07, "loss": 0.77430522, "num_input_tokens_seen": 141775500, "step": 6579, "time_per_iteration": 3.6217687129974365 }, { "auxiliary_loss_clip": 0.01272932, "auxiliary_loss_mlp": 0.02565017, "balance_loss_clip": 1.04395914, "balance_loss_mlp": 0.99992275, "epoch": 0.7911982204052186, "flos": 22564434291840.0, "grad_norm": 3.0669859085509925, "language_loss": 0.67646235, "learning_rate": 4.400211269165619e-07, "loss": 0.71484184, "num_input_tokens_seen": 141791955, "step": 6580, "time_per_iteration": 2.697192430496216 }, { "auxiliary_loss_clip": 0.01176488, "auxiliary_loss_mlp": 0.01029356, "balance_loss_clip": 1.05271101, "balance_loss_mlp": 1.02304721, "epoch": 0.7913184632958576, "flos": 23112538899840.0, "grad_norm": 1.567859321824638, "language_loss": 0.76929224, "learning_rate": 4.3953377049751416e-07, "loss": 0.79135072, "num_input_tokens_seen": 141812380, "step": 6581, "time_per_iteration": 3.5741453170776367 }, { "auxiliary_loss_clip": 0.01277463, "auxiliary_loss_mlp": 0.01023443, "balance_loss_clip": 1.04751718, "balance_loss_mlp": 1.01646352, "epoch": 0.7914387061864967, "flos": 12311758719360.0, "grad_norm": 2.6132809708175104, "language_loss": 0.77840883, "learning_rate": 4.390466507999537e-07, "loss": 0.80141789, "num_input_tokens_seen": 141828130, "step": 6582, "time_per_iteration": 2.619466781616211 }, { "auxiliary_loss_clip": 0.01319689, "auxiliary_loss_mlp": 0.01028925, "balance_loss_clip": 1.04573202, "balance_loss_mlp": 1.02148652, "epoch": 0.7915589490771359, "flos": 17603267708160.0, "grad_norm": 7.741934950024072, "language_loss": 0.75945878, "learning_rate": 4.385597678977748e-07, "loss": 0.78294492, "num_input_tokens_seen": 141846965, "step": 6583, "time_per_iteration": 2.6579172611236572 }, { "auxiliary_loss_clip": 0.01277283, "auxiliary_loss_mlp": 0.01022581, "balance_loss_clip": 1.04705667, "balance_loss_mlp": 1.01533914, "epoch": 0.7916791919677749, "flos": 25591272641280.0, "grad_norm": 8.104146481718471, "language_loss": 0.75381112, "learning_rate": 4.3807312186483726e-07, "loss": 0.77680981, "num_input_tokens_seen": 141867685, "step": 6584, "time_per_iteration": 3.5911638736724854 }, { "auxiliary_loss_clip": 0.01223928, "auxiliary_loss_mlp": 0.01023456, "balance_loss_clip": 1.05349445, "balance_loss_mlp": 1.01661611, "epoch": 0.791799434858414, "flos": 18844340474880.0, "grad_norm": 2.825929844239629, "language_loss": 0.78845692, "learning_rate": 4.375867127749655e-07, "loss": 0.81093079, "num_input_tokens_seen": 141885960, "step": 6585, "time_per_iteration": 2.6319997310638428 }, { "auxiliary_loss_clip": 0.01322258, "auxiliary_loss_mlp": 0.01027784, "balance_loss_clip": 1.04797626, "balance_loss_mlp": 1.02055407, "epoch": 0.7919196777490531, "flos": 25812015672960.0, "grad_norm": 2.691243766061732, "language_loss": 0.67370105, "learning_rate": 4.3710054070194744e-07, "loss": 0.69720143, "num_input_tokens_seen": 141905655, "step": 6586, "time_per_iteration": 2.7012381553649902 }, { "auxiliary_loss_clip": 0.01175756, "auxiliary_loss_mlp": 0.02568701, "balance_loss_clip": 1.04931259, "balance_loss_mlp": 0.99992043, "epoch": 0.7920399206396922, "flos": 11947624594560.0, "grad_norm": 3.4557785986806073, "language_loss": 0.6591934, "learning_rate": 4.3661460571953455e-07, "loss": 0.69663793, "num_input_tokens_seen": 141922390, "step": 6587, "time_per_iteration": 2.5723037719726562 }, { "auxiliary_loss_clip": 0.01224736, "auxiliary_loss_mlp": 0.01026755, "balance_loss_clip": 1.04711318, "balance_loss_mlp": 1.01971269, "epoch": 0.7921601635303313, "flos": 21579907438080.0, "grad_norm": 1.9143861099342492, "language_loss": 0.68555343, "learning_rate": 4.36128907901443e-07, "loss": 0.70806831, "num_input_tokens_seen": 141941985, "step": 6588, "time_per_iteration": 2.6066060066223145 }, { "auxiliary_loss_clip": 0.01323966, "auxiliary_loss_mlp": 0.01027913, "balance_loss_clip": 1.04288173, "balance_loss_mlp": 1.02090025, "epoch": 0.7922804064209703, "flos": 18113989236480.0, "grad_norm": 2.3459348269471336, "language_loss": 0.72722208, "learning_rate": 4.356434473213519e-07, "loss": 0.75074089, "num_input_tokens_seen": 141959435, "step": 6589, "time_per_iteration": 3.6153619289398193 }, { "auxiliary_loss_clip": 0.01270636, "auxiliary_loss_mlp": 0.01029599, "balance_loss_clip": 1.04970872, "balance_loss_mlp": 1.02283096, "epoch": 0.7924006493116095, "flos": 21652806090240.0, "grad_norm": 1.5799464025361234, "language_loss": 0.79967403, "learning_rate": 4.351582240529068e-07, "loss": 0.82267636, "num_input_tokens_seen": 141980265, "step": 6590, "time_per_iteration": 2.717794418334961 }, { "auxiliary_loss_clip": 0.01175975, "auxiliary_loss_mlp": 0.01000447, "balance_loss_clip": 1.00864816, "balance_loss_mlp": 0.99949932, "epoch": 0.7925208922022485, "flos": 64242755694720.0, "grad_norm": 0.6768582829891229, "language_loss": 0.58209193, "learning_rate": 4.346732381697149e-07, "loss": 0.60385615, "num_input_tokens_seen": 142044395, "step": 6591, "time_per_iteration": 3.3079116344451904 }, { "auxiliary_loss_clip": 0.01267626, "auxiliary_loss_mlp": 0.01020737, "balance_loss_clip": 1.04815233, "balance_loss_mlp": 1.01442766, "epoch": 0.7926411350928876, "flos": 16941541403520.0, "grad_norm": 2.2567918586585245, "language_loss": 0.8104862, "learning_rate": 4.3418848974534825e-07, "loss": 0.83336979, "num_input_tokens_seen": 142061335, "step": 6592, "time_per_iteration": 2.6682419776916504 }, { "auxiliary_loss_clip": 0.01327957, "auxiliary_loss_mlp": 0.01024679, "balance_loss_clip": 1.04674792, "balance_loss_mlp": 1.01809013, "epoch": 0.7927613779835267, "flos": 34459987144320.0, "grad_norm": 2.120313042192147, "language_loss": 0.68730474, "learning_rate": 4.3370397885334276e-07, "loss": 0.71083111, "num_input_tokens_seen": 142081965, "step": 6593, "time_per_iteration": 2.7692317962646484 }, { "auxiliary_loss_clip": 0.01221326, "auxiliary_loss_mlp": 0.01025285, "balance_loss_clip": 1.04803133, "balance_loss_mlp": 1.01850843, "epoch": 0.7928816208741658, "flos": 18951174501120.0, "grad_norm": 2.9750981527269857, "language_loss": 0.75619775, "learning_rate": 4.3321970556719777e-07, "loss": 0.77866387, "num_input_tokens_seen": 142100260, "step": 6594, "time_per_iteration": 2.642742156982422 }, { "auxiliary_loss_clip": 0.01176204, "auxiliary_loss_mlp": 0.01022827, "balance_loss_clip": 1.05155504, "balance_loss_mlp": 1.0158596, "epoch": 0.7930018637648049, "flos": 18623022825600.0, "grad_norm": 3.0009242392483277, "language_loss": 0.72312087, "learning_rate": 4.3273566996037856e-07, "loss": 0.74511117, "num_input_tokens_seen": 142116955, "step": 6595, "time_per_iteration": 2.5672311782836914 }, { "auxiliary_loss_clip": 0.01271071, "auxiliary_loss_mlp": 0.01028352, "balance_loss_clip": 1.04779553, "balance_loss_mlp": 1.02147961, "epoch": 0.793122106655444, "flos": 24530650824960.0, "grad_norm": 2.8139297328197723, "language_loss": 0.80505371, "learning_rate": 4.322518721063113e-07, "loss": 0.82804799, "num_input_tokens_seen": 142135505, "step": 6596, "time_per_iteration": 2.6597180366516113 }, { "auxiliary_loss_clip": 0.01223308, "auxiliary_loss_mlp": 0.01028551, "balance_loss_clip": 1.05057168, "balance_loss_mlp": 1.02126169, "epoch": 0.7932423495460831, "flos": 34421203434240.0, "grad_norm": 1.8903001131241843, "language_loss": 0.7005105, "learning_rate": 4.3176831207838906e-07, "loss": 0.72302902, "num_input_tokens_seen": 142158915, "step": 6597, "time_per_iteration": 2.721423625946045 }, { "auxiliary_loss_clip": 0.01219674, "auxiliary_loss_mlp": 0.01026241, "balance_loss_clip": 1.05002785, "balance_loss_mlp": 1.01915669, "epoch": 0.7933625924367221, "flos": 26980333441920.0, "grad_norm": 1.8130764961350527, "language_loss": 0.74653459, "learning_rate": 4.3128498994996685e-07, "loss": 0.76899374, "num_input_tokens_seen": 142178390, "step": 6598, "time_per_iteration": 2.6749420166015625 }, { "auxiliary_loss_clip": 0.0122911, "auxiliary_loss_mlp": 0.01030689, "balance_loss_clip": 1.05007553, "balance_loss_mlp": 1.02376342, "epoch": 0.7934828353273613, "flos": 29568630643200.0, "grad_norm": 4.0903082306213605, "language_loss": 0.71573693, "learning_rate": 4.308019057943646e-07, "loss": 0.73833489, "num_input_tokens_seen": 142200115, "step": 6599, "time_per_iteration": 2.6582255363464355 }, { "auxiliary_loss_clip": 0.01280857, "auxiliary_loss_mlp": 0.01028322, "balance_loss_clip": 1.04769373, "balance_loss_mlp": 1.02111244, "epoch": 0.7936030782180004, "flos": 28615381557120.0, "grad_norm": 2.0239458813922964, "language_loss": 0.7465328, "learning_rate": 4.3031905968486535e-07, "loss": 0.76962459, "num_input_tokens_seen": 142220945, "step": 6600, "time_per_iteration": 2.8021457195281982 }, { "auxiliary_loss_clip": 0.01368641, "auxiliary_loss_mlp": 0.01025909, "balance_loss_clip": 1.04901361, "balance_loss_mlp": 1.0197016, "epoch": 0.7937233211086394, "flos": 16392574869120.0, "grad_norm": 2.473529499068388, "language_loss": 0.68804109, "learning_rate": 4.298364516947162e-07, "loss": 0.71198654, "num_input_tokens_seen": 142238175, "step": 6601, "time_per_iteration": 2.6996238231658936 }, { "auxiliary_loss_clip": 0.01370838, "auxiliary_loss_mlp": 0.01025556, "balance_loss_clip": 1.04344296, "balance_loss_mlp": 1.01881433, "epoch": 0.7938435639992786, "flos": 22013420682240.0, "grad_norm": 2.262413517321873, "language_loss": 0.65668094, "learning_rate": 4.293540818971295e-07, "loss": 0.68064493, "num_input_tokens_seen": 142255980, "step": 6602, "time_per_iteration": 2.761214256286621 }, { "auxiliary_loss_clip": 0.01228737, "auxiliary_loss_mlp": 0.01025146, "balance_loss_clip": 1.04831731, "balance_loss_mlp": 1.01786506, "epoch": 0.7939638068899176, "flos": 22197032029440.0, "grad_norm": 2.0840774183567565, "language_loss": 0.76662505, "learning_rate": 4.2887195036527934e-07, "loss": 0.78916383, "num_input_tokens_seen": 142274785, "step": 6603, "time_per_iteration": 2.6247200965881348 }, { "auxiliary_loss_clip": 0.01218016, "auxiliary_loss_mlp": 0.01027163, "balance_loss_clip": 1.04480445, "balance_loss_mlp": 1.02007866, "epoch": 0.7940840497805567, "flos": 17745186343680.0, "grad_norm": 2.9665866810480988, "language_loss": 0.73440588, "learning_rate": 4.28390057172306e-07, "loss": 0.75685769, "num_input_tokens_seen": 142291290, "step": 6604, "time_per_iteration": 2.572831869125366 }, { "auxiliary_loss_clip": 0.01323138, "auxiliary_loss_mlp": 0.01028349, "balance_loss_clip": 1.04296446, "balance_loss_mlp": 1.02104473, "epoch": 0.7942042926711959, "flos": 23805435231360.0, "grad_norm": 2.3654628528850865, "language_loss": 0.7217024, "learning_rate": 4.279084023913111e-07, "loss": 0.74521726, "num_input_tokens_seen": 142309165, "step": 6605, "time_per_iteration": 2.7234859466552734 }, { "auxiliary_loss_clip": 0.01219588, "auxiliary_loss_mlp": 0.0102503, "balance_loss_clip": 1.04739165, "balance_loss_mlp": 1.01856291, "epoch": 0.7943245355618349, "flos": 19244959839360.0, "grad_norm": 2.102307108573259, "language_loss": 0.69564432, "learning_rate": 4.2742698609536096e-07, "loss": 0.71809053, "num_input_tokens_seen": 142327475, "step": 6606, "time_per_iteration": 3.502429962158203 }, { "auxiliary_loss_clip": 0.01276587, "auxiliary_loss_mlp": 0.01026034, "balance_loss_clip": 1.04922497, "balance_loss_mlp": 1.01915288, "epoch": 0.794444778452474, "flos": 25007616547200.0, "grad_norm": 1.8721397953749253, "language_loss": 0.78235471, "learning_rate": 4.2694580835748706e-07, "loss": 0.80538094, "num_input_tokens_seen": 142347335, "step": 6607, "time_per_iteration": 2.6978325843811035 }, { "auxiliary_loss_clip": 0.01173003, "auxiliary_loss_mlp": 0.0102559, "balance_loss_clip": 1.04549289, "balance_loss_mlp": 1.01867008, "epoch": 0.7945650213431131, "flos": 23221491828480.0, "grad_norm": 2.093957466592328, "language_loss": 0.7381168, "learning_rate": 4.264648692506836e-07, "loss": 0.76010275, "num_input_tokens_seen": 142366125, "step": 6608, "time_per_iteration": 3.6061301231384277 }, { "auxiliary_loss_clip": 0.01270341, "auxiliary_loss_mlp": 0.01029612, "balance_loss_clip": 1.04593027, "balance_loss_mlp": 1.02267432, "epoch": 0.7946852642337522, "flos": 26062887237120.0, "grad_norm": 2.1045714395347965, "language_loss": 0.72103018, "learning_rate": 4.2598416884790824e-07, "loss": 0.74402976, "num_input_tokens_seen": 142385175, "step": 6609, "time_per_iteration": 2.717087745666504 }, { "auxiliary_loss_clip": 0.01282507, "auxiliary_loss_mlp": 0.01028284, "balance_loss_clip": 1.046417, "balance_loss_mlp": 1.02091956, "epoch": 0.7948055071243912, "flos": 23769704177280.0, "grad_norm": 2.3802609052777948, "language_loss": 0.80832213, "learning_rate": 4.255037072220828e-07, "loss": 0.83143008, "num_input_tokens_seen": 142406545, "step": 6610, "time_per_iteration": 3.5911757946014404 }, { "auxiliary_loss_clip": 0.01171575, "auxiliary_loss_mlp": 0.0102383, "balance_loss_clip": 1.04793286, "balance_loss_mlp": 1.01736283, "epoch": 0.7949257500150304, "flos": 21980814111360.0, "grad_norm": 1.743181465224468, "language_loss": 0.7206012, "learning_rate": 4.2502348444609293e-07, "loss": 0.74255526, "num_input_tokens_seen": 142426165, "step": 6611, "time_per_iteration": 2.579723358154297 }, { "auxiliary_loss_clip": 0.01369767, "auxiliary_loss_mlp": 0.01024973, "balance_loss_clip": 1.04085243, "balance_loss_mlp": 1.01821399, "epoch": 0.7950459929056695, "flos": 25774129802880.0, "grad_norm": 1.6644626944516545, "language_loss": 0.69379443, "learning_rate": 4.2454350059278844e-07, "loss": 0.71774185, "num_input_tokens_seen": 142447225, "step": 6612, "time_per_iteration": 2.7681305408477783 }, { "auxiliary_loss_clip": 0.01271366, "auxiliary_loss_mlp": 0.01027396, "balance_loss_clip": 1.04311693, "balance_loss_mlp": 1.02088702, "epoch": 0.7951662357963085, "flos": 22158068751360.0, "grad_norm": 1.9127727093014686, "language_loss": 0.84556329, "learning_rate": 4.240637557349824e-07, "loss": 0.86855096, "num_input_tokens_seen": 142464440, "step": 6613, "time_per_iteration": 2.674767255783081 }, { "auxiliary_loss_clip": 0.01264726, "auxiliary_loss_mlp": 0.01024387, "balance_loss_clip": 1.0458076, "balance_loss_mlp": 1.01707923, "epoch": 0.7952864786869477, "flos": 24641938137600.0, "grad_norm": 1.7503328366054725, "language_loss": 0.66438103, "learning_rate": 4.235842499454516e-07, "loss": 0.68727219, "num_input_tokens_seen": 142484355, "step": 6614, "time_per_iteration": 2.66776967048645 }, { "auxiliary_loss_clip": 0.01275371, "auxiliary_loss_mlp": 0.01030002, "balance_loss_clip": 1.04821181, "balance_loss_mlp": 1.02300119, "epoch": 0.7954067215775867, "flos": 21830922656640.0, "grad_norm": 2.045718487402676, "language_loss": 0.82663572, "learning_rate": 4.2310498329693687e-07, "loss": 0.84968948, "num_input_tokens_seen": 142505255, "step": 6615, "time_per_iteration": 3.6162827014923096 }, { "auxiliary_loss_clip": 0.01225462, "auxiliary_loss_mlp": 0.01019866, "balance_loss_clip": 1.04974484, "balance_loss_mlp": 1.01238585, "epoch": 0.7955269644682258, "flos": 24060652341120.0, "grad_norm": 1.8477882507103678, "language_loss": 0.80806947, "learning_rate": 4.2262595586214164e-07, "loss": 0.83052278, "num_input_tokens_seen": 142526350, "step": 6616, "time_per_iteration": 2.622640371322632 }, { "auxiliary_loss_clip": 0.01228029, "auxiliary_loss_mlp": 0.01028283, "balance_loss_clip": 1.05001426, "balance_loss_mlp": 1.02125823, "epoch": 0.795647207358865, "flos": 25010741030400.0, "grad_norm": 1.7384081731205656, "language_loss": 0.77016824, "learning_rate": 4.221471677137358e-07, "loss": 0.79273134, "num_input_tokens_seen": 142547165, "step": 6617, "time_per_iteration": 2.6558423042297363 }, { "auxiliary_loss_clip": 0.01263528, "auxiliary_loss_mlp": 0.01025889, "balance_loss_clip": 1.04594338, "balance_loss_mlp": 1.01941633, "epoch": 0.795767450249504, "flos": 14648358343680.0, "grad_norm": 2.3603069590725108, "language_loss": 0.70073247, "learning_rate": 4.216686189243492e-07, "loss": 0.72362661, "num_input_tokens_seen": 142565955, "step": 6618, "time_per_iteration": 2.6570870876312256 }, { "auxiliary_loss_clip": 0.01316861, "auxiliary_loss_mlp": 0.01025414, "balance_loss_clip": 1.04487669, "balance_loss_mlp": 1.01807094, "epoch": 0.7958876931401431, "flos": 18547897530240.0, "grad_norm": 1.5881220212206542, "language_loss": 0.72821677, "learning_rate": 4.211903095665785e-07, "loss": 0.75163954, "num_input_tokens_seen": 142585340, "step": 6619, "time_per_iteration": 2.6535756587982178 }, { "auxiliary_loss_clip": 0.01218565, "auxiliary_loss_mlp": 0.01027302, "balance_loss_clip": 1.04776609, "balance_loss_mlp": 1.0208137, "epoch": 0.7960079360307821, "flos": 21543960902400.0, "grad_norm": 1.966605937756381, "language_loss": 0.75288701, "learning_rate": 4.2071223971298277e-07, "loss": 0.77534568, "num_input_tokens_seen": 142602525, "step": 6620, "time_per_iteration": 2.5839157104492188 }, { "auxiliary_loss_clip": 0.01226376, "auxiliary_loss_mlp": 0.01023221, "balance_loss_clip": 1.04744124, "balance_loss_mlp": 1.01574063, "epoch": 0.7961281789214213, "flos": 25481745095040.0, "grad_norm": 2.6976608247297125, "language_loss": 0.61285633, "learning_rate": 4.2023440943608433e-07, "loss": 0.63535225, "num_input_tokens_seen": 142622490, "step": 6621, "time_per_iteration": 2.6712915897369385 }, { "auxiliary_loss_clip": 0.01224174, "auxiliary_loss_mlp": 0.01025307, "balance_loss_clip": 1.0478586, "balance_loss_mlp": 1.01907492, "epoch": 0.7962484218120603, "flos": 21944436612480.0, "grad_norm": 1.5805080498653639, "language_loss": 0.78519738, "learning_rate": 4.1975681880837023e-07, "loss": 0.80769217, "num_input_tokens_seen": 142642495, "step": 6622, "time_per_iteration": 2.6104848384857178 }, { "auxiliary_loss_clip": 0.0132031, "auxiliary_loss_mlp": 0.01026032, "balance_loss_clip": 1.04225218, "balance_loss_mlp": 1.01907587, "epoch": 0.7963686647026994, "flos": 18876264687360.0, "grad_norm": 1.90683854846277, "language_loss": 0.8254177, "learning_rate": 4.192794679022895e-07, "loss": 0.84888113, "num_input_tokens_seen": 142660820, "step": 6623, "time_per_iteration": 2.6752336025238037 }, { "auxiliary_loss_clip": 0.01224466, "auxiliary_loss_mlp": 0.01030563, "balance_loss_clip": 1.04695058, "balance_loss_mlp": 1.02401555, "epoch": 0.7964889075933386, "flos": 29716582763520.0, "grad_norm": 2.0868180937107432, "language_loss": 0.72090715, "learning_rate": 4.1880235679025743e-07, "loss": 0.74345744, "num_input_tokens_seen": 142680915, "step": 6624, "time_per_iteration": 2.763941764831543 }, { "auxiliary_loss_clip": 0.01425457, "auxiliary_loss_mlp": 0.01029506, "balance_loss_clip": 1.04131508, "balance_loss_mlp": 1.02241611, "epoch": 0.7966091504839776, "flos": 29491458272640.0, "grad_norm": 1.8945800486188473, "language_loss": 0.63855994, "learning_rate": 4.1832548554464986e-07, "loss": 0.6631096, "num_input_tokens_seen": 142699210, "step": 6625, "time_per_iteration": 2.7469444274902344 }, { "auxiliary_loss_clip": 0.01110941, "auxiliary_loss_mlp": 0.01002413, "balance_loss_clip": 1.00888407, "balance_loss_mlp": 1.00149477, "epoch": 0.7967293933746167, "flos": 67288697101440.0, "grad_norm": 0.7400008932190706, "language_loss": 0.58670127, "learning_rate": 4.178488542378098e-07, "loss": 0.60783482, "num_input_tokens_seen": 142756790, "step": 6626, "time_per_iteration": 3.1250414848327637 }, { "auxiliary_loss_clip": 0.01177394, "auxiliary_loss_mlp": 0.01027547, "balance_loss_clip": 1.0508213, "balance_loss_mlp": 1.02012062, "epoch": 0.7968496362652558, "flos": 25554679660800.0, "grad_norm": 1.7748111154840907, "language_loss": 0.8926754, "learning_rate": 4.173724629420401e-07, "loss": 0.91472483, "num_input_tokens_seen": 142778150, "step": 6627, "time_per_iteration": 2.643479108810425 }, { "auxiliary_loss_clip": 0.01278687, "auxiliary_loss_mlp": 0.01025898, "balance_loss_clip": 1.04851544, "balance_loss_mlp": 1.01864374, "epoch": 0.7969698791558949, "flos": 14501088581760.0, "grad_norm": 3.7967914450180262, "language_loss": 0.68203473, "learning_rate": 4.168963117296087e-07, "loss": 0.70508051, "num_input_tokens_seen": 142795485, "step": 6628, "time_per_iteration": 2.627826690673828 }, { "auxiliary_loss_clip": 0.01174434, "auxiliary_loss_mlp": 0.01023694, "balance_loss_clip": 1.04960692, "balance_loss_mlp": 1.0167172, "epoch": 0.797090122046534, "flos": 22127545169280.0, "grad_norm": 2.388201585128572, "language_loss": 0.7612949, "learning_rate": 4.1642040067274876e-07, "loss": 0.78327614, "num_input_tokens_seen": 142815155, "step": 6629, "time_per_iteration": 2.6335997581481934 }, { "auxiliary_loss_clip": 0.0127716, "auxiliary_loss_mlp": 0.01026593, "balance_loss_clip": 1.04763412, "balance_loss_mlp": 1.01994419, "epoch": 0.7972103649371731, "flos": 19897671830400.0, "grad_norm": 1.8539583044423484, "language_loss": 0.72597766, "learning_rate": 4.1594472984365493e-07, "loss": 0.74901521, "num_input_tokens_seen": 142833840, "step": 6630, "time_per_iteration": 2.645946979522705 }, { "auxiliary_loss_clip": 0.01223043, "auxiliary_loss_mlp": 0.01026322, "balance_loss_clip": 1.04910481, "balance_loss_mlp": 1.01951849, "epoch": 0.7973306078278122, "flos": 36058621847040.0, "grad_norm": 1.9409853836155337, "language_loss": 0.77800435, "learning_rate": 4.154692993144862e-07, "loss": 0.80049801, "num_input_tokens_seen": 142853610, "step": 6631, "time_per_iteration": 2.6939005851745605 }, { "auxiliary_loss_clip": 0.0117495, "auxiliary_loss_mlp": 0.02564464, "balance_loss_clip": 1.04936337, "balance_loss_mlp": 0.99992561, "epoch": 0.7974508507184512, "flos": 21360600950400.0, "grad_norm": 2.415306032995808, "language_loss": 0.71782446, "learning_rate": 4.1499410915736476e-07, "loss": 0.75521851, "num_input_tokens_seen": 142872540, "step": 6632, "time_per_iteration": 3.5175857543945312 }, { "auxiliary_loss_clip": 0.01118608, "auxiliary_loss_mlp": 0.01000358, "balance_loss_clip": 1.0100981, "balance_loss_mlp": 0.99945164, "epoch": 0.7975710936090904, "flos": 68253115317120.0, "grad_norm": 0.7657919169831581, "language_loss": 0.64176095, "learning_rate": 4.145191594443762e-07, "loss": 0.66295058, "num_input_tokens_seen": 142936895, "step": 6633, "time_per_iteration": 4.259446859359741 }, { "auxiliary_loss_clip": 0.01320621, "auxiliary_loss_mlp": 0.01029296, "balance_loss_clip": 1.04523683, "balance_loss_mlp": 1.02195585, "epoch": 0.7976913364997295, "flos": 22492433479680.0, "grad_norm": 1.7989134932137414, "language_loss": 0.70771545, "learning_rate": 4.140444502475713e-07, "loss": 0.73121464, "num_input_tokens_seen": 142956445, "step": 6634, "time_per_iteration": 2.689101457595825 }, { "auxiliary_loss_clip": 0.01225242, "auxiliary_loss_mlp": 0.01026118, "balance_loss_clip": 1.04793358, "balance_loss_mlp": 1.0192219, "epoch": 0.7978115793903685, "flos": 15263220378240.0, "grad_norm": 6.428747355059764, "language_loss": 0.70235312, "learning_rate": 4.1356998163896216e-07, "loss": 0.72486669, "num_input_tokens_seen": 142973495, "step": 6635, "time_per_iteration": 3.494041919708252 }, { "auxiliary_loss_clip": 0.01325978, "auxiliary_loss_mlp": 0.01025083, "balance_loss_clip": 1.04678547, "balance_loss_mlp": 1.0180912, "epoch": 0.7979318222810077, "flos": 19719232041600.0, "grad_norm": 1.9914035101663885, "language_loss": 0.74774706, "learning_rate": 4.130957536905255e-07, "loss": 0.77125764, "num_input_tokens_seen": 142991510, "step": 6636, "time_per_iteration": 2.776224136352539 }, { "auxiliary_loss_clip": 0.01180638, "auxiliary_loss_mlp": 0.01029399, "balance_loss_clip": 1.04838133, "balance_loss_mlp": 1.02193308, "epoch": 0.7980520651716467, "flos": 15560273854080.0, "grad_norm": 2.739028165300597, "language_loss": 0.71560186, "learning_rate": 4.1262176647420134e-07, "loss": 0.73770225, "num_input_tokens_seen": 143009675, "step": 6637, "time_per_iteration": 2.611011266708374 }, { "auxiliary_loss_clip": 0.01280272, "auxiliary_loss_mlp": 0.01029503, "balance_loss_clip": 1.04872203, "balance_loss_mlp": 1.0229044, "epoch": 0.7981723080622858, "flos": 22309432663680.0, "grad_norm": 1.8033313589043347, "language_loss": 0.80202639, "learning_rate": 4.121480200618923e-07, "loss": 0.82512408, "num_input_tokens_seen": 143029330, "step": 6638, "time_per_iteration": 2.6852447986602783 }, { "auxiliary_loss_clip": 0.01272433, "auxiliary_loss_mlp": 0.01022561, "balance_loss_clip": 1.04737389, "balance_loss_mlp": 1.01519704, "epoch": 0.798292550952925, "flos": 22929573997440.0, "grad_norm": 2.055115061046622, "language_loss": 0.79873085, "learning_rate": 4.116745145254674e-07, "loss": 0.82168078, "num_input_tokens_seen": 143048865, "step": 6639, "time_per_iteration": 2.6521058082580566 }, { "auxiliary_loss_clip": 0.01162049, "auxiliary_loss_mlp": 0.01004236, "balance_loss_clip": 1.00692391, "balance_loss_mlp": 1.00331235, "epoch": 0.798412793843564, "flos": 64497936890880.0, "grad_norm": 0.7621568453102635, "language_loss": 0.57931942, "learning_rate": 4.1120124993675476e-07, "loss": 0.60098225, "num_input_tokens_seen": 143113295, "step": 6640, "time_per_iteration": 3.2148122787475586 }, { "auxiliary_loss_clip": 0.01279502, "auxiliary_loss_mlp": 0.01026292, "balance_loss_clip": 1.04547751, "balance_loss_mlp": 1.01870465, "epoch": 0.7985330367342031, "flos": 13586910514560.0, "grad_norm": 3.903859083734488, "language_loss": 0.61948192, "learning_rate": 4.107282263675498e-07, "loss": 0.64253986, "num_input_tokens_seen": 143130965, "step": 6641, "time_per_iteration": 3.600423812866211 }, { "auxiliary_loss_clip": 0.0116416, "auxiliary_loss_mlp": 0.0250768, "balance_loss_clip": 1.00711107, "balance_loss_mlp": 0.99990219, "epoch": 0.7986532796248422, "flos": 67698797656320.0, "grad_norm": 0.7605436544049067, "language_loss": 0.52438164, "learning_rate": 4.1025544388960907e-07, "loss": 0.56110007, "num_input_tokens_seen": 143192005, "step": 6642, "time_per_iteration": 3.210226058959961 }, { "auxiliary_loss_clip": 0.01219556, "auxiliary_loss_mlp": 0.01024942, "balance_loss_clip": 1.04898679, "balance_loss_mlp": 1.01764369, "epoch": 0.7987735225154813, "flos": 22455373622400.0, "grad_norm": 1.8230659555391795, "language_loss": 0.71447772, "learning_rate": 4.097829025746538e-07, "loss": 0.73692268, "num_input_tokens_seen": 143213550, "step": 6643, "time_per_iteration": 2.7129478454589844 }, { "auxiliary_loss_clip": 0.01113094, "auxiliary_loss_mlp": 0.01001707, "balance_loss_clip": 1.00831914, "balance_loss_mlp": 1.00082493, "epoch": 0.7988937654061203, "flos": 68864098682880.0, "grad_norm": 0.6576465258881536, "language_loss": 0.60959661, "learning_rate": 4.0931060249436757e-07, "loss": 0.63074458, "num_input_tokens_seen": 143277390, "step": 6644, "time_per_iteration": 3.213172674179077 }, { "auxiliary_loss_clip": 0.01224053, "auxiliary_loss_mlp": 0.0102444, "balance_loss_clip": 1.05177569, "balance_loss_mlp": 1.0173943, "epoch": 0.7990140082967595, "flos": 20806893820800.0, "grad_norm": 2.190067378099445, "language_loss": 0.69624674, "learning_rate": 4.088385437203978e-07, "loss": 0.7187317, "num_input_tokens_seen": 143294400, "step": 6645, "time_per_iteration": 2.612171173095703 }, { "auxiliary_loss_clip": 0.01174538, "auxiliary_loss_mlp": 0.01025156, "balance_loss_clip": 1.04871297, "balance_loss_mlp": 1.01786304, "epoch": 0.7991342511873986, "flos": 18985289443200.0, "grad_norm": 2.5030923088194967, "language_loss": 0.77768946, "learning_rate": 4.083667263243564e-07, "loss": 0.79968643, "num_input_tokens_seen": 143312745, "step": 6646, "time_per_iteration": 2.577841281890869 }, { "auxiliary_loss_clip": 0.01229794, "auxiliary_loss_mlp": 0.01024069, "balance_loss_clip": 1.05461478, "balance_loss_mlp": 1.01723552, "epoch": 0.7992544940780376, "flos": 20816805974400.0, "grad_norm": 6.1073038973103895, "language_loss": 0.71773988, "learning_rate": 4.0789515037781653e-07, "loss": 0.74027848, "num_input_tokens_seen": 143333470, "step": 6647, "time_per_iteration": 2.6117942333221436 }, { "auxiliary_loss_clip": 0.01228075, "auxiliary_loss_mlp": 0.01027397, "balance_loss_clip": 1.04964781, "balance_loss_mlp": 1.02036381, "epoch": 0.7993747369686768, "flos": 12640772321280.0, "grad_norm": 2.0826045737615586, "language_loss": 0.82670832, "learning_rate": 4.0742381595231755e-07, "loss": 0.84926307, "num_input_tokens_seen": 143350195, "step": 6648, "time_per_iteration": 2.608567476272583 }, { "auxiliary_loss_clip": 0.01225608, "auxiliary_loss_mlp": 0.01024164, "balance_loss_clip": 1.04751027, "balance_loss_mlp": 1.01773214, "epoch": 0.7994949798593158, "flos": 20078769225600.0, "grad_norm": 1.979064265243511, "language_loss": 0.78285515, "learning_rate": 4.06952723119359e-07, "loss": 0.80535287, "num_input_tokens_seen": 143370070, "step": 6649, "time_per_iteration": 2.6663105487823486 }, { "auxiliary_loss_clip": 0.01267842, "auxiliary_loss_mlp": 0.0103091, "balance_loss_clip": 1.04626429, "balance_loss_mlp": 1.02386165, "epoch": 0.7996152227499549, "flos": 38654209509120.0, "grad_norm": 2.0914517767057395, "language_loss": 0.67399764, "learning_rate": 4.0648187195040504e-07, "loss": 0.69698524, "num_input_tokens_seen": 143392275, "step": 6650, "time_per_iteration": 2.8233542442321777 }, { "auxiliary_loss_clip": 0.0111076, "auxiliary_loss_mlp": 0.01000927, "balance_loss_clip": 1.008865, "balance_loss_mlp": 0.99995595, "epoch": 0.799735465640594, "flos": 70243821947520.0, "grad_norm": 0.8163662007306148, "language_loss": 0.6755954, "learning_rate": 4.060112625168848e-07, "loss": 0.69671237, "num_input_tokens_seen": 143457385, "step": 6651, "time_per_iteration": 3.26642107963562 }, { "auxiliary_loss_clip": 0.01175563, "auxiliary_loss_mlp": 0.01026879, "balance_loss_clip": 1.04992712, "balance_loss_mlp": 1.01937473, "epoch": 0.7998557085312331, "flos": 24240995550720.0, "grad_norm": 2.50810137207009, "language_loss": 0.74187768, "learning_rate": 4.055408948901886e-07, "loss": 0.76390207, "num_input_tokens_seen": 143478785, "step": 6652, "time_per_iteration": 2.6128408908843994 }, { "auxiliary_loss_clip": 0.01229764, "auxiliary_loss_mlp": 0.01023126, "balance_loss_clip": 1.05042982, "balance_loss_mlp": 1.01582408, "epoch": 0.7999759514218722, "flos": 27564025449600.0, "grad_norm": 1.6871353641182028, "language_loss": 0.7134884, "learning_rate": 4.050707691416708e-07, "loss": 0.73601735, "num_input_tokens_seen": 143500095, "step": 6653, "time_per_iteration": 2.640617609024048 }, { "auxiliary_loss_clip": 0.01110367, "auxiliary_loss_mlp": 0.01000077, "balance_loss_clip": 1.00878263, "balance_loss_mlp": 0.99911112, "epoch": 0.8000961943125112, "flos": 67337428878720.0, "grad_norm": 0.6718449193686803, "language_loss": 0.59713471, "learning_rate": 4.046008853426495e-07, "loss": 0.61823916, "num_input_tokens_seen": 143563410, "step": 6654, "time_per_iteration": 3.255549192428589 }, { "auxiliary_loss_clip": 0.01320697, "auxiliary_loss_mlp": 0.01027164, "balance_loss_clip": 1.04562402, "balance_loss_mlp": 1.02001405, "epoch": 0.8002164372031504, "flos": 28733815676160.0, "grad_norm": 2.6039228488041477, "language_loss": 0.62786257, "learning_rate": 4.0413124356440464e-07, "loss": 0.6513412, "num_input_tokens_seen": 143587455, "step": 6655, "time_per_iteration": 2.764796733856201 }, { "auxiliary_loss_clip": 0.0137379, "auxiliary_loss_mlp": 0.01025469, "balance_loss_clip": 1.04234493, "balance_loss_mlp": 1.01818848, "epoch": 0.8003366800937894, "flos": 17639429725440.0, "grad_norm": 2.0684844634992903, "language_loss": 0.82283747, "learning_rate": 4.0366184387818223e-07, "loss": 0.84683007, "num_input_tokens_seen": 143605915, "step": 6656, "time_per_iteration": 2.712739944458008 }, { "auxiliary_loss_clip": 0.01181676, "auxiliary_loss_mlp": 0.01029171, "balance_loss_clip": 1.05184722, "balance_loss_mlp": 1.0215416, "epoch": 0.8004569229844285, "flos": 25995303797760.0, "grad_norm": 2.1752781907040517, "language_loss": 0.85272014, "learning_rate": 4.0319268635518797e-07, "loss": 0.87482858, "num_input_tokens_seen": 143626490, "step": 6657, "time_per_iteration": 2.679807424545288 }, { "auxiliary_loss_clip": 0.01224531, "auxiliary_loss_mlp": 0.01023936, "balance_loss_clip": 1.04716492, "balance_loss_mlp": 1.01765966, "epoch": 0.8005771658750677, "flos": 20812352688000.0, "grad_norm": 2.077679397144356, "language_loss": 0.75190496, "learning_rate": 4.027237710665943e-07, "loss": 0.77438962, "num_input_tokens_seen": 143644955, "step": 6658, "time_per_iteration": 3.537364959716797 }, { "auxiliary_loss_clip": 0.01324718, "auxiliary_loss_mlp": 0.01024142, "balance_loss_clip": 1.04380906, "balance_loss_mlp": 1.01702785, "epoch": 0.8006974087657067, "flos": 25812626204160.0, "grad_norm": 2.2185016798233037, "language_loss": 0.69555247, "learning_rate": 4.022550980835344e-07, "loss": 0.71904105, "num_input_tokens_seen": 143667200, "step": 6659, "time_per_iteration": 3.721189022064209 }, { "auxiliary_loss_clip": 0.0131925, "auxiliary_loss_mlp": 0.01029876, "balance_loss_clip": 1.04264593, "balance_loss_mlp": 1.02297986, "epoch": 0.8008176516563458, "flos": 17164690646400.0, "grad_norm": 2.719632408062194, "language_loss": 0.79483283, "learning_rate": 4.017866674771051e-07, "loss": 0.81832415, "num_input_tokens_seen": 143684685, "step": 6660, "time_per_iteration": 3.6000237464904785 }, { "auxiliary_loss_clip": 0.01366591, "auxiliary_loss_mlp": 0.01023351, "balance_loss_clip": 1.04038239, "balance_loss_mlp": 1.01648474, "epoch": 0.8009378945469849, "flos": 24207311571840.0, "grad_norm": 1.7156909884374745, "language_loss": 0.74671519, "learning_rate": 4.013184793183688e-07, "loss": 0.77061456, "num_input_tokens_seen": 143706780, "step": 6661, "time_per_iteration": 2.7734885215759277 }, { "auxiliary_loss_clip": 0.01223334, "auxiliary_loss_mlp": 0.01026537, "balance_loss_clip": 1.04674673, "balance_loss_mlp": 1.01933646, "epoch": 0.801058137437624, "flos": 19787318271360.0, "grad_norm": 3.2982898725232537, "language_loss": 0.72712266, "learning_rate": 4.008505336783472e-07, "loss": 0.74962133, "num_input_tokens_seen": 143724505, "step": 6662, "time_per_iteration": 2.6017673015594482 }, { "auxiliary_loss_clip": 0.01215862, "auxiliary_loss_mlp": 0.01026613, "balance_loss_clip": 1.04646027, "balance_loss_mlp": 1.02012515, "epoch": 0.801178380328263, "flos": 18659400324480.0, "grad_norm": 2.252747977353427, "language_loss": 0.81066442, "learning_rate": 4.003828306280284e-07, "loss": 0.83308917, "num_input_tokens_seen": 143742180, "step": 6663, "time_per_iteration": 2.6275711059570312 }, { "auxiliary_loss_clip": 0.01123803, "auxiliary_loss_mlp": 0.01025978, "balance_loss_clip": 1.04833269, "balance_loss_mlp": 1.0191977, "epoch": 0.8012986232189022, "flos": 15706573948800.0, "grad_norm": 2.8779595155325115, "language_loss": 0.78045571, "learning_rate": 3.999153702383626e-07, "loss": 0.80195349, "num_input_tokens_seen": 143760070, "step": 6664, "time_per_iteration": 2.6399126052856445 }, { "auxiliary_loss_clip": 0.01227654, "auxiliary_loss_mlp": 0.01029253, "balance_loss_clip": 1.04829574, "balance_loss_mlp": 1.02161145, "epoch": 0.8014188661095413, "flos": 28584139703040.0, "grad_norm": 7.573085870682205, "language_loss": 0.73866951, "learning_rate": 3.9944815258026263e-07, "loss": 0.76123852, "num_input_tokens_seen": 143781890, "step": 6665, "time_per_iteration": 2.7393569946289062 }, { "auxiliary_loss_clip": 0.01228402, "auxiliary_loss_mlp": 0.01023318, "balance_loss_clip": 1.04876339, "balance_loss_mlp": 1.01620138, "epoch": 0.8015391090001803, "flos": 29310360877440.0, "grad_norm": 1.7228681256034981, "language_loss": 0.8309164, "learning_rate": 3.989811777246057e-07, "loss": 0.85343367, "num_input_tokens_seen": 143802060, "step": 6666, "time_per_iteration": 2.7058985233306885 }, { "auxiliary_loss_clip": 0.01061862, "auxiliary_loss_mlp": 0.01002387, "balance_loss_clip": 1.00751519, "balance_loss_mlp": 1.00149298, "epoch": 0.8016593518908195, "flos": 70397340675840.0, "grad_norm": 0.8473748373653676, "language_loss": 0.66219085, "learning_rate": 3.985144457422305e-07, "loss": 0.68283337, "num_input_tokens_seen": 143856345, "step": 6667, "time_per_iteration": 4.068999767303467 }, { "auxiliary_loss_clip": 0.01174158, "auxiliary_loss_mlp": 0.01031047, "balance_loss_clip": 1.0494585, "balance_loss_mlp": 1.0246129, "epoch": 0.8017795947814585, "flos": 26026114688640.0, "grad_norm": 1.90450931134373, "language_loss": 0.76820219, "learning_rate": 3.9804795670394096e-07, "loss": 0.79025429, "num_input_tokens_seen": 143876470, "step": 6668, "time_per_iteration": 2.62266206741333 }, { "auxiliary_loss_clip": 0.01266984, "auxiliary_loss_mlp": 0.01026822, "balance_loss_clip": 1.04576886, "balance_loss_mlp": 1.0202055, "epoch": 0.8018998376720976, "flos": 22087181260800.0, "grad_norm": 2.0081702695733683, "language_loss": 0.70462835, "learning_rate": 3.975817106805022e-07, "loss": 0.72756642, "num_input_tokens_seen": 143895170, "step": 6669, "time_per_iteration": 2.675126791000366 }, { "auxiliary_loss_clip": 0.01323369, "auxiliary_loss_mlp": 0.01029517, "balance_loss_clip": 1.04639828, "balance_loss_mlp": 1.0223794, "epoch": 0.8020200805627368, "flos": 34568545023360.0, "grad_norm": 3.9007349378643292, "language_loss": 0.65277016, "learning_rate": 3.97115707742645e-07, "loss": 0.67629904, "num_input_tokens_seen": 143915845, "step": 6670, "time_per_iteration": 2.83388614654541 }, { "auxiliary_loss_clip": 0.01277045, "auxiliary_loss_mlp": 0.01026017, "balance_loss_clip": 1.04906726, "balance_loss_mlp": 1.01907301, "epoch": 0.8021403234533758, "flos": 20120354196480.0, "grad_norm": 2.290491276473115, "language_loss": 0.64973199, "learning_rate": 3.966499479610599e-07, "loss": 0.67276257, "num_input_tokens_seen": 143933940, "step": 6671, "time_per_iteration": 2.6538848876953125 }, { "auxiliary_loss_clip": 0.01320072, "auxiliary_loss_mlp": 0.01032208, "balance_loss_clip": 1.04748702, "balance_loss_mlp": 1.02514172, "epoch": 0.8022605663440149, "flos": 27746200252800.0, "grad_norm": 2.1138433176033056, "language_loss": 0.64854586, "learning_rate": 3.9618443140640225e-07, "loss": 0.67206872, "num_input_tokens_seen": 143952850, "step": 6672, "time_per_iteration": 2.7405736446380615 }, { "auxiliary_loss_clip": 0.01271239, "auxiliary_loss_mlp": 0.01002382, "balance_loss_clip": 1.00765753, "balance_loss_mlp": 1.00143993, "epoch": 0.802380809234654, "flos": 60244998768000.0, "grad_norm": 0.6864079553418202, "language_loss": 0.51274478, "learning_rate": 3.957191581492918e-07, "loss": 0.53548098, "num_input_tokens_seen": 144013610, "step": 6673, "time_per_iteration": 3.3522908687591553 }, { "auxiliary_loss_clip": 0.01268539, "auxiliary_loss_mlp": 0.01026439, "balance_loss_clip": 1.04578304, "balance_loss_mlp": 1.0199033, "epoch": 0.8025010521252931, "flos": 15080722352640.0, "grad_norm": 3.064995768141159, "language_loss": 0.71763325, "learning_rate": 3.952541282603097e-07, "loss": 0.740583, "num_input_tokens_seen": 144028715, "step": 6674, "time_per_iteration": 2.6060752868652344 }, { "auxiliary_loss_clip": 0.0122236, "auxiliary_loss_mlp": 0.01030035, "balance_loss_clip": 1.04912329, "balance_loss_mlp": 1.0231235, "epoch": 0.8026212950159322, "flos": 22163527618560.0, "grad_norm": 1.7378585788681622, "language_loss": 0.8368746, "learning_rate": 3.9478934181000013e-07, "loss": 0.85939854, "num_input_tokens_seen": 144048740, "step": 6675, "time_per_iteration": 2.6135292053222656 }, { "auxiliary_loss_clip": 0.01177958, "auxiliary_loss_mlp": 0.01030036, "balance_loss_clip": 1.04970694, "balance_loss_mlp": 1.02281189, "epoch": 0.8027415379065713, "flos": 17675986792320.0, "grad_norm": 2.4535129586636324, "language_loss": 0.849572, "learning_rate": 3.943247988688714e-07, "loss": 0.87165195, "num_input_tokens_seen": 144067435, "step": 6676, "time_per_iteration": 2.581153392791748 }, { "auxiliary_loss_clip": 0.01224932, "auxiliary_loss_mlp": 0.01028324, "balance_loss_clip": 1.04706287, "balance_loss_mlp": 1.02186036, "epoch": 0.8028617807972104, "flos": 21979593048960.0, "grad_norm": 1.8386457015330784, "language_loss": 0.71883476, "learning_rate": 3.938604995073933e-07, "loss": 0.74136734, "num_input_tokens_seen": 144085905, "step": 6677, "time_per_iteration": 2.620859384536743 }, { "auxiliary_loss_clip": 0.01276639, "auxiliary_loss_mlp": 0.01031638, "balance_loss_clip": 1.04733217, "balance_loss_mlp": 1.02453291, "epoch": 0.8029820236878494, "flos": 26428457905920.0, "grad_norm": 1.6823240748244546, "language_loss": 0.65520179, "learning_rate": 3.9339644379600157e-07, "loss": 0.67828453, "num_input_tokens_seen": 144105735, "step": 6678, "time_per_iteration": 2.705271005630493 }, { "auxiliary_loss_clip": 0.01126545, "auxiliary_loss_mlp": 0.0102702, "balance_loss_clip": 1.04982471, "balance_loss_mlp": 1.02082753, "epoch": 0.8031022665784886, "flos": 17676489582720.0, "grad_norm": 2.0691056825130656, "language_loss": 0.71219981, "learning_rate": 3.929326318050907e-07, "loss": 0.7337355, "num_input_tokens_seen": 144123405, "step": 6679, "time_per_iteration": 2.555757761001587 }, { "auxiliary_loss_clip": 0.01168699, "auxiliary_loss_mlp": 0.01022502, "balance_loss_clip": 1.04565287, "balance_loss_mlp": 1.01537895, "epoch": 0.8032225094691277, "flos": 15450279431040.0, "grad_norm": 3.9848858527152085, "language_loss": 0.79224223, "learning_rate": 3.924690636050225e-07, "loss": 0.81415427, "num_input_tokens_seen": 144140815, "step": 6680, "time_per_iteration": 2.582589864730835 }, { "auxiliary_loss_clip": 0.01226491, "auxiliary_loss_mlp": 0.01028038, "balance_loss_clip": 1.05047953, "balance_loss_mlp": 1.02095723, "epoch": 0.8033427523597667, "flos": 26179202453760.0, "grad_norm": 1.9071726687069162, "language_loss": 0.72933114, "learning_rate": 3.9200573926611915e-07, "loss": 0.75187641, "num_input_tokens_seen": 144162230, "step": 6681, "time_per_iteration": 2.643369436264038 }, { "auxiliary_loss_clip": 0.01228035, "auxiliary_loss_mlp": 0.01026734, "balance_loss_clip": 1.05186129, "balance_loss_mlp": 1.01984119, "epoch": 0.8034629952504058, "flos": 21324905809920.0, "grad_norm": 1.9788995699012273, "language_loss": 0.73229277, "learning_rate": 3.9154265885866613e-07, "loss": 0.75484037, "num_input_tokens_seen": 144181540, "step": 6682, "time_per_iteration": 2.6120049953460693 }, { "auxiliary_loss_clip": 0.01224329, "auxiliary_loss_mlp": 0.0101992, "balance_loss_clip": 1.04990959, "balance_loss_mlp": 1.01290774, "epoch": 0.8035832381410449, "flos": 21651585027840.0, "grad_norm": 3.3140809390015344, "language_loss": 0.74741828, "learning_rate": 3.9107982245291394e-07, "loss": 0.76986074, "num_input_tokens_seen": 144199665, "step": 6683, "time_per_iteration": 3.5193560123443604 }, { "auxiliary_loss_clip": 0.01323582, "auxiliary_loss_mlp": 0.01023884, "balance_loss_clip": 1.04888368, "balance_loss_mlp": 1.01702619, "epoch": 0.803703481031684, "flos": 20518818744960.0, "grad_norm": 1.9760793177861198, "language_loss": 0.7751323, "learning_rate": 3.9061723011907245e-07, "loss": 0.79860699, "num_input_tokens_seen": 144219020, "step": 6684, "time_per_iteration": 2.675445556640625 }, { "auxiliary_loss_clip": 0.01270675, "auxiliary_loss_mlp": 0.01026773, "balance_loss_clip": 1.04571128, "balance_loss_mlp": 1.01954246, "epoch": 0.803823723922323, "flos": 22854807838080.0, "grad_norm": 1.6904390241527318, "language_loss": 0.79710734, "learning_rate": 3.901548819273179e-07, "loss": 0.82008183, "num_input_tokens_seen": 144239035, "step": 6685, "time_per_iteration": 3.7043263912200928 }, { "auxiliary_loss_clip": 0.01224365, "auxiliary_loss_mlp": 0.01025597, "balance_loss_clip": 1.05022514, "balance_loss_mlp": 1.0189178, "epoch": 0.8039439668129622, "flos": 21362145235200.0, "grad_norm": 1.8765713946932534, "language_loss": 0.69552732, "learning_rate": 3.896927779477881e-07, "loss": 0.718027, "num_input_tokens_seen": 144258295, "step": 6686, "time_per_iteration": 3.559210777282715 }, { "auxiliary_loss_clip": 0.01323243, "auxiliary_loss_mlp": 0.01029568, "balance_loss_clip": 1.04458654, "balance_loss_mlp": 1.02230477, "epoch": 0.8040642097036013, "flos": 23802382575360.0, "grad_norm": 2.642549311986958, "language_loss": 0.66864729, "learning_rate": 3.892309182505833e-07, "loss": 0.69217539, "num_input_tokens_seen": 144276110, "step": 6687, "time_per_iteration": 2.7106645107269287 }, { "auxiliary_loss_clip": 0.01173367, "auxiliary_loss_mlp": 0.01030695, "balance_loss_clip": 1.04816151, "balance_loss_mlp": 1.02390933, "epoch": 0.8041844525942403, "flos": 25922046009600.0, "grad_norm": 2.21332321602218, "language_loss": 0.86257958, "learning_rate": 3.887693029057675e-07, "loss": 0.88462019, "num_input_tokens_seen": 144295620, "step": 6688, "time_per_iteration": 2.6855993270874023 }, { "auxiliary_loss_clip": 0.01273692, "auxiliary_loss_mlp": 0.010224, "balance_loss_clip": 1.04731083, "balance_loss_mlp": 1.01625168, "epoch": 0.8043046954848795, "flos": 25191120153600.0, "grad_norm": 2.064286687604893, "language_loss": 0.81299508, "learning_rate": 3.8830793198336684e-07, "loss": 0.83595604, "num_input_tokens_seen": 144315210, "step": 6689, "time_per_iteration": 2.6787054538726807 }, { "auxiliary_loss_clip": 0.01129374, "auxiliary_loss_mlp": 0.01025021, "balance_loss_clip": 1.0483166, "balance_loss_mlp": 1.01749003, "epoch": 0.8044249383755185, "flos": 41719185123840.0, "grad_norm": 1.6428371407147357, "language_loss": 0.70542252, "learning_rate": 3.878468055533721e-07, "loss": 0.7269665, "num_input_tokens_seen": 144337750, "step": 6690, "time_per_iteration": 2.7593507766723633 }, { "auxiliary_loss_clip": 0.01330424, "auxiliary_loss_mlp": 0.01029805, "balance_loss_clip": 1.04935503, "balance_loss_mlp": 1.02224064, "epoch": 0.8045451812661576, "flos": 20631434860800.0, "grad_norm": 3.2166834552961774, "language_loss": 0.84676993, "learning_rate": 3.8738592368573464e-07, "loss": 0.87037224, "num_input_tokens_seen": 144355305, "step": 6691, "time_per_iteration": 2.6781511306762695 }, { "auxiliary_loss_clip": 0.01318175, "auxiliary_loss_mlp": 0.01031225, "balance_loss_clip": 1.04557252, "balance_loss_mlp": 1.02364302, "epoch": 0.8046654241567968, "flos": 29711806254720.0, "grad_norm": 2.1436425266807313, "language_loss": 0.87982678, "learning_rate": 3.8692528645037137e-07, "loss": 0.90332079, "num_input_tokens_seen": 144374485, "step": 6692, "time_per_iteration": 3.6279428005218506 }, { "auxiliary_loss_clip": 0.01176137, "auxiliary_loss_mlp": 0.01026955, "balance_loss_clip": 1.05085754, "balance_loss_mlp": 1.01982331, "epoch": 0.8047856670474358, "flos": 17671389851520.0, "grad_norm": 2.2961588188690465, "language_loss": 0.7761085, "learning_rate": 3.8646489391715907e-07, "loss": 0.79813945, "num_input_tokens_seen": 144388780, "step": 6693, "time_per_iteration": 2.540644645690918 }, { "auxiliary_loss_clip": 0.01274589, "auxiliary_loss_mlp": 0.01026049, "balance_loss_clip": 1.04770219, "balance_loss_mlp": 1.01862228, "epoch": 0.8049059099380749, "flos": 17120699464320.0, "grad_norm": 2.556491785688585, "language_loss": 0.87671244, "learning_rate": 3.8600474615593903e-07, "loss": 0.89971888, "num_input_tokens_seen": 144403395, "step": 6694, "time_per_iteration": 2.658412456512451 }, { "auxiliary_loss_clip": 0.01224899, "auxiliary_loss_mlp": 0.00997848, "balance_loss_clip": 1.00913525, "balance_loss_mlp": 0.99694234, "epoch": 0.805026152828714, "flos": 62212903240320.0, "grad_norm": 0.7855348627458713, "language_loss": 0.59695101, "learning_rate": 3.8554484323651605e-07, "loss": 0.61917847, "num_input_tokens_seen": 144465265, "step": 6695, "time_per_iteration": 3.3200016021728516 }, { "auxiliary_loss_clip": 0.01221909, "auxiliary_loss_mlp": 0.02567393, "balance_loss_clip": 1.04903507, "balance_loss_mlp": 0.99991369, "epoch": 0.8051463957193531, "flos": 21688608971520.0, "grad_norm": 2.1219496683415513, "language_loss": 0.79087651, "learning_rate": 3.85085185228657e-07, "loss": 0.82876951, "num_input_tokens_seen": 144484235, "step": 6696, "time_per_iteration": 2.650770664215088 }, { "auxiliary_loss_clip": 0.01271407, "auxiliary_loss_mlp": 0.01037562, "balance_loss_clip": 1.04669213, "balance_loss_mlp": 1.02970862, "epoch": 0.8052666386099921, "flos": 32051458535040.0, "grad_norm": 1.8084300153142356, "language_loss": 0.7331264, "learning_rate": 3.8462577220209114e-07, "loss": 0.75621611, "num_input_tokens_seen": 144504610, "step": 6697, "time_per_iteration": 2.775339126586914 }, { "auxiliary_loss_clip": 0.01061798, "auxiliary_loss_mlp": 0.01001119, "balance_loss_clip": 1.00746858, "balance_loss_mlp": 1.00027835, "epoch": 0.8053868815006313, "flos": 67157875768320.0, "grad_norm": 0.7028846796817646, "language_loss": 0.58892626, "learning_rate": 3.8416660422651127e-07, "loss": 0.60955542, "num_input_tokens_seen": 144574260, "step": 6698, "time_per_iteration": 3.3232975006103516 }, { "auxiliary_loss_clip": 0.01323274, "auxiliary_loss_mlp": 0.01025342, "balance_loss_clip": 1.04471254, "balance_loss_mlp": 1.01827288, "epoch": 0.8055071243912704, "flos": 23837000307840.0, "grad_norm": 2.4568489614314855, "language_loss": 0.68362087, "learning_rate": 3.837076813715723e-07, "loss": 0.70710707, "num_input_tokens_seen": 144594145, "step": 6699, "time_per_iteration": 2.692138910293579 }, { "auxiliary_loss_clip": 0.01326135, "auxiliary_loss_mlp": 0.01027438, "balance_loss_clip": 1.044052, "balance_loss_mlp": 1.0196147, "epoch": 0.8056273672819094, "flos": 21324510760320.0, "grad_norm": 1.9673640699031927, "language_loss": 0.75169855, "learning_rate": 3.832490037068941e-07, "loss": 0.77523434, "num_input_tokens_seen": 144612935, "step": 6700, "time_per_iteration": 2.7322816848754883 }, { "auxiliary_loss_clip": 0.01416093, "auxiliary_loss_mlp": 0.01023069, "balance_loss_clip": 1.0406673, "balance_loss_mlp": 1.01616716, "epoch": 0.8057476101725486, "flos": 25768383626880.0, "grad_norm": 1.9310553554004837, "language_loss": 0.75962663, "learning_rate": 3.827905713020554e-07, "loss": 0.78401822, "num_input_tokens_seen": 144630580, "step": 6701, "time_per_iteration": 2.783207893371582 }, { "auxiliary_loss_clip": 0.01329968, "auxiliary_loss_mlp": 0.01027328, "balance_loss_clip": 1.04395461, "balance_loss_mlp": 1.01940644, "epoch": 0.8058678530631876, "flos": 24535283679360.0, "grad_norm": 2.063306038362226, "language_loss": 0.68780315, "learning_rate": 3.823323842266017e-07, "loss": 0.71137607, "num_input_tokens_seen": 144649975, "step": 6702, "time_per_iteration": 2.7427124977111816 }, { "auxiliary_loss_clip": 0.01224649, "auxiliary_loss_mlp": 0.01026523, "balance_loss_clip": 1.04568636, "balance_loss_mlp": 1.01974297, "epoch": 0.8059880959538267, "flos": 24753728240640.0, "grad_norm": 3.2559413595575486, "language_loss": 0.72751713, "learning_rate": 3.818744425500393e-07, "loss": 0.75002885, "num_input_tokens_seen": 144667990, "step": 6703, "time_per_iteration": 2.658201217651367 }, { "auxiliary_loss_clip": 0.0132028, "auxiliary_loss_mlp": 0.0102536, "balance_loss_clip": 1.04381084, "balance_loss_mlp": 1.01916373, "epoch": 0.8061083388444659, "flos": 22196349671040.0, "grad_norm": 2.205149920412109, "language_loss": 0.80629587, "learning_rate": 3.8141674634183675e-07, "loss": 0.82975227, "num_input_tokens_seen": 144687020, "step": 6704, "time_per_iteration": 2.719902276992798 }, { "auxiliary_loss_clip": 0.0136823, "auxiliary_loss_mlp": 0.01024599, "balance_loss_clip": 1.04568398, "balance_loss_mlp": 1.01826036, "epoch": 0.8062285817351049, "flos": 30044195735040.0, "grad_norm": 2.2045857598090572, "language_loss": 0.6652922, "learning_rate": 3.809592956714278e-07, "loss": 0.68922049, "num_input_tokens_seen": 144710255, "step": 6705, "time_per_iteration": 2.7683217525482178 }, { "auxiliary_loss_clip": 0.01230733, "auxiliary_loss_mlp": 0.0102895, "balance_loss_clip": 1.05055904, "balance_loss_mlp": 1.02209878, "epoch": 0.806348824625744, "flos": 22782591544320.0, "grad_norm": 2.2554455792002024, "language_loss": 0.74552441, "learning_rate": 3.805020906082057e-07, "loss": 0.76812124, "num_input_tokens_seen": 144728830, "step": 6706, "time_per_iteration": 2.6473777294158936 }, { "auxiliary_loss_clip": 0.0127927, "auxiliary_loss_mlp": 0.01023683, "balance_loss_clip": 1.04907823, "balance_loss_mlp": 1.01662302, "epoch": 0.8064690675163831, "flos": 23404600385280.0, "grad_norm": 2.3776203582344544, "language_loss": 0.81115985, "learning_rate": 3.8004513122152917e-07, "loss": 0.83418941, "num_input_tokens_seen": 144747140, "step": 6707, "time_per_iteration": 2.6786985397338867 }, { "auxiliary_loss_clip": 0.01275706, "auxiliary_loss_mlp": 0.01027311, "balance_loss_clip": 1.05118132, "balance_loss_mlp": 1.02062082, "epoch": 0.8065893104070222, "flos": 24060903736320.0, "grad_norm": 1.8299307235722482, "language_loss": 0.67455614, "learning_rate": 3.79588417580718e-07, "loss": 0.6975863, "num_input_tokens_seen": 144765250, "step": 6708, "time_per_iteration": 2.6912784576416016 }, { "auxiliary_loss_clip": 0.01227523, "auxiliary_loss_mlp": 0.010299, "balance_loss_clip": 1.05048537, "balance_loss_mlp": 1.02324533, "epoch": 0.8067095532976613, "flos": 22305410340480.0, "grad_norm": 2.110510093263405, "language_loss": 0.7641902, "learning_rate": 3.791319497550558e-07, "loss": 0.78676438, "num_input_tokens_seen": 144783080, "step": 6709, "time_per_iteration": 2.6142446994781494 }, { "auxiliary_loss_clip": 0.01229974, "auxiliary_loss_mlp": 0.02565778, "balance_loss_clip": 1.0488354, "balance_loss_mlp": 0.99990618, "epoch": 0.8068297961883004, "flos": 17129498296320.0, "grad_norm": 2.7496591662259684, "language_loss": 0.7089771, "learning_rate": 3.78675727813788e-07, "loss": 0.74693465, "num_input_tokens_seen": 144800645, "step": 6710, "time_per_iteration": 3.608246088027954 }, { "auxiliary_loss_clip": 0.01271765, "auxiliary_loss_mlp": 0.01031183, "balance_loss_clip": 1.04735303, "balance_loss_mlp": 1.02321351, "epoch": 0.8069500390789395, "flos": 22018843635840.0, "grad_norm": 1.6852887430258563, "language_loss": 0.73441982, "learning_rate": 3.782197518261225e-07, "loss": 0.75744933, "num_input_tokens_seen": 144820085, "step": 6711, "time_per_iteration": 3.707763433456421 }, { "auxiliary_loss_clip": 0.0128033, "auxiliary_loss_mlp": 0.01023946, "balance_loss_clip": 1.04869843, "balance_loss_mlp": 1.0174849, "epoch": 0.8070702819695785, "flos": 19244241567360.0, "grad_norm": 3.2294617272704333, "language_loss": 0.95659709, "learning_rate": 3.777640218612319e-07, "loss": 0.97963989, "num_input_tokens_seen": 144838070, "step": 6712, "time_per_iteration": 2.6053528785705566 }, { "auxiliary_loss_clip": 0.01221033, "auxiliary_loss_mlp": 0.0102539, "balance_loss_clip": 1.04915273, "balance_loss_mlp": 1.01867819, "epoch": 0.8071905248602176, "flos": 21544320038400.0, "grad_norm": 2.189462378581128, "language_loss": 0.72319853, "learning_rate": 3.773085379882488e-07, "loss": 0.74566275, "num_input_tokens_seen": 144857125, "step": 6713, "time_per_iteration": 3.5437817573547363 }, { "auxiliary_loss_clip": 0.01224904, "auxiliary_loss_mlp": 0.02570352, "balance_loss_clip": 1.04626989, "balance_loss_mlp": 0.9999367, "epoch": 0.8073107677508568, "flos": 37268309105280.0, "grad_norm": 1.8790625951769118, "language_loss": 0.75948626, "learning_rate": 3.768533002762715e-07, "loss": 0.79743886, "num_input_tokens_seen": 144880660, "step": 6714, "time_per_iteration": 2.7446651458740234 }, { "auxiliary_loss_clip": 0.01274258, "auxiliary_loss_mlp": 0.01022432, "balance_loss_clip": 1.04436898, "balance_loss_mlp": 1.01528287, "epoch": 0.8074310106414958, "flos": 28366269759360.0, "grad_norm": 1.8039785923485971, "language_loss": 0.77228302, "learning_rate": 3.763983087943572e-07, "loss": 0.79524994, "num_input_tokens_seen": 144900050, "step": 6715, "time_per_iteration": 2.665480375289917 }, { "auxiliary_loss_clip": 0.01217919, "auxiliary_loss_mlp": 0.02565395, "balance_loss_clip": 1.0449475, "balance_loss_mlp": 0.99993658, "epoch": 0.8075512535321349, "flos": 24281646768000.0, "grad_norm": 1.8089576610851728, "language_loss": 0.81270599, "learning_rate": 3.759435636115282e-07, "loss": 0.85053915, "num_input_tokens_seen": 144920835, "step": 6716, "time_per_iteration": 2.7549901008605957 }, { "auxiliary_loss_clip": 0.01422111, "auxiliary_loss_mlp": 0.02567801, "balance_loss_clip": 1.04571116, "balance_loss_mlp": 0.99993819, "epoch": 0.807671496422774, "flos": 26030855283840.0, "grad_norm": 1.829771567291944, "language_loss": 0.72980082, "learning_rate": 3.7548906479676967e-07, "loss": 0.76969993, "num_input_tokens_seen": 144940430, "step": 6717, "time_per_iteration": 2.8344368934631348 }, { "auxiliary_loss_clip": 0.01226854, "auxiliary_loss_mlp": 0.01027566, "balance_loss_clip": 1.04662347, "balance_loss_mlp": 1.02005005, "epoch": 0.8077917393134131, "flos": 23730740899200.0, "grad_norm": 2.198076224325224, "language_loss": 0.71411264, "learning_rate": 3.7503481241902855e-07, "loss": 0.73665679, "num_input_tokens_seen": 144960405, "step": 6718, "time_per_iteration": 3.6452670097351074 }, { "auxiliary_loss_clip": 0.01272586, "auxiliary_loss_mlp": 0.02565141, "balance_loss_clip": 1.0452038, "balance_loss_mlp": 0.999897, "epoch": 0.8079119822040521, "flos": 18402028398720.0, "grad_norm": 2.2536268465887908, "language_loss": 0.80728453, "learning_rate": 3.745808065472145e-07, "loss": 0.84566182, "num_input_tokens_seen": 144977700, "step": 6719, "time_per_iteration": 2.6567273139953613 }, { "auxiliary_loss_clip": 0.01228662, "auxiliary_loss_mlp": 0.01026664, "balance_loss_clip": 1.0551542, "balance_loss_mlp": 1.02007425, "epoch": 0.8080322250946913, "flos": 23621787970560.0, "grad_norm": 1.9059482356878545, "language_loss": 0.76347792, "learning_rate": 3.741270472501994e-07, "loss": 0.78603125, "num_input_tokens_seen": 144998340, "step": 6720, "time_per_iteration": 2.7569518089294434 }, { "auxiliary_loss_clip": 0.01271179, "auxiliary_loss_mlp": 0.01025403, "balance_loss_clip": 1.0498842, "balance_loss_mlp": 1.01844168, "epoch": 0.8081524679853304, "flos": 22820692896000.0, "grad_norm": 1.7012332403831656, "language_loss": 0.72625244, "learning_rate": 3.736735345968183e-07, "loss": 0.74921834, "num_input_tokens_seen": 145017950, "step": 6721, "time_per_iteration": 2.679767608642578 }, { "auxiliary_loss_clip": 0.012269, "auxiliary_loss_mlp": 0.01027015, "balance_loss_clip": 1.05004907, "balance_loss_mlp": 1.0201633, "epoch": 0.8082727108759694, "flos": 17640004343040.0, "grad_norm": 1.54592283445129, "language_loss": 0.78853136, "learning_rate": 3.7322026865586986e-07, "loss": 0.81107044, "num_input_tokens_seen": 145036985, "step": 6722, "time_per_iteration": 2.600980281829834 }, { "auxiliary_loss_clip": 0.01233877, "auxiliary_loss_mlp": 0.0102075, "balance_loss_clip": 1.05298734, "balance_loss_mlp": 1.0137229, "epoch": 0.8083929537666086, "flos": 25958172113280.0, "grad_norm": 2.2704576563158314, "language_loss": 0.73497331, "learning_rate": 3.7276724949611206e-07, "loss": 0.7575196, "num_input_tokens_seen": 145057095, "step": 6723, "time_per_iteration": 2.6731224060058594 }, { "auxiliary_loss_clip": 0.01276017, "auxiliary_loss_mlp": 0.01029627, "balance_loss_clip": 1.04843092, "balance_loss_mlp": 1.02255774, "epoch": 0.8085131966572476, "flos": 27089178629760.0, "grad_norm": 2.0831425121011407, "language_loss": 0.75040948, "learning_rate": 3.723144771862694e-07, "loss": 0.77346599, "num_input_tokens_seen": 145077735, "step": 6724, "time_per_iteration": 2.6592373847961426 }, { "auxiliary_loss_clip": 0.01323559, "auxiliary_loss_mlp": 0.01026763, "balance_loss_clip": 1.04463053, "balance_loss_mlp": 1.01997972, "epoch": 0.8086334395478867, "flos": 23988543788160.0, "grad_norm": 3.036168602826041, "language_loss": 0.76976347, "learning_rate": 3.718619517950263e-07, "loss": 0.79326665, "num_input_tokens_seen": 145098330, "step": 6725, "time_per_iteration": 2.7119388580322266 }, { "auxiliary_loss_clip": 0.0117636, "auxiliary_loss_mlp": 0.01024558, "balance_loss_clip": 1.05155003, "balance_loss_mlp": 1.01761985, "epoch": 0.8087536824385259, "flos": 20405879406720.0, "grad_norm": 1.8445856520251995, "language_loss": 0.76755941, "learning_rate": 3.714096733910301e-07, "loss": 0.7895686, "num_input_tokens_seen": 145115855, "step": 6726, "time_per_iteration": 2.5486254692077637 }, { "auxiliary_loss_clip": 0.01133788, "auxiliary_loss_mlp": 0.01028834, "balance_loss_clip": 1.05199838, "balance_loss_mlp": 1.02103138, "epoch": 0.8088739253291649, "flos": 25919639798400.0, "grad_norm": 2.380040016646791, "language_loss": 0.70386159, "learning_rate": 3.709576420428926e-07, "loss": 0.72548783, "num_input_tokens_seen": 145136655, "step": 6727, "time_per_iteration": 2.6482186317443848 }, { "auxiliary_loss_clip": 0.01275954, "auxiliary_loss_mlp": 0.01021044, "balance_loss_clip": 1.04539704, "balance_loss_mlp": 1.01404309, "epoch": 0.808994168219804, "flos": 28402072640640.0, "grad_norm": 2.0900462055796827, "language_loss": 0.73420751, "learning_rate": 3.7050585781918463e-07, "loss": 0.75717747, "num_input_tokens_seen": 145156955, "step": 6728, "time_per_iteration": 2.676119327545166 }, { "auxiliary_loss_clip": 0.01228424, "auxiliary_loss_mlp": 0.01026739, "balance_loss_clip": 1.04943275, "balance_loss_mlp": 1.01964939, "epoch": 0.8091144111104431, "flos": 17421056991360.0, "grad_norm": 4.907000450009968, "language_loss": 0.68959165, "learning_rate": 3.700543207884428e-07, "loss": 0.71214336, "num_input_tokens_seen": 145173865, "step": 6729, "time_per_iteration": 2.596876621246338 }, { "auxiliary_loss_clip": 0.01219812, "auxiliary_loss_mlp": 0.01032576, "balance_loss_clip": 1.04890442, "balance_loss_mlp": 1.02579319, "epoch": 0.8092346540010822, "flos": 32153803361280.0, "grad_norm": 1.9138602362149857, "language_loss": 0.71070206, "learning_rate": 3.6960303101916466e-07, "loss": 0.73322594, "num_input_tokens_seen": 145193780, "step": 6730, "time_per_iteration": 2.7082972526550293 }, { "auxiliary_loss_clip": 0.01061364, "auxiliary_loss_mlp": 0.0250596, "balance_loss_clip": 1.00711143, "balance_loss_mlp": 0.99993342, "epoch": 0.8093548968917212, "flos": 58035093390720.0, "grad_norm": 0.7440441532809146, "language_loss": 0.55494654, "learning_rate": 3.6915198857981047e-07, "loss": 0.59061974, "num_input_tokens_seen": 145258980, "step": 6731, "time_per_iteration": 3.237215757369995 }, { "auxiliary_loss_clip": 0.01331164, "auxiliary_loss_mlp": 0.01032832, "balance_loss_clip": 1.04801464, "balance_loss_mlp": 1.0252142, "epoch": 0.8094751397823604, "flos": 27381599251200.0, "grad_norm": 11.660067049715586, "language_loss": 0.68193805, "learning_rate": 3.687011935388027e-07, "loss": 0.70557797, "num_input_tokens_seen": 145281875, "step": 6732, "time_per_iteration": 2.737178087234497 }, { "auxiliary_loss_clip": 0.0122341, "auxiliary_loss_mlp": 0.01024853, "balance_loss_clip": 1.04936385, "balance_loss_mlp": 1.01804936, "epoch": 0.8095953826729995, "flos": 24061083304320.0, "grad_norm": 2.1463714415549346, "language_loss": 0.72860175, "learning_rate": 3.6825064596452646e-07, "loss": 0.75108439, "num_input_tokens_seen": 145302220, "step": 6733, "time_per_iteration": 2.7720930576324463 }, { "auxiliary_loss_clip": 0.01226349, "auxiliary_loss_mlp": 0.01021994, "balance_loss_clip": 1.04792213, "balance_loss_mlp": 1.01528859, "epoch": 0.8097156255636385, "flos": 23951412103680.0, "grad_norm": 1.7058825741225743, "language_loss": 0.70394045, "learning_rate": 3.678003459253305e-07, "loss": 0.72642386, "num_input_tokens_seen": 145323070, "step": 6734, "time_per_iteration": 2.691985845565796 }, { "auxiliary_loss_clip": 0.0132025, "auxiliary_loss_mlp": 0.01025805, "balance_loss_clip": 1.04545355, "balance_loss_mlp": 1.01810384, "epoch": 0.8098358684542777, "flos": 21799142098560.0, "grad_norm": 2.9283684038263935, "language_loss": 0.74295235, "learning_rate": 3.673502934895236e-07, "loss": 0.76641291, "num_input_tokens_seen": 145342575, "step": 6735, "time_per_iteration": 2.674635410308838 }, { "auxiliary_loss_clip": 0.01061279, "auxiliary_loss_mlp": 0.01002343, "balance_loss_clip": 1.00694728, "balance_loss_mlp": 1.00147271, "epoch": 0.8099561113449167, "flos": 68809515966720.0, "grad_norm": 0.6832412664277885, "language_loss": 0.57925606, "learning_rate": 3.669004887253802e-07, "loss": 0.59989226, "num_input_tokens_seen": 145408865, "step": 6736, "time_per_iteration": 4.16372013092041 }, { "auxiliary_loss_clip": 0.01277691, "auxiliary_loss_mlp": 0.0102586, "balance_loss_clip": 1.04905748, "balance_loss_mlp": 1.01908851, "epoch": 0.8100763542355558, "flos": 23586056916480.0, "grad_norm": 1.8593513781166882, "language_loss": 0.79294515, "learning_rate": 3.664509317011335e-07, "loss": 0.81598067, "num_input_tokens_seen": 145429200, "step": 6737, "time_per_iteration": 3.621014356613159 }, { "auxiliary_loss_clip": 0.01223917, "auxiliary_loss_mlp": 0.01031142, "balance_loss_clip": 1.04985642, "balance_loss_mlp": 1.02354264, "epoch": 0.810196597126195, "flos": 31650408207360.0, "grad_norm": 1.7921676433308114, "language_loss": 0.73825002, "learning_rate": 3.6600162248498134e-07, "loss": 0.7608006, "num_input_tokens_seen": 145452830, "step": 6738, "time_per_iteration": 2.724585771560669 }, { "auxiliary_loss_clip": 0.0140432, "auxiliary_loss_mlp": 0.01026944, "balance_loss_clip": 1.03788924, "balance_loss_mlp": 1.02036047, "epoch": 0.810316840016834, "flos": 24900459298560.0, "grad_norm": 1.715926960072817, "language_loss": 0.76316071, "learning_rate": 3.6555256114508426e-07, "loss": 0.78747332, "num_input_tokens_seen": 145472625, "step": 6739, "time_per_iteration": 3.713599443435669 }, { "auxiliary_loss_clip": 0.01274728, "auxiliary_loss_mlp": 0.01027094, "balance_loss_clip": 1.04331934, "balance_loss_mlp": 1.02019453, "epoch": 0.8104370829074731, "flos": 27965003950080.0, "grad_norm": 1.9466318322342349, "language_loss": 0.72983217, "learning_rate": 3.651037477495642e-07, "loss": 0.75285035, "num_input_tokens_seen": 145494075, "step": 6740, "time_per_iteration": 2.716322898864746 }, { "auxiliary_loss_clip": 0.01172575, "auxiliary_loss_mlp": 0.01030649, "balance_loss_clip": 1.04741967, "balance_loss_mlp": 1.02286482, "epoch": 0.8105573257981122, "flos": 24640752988800.0, "grad_norm": 1.9325223112421896, "language_loss": 0.67974895, "learning_rate": 3.6465518236650584e-07, "loss": 0.70178121, "num_input_tokens_seen": 145514220, "step": 6741, "time_per_iteration": 2.652621269226074 }, { "auxiliary_loss_clip": 0.01319137, "auxiliary_loss_mlp": 0.01029019, "balance_loss_clip": 1.04353118, "balance_loss_mlp": 1.02242362, "epoch": 0.8106775686887513, "flos": 26358935132160.0, "grad_norm": 1.9586106310060565, "language_loss": 0.78429246, "learning_rate": 3.642068650639558e-07, "loss": 0.80777407, "num_input_tokens_seen": 145533965, "step": 6742, "time_per_iteration": 2.7407236099243164 }, { "auxiliary_loss_clip": 0.01267964, "auxiliary_loss_mlp": 0.01025376, "balance_loss_clip": 1.04211521, "balance_loss_mlp": 1.01870036, "epoch": 0.8107978115793903, "flos": 27271892136960.0, "grad_norm": 2.196608212390339, "language_loss": 0.64747512, "learning_rate": 3.6375879590992334e-07, "loss": 0.67040849, "num_input_tokens_seen": 145554310, "step": 6743, "time_per_iteration": 2.679072141647339 }, { "auxiliary_loss_clip": 0.01271897, "auxiliary_loss_mlp": 0.01026069, "balance_loss_clip": 1.04509425, "balance_loss_mlp": 1.01872563, "epoch": 0.8109180544700295, "flos": 24934322845440.0, "grad_norm": 1.8247672243784028, "language_loss": 0.81216884, "learning_rate": 3.6331097497238173e-07, "loss": 0.83514851, "num_input_tokens_seen": 145573755, "step": 6744, "time_per_iteration": 3.632927417755127 }, { "auxiliary_loss_clip": 0.01320193, "auxiliary_loss_mlp": 0.01024252, "balance_loss_clip": 1.04497349, "balance_loss_mlp": 1.01791859, "epoch": 0.8110382973606686, "flos": 21105383840640.0, "grad_norm": 2.434668965330586, "language_loss": 0.79763806, "learning_rate": 3.628634023192627e-07, "loss": 0.82108247, "num_input_tokens_seen": 145594000, "step": 6745, "time_per_iteration": 2.705348491668701 }, { "auxiliary_loss_clip": 0.01226865, "auxiliary_loss_mlp": 0.0102524, "balance_loss_clip": 1.04823291, "balance_loss_mlp": 1.01790226, "epoch": 0.8111585402513076, "flos": 15414081500160.0, "grad_norm": 3.8627257766080967, "language_loss": 0.75413805, "learning_rate": 3.624160780184644e-07, "loss": 0.77665913, "num_input_tokens_seen": 145611215, "step": 6746, "time_per_iteration": 2.5655324459075928 }, { "auxiliary_loss_clip": 0.01268915, "auxiliary_loss_mlp": 0.01028313, "balance_loss_clip": 1.04510093, "balance_loss_mlp": 1.02095222, "epoch": 0.8112787831419467, "flos": 24095736950400.0, "grad_norm": 1.9674641290776718, "language_loss": 0.74755454, "learning_rate": 3.6196900213784496e-07, "loss": 0.77052689, "num_input_tokens_seen": 145630530, "step": 6747, "time_per_iteration": 2.700800895690918 }, { "auxiliary_loss_clip": 0.01224173, "auxiliary_loss_mlp": 0.01024636, "balance_loss_clip": 1.04814315, "balance_loss_mlp": 1.01770735, "epoch": 0.8113990260325858, "flos": 20483374999680.0, "grad_norm": 2.567541989536935, "language_loss": 0.86882198, "learning_rate": 3.6152217474522527e-07, "loss": 0.8913101, "num_input_tokens_seen": 145647345, "step": 6748, "time_per_iteration": 2.613940715789795 }, { "auxiliary_loss_clip": 0.01227, "auxiliary_loss_mlp": 0.01025948, "balance_loss_clip": 1.05261719, "balance_loss_mlp": 1.01894462, "epoch": 0.8115192689232249, "flos": 24901141656960.0, "grad_norm": 1.7327727734602034, "language_loss": 0.72991067, "learning_rate": 3.6107559590838975e-07, "loss": 0.75244015, "num_input_tokens_seen": 145666330, "step": 6749, "time_per_iteration": 2.679319143295288 }, { "auxiliary_loss_clip": 0.01424747, "auxiliary_loss_mlp": 0.01028718, "balance_loss_clip": 1.04302263, "balance_loss_mlp": 1.02189589, "epoch": 0.811639511813864, "flos": 24057204635520.0, "grad_norm": 2.448550034621646, "language_loss": 0.66031092, "learning_rate": 3.606292656950822e-07, "loss": 0.68484557, "num_input_tokens_seen": 145684740, "step": 6750, "time_per_iteration": 2.7487778663635254 }, { "auxiliary_loss_clip": 0.01271503, "auxiliary_loss_mlp": 0.01029429, "balance_loss_clip": 1.04533494, "balance_loss_mlp": 1.02217829, "epoch": 0.8117597547045031, "flos": 23185150243200.0, "grad_norm": 3.0294185429645695, "language_loss": 0.8636533, "learning_rate": 3.601831841730121e-07, "loss": 0.8866626, "num_input_tokens_seen": 145702660, "step": 6751, "time_per_iteration": 2.668571710586548 }, { "auxiliary_loss_clip": 0.01220679, "auxiliary_loss_mlp": 0.0102885, "balance_loss_clip": 1.04835272, "balance_loss_mlp": 1.02214158, "epoch": 0.8118799975951422, "flos": 23040250778880.0, "grad_norm": 2.035044792363082, "language_loss": 0.72689068, "learning_rate": 3.5973735140984916e-07, "loss": 0.74938589, "num_input_tokens_seen": 145722830, "step": 6752, "time_per_iteration": 2.6420114040374756 }, { "auxiliary_loss_clip": 0.01268326, "auxiliary_loss_mlp": 0.02564357, "balance_loss_clip": 1.04353929, "balance_loss_mlp": 0.99990356, "epoch": 0.8120002404857812, "flos": 24639962889600.0, "grad_norm": 2.2564622032197534, "language_loss": 0.79740095, "learning_rate": 3.5929176747322607e-07, "loss": 0.83572769, "num_input_tokens_seen": 145741935, "step": 6753, "time_per_iteration": 2.8492257595062256 }, { "auxiliary_loss_clip": 0.01172196, "auxiliary_loss_mlp": 0.01002435, "balance_loss_clip": 1.00695169, "balance_loss_mlp": 1.00159454, "epoch": 0.8121204833764204, "flos": 57415742156160.0, "grad_norm": 0.807768289300806, "language_loss": 0.5614382, "learning_rate": 3.588464324307372e-07, "loss": 0.58318454, "num_input_tokens_seen": 145805560, "step": 6754, "time_per_iteration": 3.258282423019409 }, { "auxiliary_loss_clip": 0.01227258, "auxiliary_loss_mlp": 0.01024945, "balance_loss_clip": 1.04734242, "balance_loss_mlp": 1.0187614, "epoch": 0.8122407262670595, "flos": 19464589549440.0, "grad_norm": 1.723160496097561, "language_loss": 0.75894356, "learning_rate": 3.584013463499391e-07, "loss": 0.78146553, "num_input_tokens_seen": 145824180, "step": 6755, "time_per_iteration": 2.623723268508911 }, { "auxiliary_loss_clip": 0.01166135, "auxiliary_loss_mlp": 0.01002634, "balance_loss_clip": 1.00810313, "balance_loss_mlp": 1.00174034, "epoch": 0.8123609691576985, "flos": 56425325472000.0, "grad_norm": 0.7311105471600292, "language_loss": 0.64351213, "learning_rate": 3.579565092983521e-07, "loss": 0.66519982, "num_input_tokens_seen": 145885300, "step": 6756, "time_per_iteration": 3.0971193313598633 }, { "auxiliary_loss_clip": 0.01175015, "auxiliary_loss_mlp": 0.01027046, "balance_loss_clip": 1.0506289, "balance_loss_mlp": 1.01995277, "epoch": 0.8124812120483377, "flos": 20631973564800.0, "grad_norm": 1.855919411370266, "language_loss": 0.83887732, "learning_rate": 3.575119213434565e-07, "loss": 0.8608979, "num_input_tokens_seen": 145903815, "step": 6757, "time_per_iteration": 2.617391347885132 }, { "auxiliary_loss_clip": 0.01221061, "auxiliary_loss_mlp": 0.01023527, "balance_loss_clip": 1.04905581, "balance_loss_mlp": 1.01699114, "epoch": 0.8126014549389767, "flos": 22492397566080.0, "grad_norm": 1.8079067583546653, "language_loss": 0.81859404, "learning_rate": 3.5706758255269765e-07, "loss": 0.8410399, "num_input_tokens_seen": 145922270, "step": 6758, "time_per_iteration": 2.6401748657226562 }, { "auxiliary_loss_clip": 0.01276002, "auxiliary_loss_mlp": 0.01027844, "balance_loss_clip": 1.04784179, "balance_loss_mlp": 1.02078974, "epoch": 0.8127216978296158, "flos": 23287961946240.0, "grad_norm": 1.7255443293689678, "language_loss": 0.69684076, "learning_rate": 3.566234929934795e-07, "loss": 0.71987915, "num_input_tokens_seen": 145941470, "step": 6759, "time_per_iteration": 2.6376864910125732 }, { "auxiliary_loss_clip": 0.01229396, "auxiliary_loss_mlp": 0.01023961, "balance_loss_clip": 1.05413699, "balance_loss_mlp": 1.01736927, "epoch": 0.812841940720255, "flos": 25154994049920.0, "grad_norm": 1.475941635379703, "language_loss": 0.71704096, "learning_rate": 3.561796527331706e-07, "loss": 0.73957455, "num_input_tokens_seen": 145963145, "step": 6760, "time_per_iteration": 2.7162322998046875 }, { "auxiliary_loss_clip": 0.01323051, "auxiliary_loss_mlp": 0.01025414, "balance_loss_clip": 1.0453918, "balance_loss_mlp": 1.01790643, "epoch": 0.812962183610894, "flos": 26648446752000.0, "grad_norm": 1.9524382514215117, "language_loss": 0.77078235, "learning_rate": 3.5573606183910163e-07, "loss": 0.79426694, "num_input_tokens_seen": 145983150, "step": 6761, "time_per_iteration": 2.704862594604492 }, { "auxiliary_loss_clip": 0.01229688, "auxiliary_loss_mlp": 0.01027471, "balance_loss_clip": 1.04747331, "balance_loss_mlp": 1.01946282, "epoch": 0.8130824265015331, "flos": 24966965329920.0, "grad_norm": 2.0340169689240732, "language_loss": 0.79046547, "learning_rate": 3.5529272037856493e-07, "loss": 0.81303704, "num_input_tokens_seen": 146001365, "step": 6762, "time_per_iteration": 4.584100723266602 }, { "auxiliary_loss_clip": 0.01339189, "auxiliary_loss_mlp": 0.00998141, "balance_loss_clip": 1.00686836, "balance_loss_mlp": 0.99707395, "epoch": 0.8132026693921722, "flos": 67622918175360.0, "grad_norm": 0.7220809470047134, "language_loss": 0.53794348, "learning_rate": 3.548496284188149e-07, "loss": 0.56131673, "num_input_tokens_seen": 146061570, "step": 6763, "time_per_iteration": 3.3679988384246826 }, { "auxiliary_loss_clip": 0.01357641, "auxiliary_loss_mlp": 0.01019865, "balance_loss_clip": 1.04290926, "balance_loss_mlp": 1.01352894, "epoch": 0.8133229122828113, "flos": 19495149045120.0, "grad_norm": 2.2158868537263596, "language_loss": 0.79254532, "learning_rate": 3.544067860270681e-07, "loss": 0.81632042, "num_input_tokens_seen": 146079145, "step": 6764, "time_per_iteration": 2.740636110305786 }, { "auxiliary_loss_clip": 0.01327374, "auxiliary_loss_mlp": 0.01026795, "balance_loss_clip": 1.04626763, "balance_loss_mlp": 1.01936221, "epoch": 0.8134431551734503, "flos": 20668135582080.0, "grad_norm": 1.812218395000123, "language_loss": 0.71080744, "learning_rate": 3.539641932705029e-07, "loss": 0.73434913, "num_input_tokens_seen": 146097625, "step": 6765, "time_per_iteration": 3.635033369064331 }, { "auxiliary_loss_clip": 0.01177891, "auxiliary_loss_mlp": 0.01027066, "balance_loss_clip": 1.05015099, "balance_loss_mlp": 1.01927018, "epoch": 0.8135633980640895, "flos": 21507332008320.0, "grad_norm": 2.1131955589591396, "language_loss": 0.77307045, "learning_rate": 3.53521850216262e-07, "loss": 0.79512, "num_input_tokens_seen": 146117195, "step": 6766, "time_per_iteration": 2.637220621109009 }, { "auxiliary_loss_clip": 0.01172677, "auxiliary_loss_mlp": 0.0102872, "balance_loss_clip": 1.04847062, "balance_loss_mlp": 1.02167511, "epoch": 0.8136836409547286, "flos": 20554442058240.0, "grad_norm": 2.486708687033299, "language_loss": 0.77097189, "learning_rate": 3.530797569314461e-07, "loss": 0.79298592, "num_input_tokens_seen": 146136220, "step": 6767, "time_per_iteration": 2.5604825019836426 }, { "auxiliary_loss_clip": 0.01176135, "auxiliary_loss_mlp": 0.01028084, "balance_loss_clip": 1.05061698, "balance_loss_mlp": 1.0201447, "epoch": 0.8138038838453676, "flos": 20299045380480.0, "grad_norm": 1.9298000522681296, "language_loss": 0.77664232, "learning_rate": 3.5263791348312235e-07, "loss": 0.79868454, "num_input_tokens_seen": 146155415, "step": 6768, "time_per_iteration": 3.047980785369873 }, { "auxiliary_loss_clip": 0.01272442, "auxiliary_loss_mlp": 0.01025625, "balance_loss_clip": 1.04658723, "balance_loss_mlp": 1.01870179, "epoch": 0.8139241267360068, "flos": 29789840551680.0, "grad_norm": 2.0407594339944835, "language_loss": 0.709113, "learning_rate": 3.521963199383171e-07, "loss": 0.73209363, "num_input_tokens_seen": 146178370, "step": 6769, "time_per_iteration": 2.71441650390625 }, { "auxiliary_loss_clip": 0.01376707, "auxiliary_loss_mlp": 0.01030388, "balance_loss_clip": 1.04448032, "balance_loss_mlp": 1.02231705, "epoch": 0.8140443696266458, "flos": 19713270384000.0, "grad_norm": 2.8768263922829584, "language_loss": 0.77067089, "learning_rate": 3.517549763640197e-07, "loss": 0.79474175, "num_input_tokens_seen": 146196010, "step": 6770, "time_per_iteration": 3.617910385131836 }, { "auxiliary_loss_clip": 0.01220125, "auxiliary_loss_mlp": 0.02563986, "balance_loss_clip": 1.04991007, "balance_loss_mlp": 0.99990332, "epoch": 0.8141646125172849, "flos": 27160568910720.0, "grad_norm": 1.8053192726570528, "language_loss": 0.71464515, "learning_rate": 3.513138828271829e-07, "loss": 0.75248629, "num_input_tokens_seen": 146215880, "step": 6771, "time_per_iteration": 2.6727659702301025 }, { "auxiliary_loss_clip": 0.01324474, "auxiliary_loss_mlp": 0.01025463, "balance_loss_clip": 1.04746485, "balance_loss_mlp": 1.01878715, "epoch": 0.8142848554079241, "flos": 39673102700160.0, "grad_norm": 1.837374695719706, "language_loss": 0.70008832, "learning_rate": 3.508730393947179e-07, "loss": 0.72358775, "num_input_tokens_seen": 146239135, "step": 6772, "time_per_iteration": 2.890540838241577 }, { "auxiliary_loss_clip": 0.01316292, "auxiliary_loss_mlp": 0.0102359, "balance_loss_clip": 1.04476047, "balance_loss_mlp": 1.01617217, "epoch": 0.8144050982985631, "flos": 22237288197120.0, "grad_norm": 2.7333969594044487, "language_loss": 0.72511154, "learning_rate": 3.504324461335024e-07, "loss": 0.74851036, "num_input_tokens_seen": 146259245, "step": 6773, "time_per_iteration": 2.730473756790161 }, { "auxiliary_loss_clip": 0.0137639, "auxiliary_loss_mlp": 0.01032456, "balance_loss_clip": 1.04496098, "balance_loss_mlp": 1.02487707, "epoch": 0.8145253411892022, "flos": 23038239617280.0, "grad_norm": 3.2124667549613712, "language_loss": 0.88146311, "learning_rate": 3.499921031103732e-07, "loss": 0.90555155, "num_input_tokens_seen": 146280015, "step": 6774, "time_per_iteration": 2.7828454971313477 }, { "auxiliary_loss_clip": 0.01233169, "auxiliary_loss_mlp": 0.0102625, "balance_loss_clip": 1.04377079, "balance_loss_mlp": 1.01950288, "epoch": 0.8146455840798413, "flos": 24827668387200.0, "grad_norm": 1.7568928587914643, "language_loss": 0.78569055, "learning_rate": 3.4955201039212987e-07, "loss": 0.80828476, "num_input_tokens_seen": 146300935, "step": 6775, "time_per_iteration": 2.770789861679077 }, { "auxiliary_loss_clip": 0.01129257, "auxiliary_loss_mlp": 0.01023544, "balance_loss_clip": 1.04987967, "balance_loss_mlp": 1.01675749, "epoch": 0.8147658269704804, "flos": 19974520978560.0, "grad_norm": 10.62729516117895, "language_loss": 0.65713966, "learning_rate": 3.4911216804553465e-07, "loss": 0.67866766, "num_input_tokens_seen": 146319835, "step": 6776, "time_per_iteration": 2.596358299255371 }, { "auxiliary_loss_clip": 0.01273434, "auxiliary_loss_mlp": 0.01028116, "balance_loss_clip": 1.04493022, "balance_loss_mlp": 1.0205698, "epoch": 0.8148860698611194, "flos": 21178031097600.0, "grad_norm": 2.205118681827262, "language_loss": 0.70167303, "learning_rate": 3.4867257613731017e-07, "loss": 0.72468853, "num_input_tokens_seen": 146339030, "step": 6777, "time_per_iteration": 2.6332342624664307 }, { "auxiliary_loss_clip": 0.01275443, "auxiliary_loss_mlp": 0.01025489, "balance_loss_clip": 1.04757369, "balance_loss_mlp": 1.01841116, "epoch": 0.8150063127517585, "flos": 19606903234560.0, "grad_norm": 1.9236986557132842, "language_loss": 0.85620487, "learning_rate": 3.4823323473414343e-07, "loss": 0.87921423, "num_input_tokens_seen": 146358550, "step": 6778, "time_per_iteration": 2.6682279109954834 }, { "auxiliary_loss_clip": 0.01228616, "auxiliary_loss_mlp": 0.01022119, "balance_loss_clip": 1.04500151, "balance_loss_mlp": 1.01377106, "epoch": 0.8151265556423977, "flos": 22638374438400.0, "grad_norm": 1.9159517263033041, "language_loss": 0.75863838, "learning_rate": 3.477941439026812e-07, "loss": 0.78114569, "num_input_tokens_seen": 146376770, "step": 6779, "time_per_iteration": 2.7625417709350586 }, { "auxiliary_loss_clip": 0.01174154, "auxiliary_loss_mlp": 0.01026441, "balance_loss_clip": 1.04895735, "balance_loss_mlp": 1.02025151, "epoch": 0.8152467985330367, "flos": 17968048277760.0, "grad_norm": 2.6744062831103066, "language_loss": 0.73271513, "learning_rate": 3.473553037095349e-07, "loss": 0.75472105, "num_input_tokens_seen": 146395795, "step": 6780, "time_per_iteration": 2.6668953895568848 }, { "auxiliary_loss_clip": 0.01271108, "auxiliary_loss_mlp": 0.01026805, "balance_loss_clip": 1.04453123, "balance_loss_mlp": 1.02053463, "epoch": 0.8153670414236758, "flos": 24969012405120.0, "grad_norm": 1.682322580232965, "language_loss": 0.83466876, "learning_rate": 3.469167142212743e-07, "loss": 0.8576479, "num_input_tokens_seen": 146417640, "step": 6781, "time_per_iteration": 2.6714727878570557 }, { "auxiliary_loss_clip": 0.01225497, "auxiliary_loss_mlp": 0.0102851, "balance_loss_clip": 1.04896235, "balance_loss_mlp": 1.02073145, "epoch": 0.8154872843143149, "flos": 31066069754880.0, "grad_norm": 16.30964690890461, "language_loss": 0.63178509, "learning_rate": 3.4647837550443337e-07, "loss": 0.65432513, "num_input_tokens_seen": 146436205, "step": 6782, "time_per_iteration": 2.7289083003997803 }, { "auxiliary_loss_clip": 0.01322387, "auxiliary_loss_mlp": 0.01021485, "balance_loss_clip": 1.04607844, "balance_loss_mlp": 1.01466334, "epoch": 0.815607527204954, "flos": 19391654983680.0, "grad_norm": 2.1156903241986766, "language_loss": 0.74699688, "learning_rate": 3.460402876255086e-07, "loss": 0.77043557, "num_input_tokens_seen": 146453595, "step": 6783, "time_per_iteration": 2.650433301925659 }, { "auxiliary_loss_clip": 0.01225776, "auxiliary_loss_mlp": 0.01027111, "balance_loss_clip": 1.04773831, "balance_loss_mlp": 1.01989841, "epoch": 0.815727770095593, "flos": 26140418743680.0, "grad_norm": 2.279355468814845, "language_loss": 0.72410667, "learning_rate": 3.456024506509574e-07, "loss": 0.74663556, "num_input_tokens_seen": 146474515, "step": 6784, "time_per_iteration": 2.6543045043945312 }, { "auxiliary_loss_clip": 0.01224836, "auxiliary_loss_mlp": 0.02565564, "balance_loss_clip": 1.05012035, "balance_loss_mlp": 0.99992394, "epoch": 0.8158480129862322, "flos": 25337527989120.0, "grad_norm": 2.5142419479259144, "language_loss": 0.74116451, "learning_rate": 3.4516486464719873e-07, "loss": 0.77906847, "num_input_tokens_seen": 146493905, "step": 6785, "time_per_iteration": 2.654092788696289 }, { "auxiliary_loss_clip": 0.01367786, "auxiliary_loss_mlp": 0.01027355, "balance_loss_clip": 1.04124808, "balance_loss_mlp": 1.02034545, "epoch": 0.8159682558768713, "flos": 34423645559040.0, "grad_norm": 1.606137512856906, "language_loss": 0.62451601, "learning_rate": 3.4472752968061445e-07, "loss": 0.64846742, "num_input_tokens_seen": 146518335, "step": 6786, "time_per_iteration": 2.856773614883423 }, { "auxiliary_loss_clip": 0.01223805, "auxiliary_loss_mlp": 0.01023869, "balance_loss_clip": 1.04695678, "balance_loss_mlp": 1.01684153, "epoch": 0.8160884987675103, "flos": 18653223185280.0, "grad_norm": 1.9431368776640814, "language_loss": 0.74163115, "learning_rate": 3.442904458175475e-07, "loss": 0.76410794, "num_input_tokens_seen": 146535655, "step": 6787, "time_per_iteration": 2.5840935707092285 }, { "auxiliary_loss_clip": 0.0122376, "auxiliary_loss_mlp": 0.01026586, "balance_loss_clip": 1.04739952, "balance_loss_mlp": 1.01911712, "epoch": 0.8162087416581495, "flos": 31430527102080.0, "grad_norm": 20.961183858067592, "language_loss": 0.76250541, "learning_rate": 3.438536131243044e-07, "loss": 0.78500891, "num_input_tokens_seen": 146556815, "step": 6788, "time_per_iteration": 4.518171072006226 }, { "auxiliary_loss_clip": 0.0127263, "auxiliary_loss_mlp": 0.01026374, "balance_loss_clip": 1.04675889, "balance_loss_mlp": 1.01836872, "epoch": 0.8163289845487885, "flos": 37593910915200.0, "grad_norm": 2.19901717228257, "language_loss": 0.61776161, "learning_rate": 3.434170316671503e-07, "loss": 0.6407516, "num_input_tokens_seen": 146581845, "step": 6789, "time_per_iteration": 2.8035991191864014 }, { "auxiliary_loss_clip": 0.01317582, "auxiliary_loss_mlp": 0.01026682, "balance_loss_clip": 1.04978657, "balance_loss_mlp": 1.01963401, "epoch": 0.8164492274394276, "flos": 13953989554560.0, "grad_norm": 3.4084233023498456, "language_loss": 0.89938104, "learning_rate": 3.4298070151231583e-07, "loss": 0.92282373, "num_input_tokens_seen": 146597245, "step": 6790, "time_per_iteration": 2.6464617252349854 }, { "auxiliary_loss_clip": 0.01277738, "auxiliary_loss_mlp": 0.01028409, "balance_loss_clip": 1.04813957, "balance_loss_mlp": 1.02120626, "epoch": 0.8165694703300668, "flos": 28986554747520.0, "grad_norm": 2.009506368508766, "language_loss": 0.60220379, "learning_rate": 3.425446227259916e-07, "loss": 0.62526524, "num_input_tokens_seen": 146618210, "step": 6791, "time_per_iteration": 3.6455626487731934 }, { "auxiliary_loss_clip": 0.01274824, "auxiliary_loss_mlp": 0.01027022, "balance_loss_clip": 1.04717767, "balance_loss_mlp": 1.0200038, "epoch": 0.8166897132207058, "flos": 25118365155840.0, "grad_norm": 2.4247251860351695, "language_loss": 0.82351649, "learning_rate": 3.421087953743296e-07, "loss": 0.84653497, "num_input_tokens_seen": 146637975, "step": 6792, "time_per_iteration": 2.6589529514312744 }, { "auxiliary_loss_clip": 0.01226161, "auxiliary_loss_mlp": 0.01023945, "balance_loss_clip": 1.04737294, "balance_loss_mlp": 1.01658046, "epoch": 0.8168099561113449, "flos": 23148593176320.0, "grad_norm": 6.560661246336411, "language_loss": 0.80141181, "learning_rate": 3.416732195234464e-07, "loss": 0.82391286, "num_input_tokens_seen": 146658030, "step": 6793, "time_per_iteration": 2.6033036708831787 }, { "auxiliary_loss_clip": 0.01225632, "auxiliary_loss_mlp": 0.01026054, "balance_loss_clip": 1.04827166, "balance_loss_mlp": 1.01916957, "epoch": 0.816930199001984, "flos": 18407666833920.0, "grad_norm": 1.5221281931778758, "language_loss": 0.79397523, "learning_rate": 3.4123789523941613e-07, "loss": 0.81649208, "num_input_tokens_seen": 146677855, "step": 6794, "time_per_iteration": 2.6432907581329346 }, { "auxiliary_loss_clip": 0.01222049, "auxiliary_loss_mlp": 0.0102605, "balance_loss_clip": 1.04554021, "balance_loss_mlp": 1.01898098, "epoch": 0.8170504418926231, "flos": 21251324799360.0, "grad_norm": 1.74745415733074, "language_loss": 0.63974035, "learning_rate": 3.4080282258827884e-07, "loss": 0.66222131, "num_input_tokens_seen": 146696230, "step": 6795, "time_per_iteration": 2.6281282901763916 }, { "auxiliary_loss_clip": 0.01227275, "auxiliary_loss_mlp": 0.0102519, "balance_loss_clip": 1.04847121, "balance_loss_mlp": 1.01852643, "epoch": 0.8171706847832622, "flos": 19099234362240.0, "grad_norm": 2.2706528478772263, "language_loss": 0.72810698, "learning_rate": 3.403680016360342e-07, "loss": 0.75063163, "num_input_tokens_seen": 146714835, "step": 6796, "time_per_iteration": 3.5340683460235596 }, { "auxiliary_loss_clip": 0.0122388, "auxiliary_loss_mlp": 0.01026657, "balance_loss_clip": 1.04910564, "balance_loss_mlp": 1.01962924, "epoch": 0.8172909276739013, "flos": 21470128496640.0, "grad_norm": 1.5396615459760938, "language_loss": 0.67535949, "learning_rate": 3.3993343244864403e-07, "loss": 0.69786489, "num_input_tokens_seen": 146734425, "step": 6797, "time_per_iteration": 2.6217939853668213 }, { "auxiliary_loss_clip": 0.01221229, "auxiliary_loss_mlp": 0.0102453, "balance_loss_clip": 1.04750466, "balance_loss_mlp": 1.01778913, "epoch": 0.8174111705645404, "flos": 27599792417280.0, "grad_norm": 1.6272879764520989, "language_loss": 0.72938049, "learning_rate": 3.394991150920323e-07, "loss": 0.75183809, "num_input_tokens_seen": 146757545, "step": 6798, "time_per_iteration": 2.655233860015869 }, { "auxiliary_loss_clip": 0.01375698, "auxiliary_loss_mlp": 0.02572054, "balance_loss_clip": 1.04243433, "balance_loss_mlp": 0.99993306, "epoch": 0.8175314134551794, "flos": 14064594508800.0, "grad_norm": 4.446636363375664, "language_loss": 0.74556893, "learning_rate": 3.3906504963208396e-07, "loss": 0.78504646, "num_input_tokens_seen": 146774240, "step": 6799, "time_per_iteration": 2.671595335006714 }, { "auxiliary_loss_clip": 0.01363273, "auxiliary_loss_mlp": 0.01026792, "balance_loss_clip": 1.04581821, "balance_loss_mlp": 1.01900733, "epoch": 0.8176516563458186, "flos": 22708076780160.0, "grad_norm": 1.870835195413321, "language_loss": 0.66908175, "learning_rate": 3.3863123613464774e-07, "loss": 0.69298238, "num_input_tokens_seen": 146793140, "step": 6800, "time_per_iteration": 2.741248846054077 }, { "auxiliary_loss_clip": 0.01274806, "auxiliary_loss_mlp": 0.01025317, "balance_loss_clip": 1.04195452, "balance_loss_mlp": 1.01830184, "epoch": 0.8177718992364577, "flos": 21945406279680.0, "grad_norm": 1.8666250342444926, "language_loss": 0.75072581, "learning_rate": 3.381976746655317e-07, "loss": 0.77372706, "num_input_tokens_seen": 146812895, "step": 6801, "time_per_iteration": 2.676837205886841 }, { "auxiliary_loss_clip": 0.01361235, "auxiliary_loss_mlp": 0.01023074, "balance_loss_clip": 1.04555225, "balance_loss_mlp": 1.01586509, "epoch": 0.8178921421270967, "flos": 22017443005440.0, "grad_norm": 2.035059172315375, "language_loss": 0.67749858, "learning_rate": 3.3776436529050756e-07, "loss": 0.70134169, "num_input_tokens_seen": 146832445, "step": 6802, "time_per_iteration": 2.781282663345337 }, { "auxiliary_loss_clip": 0.01174296, "auxiliary_loss_mlp": 0.01026177, "balance_loss_clip": 1.04983675, "balance_loss_mlp": 1.01943326, "epoch": 0.8180123850177359, "flos": 33183111496320.0, "grad_norm": 1.7482533883068616, "language_loss": 0.72428042, "learning_rate": 3.373313080753073e-07, "loss": 0.7462852, "num_input_tokens_seen": 146856505, "step": 6803, "time_per_iteration": 2.723553419113159 }, { "auxiliary_loss_clip": 0.0122077, "auxiliary_loss_mlp": 0.01024984, "balance_loss_clip": 1.04530334, "balance_loss_mlp": 1.01808178, "epoch": 0.8181326279083749, "flos": 22091167670400.0, "grad_norm": 1.6908780355046258, "language_loss": 0.77552247, "learning_rate": 3.3689850308562527e-07, "loss": 0.79798007, "num_input_tokens_seen": 146876950, "step": 6804, "time_per_iteration": 2.6377220153808594 }, { "auxiliary_loss_clip": 0.01372886, "auxiliary_loss_mlp": 0.01029015, "balance_loss_clip": 1.04908872, "balance_loss_mlp": 1.02192545, "epoch": 0.818252870799014, "flos": 15705747936000.0, "grad_norm": 3.2593315133909226, "language_loss": 0.77699137, "learning_rate": 3.364659503871183e-07, "loss": 0.80101037, "num_input_tokens_seen": 146894885, "step": 6805, "time_per_iteration": 2.7260968685150146 }, { "auxiliary_loss_clip": 0.01319414, "auxiliary_loss_mlp": 0.01023965, "balance_loss_clip": 1.04208398, "balance_loss_mlp": 1.01751888, "epoch": 0.8183731136896532, "flos": 18770687637120.0, "grad_norm": 2.2754121420906173, "language_loss": 0.84084195, "learning_rate": 3.3603365004540417e-07, "loss": 0.86427575, "num_input_tokens_seen": 146913180, "step": 6806, "time_per_iteration": 2.6681227684020996 }, { "auxiliary_loss_clip": 0.01174257, "auxiliary_loss_mlp": 0.01024883, "balance_loss_clip": 1.05035496, "balance_loss_mlp": 1.01839471, "epoch": 0.8184933565802922, "flos": 26541792293760.0, "grad_norm": 2.0197955488118593, "language_loss": 0.77595568, "learning_rate": 3.356016021260624e-07, "loss": 0.79794711, "num_input_tokens_seen": 146933510, "step": 6807, "time_per_iteration": 2.6272759437561035 }, { "auxiliary_loss_clip": 0.01223714, "auxiliary_loss_mlp": 0.01029072, "balance_loss_clip": 1.05079091, "balance_loss_mlp": 1.02203894, "epoch": 0.8186135994709313, "flos": 17530117660800.0, "grad_norm": 2.4758110003360554, "language_loss": 0.65707105, "learning_rate": 3.35169806694634e-07, "loss": 0.67959887, "num_input_tokens_seen": 146951760, "step": 6808, "time_per_iteration": 2.7163734436035156 }, { "auxiliary_loss_clip": 0.01220108, "auxiliary_loss_mlp": 0.01003133, "balance_loss_clip": 1.00771642, "balance_loss_mlp": 1.00220287, "epoch": 0.8187338423615703, "flos": 63480300675840.0, "grad_norm": 0.7153392003753575, "language_loss": 0.60622025, "learning_rate": 3.3473826381662186e-07, "loss": 0.62845266, "num_input_tokens_seen": 147022900, "step": 6809, "time_per_iteration": 3.3499295711517334 }, { "auxiliary_loss_clip": 0.01218652, "auxiliary_loss_mlp": 0.01020013, "balance_loss_clip": 1.04824758, "balance_loss_mlp": 1.01328683, "epoch": 0.8188540852522095, "flos": 17529974006400.0, "grad_norm": 2.665375156264429, "language_loss": 0.81808937, "learning_rate": 3.3430697355749216e-07, "loss": 0.84047604, "num_input_tokens_seen": 147040590, "step": 6810, "time_per_iteration": 2.615480661392212 }, { "auxiliary_loss_clip": 0.01366513, "auxiliary_loss_mlp": 0.01024313, "balance_loss_clip": 1.04131091, "balance_loss_mlp": 1.01725626, "epoch": 0.8189743281428485, "flos": 14392530702720.0, "grad_norm": 2.3433862222857904, "language_loss": 0.75363958, "learning_rate": 3.3387593598266907e-07, "loss": 0.77754784, "num_input_tokens_seen": 147057200, "step": 6811, "time_per_iteration": 2.6746647357940674 }, { "auxiliary_loss_clip": 0.01320129, "auxiliary_loss_mlp": 0.01027959, "balance_loss_clip": 1.04301047, "balance_loss_mlp": 1.02085066, "epoch": 0.8190945710334876, "flos": 25080479285760.0, "grad_norm": 2.4699324224975157, "language_loss": 0.78427529, "learning_rate": 3.3344515115754225e-07, "loss": 0.80775619, "num_input_tokens_seen": 147076180, "step": 6812, "time_per_iteration": 2.7727279663085938 }, { "auxiliary_loss_clip": 0.01223051, "auxiliary_loss_mlp": 0.01027101, "balance_loss_clip": 1.04339862, "balance_loss_mlp": 1.02033234, "epoch": 0.8192148139241268, "flos": 21507152440320.0, "grad_norm": 2.293578688059005, "language_loss": 0.79727095, "learning_rate": 3.33014619147461e-07, "loss": 0.81977242, "num_input_tokens_seen": 147094205, "step": 6813, "time_per_iteration": 2.791146755218506 }, { "auxiliary_loss_clip": 0.01273024, "auxiliary_loss_mlp": 0.01027225, "balance_loss_clip": 1.04976964, "balance_loss_mlp": 1.01986122, "epoch": 0.8193350568147658, "flos": 23952166289280.0, "grad_norm": 9.443954181959024, "language_loss": 0.71739572, "learning_rate": 3.325843400177362e-07, "loss": 0.74039817, "num_input_tokens_seen": 147115545, "step": 6814, "time_per_iteration": 3.6216471195220947 }, { "auxiliary_loss_clip": 0.0122708, "auxiliary_loss_mlp": 0.0256735, "balance_loss_clip": 1.04750752, "balance_loss_mlp": 0.99994195, "epoch": 0.8194552997054049, "flos": 20559469962240.0, "grad_norm": 2.6601201763279203, "language_loss": 0.73972458, "learning_rate": 3.32154313833642e-07, "loss": 0.77766889, "num_input_tokens_seen": 147135700, "step": 6815, "time_per_iteration": 3.622901678085327 }, { "auxiliary_loss_clip": 0.01176362, "auxiliary_loss_mlp": 0.01026413, "balance_loss_clip": 1.049752, "balance_loss_mlp": 1.01912355, "epoch": 0.819575542596044, "flos": 26031753123840.0, "grad_norm": 2.2421491513023977, "language_loss": 0.59749335, "learning_rate": 3.3172454066041164e-07, "loss": 0.61952108, "num_input_tokens_seen": 147155205, "step": 6816, "time_per_iteration": 2.5803370475769043 }, { "auxiliary_loss_clip": 0.01425771, "auxiliary_loss_mlp": 0.02561587, "balance_loss_clip": 1.0483706, "balance_loss_mlp": 0.99994195, "epoch": 0.8196957854866831, "flos": 29096944220160.0, "grad_norm": 1.801701723011235, "language_loss": 0.76230073, "learning_rate": 3.3129502056324234e-07, "loss": 0.80217433, "num_input_tokens_seen": 147176570, "step": 6817, "time_per_iteration": 3.7387492656707764 }, { "auxiliary_loss_clip": 0.01385019, "auxiliary_loss_mlp": 0.01000379, "balance_loss_clip": 1.01020372, "balance_loss_mlp": 0.99937803, "epoch": 0.8198160283773221, "flos": 69033631898880.0, "grad_norm": 0.8152012679793276, "language_loss": 0.59710503, "learning_rate": 3.3086575360729165e-07, "loss": 0.62095904, "num_input_tokens_seen": 147234105, "step": 6818, "time_per_iteration": 3.359842300415039 }, { "auxiliary_loss_clip": 0.01272739, "auxiliary_loss_mlp": 0.01028031, "balance_loss_clip": 1.04679203, "balance_loss_mlp": 1.02122986, "epoch": 0.8199362712679613, "flos": 16618058496000.0, "grad_norm": 1.963748550422414, "language_loss": 0.71592933, "learning_rate": 3.3043673985767906e-07, "loss": 0.73893702, "num_input_tokens_seen": 147253170, "step": 6819, "time_per_iteration": 3.238403797149658 }, { "auxiliary_loss_clip": 0.01314234, "auxiliary_loss_mlp": 0.01024311, "balance_loss_clip": 1.03999782, "balance_loss_mlp": 1.01759923, "epoch": 0.8200565141586004, "flos": 21757664868480.0, "grad_norm": 1.9878521255193269, "language_loss": 0.77782738, "learning_rate": 3.3000797937948564e-07, "loss": 0.80121279, "num_input_tokens_seen": 147271465, "step": 6820, "time_per_iteration": 2.8147151470184326 }, { "auxiliary_loss_clip": 0.01120008, "auxiliary_loss_mlp": 0.01001261, "balance_loss_clip": 1.00787652, "balance_loss_mlp": 1.00036705, "epoch": 0.8201767570492394, "flos": 69807112392960.0, "grad_norm": 0.9419398410811813, "language_loss": 0.65048718, "learning_rate": 3.295794722377534e-07, "loss": 0.67169988, "num_input_tokens_seen": 147335070, "step": 6821, "time_per_iteration": 3.290086507797241 }, { "auxiliary_loss_clip": 0.01171578, "auxiliary_loss_mlp": 0.01026096, "balance_loss_clip": 1.04950261, "balance_loss_mlp": 1.01959324, "epoch": 0.8202969999398786, "flos": 23111892455040.0, "grad_norm": 2.265964099936706, "language_loss": 0.80027509, "learning_rate": 3.291512184974876e-07, "loss": 0.8222518, "num_input_tokens_seen": 147355460, "step": 6822, "time_per_iteration": 3.4942917823791504 }, { "auxiliary_loss_clip": 0.01274027, "auxiliary_loss_mlp": 0.01027818, "balance_loss_clip": 1.04331994, "balance_loss_mlp": 1.02041209, "epoch": 0.8204172428305176, "flos": 28220616109440.0, "grad_norm": 2.097961677723926, "language_loss": 0.66896379, "learning_rate": 3.2872321822365346e-07, "loss": 0.69198227, "num_input_tokens_seen": 147375675, "step": 6823, "time_per_iteration": 2.769864559173584 }, { "auxiliary_loss_clip": 0.01223771, "auxiliary_loss_mlp": 0.01020583, "balance_loss_clip": 1.05002213, "balance_loss_mlp": 1.01387787, "epoch": 0.8205374857211567, "flos": 20887011106560.0, "grad_norm": 1.927936502782618, "language_loss": 0.73409432, "learning_rate": 3.282954714811783e-07, "loss": 0.75653791, "num_input_tokens_seen": 147394580, "step": 6824, "time_per_iteration": 2.620063304901123 }, { "auxiliary_loss_clip": 0.01266431, "auxiliary_loss_mlp": 0.01025047, "balance_loss_clip": 1.04190135, "balance_loss_mlp": 1.01819801, "epoch": 0.8206577286117959, "flos": 13152140294400.0, "grad_norm": 2.469487354915222, "language_loss": 0.71117258, "learning_rate": 3.2786797833495093e-07, "loss": 0.73408735, "num_input_tokens_seen": 147409935, "step": 6825, "time_per_iteration": 2.6245388984680176 }, { "auxiliary_loss_clip": 0.01172411, "auxiliary_loss_mlp": 0.01028825, "balance_loss_clip": 1.04804993, "balance_loss_mlp": 1.02211046, "epoch": 0.8207779715024349, "flos": 25265634917760.0, "grad_norm": 2.6451725821433567, "language_loss": 0.72601295, "learning_rate": 3.274407388498213e-07, "loss": 0.7480253, "num_input_tokens_seen": 147428065, "step": 6826, "time_per_iteration": 2.60349702835083 }, { "auxiliary_loss_clip": 0.01320064, "auxiliary_loss_mlp": 0.01020841, "balance_loss_clip": 1.04345679, "balance_loss_mlp": 1.01403975, "epoch": 0.820898214393074, "flos": 19610243199360.0, "grad_norm": 1.9305486509705465, "language_loss": 0.74278504, "learning_rate": 3.270137530906021e-07, "loss": 0.76619411, "num_input_tokens_seen": 147447300, "step": 6827, "time_per_iteration": 2.693319797515869 }, { "auxiliary_loss_clip": 0.01367033, "auxiliary_loss_mlp": 0.01022458, "balance_loss_clip": 1.04760289, "balance_loss_mlp": 1.01598513, "epoch": 0.8210184572837131, "flos": 15596615439360.0, "grad_norm": 1.73747645972751, "language_loss": 0.83749592, "learning_rate": 3.265870211220665e-07, "loss": 0.86139083, "num_input_tokens_seen": 147465135, "step": 6828, "time_per_iteration": 2.7035157680511475 }, { "auxiliary_loss_clip": 0.01324889, "auxiliary_loss_mlp": 0.01027296, "balance_loss_clip": 1.04627705, "balance_loss_mlp": 1.01985121, "epoch": 0.8211387001743522, "flos": 20813932886400.0, "grad_norm": 2.011892085174537, "language_loss": 0.82086265, "learning_rate": 3.2616054300894934e-07, "loss": 0.84438443, "num_input_tokens_seen": 147484585, "step": 6829, "time_per_iteration": 2.848741292953491 }, { "auxiliary_loss_clip": 0.01272316, "auxiliary_loss_mlp": 0.01026747, "balance_loss_clip": 1.04804623, "balance_loss_mlp": 1.02016711, "epoch": 0.8212589430649913, "flos": 27704579368320.0, "grad_norm": 2.1383824555635105, "language_loss": 0.84155977, "learning_rate": 3.2573431881594693e-07, "loss": 0.86455041, "num_input_tokens_seen": 147504130, "step": 6830, "time_per_iteration": 2.6674158573150635 }, { "auxiliary_loss_clip": 0.0141777, "auxiliary_loss_mlp": 0.01024098, "balance_loss_clip": 1.03908801, "balance_loss_mlp": 1.01694512, "epoch": 0.8213791859556304, "flos": 22455625017600.0, "grad_norm": 2.1631205754067, "language_loss": 0.66306305, "learning_rate": 3.2530834860771663e-07, "loss": 0.6874817, "num_input_tokens_seen": 147523510, "step": 6831, "time_per_iteration": 2.786813974380493 }, { "auxiliary_loss_clip": 0.01225471, "auxiliary_loss_mlp": 0.0102804, "balance_loss_clip": 1.04687643, "balance_loss_mlp": 1.02097416, "epoch": 0.8214994288462695, "flos": 16654471908480.0, "grad_norm": 5.221646466420648, "language_loss": 0.74223304, "learning_rate": 3.248826324488794e-07, "loss": 0.76476812, "num_input_tokens_seen": 147540805, "step": 6832, "time_per_iteration": 2.6033363342285156 }, { "auxiliary_loss_clip": 0.01174922, "auxiliary_loss_mlp": 0.01024772, "balance_loss_clip": 1.05252028, "balance_loss_mlp": 1.01777744, "epoch": 0.8216196717369085, "flos": 25221787390080.0, "grad_norm": 1.9299328126259223, "language_loss": 0.88158536, "learning_rate": 3.244571704040138e-07, "loss": 0.90358233, "num_input_tokens_seen": 147560965, "step": 6833, "time_per_iteration": 2.5997700691223145 }, { "auxiliary_loss_clip": 0.01220468, "auxiliary_loss_mlp": 0.01027397, "balance_loss_clip": 1.04419231, "balance_loss_mlp": 1.01999998, "epoch": 0.8217399146275477, "flos": 25371930240000.0, "grad_norm": 1.9563481517546748, "language_loss": 0.73705041, "learning_rate": 3.2403196253766374e-07, "loss": 0.75952911, "num_input_tokens_seen": 147580045, "step": 6834, "time_per_iteration": 2.6733803749084473 }, { "auxiliary_loss_clip": 0.01226038, "auxiliary_loss_mlp": 0.01027093, "balance_loss_clip": 1.04856908, "balance_loss_mlp": 1.01926112, "epoch": 0.8218601575181868, "flos": 25629625388160.0, "grad_norm": 2.263090122009148, "language_loss": 0.790797, "learning_rate": 3.2360700891433254e-07, "loss": 0.81332833, "num_input_tokens_seen": 147599070, "step": 6835, "time_per_iteration": 2.6097846031188965 }, { "auxiliary_loss_clip": 0.01273342, "auxiliary_loss_mlp": 0.00998796, "balance_loss_clip": 1.0101676, "balance_loss_mlp": 0.99788439, "epoch": 0.8219804004088258, "flos": 67660229427840.0, "grad_norm": 0.7949293193217626, "language_loss": 0.57270622, "learning_rate": 3.231823095984847e-07, "loss": 0.59542751, "num_input_tokens_seen": 147653710, "step": 6836, "time_per_iteration": 3.218120813369751 }, { "auxiliary_loss_clip": 0.01271136, "auxiliary_loss_mlp": 0.01019548, "balance_loss_clip": 1.04736269, "balance_loss_mlp": 1.01284218, "epoch": 0.822100643299465, "flos": 19464266327040.0, "grad_norm": 2.284721406289287, "language_loss": 0.75987053, "learning_rate": 3.2275786465454814e-07, "loss": 0.78277731, "num_input_tokens_seen": 147670360, "step": 6837, "time_per_iteration": 2.605473041534424 }, { "auxiliary_loss_clip": 0.01322909, "auxiliary_loss_mlp": 0.01022756, "balance_loss_clip": 1.04372644, "balance_loss_mlp": 1.0158596, "epoch": 0.822220886190104, "flos": 24681368292480.0, "grad_norm": 5.567301041693867, "language_loss": 0.75799859, "learning_rate": 3.2233367414690917e-07, "loss": 0.78145528, "num_input_tokens_seen": 147692550, "step": 6838, "time_per_iteration": 2.766899824142456 }, { "auxiliary_loss_clip": 0.01318223, "auxiliary_loss_mlp": 0.01028142, "balance_loss_clip": 1.04197538, "balance_loss_mlp": 1.02180648, "epoch": 0.8223411290807431, "flos": 27819062991360.0, "grad_norm": 2.198407304856372, "language_loss": 0.85300118, "learning_rate": 3.219097381399183e-07, "loss": 0.87646484, "num_input_tokens_seen": 147709725, "step": 6839, "time_per_iteration": 2.7385849952697754 }, { "auxiliary_loss_clip": 0.01283092, "auxiliary_loss_mlp": 0.01027356, "balance_loss_clip": 1.05001962, "balance_loss_mlp": 1.01977992, "epoch": 0.8224613719713821, "flos": 23218546913280.0, "grad_norm": 1.8160193015676551, "language_loss": 0.81195027, "learning_rate": 3.2148605669788584e-07, "loss": 0.83505476, "num_input_tokens_seen": 147729615, "step": 6840, "time_per_iteration": 4.65327000617981 }, { "auxiliary_loss_clip": 0.01274214, "auxiliary_loss_mlp": 0.01020944, "balance_loss_clip": 1.04883552, "balance_loss_mlp": 1.01434922, "epoch": 0.8225816148620213, "flos": 15706250726400.0, "grad_norm": 2.7425572295645755, "language_loss": 0.77566087, "learning_rate": 3.2106262988508405e-07, "loss": 0.79861248, "num_input_tokens_seen": 147747665, "step": 6841, "time_per_iteration": 2.6476659774780273 }, { "auxiliary_loss_clip": 0.01270844, "auxiliary_loss_mlp": 0.01025068, "balance_loss_clip": 1.04674363, "balance_loss_mlp": 1.0180999, "epoch": 0.8227018577526604, "flos": 18515111391360.0, "grad_norm": 1.7551219264902935, "language_loss": 0.74287027, "learning_rate": 3.206394577657465e-07, "loss": 0.76582944, "num_input_tokens_seen": 147765445, "step": 6842, "time_per_iteration": 2.647963285446167 }, { "auxiliary_loss_clip": 0.01228053, "auxiliary_loss_mlp": 0.01025749, "balance_loss_clip": 1.04963219, "balance_loss_mlp": 1.0179441, "epoch": 0.8228221006432994, "flos": 22236785406720.0, "grad_norm": 2.3375453393173586, "language_loss": 0.72553831, "learning_rate": 3.202165404040675e-07, "loss": 0.74807632, "num_input_tokens_seen": 147783365, "step": 6843, "time_per_iteration": 3.4711837768554688 }, { "auxiliary_loss_clip": 0.01425889, "auxiliary_loss_mlp": 0.01026672, "balance_loss_clip": 1.044873, "balance_loss_mlp": 1.01925421, "epoch": 0.8229423435339386, "flos": 24097532630400.0, "grad_norm": 10.714833472333588, "language_loss": 0.74591839, "learning_rate": 3.1979387786420396e-07, "loss": 0.77044404, "num_input_tokens_seen": 147803605, "step": 6844, "time_per_iteration": 2.813122510910034 }, { "auxiliary_loss_clip": 0.01272895, "auxiliary_loss_mlp": 0.0102664, "balance_loss_clip": 1.04277492, "balance_loss_mlp": 1.01959229, "epoch": 0.8230625864245776, "flos": 23878549365120.0, "grad_norm": 1.9671351431648865, "language_loss": 0.82231522, "learning_rate": 3.1937147021027346e-07, "loss": 0.84531057, "num_input_tokens_seen": 147822060, "step": 6845, "time_per_iteration": 2.700087785720825 }, { "auxiliary_loss_clip": 0.01219281, "auxiliary_loss_mlp": 0.01031073, "balance_loss_clip": 1.04779088, "balance_loss_mlp": 1.02432609, "epoch": 0.8231828293152167, "flos": 16581106379520.0, "grad_norm": 2.313914094379665, "language_loss": 0.76776212, "learning_rate": 3.189493175063547e-07, "loss": 0.79026568, "num_input_tokens_seen": 147839295, "step": 6846, "time_per_iteration": 2.6102447509765625 }, { "auxiliary_loss_clip": 0.01275697, "auxiliary_loss_mlp": 0.0102597, "balance_loss_clip": 1.04941297, "balance_loss_mlp": 1.01877022, "epoch": 0.8233030722058559, "flos": 18880071528960.0, "grad_norm": 2.59520600950338, "language_loss": 0.67486167, "learning_rate": 3.1852741981648776e-07, "loss": 0.69787836, "num_input_tokens_seen": 147857945, "step": 6847, "time_per_iteration": 2.68234920501709 }, { "auxiliary_loss_clip": 0.01320787, "auxiliary_loss_mlp": 0.01034071, "balance_loss_clip": 1.0471549, "balance_loss_mlp": 1.02747858, "epoch": 0.8234233150964949, "flos": 28439024757120.0, "grad_norm": 2.407267921916861, "language_loss": 0.69722742, "learning_rate": 3.1810577720467404e-07, "loss": 0.72077596, "num_input_tokens_seen": 147879675, "step": 6848, "time_per_iteration": 3.6897475719451904 }, { "auxiliary_loss_clip": 0.01276104, "auxiliary_loss_mlp": 0.01027068, "balance_loss_clip": 1.04800296, "balance_loss_mlp": 1.01996338, "epoch": 0.823543557987134, "flos": 33765941577600.0, "grad_norm": 1.5949055119751934, "language_loss": 0.56731194, "learning_rate": 3.176843897348769e-07, "loss": 0.59034365, "num_input_tokens_seen": 147902870, "step": 6849, "time_per_iteration": 2.7959048748016357 }, { "auxiliary_loss_clip": 0.01271123, "auxiliary_loss_mlp": 0.01026204, "balance_loss_clip": 1.04624665, "balance_loss_mlp": 1.01837754, "epoch": 0.8236638008777731, "flos": 17092366611840.0, "grad_norm": 2.9578485315626253, "language_loss": 0.75892758, "learning_rate": 3.1726325747102034e-07, "loss": 0.78190082, "num_input_tokens_seen": 147921245, "step": 6850, "time_per_iteration": 2.637136220932007 }, { "auxiliary_loss_clip": 0.01365849, "auxiliary_loss_mlp": 0.01027705, "balance_loss_clip": 1.03689027, "balance_loss_mlp": 1.02076697, "epoch": 0.8237840437684122, "flos": 61639982334720.0, "grad_norm": 1.6857020507723248, "language_loss": 0.64200616, "learning_rate": 3.1684238047698974e-07, "loss": 0.66594177, "num_input_tokens_seen": 147949515, "step": 6851, "time_per_iteration": 3.0977232456207275 }, { "auxiliary_loss_clip": 0.01275643, "auxiliary_loss_mlp": 0.0102512, "balance_loss_clip": 1.0477922, "balance_loss_mlp": 1.01808095, "epoch": 0.8239042866590512, "flos": 27309023821440.0, "grad_norm": 2.1975193477272636, "language_loss": 0.52991784, "learning_rate": 3.1642175881663155e-07, "loss": 0.55292547, "num_input_tokens_seen": 147969245, "step": 6852, "time_per_iteration": 2.746411085128784 }, { "auxiliary_loss_clip": 0.01171394, "auxiliary_loss_mlp": 0.01025384, "balance_loss_clip": 1.04751396, "balance_loss_mlp": 1.01847291, "epoch": 0.8240245295496904, "flos": 21726351187200.0, "grad_norm": 2.247745445393768, "language_loss": 0.8394829, "learning_rate": 3.160013925537537e-07, "loss": 0.86145067, "num_input_tokens_seen": 147990080, "step": 6853, "time_per_iteration": 2.591095447540283 }, { "auxiliary_loss_clip": 0.01225123, "auxiliary_loss_mlp": 0.01025763, "balance_loss_clip": 1.04652369, "balance_loss_mlp": 1.01860404, "epoch": 0.8241447724403295, "flos": 20009318279040.0, "grad_norm": 2.1398394458436596, "language_loss": 0.75658637, "learning_rate": 3.155812817521266e-07, "loss": 0.77909523, "num_input_tokens_seen": 148010455, "step": 6854, "time_per_iteration": 2.7101058959960938 }, { "auxiliary_loss_clip": 0.01278693, "auxiliary_loss_mlp": 0.01025822, "balance_loss_clip": 1.05091679, "balance_loss_mlp": 1.01813269, "epoch": 0.8242650153309685, "flos": 22272983337600.0, "grad_norm": 2.3610709135328127, "language_loss": 0.77975512, "learning_rate": 3.151614264754787e-07, "loss": 0.8028003, "num_input_tokens_seen": 148028400, "step": 6855, "time_per_iteration": 2.672050714492798 }, { "auxiliary_loss_clip": 0.0117422, "auxiliary_loss_mlp": 0.0102073, "balance_loss_clip": 1.0477196, "balance_loss_mlp": 1.01386356, "epoch": 0.8243852582216077, "flos": 22309971367680.0, "grad_norm": 2.5368371574171875, "language_loss": 0.79629368, "learning_rate": 3.147418267875035e-07, "loss": 0.81824321, "num_input_tokens_seen": 148046530, "step": 6856, "time_per_iteration": 2.5626449584960938 }, { "auxiliary_loss_clip": 0.01421087, "auxiliary_loss_mlp": 0.02565082, "balance_loss_clip": 1.03961778, "balance_loss_mlp": 0.99991339, "epoch": 0.8245055011122467, "flos": 24645421756800.0, "grad_norm": 3.36237442034114, "language_loss": 0.65553236, "learning_rate": 3.1432248275185315e-07, "loss": 0.69539404, "num_input_tokens_seen": 148067040, "step": 6857, "time_per_iteration": 2.8278796672821045 }, { "auxiliary_loss_clip": 0.0122101, "auxiliary_loss_mlp": 0.01024196, "balance_loss_clip": 1.04795933, "balance_loss_mlp": 1.01692462, "epoch": 0.8246257440028858, "flos": 17487275713920.0, "grad_norm": 3.440686790732016, "language_loss": 0.76549423, "learning_rate": 3.139033944321412e-07, "loss": 0.78794628, "num_input_tokens_seen": 148084400, "step": 6858, "time_per_iteration": 2.5445709228515625 }, { "auxiliary_loss_clip": 0.01225878, "auxiliary_loss_mlp": 0.01025086, "balance_loss_clip": 1.04779267, "balance_loss_mlp": 1.01816273, "epoch": 0.824745986893525, "flos": 25010130499200.0, "grad_norm": 1.712750835300007, "language_loss": 0.7897166, "learning_rate": 3.1348456189194507e-07, "loss": 0.81222618, "num_input_tokens_seen": 148104860, "step": 6859, "time_per_iteration": 2.6594972610473633 }, { "auxiliary_loss_clip": 0.01318249, "auxiliary_loss_mlp": 0.01022861, "balance_loss_clip": 1.04137492, "balance_loss_mlp": 1.0156461, "epoch": 0.824866229784164, "flos": 18772698798720.0, "grad_norm": 1.8749240135180056, "language_loss": 0.83091676, "learning_rate": 3.1306598519479876e-07, "loss": 0.8543278, "num_input_tokens_seen": 148124680, "step": 6860, "time_per_iteration": 2.689815044403076 }, { "auxiliary_loss_clip": 0.0126999, "auxiliary_loss_mlp": 0.01030412, "balance_loss_clip": 1.04668331, "balance_loss_mlp": 1.02334833, "epoch": 0.8249864726748031, "flos": 23842171866240.0, "grad_norm": 2.5909489018878675, "language_loss": 0.78245074, "learning_rate": 3.1264766440420177e-07, "loss": 0.80545479, "num_input_tokens_seen": 148147150, "step": 6861, "time_per_iteration": 2.710061550140381 }, { "auxiliary_loss_clip": 0.01219358, "auxiliary_loss_mlp": 0.01025681, "balance_loss_clip": 1.04784036, "balance_loss_mlp": 1.01899004, "epoch": 0.8251067155654422, "flos": 20303103617280.0, "grad_norm": 1.9410662852094713, "language_loss": 0.6940161, "learning_rate": 3.122295995836124e-07, "loss": 0.71646649, "num_input_tokens_seen": 148167020, "step": 6862, "time_per_iteration": 2.676382064819336 }, { "auxiliary_loss_clip": 0.01226099, "auxiliary_loss_mlp": 0.01023195, "balance_loss_clip": 1.04635835, "balance_loss_mlp": 1.01626873, "epoch": 0.8252269584560813, "flos": 25009699536000.0, "grad_norm": 1.8677406558935064, "language_loss": 0.77654016, "learning_rate": 3.118117907964508e-07, "loss": 0.79903316, "num_input_tokens_seen": 148188965, "step": 6863, "time_per_iteration": 2.6928765773773193 }, { "auxiliary_loss_clip": 0.01329912, "auxiliary_loss_mlp": 0.0102467, "balance_loss_clip": 1.0451231, "balance_loss_mlp": 1.01829553, "epoch": 0.8253472013467203, "flos": 17128564542720.0, "grad_norm": 1.9527667043482129, "language_loss": 0.80109549, "learning_rate": 3.1139423810609856e-07, "loss": 0.82464129, "num_input_tokens_seen": 148205660, "step": 6864, "time_per_iteration": 2.728998899459839 }, { "auxiliary_loss_clip": 0.01171812, "auxiliary_loss_mlp": 0.01023633, "balance_loss_clip": 1.04689765, "balance_loss_mlp": 1.01691234, "epoch": 0.8254674442373595, "flos": 22414794232320.0, "grad_norm": 2.1099261657712596, "language_loss": 0.75353622, "learning_rate": 3.1097694157589714e-07, "loss": 0.77549064, "num_input_tokens_seen": 148225545, "step": 6865, "time_per_iteration": 3.6986193656921387 }, { "auxiliary_loss_clip": 0.01219436, "auxiliary_loss_mlp": 0.01021832, "balance_loss_clip": 1.04851162, "balance_loss_mlp": 1.01484334, "epoch": 0.8255876871279986, "flos": 24786765774720.0, "grad_norm": 2.4843214968512757, "language_loss": 0.76084709, "learning_rate": 3.105599012691511e-07, "loss": 0.78325975, "num_input_tokens_seen": 148243975, "step": 6866, "time_per_iteration": 3.482978105545044 }, { "auxiliary_loss_clip": 0.01219724, "auxiliary_loss_mlp": 0.01026893, "balance_loss_clip": 1.04812264, "balance_loss_mlp": 1.0197134, "epoch": 0.8257079300186376, "flos": 27455431656960.0, "grad_norm": 1.4375205107180464, "language_loss": 0.82595432, "learning_rate": 3.101431172491249e-07, "loss": 0.84842044, "num_input_tokens_seen": 148265520, "step": 6867, "time_per_iteration": 2.66448974609375 }, { "auxiliary_loss_clip": 0.0132513, "auxiliary_loss_mlp": 0.02568002, "balance_loss_clip": 1.04253078, "balance_loss_mlp": 0.99991709, "epoch": 0.8258281729092768, "flos": 16471866142080.0, "grad_norm": 2.1983982420043207, "language_loss": 0.72141808, "learning_rate": 3.097265895790444e-07, "loss": 0.76034939, "num_input_tokens_seen": 148283730, "step": 6868, "time_per_iteration": 2.6679491996765137 }, { "auxiliary_loss_clip": 0.01320125, "auxiliary_loss_mlp": 0.01029626, "balance_loss_clip": 1.04262793, "balance_loss_mlp": 1.02218127, "epoch": 0.8259484157999158, "flos": 21433822824960.0, "grad_norm": 1.8982764641070924, "language_loss": 0.83476067, "learning_rate": 3.093103183220962e-07, "loss": 0.85825813, "num_input_tokens_seen": 148303775, "step": 6869, "time_per_iteration": 3.517441987991333 }, { "auxiliary_loss_clip": 0.01115105, "auxiliary_loss_mlp": 0.01003852, "balance_loss_clip": 1.00747228, "balance_loss_mlp": 1.00298786, "epoch": 0.8260686586905549, "flos": 58322342453760.0, "grad_norm": 0.8242082274866178, "language_loss": 0.59290105, "learning_rate": 3.0889430354142796e-07, "loss": 0.61409062, "num_input_tokens_seen": 148365285, "step": 6870, "time_per_iteration": 3.16646409034729 }, { "auxiliary_loss_clip": 0.01321984, "auxiliary_loss_mlp": 0.01025521, "balance_loss_clip": 1.04286766, "balance_loss_mlp": 1.01850581, "epoch": 0.826188901581194, "flos": 27527288814720.0, "grad_norm": 3.432511771096845, "language_loss": 0.69942486, "learning_rate": 3.084785453001497e-07, "loss": 0.72289991, "num_input_tokens_seen": 148386200, "step": 6871, "time_per_iteration": 2.758803129196167 }, { "auxiliary_loss_clip": 0.01271699, "auxiliary_loss_mlp": 0.02567009, "balance_loss_clip": 1.0485481, "balance_loss_mlp": 0.99992251, "epoch": 0.8263091444718331, "flos": 23696051339520.0, "grad_norm": 4.5130259671257615, "language_loss": 0.82297146, "learning_rate": 3.080630436613314e-07, "loss": 0.86135858, "num_input_tokens_seen": 148403970, "step": 6872, "time_per_iteration": 2.705799102783203 }, { "auxiliary_loss_clip": 0.01218928, "auxiliary_loss_mlp": 0.01025535, "balance_loss_clip": 1.04606891, "balance_loss_mlp": 1.01865959, "epoch": 0.8264293873624722, "flos": 17165157523200.0, "grad_norm": 2.1044213496194586, "language_loss": 0.85895318, "learning_rate": 3.076477986880039e-07, "loss": 0.88139772, "num_input_tokens_seen": 148421765, "step": 6873, "time_per_iteration": 2.5802536010742188 }, { "auxiliary_loss_clip": 0.01269005, "auxiliary_loss_mlp": 0.01023583, "balance_loss_clip": 1.04666543, "balance_loss_mlp": 1.01634645, "epoch": 0.8265496302531112, "flos": 24098645952000.0, "grad_norm": 2.3465040842235787, "language_loss": 0.69720829, "learning_rate": 3.0723281044315986e-07, "loss": 0.72013414, "num_input_tokens_seen": 148443720, "step": 6874, "time_per_iteration": 3.6080055236816406 }, { "auxiliary_loss_clip": 0.01167114, "auxiliary_loss_mlp": 0.01026703, "balance_loss_clip": 1.04572749, "balance_loss_mlp": 1.02020621, "epoch": 0.8266698731437504, "flos": 14099894599680.0, "grad_norm": 2.138434155816014, "language_loss": 0.76391959, "learning_rate": 3.068180789897521e-07, "loss": 0.78585774, "num_input_tokens_seen": 148462130, "step": 6875, "time_per_iteration": 2.550772190093994 }, { "auxiliary_loss_clip": 0.01227974, "auxiliary_loss_mlp": 0.01027892, "balance_loss_clip": 1.0490284, "balance_loss_mlp": 1.02012002, "epoch": 0.8267901160343895, "flos": 30777563715840.0, "grad_norm": 2.3755490906661616, "language_loss": 0.81513262, "learning_rate": 3.064036043906966e-07, "loss": 0.83769131, "num_input_tokens_seen": 148485570, "step": 6876, "time_per_iteration": 2.717819929122925 }, { "auxiliary_loss_clip": 0.01329701, "auxiliary_loss_mlp": 0.01028927, "balance_loss_clip": 1.04533184, "balance_loss_mlp": 1.02167642, "epoch": 0.8269103589250285, "flos": 40624915242240.0, "grad_norm": 2.2345010911581205, "language_loss": 0.67955077, "learning_rate": 3.059893867088668e-07, "loss": 0.70313704, "num_input_tokens_seen": 148509715, "step": 6877, "time_per_iteration": 2.869713544845581 }, { "auxiliary_loss_clip": 0.01220809, "auxiliary_loss_mlp": 0.01024507, "balance_loss_clip": 1.04707682, "balance_loss_mlp": 1.01750362, "epoch": 0.8270306018156677, "flos": 30263645877120.0, "grad_norm": 1.8153693788476575, "language_loss": 0.67087936, "learning_rate": 3.055754260071004e-07, "loss": 0.69333255, "num_input_tokens_seen": 148532010, "step": 6878, "time_per_iteration": 2.7003121376037598 }, { "auxiliary_loss_clip": 0.01222426, "auxiliary_loss_mlp": 0.01022875, "balance_loss_clip": 1.04742026, "balance_loss_mlp": 1.01607108, "epoch": 0.8271508447063067, "flos": 25226599812480.0, "grad_norm": 2.189257792299753, "language_loss": 0.73829222, "learning_rate": 3.051617223481948e-07, "loss": 0.76074523, "num_input_tokens_seen": 148553330, "step": 6879, "time_per_iteration": 2.687650442123413 }, { "auxiliary_loss_clip": 0.01231427, "auxiliary_loss_mlp": 0.01030753, "balance_loss_clip": 1.04456353, "balance_loss_mlp": 1.02334762, "epoch": 0.8272710875969458, "flos": 17566602900480.0, "grad_norm": 4.83644842740187, "language_loss": 0.75127506, "learning_rate": 3.047482757949078e-07, "loss": 0.77389693, "num_input_tokens_seen": 148570960, "step": 6880, "time_per_iteration": 2.650857925415039 }, { "auxiliary_loss_clip": 0.0131674, "auxiliary_loss_mlp": 0.02560515, "balance_loss_clip": 1.04256737, "balance_loss_mlp": 0.99991429, "epoch": 0.827391330487585, "flos": 19755465886080.0, "grad_norm": 2.334594954328534, "language_loss": 0.85829675, "learning_rate": 3.043350864099605e-07, "loss": 0.89706928, "num_input_tokens_seen": 148589520, "step": 6881, "time_per_iteration": 2.6686511039733887 }, { "auxiliary_loss_clip": 0.01226131, "auxiliary_loss_mlp": 0.01027968, "balance_loss_clip": 1.04612112, "balance_loss_mlp": 1.02077425, "epoch": 0.827511573378224, "flos": 16835174254080.0, "grad_norm": 2.699538401908338, "language_loss": 0.80987948, "learning_rate": 3.039221542560315e-07, "loss": 0.83242053, "num_input_tokens_seen": 148606085, "step": 6882, "time_per_iteration": 2.608227491378784 }, { "auxiliary_loss_clip": 0.01220124, "auxiliary_loss_mlp": 0.01024495, "balance_loss_clip": 1.04760647, "balance_loss_mlp": 1.01789129, "epoch": 0.8276318162688631, "flos": 18369242259840.0, "grad_norm": 1.9365515716355337, "language_loss": 0.73757762, "learning_rate": 3.0350947939576356e-07, "loss": 0.76002383, "num_input_tokens_seen": 148625240, "step": 6883, "time_per_iteration": 2.610100507736206 }, { "auxiliary_loss_clip": 0.0123023, "auxiliary_loss_mlp": 0.01026995, "balance_loss_clip": 1.04899955, "balance_loss_mlp": 1.01968443, "epoch": 0.8277520591595022, "flos": 19352691705600.0, "grad_norm": 1.8015335880173156, "language_loss": 0.72372264, "learning_rate": 3.0309706189175876e-07, "loss": 0.74629486, "num_input_tokens_seen": 148645075, "step": 6884, "time_per_iteration": 2.654231071472168 }, { "auxiliary_loss_clip": 0.01171309, "auxiliary_loss_mlp": 0.01000011, "balance_loss_clip": 1.00764942, "balance_loss_mlp": 0.99911708, "epoch": 0.8278723020501413, "flos": 67918858329600.0, "grad_norm": 0.754428159258695, "language_loss": 0.57309759, "learning_rate": 3.0268490180658045e-07, "loss": 0.59481078, "num_input_tokens_seen": 148707855, "step": 6885, "time_per_iteration": 3.2540221214294434 }, { "auxiliary_loss_clip": 0.01177862, "auxiliary_loss_mlp": 0.01030356, "balance_loss_clip": 1.05160117, "balance_loss_mlp": 1.0232991, "epoch": 0.8279925449407803, "flos": 18185738653440.0, "grad_norm": 2.9549567070758442, "language_loss": 0.79450917, "learning_rate": 3.0227299920275305e-07, "loss": 0.81659126, "num_input_tokens_seen": 148724170, "step": 6886, "time_per_iteration": 2.5398073196411133 }, { "auxiliary_loss_clip": 0.013223, "auxiliary_loss_mlp": 0.01031158, "balance_loss_clip": 1.04641509, "balance_loss_mlp": 1.02360654, "epoch": 0.8281127878314195, "flos": 20631434860800.0, "grad_norm": 2.0539805425995206, "language_loss": 0.85616624, "learning_rate": 3.018613541427613e-07, "loss": 0.87970084, "num_input_tokens_seen": 148743690, "step": 6887, "time_per_iteration": 2.710864782333374 }, { "auxiliary_loss_clip": 0.01172566, "auxiliary_loss_mlp": 0.01031217, "balance_loss_clip": 1.04773808, "balance_loss_mlp": 1.02432036, "epoch": 0.8282330307220586, "flos": 18004282122240.0, "grad_norm": 1.732597852631137, "language_loss": 0.74185508, "learning_rate": 3.0144996668905243e-07, "loss": 0.76389289, "num_input_tokens_seen": 148761070, "step": 6888, "time_per_iteration": 2.5511906147003174 }, { "auxiliary_loss_clip": 0.01423831, "auxiliary_loss_mlp": 0.02566032, "balance_loss_clip": 1.04100037, "balance_loss_mlp": 0.99991691, "epoch": 0.8283532736126976, "flos": 20084120352000.0, "grad_norm": 2.3880817732015105, "language_loss": 0.82215619, "learning_rate": 3.010388369040331e-07, "loss": 0.86205488, "num_input_tokens_seen": 148779730, "step": 6889, "time_per_iteration": 2.778477907180786 }, { "auxiliary_loss_clip": 0.01219392, "auxiliary_loss_mlp": 0.01030525, "balance_loss_clip": 1.04685056, "balance_loss_mlp": 1.02372146, "epoch": 0.8284735165033368, "flos": 31868421805440.0, "grad_norm": 1.7171590567151225, "language_loss": 0.82859206, "learning_rate": 3.0062796485007156e-07, "loss": 0.85109121, "num_input_tokens_seen": 148800670, "step": 6890, "time_per_iteration": 2.689568042755127 }, { "auxiliary_loss_clip": 0.01173606, "auxiliary_loss_mlp": 0.02568006, "balance_loss_clip": 1.04887938, "balance_loss_mlp": 0.99991995, "epoch": 0.8285937593939758, "flos": 26651319840000.0, "grad_norm": 3.716724106018293, "language_loss": 0.65730619, "learning_rate": 3.002173505894965e-07, "loss": 0.69472229, "num_input_tokens_seen": 148819820, "step": 6891, "time_per_iteration": 3.4912078380584717 }, { "auxiliary_loss_clip": 0.01229057, "auxiliary_loss_mlp": 0.01026808, "balance_loss_clip": 1.04766464, "balance_loss_mlp": 1.01895237, "epoch": 0.8287140022846149, "flos": 20193683811840.0, "grad_norm": 4.305123467572963, "language_loss": 0.63304007, "learning_rate": 2.998069941845973e-07, "loss": 0.6555987, "num_input_tokens_seen": 148838890, "step": 6892, "time_per_iteration": 3.599705457687378 }, { "auxiliary_loss_clip": 0.01061277, "auxiliary_loss_mlp": 0.01002674, "balance_loss_clip": 1.00710082, "balance_loss_mlp": 1.00179768, "epoch": 0.8288342451752541, "flos": 70755980019840.0, "grad_norm": 0.7080599710986688, "language_loss": 0.57366323, "learning_rate": 2.993968956976258e-07, "loss": 0.59430277, "num_input_tokens_seen": 148906635, "step": 6893, "time_per_iteration": 3.3059377670288086 }, { "auxiliary_loss_clip": 0.0117985, "auxiliary_loss_mlp": 0.01030783, "balance_loss_clip": 1.05061102, "balance_loss_mlp": 1.02327919, "epoch": 0.8289544880658931, "flos": 24572235795840.0, "grad_norm": 1.8023642303324676, "language_loss": 0.70186508, "learning_rate": 2.9898705519079313e-07, "loss": 0.72397149, "num_input_tokens_seen": 148925740, "step": 6894, "time_per_iteration": 2.6171798706054688 }, { "auxiliary_loss_clip": 0.01267052, "auxiliary_loss_mlp": 0.01023802, "balance_loss_clip": 1.04360986, "balance_loss_mlp": 1.01665235, "epoch": 0.8290747309565322, "flos": 22273378387200.0, "grad_norm": 2.2601270752193963, "language_loss": 0.7492407, "learning_rate": 2.985774727262715e-07, "loss": 0.77214921, "num_input_tokens_seen": 148944585, "step": 6895, "time_per_iteration": 3.6169822216033936 }, { "auxiliary_loss_clip": 0.01171789, "auxiliary_loss_mlp": 0.01023062, "balance_loss_clip": 1.04810953, "balance_loss_mlp": 1.01690805, "epoch": 0.8291949738471713, "flos": 23255570856960.0, "grad_norm": 4.04637680392429, "language_loss": 0.81520879, "learning_rate": 2.981681483661949e-07, "loss": 0.83715737, "num_input_tokens_seen": 148964170, "step": 6896, "time_per_iteration": 2.5873208045959473 }, { "auxiliary_loss_clip": 0.01225365, "auxiliary_loss_mlp": 0.01026211, "balance_loss_clip": 1.05174756, "balance_loss_mlp": 1.01887965, "epoch": 0.8293152167378104, "flos": 52555768185600.0, "grad_norm": 2.1166483665659124, "language_loss": 0.71185768, "learning_rate": 2.9775908217265633e-07, "loss": 0.73437345, "num_input_tokens_seen": 148989405, "step": 6897, "time_per_iteration": 2.8773345947265625 }, { "auxiliary_loss_clip": 0.01321636, "auxiliary_loss_mlp": 0.01000298, "balance_loss_clip": 1.00676465, "balance_loss_mlp": 0.99936849, "epoch": 0.8294354596284494, "flos": 63356156294400.0, "grad_norm": 0.8298432984683878, "language_loss": 0.50343776, "learning_rate": 2.9735027420771253e-07, "loss": 0.5266571, "num_input_tokens_seen": 149049740, "step": 6898, "time_per_iteration": 3.4237570762634277 }, { "auxiliary_loss_clip": 0.01266142, "auxiliary_loss_mlp": 0.0102501, "balance_loss_clip": 1.04746723, "balance_loss_mlp": 1.01834881, "epoch": 0.8295557025190886, "flos": 24827021942400.0, "grad_norm": 2.0710057178447725, "language_loss": 0.71275198, "learning_rate": 2.969417245333774e-07, "loss": 0.73566353, "num_input_tokens_seen": 149069120, "step": 6899, "time_per_iteration": 3.8427321910858154 }, { "auxiliary_loss_clip": 0.01313415, "auxiliary_loss_mlp": 0.01025011, "balance_loss_clip": 1.04549789, "balance_loss_mlp": 1.01857114, "epoch": 0.8296759454097277, "flos": 25118580637440.0, "grad_norm": 2.0846447782156305, "language_loss": 0.77815878, "learning_rate": 2.9653343321162915e-07, "loss": 0.80154312, "num_input_tokens_seen": 149088630, "step": 6900, "time_per_iteration": 2.7146289348602295 }, { "auxiliary_loss_clip": 0.01321096, "auxiliary_loss_mlp": 0.0102438, "balance_loss_clip": 1.04769826, "balance_loss_mlp": 1.01717103, "epoch": 0.8297961883003667, "flos": 24132581326080.0, "grad_norm": 2.0959099167835644, "language_loss": 0.64879942, "learning_rate": 2.9612540030440446e-07, "loss": 0.67225415, "num_input_tokens_seen": 149109175, "step": 6901, "time_per_iteration": 2.6854827404022217 }, { "auxiliary_loss_clip": 0.0116763, "auxiliary_loss_mlp": 0.01004305, "balance_loss_clip": 1.00599253, "balance_loss_mlp": 1.00343513, "epoch": 0.8299164311910058, "flos": 67446561375360.0, "grad_norm": 0.843773906398358, "language_loss": 0.64030319, "learning_rate": 2.9571762587360206e-07, "loss": 0.66202253, "num_input_tokens_seen": 149165560, "step": 6902, "time_per_iteration": 3.209599733352661 }, { "auxiliary_loss_clip": 0.01372472, "auxiliary_loss_mlp": 0.01023368, "balance_loss_clip": 1.0383141, "balance_loss_mlp": 1.01674867, "epoch": 0.8300366740816449, "flos": 25228682801280.0, "grad_norm": 1.743456936098702, "language_loss": 0.74433464, "learning_rate": 2.953101099810806e-07, "loss": 0.76829302, "num_input_tokens_seen": 149185165, "step": 6903, "time_per_iteration": 2.882227897644043 }, { "auxiliary_loss_clip": 0.01218288, "auxiliary_loss_mlp": 0.01025557, "balance_loss_clip": 1.04908109, "balance_loss_mlp": 1.01924181, "epoch": 0.830156916972284, "flos": 18041018757120.0, "grad_norm": 2.103764252669273, "language_loss": 0.82990605, "learning_rate": 2.9490285268865965e-07, "loss": 0.85234451, "num_input_tokens_seen": 149202655, "step": 6904, "time_per_iteration": 2.5901689529418945 }, { "auxiliary_loss_clip": 0.01231503, "auxiliary_loss_mlp": 0.0102679, "balance_loss_clip": 1.05234265, "balance_loss_mlp": 1.01920199, "epoch": 0.830277159862923, "flos": 26322485806080.0, "grad_norm": 2.260151225591108, "language_loss": 0.79675686, "learning_rate": 2.9449585405812085e-07, "loss": 0.81933981, "num_input_tokens_seen": 149220035, "step": 6905, "time_per_iteration": 2.706911087036133 }, { "auxiliary_loss_clip": 0.01223359, "auxiliary_loss_mlp": 0.01026607, "balance_loss_clip": 1.04680467, "balance_loss_mlp": 1.01976752, "epoch": 0.8303974027535622, "flos": 19938861751680.0, "grad_norm": 1.7866222348310496, "language_loss": 0.73795938, "learning_rate": 2.940891141512043e-07, "loss": 0.76045907, "num_input_tokens_seen": 149238055, "step": 6906, "time_per_iteration": 2.741992235183716 }, { "auxiliary_loss_clip": 0.01274941, "auxiliary_loss_mlp": 0.01026914, "balance_loss_clip": 1.0446974, "balance_loss_mlp": 1.01944542, "epoch": 0.8305176456442013, "flos": 17165552572800.0, "grad_norm": 2.6318841988185837, "language_loss": 0.72064793, "learning_rate": 2.9368263302961385e-07, "loss": 0.74366647, "num_input_tokens_seen": 149256755, "step": 6907, "time_per_iteration": 2.702859401702881 }, { "auxiliary_loss_clip": 0.01417694, "auxiliary_loss_mlp": 0.01023905, "balance_loss_clip": 1.04022312, "balance_loss_mlp": 1.0168364, "epoch": 0.8306378885348403, "flos": 25627614226560.0, "grad_norm": 1.777146384346947, "language_loss": 0.79932821, "learning_rate": 2.9327641075501075e-07, "loss": 0.82374418, "num_input_tokens_seen": 149275745, "step": 6908, "time_per_iteration": 2.8070156574249268 }, { "auxiliary_loss_clip": 0.01267783, "auxiliary_loss_mlp": 0.01022479, "balance_loss_clip": 1.04194319, "balance_loss_mlp": 1.01503754, "epoch": 0.8307581314254795, "flos": 33947864985600.0, "grad_norm": 3.3697048303296215, "language_loss": 0.66882253, "learning_rate": 2.9287044738901866e-07, "loss": 0.69172513, "num_input_tokens_seen": 149293730, "step": 6909, "time_per_iteration": 2.727203607559204 }, { "auxiliary_loss_clip": 0.01227291, "auxiliary_loss_mlp": 0.02565538, "balance_loss_clip": 1.04907322, "balance_loss_mlp": 0.99989522, "epoch": 0.8308783743161186, "flos": 17562724231680.0, "grad_norm": 2.178967488985534, "language_loss": 0.90950489, "learning_rate": 2.9246474299322274e-07, "loss": 0.94743311, "num_input_tokens_seen": 149309290, "step": 6910, "time_per_iteration": 2.5994694232940674 }, { "auxiliary_loss_clip": 0.01115615, "auxiliary_loss_mlp": 0.01000898, "balance_loss_clip": 1.00794625, "balance_loss_mlp": 0.99991435, "epoch": 0.8309986172067576, "flos": 69412885649280.0, "grad_norm": 0.8819562740062139, "language_loss": 0.63116491, "learning_rate": 2.920592976291678e-07, "loss": 0.65233004, "num_input_tokens_seen": 149366620, "step": 6911, "time_per_iteration": 3.18475341796875 }, { "auxiliary_loss_clip": 0.0122439, "auxiliary_loss_mlp": 0.01028305, "balance_loss_clip": 1.04632008, "balance_loss_mlp": 1.02136993, "epoch": 0.8311188600973968, "flos": 22309755886080.0, "grad_norm": 2.0840556827873833, "language_loss": 0.81042898, "learning_rate": 2.916541113583595e-07, "loss": 0.83295584, "num_input_tokens_seen": 149385120, "step": 6912, "time_per_iteration": 2.6249215602874756 }, { "auxiliary_loss_clip": 0.01223611, "auxiliary_loss_mlp": 0.01030025, "balance_loss_clip": 1.04759896, "balance_loss_mlp": 1.02320576, "epoch": 0.8312391029880358, "flos": 18770077105920.0, "grad_norm": 10.876641066251473, "language_loss": 0.66291153, "learning_rate": 2.912491842422642e-07, "loss": 0.68544793, "num_input_tokens_seen": 149402825, "step": 6913, "time_per_iteration": 2.6369078159332275 }, { "auxiliary_loss_clip": 0.0122419, "auxiliary_loss_mlp": 0.0103349, "balance_loss_clip": 1.04759741, "balance_loss_mlp": 1.02640629, "epoch": 0.8313593458786749, "flos": 20376648714240.0, "grad_norm": 1.8846834540387183, "language_loss": 0.70869315, "learning_rate": 2.9084451634230857e-07, "loss": 0.73126996, "num_input_tokens_seen": 149422125, "step": 6914, "time_per_iteration": 2.6218020915985107 }, { "auxiliary_loss_clip": 0.01317841, "auxiliary_loss_mlp": 0.01026782, "balance_loss_clip": 1.04265618, "balance_loss_mlp": 1.01952219, "epoch": 0.831479588769314, "flos": 32124069878400.0, "grad_norm": 2.5457551926742523, "language_loss": 0.71609962, "learning_rate": 2.9044010771988125e-07, "loss": 0.73954588, "num_input_tokens_seen": 149441940, "step": 6915, "time_per_iteration": 2.7740542888641357 }, { "auxiliary_loss_clip": 0.01269037, "auxiliary_loss_mlp": 0.0102876, "balance_loss_clip": 1.0454576, "balance_loss_mlp": 1.02200985, "epoch": 0.8315998316599531, "flos": 45185929338240.0, "grad_norm": 1.7965839409102575, "language_loss": 0.72221893, "learning_rate": 2.900359584363303e-07, "loss": 0.74519694, "num_input_tokens_seen": 149465045, "step": 6916, "time_per_iteration": 2.8624584674835205 }, { "auxiliary_loss_clip": 0.01371917, "auxiliary_loss_mlp": 0.0103003, "balance_loss_clip": 1.04522991, "balance_loss_mlp": 1.02226937, "epoch": 0.8317200745505922, "flos": 18363747479040.0, "grad_norm": 2.1755449054169533, "language_loss": 0.84518856, "learning_rate": 2.8963206855296494e-07, "loss": 0.86920798, "num_input_tokens_seen": 149481285, "step": 6917, "time_per_iteration": 4.056193113327026 }, { "auxiliary_loss_clip": 0.01225695, "auxiliary_loss_mlp": 0.01023466, "balance_loss_clip": 1.04745221, "balance_loss_mlp": 1.01645088, "epoch": 0.8318403174412313, "flos": 24206557386240.0, "grad_norm": 2.0767989667324005, "language_loss": 0.77338642, "learning_rate": 2.892284381310548e-07, "loss": 0.79587799, "num_input_tokens_seen": 149502700, "step": 6918, "time_per_iteration": 3.5085866451263428 }, { "auxiliary_loss_clip": 0.01267605, "auxiliary_loss_mlp": 0.01025588, "balance_loss_clip": 1.04560399, "balance_loss_mlp": 1.01848888, "epoch": 0.8319605603318704, "flos": 22418780641920.0, "grad_norm": 2.4738552832796663, "language_loss": 0.72418672, "learning_rate": 2.888250672318302e-07, "loss": 0.74711871, "num_input_tokens_seen": 149520100, "step": 6919, "time_per_iteration": 2.6608033180236816 }, { "auxiliary_loss_clip": 0.01176489, "auxiliary_loss_mlp": 0.01023933, "balance_loss_clip": 1.05003119, "balance_loss_mlp": 1.01721215, "epoch": 0.8320808032225094, "flos": 37414501459200.0, "grad_norm": 1.8353553890799, "language_loss": 0.68739438, "learning_rate": 2.884219559164831e-07, "loss": 0.70939851, "num_input_tokens_seen": 149543245, "step": 6920, "time_per_iteration": 2.6901814937591553 }, { "auxiliary_loss_clip": 0.01222918, "auxiliary_loss_mlp": 0.0103264, "balance_loss_clip": 1.04925704, "balance_loss_mlp": 1.02595234, "epoch": 0.8322010461131486, "flos": 12787395638400.0, "grad_norm": 2.146992057335875, "language_loss": 0.81454015, "learning_rate": 2.880191042461635e-07, "loss": 0.83709574, "num_input_tokens_seen": 149559185, "step": 6921, "time_per_iteration": 3.522088050842285 }, { "auxiliary_loss_clip": 0.01377608, "auxiliary_loss_mlp": 0.01025756, "balance_loss_clip": 1.04302859, "balance_loss_mlp": 1.01926458, "epoch": 0.8323212890037877, "flos": 15815455050240.0, "grad_norm": 1.76239463731894, "language_loss": 0.79850978, "learning_rate": 2.876165122819849e-07, "loss": 0.82254338, "num_input_tokens_seen": 149577165, "step": 6922, "time_per_iteration": 2.7109344005584717 }, { "auxiliary_loss_clip": 0.01171715, "auxiliary_loss_mlp": 0.01021712, "balance_loss_clip": 1.04855609, "balance_loss_mlp": 1.01534379, "epoch": 0.8324415318944267, "flos": 21719276208000.0, "grad_norm": 3.905092301915365, "language_loss": 0.79612559, "learning_rate": 2.872141800850201e-07, "loss": 0.81805986, "num_input_tokens_seen": 149594340, "step": 6923, "time_per_iteration": 2.537079095840454 }, { "auxiliary_loss_clip": 0.01171982, "auxiliary_loss_mlp": 0.01029237, "balance_loss_clip": 1.04801202, "balance_loss_mlp": 1.02219212, "epoch": 0.8325617747850659, "flos": 34198700636160.0, "grad_norm": 1.7319705342379115, "language_loss": 0.73702455, "learning_rate": 2.868121077163024e-07, "loss": 0.75903678, "num_input_tokens_seen": 149613895, "step": 6924, "time_per_iteration": 2.698092460632324 }, { "auxiliary_loss_clip": 0.01223901, "auxiliary_loss_mlp": 0.01026266, "balance_loss_clip": 1.04526591, "balance_loss_mlp": 1.01933074, "epoch": 0.8326820176757049, "flos": 18369457741440.0, "grad_norm": 6.458254260913927, "language_loss": 0.72386765, "learning_rate": 2.864102952368257e-07, "loss": 0.74636936, "num_input_tokens_seen": 149631820, "step": 6925, "time_per_iteration": 3.468461513519287 }, { "auxiliary_loss_clip": 0.013618, "auxiliary_loss_mlp": 0.01030798, "balance_loss_clip": 1.03579974, "balance_loss_mlp": 1.02355576, "epoch": 0.832802260566344, "flos": 35991325716480.0, "grad_norm": 1.5207405727163525, "language_loss": 0.59554154, "learning_rate": 2.860087427075444e-07, "loss": 0.61946756, "num_input_tokens_seen": 149656070, "step": 6926, "time_per_iteration": 2.8231353759765625 }, { "auxiliary_loss_clip": 0.01268783, "auxiliary_loss_mlp": 0.01020624, "balance_loss_clip": 1.04382992, "balance_loss_mlp": 1.01406479, "epoch": 0.8329225034569832, "flos": 14244434928000.0, "grad_norm": 2.8132246145999726, "language_loss": 0.86183989, "learning_rate": 2.856074501893744e-07, "loss": 0.88473403, "num_input_tokens_seen": 149671270, "step": 6927, "time_per_iteration": 2.639317512512207 }, { "auxiliary_loss_clip": 0.01224734, "auxiliary_loss_mlp": 0.01029542, "balance_loss_clip": 1.05088115, "balance_loss_mlp": 1.02250862, "epoch": 0.8330427463476222, "flos": 18077468083200.0, "grad_norm": 13.405841674803577, "language_loss": 0.81749249, "learning_rate": 2.8520641774319054e-07, "loss": 0.8400352, "num_input_tokens_seen": 149689360, "step": 6928, "time_per_iteration": 2.607154130935669 }, { "auxiliary_loss_clip": 0.01274066, "auxiliary_loss_mlp": 0.01029051, "balance_loss_clip": 1.04186761, "balance_loss_mlp": 1.02205396, "epoch": 0.8331629892382613, "flos": 18040839189120.0, "grad_norm": 2.1402604423044727, "language_loss": 0.75803304, "learning_rate": 2.848056454298309e-07, "loss": 0.78106427, "num_input_tokens_seen": 149706685, "step": 6929, "time_per_iteration": 2.6168859004974365 }, { "auxiliary_loss_clip": 0.01269796, "auxiliary_loss_mlp": 0.01024097, "balance_loss_clip": 1.04751384, "balance_loss_mlp": 1.01698339, "epoch": 0.8332832321289004, "flos": 17457398576640.0, "grad_norm": 2.6997826160034406, "language_loss": 0.6552844, "learning_rate": 2.844051333100905e-07, "loss": 0.67822337, "num_input_tokens_seen": 149724230, "step": 6930, "time_per_iteration": 2.6249876022338867 }, { "auxiliary_loss_clip": 0.01270562, "auxiliary_loss_mlp": 0.01025907, "balance_loss_clip": 1.04830158, "balance_loss_mlp": 1.01969028, "epoch": 0.8334034750195395, "flos": 15084852416640.0, "grad_norm": 1.9364926147375603, "language_loss": 0.83754587, "learning_rate": 2.840048814447269e-07, "loss": 0.86051059, "num_input_tokens_seen": 149742395, "step": 6931, "time_per_iteration": 2.6255948543548584 }, { "auxiliary_loss_clip": 0.01269042, "auxiliary_loss_mlp": 0.01032956, "balance_loss_clip": 1.04314268, "balance_loss_mlp": 1.02608085, "epoch": 0.8335237179101785, "flos": 19427170556160.0, "grad_norm": 2.908789640714984, "language_loss": 0.74261725, "learning_rate": 2.836048898944587e-07, "loss": 0.76563722, "num_input_tokens_seen": 149760820, "step": 6932, "time_per_iteration": 2.6810662746429443 }, { "auxiliary_loss_clip": 0.0127278, "auxiliary_loss_mlp": 0.01025932, "balance_loss_clip": 1.04660726, "balance_loss_mlp": 1.01989412, "epoch": 0.8336439608008177, "flos": 21762046327680.0, "grad_norm": 2.586953748936526, "language_loss": 0.72916806, "learning_rate": 2.832051587199642e-07, "loss": 0.75215513, "num_input_tokens_seen": 149778075, "step": 6933, "time_per_iteration": 2.689176559448242 }, { "auxiliary_loss_clip": 0.0111602, "auxiliary_loss_mlp": 0.01001863, "balance_loss_clip": 1.00701022, "balance_loss_mlp": 1.00096846, "epoch": 0.8337642036914568, "flos": 59702783990400.0, "grad_norm": 0.8082923431065165, "language_loss": 0.57683122, "learning_rate": 2.828056879818821e-07, "loss": 0.59801006, "num_input_tokens_seen": 149837150, "step": 6934, "time_per_iteration": 3.1253604888916016 }, { "auxiliary_loss_clip": 0.01321251, "auxiliary_loss_mlp": 0.01025717, "balance_loss_clip": 1.04091573, "balance_loss_mlp": 1.01934218, "epoch": 0.8338844465820958, "flos": 27162185022720.0, "grad_norm": 2.025056036385724, "language_loss": 0.83550274, "learning_rate": 2.824064777408117e-07, "loss": 0.85897243, "num_input_tokens_seen": 149856940, "step": 6935, "time_per_iteration": 2.7809760570526123 }, { "auxiliary_loss_clip": 0.01222146, "auxiliary_loss_mlp": 0.01031078, "balance_loss_clip": 1.04869223, "balance_loss_mlp": 1.02403593, "epoch": 0.8340046894727349, "flos": 30481264425600.0, "grad_norm": 2.313732438252162, "language_loss": 0.76224792, "learning_rate": 2.8200752805731263e-07, "loss": 0.78478014, "num_input_tokens_seen": 149879930, "step": 6936, "time_per_iteration": 2.6804275512695312 }, { "auxiliary_loss_clip": 0.0122174, "auxiliary_loss_mlp": 0.01021495, "balance_loss_clip": 1.04918635, "balance_loss_mlp": 1.01445317, "epoch": 0.834124932363374, "flos": 27126166659840.0, "grad_norm": 1.4402777532725128, "language_loss": 0.8108725, "learning_rate": 2.8160883899190625e-07, "loss": 0.83330482, "num_input_tokens_seen": 149903200, "step": 6937, "time_per_iteration": 2.7079339027404785 }, { "auxiliary_loss_clip": 0.01318246, "auxiliary_loss_mlp": 0.01030071, "balance_loss_clip": 1.04676509, "balance_loss_mlp": 1.02303791, "epoch": 0.8342451752540131, "flos": 24569865498240.0, "grad_norm": 3.0277230091059515, "language_loss": 0.73554862, "learning_rate": 2.8121041060507234e-07, "loss": 0.75903177, "num_input_tokens_seen": 149922230, "step": 6938, "time_per_iteration": 2.731416940689087 }, { "auxiliary_loss_clip": 0.0122775, "auxiliary_loss_mlp": 0.01023163, "balance_loss_clip": 1.04737055, "balance_loss_mlp": 1.01639771, "epoch": 0.8343654181446521, "flos": 26615085995520.0, "grad_norm": 1.8274513334432378, "language_loss": 0.71552277, "learning_rate": 2.808122429572528e-07, "loss": 0.73803192, "num_input_tokens_seen": 149942435, "step": 6939, "time_per_iteration": 2.7479119300842285 }, { "auxiliary_loss_clip": 0.01329129, "auxiliary_loss_mlp": 0.0102275, "balance_loss_clip": 1.04504263, "balance_loss_mlp": 1.01606202, "epoch": 0.8344856610352913, "flos": 20777268078720.0, "grad_norm": 2.588692238038136, "language_loss": 0.75493282, "learning_rate": 2.804143361088489e-07, "loss": 0.77845168, "num_input_tokens_seen": 149961615, "step": 6940, "time_per_iteration": 2.6946322917938232 }, { "auxiliary_loss_clip": 0.01269886, "auxiliary_loss_mlp": 0.01026559, "balance_loss_clip": 1.04566097, "balance_loss_mlp": 1.01954317, "epoch": 0.8346059039259304, "flos": 26095960684800.0, "grad_norm": 2.453199428921057, "language_loss": 0.77823657, "learning_rate": 2.8001669012022277e-07, "loss": 0.80120099, "num_input_tokens_seen": 149979585, "step": 6941, "time_per_iteration": 2.698209762573242 }, { "auxiliary_loss_clip": 0.01223115, "auxiliary_loss_mlp": 0.01027128, "balance_loss_clip": 1.0498805, "balance_loss_mlp": 1.02012503, "epoch": 0.8347261468165694, "flos": 29027708755200.0, "grad_norm": 4.621431341325348, "language_loss": 0.69343984, "learning_rate": 2.7961930505169795e-07, "loss": 0.7159422, "num_input_tokens_seen": 150003830, "step": 6942, "time_per_iteration": 2.696486711502075 }, { "auxiliary_loss_clip": 0.01224581, "auxiliary_loss_mlp": 0.02566335, "balance_loss_clip": 1.04885948, "balance_loss_mlp": 0.99991179, "epoch": 0.8348463897072086, "flos": 26396461866240.0, "grad_norm": 3.4003930313250104, "language_loss": 0.76281965, "learning_rate": 2.792221809635558e-07, "loss": 0.8007288, "num_input_tokens_seen": 150024460, "step": 6943, "time_per_iteration": 3.609750509262085 }, { "auxiliary_loss_clip": 0.01469307, "auxiliary_loss_mlp": 0.01028728, "balance_loss_clip": 1.04090583, "balance_loss_mlp": 1.02133989, "epoch": 0.8349666325978476, "flos": 23367720096000.0, "grad_norm": 2.9563872782296907, "language_loss": 0.74685276, "learning_rate": 2.788253179160411e-07, "loss": 0.77183306, "num_input_tokens_seen": 150045620, "step": 6944, "time_per_iteration": 3.853145122528076 }, { "auxiliary_loss_clip": 0.01271094, "auxiliary_loss_mlp": 0.01027672, "balance_loss_clip": 1.04582751, "balance_loss_mlp": 1.02071023, "epoch": 0.8350868754884867, "flos": 12896528135040.0, "grad_norm": 2.7946560310935524, "language_loss": 0.65156782, "learning_rate": 2.7842871596935725e-07, "loss": 0.67455554, "num_input_tokens_seen": 150064135, "step": 6945, "time_per_iteration": 2.808785915374756 }, { "auxiliary_loss_clip": 0.01125583, "auxiliary_loss_mlp": 0.01022735, "balance_loss_clip": 1.04722857, "balance_loss_mlp": 1.01535904, "epoch": 0.8352071183791259, "flos": 26505522535680.0, "grad_norm": 1.7792155423429783, "language_loss": 0.69263428, "learning_rate": 2.780323751836682e-07, "loss": 0.71411753, "num_input_tokens_seen": 150085350, "step": 6946, "time_per_iteration": 2.6427671909332275 }, { "auxiliary_loss_clip": 0.0127222, "auxiliary_loss_mlp": 0.0256467, "balance_loss_clip": 1.04401147, "balance_loss_mlp": 0.99990654, "epoch": 0.8353273612697649, "flos": 20668063754880.0, "grad_norm": 2.442520966307865, "language_loss": 0.79038143, "learning_rate": 2.7763629561909876e-07, "loss": 0.82875037, "num_input_tokens_seen": 150106180, "step": 6947, "time_per_iteration": 3.9412951469421387 }, { "auxiliary_loss_clip": 0.01172403, "auxiliary_loss_mlp": 0.01026622, "balance_loss_clip": 1.0487349, "balance_loss_mlp": 1.01995838, "epoch": 0.835447604160404, "flos": 19754137082880.0, "grad_norm": 2.123831448841596, "language_loss": 0.76815766, "learning_rate": 2.772404773357335e-07, "loss": 0.7901479, "num_input_tokens_seen": 150125585, "step": 6948, "time_per_iteration": 2.587252140045166 }, { "auxiliary_loss_clip": 0.01314831, "auxiliary_loss_mlp": 0.01026633, "balance_loss_clip": 1.04339015, "balance_loss_mlp": 1.01980782, "epoch": 0.8355678470510431, "flos": 23435842239360.0, "grad_norm": 1.834714662085777, "language_loss": 0.78457052, "learning_rate": 2.7684492039361853e-07, "loss": 0.80798519, "num_input_tokens_seen": 150144810, "step": 6949, "time_per_iteration": 2.7391738891601562 }, { "auxiliary_loss_clip": 0.01176254, "auxiliary_loss_mlp": 0.01028102, "balance_loss_clip": 1.05027294, "balance_loss_mlp": 1.02125072, "epoch": 0.8356880899416822, "flos": 21214588164480.0, "grad_norm": 1.855612512171314, "language_loss": 0.8370142, "learning_rate": 2.764496248527586e-07, "loss": 0.85905778, "num_input_tokens_seen": 150163785, "step": 6950, "time_per_iteration": 2.6354825496673584 }, { "auxiliary_loss_clip": 0.01228951, "auxiliary_loss_mlp": 0.01024851, "balance_loss_clip": 1.04546976, "balance_loss_mlp": 1.01808596, "epoch": 0.8358083328323213, "flos": 28037543466240.0, "grad_norm": 2.077607041694871, "language_loss": 0.78621441, "learning_rate": 2.760545907731211e-07, "loss": 0.80875242, "num_input_tokens_seen": 150184360, "step": 6951, "time_per_iteration": 2.713735818862915 }, { "auxiliary_loss_clip": 0.01224838, "auxiliary_loss_mlp": 0.01031492, "balance_loss_clip": 1.04625809, "balance_loss_mlp": 1.0244348, "epoch": 0.8359285757229604, "flos": 27783655159680.0, "grad_norm": 1.6782910810734124, "language_loss": 0.67878717, "learning_rate": 2.75659818214631e-07, "loss": 0.70135045, "num_input_tokens_seen": 150205465, "step": 6952, "time_per_iteration": 3.6054940223693848 }, { "auxiliary_loss_clip": 0.0127661, "auxiliary_loss_mlp": 0.01027421, "balance_loss_clip": 1.04798818, "balance_loss_mlp": 1.02064991, "epoch": 0.8360488186135995, "flos": 21435115714560.0, "grad_norm": 1.8730782944132267, "language_loss": 0.78294599, "learning_rate": 2.752653072371749e-07, "loss": 0.80598629, "num_input_tokens_seen": 150224900, "step": 6953, "time_per_iteration": 2.6986758708953857 }, { "auxiliary_loss_clip": 0.01312977, "auxiliary_loss_mlp": 0.01025928, "balance_loss_clip": 1.04659331, "balance_loss_mlp": 1.01928496, "epoch": 0.8361690615042385, "flos": 27632327160960.0, "grad_norm": 2.5207324046993485, "language_loss": 0.74864709, "learning_rate": 2.7487105790060105e-07, "loss": 0.77203619, "num_input_tokens_seen": 150244310, "step": 6954, "time_per_iteration": 2.8156509399414062 }, { "auxiliary_loss_clip": 0.01222853, "auxiliary_loss_mlp": 0.01023426, "balance_loss_clip": 1.04591966, "balance_loss_mlp": 1.01691413, "epoch": 0.8362893043948777, "flos": 39202529598720.0, "grad_norm": 2.145139650828975, "language_loss": 0.69610465, "learning_rate": 2.7447707026471587e-07, "loss": 0.71856749, "num_input_tokens_seen": 150267285, "step": 6955, "time_per_iteration": 2.788815975189209 }, { "auxiliary_loss_clip": 0.01320321, "auxiliary_loss_mlp": 0.01022918, "balance_loss_clip": 1.04487121, "balance_loss_mlp": 1.01656425, "epoch": 0.8364095472855168, "flos": 24785329230720.0, "grad_norm": 2.18109656368797, "language_loss": 0.80051899, "learning_rate": 2.740833443892874e-07, "loss": 0.82395136, "num_input_tokens_seen": 150285455, "step": 6956, "time_per_iteration": 2.7479331493377686 }, { "auxiliary_loss_clip": 0.01272564, "auxiliary_loss_mlp": 0.01022702, "balance_loss_clip": 1.04563057, "balance_loss_mlp": 1.01597822, "epoch": 0.8365297901761558, "flos": 22743412784640.0, "grad_norm": 1.9415049374115332, "language_loss": 0.79507583, "learning_rate": 2.7368988033404327e-07, "loss": 0.81802851, "num_input_tokens_seen": 150302970, "step": 6957, "time_per_iteration": 2.7203803062438965 }, { "auxiliary_loss_clip": 0.01320995, "auxiliary_loss_mlp": 0.01025512, "balance_loss_clip": 1.04589963, "balance_loss_mlp": 1.01893413, "epoch": 0.836650033066795, "flos": 28396003242240.0, "grad_norm": 1.8329169115275334, "language_loss": 0.84572268, "learning_rate": 2.732966781586712e-07, "loss": 0.86918783, "num_input_tokens_seen": 150322715, "step": 6958, "time_per_iteration": 2.7367193698883057 }, { "auxiliary_loss_clip": 0.01217872, "auxiliary_loss_mlp": 0.01022533, "balance_loss_clip": 1.04525709, "balance_loss_mlp": 1.0161258, "epoch": 0.836770275957434, "flos": 22236857233920.0, "grad_norm": 3.961298451414, "language_loss": 0.66943747, "learning_rate": 2.729037379228205e-07, "loss": 0.69184154, "num_input_tokens_seen": 150342900, "step": 6959, "time_per_iteration": 2.6965510845184326 }, { "auxiliary_loss_clip": 0.01271478, "auxiliary_loss_mlp": 0.01024728, "balance_loss_clip": 1.04782891, "balance_loss_mlp": 1.01814818, "epoch": 0.8368905188480731, "flos": 22491930689280.0, "grad_norm": 1.5551924515625721, "language_loss": 0.80670381, "learning_rate": 2.725110596860998e-07, "loss": 0.8296659, "num_input_tokens_seen": 150363580, "step": 6960, "time_per_iteration": 2.6930713653564453 }, { "auxiliary_loss_clip": 0.01370513, "auxiliary_loss_mlp": 0.01022537, "balance_loss_clip": 1.04532516, "balance_loss_mlp": 1.01567972, "epoch": 0.8370107617387123, "flos": 13370405287680.0, "grad_norm": 7.218823076356944, "language_loss": 0.70256764, "learning_rate": 2.7211864350807776e-07, "loss": 0.72649813, "num_input_tokens_seen": 150381780, "step": 6961, "time_per_iteration": 2.8124196529388428 }, { "auxiliary_loss_clip": 0.01174923, "auxiliary_loss_mlp": 0.01029766, "balance_loss_clip": 1.04952347, "balance_loss_mlp": 1.0227536, "epoch": 0.8371310046293513, "flos": 25261289372160.0, "grad_norm": 1.7090739433206692, "language_loss": 0.73978359, "learning_rate": 2.717264894482836e-07, "loss": 0.76183045, "num_input_tokens_seen": 150402120, "step": 6962, "time_per_iteration": 2.658801317214966 }, { "auxiliary_loss_clip": 0.01225653, "auxiliary_loss_mlp": 0.0102458, "balance_loss_clip": 1.04973149, "balance_loss_mlp": 1.01752591, "epoch": 0.8372512475199904, "flos": 19792705311360.0, "grad_norm": 4.465641354311493, "language_loss": 0.81336296, "learning_rate": 2.7133459756620646e-07, "loss": 0.83586526, "num_input_tokens_seen": 150419315, "step": 6963, "time_per_iteration": 2.65259051322937 }, { "auxiliary_loss_clip": 0.01222332, "auxiliary_loss_mlp": 0.01023279, "balance_loss_clip": 1.04743218, "balance_loss_mlp": 1.01630867, "epoch": 0.8373714904106295, "flos": 19391224020480.0, "grad_norm": 1.6716700607501263, "language_loss": 0.73809528, "learning_rate": 2.7094296792129733e-07, "loss": 0.76055145, "num_input_tokens_seen": 150438915, "step": 6964, "time_per_iteration": 2.636814832687378 }, { "auxiliary_loss_clip": 0.01223231, "auxiliary_loss_mlp": 0.01024486, "balance_loss_clip": 1.04788923, "balance_loss_mlp": 1.01756597, "epoch": 0.8374917333012686, "flos": 14975935401600.0, "grad_norm": 1.8030330556774168, "language_loss": 0.75590527, "learning_rate": 2.7055160057296424e-07, "loss": 0.77838242, "num_input_tokens_seen": 150456155, "step": 6965, "time_per_iteration": 2.7629599571228027 }, { "auxiliary_loss_clip": 0.01322368, "auxiliary_loss_mlp": 0.0102519, "balance_loss_clip": 1.04435384, "balance_loss_mlp": 1.01843643, "epoch": 0.8376119761919076, "flos": 30331839847680.0, "grad_norm": 1.822700580963271, "language_loss": 0.7256763, "learning_rate": 2.7016049558057896e-07, "loss": 0.74915189, "num_input_tokens_seen": 150478115, "step": 6966, "time_per_iteration": 2.7857275009155273 }, { "auxiliary_loss_clip": 0.01221335, "auxiliary_loss_mlp": 0.01032072, "balance_loss_clip": 1.04792166, "balance_loss_mlp": 1.02525365, "epoch": 0.8377322190825467, "flos": 29423336129280.0, "grad_norm": 1.9718118547578198, "language_loss": 0.7079103, "learning_rate": 2.6976965300347074e-07, "loss": 0.73044443, "num_input_tokens_seen": 150500725, "step": 6967, "time_per_iteration": 2.72890305519104 }, { "auxiliary_loss_clip": 0.01268329, "auxiliary_loss_mlp": 0.01024749, "balance_loss_clip": 1.04330051, "balance_loss_mlp": 1.01791239, "epoch": 0.8378524619731859, "flos": 26687086807680.0, "grad_norm": 2.6973108579610403, "language_loss": 0.69376159, "learning_rate": 2.693790729009309e-07, "loss": 0.71669233, "num_input_tokens_seen": 150522335, "step": 6968, "time_per_iteration": 2.716850757598877 }, { "auxiliary_loss_clip": 0.0127114, "auxiliary_loss_mlp": 0.01026268, "balance_loss_clip": 1.04580832, "balance_loss_mlp": 1.01918066, "epoch": 0.8379727048638249, "flos": 20703866636160.0, "grad_norm": 1.8717485992238072, "language_loss": 0.8841188, "learning_rate": 2.6898875533220946e-07, "loss": 0.90709287, "num_input_tokens_seen": 150541640, "step": 6969, "time_per_iteration": 3.5974512100219727 }, { "auxiliary_loss_clip": 0.01171934, "auxiliary_loss_mlp": 0.01024739, "balance_loss_clip": 1.05088997, "balance_loss_mlp": 1.01873064, "epoch": 0.838092947754464, "flos": 20084084438400.0, "grad_norm": 2.1335213990408057, "language_loss": 0.81687868, "learning_rate": 2.685987003565171e-07, "loss": 0.83884537, "num_input_tokens_seen": 150559680, "step": 6970, "time_per_iteration": 3.584362268447876 }, { "auxiliary_loss_clip": 0.01317565, "auxiliary_loss_mlp": 0.0102949, "balance_loss_clip": 1.04818892, "balance_loss_mlp": 1.02227235, "epoch": 0.8382131906451031, "flos": 18113270964480.0, "grad_norm": 2.804961155343125, "language_loss": 0.75315738, "learning_rate": 2.6820890803302566e-07, "loss": 0.7766279, "num_input_tokens_seen": 150575205, "step": 6971, "time_per_iteration": 2.715165615081787 }, { "auxiliary_loss_clip": 0.01178488, "auxiliary_loss_mlp": 0.01021864, "balance_loss_clip": 1.05163372, "balance_loss_mlp": 1.01472378, "epoch": 0.8383334335357422, "flos": 17092653920640.0, "grad_norm": 2.9343085191467364, "language_loss": 0.81961918, "learning_rate": 2.6781937842086557e-07, "loss": 0.84162271, "num_input_tokens_seen": 150593995, "step": 6972, "time_per_iteration": 2.6390252113342285 }, { "auxiliary_loss_clip": 0.01225919, "auxiliary_loss_mlp": 0.01026845, "balance_loss_clip": 1.04794931, "balance_loss_mlp": 1.02010655, "epoch": 0.8384536764263812, "flos": 20704728562560.0, "grad_norm": 1.9318507936087848, "language_loss": 0.67846859, "learning_rate": 2.6743011157912933e-07, "loss": 0.70099622, "num_input_tokens_seen": 150613715, "step": 6973, "time_per_iteration": 2.660090923309326 }, { "auxiliary_loss_clip": 0.01371437, "auxiliary_loss_mlp": 0.0102496, "balance_loss_clip": 1.03991938, "balance_loss_mlp": 1.01794124, "epoch": 0.8385739193170204, "flos": 28986842056320.0, "grad_norm": 1.7379823836881858, "language_loss": 0.65085596, "learning_rate": 2.6704110756686725e-07, "loss": 0.67481995, "num_input_tokens_seen": 150634540, "step": 6974, "time_per_iteration": 3.7116384506225586 }, { "auxiliary_loss_clip": 0.0127228, "auxiliary_loss_mlp": 0.02566719, "balance_loss_clip": 1.04372835, "balance_loss_mlp": 0.99988616, "epoch": 0.8386941622076595, "flos": 23438068882560.0, "grad_norm": 1.7516380599222245, "language_loss": 0.83848083, "learning_rate": 2.6665236644309085e-07, "loss": 0.87687087, "num_input_tokens_seen": 150654850, "step": 6975, "time_per_iteration": 2.7332377433776855 }, { "auxiliary_loss_clip": 0.01224296, "auxiliary_loss_mlp": 0.01022032, "balance_loss_clip": 1.04757726, "balance_loss_mlp": 1.01559138, "epoch": 0.8388144050982985, "flos": 23002724044800.0, "grad_norm": 1.9728370347649666, "language_loss": 0.79515129, "learning_rate": 2.662638882667727e-07, "loss": 0.81761456, "num_input_tokens_seen": 150673790, "step": 6976, "time_per_iteration": 2.625753164291382 }, { "auxiliary_loss_clip": 0.01177622, "auxiliary_loss_mlp": 0.01030087, "balance_loss_clip": 1.05081892, "balance_loss_mlp": 1.0229013, "epoch": 0.8389346479889377, "flos": 24280353878400.0, "grad_norm": 1.9138655212848097, "language_loss": 0.73313999, "learning_rate": 2.658756730968443e-07, "loss": 0.75521708, "num_input_tokens_seen": 150692255, "step": 6977, "time_per_iteration": 3.565002202987671 }, { "auxiliary_loss_clip": 0.01275834, "auxiliary_loss_mlp": 0.01030059, "balance_loss_clip": 1.04945409, "balance_loss_mlp": 1.02341938, "epoch": 0.8390548908795767, "flos": 21215019127680.0, "grad_norm": 2.224239056945468, "language_loss": 0.88242602, "learning_rate": 2.654877209921975e-07, "loss": 0.90548497, "num_input_tokens_seen": 150709790, "step": 6978, "time_per_iteration": 2.6831705570220947 }, { "auxiliary_loss_clip": 0.01380819, "auxiliary_loss_mlp": 0.01024394, "balance_loss_clip": 1.0418222, "balance_loss_mlp": 1.01725888, "epoch": 0.8391751337702158, "flos": 35627299332480.0, "grad_norm": 2.5948364632569363, "language_loss": 0.63262105, "learning_rate": 2.651000320116843e-07, "loss": 0.65667319, "num_input_tokens_seen": 150730675, "step": 6979, "time_per_iteration": 2.8210246562957764 }, { "auxiliary_loss_clip": 0.01320638, "auxiliary_loss_mlp": 0.02569674, "balance_loss_clip": 1.04398644, "balance_loss_mlp": 0.99988437, "epoch": 0.839295376660855, "flos": 21325229032320.0, "grad_norm": 1.8669515809627832, "language_loss": 0.7605021, "learning_rate": 2.647126062141163e-07, "loss": 0.79940522, "num_input_tokens_seen": 150749750, "step": 6980, "time_per_iteration": 2.7357916831970215 }, { "auxiliary_loss_clip": 0.01276647, "auxiliary_loss_mlp": 0.010265, "balance_loss_clip": 1.04478121, "balance_loss_mlp": 1.01973808, "epoch": 0.839415619551494, "flos": 18442535961600.0, "grad_norm": 2.1557227280286226, "language_loss": 0.84149146, "learning_rate": 2.643254436582669e-07, "loss": 0.86452293, "num_input_tokens_seen": 150769240, "step": 6981, "time_per_iteration": 2.6301429271698 }, { "auxiliary_loss_clip": 0.01374276, "auxiliary_loss_mlp": 0.01024045, "balance_loss_clip": 1.04412675, "balance_loss_mlp": 1.01696432, "epoch": 0.8395358624421331, "flos": 23221958705280.0, "grad_norm": 3.7438401411608724, "language_loss": 0.82820672, "learning_rate": 2.6393854440286743e-07, "loss": 0.8521899, "num_input_tokens_seen": 150788410, "step": 6982, "time_per_iteration": 2.765263557434082 }, { "auxiliary_loss_clip": 0.01173289, "auxiliary_loss_mlp": 0.01027072, "balance_loss_clip": 1.05044031, "balance_loss_mlp": 1.02002728, "epoch": 0.8396561053327722, "flos": 24381657210240.0, "grad_norm": 3.386382888284045, "language_loss": 0.70848775, "learning_rate": 2.6355190850661045e-07, "loss": 0.73049134, "num_input_tokens_seen": 150805245, "step": 6983, "time_per_iteration": 2.663285732269287 }, { "auxiliary_loss_clip": 0.0127361, "auxiliary_loss_mlp": 0.01030608, "balance_loss_clip": 1.04834092, "balance_loss_mlp": 1.02406931, "epoch": 0.8397763482234113, "flos": 22237755073920.0, "grad_norm": 18.212430697750005, "language_loss": 0.86777258, "learning_rate": 2.631655360281486e-07, "loss": 0.89081478, "num_input_tokens_seen": 150824920, "step": 6984, "time_per_iteration": 2.68686580657959 }, { "auxiliary_loss_clip": 0.01128741, "auxiliary_loss_mlp": 0.02569381, "balance_loss_clip": 1.0490551, "balance_loss_mlp": 0.99993467, "epoch": 0.8398965911140504, "flos": 22163743100160.0, "grad_norm": 1.9508929227996006, "language_loss": 0.6557619, "learning_rate": 2.6277942702609323e-07, "loss": 0.69274306, "num_input_tokens_seen": 150844400, "step": 6985, "time_per_iteration": 2.6494362354278564 }, { "auxiliary_loss_clip": 0.01322838, "auxiliary_loss_mlp": 0.01026559, "balance_loss_clip": 1.0456841, "balance_loss_mlp": 1.01970446, "epoch": 0.8400168340046895, "flos": 21542775753600.0, "grad_norm": 1.997301140240795, "language_loss": 0.87370908, "learning_rate": 2.623935815590186e-07, "loss": 0.89720309, "num_input_tokens_seen": 150862780, "step": 6986, "time_per_iteration": 2.733858108520508 }, { "auxiliary_loss_clip": 0.01273917, "auxiliary_loss_mlp": 0.01031303, "balance_loss_clip": 1.04770994, "balance_loss_mlp": 1.02384686, "epoch": 0.8401370768953286, "flos": 22491966602880.0, "grad_norm": 1.8685444116564762, "language_loss": 0.81142294, "learning_rate": 2.6200799968545516e-07, "loss": 0.83447516, "num_input_tokens_seen": 150883075, "step": 6987, "time_per_iteration": 2.6888091564178467 }, { "auxiliary_loss_clip": 0.01162284, "auxiliary_loss_mlp": 0.01002319, "balance_loss_clip": 1.00679183, "balance_loss_mlp": 1.0013535, "epoch": 0.8402573197859676, "flos": 59238890818560.0, "grad_norm": 0.788387543996549, "language_loss": 0.56391138, "learning_rate": 2.616226814638969e-07, "loss": 0.58555746, "num_input_tokens_seen": 150948180, "step": 6988, "time_per_iteration": 3.226602554321289 }, { "auxiliary_loss_clip": 0.01172568, "auxiliary_loss_mlp": 0.01028722, "balance_loss_clip": 1.04778385, "balance_loss_mlp": 1.02190351, "epoch": 0.8403775626766068, "flos": 22674608282880.0, "grad_norm": 1.9946322185106191, "language_loss": 0.77450222, "learning_rate": 2.612376269527954e-07, "loss": 0.79651511, "num_input_tokens_seen": 150967885, "step": 6989, "time_per_iteration": 2.6563477516174316 }, { "auxiliary_loss_clip": 0.01269543, "auxiliary_loss_mlp": 0.01027851, "balance_loss_clip": 1.04787612, "balance_loss_mlp": 1.02077842, "epoch": 0.8404978055672458, "flos": 19609704495360.0, "grad_norm": 1.9611520227239005, "language_loss": 0.67613661, "learning_rate": 2.608528362105635e-07, "loss": 0.69911057, "num_input_tokens_seen": 150987255, "step": 6990, "time_per_iteration": 2.6791694164276123 }, { "auxiliary_loss_clip": 0.01324208, "auxiliary_loss_mlp": 0.01025482, "balance_loss_clip": 1.04320812, "balance_loss_mlp": 1.0190537, "epoch": 0.8406180484578849, "flos": 27526929678720.0, "grad_norm": 1.919641763794121, "language_loss": 0.73652941, "learning_rate": 2.6046830929557374e-07, "loss": 0.76002634, "num_input_tokens_seen": 151006905, "step": 6991, "time_per_iteration": 2.753127098083496 }, { "auxiliary_loss_clip": 0.01320521, "auxiliary_loss_mlp": 0.01027666, "balance_loss_clip": 1.04544115, "balance_loss_mlp": 1.02032852, "epoch": 0.8407382913485241, "flos": 22127473342080.0, "grad_norm": 1.8723503984206455, "language_loss": 0.85150683, "learning_rate": 2.6008404626615776e-07, "loss": 0.87498868, "num_input_tokens_seen": 151025405, "step": 6992, "time_per_iteration": 2.6967387199401855 }, { "auxiliary_loss_clip": 0.01227167, "auxiliary_loss_mlp": 0.01018731, "balance_loss_clip": 1.04997849, "balance_loss_mlp": 1.01199913, "epoch": 0.8408585342391631, "flos": 13918473982080.0, "grad_norm": 2.4606987387828854, "language_loss": 0.73846138, "learning_rate": 2.597000471806092e-07, "loss": 0.76092041, "num_input_tokens_seen": 151041970, "step": 6993, "time_per_iteration": 2.6547951698303223 }, { "auxiliary_loss_clip": 0.01268935, "auxiliary_loss_mlp": 0.01026642, "balance_loss_clip": 1.04833055, "balance_loss_mlp": 1.01925683, "epoch": 0.8409787771298022, "flos": 20187865808640.0, "grad_norm": 2.3541202433929103, "language_loss": 0.73067451, "learning_rate": 2.593163120971793e-07, "loss": 0.75363022, "num_input_tokens_seen": 151060835, "step": 6994, "time_per_iteration": 2.66286301612854 }, { "auxiliary_loss_clip": 0.01363245, "auxiliary_loss_mlp": 0.01032122, "balance_loss_clip": 1.03809285, "balance_loss_mlp": 1.02571726, "epoch": 0.8410990200204413, "flos": 23142523777920.0, "grad_norm": 2.1363827360811265, "language_loss": 0.69088113, "learning_rate": 2.5893284107408165e-07, "loss": 0.71483481, "num_input_tokens_seen": 151078205, "step": 6995, "time_per_iteration": 4.578260660171509 }, { "auxiliary_loss_clip": 0.01377785, "auxiliary_loss_mlp": 0.01025701, "balance_loss_clip": 1.04744816, "balance_loss_mlp": 1.01901639, "epoch": 0.8412192629110804, "flos": 24027219757440.0, "grad_norm": 1.763646527505341, "language_loss": 0.78293908, "learning_rate": 2.5854963416948726e-07, "loss": 0.80697393, "num_input_tokens_seen": 151100470, "step": 6996, "time_per_iteration": 2.7925093173980713 }, { "auxiliary_loss_clip": 0.0127255, "auxiliary_loss_mlp": 0.01028352, "balance_loss_clip": 1.03877425, "balance_loss_mlp": 1.02161694, "epoch": 0.8413395058017195, "flos": 25591703604480.0, "grad_norm": 1.7662825373072963, "language_loss": 0.69668353, "learning_rate": 2.5816669144152816e-07, "loss": 0.71969259, "num_input_tokens_seen": 151121650, "step": 6997, "time_per_iteration": 2.7456624507904053 }, { "auxiliary_loss_clip": 0.01060813, "auxiliary_loss_mlp": 0.01001732, "balance_loss_clip": 1.00646877, "balance_loss_mlp": 1.00085545, "epoch": 0.8414597486923585, "flos": 63635396624640.0, "grad_norm": 0.8439460086025359, "language_loss": 0.6630246, "learning_rate": 2.5778401294829777e-07, "loss": 0.68365008, "num_input_tokens_seen": 151180390, "step": 6998, "time_per_iteration": 3.2494678497314453 }, { "auxiliary_loss_clip": 0.01220516, "auxiliary_loss_mlp": 0.02565063, "balance_loss_clip": 1.04838943, "balance_loss_mlp": 0.99988496, "epoch": 0.8415799915829977, "flos": 19098731571840.0, "grad_norm": 2.3949006516302718, "language_loss": 0.65442485, "learning_rate": 2.574015987478473e-07, "loss": 0.69228059, "num_input_tokens_seen": 151198520, "step": 6999, "time_per_iteration": 3.5540902614593506 }, { "auxiliary_loss_clip": 0.01277568, "auxiliary_loss_mlp": 0.0102818, "balance_loss_clip": 1.04695594, "balance_loss_mlp": 1.02024662, "epoch": 0.8417002344736367, "flos": 19821612781440.0, "grad_norm": 3.9319219911072585, "language_loss": 0.87255764, "learning_rate": 2.570194488981887e-07, "loss": 0.8956151, "num_input_tokens_seen": 151215065, "step": 7000, "time_per_iteration": 2.6505143642425537 }, { "auxiliary_loss_clip": 0.01061144, "auxiliary_loss_mlp": 0.01000913, "balance_loss_clip": 1.00694895, "balance_loss_mlp": 1.00001907, "epoch": 0.8418204773642758, "flos": 62161516834560.0, "grad_norm": 0.8515036852540623, "language_loss": 0.60252786, "learning_rate": 2.566375634572939e-07, "loss": 0.62314832, "num_input_tokens_seen": 151275705, "step": 7001, "time_per_iteration": 3.1394131183624268 }, { "auxiliary_loss_clip": 0.01324581, "auxiliary_loss_mlp": 0.01026405, "balance_loss_clip": 1.04245007, "balance_loss_mlp": 1.01885605, "epoch": 0.841940720254915, "flos": 17092905315840.0, "grad_norm": 2.2057648486642303, "language_loss": 0.762918, "learning_rate": 2.562559424830943e-07, "loss": 0.78642786, "num_input_tokens_seen": 151293665, "step": 7002, "time_per_iteration": 2.678239107131958 }, { "auxiliary_loss_clip": 0.01272933, "auxiliary_loss_mlp": 0.01024618, "balance_loss_clip": 1.04425132, "balance_loss_mlp": 1.01763868, "epoch": 0.842060963145554, "flos": 16283586026880.0, "grad_norm": 2.214067949972543, "language_loss": 0.70792389, "learning_rate": 2.5587458603348256e-07, "loss": 0.73089939, "num_input_tokens_seen": 151310955, "step": 7003, "time_per_iteration": 3.5713565349578857 }, { "auxiliary_loss_clip": 0.0132046, "auxiliary_loss_mlp": 0.01024659, "balance_loss_clip": 1.04487681, "balance_loss_mlp": 1.01766145, "epoch": 0.8421812060361931, "flos": 21908238681600.0, "grad_norm": 2.759303016300212, "language_loss": 0.84170234, "learning_rate": 2.554934941663085e-07, "loss": 0.86515355, "num_input_tokens_seen": 151328490, "step": 7004, "time_per_iteration": 2.777838706970215 }, { "auxiliary_loss_clip": 0.01322814, "auxiliary_loss_mlp": 0.01030847, "balance_loss_clip": 1.04545295, "balance_loss_mlp": 1.02286649, "epoch": 0.8423014489268322, "flos": 27777693502080.0, "grad_norm": 2.525990181291804, "language_loss": 0.73490113, "learning_rate": 2.5511266693938484e-07, "loss": 0.75843775, "num_input_tokens_seen": 151346950, "step": 7005, "time_per_iteration": 2.7402217388153076 }, { "auxiliary_loss_clip": 0.01270017, "auxiliary_loss_mlp": 0.01023077, "balance_loss_clip": 1.04753709, "balance_loss_mlp": 1.01537573, "epoch": 0.8424216918174713, "flos": 25117610970240.0, "grad_norm": 1.487879343494548, "language_loss": 0.77618456, "learning_rate": 2.547321044104822e-07, "loss": 0.79911542, "num_input_tokens_seen": 151368445, "step": 7006, "time_per_iteration": 2.672760009765625 }, { "auxiliary_loss_clip": 0.01176931, "auxiliary_loss_mlp": 0.01026483, "balance_loss_clip": 1.05105698, "balance_loss_mlp": 1.01955342, "epoch": 0.8425419347081103, "flos": 24748448941440.0, "grad_norm": 1.9254224989353974, "language_loss": 0.76935679, "learning_rate": 2.5435180663733113e-07, "loss": 0.7913909, "num_input_tokens_seen": 151388745, "step": 7007, "time_per_iteration": 2.6327967643737793 }, { "auxiliary_loss_clip": 0.0128045, "auxiliary_loss_mlp": 0.01026707, "balance_loss_clip": 1.04305148, "balance_loss_mlp": 1.01950336, "epoch": 0.8426621775987495, "flos": 24820916630400.0, "grad_norm": 2.4991797143518806, "language_loss": 0.71477592, "learning_rate": 2.539717736776241e-07, "loss": 0.73784751, "num_input_tokens_seen": 151404970, "step": 7008, "time_per_iteration": 2.759664297103882 }, { "auxiliary_loss_clip": 0.01219556, "auxiliary_loss_mlp": 0.01025179, "balance_loss_clip": 1.04795396, "balance_loss_mlp": 1.01851535, "epoch": 0.8427824204893886, "flos": 23550074467200.0, "grad_norm": 1.4331295574071072, "language_loss": 0.76660585, "learning_rate": 2.535920055890097e-07, "loss": 0.7890532, "num_input_tokens_seen": 151426265, "step": 7009, "time_per_iteration": 2.6285464763641357 }, { "auxiliary_loss_clip": 0.01370439, "auxiliary_loss_mlp": 0.0102396, "balance_loss_clip": 1.04150259, "balance_loss_mlp": 1.01620245, "epoch": 0.8429026633800276, "flos": 16143858120960.0, "grad_norm": 4.372176812756397, "language_loss": 0.64381343, "learning_rate": 2.5321250242910006e-07, "loss": 0.66775739, "num_input_tokens_seen": 151444180, "step": 7010, "time_per_iteration": 2.724247932434082 }, { "auxiliary_loss_clip": 0.01174377, "auxiliary_loss_mlp": 0.01026423, "balance_loss_clip": 1.05065119, "balance_loss_mlp": 1.01952624, "epoch": 0.8430229062706668, "flos": 22198540400640.0, "grad_norm": 1.6764383112069592, "language_loss": 0.86284482, "learning_rate": 2.5283326425546493e-07, "loss": 0.88485289, "num_input_tokens_seen": 151463290, "step": 7011, "time_per_iteration": 2.6184706687927246 }, { "auxiliary_loss_clip": 0.01313915, "auxiliary_loss_mlp": 0.01024831, "balance_loss_clip": 1.04704273, "balance_loss_mlp": 1.01826251, "epoch": 0.8431431491613058, "flos": 35330317683840.0, "grad_norm": 2.01018445198061, "language_loss": 0.69590378, "learning_rate": 2.5245429112563443e-07, "loss": 0.71929121, "num_input_tokens_seen": 151483965, "step": 7012, "time_per_iteration": 2.8739497661590576 }, { "auxiliary_loss_clip": 0.01222251, "auxiliary_loss_mlp": 0.01029287, "balance_loss_clip": 1.04909396, "balance_loss_mlp": 1.02220035, "epoch": 0.8432633920519449, "flos": 25812374808960.0, "grad_norm": 2.08715583637686, "language_loss": 0.82368481, "learning_rate": 2.5207558309709865e-07, "loss": 0.84620023, "num_input_tokens_seen": 151503700, "step": 7013, "time_per_iteration": 2.7245538234710693 }, { "auxiliary_loss_clip": 0.01228937, "auxiliary_loss_mlp": 0.02505906, "balance_loss_clip": 1.00650144, "balance_loss_mlp": 0.99985486, "epoch": 0.8433836349425841, "flos": 64959531592320.0, "grad_norm": 0.6554455748902237, "language_loss": 0.56227279, "learning_rate": 2.516971402273065e-07, "loss": 0.59962118, "num_input_tokens_seen": 151569765, "step": 7014, "time_per_iteration": 3.2818844318389893 }, { "auxiliary_loss_clip": 0.01269873, "auxiliary_loss_mlp": 0.01026662, "balance_loss_clip": 1.04389524, "balance_loss_mlp": 1.02048063, "epoch": 0.8435038778332231, "flos": 20229989483520.0, "grad_norm": 1.751948238260273, "language_loss": 0.67777121, "learning_rate": 2.513189625736687e-07, "loss": 0.70073658, "num_input_tokens_seen": 151586660, "step": 7015, "time_per_iteration": 2.6959545612335205 }, { "auxiliary_loss_clip": 0.01324923, "auxiliary_loss_mlp": 0.01029155, "balance_loss_clip": 1.04430807, "balance_loss_mlp": 1.02191949, "epoch": 0.8436241207238622, "flos": 20992229020800.0, "grad_norm": 19.661450113071776, "language_loss": 0.71724874, "learning_rate": 2.509410501935534e-07, "loss": 0.74078953, "num_input_tokens_seen": 151602295, "step": 7016, "time_per_iteration": 2.7230887413024902 }, { "auxiliary_loss_clip": 0.01276142, "auxiliary_loss_mlp": 0.01029303, "balance_loss_clip": 1.04824543, "balance_loss_mlp": 1.02185524, "epoch": 0.8437443636145013, "flos": 14682257804160.0, "grad_norm": 3.269475351381643, "language_loss": 0.75883532, "learning_rate": 2.5056340314429116e-07, "loss": 0.7818898, "num_input_tokens_seen": 151619760, "step": 7017, "time_per_iteration": 2.6632111072540283 }, { "auxiliary_loss_clip": 0.0137501, "auxiliary_loss_mlp": 0.01026721, "balance_loss_clip": 1.04142976, "balance_loss_mlp": 1.01942503, "epoch": 0.8438646065051404, "flos": 21608814908160.0, "grad_norm": 2.567824047552669, "language_loss": 0.80383009, "learning_rate": 2.5018602148316904e-07, "loss": 0.82784742, "num_input_tokens_seen": 151635795, "step": 7018, "time_per_iteration": 2.7147743701934814 }, { "auxiliary_loss_clip": 0.01171196, "auxiliary_loss_mlp": 0.01023101, "balance_loss_clip": 1.04981017, "balance_loss_mlp": 1.01670504, "epoch": 0.8439848493957794, "flos": 23289937194240.0, "grad_norm": 18.53417404336236, "language_loss": 0.80503786, "learning_rate": 2.498089052674359e-07, "loss": 0.82698083, "num_input_tokens_seen": 151653770, "step": 7019, "time_per_iteration": 2.657071352005005 }, { "auxiliary_loss_clip": 0.01223862, "auxiliary_loss_mlp": 0.01027945, "balance_loss_clip": 1.04973078, "balance_loss_mlp": 1.02121568, "epoch": 0.8441050922864186, "flos": 19719339782400.0, "grad_norm": 14.068440739014978, "language_loss": 0.75289905, "learning_rate": 2.494320545543007e-07, "loss": 0.77541715, "num_input_tokens_seen": 151673340, "step": 7020, "time_per_iteration": 2.701582908630371 }, { "auxiliary_loss_clip": 0.01178517, "auxiliary_loss_mlp": 0.01025299, "balance_loss_clip": 1.05079579, "balance_loss_mlp": 1.01831317, "epoch": 0.8442253351770577, "flos": 21835268202240.0, "grad_norm": 1.7468694521278074, "language_loss": 0.66574496, "learning_rate": 2.490554694009308e-07, "loss": 0.68778312, "num_input_tokens_seen": 151694205, "step": 7021, "time_per_iteration": 4.492710828781128 }, { "auxiliary_loss_clip": 0.01228034, "auxiliary_loss_mlp": 0.01024374, "balance_loss_clip": 1.04728425, "balance_loss_mlp": 1.01744175, "epoch": 0.8443455780676967, "flos": 34346365447680.0, "grad_norm": 1.7101307838423652, "language_loss": 0.78118527, "learning_rate": 2.4867914986445426e-07, "loss": 0.80370933, "num_input_tokens_seen": 151716595, "step": 7022, "time_per_iteration": 2.7526743412017822 }, { "auxiliary_loss_clip": 0.01275589, "auxiliary_loss_mlp": 0.0102441, "balance_loss_clip": 1.04333282, "balance_loss_mlp": 1.01736736, "epoch": 0.8444658209583359, "flos": 48214599281280.0, "grad_norm": 2.1338110082829305, "language_loss": 0.71531069, "learning_rate": 2.483030960019581e-07, "loss": 0.73831069, "num_input_tokens_seen": 151740525, "step": 7023, "time_per_iteration": 2.9122376441955566 }, { "auxiliary_loss_clip": 0.01277208, "auxiliary_loss_mlp": 0.00999927, "balance_loss_clip": 1.00748706, "balance_loss_mlp": 0.99894983, "epoch": 0.8445860638489749, "flos": 68484773105280.0, "grad_norm": 0.7440198633075089, "language_loss": 0.55386925, "learning_rate": 2.479273078704891e-07, "loss": 0.57664061, "num_input_tokens_seen": 151793890, "step": 7024, "time_per_iteration": 3.1329386234283447 }, { "auxiliary_loss_clip": 0.01261062, "auxiliary_loss_mlp": 0.00999435, "balance_loss_clip": 1.00709724, "balance_loss_mlp": 0.99850476, "epoch": 0.844706306739614, "flos": 62833331882880.0, "grad_norm": 0.7803348961499635, "language_loss": 0.64706492, "learning_rate": 2.475517855270552e-07, "loss": 0.66966987, "num_input_tokens_seen": 151853970, "step": 7025, "time_per_iteration": 4.179176092147827 }, { "auxiliary_loss_clip": 0.01172379, "auxiliary_loss_mlp": 0.01026629, "balance_loss_clip": 1.04960275, "balance_loss_mlp": 1.02002144, "epoch": 0.8448265496302532, "flos": 14976114969600.0, "grad_norm": 2.824545013286073, "language_loss": 0.7315191, "learning_rate": 2.4717652902862143e-07, "loss": 0.75350916, "num_input_tokens_seen": 151872945, "step": 7026, "time_per_iteration": 2.6281204223632812 }, { "auxiliary_loss_clip": 0.01174679, "auxiliary_loss_mlp": 0.01023627, "balance_loss_clip": 1.04672837, "balance_loss_mlp": 1.01684117, "epoch": 0.8449467925208922, "flos": 23441265192960.0, "grad_norm": 2.137664204080232, "language_loss": 0.81399393, "learning_rate": 2.4680153843211495e-07, "loss": 0.83597696, "num_input_tokens_seen": 151892875, "step": 7027, "time_per_iteration": 2.698202610015869 }, { "auxiliary_loss_clip": 0.01269919, "auxiliary_loss_mlp": 0.0102751, "balance_loss_clip": 1.04958105, "balance_loss_mlp": 1.01988029, "epoch": 0.8450670354115313, "flos": 22748045639040.0, "grad_norm": 1.8060796812083695, "language_loss": 0.72571415, "learning_rate": 2.464268137944212e-07, "loss": 0.74868846, "num_input_tokens_seen": 151914170, "step": 7028, "time_per_iteration": 2.724576950073242 }, { "auxiliary_loss_clip": 0.01367867, "auxiliary_loss_mlp": 0.01027092, "balance_loss_clip": 1.04366565, "balance_loss_mlp": 1.01925993, "epoch": 0.8451872783021703, "flos": 29825571605760.0, "grad_norm": 3.827668425624646, "language_loss": 0.78368664, "learning_rate": 2.46052355172385e-07, "loss": 0.80763626, "num_input_tokens_seen": 151932210, "step": 7029, "time_per_iteration": 2.7969510555267334 }, { "auxiliary_loss_clip": 0.01175522, "auxiliary_loss_mlp": 0.01027194, "balance_loss_clip": 1.05125213, "balance_loss_mlp": 1.01979423, "epoch": 0.8453075211928095, "flos": 21870029589120.0, "grad_norm": 1.8959078076364035, "language_loss": 0.74766886, "learning_rate": 2.456781626228128e-07, "loss": 0.769696, "num_input_tokens_seen": 151951715, "step": 7030, "time_per_iteration": 3.5684847831726074 }, { "auxiliary_loss_clip": 0.01276692, "auxiliary_loss_mlp": 0.02506066, "balance_loss_clip": 1.00620365, "balance_loss_mlp": 0.99988502, "epoch": 0.8454277640834486, "flos": 58751869288320.0, "grad_norm": 0.9183987759127491, "language_loss": 0.66192454, "learning_rate": 2.453042362024675e-07, "loss": 0.69975209, "num_input_tokens_seen": 152004960, "step": 7031, "time_per_iteration": 3.355557680130005 }, { "auxiliary_loss_clip": 0.01173112, "auxiliary_loss_mlp": 0.01023256, "balance_loss_clip": 1.04912174, "balance_loss_mlp": 1.01672328, "epoch": 0.8455480069740876, "flos": 27090076469760.0, "grad_norm": 2.445286158309845, "language_loss": 0.73289609, "learning_rate": 2.449305759680751e-07, "loss": 0.75485975, "num_input_tokens_seen": 152026285, "step": 7032, "time_per_iteration": 2.675636053085327 }, { "auxiliary_loss_clip": 0.01314747, "auxiliary_loss_mlp": 0.01023099, "balance_loss_clip": 1.04722559, "balance_loss_mlp": 1.01620889, "epoch": 0.8456682498647268, "flos": 27198670262400.0, "grad_norm": 1.392519197878043, "language_loss": 0.75306726, "learning_rate": 2.445571819763188e-07, "loss": 0.77644575, "num_input_tokens_seen": 152048585, "step": 7033, "time_per_iteration": 2.772468328475952 }, { "auxiliary_loss_clip": 0.01174426, "auxiliary_loss_mlp": 0.01029498, "balance_loss_clip": 1.05044591, "balance_loss_mlp": 1.02253318, "epoch": 0.8457884927553658, "flos": 20631901737600.0, "grad_norm": 3.104431144221861, "language_loss": 0.58782512, "learning_rate": 2.4418405428384227e-07, "loss": 0.60986435, "num_input_tokens_seen": 152068795, "step": 7034, "time_per_iteration": 2.6435599327087402 }, { "auxiliary_loss_clip": 0.01171819, "auxiliary_loss_mlp": 0.02569311, "balance_loss_clip": 1.04910398, "balance_loss_mlp": 0.999897, "epoch": 0.8459087356460049, "flos": 15299023259520.0, "grad_norm": 3.391276581792001, "language_loss": 0.71910119, "learning_rate": 2.4381119294724864e-07, "loss": 0.75651252, "num_input_tokens_seen": 152086240, "step": 7035, "time_per_iteration": 2.5936970710754395 }, { "auxiliary_loss_clip": 0.01171537, "auxiliary_loss_mlp": 0.0102362, "balance_loss_clip": 1.04809999, "balance_loss_mlp": 1.01695275, "epoch": 0.846028978536644, "flos": 18843155326080.0, "grad_norm": 2.055132960719797, "language_loss": 0.54088676, "learning_rate": 2.434385980231004e-07, "loss": 0.56283832, "num_input_tokens_seen": 152105080, "step": 7036, "time_per_iteration": 2.534133195877075 }, { "auxiliary_loss_clip": 0.01221676, "auxiliary_loss_mlp": 0.01022284, "balance_loss_clip": 1.04724967, "balance_loss_mlp": 1.0150423, "epoch": 0.8461492214272831, "flos": 52661740285440.0, "grad_norm": 1.698273843779693, "language_loss": 0.65497833, "learning_rate": 2.4306626956792043e-07, "loss": 0.67741793, "num_input_tokens_seen": 152130025, "step": 7037, "time_per_iteration": 2.897120475769043 }, { "auxiliary_loss_clip": 0.01222296, "auxiliary_loss_mlp": 0.01028602, "balance_loss_clip": 1.04604316, "balance_loss_mlp": 1.02147079, "epoch": 0.8462694643179222, "flos": 18588405093120.0, "grad_norm": 1.7449002619439167, "language_loss": 0.75543702, "learning_rate": 2.4269420763819017e-07, "loss": 0.777946, "num_input_tokens_seen": 152148070, "step": 7038, "time_per_iteration": 2.6249570846557617 }, { "auxiliary_loss_clip": 0.01219837, "auxiliary_loss_mlp": 0.01026435, "balance_loss_clip": 1.04804802, "balance_loss_mlp": 1.02012599, "epoch": 0.8463897072085613, "flos": 24387080163840.0, "grad_norm": 2.799760069310872, "language_loss": 0.83413142, "learning_rate": 2.4232241229035223e-07, "loss": 0.85659415, "num_input_tokens_seen": 152165825, "step": 7039, "time_per_iteration": 2.6393847465515137 }, { "auxiliary_loss_clip": 0.01116591, "auxiliary_loss_mlp": 0.0100118, "balance_loss_clip": 1.00695372, "balance_loss_mlp": 1.00028598, "epoch": 0.8465099500992004, "flos": 68702140258560.0, "grad_norm": 0.750931789776879, "language_loss": 0.56718463, "learning_rate": 2.419508835808064e-07, "loss": 0.58836234, "num_input_tokens_seen": 152222380, "step": 7040, "time_per_iteration": 3.1498489379882812 }, { "auxiliary_loss_clip": 0.01268759, "auxiliary_loss_mlp": 0.01023466, "balance_loss_clip": 1.04596996, "balance_loss_mlp": 1.01601875, "epoch": 0.8466301929898394, "flos": 13735724561280.0, "grad_norm": 2.0203198500148503, "language_loss": 0.63451236, "learning_rate": 2.415796215659134e-07, "loss": 0.65743458, "num_input_tokens_seen": 152239085, "step": 7041, "time_per_iteration": 2.633406400680542 }, { "auxiliary_loss_clip": 0.01326152, "auxiliary_loss_mlp": 0.01022035, "balance_loss_clip": 1.04126918, "balance_loss_mlp": 1.01528454, "epoch": 0.8467504358804786, "flos": 19241260738560.0, "grad_norm": 2.0747288407989455, "language_loss": 0.77343285, "learning_rate": 2.412086263019939e-07, "loss": 0.79691476, "num_input_tokens_seen": 152257110, "step": 7042, "time_per_iteration": 2.705965995788574 }, { "auxiliary_loss_clip": 0.0117012, "auxiliary_loss_mlp": 0.01020981, "balance_loss_clip": 1.04966211, "balance_loss_mlp": 1.01446342, "epoch": 0.8468706787711177, "flos": 21324115710720.0, "grad_norm": 1.8731607749859291, "language_loss": 0.80041558, "learning_rate": 2.408378978453276e-07, "loss": 0.8223266, "num_input_tokens_seen": 152277230, "step": 7043, "time_per_iteration": 2.596576452255249 }, { "auxiliary_loss_clip": 0.01115725, "auxiliary_loss_mlp": 0.01000555, "balance_loss_clip": 1.00675607, "balance_loss_mlp": 0.99973232, "epoch": 0.8469909216617567, "flos": 64877439058560.0, "grad_norm": 0.8339788756901579, "language_loss": 0.63944185, "learning_rate": 2.404674362521533e-07, "loss": 0.6606046, "num_input_tokens_seen": 152335725, "step": 7044, "time_per_iteration": 3.0944736003875732 }, { "auxiliary_loss_clip": 0.01218391, "auxiliary_loss_mlp": 0.01024254, "balance_loss_clip": 1.04819989, "balance_loss_mlp": 1.01761067, "epoch": 0.8471111645523959, "flos": 19280583152640.0, "grad_norm": 2.367279043849664, "language_loss": 0.74462247, "learning_rate": 2.4009724157866997e-07, "loss": 0.76704895, "num_input_tokens_seen": 152352785, "step": 7045, "time_per_iteration": 2.592853546142578 }, { "auxiliary_loss_clip": 0.01171457, "auxiliary_loss_mlp": 0.01024122, "balance_loss_clip": 1.04945791, "balance_loss_mlp": 1.0179317, "epoch": 0.8472314074430349, "flos": 22015826893440.0, "grad_norm": 2.2482424412831774, "language_loss": 0.76720268, "learning_rate": 2.3972731388103564e-07, "loss": 0.78915846, "num_input_tokens_seen": 152371265, "step": 7046, "time_per_iteration": 2.655508518218994 }, { "auxiliary_loss_clip": 0.0131118, "auxiliary_loss_mlp": 0.0099883, "balance_loss_clip": 1.00640678, "balance_loss_mlp": 0.99801332, "epoch": 0.847351650333674, "flos": 57882580243200.0, "grad_norm": 0.7970537624742845, "language_loss": 0.62386847, "learning_rate": 2.393576532153687e-07, "loss": 0.64696854, "num_input_tokens_seen": 152435050, "step": 7047, "time_per_iteration": 4.450084209442139 }, { "auxiliary_loss_clip": 0.01112105, "auxiliary_loss_mlp": 0.01000426, "balance_loss_clip": 1.00739622, "balance_loss_mlp": 0.99962157, "epoch": 0.8474718932243132, "flos": 41284238313600.0, "grad_norm": 0.9288163913910275, "language_loss": 0.57793653, "learning_rate": 2.389882596377453e-07, "loss": 0.59906185, "num_input_tokens_seen": 152489315, "step": 7048, "time_per_iteration": 3.3292903900146484 }, { "auxiliary_loss_clip": 0.01172059, "auxiliary_loss_mlp": 0.01026188, "balance_loss_clip": 1.04694867, "balance_loss_mlp": 1.01919389, "epoch": 0.8475921361149522, "flos": 38180906974080.0, "grad_norm": 1.8566012143090733, "language_loss": 0.76266086, "learning_rate": 2.386191332042031e-07, "loss": 0.78464329, "num_input_tokens_seen": 152511210, "step": 7049, "time_per_iteration": 2.7319445610046387 }, { "auxiliary_loss_clip": 0.01176753, "auxiliary_loss_mlp": 0.01027021, "balance_loss_clip": 1.05053067, "balance_loss_mlp": 1.01968646, "epoch": 0.8477123790055913, "flos": 25375054723200.0, "grad_norm": 2.161636181502183, "language_loss": 0.73024702, "learning_rate": 2.3825027397073794e-07, "loss": 0.75228471, "num_input_tokens_seen": 152531685, "step": 7050, "time_per_iteration": 2.5889928340911865 }, { "auxiliary_loss_clip": 0.01220271, "auxiliary_loss_mlp": 0.01026916, "balance_loss_clip": 1.04938316, "balance_loss_mlp": 1.01969826, "epoch": 0.8478326218962304, "flos": 30225185389440.0, "grad_norm": 2.6826475722981287, "language_loss": 0.66757149, "learning_rate": 2.3788168199330515e-07, "loss": 0.69004339, "num_input_tokens_seen": 152553245, "step": 7051, "time_per_iteration": 2.6991865634918213 }, { "auxiliary_loss_clip": 0.01261552, "auxiliary_loss_mlp": 0.0102642, "balance_loss_clip": 1.04032493, "balance_loss_mlp": 1.01946449, "epoch": 0.8479528647868695, "flos": 38213800853760.0, "grad_norm": 1.7272147691371265, "language_loss": 0.72620732, "learning_rate": 2.3751335732782074e-07, "loss": 0.74908698, "num_input_tokens_seen": 152574505, "step": 7052, "time_per_iteration": 3.7221755981445312 }, { "auxiliary_loss_clip": 0.01222864, "auxiliary_loss_mlp": 0.01023189, "balance_loss_clip": 1.0497905, "balance_loss_mlp": 1.01628137, "epoch": 0.8480731076775085, "flos": 20957790856320.0, "grad_norm": 2.12829469342397, "language_loss": 0.79492849, "learning_rate": 2.371453000301582e-07, "loss": 0.81738901, "num_input_tokens_seen": 152593190, "step": 7053, "time_per_iteration": 2.6205191612243652 }, { "auxiliary_loss_clip": 0.01316296, "auxiliary_loss_mlp": 0.01025798, "balance_loss_clip": 1.04492259, "balance_loss_mlp": 1.0188539, "epoch": 0.8481933505681477, "flos": 32596510487040.0, "grad_norm": 1.8050444995430732, "language_loss": 0.74613315, "learning_rate": 2.3677751015615222e-07, "loss": 0.76955402, "num_input_tokens_seen": 152615265, "step": 7054, "time_per_iteration": 2.8097801208496094 }, { "auxiliary_loss_clip": 0.01269304, "auxiliary_loss_mlp": 0.01026359, "balance_loss_clip": 1.04421675, "balance_loss_mlp": 1.0193404, "epoch": 0.8483135934587868, "flos": 20741177888640.0, "grad_norm": 2.1291257512002546, "language_loss": 0.85623431, "learning_rate": 2.3640998776159593e-07, "loss": 0.87919092, "num_input_tokens_seen": 152632770, "step": 7055, "time_per_iteration": 3.8510067462921143 }, { "auxiliary_loss_clip": 0.01273455, "auxiliary_loss_mlp": 0.01026607, "balance_loss_clip": 1.04871285, "balance_loss_mlp": 1.02015793, "epoch": 0.8484338363494258, "flos": 21653057485440.0, "grad_norm": 1.8675466290461138, "language_loss": 0.81291497, "learning_rate": 2.3604273290224253e-07, "loss": 0.83591557, "num_input_tokens_seen": 152653485, "step": 7056, "time_per_iteration": 2.6814444065093994 }, { "auxiliary_loss_clip": 0.01272608, "auxiliary_loss_mlp": 0.01029617, "balance_loss_clip": 1.04846072, "balance_loss_mlp": 1.02255416, "epoch": 0.848554079240065, "flos": 15013964926080.0, "grad_norm": 2.2880913266947966, "language_loss": 0.7457875, "learning_rate": 2.356757456338039e-07, "loss": 0.7688098, "num_input_tokens_seen": 152670970, "step": 7057, "time_per_iteration": 2.670318365097046 }, { "auxiliary_loss_clip": 0.01165112, "auxiliary_loss_mlp": 0.01007557, "balance_loss_clip": 1.00929511, "balance_loss_mlp": 1.00669324, "epoch": 0.848674322130704, "flos": 68060453742720.0, "grad_norm": 0.7667875660343402, "language_loss": 0.58988398, "learning_rate": 2.3530902601195147e-07, "loss": 0.61161065, "num_input_tokens_seen": 152739460, "step": 7058, "time_per_iteration": 3.360110282897949 }, { "auxiliary_loss_clip": 0.01221975, "auxiliary_loss_mlp": 0.01028292, "balance_loss_clip": 1.04710054, "balance_loss_mlp": 1.02098179, "epoch": 0.8487945650213431, "flos": 18475788977280.0, "grad_norm": 2.1863254123718496, "language_loss": 0.7913053, "learning_rate": 2.34942574092317e-07, "loss": 0.81380802, "num_input_tokens_seen": 152754710, "step": 7059, "time_per_iteration": 2.6469030380249023 }, { "auxiliary_loss_clip": 0.01227078, "auxiliary_loss_mlp": 0.01028633, "balance_loss_clip": 1.04926658, "balance_loss_mlp": 1.02211785, "epoch": 0.8489148079119821, "flos": 23473189405440.0, "grad_norm": 1.9431977892643022, "language_loss": 0.76787102, "learning_rate": 2.3457638993049045e-07, "loss": 0.79042816, "num_input_tokens_seen": 152772700, "step": 7060, "time_per_iteration": 2.6472175121307373 }, { "auxiliary_loss_clip": 0.01417451, "auxiliary_loss_mlp": 0.0102656, "balance_loss_clip": 1.04466748, "balance_loss_mlp": 1.01840878, "epoch": 0.8490350508026213, "flos": 19937604775680.0, "grad_norm": 2.906907614936066, "language_loss": 0.63944197, "learning_rate": 2.3421047358202252e-07, "loss": 0.66388214, "num_input_tokens_seen": 152791550, "step": 7061, "time_per_iteration": 2.8119475841522217 }, { "auxiliary_loss_clip": 0.01225113, "auxiliary_loss_mlp": 0.01024499, "balance_loss_clip": 1.04865813, "balance_loss_mlp": 1.01719451, "epoch": 0.8491552936932604, "flos": 24279958828800.0, "grad_norm": 3.2270693414447145, "language_loss": 0.83541703, "learning_rate": 2.3384482510242144e-07, "loss": 0.8579132, "num_input_tokens_seen": 152809410, "step": 7062, "time_per_iteration": 2.634420394897461 }, { "auxiliary_loss_clip": 0.01176026, "auxiliary_loss_mlp": 0.01021986, "balance_loss_clip": 1.04853547, "balance_loss_mlp": 1.01537621, "epoch": 0.8492755365838994, "flos": 22522526098560.0, "grad_norm": 2.930366293754969, "language_loss": 0.77364743, "learning_rate": 2.3347944454715575e-07, "loss": 0.79562759, "num_input_tokens_seen": 152825800, "step": 7063, "time_per_iteration": 2.6298670768737793 }, { "auxiliary_loss_clip": 0.01178344, "auxiliary_loss_mlp": 0.01025021, "balance_loss_clip": 1.0513469, "balance_loss_mlp": 1.01742744, "epoch": 0.8493957794745386, "flos": 26980441182720.0, "grad_norm": 2.1532929876027302, "language_loss": 0.67453206, "learning_rate": 2.331143319716542e-07, "loss": 0.69656569, "num_input_tokens_seen": 152845330, "step": 7064, "time_per_iteration": 2.6348681449890137 }, { "auxiliary_loss_clip": 0.01227566, "auxiliary_loss_mlp": 0.01025154, "balance_loss_clip": 1.04706931, "balance_loss_mlp": 1.01811433, "epoch": 0.8495160223651776, "flos": 29861985018240.0, "grad_norm": 1.8834869232316611, "language_loss": 0.6584909, "learning_rate": 2.3274948743130363e-07, "loss": 0.68101811, "num_input_tokens_seen": 152865165, "step": 7065, "time_per_iteration": 2.765509605407715 }, { "auxiliary_loss_clip": 0.01173588, "auxiliary_loss_mlp": 0.01026771, "balance_loss_clip": 1.04769576, "balance_loss_mlp": 1.01982069, "epoch": 0.8496362652558167, "flos": 23075443128960.0, "grad_norm": 1.774954211679828, "language_loss": 0.79605013, "learning_rate": 2.3238491098145085e-07, "loss": 0.81805372, "num_input_tokens_seen": 152884695, "step": 7066, "time_per_iteration": 2.665830612182617 }, { "auxiliary_loss_clip": 0.01222708, "auxiliary_loss_mlp": 0.01031656, "balance_loss_clip": 1.04812944, "balance_loss_mlp": 1.02402091, "epoch": 0.8497565081464559, "flos": 14609107756800.0, "grad_norm": 2.142410292985417, "language_loss": 0.73051018, "learning_rate": 2.3202060267740141e-07, "loss": 0.75305378, "num_input_tokens_seen": 152902220, "step": 7067, "time_per_iteration": 2.5768885612487793 }, { "auxiliary_loss_clip": 0.01367016, "auxiliary_loss_mlp": 0.01020603, "balance_loss_clip": 1.03896475, "balance_loss_mlp": 1.01373005, "epoch": 0.8498767510370949, "flos": 21136446126720.0, "grad_norm": 2.5047945058907577, "language_loss": 0.76697576, "learning_rate": 2.3165656257442044e-07, "loss": 0.79085189, "num_input_tokens_seen": 152920740, "step": 7068, "time_per_iteration": 2.765084743499756 }, { "auxiliary_loss_clip": 0.01220013, "auxiliary_loss_mlp": 0.01027177, "balance_loss_clip": 1.04862571, "balance_loss_mlp": 1.02090406, "epoch": 0.849996993927734, "flos": 23654538195840.0, "grad_norm": 2.1646985244528896, "language_loss": 0.90028191, "learning_rate": 2.31292790727734e-07, "loss": 0.92275381, "num_input_tokens_seen": 152938305, "step": 7069, "time_per_iteration": 2.682459831237793 }, { "auxiliary_loss_clip": 0.01169947, "auxiliary_loss_mlp": 0.01024775, "balance_loss_clip": 1.04758954, "balance_loss_mlp": 1.01810873, "epoch": 0.8501172368183731, "flos": 20558069331840.0, "grad_norm": 2.3764304718083324, "language_loss": 0.80227774, "learning_rate": 2.3092928719252392e-07, "loss": 0.82422495, "num_input_tokens_seen": 152956705, "step": 7070, "time_per_iteration": 2.6079750061035156 }, { "auxiliary_loss_clip": 0.01221534, "auxiliary_loss_mlp": 0.01024012, "balance_loss_clip": 1.04561257, "balance_loss_mlp": 1.01758969, "epoch": 0.8502374797090122, "flos": 22272624201600.0, "grad_norm": 2.130889831247209, "language_loss": 0.78026599, "learning_rate": 2.3056605202393475e-07, "loss": 0.80272144, "num_input_tokens_seen": 152974265, "step": 7071, "time_per_iteration": 2.647853136062622 }, { "auxiliary_loss_clip": 0.01217427, "auxiliary_loss_mlp": 0.02568741, "balance_loss_clip": 1.04304266, "balance_loss_mlp": 0.99988377, "epoch": 0.8503577225996513, "flos": 23659817495040.0, "grad_norm": 2.1271434964637654, "language_loss": 0.67112231, "learning_rate": 2.3020308527706888e-07, "loss": 0.70898396, "num_input_tokens_seen": 152993680, "step": 7072, "time_per_iteration": 2.8079164028167725 }, { "auxiliary_loss_clip": 0.01276332, "auxiliary_loss_mlp": 0.01020551, "balance_loss_clip": 1.04366601, "balance_loss_mlp": 1.01404214, "epoch": 0.8504779654902904, "flos": 26758513002240.0, "grad_norm": 1.7842819830960281, "language_loss": 0.89419377, "learning_rate": 2.2984038700698715e-07, "loss": 0.91716266, "num_input_tokens_seen": 153012990, "step": 7073, "time_per_iteration": 4.5713207721710205 }, { "auxiliary_loss_clip": 0.01220351, "auxiliary_loss_mlp": 0.01027733, "balance_loss_clip": 1.04939842, "balance_loss_mlp": 1.02101552, "epoch": 0.8505982083809295, "flos": 26468247196800.0, "grad_norm": 4.556003432840408, "language_loss": 0.78823805, "learning_rate": 2.2947795726871222e-07, "loss": 0.81071889, "num_input_tokens_seen": 153034015, "step": 7074, "time_per_iteration": 2.73752760887146 }, { "auxiliary_loss_clip": 0.012284, "auxiliary_loss_mlp": 0.02566997, "balance_loss_clip": 1.05421436, "balance_loss_mlp": 0.99989885, "epoch": 0.8507184512715685, "flos": 20303390926080.0, "grad_norm": 2.07628640277676, "language_loss": 0.85918778, "learning_rate": 2.2911579611722253e-07, "loss": 0.89714175, "num_input_tokens_seen": 153053160, "step": 7075, "time_per_iteration": 2.776139736175537 }, { "auxiliary_loss_clip": 0.01169582, "auxiliary_loss_mlp": 0.01027076, "balance_loss_clip": 1.04483366, "balance_loss_mlp": 1.02040327, "epoch": 0.8508386941622077, "flos": 19025186474880.0, "grad_norm": 2.5221136370891144, "language_loss": 0.87282604, "learning_rate": 2.2875390360745905e-07, "loss": 0.89479268, "num_input_tokens_seen": 153072565, "step": 7076, "time_per_iteration": 2.654996156692505 }, { "auxiliary_loss_clip": 0.01329641, "auxiliary_loss_mlp": 0.01023537, "balance_loss_clip": 1.04597449, "balance_loss_mlp": 1.01598227, "epoch": 0.8509589370528468, "flos": 16433405654400.0, "grad_norm": 1.5735106493761657, "language_loss": 0.77684915, "learning_rate": 2.2839227979432008e-07, "loss": 0.80038089, "num_input_tokens_seen": 153090215, "step": 7077, "time_per_iteration": 2.7534525394439697 }, { "auxiliary_loss_clip": 0.01271128, "auxiliary_loss_mlp": 0.01025246, "balance_loss_clip": 1.0446465, "balance_loss_mlp": 1.01807249, "epoch": 0.8510791799434858, "flos": 18259714713600.0, "grad_norm": 1.8835502844344914, "language_loss": 0.85236758, "learning_rate": 2.2803092473266373e-07, "loss": 0.87533128, "num_input_tokens_seen": 153107740, "step": 7078, "time_per_iteration": 3.625230312347412 }, { "auxiliary_loss_clip": 0.01176277, "auxiliary_loss_mlp": 0.01033802, "balance_loss_clip": 1.05009484, "balance_loss_mlp": 1.02709377, "epoch": 0.851199422834125, "flos": 23441372933760.0, "grad_norm": 3.448770308203283, "language_loss": 0.8675977, "learning_rate": 2.2766983847730724e-07, "loss": 0.88969851, "num_input_tokens_seen": 153127410, "step": 7079, "time_per_iteration": 2.7156238555908203 }, { "auxiliary_loss_clip": 0.01330869, "auxiliary_loss_mlp": 0.01028409, "balance_loss_clip": 1.0445981, "balance_loss_mlp": 1.02095222, "epoch": 0.851319665724764, "flos": 16289404030080.0, "grad_norm": 2.0787063728805646, "language_loss": 0.66863507, "learning_rate": 2.2730902108302663e-07, "loss": 0.69222784, "num_input_tokens_seen": 153144325, "step": 7080, "time_per_iteration": 3.633113384246826 }, { "auxiliary_loss_clip": 0.01270224, "auxiliary_loss_mlp": 0.01026835, "balance_loss_clip": 1.04260278, "balance_loss_mlp": 1.01966178, "epoch": 0.8514399086154031, "flos": 18989347680000.0, "grad_norm": 1.7257088339299664, "language_loss": 0.68729913, "learning_rate": 2.269484726045583e-07, "loss": 0.71026981, "num_input_tokens_seen": 153163240, "step": 7081, "time_per_iteration": 2.6334805488586426 }, { "auxiliary_loss_clip": 0.01325988, "auxiliary_loss_mlp": 0.01024911, "balance_loss_clip": 1.04660213, "balance_loss_mlp": 1.01849198, "epoch": 0.8515601515060423, "flos": 24571194301440.0, "grad_norm": 2.2319178782734954, "language_loss": 0.79141092, "learning_rate": 2.2658819309659672e-07, "loss": 0.81491989, "num_input_tokens_seen": 153183440, "step": 7082, "time_per_iteration": 2.9081461429595947 }, { "auxiliary_loss_clip": 0.01269729, "auxiliary_loss_mlp": 0.01025344, "balance_loss_clip": 1.0485419, "balance_loss_mlp": 1.01893926, "epoch": 0.8516803943966813, "flos": 19529443555200.0, "grad_norm": 2.437377520858372, "language_loss": 0.84981281, "learning_rate": 2.2622818261379706e-07, "loss": 0.87276357, "num_input_tokens_seen": 153200460, "step": 7083, "time_per_iteration": 2.757622003555298 }, { "auxiliary_loss_clip": 0.01273975, "auxiliary_loss_mlp": 0.01026798, "balance_loss_clip": 1.04497063, "balance_loss_mlp": 1.01950216, "epoch": 0.8518006372873204, "flos": 20265792364800.0, "grad_norm": 1.844362731171038, "language_loss": 0.7478922, "learning_rate": 2.2586844121077142e-07, "loss": 0.77089989, "num_input_tokens_seen": 153218970, "step": 7084, "time_per_iteration": 2.666950225830078 }, { "auxiliary_loss_clip": 0.01276354, "auxiliary_loss_mlp": 0.0102919, "balance_loss_clip": 1.04291058, "balance_loss_mlp": 1.02214789, "epoch": 0.8519208801779595, "flos": 24133227770880.0, "grad_norm": 1.956154870066799, "language_loss": 0.71998638, "learning_rate": 2.2550896894209215e-07, "loss": 0.74304187, "num_input_tokens_seen": 153238485, "step": 7085, "time_per_iteration": 2.8709681034088135 }, { "auxiliary_loss_clip": 0.01324199, "auxiliary_loss_mlp": 0.00998193, "balance_loss_clip": 1.00737154, "balance_loss_mlp": 0.99731666, "epoch": 0.8520411230685986, "flos": 63035223252480.0, "grad_norm": 0.678144030794841, "language_loss": 0.56640661, "learning_rate": 2.2514976586229184e-07, "loss": 0.5896306, "num_input_tokens_seen": 153306430, "step": 7086, "time_per_iteration": 3.408799409866333 }, { "auxiliary_loss_clip": 0.01115479, "auxiliary_loss_mlp": 0.00999131, "balance_loss_clip": 1.00790191, "balance_loss_mlp": 0.99827832, "epoch": 0.8521613659592376, "flos": 65836865283840.0, "grad_norm": 0.7537808304098034, "language_loss": 0.54601419, "learning_rate": 2.247908320258609e-07, "loss": 0.56716025, "num_input_tokens_seen": 153366520, "step": 7087, "time_per_iteration": 3.1665263175964355 }, { "auxiliary_loss_clip": 0.01360968, "auxiliary_loss_mlp": 0.01025687, "balance_loss_clip": 1.04422212, "balance_loss_mlp": 1.01775408, "epoch": 0.8522816088498768, "flos": 23112323418240.0, "grad_norm": 2.251356618527483, "language_loss": 0.80019855, "learning_rate": 2.2443216748724914e-07, "loss": 0.82406515, "num_input_tokens_seen": 153387230, "step": 7088, "time_per_iteration": 2.7077419757843018 }, { "auxiliary_loss_clip": 0.01226542, "auxiliary_loss_mlp": 0.02565769, "balance_loss_clip": 1.0500865, "balance_loss_mlp": 0.99988735, "epoch": 0.8524018517405159, "flos": 31758140073600.0, "grad_norm": 1.8211191575125913, "language_loss": 0.74291831, "learning_rate": 2.2407377230086588e-07, "loss": 0.78084141, "num_input_tokens_seen": 153409585, "step": 7089, "time_per_iteration": 2.7235043048858643 }, { "auxiliary_loss_clip": 0.01325378, "auxiliary_loss_mlp": 0.01028383, "balance_loss_clip": 1.04947329, "balance_loss_mlp": 1.02129626, "epoch": 0.8525220946311549, "flos": 18690318956160.0, "grad_norm": 2.110068038332757, "language_loss": 0.83272111, "learning_rate": 2.23715646521079e-07, "loss": 0.85625875, "num_input_tokens_seen": 153427105, "step": 7090, "time_per_iteration": 2.720362424850464 }, { "auxiliary_loss_clip": 0.01227497, "auxiliary_loss_mlp": 0.02570626, "balance_loss_clip": 1.04836702, "balance_loss_mlp": 0.999897, "epoch": 0.852642337521794, "flos": 21793216354560.0, "grad_norm": 1.7384142871118957, "language_loss": 0.84074992, "learning_rate": 2.2335779020221724e-07, "loss": 0.87873113, "num_input_tokens_seen": 153443725, "step": 7091, "time_per_iteration": 2.692399501800537 }, { "auxiliary_loss_clip": 0.01128022, "auxiliary_loss_mlp": 0.00999899, "balance_loss_clip": 1.02070093, "balance_loss_mlp": 0.99896282, "epoch": 0.8527625804124331, "flos": 69040132260480.0, "grad_norm": 0.8006565790539628, "language_loss": 0.56422448, "learning_rate": 2.2300020339856497e-07, "loss": 0.5855037, "num_input_tokens_seen": 153506410, "step": 7092, "time_per_iteration": 3.2376606464385986 }, { "auxiliary_loss_clip": 0.01267247, "auxiliary_loss_mlp": 0.01031076, "balance_loss_clip": 1.04560483, "balance_loss_mlp": 1.02430773, "epoch": 0.8528828233030722, "flos": 26979399688320.0, "grad_norm": 2.4590777249369786, "language_loss": 0.77742898, "learning_rate": 2.2264288616436966e-07, "loss": 0.80041218, "num_input_tokens_seen": 153526665, "step": 7093, "time_per_iteration": 2.6055119037628174 }, { "auxiliary_loss_clip": 0.01265987, "auxiliary_loss_mlp": 0.0102582, "balance_loss_clip": 1.04556632, "balance_loss_mlp": 1.01880157, "epoch": 0.8530030661937112, "flos": 17487598936320.0, "grad_norm": 1.9896289482356657, "language_loss": 0.72248614, "learning_rate": 2.222858385538351e-07, "loss": 0.74540424, "num_input_tokens_seen": 153543465, "step": 7094, "time_per_iteration": 2.579873561859131 }, { "auxiliary_loss_clip": 0.0122246, "auxiliary_loss_mlp": 0.01021778, "balance_loss_clip": 1.04744673, "balance_loss_mlp": 1.0152868, "epoch": 0.8531233090843504, "flos": 22160798184960.0, "grad_norm": 1.8164378744164682, "language_loss": 0.68116409, "learning_rate": 2.2192906062112527e-07, "loss": 0.70360649, "num_input_tokens_seen": 153563340, "step": 7095, "time_per_iteration": 2.541853904724121 }, { "auxiliary_loss_clip": 0.01171234, "auxiliary_loss_mlp": 0.01023104, "balance_loss_clip": 1.04728663, "balance_loss_mlp": 1.01616335, "epoch": 0.8532435519749895, "flos": 37635388145280.0, "grad_norm": 2.2832168618450015, "language_loss": 0.70663983, "learning_rate": 2.2157255242036377e-07, "loss": 0.72858322, "num_input_tokens_seen": 153587005, "step": 7096, "time_per_iteration": 2.6452081203460693 }, { "auxiliary_loss_clip": 0.01314037, "auxiliary_loss_mlp": 0.01029064, "balance_loss_clip": 1.04405653, "balance_loss_mlp": 1.02219772, "epoch": 0.8533637948656285, "flos": 21398163598080.0, "grad_norm": 1.62029700284662, "language_loss": 0.74289089, "learning_rate": 2.2121631400563135e-07, "loss": 0.7663219, "num_input_tokens_seen": 153606835, "step": 7097, "time_per_iteration": 2.6063220500946045 }, { "auxiliary_loss_clip": 0.01111299, "auxiliary_loss_mlp": 0.01008965, "balance_loss_clip": 1.00839186, "balance_loss_mlp": 1.00808299, "epoch": 0.8534840377562677, "flos": 53345122490880.0, "grad_norm": 0.7592032689062704, "language_loss": 0.52954614, "learning_rate": 2.208603454309701e-07, "loss": 0.55074877, "num_input_tokens_seen": 153664925, "step": 7098, "time_per_iteration": 3.1698896884918213 }, { "auxiliary_loss_clip": 0.01366638, "auxiliary_loss_mlp": 0.01032939, "balance_loss_clip": 1.04185593, "balance_loss_mlp": 1.02522647, "epoch": 0.8536042806469067, "flos": 20814148368000.0, "grad_norm": 2.1585341802272033, "language_loss": 0.70945251, "learning_rate": 2.2050464675037994e-07, "loss": 0.73344827, "num_input_tokens_seen": 153683550, "step": 7099, "time_per_iteration": 3.6095168590545654 }, { "auxiliary_loss_clip": 0.01267983, "auxiliary_loss_mlp": 0.01027794, "balance_loss_clip": 1.04589295, "balance_loss_mlp": 1.02064109, "epoch": 0.8537245235375458, "flos": 24681368292480.0, "grad_norm": 7.181630395265267, "language_loss": 0.73796093, "learning_rate": 2.2014921801782016e-07, "loss": 0.76091874, "num_input_tokens_seen": 153703040, "step": 7100, "time_per_iteration": 3.6590042114257812 }, { "auxiliary_loss_clip": 0.01271915, "auxiliary_loss_mlp": 0.01030073, "balance_loss_clip": 1.04237545, "balance_loss_mlp": 1.02336776, "epoch": 0.853844766428185, "flos": 24384817607040.0, "grad_norm": 2.629718294824743, "language_loss": 0.73770517, "learning_rate": 2.1979405928720872e-07, "loss": 0.76072514, "num_input_tokens_seen": 153722695, "step": 7101, "time_per_iteration": 2.7108728885650635 }, { "auxiliary_loss_clip": 0.01275008, "auxiliary_loss_mlp": 0.01025824, "balance_loss_clip": 1.04502988, "balance_loss_mlp": 1.01967001, "epoch": 0.853965009318824, "flos": 20955707867520.0, "grad_norm": 1.6096564855310138, "language_loss": 0.79291463, "learning_rate": 2.1943917061242257e-07, "loss": 0.81592292, "num_input_tokens_seen": 153742550, "step": 7102, "time_per_iteration": 2.7051103115081787 }, { "auxiliary_loss_clip": 0.01230972, "auxiliary_loss_mlp": 0.02570151, "balance_loss_clip": 1.05009687, "balance_loss_mlp": 0.99987674, "epoch": 0.8540852522094631, "flos": 24201816791040.0, "grad_norm": 1.6003671746342971, "language_loss": 0.66649556, "learning_rate": 2.1908455204729903e-07, "loss": 0.70450675, "num_input_tokens_seen": 153761700, "step": 7103, "time_per_iteration": 3.577935218811035 }, { "auxiliary_loss_clip": 0.01274819, "auxiliary_loss_mlp": 0.01024092, "balance_loss_clip": 1.04379809, "balance_loss_mlp": 1.01718986, "epoch": 0.8542054951001022, "flos": 25082921410560.0, "grad_norm": 2.5481643936437055, "language_loss": 0.7832607, "learning_rate": 2.1873020364563265e-07, "loss": 0.80624974, "num_input_tokens_seen": 153780765, "step": 7104, "time_per_iteration": 2.6990814208984375 }, { "auxiliary_loss_clip": 0.01218328, "auxiliary_loss_mlp": 0.01023212, "balance_loss_clip": 1.04697657, "balance_loss_mlp": 1.01599348, "epoch": 0.8543257379907413, "flos": 24316551809280.0, "grad_norm": 2.2696408756559534, "language_loss": 0.76105672, "learning_rate": 2.183761254611789e-07, "loss": 0.78347212, "num_input_tokens_seen": 153801090, "step": 7105, "time_per_iteration": 2.6332359313964844 }, { "auxiliary_loss_clip": 0.01222038, "auxiliary_loss_mlp": 0.0102005, "balance_loss_clip": 1.04953182, "balance_loss_mlp": 1.01351118, "epoch": 0.8544459808813804, "flos": 55286630467200.0, "grad_norm": 2.508116020139061, "language_loss": 0.70596993, "learning_rate": 2.1802231754764987e-07, "loss": 0.72839081, "num_input_tokens_seen": 153826530, "step": 7106, "time_per_iteration": 3.790107488632202 }, { "auxiliary_loss_clip": 0.01272113, "auxiliary_loss_mlp": 0.01027747, "balance_loss_clip": 1.04407167, "balance_loss_mlp": 1.02053523, "epoch": 0.8545662237720195, "flos": 25776248705280.0, "grad_norm": 1.9080476286306662, "language_loss": 0.76438308, "learning_rate": 2.17668779958718e-07, "loss": 0.78738171, "num_input_tokens_seen": 153849110, "step": 7107, "time_per_iteration": 2.7463247776031494 }, { "auxiliary_loss_clip": 0.01174486, "auxiliary_loss_mlp": 0.01028877, "balance_loss_clip": 1.05043483, "balance_loss_mlp": 1.02215421, "epoch": 0.8546864666626586, "flos": 11108320427520.0, "grad_norm": 2.498296814413374, "language_loss": 0.80578262, "learning_rate": 2.1731551274801553e-07, "loss": 0.82781625, "num_input_tokens_seen": 153865550, "step": 7108, "time_per_iteration": 2.535413980484009 }, { "auxiliary_loss_clip": 0.01279127, "auxiliary_loss_mlp": 0.01023892, "balance_loss_clip": 1.04876101, "balance_loss_mlp": 1.01667142, "epoch": 0.8548067095532976, "flos": 25520169669120.0, "grad_norm": 2.221834581778872, "language_loss": 0.61405659, "learning_rate": 2.169625159691324e-07, "loss": 0.63708681, "num_input_tokens_seen": 153885425, "step": 7109, "time_per_iteration": 2.7056283950805664 }, { "auxiliary_loss_clip": 0.01375373, "auxiliary_loss_mlp": 0.01024731, "balance_loss_clip": 1.04155838, "balance_loss_mlp": 1.01754856, "epoch": 0.8549269524439368, "flos": 24717853532160.0, "grad_norm": 2.9742204762396907, "language_loss": 0.73994905, "learning_rate": 2.1660978967561784e-07, "loss": 0.76395011, "num_input_tokens_seen": 153904760, "step": 7110, "time_per_iteration": 2.735150098800659 }, { "auxiliary_loss_clip": 0.01171144, "auxiliary_loss_mlp": 0.01021726, "balance_loss_clip": 1.0474844, "balance_loss_mlp": 1.01521134, "epoch": 0.8550471953345758, "flos": 19825599191040.0, "grad_norm": 7.315408951371546, "language_loss": 0.79462832, "learning_rate": 2.1625733392098035e-07, "loss": 0.81655705, "num_input_tokens_seen": 153920370, "step": 7111, "time_per_iteration": 2.5988078117370605 }, { "auxiliary_loss_clip": 0.01170734, "auxiliary_loss_mlp": 0.01024176, "balance_loss_clip": 1.04688728, "balance_loss_mlp": 1.01764631, "epoch": 0.8551674382252149, "flos": 22820441500800.0, "grad_norm": 1.8935783141159253, "language_loss": 0.79744446, "learning_rate": 2.159051487586867e-07, "loss": 0.81939352, "num_input_tokens_seen": 153940500, "step": 7112, "time_per_iteration": 2.611186981201172 }, { "auxiliary_loss_clip": 0.01278157, "auxiliary_loss_mlp": 0.01029216, "balance_loss_clip": 1.05016279, "balance_loss_mlp": 1.02127099, "epoch": 0.8552876811158541, "flos": 20631255292800.0, "grad_norm": 2.740933469933974, "language_loss": 0.72479439, "learning_rate": 2.155532342421642e-07, "loss": 0.74786806, "num_input_tokens_seen": 153958500, "step": 7113, "time_per_iteration": 2.7635550498962402 }, { "auxiliary_loss_clip": 0.01226263, "auxiliary_loss_mlp": 0.01025863, "balance_loss_clip": 1.0483501, "balance_loss_mlp": 1.01814711, "epoch": 0.8554079240064931, "flos": 23112359331840.0, "grad_norm": 1.938509382520293, "language_loss": 0.78413165, "learning_rate": 2.1520159042479636e-07, "loss": 0.8066529, "num_input_tokens_seen": 153976790, "step": 7114, "time_per_iteration": 2.6376612186431885 }, { "auxiliary_loss_clip": 0.01222437, "auxiliary_loss_mlp": 0.01028718, "balance_loss_clip": 1.05000591, "balance_loss_mlp": 1.02207804, "epoch": 0.8555281668971322, "flos": 22128047959680.0, "grad_norm": 1.966945365426684, "language_loss": 0.70918751, "learning_rate": 2.148502173599287e-07, "loss": 0.73169905, "num_input_tokens_seen": 153994930, "step": 7115, "time_per_iteration": 2.6434996128082275 }, { "auxiliary_loss_clip": 0.0126777, "auxiliary_loss_mlp": 0.01028296, "balance_loss_clip": 1.04520893, "balance_loss_mlp": 1.02128959, "epoch": 0.8556484097877713, "flos": 31139040234240.0, "grad_norm": 1.7609861839565348, "language_loss": 0.65673572, "learning_rate": 2.1449911510086372e-07, "loss": 0.67969632, "num_input_tokens_seen": 154014400, "step": 7116, "time_per_iteration": 2.719958543777466 }, { "auxiliary_loss_clip": 0.01218006, "auxiliary_loss_mlp": 0.01022923, "balance_loss_clip": 1.04645872, "balance_loss_mlp": 1.01607478, "epoch": 0.8557686526784104, "flos": 24316551809280.0, "grad_norm": 1.8806755534480286, "language_loss": 0.77009046, "learning_rate": 2.141482837008628e-07, "loss": 0.79249978, "num_input_tokens_seen": 154034940, "step": 7117, "time_per_iteration": 2.6958961486816406 }, { "auxiliary_loss_clip": 0.01221317, "auxiliary_loss_mlp": 0.01027873, "balance_loss_clip": 1.0472734, "balance_loss_mlp": 1.02117622, "epoch": 0.8558888955690495, "flos": 17712723427200.0, "grad_norm": 1.8851207139393977, "language_loss": 0.72285086, "learning_rate": 2.1379772321314826e-07, "loss": 0.74534279, "num_input_tokens_seen": 154052985, "step": 7118, "time_per_iteration": 2.6762945652008057 }, { "auxiliary_loss_clip": 0.01418733, "auxiliary_loss_mlp": 0.01033674, "balance_loss_clip": 1.04500961, "balance_loss_mlp": 1.02631938, "epoch": 0.8560091384596886, "flos": 19171702051200.0, "grad_norm": 2.3460211481387843, "language_loss": 0.81594896, "learning_rate": 2.1344743369089802e-07, "loss": 0.840473, "num_input_tokens_seen": 154068765, "step": 7119, "time_per_iteration": 2.768770694732666 }, { "auxiliary_loss_clip": 0.01280523, "auxiliary_loss_mlp": 0.01026575, "balance_loss_clip": 1.05113399, "balance_loss_mlp": 1.01962185, "epoch": 0.8561293813503277, "flos": 23914855036800.0, "grad_norm": 1.7342826281058894, "language_loss": 0.81992066, "learning_rate": 2.130974151872522e-07, "loss": 0.84299171, "num_input_tokens_seen": 154089100, "step": 7120, "time_per_iteration": 2.880168914794922 }, { "auxiliary_loss_clip": 0.01225705, "auxiliary_loss_mlp": 0.0102808, "balance_loss_clip": 1.04883504, "balance_loss_mlp": 1.02091503, "epoch": 0.8562496242409667, "flos": 22529206028160.0, "grad_norm": 1.7736337578173655, "language_loss": 0.78594774, "learning_rate": 2.1274766775530773e-07, "loss": 0.80848563, "num_input_tokens_seen": 154108965, "step": 7121, "time_per_iteration": 2.7085988521575928 }, { "auxiliary_loss_clip": 0.01175751, "auxiliary_loss_mlp": 0.01024362, "balance_loss_clip": 1.0492053, "balance_loss_mlp": 1.01754642, "epoch": 0.8563698671316058, "flos": 14712745472640.0, "grad_norm": 2.4428321711095213, "language_loss": 0.79554528, "learning_rate": 2.1239819144812077e-07, "loss": 0.81754643, "num_input_tokens_seen": 154123425, "step": 7122, "time_per_iteration": 2.5918619632720947 }, { "auxiliary_loss_clip": 0.0131568, "auxiliary_loss_mlp": 0.01024127, "balance_loss_clip": 1.04080033, "balance_loss_mlp": 1.01737726, "epoch": 0.856490110022245, "flos": 39167768211840.0, "grad_norm": 1.7257777286178515, "language_loss": 0.69934464, "learning_rate": 2.1204898631870716e-07, "loss": 0.72274268, "num_input_tokens_seen": 154148315, "step": 7123, "time_per_iteration": 2.8727822303771973 }, { "auxiliary_loss_clip": 0.01273926, "auxiliary_loss_mlp": 0.01024701, "balance_loss_clip": 1.0503068, "balance_loss_mlp": 1.01765883, "epoch": 0.856610352912884, "flos": 29059345658880.0, "grad_norm": 2.072365661642435, "language_loss": 0.75976539, "learning_rate": 2.1170005242004006e-07, "loss": 0.78275168, "num_input_tokens_seen": 154169665, "step": 7124, "time_per_iteration": 2.7887942790985107 }, { "auxiliary_loss_clip": 0.0117471, "auxiliary_loss_mlp": 0.01026717, "balance_loss_clip": 1.04445899, "balance_loss_mlp": 1.01990402, "epoch": 0.8567305958035231, "flos": 23878333883520.0, "grad_norm": 1.902117454151841, "language_loss": 0.78216249, "learning_rate": 2.1135138980505384e-07, "loss": 0.80417675, "num_input_tokens_seen": 154190335, "step": 7125, "time_per_iteration": 3.7117791175842285 }, { "auxiliary_loss_clip": 0.01265502, "auxiliary_loss_mlp": 0.0102715, "balance_loss_clip": 1.04596281, "balance_loss_mlp": 1.02027726, "epoch": 0.8568508386941622, "flos": 22200120599040.0, "grad_norm": 1.8403957912342845, "language_loss": 0.72230768, "learning_rate": 2.110029985266395e-07, "loss": 0.74523413, "num_input_tokens_seen": 154210040, "step": 7126, "time_per_iteration": 3.56243896484375 }, { "auxiliary_loss_clip": 0.01174328, "auxiliary_loss_mlp": 0.01021635, "balance_loss_clip": 1.04402626, "balance_loss_mlp": 1.01463199, "epoch": 0.8569710815848013, "flos": 17307507121920.0, "grad_norm": 1.8565904898130845, "language_loss": 0.73831636, "learning_rate": 2.1065487863764787e-07, "loss": 0.76027596, "num_input_tokens_seen": 154228385, "step": 7127, "time_per_iteration": 2.6811881065368652 }, { "auxiliary_loss_clip": 0.01368185, "auxiliary_loss_mlp": 0.01022317, "balance_loss_clip": 1.03834677, "balance_loss_mlp": 1.01578736, "epoch": 0.8570913244754403, "flos": 23732285184000.0, "grad_norm": 1.7387713271208687, "language_loss": 0.85568786, "learning_rate": 2.1030703019088846e-07, "loss": 0.8795929, "num_input_tokens_seen": 154249015, "step": 7128, "time_per_iteration": 2.739590883255005 }, { "auxiliary_loss_clip": 0.01216465, "auxiliary_loss_mlp": 0.01021239, "balance_loss_clip": 1.04762053, "balance_loss_mlp": 1.01435757, "epoch": 0.8572115673660795, "flos": 20048748433920.0, "grad_norm": 1.8682849118554685, "language_loss": 0.70723236, "learning_rate": 2.099594532391291e-07, "loss": 0.72960937, "num_input_tokens_seen": 154267700, "step": 7129, "time_per_iteration": 3.5113184452056885 }, { "auxiliary_loss_clip": 0.01218016, "auxiliary_loss_mlp": 0.01022476, "balance_loss_clip": 1.04601026, "balance_loss_mlp": 1.01580954, "epoch": 0.8573318102567186, "flos": 27160389342720.0, "grad_norm": 3.224048948361951, "language_loss": 0.78986323, "learning_rate": 2.0961214783509806e-07, "loss": 0.81226814, "num_input_tokens_seen": 154290580, "step": 7130, "time_per_iteration": 2.732994556427002 }, { "auxiliary_loss_clip": 0.01274344, "auxiliary_loss_mlp": 0.01022301, "balance_loss_clip": 1.04367852, "balance_loss_mlp": 1.01556253, "epoch": 0.8574520531473576, "flos": 24936585402240.0, "grad_norm": 1.986541758041221, "language_loss": 0.74865294, "learning_rate": 2.0926511403148051e-07, "loss": 0.77161944, "num_input_tokens_seen": 154309545, "step": 7131, "time_per_iteration": 2.688246011734009 }, { "auxiliary_loss_clip": 0.0122678, "auxiliary_loss_mlp": 0.01025389, "balance_loss_clip": 1.04618382, "balance_loss_mlp": 1.01877618, "epoch": 0.8575722960379968, "flos": 18771154513920.0, "grad_norm": 1.8506812563614037, "language_loss": 0.75713319, "learning_rate": 2.0891835188092143e-07, "loss": 0.77965486, "num_input_tokens_seen": 154326545, "step": 7132, "time_per_iteration": 3.5964648723602295 }, { "auxiliary_loss_clip": 0.01226118, "auxiliary_loss_mlp": 0.01028751, "balance_loss_clip": 1.04479671, "balance_loss_mlp": 1.02225733, "epoch": 0.8576925389286358, "flos": 22200300167040.0, "grad_norm": 2.0775148778607417, "language_loss": 0.81275278, "learning_rate": 2.0857186143602434e-07, "loss": 0.83530152, "num_input_tokens_seen": 154345190, "step": 7133, "time_per_iteration": 2.6437976360321045 }, { "auxiliary_loss_clip": 0.01316738, "auxiliary_loss_mlp": 0.01033798, "balance_loss_clip": 1.04208422, "balance_loss_mlp": 1.02656245, "epoch": 0.8578127818192749, "flos": 22894345733760.0, "grad_norm": 2.3167525996261604, "language_loss": 0.67753381, "learning_rate": 2.0822564274935094e-07, "loss": 0.7010392, "num_input_tokens_seen": 154364615, "step": 7134, "time_per_iteration": 2.727989673614502 }, { "auxiliary_loss_clip": 0.01278556, "auxiliary_loss_mlp": 0.01024127, "balance_loss_clip": 1.05098605, "balance_loss_mlp": 1.0169301, "epoch": 0.8579330247099141, "flos": 34824839541120.0, "grad_norm": 2.142203714824796, "language_loss": 0.66968918, "learning_rate": 2.078796958734239e-07, "loss": 0.692716, "num_input_tokens_seen": 154387335, "step": 7135, "time_per_iteration": 2.779693841934204 }, { "auxiliary_loss_clip": 0.01223395, "auxiliary_loss_mlp": 0.01026954, "balance_loss_clip": 1.04946029, "balance_loss_mlp": 1.01970565, "epoch": 0.8580532676005531, "flos": 19755681367680.0, "grad_norm": 2.010545049830162, "language_loss": 0.75378633, "learning_rate": 2.0753402086072124e-07, "loss": 0.77628988, "num_input_tokens_seen": 154405965, "step": 7136, "time_per_iteration": 2.5976436138153076 }, { "auxiliary_loss_clip": 0.01378242, "auxiliary_loss_mlp": 0.01027982, "balance_loss_clip": 1.04230499, "balance_loss_mlp": 1.02031112, "epoch": 0.8581735104911922, "flos": 22739318634240.0, "grad_norm": 2.221994954516584, "language_loss": 0.75383317, "learning_rate": 2.071886177636828e-07, "loss": 0.77789539, "num_input_tokens_seen": 154422750, "step": 7137, "time_per_iteration": 2.8362371921539307 }, { "auxiliary_loss_clip": 0.01217448, "auxiliary_loss_mlp": 0.01027185, "balance_loss_clip": 1.04762173, "balance_loss_mlp": 1.02041137, "epoch": 0.8582937533818313, "flos": 23149131880320.0, "grad_norm": 1.7212728703755524, "language_loss": 0.83031315, "learning_rate": 2.0684348663470575e-07, "loss": 0.85275948, "num_input_tokens_seen": 154442930, "step": 7138, "time_per_iteration": 2.7576563358306885 }, { "auxiliary_loss_clip": 0.01272702, "auxiliary_loss_mlp": 0.01022325, "balance_loss_clip": 1.04354405, "balance_loss_mlp": 1.0151453, "epoch": 0.8584139962724704, "flos": 19498668577920.0, "grad_norm": 1.6676308915267604, "language_loss": 0.61683059, "learning_rate": 2.0649862752614555e-07, "loss": 0.63978088, "num_input_tokens_seen": 154461640, "step": 7139, "time_per_iteration": 2.690241575241089 }, { "auxiliary_loss_clip": 0.01170531, "auxiliary_loss_mlp": 0.01000377, "balance_loss_clip": 1.00782657, "balance_loss_mlp": 0.99946499, "epoch": 0.8585342391631094, "flos": 71276577788160.0, "grad_norm": 0.7510147933497497, "language_loss": 0.57017314, "learning_rate": 2.0615404049031838e-07, "loss": 0.59188217, "num_input_tokens_seen": 154518610, "step": 7140, "time_per_iteration": 3.2307791709899902 }, { "auxiliary_loss_clip": 0.01225238, "auxiliary_loss_mlp": 0.01030612, "balance_loss_clip": 1.05054939, "balance_loss_mlp": 1.02304828, "epoch": 0.8586544820537486, "flos": 10815432929280.0, "grad_norm": 2.4997275143379416, "language_loss": 0.77643663, "learning_rate": 2.0580972557949616e-07, "loss": 0.79899514, "num_input_tokens_seen": 154533700, "step": 7141, "time_per_iteration": 2.6286935806274414 }, { "auxiliary_loss_clip": 0.01120043, "auxiliary_loss_mlp": 0.01001317, "balance_loss_clip": 1.00654542, "balance_loss_mlp": 1.00039887, "epoch": 0.8587747249443877, "flos": 64811184422400.0, "grad_norm": 0.8467397844800908, "language_loss": 0.54274869, "learning_rate": 2.054656828459125e-07, "loss": 0.56396234, "num_input_tokens_seen": 154597810, "step": 7142, "time_per_iteration": 3.2142879962921143 }, { "auxiliary_loss_clip": 0.01369353, "auxiliary_loss_mlp": 0.01027813, "balance_loss_clip": 1.04331398, "balance_loss_mlp": 1.02043724, "epoch": 0.8588949678350267, "flos": 26834607964800.0, "grad_norm": 1.817438343621723, "language_loss": 0.7738778, "learning_rate": 2.051219123417578e-07, "loss": 0.79784942, "num_input_tokens_seen": 154617870, "step": 7143, "time_per_iteration": 2.8245208263397217 }, { "auxiliary_loss_clip": 0.01175458, "auxiliary_loss_mlp": 0.01024075, "balance_loss_clip": 1.04980838, "balance_loss_mlp": 1.01673794, "epoch": 0.8590152107256659, "flos": 26104256726400.0, "grad_norm": 2.4940712547793567, "language_loss": 0.59963185, "learning_rate": 2.0477841411918196e-07, "loss": 0.62162715, "num_input_tokens_seen": 154637395, "step": 7144, "time_per_iteration": 2.6424264907836914 }, { "auxiliary_loss_clip": 0.01215501, "auxiliary_loss_mlp": 0.01021967, "balance_loss_clip": 1.04522336, "balance_loss_mlp": 1.01578355, "epoch": 0.859135453616305, "flos": 26140885620480.0, "grad_norm": 12.609717282113953, "language_loss": 0.74803883, "learning_rate": 2.0443518823029326e-07, "loss": 0.77041352, "num_input_tokens_seen": 154657935, "step": 7145, "time_per_iteration": 2.6270806789398193 }, { "auxiliary_loss_clip": 0.01312353, "auxiliary_loss_mlp": 0.01027337, "balance_loss_clip": 1.04254556, "balance_loss_mlp": 1.01975822, "epoch": 0.859255696506944, "flos": 12969319046400.0, "grad_norm": 2.106494821809929, "language_loss": 0.7683751, "learning_rate": 2.0409223472715854e-07, "loss": 0.79177201, "num_input_tokens_seen": 154675080, "step": 7146, "time_per_iteration": 2.719205617904663 }, { "auxiliary_loss_clip": 0.01317403, "auxiliary_loss_mlp": 0.02562863, "balance_loss_clip": 1.04520059, "balance_loss_mlp": 0.99991566, "epoch": 0.8593759393975832, "flos": 18475753063680.0, "grad_norm": 2.1770550375841027, "language_loss": 0.74596179, "learning_rate": 2.0374955366180434e-07, "loss": 0.78476441, "num_input_tokens_seen": 154692720, "step": 7147, "time_per_iteration": 2.687831401824951 }, { "auxiliary_loss_clip": 0.01324202, "auxiliary_loss_mlp": 0.01023538, "balance_loss_clip": 1.04213977, "balance_loss_mlp": 1.01644492, "epoch": 0.8594961822882222, "flos": 22200156512640.0, "grad_norm": 1.6781721400680045, "language_loss": 0.72391748, "learning_rate": 2.034071450862147e-07, "loss": 0.7473948, "num_input_tokens_seen": 154710190, "step": 7148, "time_per_iteration": 2.7988133430480957 }, { "auxiliary_loss_clip": 0.0127879, "auxiliary_loss_mlp": 0.01027618, "balance_loss_clip": 1.04500294, "balance_loss_mlp": 1.0204531, "epoch": 0.8596164251788613, "flos": 23294749616640.0, "grad_norm": 1.5609390756973065, "language_loss": 0.76770788, "learning_rate": 2.030650090523327e-07, "loss": 0.79077196, "num_input_tokens_seen": 154729380, "step": 7149, "time_per_iteration": 2.6710429191589355 }, { "auxiliary_loss_clip": 0.01318523, "auxiliary_loss_mlp": 0.01024972, "balance_loss_clip": 1.04309607, "balance_loss_mlp": 1.01777792, "epoch": 0.8597366680695004, "flos": 31649905416960.0, "grad_norm": 1.7799823762569957, "language_loss": 0.5949502, "learning_rate": 2.0272314561205995e-07, "loss": 0.61838508, "num_input_tokens_seen": 154749775, "step": 7150, "time_per_iteration": 2.7734408378601074 }, { "auxiliary_loss_clip": 0.01314238, "auxiliary_loss_mlp": 0.01020463, "balance_loss_clip": 1.03963768, "balance_loss_mlp": 1.01406455, "epoch": 0.8598569109601395, "flos": 21287738211840.0, "grad_norm": 1.7853209046283556, "language_loss": 0.72763878, "learning_rate": 2.023815548172567e-07, "loss": 0.75098574, "num_input_tokens_seen": 154769845, "step": 7151, "time_per_iteration": 4.582414150238037 }, { "auxiliary_loss_clip": 0.01223691, "auxiliary_loss_mlp": 0.01029489, "balance_loss_clip": 1.04580688, "balance_loss_mlp": 1.02219391, "epoch": 0.8599771538507786, "flos": 25447809720960.0, "grad_norm": 1.8592779810667703, "language_loss": 0.66011995, "learning_rate": 2.0204023671974267e-07, "loss": 0.68265176, "num_input_tokens_seen": 154789230, "step": 7152, "time_per_iteration": 2.6357274055480957 }, { "auxiliary_loss_clip": 0.01217811, "auxiliary_loss_mlp": 0.01025857, "balance_loss_clip": 1.04571092, "balance_loss_mlp": 1.01904166, "epoch": 0.8600973967414177, "flos": 16723958768640.0, "grad_norm": 1.9828782095509374, "language_loss": 0.81324464, "learning_rate": 2.0169919137129532e-07, "loss": 0.83568132, "num_input_tokens_seen": 154807670, "step": 7153, "time_per_iteration": 2.62125301361084 }, { "auxiliary_loss_clip": 0.01225072, "auxiliary_loss_mlp": 0.01028, "balance_loss_clip": 1.05093288, "balance_loss_mlp": 1.01980472, "epoch": 0.8602176396320568, "flos": 25227928615680.0, "grad_norm": 2.1638525007392615, "language_loss": 0.70278633, "learning_rate": 2.013584188236508e-07, "loss": 0.725317, "num_input_tokens_seen": 154825575, "step": 7154, "time_per_iteration": 3.570202589035034 }, { "auxiliary_loss_clip": 0.0117574, "auxiliary_loss_mlp": 0.01025318, "balance_loss_clip": 1.05030239, "balance_loss_mlp": 1.01819491, "epoch": 0.8603378825226958, "flos": 20412236113920.0, "grad_norm": 2.380458323043156, "language_loss": 0.79587972, "learning_rate": 2.0101791912850396e-07, "loss": 0.81789029, "num_input_tokens_seen": 154845115, "step": 7155, "time_per_iteration": 2.5747623443603516 }, { "auxiliary_loss_clip": 0.01275334, "auxiliary_loss_mlp": 0.01021921, "balance_loss_clip": 1.04954362, "balance_loss_mlp": 1.01441383, "epoch": 0.8604581254133349, "flos": 34930201109760.0, "grad_norm": 1.8811023527865982, "language_loss": 0.6364454, "learning_rate": 2.006776923375082e-07, "loss": 0.65941799, "num_input_tokens_seen": 154866770, "step": 7156, "time_per_iteration": 2.825889825820923 }, { "auxiliary_loss_clip": 0.01172243, "auxiliary_loss_mlp": 0.01026545, "balance_loss_clip": 1.04820037, "balance_loss_mlp": 1.01987791, "epoch": 0.860578368303974, "flos": 22596538072320.0, "grad_norm": 1.689905483320047, "language_loss": 0.71493137, "learning_rate": 2.003377385022764e-07, "loss": 0.73691928, "num_input_tokens_seen": 154885595, "step": 7157, "time_per_iteration": 2.5989699363708496 }, { "auxiliary_loss_clip": 0.01271257, "auxiliary_loss_mlp": 0.01024822, "balance_loss_clip": 1.04536366, "balance_loss_mlp": 1.01761866, "epoch": 0.8606986111946131, "flos": 21324331192320.0, "grad_norm": 2.4940453684385586, "language_loss": 0.76953232, "learning_rate": 1.9999805767437826e-07, "loss": 0.7924931, "num_input_tokens_seen": 154904485, "step": 7158, "time_per_iteration": 3.518247604370117 }, { "auxiliary_loss_clip": 0.01268341, "auxiliary_loss_mlp": 0.01027152, "balance_loss_clip": 1.04402184, "balance_loss_mlp": 1.02036285, "epoch": 0.8608188540852522, "flos": 28877206769280.0, "grad_norm": 3.798415079817361, "language_loss": 0.71510756, "learning_rate": 1.9965864990534386e-07, "loss": 0.7380625, "num_input_tokens_seen": 154925010, "step": 7159, "time_per_iteration": 2.6871182918548584 }, { "auxiliary_loss_clip": 0.01316322, "auxiliary_loss_mlp": 0.01024547, "balance_loss_clip": 1.03944683, "balance_loss_mlp": 1.01774335, "epoch": 0.8609390969758913, "flos": 29716187713920.0, "grad_norm": 1.9732326574087675, "language_loss": 0.77501702, "learning_rate": 1.9931951524666092e-07, "loss": 0.79842567, "num_input_tokens_seen": 154946100, "step": 7160, "time_per_iteration": 2.7822988033294678 }, { "auxiliary_loss_clip": 0.0122429, "auxiliary_loss_mlp": 0.02565347, "balance_loss_clip": 1.04638839, "balance_loss_mlp": 0.99990845, "epoch": 0.8610593398665304, "flos": 21249349551360.0, "grad_norm": 1.873198388277629, "language_loss": 0.81100535, "learning_rate": 1.9898065374977534e-07, "loss": 0.84890175, "num_input_tokens_seen": 154966305, "step": 7161, "time_per_iteration": 2.6065545082092285 }, { "auxiliary_loss_clip": 0.01218596, "auxiliary_loss_mlp": 0.01018461, "balance_loss_clip": 1.04380298, "balance_loss_mlp": 1.0124197, "epoch": 0.8611795827571694, "flos": 14830102183680.0, "grad_norm": 1.9499910005323788, "language_loss": 0.73438263, "learning_rate": 1.9864206546609342e-07, "loss": 0.75675321, "num_input_tokens_seen": 154985145, "step": 7162, "time_per_iteration": 2.6885149478912354 }, { "auxiliary_loss_clip": 0.01172475, "auxiliary_loss_mlp": 0.01030108, "balance_loss_clip": 1.04755378, "balance_loss_mlp": 1.02314639, "epoch": 0.8612998256478086, "flos": 24243258107520.0, "grad_norm": 1.823016193457825, "language_loss": 0.84370482, "learning_rate": 1.983037504469771e-07, "loss": 0.86573064, "num_input_tokens_seen": 155003855, "step": 7163, "time_per_iteration": 2.621830463409424 }, { "auxiliary_loss_clip": 0.01220495, "auxiliary_loss_mlp": 0.01029001, "balance_loss_clip": 1.04763126, "balance_loss_mlp": 1.02220035, "epoch": 0.8614200685384477, "flos": 21252653602560.0, "grad_norm": 2.0059187960322653, "language_loss": 0.66435552, "learning_rate": 1.9796570874374984e-07, "loss": 0.68685049, "num_input_tokens_seen": 155023960, "step": 7164, "time_per_iteration": 2.6388633251190186 }, { "auxiliary_loss_clip": 0.01272043, "auxiliary_loss_mlp": 0.01033093, "balance_loss_clip": 1.04528952, "balance_loss_mlp": 1.02614284, "epoch": 0.8615403114290867, "flos": 20007738080640.0, "grad_norm": 3.0333973578680387, "language_loss": 0.77476573, "learning_rate": 1.976279404076917e-07, "loss": 0.79781711, "num_input_tokens_seen": 155043360, "step": 7165, "time_per_iteration": 2.699483633041382 }, { "auxiliary_loss_clip": 0.01319238, "auxiliary_loss_mlp": 0.01024244, "balance_loss_clip": 1.04700291, "balance_loss_mlp": 1.01704061, "epoch": 0.8616605543197259, "flos": 29789373674880.0, "grad_norm": 1.9335854476139598, "language_loss": 0.76206052, "learning_rate": 1.9729044549004193e-07, "loss": 0.78549534, "num_input_tokens_seen": 155064745, "step": 7166, "time_per_iteration": 2.7774856090545654 }, { "auxiliary_loss_clip": 0.01218707, "auxiliary_loss_mlp": 0.01026516, "balance_loss_clip": 1.04781675, "balance_loss_mlp": 1.02029943, "epoch": 0.8617807972103649, "flos": 28911609020160.0, "grad_norm": 1.9064939964129115, "language_loss": 0.70284086, "learning_rate": 1.9695322404199822e-07, "loss": 0.7252931, "num_input_tokens_seen": 155086790, "step": 7167, "time_per_iteration": 2.705867290496826 }, { "auxiliary_loss_clip": 0.01280381, "auxiliary_loss_mlp": 0.01028167, "balance_loss_clip": 1.05013788, "balance_loss_mlp": 1.02117562, "epoch": 0.861901040101004, "flos": 27673804391040.0, "grad_norm": 1.86758850406926, "language_loss": 0.82095194, "learning_rate": 1.9661627611471654e-07, "loss": 0.84403747, "num_input_tokens_seen": 155106585, "step": 7168, "time_per_iteration": 2.7076170444488525 }, { "auxiliary_loss_clip": 0.01175737, "auxiliary_loss_mlp": 0.01022975, "balance_loss_clip": 1.04525995, "balance_loss_mlp": 1.01571798, "epoch": 0.8620212829916432, "flos": 49748056755840.0, "grad_norm": 4.493968314864206, "language_loss": 0.7001828, "learning_rate": 1.9627960175931246e-07, "loss": 0.72216988, "num_input_tokens_seen": 155131285, "step": 7169, "time_per_iteration": 3.0105855464935303 }, { "auxiliary_loss_clip": 0.0122417, "auxiliary_loss_mlp": 0.01023601, "balance_loss_clip": 1.05042434, "balance_loss_mlp": 1.01701164, "epoch": 0.8621415258822822, "flos": 21138672769920.0, "grad_norm": 1.6810518969704322, "language_loss": 0.74404496, "learning_rate": 1.9594320102685847e-07, "loss": 0.76652265, "num_input_tokens_seen": 155150555, "step": 7170, "time_per_iteration": 2.6171040534973145 }, { "auxiliary_loss_clip": 0.01263967, "auxiliary_loss_mlp": 0.02563841, "balance_loss_clip": 1.0431236, "balance_loss_mlp": 0.99991208, "epoch": 0.8622617687729213, "flos": 21689039934720.0, "grad_norm": 1.940721954219524, "language_loss": 0.64087105, "learning_rate": 1.956070739683864e-07, "loss": 0.67914915, "num_input_tokens_seen": 155169890, "step": 7171, "time_per_iteration": 2.786644697189331 }, { "auxiliary_loss_clip": 0.01311572, "auxiliary_loss_mlp": 0.01022814, "balance_loss_clip": 1.0412513, "balance_loss_mlp": 1.01634133, "epoch": 0.8623820116635604, "flos": 26250592734720.0, "grad_norm": 2.070604679665568, "language_loss": 0.74486107, "learning_rate": 1.9527122063488678e-07, "loss": 0.76820493, "num_input_tokens_seen": 155191005, "step": 7172, "time_per_iteration": 2.713794469833374 }, { "auxiliary_loss_clip": 0.0127222, "auxiliary_loss_mlp": 0.01022285, "balance_loss_clip": 1.04336524, "balance_loss_mlp": 1.01552296, "epoch": 0.8625022545541995, "flos": 19647554451840.0, "grad_norm": 2.1153041797573433, "language_loss": 0.80351096, "learning_rate": 1.9493564107730755e-07, "loss": 0.82645601, "num_input_tokens_seen": 155211005, "step": 7173, "time_per_iteration": 2.7079355716705322 }, { "auxiliary_loss_clip": 0.01267572, "auxiliary_loss_mlp": 0.01028757, "balance_loss_clip": 1.04168701, "balance_loss_mlp": 1.02225375, "epoch": 0.8626224974448385, "flos": 21908382336000.0, "grad_norm": 1.8772516577209177, "language_loss": 0.61001527, "learning_rate": 1.9460033534655684e-07, "loss": 0.63297856, "num_input_tokens_seen": 155230365, "step": 7174, "time_per_iteration": 2.701894760131836 }, { "auxiliary_loss_clip": 0.01269862, "auxiliary_loss_mlp": 0.0102344, "balance_loss_clip": 1.04194093, "balance_loss_mlp": 1.01667833, "epoch": 0.8627427403354777, "flos": 23331198942720.0, "grad_norm": 1.5455928229661164, "language_loss": 0.8427462, "learning_rate": 1.9426530349349978e-07, "loss": 0.86567926, "num_input_tokens_seen": 155250815, "step": 7175, "time_per_iteration": 2.6765291690826416 }, { "auxiliary_loss_clip": 0.01219967, "auxiliary_loss_mlp": 0.02562666, "balance_loss_clip": 1.04526794, "balance_loss_mlp": 0.99990308, "epoch": 0.8628629832261168, "flos": 16362877299840.0, "grad_norm": 2.7346719414381475, "language_loss": 0.64729232, "learning_rate": 1.9393054556896038e-07, "loss": 0.68511868, "num_input_tokens_seen": 155268515, "step": 7176, "time_per_iteration": 2.6293444633483887 }, { "auxiliary_loss_clip": 0.0122153, "auxiliary_loss_mlp": 0.0102619, "balance_loss_clip": 1.04386234, "balance_loss_mlp": 1.01901031, "epoch": 0.8629832261167558, "flos": 28103941756800.0, "grad_norm": 2.266342483083716, "language_loss": 0.69496799, "learning_rate": 1.9359606162372133e-07, "loss": 0.71744514, "num_input_tokens_seen": 155290120, "step": 7177, "time_per_iteration": 3.6562130451202393 }, { "auxiliary_loss_clip": 0.01172727, "auxiliary_loss_mlp": 0.01025429, "balance_loss_clip": 1.04924583, "balance_loss_mlp": 1.01897395, "epoch": 0.863103469007395, "flos": 20230061310720.0, "grad_norm": 2.1916132636969206, "language_loss": 0.70769846, "learning_rate": 1.9326185170852293e-07, "loss": 0.72968, "num_input_tokens_seen": 155309085, "step": 7178, "time_per_iteration": 3.5124011039733887 }, { "auxiliary_loss_clip": 0.0122411, "auxiliary_loss_mlp": 0.01030928, "balance_loss_clip": 1.04837394, "balance_loss_mlp": 1.02386165, "epoch": 0.863223711898034, "flos": 24498547044480.0, "grad_norm": 4.511605974562868, "language_loss": 0.72151148, "learning_rate": 1.9292791587406598e-07, "loss": 0.74406189, "num_input_tokens_seen": 155327945, "step": 7179, "time_per_iteration": 2.672240734100342 }, { "auxiliary_loss_clip": 0.01219477, "auxiliary_loss_mlp": 0.02565467, "balance_loss_clip": 1.04409528, "balance_loss_mlp": 0.99988484, "epoch": 0.8633439547886731, "flos": 17675376261120.0, "grad_norm": 2.190167298456796, "language_loss": 0.86937094, "learning_rate": 1.9259425417100661e-07, "loss": 0.90722036, "num_input_tokens_seen": 155344060, "step": 7180, "time_per_iteration": 3.555724859237671 }, { "auxiliary_loss_clip": 0.01419702, "auxiliary_loss_mlp": 0.01024305, "balance_loss_clip": 1.03488529, "balance_loss_mlp": 1.01730728, "epoch": 0.8634641976793123, "flos": 12895055677440.0, "grad_norm": 2.0837146848930796, "language_loss": 0.74909675, "learning_rate": 1.9226086664996234e-07, "loss": 0.7735368, "num_input_tokens_seen": 155362305, "step": 7181, "time_per_iteration": 2.805751323699951 }, { "auxiliary_loss_clip": 0.01173216, "auxiliary_loss_mlp": 0.01026851, "balance_loss_clip": 1.04702652, "balance_loss_mlp": 1.02032185, "epoch": 0.8635844405699513, "flos": 23878980328320.0, "grad_norm": 1.903245459564033, "language_loss": 0.74139762, "learning_rate": 1.9192775336150712e-07, "loss": 0.76339829, "num_input_tokens_seen": 155382605, "step": 7182, "time_per_iteration": 2.63535213470459 }, { "auxiliary_loss_clip": 0.01111134, "auxiliary_loss_mlp": 0.00998291, "balance_loss_clip": 1.00692785, "balance_loss_mlp": 0.99735481, "epoch": 0.8637046834605904, "flos": 60453387521280.0, "grad_norm": 0.7721358135522656, "language_loss": 0.56216431, "learning_rate": 1.915949143561739e-07, "loss": 0.58325857, "num_input_tokens_seen": 155437280, "step": 7183, "time_per_iteration": 3.1802616119384766 }, { "auxiliary_loss_clip": 0.01221216, "auxiliary_loss_mlp": 0.01030688, "balance_loss_clip": 1.04861164, "balance_loss_mlp": 1.0233326, "epoch": 0.8638249263512295, "flos": 20558751690240.0, "grad_norm": 2.2501137489047305, "language_loss": 0.77818286, "learning_rate": 1.9126234968445498e-07, "loss": 0.80070192, "num_input_tokens_seen": 155456970, "step": 7184, "time_per_iteration": 3.5913138389587402 }, { "auxiliary_loss_clip": 0.01172264, "auxiliary_loss_mlp": 0.01024194, "balance_loss_clip": 1.04801881, "balance_loss_mlp": 1.0176909, "epoch": 0.8639451692418686, "flos": 26615768353920.0, "grad_norm": 1.5458200634469823, "language_loss": 0.67901123, "learning_rate": 1.9093005939679884e-07, "loss": 0.70097589, "num_input_tokens_seen": 155478925, "step": 7185, "time_per_iteration": 2.62943172454834 }, { "auxiliary_loss_clip": 0.01222398, "auxiliary_loss_mlp": 0.01028258, "balance_loss_clip": 1.04922962, "balance_loss_mlp": 1.02133822, "epoch": 0.8640654121325076, "flos": 15122450977920.0, "grad_norm": 1.9496167194418843, "language_loss": 0.76618242, "learning_rate": 1.9059804354361452e-07, "loss": 0.78868896, "num_input_tokens_seen": 155496700, "step": 7186, "time_per_iteration": 2.6187753677368164 }, { "auxiliary_loss_clip": 0.01270868, "auxiliary_loss_mlp": 0.01027608, "balance_loss_clip": 1.04282463, "balance_loss_mlp": 1.02055693, "epoch": 0.8641856550231467, "flos": 31869068250240.0, "grad_norm": 1.9112062474111353, "language_loss": 0.70284939, "learning_rate": 1.902663021752684e-07, "loss": 0.72583413, "num_input_tokens_seen": 155518130, "step": 7187, "time_per_iteration": 2.7001821994781494 }, { "auxiliary_loss_clip": 0.0117584, "auxiliary_loss_mlp": 0.01033326, "balance_loss_clip": 1.05003166, "balance_loss_mlp": 1.026335, "epoch": 0.8643058979137859, "flos": 14976545932800.0, "grad_norm": 2.2798443996432542, "language_loss": 0.82632798, "learning_rate": 1.8993483534208556e-07, "loss": 0.84841967, "num_input_tokens_seen": 155537040, "step": 7188, "time_per_iteration": 2.588066339492798 }, { "auxiliary_loss_clip": 0.01279041, "auxiliary_loss_mlp": 0.01027492, "balance_loss_clip": 1.05087686, "balance_loss_mlp": 1.01985025, "epoch": 0.8644261408044249, "flos": 13115726881920.0, "grad_norm": 2.735224237645859, "language_loss": 0.74824405, "learning_rate": 1.8960364309434884e-07, "loss": 0.77130938, "num_input_tokens_seen": 155554535, "step": 7189, "time_per_iteration": 2.6173079013824463 }, { "auxiliary_loss_clip": 0.01415811, "auxiliary_loss_mlp": 0.02563791, "balance_loss_clip": 1.04164183, "balance_loss_mlp": 0.99990344, "epoch": 0.864546383695064, "flos": 20850920916480.0, "grad_norm": 1.865162389482846, "language_loss": 0.78161174, "learning_rate": 1.8927272548229967e-07, "loss": 0.82140774, "num_input_tokens_seen": 155574225, "step": 7190, "time_per_iteration": 2.769493818283081 }, { "auxiliary_loss_clip": 0.01375087, "auxiliary_loss_mlp": 0.01028239, "balance_loss_clip": 1.04566646, "balance_loss_mlp": 1.02174544, "epoch": 0.8646666265857031, "flos": 21324582587520.0, "grad_norm": 1.4982892323362118, "language_loss": 0.83271432, "learning_rate": 1.8894208255613876e-07, "loss": 0.85674757, "num_input_tokens_seen": 155593540, "step": 7191, "time_per_iteration": 2.742063522338867 }, { "auxiliary_loss_clip": 0.01171869, "auxiliary_loss_mlp": 0.01021309, "balance_loss_clip": 1.04792178, "balance_loss_mlp": 1.01476431, "epoch": 0.8647868694763422, "flos": 19750833031680.0, "grad_norm": 2.023824958781724, "language_loss": 0.77432513, "learning_rate": 1.8861171436602397e-07, "loss": 0.79625684, "num_input_tokens_seen": 155610655, "step": 7192, "time_per_iteration": 2.5917980670928955 }, { "auxiliary_loss_clip": 0.01226487, "auxiliary_loss_mlp": 0.01022253, "balance_loss_clip": 1.05078876, "balance_loss_mlp": 1.01552117, "epoch": 0.8649071123669813, "flos": 26176760328960.0, "grad_norm": 2.522767661263201, "language_loss": 0.80559909, "learning_rate": 1.882816209620719e-07, "loss": 0.8280865, "num_input_tokens_seen": 155627365, "step": 7193, "time_per_iteration": 2.6608574390411377 }, { "auxiliary_loss_clip": 0.01283288, "auxiliary_loss_mlp": 0.01027732, "balance_loss_clip": 1.05179238, "balance_loss_mlp": 1.02072823, "epoch": 0.8650273552576204, "flos": 20302888135680.0, "grad_norm": 1.7797905438977808, "language_loss": 0.77399361, "learning_rate": 1.8795180239435738e-07, "loss": 0.79710376, "num_input_tokens_seen": 155646220, "step": 7194, "time_per_iteration": 2.7024707794189453 }, { "auxiliary_loss_clip": 0.01277665, "auxiliary_loss_mlp": 0.01029222, "balance_loss_clip": 1.04729128, "balance_loss_mlp": 1.02207303, "epoch": 0.8651475981482595, "flos": 23951088881280.0, "grad_norm": 6.13625742153834, "language_loss": 0.76148522, "learning_rate": 1.8762225871291348e-07, "loss": 0.78455406, "num_input_tokens_seen": 155662095, "step": 7195, "time_per_iteration": 2.73245906829834 }, { "auxiliary_loss_clip": 0.01172342, "auxiliary_loss_mlp": 0.02565554, "balance_loss_clip": 1.0481689, "balance_loss_mlp": 0.99990481, "epoch": 0.8652678410388985, "flos": 21684622561920.0, "grad_norm": 1.5584366898567708, "language_loss": 0.8090359, "learning_rate": 1.8729298996773201e-07, "loss": 0.84641492, "num_input_tokens_seen": 155680845, "step": 7196, "time_per_iteration": 2.5950980186462402 }, { "auxiliary_loss_clip": 0.01108346, "auxiliary_loss_mlp": 0.01001595, "balance_loss_clip": 1.00723612, "balance_loss_mlp": 1.00067735, "epoch": 0.8653880839295377, "flos": 65224660855680.0, "grad_norm": 0.8299916907826251, "language_loss": 0.60887146, "learning_rate": 1.8696399620876301e-07, "loss": 0.62997091, "num_input_tokens_seen": 155737875, "step": 7197, "time_per_iteration": 3.1500356197357178 }, { "auxiliary_loss_clip": 0.01323008, "auxiliary_loss_mlp": 0.01022269, "balance_loss_clip": 1.04020691, "balance_loss_mlp": 1.01492524, "epoch": 0.8655083268201768, "flos": 17749172753280.0, "grad_norm": 2.3432726519294182, "language_loss": 0.79402125, "learning_rate": 1.866352774859141e-07, "loss": 0.81747401, "num_input_tokens_seen": 155753100, "step": 7198, "time_per_iteration": 2.6741766929626465 }, { "auxiliary_loss_clip": 0.01325055, "auxiliary_loss_mlp": 0.01024062, "balance_loss_clip": 1.0429014, "balance_loss_mlp": 1.01794362, "epoch": 0.8656285697108158, "flos": 20703974376960.0, "grad_norm": 2.3994209309085113, "language_loss": 0.69334626, "learning_rate": 1.8630683384905188e-07, "loss": 0.71683741, "num_input_tokens_seen": 155772430, "step": 7199, "time_per_iteration": 2.794703483581543 }, { "auxiliary_loss_clip": 0.01173807, "auxiliary_loss_mlp": 0.02567874, "balance_loss_clip": 1.05063272, "balance_loss_mlp": 0.99991518, "epoch": 0.865748812601455, "flos": 18653833716480.0, "grad_norm": 2.000595194377792, "language_loss": 0.8895632, "learning_rate": 1.8597866534800045e-07, "loss": 0.92698002, "num_input_tokens_seen": 155787545, "step": 7200, "time_per_iteration": 2.589456558227539 }, { "auxiliary_loss_clip": 0.01224341, "auxiliary_loss_mlp": 0.02565935, "balance_loss_clip": 1.04640305, "balance_loss_mlp": 0.99989861, "epoch": 0.865869055492094, "flos": 70652554807680.0, "grad_norm": 2.705367504852749, "language_loss": 0.74611151, "learning_rate": 1.8565077203254398e-07, "loss": 0.78401434, "num_input_tokens_seen": 155813005, "step": 7201, "time_per_iteration": 2.9976017475128174 }, { "auxiliary_loss_clip": 0.01322553, "auxiliary_loss_mlp": 0.01025118, "balance_loss_clip": 1.04968739, "balance_loss_mlp": 1.01809907, "epoch": 0.8659892983827331, "flos": 17383961220480.0, "grad_norm": 4.341621125366666, "language_loss": 0.73248386, "learning_rate": 1.8532315395242203e-07, "loss": 0.75596058, "num_input_tokens_seen": 155829455, "step": 7202, "time_per_iteration": 2.645419120788574 }, { "auxiliary_loss_clip": 0.01320166, "auxiliary_loss_mlp": 0.01023061, "balance_loss_clip": 1.04390478, "balance_loss_mlp": 1.01632297, "epoch": 0.8661095412733723, "flos": 17895221452800.0, "grad_norm": 1.9634900977293166, "language_loss": 0.72196138, "learning_rate": 1.849958111573353e-07, "loss": 0.74539369, "num_input_tokens_seen": 155848060, "step": 7203, "time_per_iteration": 3.556546449661255 }, { "auxiliary_loss_clip": 0.0116917, "auxiliary_loss_mlp": 0.01026236, "balance_loss_clip": 1.04736066, "balance_loss_mlp": 1.01945853, "epoch": 0.8662297841640113, "flos": 18224163227520.0, "grad_norm": 1.7020681872076473, "language_loss": 0.64407933, "learning_rate": 1.8466874369694074e-07, "loss": 0.66603339, "num_input_tokens_seen": 155865755, "step": 7204, "time_per_iteration": 3.5028061866760254 }, { "auxiliary_loss_clip": 0.01322904, "auxiliary_loss_mlp": 0.01019947, "balance_loss_clip": 1.04037023, "balance_loss_mlp": 1.01336658, "epoch": 0.8663500270546504, "flos": 16362159027840.0, "grad_norm": 3.027565860232214, "language_loss": 0.70131773, "learning_rate": 1.843419516208542e-07, "loss": 0.72474629, "num_input_tokens_seen": 155882680, "step": 7205, "time_per_iteration": 2.660959005355835 }, { "auxiliary_loss_clip": 0.01225574, "auxiliary_loss_mlp": 0.01025592, "balance_loss_clip": 1.04999161, "balance_loss_mlp": 1.01817703, "epoch": 0.8664702699452895, "flos": 17894431353600.0, "grad_norm": 2.1104661656395822, "language_loss": 0.79567289, "learning_rate": 1.8401543497865047e-07, "loss": 0.81818461, "num_input_tokens_seen": 155900680, "step": 7206, "time_per_iteration": 3.514117956161499 }, { "auxiliary_loss_clip": 0.01224197, "auxiliary_loss_mlp": 0.02567145, "balance_loss_clip": 1.04792309, "balance_loss_mlp": 0.99989009, "epoch": 0.8665905128359286, "flos": 30736373794560.0, "grad_norm": 2.0033810856031584, "language_loss": 0.6404177, "learning_rate": 1.836891938198608e-07, "loss": 0.67833108, "num_input_tokens_seen": 155921105, "step": 7207, "time_per_iteration": 2.7039260864257812 }, { "auxiliary_loss_clip": 0.01272939, "auxiliary_loss_mlp": 0.01024881, "balance_loss_clip": 1.04829717, "balance_loss_mlp": 1.01772237, "epoch": 0.8667107557265676, "flos": 18656419495680.0, "grad_norm": 2.4550616137577537, "language_loss": 0.71529543, "learning_rate": 1.8336322819397677e-07, "loss": 0.73827362, "num_input_tokens_seen": 155938640, "step": 7208, "time_per_iteration": 2.613107204437256 }, { "auxiliary_loss_clip": 0.0132791, "auxiliary_loss_mlp": 0.01025086, "balance_loss_clip": 1.04338717, "balance_loss_mlp": 1.0181663, "epoch": 0.8668309986172068, "flos": 20083725302400.0, "grad_norm": 1.9375337519604254, "language_loss": 0.62529677, "learning_rate": 1.8303753815044654e-07, "loss": 0.64882672, "num_input_tokens_seen": 155957945, "step": 7209, "time_per_iteration": 2.7591116428375244 }, { "auxiliary_loss_clip": 0.01284231, "auxiliary_loss_mlp": 0.01026988, "balance_loss_clip": 1.04775751, "balance_loss_mlp": 1.01945734, "epoch": 0.8669512415078459, "flos": 21615099788160.0, "grad_norm": 2.298840181286915, "language_loss": 0.71009421, "learning_rate": 1.827121237386773e-07, "loss": 0.73320639, "num_input_tokens_seen": 155975390, "step": 7210, "time_per_iteration": 3.569711208343506 }, { "auxiliary_loss_clip": 0.01274785, "auxiliary_loss_mlp": 0.01025159, "balance_loss_clip": 1.04673839, "balance_loss_mlp": 1.01801276, "epoch": 0.8670714843984849, "flos": 17703601372800.0, "grad_norm": 2.8430004156555984, "language_loss": 0.7517451, "learning_rate": 1.8238698500803374e-07, "loss": 0.77474451, "num_input_tokens_seen": 155988155, "step": 7211, "time_per_iteration": 2.697288990020752 }, { "auxiliary_loss_clip": 0.01117747, "auxiliary_loss_mlp": 0.01001042, "balance_loss_clip": 1.00650358, "balance_loss_mlp": 1.00016022, "epoch": 0.8671917272891241, "flos": 60705483125760.0, "grad_norm": 0.7156548181386322, "language_loss": 0.5622319, "learning_rate": 1.820621220078391e-07, "loss": 0.5834198, "num_input_tokens_seen": 156052065, "step": 7212, "time_per_iteration": 3.2645740509033203 }, { "auxiliary_loss_clip": 0.01173797, "auxiliary_loss_mlp": 0.01025815, "balance_loss_clip": 1.04918134, "balance_loss_mlp": 1.01874638, "epoch": 0.8673119701797631, "flos": 20451881750400.0, "grad_norm": 2.46679630088311, "language_loss": 0.67644751, "learning_rate": 1.8173753478737553e-07, "loss": 0.69844359, "num_input_tokens_seen": 156072500, "step": 7213, "time_per_iteration": 2.5928521156311035 }, { "auxiliary_loss_clip": 0.01174349, "auxiliary_loss_mlp": 0.01026124, "balance_loss_clip": 1.04954004, "balance_loss_mlp": 1.01837277, "epoch": 0.8674322130704022, "flos": 19647410797440.0, "grad_norm": 2.225992147525365, "language_loss": 0.80364728, "learning_rate": 1.8141322339588205e-07, "loss": 0.825652, "num_input_tokens_seen": 156089840, "step": 7214, "time_per_iteration": 2.591423273086548 }, { "auxiliary_loss_clip": 0.01171253, "auxiliary_loss_mlp": 0.01026957, "balance_loss_clip": 1.04864216, "balance_loss_mlp": 1.01993561, "epoch": 0.8675524559610414, "flos": 26025001367040.0, "grad_norm": 2.415504958278649, "language_loss": 0.6998229, "learning_rate": 1.810891878825569e-07, "loss": 0.72180504, "num_input_tokens_seen": 156109815, "step": 7215, "time_per_iteration": 2.586688280105591 }, { "auxiliary_loss_clip": 0.01271542, "auxiliary_loss_mlp": 0.01023802, "balance_loss_clip": 1.04529583, "balance_loss_mlp": 1.01711416, "epoch": 0.8676726988516804, "flos": 15049444584960.0, "grad_norm": 2.1522811927799554, "language_loss": 0.72049844, "learning_rate": 1.8076542829655561e-07, "loss": 0.74345183, "num_input_tokens_seen": 156128620, "step": 7216, "time_per_iteration": 2.7121453285217285 }, { "auxiliary_loss_clip": 0.01278604, "auxiliary_loss_mlp": 0.01023015, "balance_loss_clip": 1.04985595, "balance_loss_mlp": 1.01585674, "epoch": 0.8677929417423195, "flos": 16288111140480.0, "grad_norm": 2.4808623224022193, "language_loss": 0.79406095, "learning_rate": 1.8044194468699203e-07, "loss": 0.81707716, "num_input_tokens_seen": 156145930, "step": 7217, "time_per_iteration": 2.6733429431915283 }, { "auxiliary_loss_clip": 0.01269554, "auxiliary_loss_mlp": 0.01023816, "balance_loss_clip": 1.04838121, "balance_loss_mlp": 1.01697302, "epoch": 0.8679131846329585, "flos": 18844160906880.0, "grad_norm": 2.104588401022987, "language_loss": 0.75677884, "learning_rate": 1.8011873710293912e-07, "loss": 0.77971256, "num_input_tokens_seen": 156164435, "step": 7218, "time_per_iteration": 2.7172179222106934 }, { "auxiliary_loss_clip": 0.01221892, "auxiliary_loss_mlp": 0.01024368, "balance_loss_clip": 1.0482409, "balance_loss_mlp": 1.01773357, "epoch": 0.8680334275235977, "flos": 33620718890880.0, "grad_norm": 2.0423517172518775, "language_loss": 0.69153291, "learning_rate": 1.7979580559342677e-07, "loss": 0.71399552, "num_input_tokens_seen": 156185165, "step": 7219, "time_per_iteration": 2.68587327003479 }, { "auxiliary_loss_clip": 0.01271319, "auxiliary_loss_mlp": 0.01026742, "balance_loss_clip": 1.04628408, "balance_loss_mlp": 1.0195179, "epoch": 0.8681536704142367, "flos": 24681152810880.0, "grad_norm": 1.766303338033194, "language_loss": 0.66647184, "learning_rate": 1.7947315020744358e-07, "loss": 0.68945247, "num_input_tokens_seen": 156206260, "step": 7220, "time_per_iteration": 2.7502055168151855 }, { "auxiliary_loss_clip": 0.01269476, "auxiliary_loss_mlp": 0.01028739, "balance_loss_clip": 1.04480457, "balance_loss_mlp": 1.02260303, "epoch": 0.8682739133048758, "flos": 20011042131840.0, "grad_norm": 3.325664489634154, "language_loss": 0.80545592, "learning_rate": 1.7915077099393594e-07, "loss": 0.82843804, "num_input_tokens_seen": 156222860, "step": 7221, "time_per_iteration": 2.6302578449249268 }, { "auxiliary_loss_clip": 0.01226202, "auxiliary_loss_mlp": 0.01021135, "balance_loss_clip": 1.04746306, "balance_loss_mlp": 1.01402986, "epoch": 0.868394156195515, "flos": 16654759217280.0, "grad_norm": 2.830891944777469, "language_loss": 0.73185372, "learning_rate": 1.788286680018083e-07, "loss": 0.75432706, "num_input_tokens_seen": 156241570, "step": 7222, "time_per_iteration": 2.618861675262451 }, { "auxiliary_loss_clip": 0.01272075, "auxiliary_loss_mlp": 0.01026779, "balance_loss_clip": 1.0455339, "balance_loss_mlp": 1.02000237, "epoch": 0.868514399086154, "flos": 28001381448960.0, "grad_norm": 3.755842915359066, "language_loss": 0.72285926, "learning_rate": 1.7850684127992443e-07, "loss": 0.74584782, "num_input_tokens_seen": 156261315, "step": 7223, "time_per_iteration": 2.76210880279541 }, { "auxiliary_loss_clip": 0.01322946, "auxiliary_loss_mlp": 0.01027469, "balance_loss_clip": 1.04738569, "balance_loss_mlp": 1.02040625, "epoch": 0.8686346419767931, "flos": 20084587228800.0, "grad_norm": 2.354893433993935, "language_loss": 0.7037437, "learning_rate": 1.7818529087710378e-07, "loss": 0.72724789, "num_input_tokens_seen": 156281670, "step": 7224, "time_per_iteration": 2.697479009628296 }, { "auxiliary_loss_clip": 0.01220591, "auxiliary_loss_mlp": 0.02565952, "balance_loss_clip": 1.04672146, "balance_loss_mlp": 0.99990398, "epoch": 0.8687548848674322, "flos": 18223516782720.0, "grad_norm": 2.1012541489526946, "language_loss": 0.84467167, "learning_rate": 1.7786401684212637e-07, "loss": 0.88253713, "num_input_tokens_seen": 156300500, "step": 7225, "time_per_iteration": 2.697335958480835 }, { "auxiliary_loss_clip": 0.0111628, "auxiliary_loss_mlp": 0.01001239, "balance_loss_clip": 1.00643122, "balance_loss_mlp": 1.00037479, "epoch": 0.8688751277580713, "flos": 70457885049600.0, "grad_norm": 0.7296660817934916, "language_loss": 0.55883682, "learning_rate": 1.7754301922372883e-07, "loss": 0.58001208, "num_input_tokens_seen": 156350145, "step": 7226, "time_per_iteration": 3.109631299972534 }, { "auxiliary_loss_clip": 0.01430915, "auxiliary_loss_mlp": 0.01026452, "balance_loss_clip": 1.04252911, "balance_loss_mlp": 1.01884913, "epoch": 0.8689953706487104, "flos": 26906788344960.0, "grad_norm": 2.857138800371409, "language_loss": 0.81374323, "learning_rate": 1.7722229807060617e-07, "loss": 0.83831692, "num_input_tokens_seen": 156368725, "step": 7227, "time_per_iteration": 2.8185858726501465 }, { "auxiliary_loss_clip": 0.01314012, "auxiliary_loss_mlp": 0.0102637, "balance_loss_clip": 1.03965974, "balance_loss_mlp": 1.01972961, "epoch": 0.8691156135393495, "flos": 34637385438720.0, "grad_norm": 2.3841697679748437, "language_loss": 0.82301927, "learning_rate": 1.7690185343141172e-07, "loss": 0.84642309, "num_input_tokens_seen": 156388640, "step": 7228, "time_per_iteration": 2.777899980545044 }, { "auxiliary_loss_clip": 0.01272073, "auxiliary_loss_mlp": 0.01022536, "balance_loss_clip": 1.04518461, "balance_loss_mlp": 1.01624775, "epoch": 0.8692358564299886, "flos": 18989814556800.0, "grad_norm": 1.9567845518691074, "language_loss": 0.69787502, "learning_rate": 1.7658168535475615e-07, "loss": 0.72082114, "num_input_tokens_seen": 156406425, "step": 7229, "time_per_iteration": 3.5387065410614014 }, { "auxiliary_loss_clip": 0.0127759, "auxiliary_loss_mlp": 0.01025039, "balance_loss_clip": 1.04890585, "balance_loss_mlp": 1.01808, "epoch": 0.8693560993206276, "flos": 30370839039360.0, "grad_norm": 1.5872083566348008, "language_loss": 0.64167351, "learning_rate": 1.7626179388920948e-07, "loss": 0.66469979, "num_input_tokens_seen": 156427705, "step": 7230, "time_per_iteration": 3.6919443607330322 }, { "auxiliary_loss_clip": 0.01271483, "auxiliary_loss_mlp": 0.02564464, "balance_loss_clip": 1.04727244, "balance_loss_mlp": 0.99991107, "epoch": 0.8694763422112668, "flos": 27200430028800.0, "grad_norm": 1.5744389768928813, "language_loss": 0.80669606, "learning_rate": 1.7594217908329866e-07, "loss": 0.84505558, "num_input_tokens_seen": 156449890, "step": 7231, "time_per_iteration": 2.690922260284424 }, { "auxiliary_loss_clip": 0.0126605, "auxiliary_loss_mlp": 0.01021751, "balance_loss_clip": 1.04622042, "balance_loss_mlp": 1.01503921, "epoch": 0.8695965851019059, "flos": 26139161767680.0, "grad_norm": 2.2527013836820804, "language_loss": 0.74184877, "learning_rate": 1.7562284098550895e-07, "loss": 0.76472676, "num_input_tokens_seen": 156469600, "step": 7232, "time_per_iteration": 3.666689872741699 }, { "auxiliary_loss_clip": 0.01161939, "auxiliary_loss_mlp": 0.01007506, "balance_loss_clip": 1.00853276, "balance_loss_mlp": 1.00662982, "epoch": 0.8697168279925449, "flos": 67332616456320.0, "grad_norm": 0.8307521206894776, "language_loss": 0.62137765, "learning_rate": 1.753037796442838e-07, "loss": 0.64307213, "num_input_tokens_seen": 156529040, "step": 7233, "time_per_iteration": 3.322922945022583 }, { "auxiliary_loss_clip": 0.01174568, "auxiliary_loss_mlp": 0.01026696, "balance_loss_clip": 1.048455, "balance_loss_mlp": 1.01903713, "epoch": 0.8698370708831841, "flos": 19718693337600.0, "grad_norm": 2.2346206949545393, "language_loss": 0.74853158, "learning_rate": 1.74984995108024e-07, "loss": 0.77054423, "num_input_tokens_seen": 156546970, "step": 7234, "time_per_iteration": 2.600470781326294 }, { "auxiliary_loss_clip": 0.0122288, "auxiliary_loss_mlp": 0.01025425, "balance_loss_clip": 1.0467602, "balance_loss_mlp": 1.01835299, "epoch": 0.8699573137738231, "flos": 12859971068160.0, "grad_norm": 4.470354721182958, "language_loss": 0.83636534, "learning_rate": 1.7466648742508981e-07, "loss": 0.85884833, "num_input_tokens_seen": 156563155, "step": 7235, "time_per_iteration": 2.6726980209350586 }, { "auxiliary_loss_clip": 0.01267972, "auxiliary_loss_mlp": 0.01026425, "balance_loss_clip": 1.04780841, "balance_loss_mlp": 1.01994872, "epoch": 0.8700775566644622, "flos": 17420733768960.0, "grad_norm": 1.844518998965995, "language_loss": 0.84796143, "learning_rate": 1.7434825664379837e-07, "loss": 0.87090534, "num_input_tokens_seen": 156581660, "step": 7236, "time_per_iteration": 3.5627617835998535 }, { "auxiliary_loss_clip": 0.01220219, "auxiliary_loss_mlp": 0.01025963, "balance_loss_clip": 1.04654479, "balance_loss_mlp": 1.0188638, "epoch": 0.8701977995551013, "flos": 13735221770880.0, "grad_norm": 2.7528237699909837, "language_loss": 0.85977274, "learning_rate": 1.740303028124246e-07, "loss": 0.88223457, "num_input_tokens_seen": 156597720, "step": 7237, "time_per_iteration": 2.554696559906006 }, { "auxiliary_loss_clip": 0.01422226, "auxiliary_loss_mlp": 0.01027109, "balance_loss_clip": 1.04100645, "balance_loss_mlp": 1.01995635, "epoch": 0.8703180424457404, "flos": 30555707362560.0, "grad_norm": 1.907717532918966, "language_loss": 0.75833392, "learning_rate": 1.7371262597920212e-07, "loss": 0.78282726, "num_input_tokens_seen": 156619780, "step": 7238, "time_per_iteration": 2.8358161449432373 }, { "auxiliary_loss_clip": 0.01366424, "auxiliary_loss_mlp": 0.01027645, "balance_loss_clip": 1.04654646, "balance_loss_mlp": 1.02074289, "epoch": 0.8704382853363795, "flos": 19608986223360.0, "grad_norm": 1.6763738937739623, "language_loss": 0.76380134, "learning_rate": 1.7339522619232195e-07, "loss": 0.78774202, "num_input_tokens_seen": 156638160, "step": 7239, "time_per_iteration": 2.718743324279785 }, { "auxiliary_loss_clip": 0.01279483, "auxiliary_loss_mlp": 0.01024748, "balance_loss_clip": 1.04472804, "balance_loss_mlp": 1.01727974, "epoch": 0.8705585282270186, "flos": 26613900846720.0, "grad_norm": 2.95064586235358, "language_loss": 0.75605273, "learning_rate": 1.730781034999338e-07, "loss": 0.77909505, "num_input_tokens_seen": 156659740, "step": 7240, "time_per_iteration": 2.682053804397583 }, { "auxiliary_loss_clip": 0.01172107, "auxiliary_loss_mlp": 0.01024519, "balance_loss_clip": 1.05070114, "balance_loss_mlp": 1.01748276, "epoch": 0.8706787711176577, "flos": 34090465979520.0, "grad_norm": 1.8093878797064162, "language_loss": 0.73487794, "learning_rate": 1.7276125795014497e-07, "loss": 0.75684422, "num_input_tokens_seen": 156678190, "step": 7241, "time_per_iteration": 2.6612532138824463 }, { "auxiliary_loss_clip": 0.01275451, "auxiliary_loss_mlp": 0.01026607, "balance_loss_clip": 1.04297733, "balance_loss_mlp": 1.0189476, "epoch": 0.8707990140082967, "flos": 14611513968000.0, "grad_norm": 3.284991836287034, "language_loss": 0.67517859, "learning_rate": 1.7244468959102054e-07, "loss": 0.69819915, "num_input_tokens_seen": 156695245, "step": 7242, "time_per_iteration": 2.6647346019744873 }, { "auxiliary_loss_clip": 0.01220585, "auxiliary_loss_mlp": 0.01028481, "balance_loss_clip": 1.04878211, "balance_loss_mlp": 1.02161813, "epoch": 0.8709192568989359, "flos": 20084156265600.0, "grad_norm": 7.582020195771247, "language_loss": 0.85003275, "learning_rate": 1.7212839847058348e-07, "loss": 0.87252343, "num_input_tokens_seen": 156710375, "step": 7243, "time_per_iteration": 2.600057363510132 }, { "auxiliary_loss_clip": 0.01328111, "auxiliary_loss_mlp": 0.0102046, "balance_loss_clip": 1.04162705, "balance_loss_mlp": 1.01402545, "epoch": 0.871039499789575, "flos": 16727083251840.0, "grad_norm": 1.7994326438096948, "language_loss": 0.74149191, "learning_rate": 1.718123846368147e-07, "loss": 0.76497757, "num_input_tokens_seen": 156729420, "step": 7244, "time_per_iteration": 2.7846667766571045 }, { "auxiliary_loss_clip": 0.01269765, "auxiliary_loss_mlp": 0.02563209, "balance_loss_clip": 1.04663563, "balance_loss_mlp": 0.99987864, "epoch": 0.871159742680214, "flos": 21068790860160.0, "grad_norm": 1.9345704187677966, "language_loss": 0.71840501, "learning_rate": 1.714966481376543e-07, "loss": 0.75673473, "num_input_tokens_seen": 156746100, "step": 7245, "time_per_iteration": 2.6568706035614014 }, { "auxiliary_loss_clip": 0.01220444, "auxiliary_loss_mlp": 0.01024039, "balance_loss_clip": 1.0465827, "balance_loss_mlp": 1.01723182, "epoch": 0.8712799855708532, "flos": 28256526731520.0, "grad_norm": 1.9553927113951548, "language_loss": 0.83097583, "learning_rate": 1.7118118902099797e-07, "loss": 0.85342056, "num_input_tokens_seen": 156764185, "step": 7246, "time_per_iteration": 2.6947813034057617 }, { "auxiliary_loss_clip": 0.01220809, "auxiliary_loss_mlp": 0.01027999, "balance_loss_clip": 1.04616344, "balance_loss_mlp": 1.02155542, "epoch": 0.8714002284614922, "flos": 22236677665920.0, "grad_norm": 2.060335975416442, "language_loss": 0.80804443, "learning_rate": 1.7086600733470146e-07, "loss": 0.83053255, "num_input_tokens_seen": 156784855, "step": 7247, "time_per_iteration": 2.6640751361846924 }, { "auxiliary_loss_clip": 0.01215002, "auxiliary_loss_mlp": 0.01026253, "balance_loss_clip": 1.04468143, "balance_loss_mlp": 1.01910377, "epoch": 0.8715204713521313, "flos": 21431919404160.0, "grad_norm": 1.7925051296795547, "language_loss": 0.77320999, "learning_rate": 1.7055110312657738e-07, "loss": 0.79562253, "num_input_tokens_seen": 156804350, "step": 7248, "time_per_iteration": 2.615983724594116 }, { "auxiliary_loss_clip": 0.01268852, "auxiliary_loss_mlp": 0.01028709, "balance_loss_clip": 1.04446793, "balance_loss_mlp": 1.0211091, "epoch": 0.8716407142427703, "flos": 23440439180160.0, "grad_norm": 2.0088631598205082, "language_loss": 0.74551201, "learning_rate": 1.702364764443962e-07, "loss": 0.76848763, "num_input_tokens_seen": 156823425, "step": 7249, "time_per_iteration": 2.6625828742980957 }, { "auxiliary_loss_clip": 0.01417326, "auxiliary_loss_mlp": 0.01027873, "balance_loss_clip": 1.03928089, "balance_loss_mlp": 1.02108455, "epoch": 0.8717609571334095, "flos": 27958683156480.0, "grad_norm": 1.8395805246892423, "language_loss": 0.72538871, "learning_rate": 1.6992212733588685e-07, "loss": 0.74984074, "num_input_tokens_seen": 156843090, "step": 7250, "time_per_iteration": 2.8678717613220215 }, { "auxiliary_loss_clip": 0.01270488, "auxiliary_loss_mlp": 0.01023054, "balance_loss_clip": 1.0447228, "balance_loss_mlp": 1.01644039, "epoch": 0.8718812000240486, "flos": 25479482538240.0, "grad_norm": 1.8938928233976158, "language_loss": 0.75003284, "learning_rate": 1.6960805584873538e-07, "loss": 0.77296823, "num_input_tokens_seen": 156861090, "step": 7251, "time_per_iteration": 2.756009340286255 }, { "auxiliary_loss_clip": 0.01369032, "auxiliary_loss_mlp": 0.01023578, "balance_loss_clip": 1.0425061, "balance_loss_mlp": 1.01686668, "epoch": 0.8720014429146876, "flos": 23403056100480.0, "grad_norm": 3.8429575705826595, "language_loss": 0.78364843, "learning_rate": 1.6929426203058684e-07, "loss": 0.80757457, "num_input_tokens_seen": 156881515, "step": 7252, "time_per_iteration": 2.7248144149780273 }, { "auxiliary_loss_clip": 0.01176766, "auxiliary_loss_mlp": 0.02571456, "balance_loss_clip": 1.04851806, "balance_loss_mlp": 0.999915, "epoch": 0.8721216858053268, "flos": 24352821567360.0, "grad_norm": 2.4386539138653895, "language_loss": 0.8014226, "learning_rate": 1.689807459290431e-07, "loss": 0.8389048, "num_input_tokens_seen": 156900170, "step": 7253, "time_per_iteration": 2.6701109409332275 }, { "auxiliary_loss_clip": 0.01273262, "auxiliary_loss_mlp": 0.01024103, "balance_loss_clip": 1.04769421, "balance_loss_mlp": 1.01726305, "epoch": 0.8722419286959658, "flos": 33869687034240.0, "grad_norm": 1.9298300059642002, "language_loss": 0.70986384, "learning_rate": 1.6866750759166437e-07, "loss": 0.73283744, "num_input_tokens_seen": 156920150, "step": 7254, "time_per_iteration": 2.7512967586517334 }, { "auxiliary_loss_clip": 0.0131686, "auxiliary_loss_mlp": 0.0102531, "balance_loss_clip": 1.03945029, "balance_loss_mlp": 1.018116, "epoch": 0.8723621715866049, "flos": 18369385914240.0, "grad_norm": 2.750076820587762, "language_loss": 0.77331686, "learning_rate": 1.6835454706596865e-07, "loss": 0.79673856, "num_input_tokens_seen": 156937980, "step": 7255, "time_per_iteration": 2.6963298320770264 }, { "auxiliary_loss_clip": 0.01174591, "auxiliary_loss_mlp": 0.01031102, "balance_loss_clip": 1.0504607, "balance_loss_mlp": 1.02398872, "epoch": 0.8724824144772441, "flos": 22013348855040.0, "grad_norm": 2.416825719507602, "language_loss": 0.73906958, "learning_rate": 1.680418643994317e-07, "loss": 0.76112658, "num_input_tokens_seen": 156956550, "step": 7256, "time_per_iteration": 4.452859878540039 }, { "auxiliary_loss_clip": 0.01060727, "auxiliary_loss_mlp": 0.00999338, "balance_loss_clip": 1.00648403, "balance_loss_mlp": 0.99850982, "epoch": 0.8726026573678831, "flos": 66698720213760.0, "grad_norm": 0.8828002334620544, "language_loss": 0.64503884, "learning_rate": 1.6772945963948738e-07, "loss": 0.66563952, "num_input_tokens_seen": 157014715, "step": 7257, "time_per_iteration": 3.2035293579101562 }, { "auxiliary_loss_clip": 0.01270502, "auxiliary_loss_mlp": 0.01023286, "balance_loss_clip": 1.04788411, "balance_loss_mlp": 1.01599002, "epoch": 0.8727229002585222, "flos": 13370908078080.0, "grad_norm": 2.1885039435933886, "language_loss": 0.77242756, "learning_rate": 1.6741733283352733e-07, "loss": 0.79536545, "num_input_tokens_seen": 157032320, "step": 7258, "time_per_iteration": 3.5899226665496826 }, { "auxiliary_loss_clip": 0.01270205, "auxiliary_loss_mlp": 0.01024762, "balance_loss_clip": 1.04418921, "balance_loss_mlp": 1.01760912, "epoch": 0.8728431431491613, "flos": 21796987282560.0, "grad_norm": 1.7900337572931107, "language_loss": 0.83853173, "learning_rate": 1.6710548402890102e-07, "loss": 0.86148143, "num_input_tokens_seen": 157052845, "step": 7259, "time_per_iteration": 2.739159107208252 }, { "auxiliary_loss_clip": 0.01176097, "auxiliary_loss_mlp": 0.01022599, "balance_loss_clip": 1.04804611, "balance_loss_mlp": 1.01551461, "epoch": 0.8729633860398004, "flos": 36173823742080.0, "grad_norm": 2.223322150686324, "language_loss": 0.66612101, "learning_rate": 1.6679391327291527e-07, "loss": 0.68810797, "num_input_tokens_seen": 157074050, "step": 7260, "time_per_iteration": 2.678316116333008 }, { "auxiliary_loss_clip": 0.0127332, "auxiliary_loss_mlp": 0.01025121, "balance_loss_clip": 1.04380715, "balance_loss_mlp": 1.01894307, "epoch": 0.8730836289304394, "flos": 16359680989440.0, "grad_norm": 2.4735930975529414, "language_loss": 0.68422478, "learning_rate": 1.6648262061283492e-07, "loss": 0.70720917, "num_input_tokens_seen": 157089350, "step": 7261, "time_per_iteration": 2.6932668685913086 }, { "auxiliary_loss_clip": 0.01319276, "auxiliary_loss_mlp": 0.01025382, "balance_loss_clip": 1.04093575, "balance_loss_mlp": 1.018399, "epoch": 0.8732038718210786, "flos": 21215126868480.0, "grad_norm": 1.9078543599500044, "language_loss": 0.73559558, "learning_rate": 1.6617160609588353e-07, "loss": 0.75904226, "num_input_tokens_seen": 157108525, "step": 7262, "time_per_iteration": 3.6529572010040283 }, { "auxiliary_loss_clip": 0.01171702, "auxiliary_loss_mlp": 0.0102869, "balance_loss_clip": 1.04551888, "balance_loss_mlp": 1.02229691, "epoch": 0.8733241147117177, "flos": 16610696208000.0, "grad_norm": 2.1562650723897008, "language_loss": 0.7191788, "learning_rate": 1.6586086976924163e-07, "loss": 0.74118268, "num_input_tokens_seen": 157124025, "step": 7263, "time_per_iteration": 2.6637539863586426 }, { "auxiliary_loss_clip": 0.01222308, "auxiliary_loss_mlp": 0.01027989, "balance_loss_clip": 1.04593682, "balance_loss_mlp": 1.02113748, "epoch": 0.8734443576023567, "flos": 20193935207040.0, "grad_norm": 1.8215966790633111, "language_loss": 0.7845965, "learning_rate": 1.6555041168004747e-07, "loss": 0.80709946, "num_input_tokens_seen": 157143345, "step": 7264, "time_per_iteration": 2.5863094329833984 }, { "auxiliary_loss_clip": 0.01267126, "auxiliary_loss_mlp": 0.01024525, "balance_loss_clip": 1.04456401, "balance_loss_mlp": 1.01807904, "epoch": 0.8735646004929959, "flos": 18041162411520.0, "grad_norm": 1.9797178599614467, "language_loss": 0.69295871, "learning_rate": 1.6524023187539715e-07, "loss": 0.71587515, "num_input_tokens_seen": 157161630, "step": 7265, "time_per_iteration": 2.67750883102417 }, { "auxiliary_loss_clip": 0.01273386, "auxiliary_loss_mlp": 0.01027046, "balance_loss_clip": 1.04630828, "balance_loss_mlp": 1.01991165, "epoch": 0.873684843383635, "flos": 20262344659200.0, "grad_norm": 1.7738626872033276, "language_loss": 0.74653912, "learning_rate": 1.649303304023446e-07, "loss": 0.76954353, "num_input_tokens_seen": 157181385, "step": 7266, "time_per_iteration": 2.6677963733673096 }, { "auxiliary_loss_clip": 0.01312236, "auxiliary_loss_mlp": 0.0102937, "balance_loss_clip": 1.04609776, "balance_loss_mlp": 1.02278078, "epoch": 0.873805086274274, "flos": 16947287579520.0, "grad_norm": 1.8545058978768867, "language_loss": 0.7872479, "learning_rate": 1.6462070730790246e-07, "loss": 0.81066394, "num_input_tokens_seen": 157200545, "step": 7267, "time_per_iteration": 2.6584227085113525 }, { "auxiliary_loss_clip": 0.01266409, "auxiliary_loss_mlp": 0.01023359, "balance_loss_clip": 1.04095614, "balance_loss_mlp": 1.01631987, "epoch": 0.8739253291649132, "flos": 18041270152320.0, "grad_norm": 3.0174759446470403, "language_loss": 0.7881223, "learning_rate": 1.6431136263903912e-07, "loss": 0.81101996, "num_input_tokens_seen": 157219545, "step": 7268, "time_per_iteration": 2.600555658340454 }, { "auxiliary_loss_clip": 0.01224922, "auxiliary_loss_mlp": 0.0256625, "balance_loss_clip": 1.04473019, "balance_loss_mlp": 0.99993038, "epoch": 0.8740455720555522, "flos": 21325085377920.0, "grad_norm": 1.942263662819865, "language_loss": 0.73599565, "learning_rate": 1.6400229644268282e-07, "loss": 0.7739073, "num_input_tokens_seen": 157237900, "step": 7269, "time_per_iteration": 2.6496424674987793 }, { "auxiliary_loss_clip": 0.01322431, "auxiliary_loss_mlp": 0.01027986, "balance_loss_clip": 1.05026722, "balance_loss_mlp": 1.02079141, "epoch": 0.8741658149461913, "flos": 15158684822400.0, "grad_norm": 2.0584340386345983, "language_loss": 0.81325531, "learning_rate": 1.6369350876571852e-07, "loss": 0.83675945, "num_input_tokens_seen": 157256055, "step": 7270, "time_per_iteration": 2.67555570602417 }, { "auxiliary_loss_clip": 0.01369423, "auxiliary_loss_mlp": 0.01026595, "balance_loss_clip": 1.0412581, "balance_loss_mlp": 1.02013397, "epoch": 0.8742860578368304, "flos": 23039855729280.0, "grad_norm": 4.557980655882612, "language_loss": 0.8191613, "learning_rate": 1.6338499965498874e-07, "loss": 0.84312147, "num_input_tokens_seen": 157274785, "step": 7271, "time_per_iteration": 2.7363264560699463 }, { "auxiliary_loss_clip": 0.0132556, "auxiliary_loss_mlp": 0.01024451, "balance_loss_clip": 1.04538369, "balance_loss_mlp": 1.01753128, "epoch": 0.8744063007274695, "flos": 28145347159680.0, "grad_norm": 1.4632771711812924, "language_loss": 0.77353054, "learning_rate": 1.630767691572943e-07, "loss": 0.79703069, "num_input_tokens_seen": 157294805, "step": 7272, "time_per_iteration": 2.80120587348938 }, { "auxiliary_loss_clip": 0.01171166, "auxiliary_loss_mlp": 0.01001065, "balance_loss_clip": 1.00675893, "balance_loss_mlp": 1.00014675, "epoch": 0.8745265436181086, "flos": 64034076654720.0, "grad_norm": 0.8339325195841324, "language_loss": 0.5341382, "learning_rate": 1.6276881731939306e-07, "loss": 0.55586052, "num_input_tokens_seen": 157356695, "step": 7273, "time_per_iteration": 3.2606029510498047 }, { "auxiliary_loss_clip": 0.01218223, "auxiliary_loss_mlp": 0.01025843, "balance_loss_clip": 1.04792285, "balance_loss_mlp": 1.01883054, "epoch": 0.8746467865087477, "flos": 28658618553600.0, "grad_norm": 1.8224837514830876, "language_loss": 0.7533682, "learning_rate": 1.6246114418800193e-07, "loss": 0.77580881, "num_input_tokens_seen": 157376975, "step": 7274, "time_per_iteration": 2.6761975288391113 }, { "auxiliary_loss_clip": 0.01221452, "auxiliary_loss_mlp": 0.01026175, "balance_loss_clip": 1.04632854, "balance_loss_mlp": 1.01874197, "epoch": 0.8747670293993868, "flos": 23985850268160.0, "grad_norm": 1.9455384201456611, "language_loss": 0.76626617, "learning_rate": 1.6215374980979423e-07, "loss": 0.78874248, "num_input_tokens_seen": 157397385, "step": 7275, "time_per_iteration": 2.6318647861480713 }, { "auxiliary_loss_clip": 0.01218759, "auxiliary_loss_mlp": 0.01027233, "balance_loss_clip": 1.0484128, "balance_loss_mlp": 1.02052116, "epoch": 0.8748872722900258, "flos": 45221624478720.0, "grad_norm": 1.9276114521344512, "language_loss": 0.68974984, "learning_rate": 1.6184663423140133e-07, "loss": 0.71220976, "num_input_tokens_seen": 157417685, "step": 7276, "time_per_iteration": 2.848367929458618 }, { "auxiliary_loss_clip": 0.01374198, "auxiliary_loss_mlp": 0.01027523, "balance_loss_clip": 1.04404128, "balance_loss_mlp": 1.02061749, "epoch": 0.875007515180665, "flos": 19754280737280.0, "grad_norm": 1.848551771749569, "language_loss": 0.63806665, "learning_rate": 1.615397974994126e-07, "loss": 0.66208386, "num_input_tokens_seen": 157435490, "step": 7277, "time_per_iteration": 2.7555747032165527 }, { "auxiliary_loss_clip": 0.01169141, "auxiliary_loss_mlp": 0.01024989, "balance_loss_clip": 1.04734969, "balance_loss_mlp": 1.01846516, "epoch": 0.875127758071304, "flos": 22710734386560.0, "grad_norm": 1.6068096034591661, "language_loss": 0.80852932, "learning_rate": 1.6123323966037438e-07, "loss": 0.83047062, "num_input_tokens_seen": 157454010, "step": 7278, "time_per_iteration": 2.6317808628082275 }, { "auxiliary_loss_clip": 0.01174966, "auxiliary_loss_mlp": 0.01027775, "balance_loss_clip": 1.05132401, "balance_loss_mlp": 1.02097452, "epoch": 0.8752480009619431, "flos": 23403846199680.0, "grad_norm": 3.3867422334282935, "language_loss": 0.78866166, "learning_rate": 1.6092696076079216e-07, "loss": 0.81068909, "num_input_tokens_seen": 157472385, "step": 7279, "time_per_iteration": 2.5973289012908936 }, { "auxiliary_loss_clip": 0.01313119, "auxiliary_loss_mlp": 0.01022257, "balance_loss_clip": 1.04460979, "balance_loss_mlp": 1.01555157, "epoch": 0.8753682438525822, "flos": 26213101914240.0, "grad_norm": 1.7481174452637855, "language_loss": 0.73850811, "learning_rate": 1.6062096084712785e-07, "loss": 0.76186192, "num_input_tokens_seen": 157493735, "step": 7280, "time_per_iteration": 2.6817333698272705 }, { "auxiliary_loss_clip": 0.01268967, "auxiliary_loss_mlp": 0.02567918, "balance_loss_clip": 1.04264426, "balance_loss_mlp": 0.99990058, "epoch": 0.8754884867432213, "flos": 23326745656320.0, "grad_norm": 7.115094618798673, "language_loss": 0.70583993, "learning_rate": 1.6031523996580098e-07, "loss": 0.74420881, "num_input_tokens_seen": 157511295, "step": 7281, "time_per_iteration": 2.688387870788574 }, { "auxiliary_loss_clip": 0.01329205, "auxiliary_loss_mlp": 0.01023538, "balance_loss_clip": 1.04601002, "balance_loss_mlp": 1.0159204, "epoch": 0.8756087296338604, "flos": 12495226412160.0, "grad_norm": 2.2849786327536568, "language_loss": 0.66366887, "learning_rate": 1.6000979816318981e-07, "loss": 0.68719625, "num_input_tokens_seen": 157529760, "step": 7282, "time_per_iteration": 4.469428062438965 }, { "auxiliary_loss_clip": 0.01218822, "auxiliary_loss_mlp": 0.01024163, "balance_loss_clip": 1.04780579, "balance_loss_mlp": 1.01679552, "epoch": 0.8757289725244994, "flos": 18952898353920.0, "grad_norm": 3.1100181118712733, "language_loss": 0.75120068, "learning_rate": 1.5970463548562886e-07, "loss": 0.7736305, "num_input_tokens_seen": 157548915, "step": 7283, "time_per_iteration": 2.605156183242798 }, { "auxiliary_loss_clip": 0.01267633, "auxiliary_loss_mlp": 0.01020066, "balance_loss_clip": 1.04705477, "balance_loss_mlp": 1.01335168, "epoch": 0.8758492154151386, "flos": 25265958140160.0, "grad_norm": 1.7796112309965653, "language_loss": 0.71193957, "learning_rate": 1.5939975197941192e-07, "loss": 0.73481655, "num_input_tokens_seen": 157570570, "step": 7284, "time_per_iteration": 2.7326533794403076 }, { "auxiliary_loss_clip": 0.01169671, "auxiliary_loss_mlp": 0.01003299, "balance_loss_clip": 1.00656867, "balance_loss_mlp": 1.00248861, "epoch": 0.8759694583057777, "flos": 65571664193280.0, "grad_norm": 0.8133505225788741, "language_loss": 0.53286684, "learning_rate": 1.5909514769078892e-07, "loss": 0.55459654, "num_input_tokens_seen": 157635675, "step": 7285, "time_per_iteration": 4.215396404266357 }, { "auxiliary_loss_clip": 0.01321804, "auxiliary_loss_mlp": 0.01026084, "balance_loss_clip": 1.05024314, "balance_loss_mlp": 1.01937866, "epoch": 0.8760897011964167, "flos": 25446193608960.0, "grad_norm": 2.006905225073446, "language_loss": 0.77939236, "learning_rate": 1.5879082266596867e-07, "loss": 0.80287123, "num_input_tokens_seen": 157657015, "step": 7286, "time_per_iteration": 2.726262092590332 }, { "auxiliary_loss_clip": 0.01267166, "auxiliary_loss_mlp": 0.01025139, "balance_loss_clip": 1.04186678, "balance_loss_mlp": 1.01859713, "epoch": 0.8762099440870559, "flos": 28984830894720.0, "grad_norm": 1.8141115377924155, "language_loss": 0.71769762, "learning_rate": 1.5848677695111645e-07, "loss": 0.74062061, "num_input_tokens_seen": 157678615, "step": 7287, "time_per_iteration": 2.756207227706909 }, { "auxiliary_loss_clip": 0.01330101, "auxiliary_loss_mlp": 0.01024369, "balance_loss_clip": 1.04833555, "balance_loss_mlp": 1.01685548, "epoch": 0.8763301869776949, "flos": 21609461352960.0, "grad_norm": 2.413527128464261, "language_loss": 0.69531393, "learning_rate": 1.5818301059235562e-07, "loss": 0.71885866, "num_input_tokens_seen": 157693790, "step": 7288, "time_per_iteration": 3.6210267543792725 }, { "auxiliary_loss_clip": 0.01273891, "auxiliary_loss_mlp": 0.01024387, "balance_loss_clip": 1.04801297, "balance_loss_mlp": 1.01759839, "epoch": 0.876450429868334, "flos": 24644416176000.0, "grad_norm": 2.023849907593808, "language_loss": 0.81233525, "learning_rate": 1.578795236357684e-07, "loss": 0.83531803, "num_input_tokens_seen": 157715255, "step": 7289, "time_per_iteration": 2.7239997386932373 }, { "auxiliary_loss_clip": 0.01271581, "auxiliary_loss_mlp": 0.01026746, "balance_loss_clip": 1.04691005, "balance_loss_mlp": 1.01974201, "epoch": 0.8765706727589732, "flos": 20260046188800.0, "grad_norm": 2.0968628138203234, "language_loss": 0.85690475, "learning_rate": 1.5757631612739218e-07, "loss": 0.879888, "num_input_tokens_seen": 157728800, "step": 7290, "time_per_iteration": 2.617967128753662 }, { "auxiliary_loss_clip": 0.0106069, "auxiliary_loss_mlp": 0.01000348, "balance_loss_clip": 1.00632918, "balance_loss_mlp": 0.99949598, "epoch": 0.8766909156496122, "flos": 71371165276800.0, "grad_norm": 0.7783074441011061, "language_loss": 0.61382759, "learning_rate": 1.572733881132242e-07, "loss": 0.63443798, "num_input_tokens_seen": 157789445, "step": 7291, "time_per_iteration": 3.2222490310668945 }, { "auxiliary_loss_clip": 0.01216268, "auxiliary_loss_mlp": 0.01008522, "balance_loss_clip": 1.01112175, "balance_loss_mlp": 1.00766373, "epoch": 0.8768111585402513, "flos": 69523490603520.0, "grad_norm": 0.7827386440960511, "language_loss": 0.58476287, "learning_rate": 1.5697073963921814e-07, "loss": 0.60701078, "num_input_tokens_seen": 157848685, "step": 7292, "time_per_iteration": 3.1866157054901123 }, { "auxiliary_loss_clip": 0.01224936, "auxiliary_loss_mlp": 0.01028037, "balance_loss_clip": 1.0495975, "balance_loss_mlp": 1.02033305, "epoch": 0.8769314014308904, "flos": 18838558385280.0, "grad_norm": 2.450665244400367, "language_loss": 0.84910476, "learning_rate": 1.566683707512857e-07, "loss": 0.87163448, "num_input_tokens_seen": 157866360, "step": 7293, "time_per_iteration": 2.6184444427490234 }, { "auxiliary_loss_clip": 0.01269196, "auxiliary_loss_mlp": 0.01030354, "balance_loss_clip": 1.04463887, "balance_loss_mlp": 1.02333879, "epoch": 0.8770516443215295, "flos": 14976402278400.0, "grad_norm": 2.4457691024074744, "language_loss": 0.79830199, "learning_rate": 1.5636628149529553e-07, "loss": 0.82129753, "num_input_tokens_seen": 157884150, "step": 7294, "time_per_iteration": 2.6317007541656494 }, { "auxiliary_loss_clip": 0.01268308, "auxiliary_loss_mlp": 0.01027283, "balance_loss_clip": 1.04313982, "balance_loss_mlp": 1.02057147, "epoch": 0.8771718872121685, "flos": 31649654021760.0, "grad_norm": 2.2082863910193744, "language_loss": 0.79929906, "learning_rate": 1.560644719170743e-07, "loss": 0.82225502, "num_input_tokens_seen": 157905020, "step": 7295, "time_per_iteration": 2.726923704147339 }, { "auxiliary_loss_clip": 0.01321973, "auxiliary_loss_mlp": 0.01024219, "balance_loss_clip": 1.04176927, "balance_loss_mlp": 1.01692641, "epoch": 0.8772921301028077, "flos": 36095466222720.0, "grad_norm": 1.9077670558609257, "language_loss": 0.72302043, "learning_rate": 1.5576294206240692e-07, "loss": 0.74648237, "num_input_tokens_seen": 157924545, "step": 7296, "time_per_iteration": 2.856168508529663 }, { "auxiliary_loss_clip": 0.0127575, "auxiliary_loss_mlp": 0.0102355, "balance_loss_clip": 1.04772949, "balance_loss_mlp": 1.01693642, "epoch": 0.8774123729934468, "flos": 57116961849600.0, "grad_norm": 1.8683403877625862, "language_loss": 0.67743069, "learning_rate": 1.5546169197703507e-07, "loss": 0.70042366, "num_input_tokens_seen": 157950820, "step": 7297, "time_per_iteration": 2.9696688652038574 }, { "auxiliary_loss_clip": 0.01276347, "auxiliary_loss_mlp": 0.01028492, "balance_loss_clip": 1.04461884, "balance_loss_mlp": 1.02187634, "epoch": 0.8775326158840858, "flos": 23914495900800.0, "grad_norm": 2.4038268669269818, "language_loss": 0.77404141, "learning_rate": 1.5516072170665774e-07, "loss": 0.79708982, "num_input_tokens_seen": 157968790, "step": 7298, "time_per_iteration": 2.7235536575317383 }, { "auxiliary_loss_clip": 0.01224766, "auxiliary_loss_mlp": 0.01025008, "balance_loss_clip": 1.04871821, "balance_loss_mlp": 1.01829374, "epoch": 0.877652858774725, "flos": 17123285243520.0, "grad_norm": 1.9333151642873636, "language_loss": 0.87116337, "learning_rate": 1.5486003129693214e-07, "loss": 0.89366102, "num_input_tokens_seen": 157986155, "step": 7299, "time_per_iteration": 2.5912039279937744 }, { "auxiliary_loss_clip": 0.01223457, "auxiliary_loss_mlp": 0.010222, "balance_loss_clip": 1.0472039, "balance_loss_mlp": 1.0151639, "epoch": 0.877773101665364, "flos": 16508961912960.0, "grad_norm": 2.243533712588285, "language_loss": 0.7809909, "learning_rate": 1.545596207934725e-07, "loss": 0.80344748, "num_input_tokens_seen": 158004640, "step": 7300, "time_per_iteration": 2.6126456260681152 }, { "auxiliary_loss_clip": 0.01270363, "auxiliary_loss_mlp": 0.01025253, "balance_loss_clip": 1.04544926, "balance_loss_mlp": 1.01905704, "epoch": 0.8778933445560031, "flos": 22053209973120.0, "grad_norm": 2.0104849594721306, "language_loss": 0.77675992, "learning_rate": 1.5425949024185147e-07, "loss": 0.79971606, "num_input_tokens_seen": 158024665, "step": 7301, "time_per_iteration": 2.7162909507751465 }, { "auxiliary_loss_clip": 0.01272232, "auxiliary_loss_mlp": 0.01024449, "balance_loss_clip": 1.04381895, "balance_loss_mlp": 1.01789522, "epoch": 0.8780135874466423, "flos": 22564757514240.0, "grad_norm": 1.8750784925296298, "language_loss": 0.67638093, "learning_rate": 1.5395963968759818e-07, "loss": 0.69934773, "num_input_tokens_seen": 158044940, "step": 7302, "time_per_iteration": 2.6975655555725098 }, { "auxiliary_loss_clip": 0.01266572, "auxiliary_loss_mlp": 0.0102531, "balance_loss_clip": 1.04156983, "balance_loss_mlp": 1.01907516, "epoch": 0.8781338303372813, "flos": 61531999073280.0, "grad_norm": 1.4673151012914587, "language_loss": 0.64401066, "learning_rate": 1.536600691761998e-07, "loss": 0.66692936, "num_input_tokens_seen": 158070770, "step": 7303, "time_per_iteration": 3.0682790279388428 }, { "auxiliary_loss_clip": 0.01322387, "auxiliary_loss_mlp": 0.01023033, "balance_loss_clip": 1.0462842, "balance_loss_mlp": 1.0163691, "epoch": 0.8782540732279204, "flos": 22674751937280.0, "grad_norm": 2.7171434043088065, "language_loss": 0.7171151, "learning_rate": 1.5336077875310084e-07, "loss": 0.74056929, "num_input_tokens_seen": 158089995, "step": 7304, "time_per_iteration": 2.7175538539886475 }, { "auxiliary_loss_clip": 0.01371832, "auxiliary_loss_mlp": 0.01026543, "balance_loss_clip": 1.04283142, "balance_loss_mlp": 1.019912, "epoch": 0.8783743161185595, "flos": 16070348937600.0, "grad_norm": 3.059271401542427, "language_loss": 0.74304777, "learning_rate": 1.5306176846370321e-07, "loss": 0.76703155, "num_input_tokens_seen": 158108140, "step": 7305, "time_per_iteration": 2.7502636909484863 }, { "auxiliary_loss_clip": 0.01277572, "auxiliary_loss_mlp": 0.01026497, "balance_loss_clip": 1.04445934, "balance_loss_mlp": 1.01932108, "epoch": 0.8784945590091986, "flos": 26067879227520.0, "grad_norm": 2.236616860863976, "language_loss": 0.7415821, "learning_rate": 1.5276303835336712e-07, "loss": 0.76462281, "num_input_tokens_seen": 158128680, "step": 7306, "time_per_iteration": 2.6747007369995117 }, { "auxiliary_loss_clip": 0.01116264, "auxiliary_loss_mlp": 0.0100159, "balance_loss_clip": 1.00652575, "balance_loss_mlp": 1.00076103, "epoch": 0.8786148018998376, "flos": 62720643939840.0, "grad_norm": 0.757177431661679, "language_loss": 0.5345735, "learning_rate": 1.524645884674094e-07, "loss": 0.55575204, "num_input_tokens_seen": 158185610, "step": 7307, "time_per_iteration": 3.170609474182129 }, { "auxiliary_loss_clip": 0.01174657, "auxiliary_loss_mlp": 0.02571, "balance_loss_clip": 1.04856467, "balance_loss_mlp": 0.99989963, "epoch": 0.8787350447904768, "flos": 21652734263040.0, "grad_norm": 2.416323688407495, "language_loss": 0.79243779, "learning_rate": 1.521664188511047e-07, "loss": 0.8298943, "num_input_tokens_seen": 158205635, "step": 7308, "time_per_iteration": 4.4936747550964355 }, { "auxiliary_loss_clip": 0.01279163, "auxiliary_loss_mlp": 0.02563836, "balance_loss_clip": 1.05235004, "balance_loss_mlp": 0.99990362, "epoch": 0.8788552876811159, "flos": 25478476957440.0, "grad_norm": 2.235931855373344, "language_loss": 0.80042064, "learning_rate": 1.518685295496851e-07, "loss": 0.83885062, "num_input_tokens_seen": 158223495, "step": 7309, "time_per_iteration": 2.7380194664001465 }, { "auxiliary_loss_clip": 0.0122243, "auxiliary_loss_mlp": 0.01023663, "balance_loss_clip": 1.04631281, "balance_loss_mlp": 1.01711893, "epoch": 0.8789755305717549, "flos": 22310222762880.0, "grad_norm": 1.9847423514176032, "language_loss": 0.85453618, "learning_rate": 1.5157092060833975e-07, "loss": 0.87699711, "num_input_tokens_seen": 158243145, "step": 7310, "time_per_iteration": 2.672056198120117 }, { "auxiliary_loss_clip": 0.01172144, "auxiliary_loss_mlp": 0.01026223, "balance_loss_clip": 1.04413879, "balance_loss_mlp": 1.01924038, "epoch": 0.879095773462394, "flos": 29310971408640.0, "grad_norm": 1.8656145751471023, "language_loss": 0.6616869, "learning_rate": 1.5127359207221658e-07, "loss": 0.68367058, "num_input_tokens_seen": 158262625, "step": 7311, "time_per_iteration": 3.6691126823425293 }, { "auxiliary_loss_clip": 0.01417892, "auxiliary_loss_mlp": 0.01025496, "balance_loss_clip": 1.03726077, "balance_loss_mlp": 1.01802731, "epoch": 0.8792160163530331, "flos": 16690023394560.0, "grad_norm": 2.284784536246755, "language_loss": 0.73737812, "learning_rate": 1.5097654398641923e-07, "loss": 0.76181197, "num_input_tokens_seen": 158280530, "step": 7312, "time_per_iteration": 2.7443835735321045 }, { "auxiliary_loss_clip": 0.01126546, "auxiliary_loss_mlp": 0.01023941, "balance_loss_clip": 1.0489192, "balance_loss_mlp": 1.01705635, "epoch": 0.8793362592436722, "flos": 24499301230080.0, "grad_norm": 1.6138612460199269, "language_loss": 0.73218554, "learning_rate": 1.5067977639601014e-07, "loss": 0.75369042, "num_input_tokens_seen": 158303290, "step": 7313, "time_per_iteration": 2.7289884090423584 }, { "auxiliary_loss_clip": 0.01269335, "auxiliary_loss_mlp": 0.01022469, "balance_loss_clip": 1.04632056, "balance_loss_mlp": 1.01563203, "epoch": 0.8794565021343113, "flos": 14538399834240.0, "grad_norm": 3.1865915987335134, "language_loss": 0.71207809, "learning_rate": 1.5038328934600864e-07, "loss": 0.73499614, "num_input_tokens_seen": 158319925, "step": 7314, "time_per_iteration": 3.540827751159668 }, { "auxiliary_loss_clip": 0.01271439, "auxiliary_loss_mlp": 0.01027412, "balance_loss_clip": 1.04894388, "balance_loss_mlp": 1.02056623, "epoch": 0.8795767450249504, "flos": 39530286224640.0, "grad_norm": 1.8487222649044321, "language_loss": 0.70591855, "learning_rate": 1.5008708288139161e-07, "loss": 0.72890711, "num_input_tokens_seen": 158342285, "step": 7315, "time_per_iteration": 2.8140828609466553 }, { "auxiliary_loss_clip": 0.0122675, "auxiliary_loss_mlp": 0.01020278, "balance_loss_clip": 1.05126977, "balance_loss_mlp": 1.012941, "epoch": 0.8796969879155895, "flos": 22960672197120.0, "grad_norm": 2.176562650504814, "language_loss": 0.73531139, "learning_rate": 1.497911570470931e-07, "loss": 0.75778168, "num_input_tokens_seen": 158362290, "step": 7316, "time_per_iteration": 2.6969447135925293 }, { "auxiliary_loss_clip": 0.01312801, "auxiliary_loss_mlp": 0.01029162, "balance_loss_clip": 1.04350853, "balance_loss_mlp": 1.0223316, "epoch": 0.8798172308062285, "flos": 28362427004160.0, "grad_norm": 1.97484434722781, "language_loss": 0.85554618, "learning_rate": 1.494955118880048e-07, "loss": 0.87896585, "num_input_tokens_seen": 158383275, "step": 7317, "time_per_iteration": 2.7322680950164795 }, { "auxiliary_loss_clip": 0.01222855, "auxiliary_loss_mlp": 0.01022792, "balance_loss_clip": 1.04731596, "balance_loss_mlp": 1.01583934, "epoch": 0.8799374736968677, "flos": 23988974751360.0, "grad_norm": 1.7813382610153483, "language_loss": 0.72909725, "learning_rate": 1.4920014744897634e-07, "loss": 0.75155377, "num_input_tokens_seen": 158402690, "step": 7318, "time_per_iteration": 2.7537519931793213 }, { "auxiliary_loss_clip": 0.01267439, "auxiliary_loss_mlp": 0.01027926, "balance_loss_clip": 1.04510856, "balance_loss_mlp": 1.02099419, "epoch": 0.8800577165875068, "flos": 25630271832960.0, "grad_norm": 2.1973758673051678, "language_loss": 0.86115426, "learning_rate": 1.4890506377481392e-07, "loss": 0.88410795, "num_input_tokens_seen": 158421780, "step": 7319, "time_per_iteration": 2.6970462799072266 }, { "auxiliary_loss_clip": 0.01326207, "auxiliary_loss_mlp": 0.01023334, "balance_loss_clip": 1.04631019, "balance_loss_mlp": 1.01680732, "epoch": 0.8801779594781458, "flos": 23440331439360.0, "grad_norm": 1.5537291353551215, "language_loss": 0.63945633, "learning_rate": 1.486102609102815e-07, "loss": 0.66295171, "num_input_tokens_seen": 158442330, "step": 7320, "time_per_iteration": 2.8720717430114746 }, { "auxiliary_loss_clip": 0.01265032, "auxiliary_loss_mlp": 0.01026538, "balance_loss_clip": 1.04478431, "balance_loss_mlp": 1.0196805, "epoch": 0.880298202368785, "flos": 11508580656000.0, "grad_norm": 3.201891021006774, "language_loss": 0.85826665, "learning_rate": 1.483157389001004e-07, "loss": 0.88118231, "num_input_tokens_seen": 158459890, "step": 7321, "time_per_iteration": 2.567586660385132 }, { "auxiliary_loss_clip": 0.01270287, "auxiliary_loss_mlp": 0.0102519, "balance_loss_clip": 1.04191113, "balance_loss_mlp": 1.01755142, "epoch": 0.880418445259424, "flos": 22671447886080.0, "grad_norm": 2.4360468696608586, "language_loss": 0.78822601, "learning_rate": 1.4802149778894933e-07, "loss": 0.81118077, "num_input_tokens_seen": 158478680, "step": 7322, "time_per_iteration": 2.702939987182617 }, { "auxiliary_loss_clip": 0.01213956, "auxiliary_loss_mlp": 0.01023651, "balance_loss_clip": 1.04198229, "balance_loss_mlp": 1.01712751, "epoch": 0.8805386881500631, "flos": 20522158709760.0, "grad_norm": 2.0856646748944434, "language_loss": 0.87629378, "learning_rate": 1.4772753762146484e-07, "loss": 0.89866978, "num_input_tokens_seen": 158497935, "step": 7323, "time_per_iteration": 2.5900728702545166 }, { "auxiliary_loss_clip": 0.012178, "auxiliary_loss_mlp": 0.01029781, "balance_loss_clip": 1.04380751, "balance_loss_mlp": 1.02207398, "epoch": 0.8806589310407023, "flos": 36538891620480.0, "grad_norm": 2.553097955552744, "language_loss": 0.70695686, "learning_rate": 1.474338584422401e-07, "loss": 0.72943258, "num_input_tokens_seen": 158523145, "step": 7324, "time_per_iteration": 2.762192964553833 }, { "auxiliary_loss_clip": 0.01216424, "auxiliary_loss_mlp": 0.01021411, "balance_loss_clip": 1.04720426, "balance_loss_mlp": 1.01468205, "epoch": 0.8807791739313413, "flos": 23440187784960.0, "grad_norm": 1.7331008222598208, "language_loss": 0.75634396, "learning_rate": 1.4714046029582595e-07, "loss": 0.77872229, "num_input_tokens_seen": 158542210, "step": 7325, "time_per_iteration": 2.648693799972534 }, { "auxiliary_loss_clip": 0.01324753, "auxiliary_loss_mlp": 0.01026602, "balance_loss_clip": 1.04455805, "balance_loss_mlp": 1.01977158, "epoch": 0.8808994168219804, "flos": 25956843310080.0, "grad_norm": 1.87648745389448, "language_loss": 0.75970292, "learning_rate": 1.46847343226731e-07, "loss": 0.78321648, "num_input_tokens_seen": 158563250, "step": 7326, "time_per_iteration": 2.862873077392578 }, { "auxiliary_loss_clip": 0.01226931, "auxiliary_loss_mlp": 0.01024844, "balance_loss_clip": 1.04878378, "balance_loss_mlp": 1.01836848, "epoch": 0.8810196597126195, "flos": 17092079303040.0, "grad_norm": 2.2350771523976842, "language_loss": 0.69322914, "learning_rate": 1.465545072794203e-07, "loss": 0.71574688, "num_input_tokens_seen": 158581125, "step": 7327, "time_per_iteration": 2.616023540496826 }, { "auxiliary_loss_clip": 0.01270535, "auxiliary_loss_mlp": 0.01025444, "balance_loss_clip": 1.0454576, "balance_loss_mlp": 1.01862788, "epoch": 0.8811399026032586, "flos": 23002831785600.0, "grad_norm": 1.7175433447945296, "language_loss": 0.75800216, "learning_rate": 1.4626195249831774e-07, "loss": 0.78096199, "num_input_tokens_seen": 158602025, "step": 7328, "time_per_iteration": 2.787062406539917 }, { "auxiliary_loss_clip": 0.01219414, "auxiliary_loss_mlp": 0.01027146, "balance_loss_clip": 1.04444289, "balance_loss_mlp": 1.02027404, "epoch": 0.8812601454938976, "flos": 14463813242880.0, "grad_norm": 1.9193512076272574, "language_loss": 0.71863717, "learning_rate": 1.4596967892780244e-07, "loss": 0.74110276, "num_input_tokens_seen": 158618355, "step": 7329, "time_per_iteration": 2.642733097076416 }, { "auxiliary_loss_clip": 0.01171691, "auxiliary_loss_mlp": 0.01022251, "balance_loss_clip": 1.04912853, "balance_loss_mlp": 1.01611149, "epoch": 0.8813803883845368, "flos": 22493223578880.0, "grad_norm": 2.431684832132277, "language_loss": 0.74453938, "learning_rate": 1.4567768661221314e-07, "loss": 0.76647884, "num_input_tokens_seen": 158638925, "step": 7330, "time_per_iteration": 2.6536061763763428 }, { "auxiliary_loss_clip": 0.01227637, "auxiliary_loss_mlp": 0.02566425, "balance_loss_clip": 1.04911137, "balance_loss_mlp": 0.99992144, "epoch": 0.8815006312751759, "flos": 21506901045120.0, "grad_norm": 2.111432920466043, "language_loss": 0.74735117, "learning_rate": 1.4538597559584442e-07, "loss": 0.78529173, "num_input_tokens_seen": 158656715, "step": 7331, "time_per_iteration": 2.680534601211548 }, { "auxiliary_loss_clip": 0.01269249, "auxiliary_loss_mlp": 0.01021947, "balance_loss_clip": 1.04547513, "balance_loss_mlp": 1.0146451, "epoch": 0.8816208741658149, "flos": 22784566792320.0, "grad_norm": 2.304527689527815, "language_loss": 0.78853637, "learning_rate": 1.4509454592294823e-07, "loss": 0.81144834, "num_input_tokens_seen": 158677200, "step": 7332, "time_per_iteration": 2.7490921020507812 }, { "auxiliary_loss_clip": 0.01327597, "auxiliary_loss_mlp": 0.02567915, "balance_loss_clip": 1.04807258, "balance_loss_mlp": 0.99992251, "epoch": 0.8817411170564541, "flos": 17779409026560.0, "grad_norm": 2.17337299563662, "language_loss": 0.786129, "learning_rate": 1.448033976377354e-07, "loss": 0.82508409, "num_input_tokens_seen": 158692185, "step": 7333, "time_per_iteration": 2.734869956970215 }, { "auxiliary_loss_clip": 0.01222591, "auxiliary_loss_mlp": 0.01022938, "balance_loss_clip": 1.04508519, "balance_loss_mlp": 1.01653373, "epoch": 0.8818613599470931, "flos": 18551812112640.0, "grad_norm": 2.0485433799345847, "language_loss": 0.74170417, "learning_rate": 1.445125307843713e-07, "loss": 0.76415944, "num_input_tokens_seen": 158710410, "step": 7334, "time_per_iteration": 4.496086120605469 }, { "auxiliary_loss_clip": 0.01219292, "auxiliary_loss_mlp": 0.01025667, "balance_loss_clip": 1.04816508, "balance_loss_mlp": 1.01879168, "epoch": 0.8819816028377322, "flos": 27599792417280.0, "grad_norm": 1.6873821645974334, "language_loss": 0.75910187, "learning_rate": 1.442219454069813e-07, "loss": 0.78155148, "num_input_tokens_seen": 158731435, "step": 7335, "time_per_iteration": 2.6921231746673584 }, { "auxiliary_loss_clip": 0.01375336, "auxiliary_loss_mlp": 0.01031044, "balance_loss_clip": 1.04349041, "balance_loss_mlp": 1.0242914, "epoch": 0.8821018457283714, "flos": 23404600385280.0, "grad_norm": 2.8010235160232457, "language_loss": 0.66620779, "learning_rate": 1.4393164154964676e-07, "loss": 0.69027162, "num_input_tokens_seen": 158750965, "step": 7336, "time_per_iteration": 2.795656681060791 }, { "auxiliary_loss_clip": 0.01227566, "auxiliary_loss_mlp": 0.01029401, "balance_loss_clip": 1.05238199, "balance_loss_mlp": 1.02200365, "epoch": 0.8822220886190104, "flos": 29132459792640.0, "grad_norm": 1.7740600112751952, "language_loss": 0.94155294, "learning_rate": 1.4364161925640649e-07, "loss": 0.96412253, "num_input_tokens_seen": 158772365, "step": 7337, "time_per_iteration": 3.616431713104248 }, { "auxiliary_loss_clip": 0.01171591, "auxiliary_loss_mlp": 0.01024603, "balance_loss_clip": 1.04801071, "balance_loss_mlp": 1.01846933, "epoch": 0.8823423315096495, "flos": 20485422074880.0, "grad_norm": 2.186790461387461, "language_loss": 0.85071641, "learning_rate": 1.4335187857125663e-07, "loss": 0.87267834, "num_input_tokens_seen": 158791065, "step": 7338, "time_per_iteration": 2.6117806434631348 }, { "auxiliary_loss_clip": 0.01223571, "auxiliary_loss_mlp": 0.01029408, "balance_loss_clip": 1.04706883, "balance_loss_mlp": 1.02253568, "epoch": 0.8824625744002886, "flos": 24206377818240.0, "grad_norm": 1.7492575243104, "language_loss": 0.75840783, "learning_rate": 1.4306241953815023e-07, "loss": 0.78093755, "num_input_tokens_seen": 158812125, "step": 7339, "time_per_iteration": 2.7079453468322754 }, { "auxiliary_loss_clip": 0.01223791, "auxiliary_loss_mlp": 0.01029526, "balance_loss_clip": 1.04704714, "balance_loss_mlp": 1.02325284, "epoch": 0.8825828172909277, "flos": 24679500785280.0, "grad_norm": 1.846427176840634, "language_loss": 0.71412981, "learning_rate": 1.4277324220099862e-07, "loss": 0.73666304, "num_input_tokens_seen": 158834035, "step": 7340, "time_per_iteration": 3.556201696395874 }, { "auxiliary_loss_clip": 0.01317914, "auxiliary_loss_mlp": 0.01024885, "balance_loss_clip": 1.04182041, "balance_loss_mlp": 1.017905, "epoch": 0.8827030601815667, "flos": 22456163721600.0, "grad_norm": 2.0137296240438176, "language_loss": 0.74163628, "learning_rate": 1.4248434660366938e-07, "loss": 0.7650643, "num_input_tokens_seen": 158853510, "step": 7341, "time_per_iteration": 2.7116291522979736 }, { "auxiliary_loss_clip": 0.01271055, "auxiliary_loss_mlp": 0.01026551, "balance_loss_clip": 1.04832554, "balance_loss_mlp": 1.01978314, "epoch": 0.8828233030722058, "flos": 19865639877120.0, "grad_norm": 2.2357768874443646, "language_loss": 0.7030068, "learning_rate": 1.4219573278998808e-07, "loss": 0.72598284, "num_input_tokens_seen": 158871970, "step": 7342, "time_per_iteration": 2.6980764865875244 }, { "auxiliary_loss_clip": 0.01273312, "auxiliary_loss_mlp": 0.01024047, "balance_loss_clip": 1.04375625, "balance_loss_mlp": 1.01677847, "epoch": 0.882943545962845, "flos": 39347213581440.0, "grad_norm": 3.0571938823276494, "language_loss": 0.65116316, "learning_rate": 1.4190740080373685e-07, "loss": 0.67413676, "num_input_tokens_seen": 158892250, "step": 7343, "time_per_iteration": 2.7685272693634033 }, { "auxiliary_loss_clip": 0.01368345, "auxiliary_loss_mlp": 0.01023588, "balance_loss_clip": 1.04226232, "balance_loss_mlp": 1.0160836, "epoch": 0.883063788853484, "flos": 19054524908160.0, "grad_norm": 2.0678173602275627, "language_loss": 0.84038186, "learning_rate": 1.4161935068865538e-07, "loss": 0.8643012, "num_input_tokens_seen": 158907395, "step": 7344, "time_per_iteration": 2.6727406978607178 }, { "auxiliary_loss_clip": 0.01173889, "auxiliary_loss_mlp": 0.01025832, "balance_loss_clip": 1.04965568, "balance_loss_mlp": 1.01843464, "epoch": 0.8831840317441231, "flos": 18733196816640.0, "grad_norm": 2.062243846718292, "language_loss": 0.75656617, "learning_rate": 1.4133158248844113e-07, "loss": 0.77856332, "num_input_tokens_seen": 158926300, "step": 7345, "time_per_iteration": 2.6615543365478516 }, { "auxiliary_loss_clip": 0.0121909, "auxiliary_loss_mlp": 0.01023316, "balance_loss_clip": 1.04373741, "balance_loss_mlp": 1.01592803, "epoch": 0.8833042746347622, "flos": 26827712553600.0, "grad_norm": 1.9795344453013903, "language_loss": 0.73278463, "learning_rate": 1.4104409624674785e-07, "loss": 0.75520873, "num_input_tokens_seen": 158946085, "step": 7346, "time_per_iteration": 2.701359272003174 }, { "auxiliary_loss_clip": 0.01223717, "auxiliary_loss_mlp": 0.01025806, "balance_loss_clip": 1.05040801, "balance_loss_mlp": 1.01917505, "epoch": 0.8834245175254013, "flos": 26104077158400.0, "grad_norm": 1.825370580375947, "language_loss": 0.78759128, "learning_rate": 1.407568920071873e-07, "loss": 0.81008649, "num_input_tokens_seen": 158964950, "step": 7347, "time_per_iteration": 2.701831340789795 }, { "auxiliary_loss_clip": 0.0117801, "auxiliary_loss_mlp": 0.01027442, "balance_loss_clip": 1.05044532, "balance_loss_mlp": 1.01987791, "epoch": 0.8835447604160404, "flos": 30629036977920.0, "grad_norm": 2.016378527691623, "language_loss": 0.68550783, "learning_rate": 1.4046996981332782e-07, "loss": 0.70756239, "num_input_tokens_seen": 158984835, "step": 7348, "time_per_iteration": 2.7356677055358887 }, { "auxiliary_loss_clip": 0.01222156, "auxiliary_loss_mlp": 0.01024415, "balance_loss_clip": 1.04494643, "balance_loss_mlp": 1.01710176, "epoch": 0.8836650033066795, "flos": 24718356322560.0, "grad_norm": 3.1558971321353972, "language_loss": 0.78087866, "learning_rate": 1.4018332970869516e-07, "loss": 0.80334443, "num_input_tokens_seen": 159002775, "step": 7349, "time_per_iteration": 2.6898298263549805 }, { "auxiliary_loss_clip": 0.01275611, "auxiliary_loss_mlp": 0.01025983, "balance_loss_clip": 1.05000305, "balance_loss_mlp": 1.01872957, "epoch": 0.8837852461973186, "flos": 25413371556480.0, "grad_norm": 3.784461360428959, "language_loss": 0.85200697, "learning_rate": 1.398969717367733e-07, "loss": 0.87502295, "num_input_tokens_seen": 159024100, "step": 7350, "time_per_iteration": 2.748755931854248 }, { "auxiliary_loss_clip": 0.01263988, "auxiliary_loss_mlp": 0.0102679, "balance_loss_clip": 1.04526067, "balance_loss_mlp": 1.02009904, "epoch": 0.8839054890879576, "flos": 17822574195840.0, "grad_norm": 1.8965431264166648, "language_loss": 0.76458442, "learning_rate": 1.396108959410014e-07, "loss": 0.78749216, "num_input_tokens_seen": 159043315, "step": 7351, "time_per_iteration": 2.682246685028076 }, { "auxiliary_loss_clip": 0.01221967, "auxiliary_loss_mlp": 0.02567005, "balance_loss_clip": 1.04870272, "balance_loss_mlp": 0.99988085, "epoch": 0.8840257319785968, "flos": 23769021818880.0, "grad_norm": 2.044767085568208, "language_loss": 0.81362438, "learning_rate": 1.3932510236477745e-07, "loss": 0.8515141, "num_input_tokens_seen": 159063985, "step": 7352, "time_per_iteration": 2.709618330001831 }, { "auxiliary_loss_clip": 0.0122235, "auxiliary_loss_mlp": 0.01025382, "balance_loss_clip": 1.0452224, "balance_loss_mlp": 1.01800036, "epoch": 0.8841459748692359, "flos": 29059776622080.0, "grad_norm": 2.07152122771741, "language_loss": 0.56306332, "learning_rate": 1.3903959105145636e-07, "loss": 0.58554065, "num_input_tokens_seen": 159084475, "step": 7353, "time_per_iteration": 2.667526960372925 }, { "auxiliary_loss_clip": 0.01169792, "auxiliary_loss_mlp": 0.01024424, "balance_loss_clip": 1.04704106, "balance_loss_mlp": 1.0180676, "epoch": 0.8842662177598749, "flos": 24311523905280.0, "grad_norm": 2.0216512598166436, "language_loss": 0.832744, "learning_rate": 1.387543620443492e-07, "loss": 0.85468614, "num_input_tokens_seen": 159101320, "step": 7354, "time_per_iteration": 2.6250710487365723 }, { "auxiliary_loss_clip": 0.0117064, "auxiliary_loss_mlp": 0.01026102, "balance_loss_clip": 1.04852319, "balance_loss_mlp": 1.01972723, "epoch": 0.8843864606505141, "flos": 25007867942400.0, "grad_norm": 1.9882581166257396, "language_loss": 0.84315526, "learning_rate": 1.3846941538672606e-07, "loss": 0.86512268, "num_input_tokens_seen": 159120025, "step": 7355, "time_per_iteration": 2.6054844856262207 }, { "auxiliary_loss_clip": 0.01373217, "auxiliary_loss_mlp": 0.01028897, "balance_loss_clip": 1.0451951, "balance_loss_mlp": 1.02221513, "epoch": 0.8845067035411531, "flos": 28183915388160.0, "grad_norm": 2.2774921862776263, "language_loss": 0.8093859, "learning_rate": 1.3818475112181193e-07, "loss": 0.83340704, "num_input_tokens_seen": 159138820, "step": 7356, "time_per_iteration": 2.8032939434051514 }, { "auxiliary_loss_clip": 0.01168589, "auxiliary_loss_mlp": 0.01024801, "balance_loss_clip": 1.04653525, "balance_loss_mlp": 1.01822019, "epoch": 0.8846269464317922, "flos": 12853219311360.0, "grad_norm": 2.166088324066519, "language_loss": 0.79624707, "learning_rate": 1.3790036929279091e-07, "loss": 0.81818092, "num_input_tokens_seen": 159155975, "step": 7357, "time_per_iteration": 2.6044211387634277 }, { "auxiliary_loss_clip": 0.01220728, "auxiliary_loss_mlp": 0.02564581, "balance_loss_clip": 1.04760718, "balance_loss_mlp": 0.99988657, "epoch": 0.8847471893224313, "flos": 18624351628800.0, "grad_norm": 2.2066664756451293, "language_loss": 0.59407628, "learning_rate": 1.3761626994280363e-07, "loss": 0.63192934, "num_input_tokens_seen": 159173445, "step": 7358, "time_per_iteration": 2.738797903060913 }, { "auxiliary_loss_clip": 0.01326093, "auxiliary_loss_mlp": 0.0102699, "balance_loss_clip": 1.04482365, "balance_loss_mlp": 1.02018952, "epoch": 0.8848674322130704, "flos": 35769433449600.0, "grad_norm": 1.8035340292053146, "language_loss": 0.73662883, "learning_rate": 1.3733245311494735e-07, "loss": 0.76015967, "num_input_tokens_seen": 159196100, "step": 7359, "time_per_iteration": 3.657203435897827 }, { "auxiliary_loss_clip": 0.01222452, "auxiliary_loss_mlp": 0.01022843, "balance_loss_clip": 1.04832029, "balance_loss_mlp": 1.01594353, "epoch": 0.8849876751037095, "flos": 24243760897920.0, "grad_norm": 2.2718099462960883, "language_loss": 0.70425606, "learning_rate": 1.3704891885227676e-07, "loss": 0.72670901, "num_input_tokens_seen": 159216145, "step": 7360, "time_per_iteration": 2.628399610519409 }, { "auxiliary_loss_clip": 0.01325247, "auxiliary_loss_mlp": 0.01028858, "balance_loss_clip": 1.04190588, "balance_loss_mlp": 1.02125275, "epoch": 0.8851079179943486, "flos": 21500580251520.0, "grad_norm": 3.4571200140530163, "language_loss": 0.78032184, "learning_rate": 1.367656671978037e-07, "loss": 0.80386293, "num_input_tokens_seen": 159233610, "step": 7361, "time_per_iteration": 3.6607017517089844 }, { "auxiliary_loss_clip": 0.01278443, "auxiliary_loss_mlp": 0.01024152, "balance_loss_clip": 1.04628563, "balance_loss_mlp": 1.01733947, "epoch": 0.8852281608849877, "flos": 15300711198720.0, "grad_norm": 1.8920042621806061, "language_loss": 0.73377997, "learning_rate": 1.36482698194498e-07, "loss": 0.7568059, "num_input_tokens_seen": 159250155, "step": 7362, "time_per_iteration": 2.671027421951294 }, { "auxiliary_loss_clip": 0.01269704, "auxiliary_loss_mlp": 0.01027179, "balance_loss_clip": 1.04469025, "balance_loss_mlp": 1.02019882, "epoch": 0.8853484037756267, "flos": 23295719283840.0, "grad_norm": 1.8382819463688291, "language_loss": 0.72190213, "learning_rate": 1.3620001188528506e-07, "loss": 0.74487102, "num_input_tokens_seen": 159270875, "step": 7363, "time_per_iteration": 3.7116382122039795 }, { "auxiliary_loss_clip": 0.01223163, "auxiliary_loss_mlp": 0.01024252, "balance_loss_clip": 1.04526162, "balance_loss_mlp": 1.01751995, "epoch": 0.8854686466662659, "flos": 25114773795840.0, "grad_norm": 2.471766360138975, "language_loss": 0.73717701, "learning_rate": 1.3591760831304865e-07, "loss": 0.75965118, "num_input_tokens_seen": 159288565, "step": 7364, "time_per_iteration": 2.6521761417388916 }, { "auxiliary_loss_clip": 0.01172416, "auxiliary_loss_mlp": 0.01026014, "balance_loss_clip": 1.04905343, "balance_loss_mlp": 1.01887894, "epoch": 0.885588889556905, "flos": 21390873137280.0, "grad_norm": 1.7638770351518518, "language_loss": 0.79220796, "learning_rate": 1.356354875206287e-07, "loss": 0.81419218, "num_input_tokens_seen": 159306400, "step": 7365, "time_per_iteration": 2.5754919052124023 }, { "auxiliary_loss_clip": 0.01317416, "auxiliary_loss_mlp": 0.01025611, "balance_loss_clip": 1.04640532, "balance_loss_mlp": 1.01861966, "epoch": 0.885709132447544, "flos": 26906752431360.0, "grad_norm": 2.242904258333885, "language_loss": 0.69870919, "learning_rate": 1.3535364955082296e-07, "loss": 0.72213948, "num_input_tokens_seen": 159326250, "step": 7366, "time_per_iteration": 3.6201729774475098 }, { "auxiliary_loss_clip": 0.01173246, "auxiliary_loss_mlp": 0.01023771, "balance_loss_clip": 1.05132747, "balance_loss_mlp": 1.01695204, "epoch": 0.8858293753381832, "flos": 26103394800000.0, "grad_norm": 1.9766735866031175, "language_loss": 0.64991766, "learning_rate": 1.3507209444638613e-07, "loss": 0.67188787, "num_input_tokens_seen": 159348250, "step": 7367, "time_per_iteration": 2.654050588607788 }, { "auxiliary_loss_clip": 0.01221184, "auxiliary_loss_mlp": 0.01024346, "balance_loss_clip": 1.04892516, "balance_loss_mlp": 1.01761949, "epoch": 0.8859496182288222, "flos": 23292810282240.0, "grad_norm": 1.943862046912065, "language_loss": 0.74070781, "learning_rate": 1.347908222500298e-07, "loss": 0.76316315, "num_input_tokens_seen": 159368325, "step": 7368, "time_per_iteration": 2.601515531539917 }, { "auxiliary_loss_clip": 0.01316453, "auxiliary_loss_mlp": 0.01021728, "balance_loss_clip": 1.04671144, "balance_loss_mlp": 1.01526141, "epoch": 0.8860698611194613, "flos": 16872916469760.0, "grad_norm": 3.7059924241699727, "language_loss": 0.70000553, "learning_rate": 1.3450983300442276e-07, "loss": 0.72338736, "num_input_tokens_seen": 159387555, "step": 7369, "time_per_iteration": 2.747530221939087 }, { "auxiliary_loss_clip": 0.01224894, "auxiliary_loss_mlp": 0.01026136, "balance_loss_clip": 1.04837048, "balance_loss_mlp": 1.01958525, "epoch": 0.8861901040101005, "flos": 24681404206080.0, "grad_norm": 1.9746201122831275, "language_loss": 0.7377702, "learning_rate": 1.3422912675219068e-07, "loss": 0.76028049, "num_input_tokens_seen": 159407310, "step": 7370, "time_per_iteration": 2.6386117935180664 }, { "auxiliary_loss_clip": 0.01171878, "auxiliary_loss_mlp": 0.01028858, "balance_loss_clip": 1.05020213, "balance_loss_mlp": 1.02231956, "epoch": 0.8863103469007395, "flos": 24423026699520.0, "grad_norm": 1.5720991744848314, "language_loss": 0.79138064, "learning_rate": 1.339487035359166e-07, "loss": 0.81338805, "num_input_tokens_seen": 159427680, "step": 7371, "time_per_iteration": 2.623685359954834 }, { "auxiliary_loss_clip": 0.01274242, "auxiliary_loss_mlp": 0.02562072, "balance_loss_clip": 1.04907227, "balance_loss_mlp": 0.99987918, "epoch": 0.8864305897913786, "flos": 22053964158720.0, "grad_norm": 1.5892674647402252, "language_loss": 0.8470186, "learning_rate": 1.336685633981409e-07, "loss": 0.88538176, "num_input_tokens_seen": 159448765, "step": 7372, "time_per_iteration": 2.7033698558807373 }, { "auxiliary_loss_clip": 0.01225237, "auxiliary_loss_mlp": 0.01028289, "balance_loss_clip": 1.04810619, "balance_loss_mlp": 1.02048659, "epoch": 0.8865508326820177, "flos": 19099449843840.0, "grad_norm": 5.530329441921911, "language_loss": 0.74949569, "learning_rate": 1.333887063813597e-07, "loss": 0.77203095, "num_input_tokens_seen": 159466870, "step": 7373, "time_per_iteration": 2.651745557785034 }, { "auxiliary_loss_clip": 0.01272395, "auxiliary_loss_mlp": 0.0102725, "balance_loss_clip": 1.04475832, "balance_loss_mlp": 1.02088141, "epoch": 0.8866710755726568, "flos": 15414189240960.0, "grad_norm": 1.8177045008327999, "language_loss": 0.66448891, "learning_rate": 1.331091325280278e-07, "loss": 0.68748534, "num_input_tokens_seen": 159485840, "step": 7374, "time_per_iteration": 2.642770290374756 }, { "auxiliary_loss_clip": 0.01372449, "auxiliary_loss_mlp": 0.01029288, "balance_loss_clip": 1.04642582, "balance_loss_mlp": 1.02201009, "epoch": 0.8867913184632958, "flos": 20083689388800.0, "grad_norm": 2.3072593327739486, "language_loss": 0.78629494, "learning_rate": 1.3282984188055625e-07, "loss": 0.81031227, "num_input_tokens_seen": 159505630, "step": 7375, "time_per_iteration": 2.8295605182647705 }, { "auxiliary_loss_clip": 0.01173752, "auxiliary_loss_mlp": 0.01029572, "balance_loss_clip": 1.04912186, "balance_loss_mlp": 1.02263403, "epoch": 0.8869115613539349, "flos": 23365852588800.0, "grad_norm": 1.6721933654793701, "language_loss": 0.79528856, "learning_rate": 1.3255083448131288e-07, "loss": 0.81732172, "num_input_tokens_seen": 159524675, "step": 7376, "time_per_iteration": 2.616567611694336 }, { "auxiliary_loss_clip": 0.01224959, "auxiliary_loss_mlp": 0.01029621, "balance_loss_clip": 1.04564297, "balance_loss_mlp": 1.02215862, "epoch": 0.8870318042445741, "flos": 21286840371840.0, "grad_norm": 2.0741285204780335, "language_loss": 0.79201901, "learning_rate": 1.3227211037262365e-07, "loss": 0.81456488, "num_input_tokens_seen": 159541915, "step": 7377, "time_per_iteration": 2.633108615875244 }, { "auxiliary_loss_clip": 0.01274578, "auxiliary_loss_mlp": 0.01024058, "balance_loss_clip": 1.04264736, "balance_loss_mlp": 1.01700056, "epoch": 0.8871520471352131, "flos": 20010862563840.0, "grad_norm": 5.683834652954261, "language_loss": 0.85298836, "learning_rate": 1.319936695967696e-07, "loss": 0.87597477, "num_input_tokens_seen": 159559740, "step": 7378, "time_per_iteration": 2.6847503185272217 }, { "auxiliary_loss_clip": 0.01177562, "auxiliary_loss_mlp": 0.01026533, "balance_loss_clip": 1.04862046, "balance_loss_mlp": 1.01893377, "epoch": 0.8872722900258522, "flos": 22601422321920.0, "grad_norm": 2.204810601250175, "language_loss": 0.81790155, "learning_rate": 1.3171551219599097e-07, "loss": 0.83994251, "num_input_tokens_seen": 159578265, "step": 7379, "time_per_iteration": 2.639313220977783 }, { "auxiliary_loss_clip": 0.01174079, "auxiliary_loss_mlp": 0.01024247, "balance_loss_clip": 1.05052078, "balance_loss_mlp": 1.017398, "epoch": 0.8873925329164913, "flos": 22163276223360.0, "grad_norm": 2.3223994692888743, "language_loss": 0.7779513, "learning_rate": 1.3143763821248377e-07, "loss": 0.79993451, "num_input_tokens_seen": 159595350, "step": 7380, "time_per_iteration": 2.63461971282959 }, { "auxiliary_loss_clip": 0.01171851, "auxiliary_loss_mlp": 0.01022837, "balance_loss_clip": 1.04929662, "balance_loss_mlp": 1.01648617, "epoch": 0.8875127758071304, "flos": 19208223204480.0, "grad_norm": 1.7570825118254785, "language_loss": 0.7223503, "learning_rate": 1.3116004768840118e-07, "loss": 0.74429721, "num_input_tokens_seen": 159613725, "step": 7381, "time_per_iteration": 2.708155393600464 }, { "auxiliary_loss_clip": 0.01174309, "auxiliary_loss_mlp": 0.01026945, "balance_loss_clip": 1.04928803, "balance_loss_mlp": 1.01997471, "epoch": 0.8876330186977694, "flos": 18110900666880.0, "grad_norm": 1.7228635281586109, "language_loss": 0.74278444, "learning_rate": 1.3088274066585348e-07, "loss": 0.76479697, "num_input_tokens_seen": 159631335, "step": 7382, "time_per_iteration": 2.6291143894195557 }, { "auxiliary_loss_clip": 0.01324168, "auxiliary_loss_mlp": 0.01028012, "balance_loss_clip": 1.04281139, "balance_loss_mlp": 1.0216608, "epoch": 0.8877532615884086, "flos": 22009434272640.0, "grad_norm": 2.397269711014476, "language_loss": 0.90645951, "learning_rate": 1.3060571718690749e-07, "loss": 0.92998135, "num_input_tokens_seen": 159648830, "step": 7383, "time_per_iteration": 2.706164836883545 }, { "auxiliary_loss_clip": 0.01223266, "auxiliary_loss_mlp": 0.02505433, "balance_loss_clip": 1.00686336, "balance_loss_mlp": 0.99991113, "epoch": 0.8878735044790477, "flos": 72136924346880.0, "grad_norm": 0.7478632655142272, "language_loss": 0.56885737, "learning_rate": 1.3032897729358805e-07, "loss": 0.60614437, "num_input_tokens_seen": 159709785, "step": 7384, "time_per_iteration": 3.3171682357788086 }, { "auxiliary_loss_clip": 0.0141067, "auxiliary_loss_mlp": 0.02568327, "balance_loss_clip": 1.03763485, "balance_loss_mlp": 0.99990785, "epoch": 0.8879937473696867, "flos": 27526355061120.0, "grad_norm": 1.9570256220478626, "language_loss": 0.79973578, "learning_rate": 1.3005252102787645e-07, "loss": 0.83952576, "num_input_tokens_seen": 159728725, "step": 7385, "time_per_iteration": 3.78452730178833 }, { "auxiliary_loss_clip": 0.01226156, "auxiliary_loss_mlp": 0.01024222, "balance_loss_clip": 1.04868722, "balance_loss_mlp": 1.01737905, "epoch": 0.8881139902603259, "flos": 22234091886720.0, "grad_norm": 1.6440704825572914, "language_loss": 0.73643637, "learning_rate": 1.297763484317105e-07, "loss": 0.7589401, "num_input_tokens_seen": 159747020, "step": 7386, "time_per_iteration": 2.780329704284668 }, { "auxiliary_loss_clip": 0.01270272, "auxiliary_loss_mlp": 0.02567712, "balance_loss_clip": 1.04007924, "balance_loss_mlp": 0.99988687, "epoch": 0.888234233150965, "flos": 20299548170880.0, "grad_norm": 2.2110640799479606, "language_loss": 0.70196527, "learning_rate": 1.2950045954698551e-07, "loss": 0.74034512, "num_input_tokens_seen": 159764855, "step": 7387, "time_per_iteration": 3.6594748497009277 }, { "auxiliary_loss_clip": 0.01314605, "auxiliary_loss_mlp": 0.01023852, "balance_loss_clip": 1.0441674, "balance_loss_mlp": 1.01731575, "epoch": 0.888354476041604, "flos": 18147996437760.0, "grad_norm": 1.5793746909982582, "language_loss": 0.75635505, "learning_rate": 1.2922485441555343e-07, "loss": 0.77973968, "num_input_tokens_seen": 159783935, "step": 7388, "time_per_iteration": 2.7026126384735107 }, { "auxiliary_loss_clip": 0.01168897, "auxiliary_loss_mlp": 0.01021112, "balance_loss_clip": 1.04615521, "balance_loss_mlp": 1.01436794, "epoch": 0.8884747189322432, "flos": 22014282608640.0, "grad_norm": 1.857981985745319, "language_loss": 0.81832492, "learning_rate": 1.2894953307922363e-07, "loss": 0.84022498, "num_input_tokens_seen": 159802895, "step": 7389, "time_per_iteration": 3.522310733795166 }, { "auxiliary_loss_clip": 0.01318562, "auxiliary_loss_mlp": 0.01025566, "balance_loss_clip": 1.04318261, "balance_loss_mlp": 1.01822257, "epoch": 0.8885949618228822, "flos": 19786779567360.0, "grad_norm": 1.8772549921267627, "language_loss": 0.83897412, "learning_rate": 1.2867449557976208e-07, "loss": 0.86241543, "num_input_tokens_seen": 159820995, "step": 7390, "time_per_iteration": 2.7546064853668213 }, { "auxiliary_loss_clip": 0.01219927, "auxiliary_loss_mlp": 0.01025838, "balance_loss_clip": 1.0475992, "balance_loss_mlp": 1.01885879, "epoch": 0.8887152047135213, "flos": 20047599198720.0, "grad_norm": 1.8307414463301817, "language_loss": 0.76026374, "learning_rate": 1.283997419588916e-07, "loss": 0.7827214, "num_input_tokens_seen": 159840465, "step": 7391, "time_per_iteration": 2.625218152999878 }, { "auxiliary_loss_clip": 0.01226953, "auxiliary_loss_mlp": 0.01024091, "balance_loss_clip": 1.04817867, "balance_loss_mlp": 1.01749301, "epoch": 0.8888354476041604, "flos": 18588117784320.0, "grad_norm": 1.853844851391151, "language_loss": 0.62224114, "learning_rate": 1.2812527225829216e-07, "loss": 0.64475155, "num_input_tokens_seen": 159858690, "step": 7392, "time_per_iteration": 3.48183274269104 }, { "auxiliary_loss_clip": 0.01233141, "auxiliary_loss_mlp": 0.01033556, "balance_loss_clip": 1.05200112, "balance_loss_mlp": 1.02604282, "epoch": 0.8889556904947995, "flos": 21689794120320.0, "grad_norm": 2.1583089960833193, "language_loss": 0.76693892, "learning_rate": 1.2785108651960052e-07, "loss": 0.78960586, "num_input_tokens_seen": 159880325, "step": 7393, "time_per_iteration": 2.651188850402832 }, { "auxiliary_loss_clip": 0.01226966, "auxiliary_loss_mlp": 0.01030258, "balance_loss_clip": 1.04833615, "balance_loss_mlp": 1.02361155, "epoch": 0.8890759333854386, "flos": 27381204201600.0, "grad_norm": 2.1210235880176023, "language_loss": 0.80867559, "learning_rate": 1.2757718478441094e-07, "loss": 0.83124781, "num_input_tokens_seen": 159901070, "step": 7394, "time_per_iteration": 2.7574827671051025 }, { "auxiliary_loss_clip": 0.01267276, "auxiliary_loss_mlp": 0.01023652, "balance_loss_clip": 1.04236984, "balance_loss_mlp": 1.01677024, "epoch": 0.8891961762760777, "flos": 24498834353280.0, "grad_norm": 1.6745013988603412, "language_loss": 0.77232307, "learning_rate": 1.2730356709427302e-07, "loss": 0.79523242, "num_input_tokens_seen": 159919750, "step": 7395, "time_per_iteration": 2.6885931491851807 }, { "auxiliary_loss_clip": 0.01227591, "auxiliary_loss_mlp": 0.01026653, "balance_loss_clip": 1.05237246, "balance_loss_mlp": 1.01975381, "epoch": 0.8893164191667168, "flos": 41499770895360.0, "grad_norm": 2.30470123456414, "language_loss": 0.59780079, "learning_rate": 1.2703023349069542e-07, "loss": 0.62034321, "num_input_tokens_seen": 159944600, "step": 7396, "time_per_iteration": 2.7803781032562256 }, { "auxiliary_loss_clip": 0.01219151, "auxiliary_loss_mlp": 0.01028198, "balance_loss_clip": 1.0474894, "balance_loss_mlp": 1.02140939, "epoch": 0.8894366620573558, "flos": 33583623120000.0, "grad_norm": 4.347554356809147, "language_loss": 0.61937535, "learning_rate": 1.2675718401514223e-07, "loss": 0.6418488, "num_input_tokens_seen": 159968780, "step": 7397, "time_per_iteration": 2.744286060333252 }, { "auxiliary_loss_clip": 0.01271171, "auxiliary_loss_mlp": 0.01024472, "balance_loss_clip": 1.04589295, "balance_loss_mlp": 1.01775777, "epoch": 0.889556904947995, "flos": 16909832672640.0, "grad_norm": 2.3748885280668177, "language_loss": 0.74425548, "learning_rate": 1.264844187090346e-07, "loss": 0.76721191, "num_input_tokens_seen": 159985905, "step": 7398, "time_per_iteration": 2.6506099700927734 }, { "auxiliary_loss_clip": 0.01270697, "auxiliary_loss_mlp": 0.01025497, "balance_loss_clip": 1.04484403, "balance_loss_mlp": 1.01882672, "epoch": 0.889677147838634, "flos": 26030855283840.0, "grad_norm": 2.940849952269287, "language_loss": 0.75230479, "learning_rate": 1.262119376137516e-07, "loss": 0.77526671, "num_input_tokens_seen": 160006965, "step": 7399, "time_per_iteration": 2.812908172607422 }, { "auxiliary_loss_clip": 0.01217014, "auxiliary_loss_mlp": 0.01023723, "balance_loss_clip": 1.04489338, "balance_loss_mlp": 1.01677871, "epoch": 0.8897973907292731, "flos": 26468283110400.0, "grad_norm": 1.8035758718627675, "language_loss": 0.8520726, "learning_rate": 1.2593974077062707e-07, "loss": 0.87447995, "num_input_tokens_seen": 160028585, "step": 7400, "time_per_iteration": 2.6679067611694336 }, { "auxiliary_loss_clip": 0.01313194, "auxiliary_loss_mlp": 0.01026012, "balance_loss_clip": 1.04267728, "balance_loss_mlp": 1.01901698, "epoch": 0.8899176336199123, "flos": 26249694894720.0, "grad_norm": 1.862212924776728, "language_loss": 0.63347864, "learning_rate": 1.2566782822095423e-07, "loss": 0.65687066, "num_input_tokens_seen": 160048840, "step": 7401, "time_per_iteration": 2.790428638458252 }, { "auxiliary_loss_clip": 0.01326654, "auxiliary_loss_mlp": 0.01026937, "balance_loss_clip": 1.0483135, "balance_loss_mlp": 1.02000189, "epoch": 0.8900378765105513, "flos": 20811742156800.0, "grad_norm": 2.303497908134964, "language_loss": 0.71178484, "learning_rate": 1.2539620000598162e-07, "loss": 0.73532069, "num_input_tokens_seen": 160068175, "step": 7402, "time_per_iteration": 2.827194929122925 }, { "auxiliary_loss_clip": 0.01170249, "auxiliary_loss_mlp": 0.01022435, "balance_loss_clip": 1.04714298, "balance_loss_mlp": 1.01524055, "epoch": 0.8901581194011904, "flos": 16472333018880.0, "grad_norm": 4.370343002639944, "language_loss": 0.7984035, "learning_rate": 1.2512485616691492e-07, "loss": 0.82033032, "num_input_tokens_seen": 160085230, "step": 7403, "time_per_iteration": 2.6182937622070312 }, { "auxiliary_loss_clip": 0.01324041, "auxiliary_loss_mlp": 0.01025883, "balance_loss_clip": 1.04316187, "balance_loss_mlp": 1.01836658, "epoch": 0.8902783622918296, "flos": 35155253773440.0, "grad_norm": 1.5065724449459545, "language_loss": 0.80750293, "learning_rate": 1.2485379674491681e-07, "loss": 0.83100218, "num_input_tokens_seen": 160111425, "step": 7404, "time_per_iteration": 2.7968783378601074 }, { "auxiliary_loss_clip": 0.01272329, "auxiliary_loss_mlp": 0.01025533, "balance_loss_clip": 1.04860485, "balance_loss_mlp": 1.01876199, "epoch": 0.8903986051824686, "flos": 17201068145280.0, "grad_norm": 2.5600525529138545, "language_loss": 0.79320669, "learning_rate": 1.2458302178110657e-07, "loss": 0.8161853, "num_input_tokens_seen": 160129790, "step": 7405, "time_per_iteration": 2.6930782794952393 }, { "auxiliary_loss_clip": 0.01310812, "auxiliary_loss_mlp": 0.01020861, "balance_loss_clip": 1.0416193, "balance_loss_mlp": 1.01431596, "epoch": 0.8905188480731077, "flos": 25483863997440.0, "grad_norm": 3.3962297712759426, "language_loss": 0.82558924, "learning_rate": 1.2431253131656118e-07, "loss": 0.84890592, "num_input_tokens_seen": 160149265, "step": 7406, "time_per_iteration": 2.6981492042541504 }, { "auxiliary_loss_clip": 0.01264509, "auxiliary_loss_mlp": 0.01022906, "balance_loss_clip": 1.04533482, "balance_loss_mlp": 1.01601005, "epoch": 0.8906390909637467, "flos": 23365888502400.0, "grad_norm": 1.9821885283209937, "language_loss": 0.7669971, "learning_rate": 1.240423253923133e-07, "loss": 0.78987134, "num_input_tokens_seen": 160168870, "step": 7407, "time_per_iteration": 2.655198574066162 }, { "auxiliary_loss_clip": 0.01224023, "auxiliary_loss_mlp": 0.01025153, "balance_loss_clip": 1.04646039, "balance_loss_mlp": 1.01833475, "epoch": 0.8907593338543859, "flos": 21068790860160.0, "grad_norm": 3.3860200820568456, "language_loss": 0.69736761, "learning_rate": 1.237724040493533e-07, "loss": 0.71985936, "num_input_tokens_seen": 160187495, "step": 7408, "time_per_iteration": 2.674344778060913 }, { "auxiliary_loss_clip": 0.0118061, "auxiliary_loss_mlp": 0.01032179, "balance_loss_clip": 1.05338907, "balance_loss_mlp": 1.02431107, "epoch": 0.8908795767450249, "flos": 21869562712320.0, "grad_norm": 3.883372888213518, "language_loss": 0.72724199, "learning_rate": 1.2350276732862773e-07, "loss": 0.74936992, "num_input_tokens_seen": 160208520, "step": 7409, "time_per_iteration": 2.6162221431732178 }, { "auxiliary_loss_clip": 0.0111539, "auxiliary_loss_mlp": 0.01000824, "balance_loss_clip": 1.00669837, "balance_loss_mlp": 0.99992961, "epoch": 0.890999819635664, "flos": 66307869348480.0, "grad_norm": 0.8361473658592499, "language_loss": 0.56688654, "learning_rate": 1.2323341527103993e-07, "loss": 0.58804864, "num_input_tokens_seen": 160263720, "step": 7410, "time_per_iteration": 3.1401209831237793 }, { "auxiliary_loss_clip": 0.01172418, "auxiliary_loss_mlp": 0.01020804, "balance_loss_clip": 1.04901159, "balance_loss_mlp": 1.014256, "epoch": 0.8911200625263032, "flos": 26869908055680.0, "grad_norm": 1.9486360456092378, "language_loss": 0.85221815, "learning_rate": 1.2296434791745135e-07, "loss": 0.8741504, "num_input_tokens_seen": 160282170, "step": 7411, "time_per_iteration": 3.4504992961883545 }, { "auxiliary_loss_clip": 0.01222463, "auxiliary_loss_mlp": 0.01024034, "balance_loss_clip": 1.0473907, "balance_loss_mlp": 1.01689315, "epoch": 0.8912403054169422, "flos": 20885825957760.0, "grad_norm": 1.7648082287906823, "language_loss": 0.76615512, "learning_rate": 1.2269556530867875e-07, "loss": 0.78862011, "num_input_tokens_seen": 160300725, "step": 7412, "time_per_iteration": 2.6642749309539795 }, { "auxiliary_loss_clip": 0.01178127, "auxiliary_loss_mlp": 0.01032211, "balance_loss_clip": 1.05094457, "balance_loss_mlp": 1.02477527, "epoch": 0.8913605483075813, "flos": 27016567286400.0, "grad_norm": 1.9456588301723987, "language_loss": 0.82048386, "learning_rate": 1.2242706748549614e-07, "loss": 0.84258723, "num_input_tokens_seen": 160318720, "step": 7413, "time_per_iteration": 3.5924248695373535 }, { "auxiliary_loss_clip": 0.01273805, "auxiliary_loss_mlp": 0.01023415, "balance_loss_clip": 1.04250765, "balance_loss_mlp": 1.01646769, "epoch": 0.8914807911982204, "flos": 23621500661760.0, "grad_norm": 1.8760207925346373, "language_loss": 0.82348728, "learning_rate": 1.2215885448863473e-07, "loss": 0.84645951, "num_input_tokens_seen": 160339595, "step": 7414, "time_per_iteration": 2.6440443992614746 }, { "auxiliary_loss_clip": 0.01269853, "auxiliary_loss_mlp": 0.01029593, "balance_loss_clip": 1.04681301, "balance_loss_mlp": 1.02293777, "epoch": 0.8916010340888595, "flos": 24462277286400.0, "grad_norm": 1.9237207411845587, "language_loss": 0.80587387, "learning_rate": 1.2189092635878152e-07, "loss": 0.82886839, "num_input_tokens_seen": 160361045, "step": 7415, "time_per_iteration": 3.5700223445892334 }, { "auxiliary_loss_clip": 0.01312938, "auxiliary_loss_mlp": 0.01023724, "balance_loss_clip": 1.04140592, "balance_loss_mlp": 1.01643741, "epoch": 0.8917212769794985, "flos": 21215773313280.0, "grad_norm": 1.638336111605152, "language_loss": 0.77326506, "learning_rate": 1.216232831365822e-07, "loss": 0.79663169, "num_input_tokens_seen": 160379990, "step": 7416, "time_per_iteration": 2.6726181507110596 }, { "auxiliary_loss_clip": 0.0128123, "auxiliary_loss_mlp": 0.01030351, "balance_loss_clip": 1.04786587, "balance_loss_mlp": 1.02295673, "epoch": 0.8918415198701377, "flos": 25513992529920.0, "grad_norm": 1.7894264856593751, "language_loss": 0.80885303, "learning_rate": 1.2135592486263678e-07, "loss": 0.83196884, "num_input_tokens_seen": 160399240, "step": 7417, "time_per_iteration": 2.746429443359375 }, { "auxiliary_loss_clip": 0.01269303, "auxiliary_loss_mlp": 0.01025822, "balance_loss_clip": 1.04512143, "balance_loss_mlp": 1.01934338, "epoch": 0.8919617627607768, "flos": 37853006693760.0, "grad_norm": 2.08542561000155, "language_loss": 0.61509508, "learning_rate": 1.2108885157750415e-07, "loss": 0.63804632, "num_input_tokens_seen": 160421600, "step": 7418, "time_per_iteration": 3.74702787399292 }, { "auxiliary_loss_clip": 0.01314252, "auxiliary_loss_mlp": 0.02564575, "balance_loss_clip": 1.04745388, "balance_loss_mlp": 0.99989754, "epoch": 0.8920820056514158, "flos": 26213676531840.0, "grad_norm": 1.8798390599378836, "language_loss": 0.80302495, "learning_rate": 1.2082206332169897e-07, "loss": 0.84181321, "num_input_tokens_seen": 160441695, "step": 7419, "time_per_iteration": 2.6902048587799072 }, { "auxiliary_loss_clip": 0.01267332, "auxiliary_loss_mlp": 0.01025504, "balance_loss_clip": 1.04717016, "balance_loss_mlp": 1.01877463, "epoch": 0.892202248542055, "flos": 17383135207680.0, "grad_norm": 4.467434391869748, "language_loss": 0.73576033, "learning_rate": 1.2055556013569225e-07, "loss": 0.75868869, "num_input_tokens_seen": 160457205, "step": 7420, "time_per_iteration": 2.6753575801849365 }, { "auxiliary_loss_clip": 0.01170964, "auxiliary_loss_mlp": 0.01026626, "balance_loss_clip": 1.04681921, "balance_loss_mlp": 1.01980174, "epoch": 0.892322491432694, "flos": 21324223451520.0, "grad_norm": 1.699026548396117, "language_loss": 0.82066101, "learning_rate": 1.2028934205991315e-07, "loss": 0.84263688, "num_input_tokens_seen": 160476525, "step": 7421, "time_per_iteration": 2.681149482727051 }, { "auxiliary_loss_clip": 0.01221524, "auxiliary_loss_mlp": 0.01022456, "balance_loss_clip": 1.04450989, "balance_loss_mlp": 1.01566696, "epoch": 0.8924427343233331, "flos": 24029374573440.0, "grad_norm": 1.703928870901403, "language_loss": 0.76672852, "learning_rate": 1.2002340913474607e-07, "loss": 0.78916824, "num_input_tokens_seen": 160500160, "step": 7422, "time_per_iteration": 2.721454381942749 }, { "auxiliary_loss_clip": 0.01177081, "auxiliary_loss_mlp": 0.01023707, "balance_loss_clip": 1.05048084, "balance_loss_mlp": 1.0166322, "epoch": 0.8925629772139723, "flos": 30008069631360.0, "grad_norm": 2.8019508743824373, "language_loss": 0.74046415, "learning_rate": 1.1975776140053317e-07, "loss": 0.76247203, "num_input_tokens_seen": 160520130, "step": 7423, "time_per_iteration": 2.637305736541748 }, { "auxiliary_loss_clip": 0.01370897, "auxiliary_loss_mlp": 0.01028989, "balance_loss_clip": 1.04411769, "balance_loss_mlp": 1.02126408, "epoch": 0.8926832201046113, "flos": 22601709630720.0, "grad_norm": 2.319736781769992, "language_loss": 0.73782921, "learning_rate": 1.194923988975729e-07, "loss": 0.76182806, "num_input_tokens_seen": 160539730, "step": 7424, "time_per_iteration": 2.7085745334625244 }, { "auxiliary_loss_clip": 0.01315229, "auxiliary_loss_mlp": 0.01023509, "balance_loss_clip": 1.04383707, "balance_loss_mlp": 1.01618361, "epoch": 0.8928034629952504, "flos": 13297722117120.0, "grad_norm": 3.8024022098676156, "language_loss": 0.7325964, "learning_rate": 1.192273216661206e-07, "loss": 0.75598377, "num_input_tokens_seen": 160557820, "step": 7425, "time_per_iteration": 2.649780511856079 }, { "auxiliary_loss_clip": 0.01326143, "auxiliary_loss_mlp": 0.00999753, "balance_loss_clip": 1.00782645, "balance_loss_mlp": 0.99890107, "epoch": 0.8929237058858895, "flos": 54854556744960.0, "grad_norm": 0.766753728525189, "language_loss": 0.57428825, "learning_rate": 1.189625297463881e-07, "loss": 0.59754723, "num_input_tokens_seen": 160619510, "step": 7426, "time_per_iteration": 3.362109661102295 }, { "auxiliary_loss_clip": 0.01417386, "auxiliary_loss_mlp": 0.01024467, "balance_loss_clip": 1.03876984, "balance_loss_mlp": 1.01738906, "epoch": 0.8930439487765286, "flos": 28883850785280.0, "grad_norm": 2.0346585891092484, "language_loss": 0.79946548, "learning_rate": 1.1869802317854394e-07, "loss": 0.82388401, "num_input_tokens_seen": 160643295, "step": 7427, "time_per_iteration": 3.2331345081329346 }, { "auxiliary_loss_clip": 0.01370542, "auxiliary_loss_mlp": 0.010323, "balance_loss_clip": 1.04273367, "balance_loss_mlp": 1.02475119, "epoch": 0.8931641916671677, "flos": 22419283432320.0, "grad_norm": 2.0801419532988135, "language_loss": 0.72116125, "learning_rate": 1.1843380200271425e-07, "loss": 0.74518967, "num_input_tokens_seen": 160662495, "step": 7428, "time_per_iteration": 2.7760350704193115 }, { "auxiliary_loss_clip": 0.01218198, "auxiliary_loss_mlp": 0.01021527, "balance_loss_clip": 1.0446049, "balance_loss_mlp": 1.01488113, "epoch": 0.8932844345578068, "flos": 25843149786240.0, "grad_norm": 1.983653176616118, "language_loss": 0.80606115, "learning_rate": 1.181698662589805e-07, "loss": 0.82845843, "num_input_tokens_seen": 160682080, "step": 7429, "time_per_iteration": 2.7385826110839844 }, { "auxiliary_loss_clip": 0.01221169, "auxiliary_loss_mlp": 0.01027838, "balance_loss_clip": 1.04786956, "balance_loss_mlp": 1.02086425, "epoch": 0.8934046774484459, "flos": 22925803069440.0, "grad_norm": 2.4862890489443203, "language_loss": 0.76037651, "learning_rate": 1.1790621598738249e-07, "loss": 0.7828666, "num_input_tokens_seen": 160700395, "step": 7430, "time_per_iteration": 2.606382131576538 }, { "auxiliary_loss_clip": 0.01168182, "auxiliary_loss_mlp": 0.01026161, "balance_loss_clip": 1.04766989, "balance_loss_mlp": 1.01964951, "epoch": 0.8935249203390849, "flos": 24462097718400.0, "grad_norm": 1.950574595682022, "language_loss": 0.74693412, "learning_rate": 1.1764285122791461e-07, "loss": 0.76887751, "num_input_tokens_seen": 160721115, "step": 7431, "time_per_iteration": 2.6603198051452637 }, { "auxiliary_loss_clip": 0.01221331, "auxiliary_loss_mlp": 0.0102677, "balance_loss_clip": 1.04529023, "balance_loss_mlp": 1.01995122, "epoch": 0.8936451632297241, "flos": 15742735966080.0, "grad_norm": 2.09888445231494, "language_loss": 0.76835513, "learning_rate": 1.173797720205294e-07, "loss": 0.79083622, "num_input_tokens_seen": 160739150, "step": 7432, "time_per_iteration": 2.595998764038086 }, { "auxiliary_loss_clip": 0.01223638, "auxiliary_loss_mlp": 0.01024068, "balance_loss_clip": 1.04829931, "balance_loss_mlp": 1.01697826, "epoch": 0.8937654061203631, "flos": 35115500396160.0, "grad_norm": 2.735672757818518, "language_loss": 0.71787536, "learning_rate": 1.1711697840513602e-07, "loss": 0.74035239, "num_input_tokens_seen": 160758585, "step": 7433, "time_per_iteration": 2.7270569801330566 }, { "auxiliary_loss_clip": 0.01215977, "auxiliary_loss_mlp": 0.01020575, "balance_loss_clip": 1.04396331, "balance_loss_mlp": 1.01388431, "epoch": 0.8938856490110022, "flos": 16107444708480.0, "grad_norm": 3.200135976877204, "language_loss": 0.71203959, "learning_rate": 1.1685447042160012e-07, "loss": 0.73440516, "num_input_tokens_seen": 160776620, "step": 7434, "time_per_iteration": 2.7879831790924072 }, { "auxiliary_loss_clip": 0.01176022, "auxiliary_loss_mlp": 0.01023175, "balance_loss_clip": 1.04978931, "balance_loss_mlp": 1.01614475, "epoch": 0.8940058919016414, "flos": 20704189858560.0, "grad_norm": 1.7274605621155177, "language_loss": 0.71718419, "learning_rate": 1.1659224810974367e-07, "loss": 0.73917615, "num_input_tokens_seen": 160796580, "step": 7435, "time_per_iteration": 2.6253976821899414 }, { "auxiliary_loss_clip": 0.01268511, "auxiliary_loss_mlp": 0.01024557, "balance_loss_clip": 1.04817533, "balance_loss_mlp": 1.01754212, "epoch": 0.8941261347922804, "flos": 25229041937280.0, "grad_norm": 1.4708108284795507, "language_loss": 0.68520498, "learning_rate": 1.1633031150934591e-07, "loss": 0.70813566, "num_input_tokens_seen": 160819610, "step": 7436, "time_per_iteration": 2.757740020751953 }, { "auxiliary_loss_clip": 0.01222455, "auxiliary_loss_mlp": 0.01031542, "balance_loss_clip": 1.04774427, "balance_loss_mlp": 1.02439213, "epoch": 0.8942463776829195, "flos": 19537236806400.0, "grad_norm": 1.8880088633221186, "language_loss": 0.80113721, "learning_rate": 1.1606866066014176e-07, "loss": 0.82367718, "num_input_tokens_seen": 160838660, "step": 7437, "time_per_iteration": 3.513551712036133 }, { "auxiliary_loss_clip": 0.01316803, "auxiliary_loss_mlp": 0.01025939, "balance_loss_clip": 1.04494524, "balance_loss_mlp": 1.01954019, "epoch": 0.8943666205735585, "flos": 22301567585280.0, "grad_norm": 2.6227735301043844, "language_loss": 0.75460011, "learning_rate": 1.1580729560182434e-07, "loss": 0.77802753, "num_input_tokens_seen": 160854515, "step": 7438, "time_per_iteration": 2.679056406021118 }, { "auxiliary_loss_clip": 0.01172035, "auxiliary_loss_mlp": 0.02563212, "balance_loss_clip": 1.04852223, "balance_loss_mlp": 0.99990857, "epoch": 0.8944868634641977, "flos": 18912893581440.0, "grad_norm": 1.8291268290991458, "language_loss": 0.71089709, "learning_rate": 1.1554621637404171e-07, "loss": 0.74824953, "num_input_tokens_seen": 160872605, "step": 7439, "time_per_iteration": 3.6175131797790527 }, { "auxiliary_loss_clip": 0.01221475, "auxiliary_loss_mlp": 0.0102661, "balance_loss_clip": 1.04681945, "balance_loss_mlp": 1.0192728, "epoch": 0.8946071063548368, "flos": 14460904241280.0, "grad_norm": 3.811906606866283, "language_loss": 0.61113703, "learning_rate": 1.1528542301639999e-07, "loss": 0.63361788, "num_input_tokens_seen": 160889395, "step": 7440, "time_per_iteration": 2.655818223953247 }, { "auxiliary_loss_clip": 0.0131884, "auxiliary_loss_mlp": 0.0102258, "balance_loss_clip": 1.04246283, "balance_loss_mlp": 1.01547503, "epoch": 0.8947273492454758, "flos": 20084084438400.0, "grad_norm": 3.158675711061776, "language_loss": 0.82595748, "learning_rate": 1.1502491556846105e-07, "loss": 0.84937167, "num_input_tokens_seen": 160907890, "step": 7441, "time_per_iteration": 3.508023977279663 }, { "auxiliary_loss_clip": 0.01267541, "auxiliary_loss_mlp": 0.01022525, "balance_loss_clip": 1.04558384, "balance_loss_mlp": 1.01584959, "epoch": 0.894847592136115, "flos": 18550555136640.0, "grad_norm": 2.2824985270352522, "language_loss": 0.81598788, "learning_rate": 1.1476469406974331e-07, "loss": 0.83888853, "num_input_tokens_seen": 160923490, "step": 7442, "time_per_iteration": 2.6619369983673096 }, { "auxiliary_loss_clip": 0.01171449, "auxiliary_loss_mlp": 0.01022726, "balance_loss_clip": 1.04948092, "balance_loss_mlp": 1.01604474, "epoch": 0.894967835026754, "flos": 23478468704640.0, "grad_norm": 1.8470569852025782, "language_loss": 0.7680034, "learning_rate": 1.1450475855972341e-07, "loss": 0.78994513, "num_input_tokens_seen": 160944280, "step": 7443, "time_per_iteration": 2.6106791496276855 }, { "auxiliary_loss_clip": 0.01270434, "auxiliary_loss_mlp": 0.02565798, "balance_loss_clip": 1.04399228, "balance_loss_mlp": 0.99991864, "epoch": 0.8950880779173931, "flos": 15188310564480.0, "grad_norm": 2.872988061616327, "language_loss": 0.70730484, "learning_rate": 1.1424510907783158e-07, "loss": 0.74566716, "num_input_tokens_seen": 160961560, "step": 7444, "time_per_iteration": 3.613391399383545 }, { "auxiliary_loss_clip": 0.0127495, "auxiliary_loss_mlp": 0.01023224, "balance_loss_clip": 1.04331303, "balance_loss_mlp": 1.01696849, "epoch": 0.8952083208080323, "flos": 22091957769600.0, "grad_norm": 1.8258074210867783, "language_loss": 0.83136523, "learning_rate": 1.1398574566345787e-07, "loss": 0.85434699, "num_input_tokens_seen": 160982195, "step": 7445, "time_per_iteration": 2.8397269248962402 }, { "auxiliary_loss_clip": 0.01274431, "auxiliary_loss_mlp": 0.01024811, "balance_loss_clip": 1.04286206, "balance_loss_mlp": 1.01725268, "epoch": 0.8953285636986713, "flos": 23254026572160.0, "grad_norm": 2.0306358963170177, "language_loss": 0.82266021, "learning_rate": 1.1372666835594702e-07, "loss": 0.84565258, "num_input_tokens_seen": 161000520, "step": 7446, "time_per_iteration": 2.719120740890503 }, { "auxiliary_loss_clip": 0.01267771, "auxiliary_loss_mlp": 0.01021069, "balance_loss_clip": 1.04481506, "balance_loss_mlp": 1.01432824, "epoch": 0.8954488065893104, "flos": 16362661818240.0, "grad_norm": 2.3062669365785657, "language_loss": 0.71871161, "learning_rate": 1.1346787719460071e-07, "loss": 0.74160004, "num_input_tokens_seen": 161019405, "step": 7447, "time_per_iteration": 2.6243927478790283 }, { "auxiliary_loss_clip": 0.01263673, "auxiliary_loss_mlp": 0.01023111, "balance_loss_clip": 1.0434978, "balance_loss_mlp": 1.01615214, "epoch": 0.8955690494799495, "flos": 18257883120000.0, "grad_norm": 1.8309592158875168, "language_loss": 0.72581756, "learning_rate": 1.1320937221867732e-07, "loss": 0.74868536, "num_input_tokens_seen": 161036985, "step": 7448, "time_per_iteration": 2.708491563796997 }, { "auxiliary_loss_clip": 0.01271219, "auxiliary_loss_mlp": 0.01024573, "balance_loss_clip": 1.04415321, "balance_loss_mlp": 1.01870489, "epoch": 0.8956892923705886, "flos": 25447486498560.0, "grad_norm": 1.859633369194452, "language_loss": 0.79760867, "learning_rate": 1.1295115346739192e-07, "loss": 0.82056659, "num_input_tokens_seen": 161056985, "step": 7449, "time_per_iteration": 2.71567964553833 }, { "auxiliary_loss_clip": 0.01275894, "auxiliary_loss_mlp": 0.01025998, "balance_loss_clip": 1.04665494, "balance_loss_mlp": 1.01846981, "epoch": 0.8958095352612276, "flos": 52661883939840.0, "grad_norm": 2.7568388437982865, "language_loss": 0.72908819, "learning_rate": 1.1269322097991629e-07, "loss": 0.75210714, "num_input_tokens_seen": 161080270, "step": 7450, "time_per_iteration": 2.920205593109131 }, { "auxiliary_loss_clip": 0.01226686, "auxiliary_loss_mlp": 0.01024164, "balance_loss_clip": 1.04909861, "balance_loss_mlp": 1.01661849, "epoch": 0.8959297781518668, "flos": 23186335392000.0, "grad_norm": 3.3742909957778724, "language_loss": 0.68125933, "learning_rate": 1.1243557479537846e-07, "loss": 0.7037679, "num_input_tokens_seen": 161100160, "step": 7451, "time_per_iteration": 2.64849853515625 }, { "auxiliary_loss_clip": 0.01171413, "auxiliary_loss_mlp": 0.01026197, "balance_loss_clip": 1.04722667, "balance_loss_mlp": 1.01920867, "epoch": 0.8960500210425059, "flos": 20334309557760.0, "grad_norm": 2.1579578568283626, "language_loss": 0.68830919, "learning_rate": 1.121782149528634e-07, "loss": 0.71028537, "num_input_tokens_seen": 161117260, "step": 7452, "time_per_iteration": 2.588974714279175 }, { "auxiliary_loss_clip": 0.0118148, "auxiliary_loss_mlp": 0.01025433, "balance_loss_clip": 1.05143917, "balance_loss_mlp": 1.01872802, "epoch": 0.8961702639331449, "flos": 19901694153600.0, "grad_norm": 2.066919308968981, "language_loss": 0.7885235, "learning_rate": 1.1192114149141208e-07, "loss": 0.81059265, "num_input_tokens_seen": 161136895, "step": 7453, "time_per_iteration": 2.738297700881958 }, { "auxiliary_loss_clip": 0.01272996, "auxiliary_loss_mlp": 0.01025128, "balance_loss_clip": 1.04311335, "balance_loss_mlp": 1.0181067, "epoch": 0.8962905068237841, "flos": 12896348567040.0, "grad_norm": 2.9272641775470047, "language_loss": 0.65946448, "learning_rate": 1.1166435445002197e-07, "loss": 0.6824457, "num_input_tokens_seen": 161154565, "step": 7454, "time_per_iteration": 2.6650853157043457 }, { "auxiliary_loss_clip": 0.01226697, "auxiliary_loss_mlp": 0.01028852, "balance_loss_clip": 1.05113947, "balance_loss_mlp": 1.02147293, "epoch": 0.8964107497144231, "flos": 23440331439360.0, "grad_norm": 2.1144603638334383, "language_loss": 0.68307322, "learning_rate": 1.1140785386764818e-07, "loss": 0.70562875, "num_input_tokens_seen": 161173265, "step": 7455, "time_per_iteration": 2.694751024246216 }, { "auxiliary_loss_clip": 0.0121878, "auxiliary_loss_mlp": 0.01026497, "balance_loss_clip": 1.04538798, "balance_loss_mlp": 1.01955903, "epoch": 0.8965309926050622, "flos": 19500176949120.0, "grad_norm": 2.273352555165449, "language_loss": 0.69960946, "learning_rate": 1.1115163978320153e-07, "loss": 0.72206223, "num_input_tokens_seen": 161191995, "step": 7456, "time_per_iteration": 2.660167932510376 }, { "auxiliary_loss_clip": 0.01228694, "auxiliary_loss_mlp": 0.02567451, "balance_loss_clip": 1.04931855, "balance_loss_mlp": 0.99991131, "epoch": 0.8966512354957014, "flos": 28658008022400.0, "grad_norm": 2.0431072746153283, "language_loss": 0.82509387, "learning_rate": 1.1089571223554917e-07, "loss": 0.86305535, "num_input_tokens_seen": 161212880, "step": 7457, "time_per_iteration": 2.7142207622528076 }, { "auxiliary_loss_clip": 0.01222832, "auxiliary_loss_mlp": 0.01026911, "balance_loss_clip": 1.0447551, "balance_loss_mlp": 1.02036095, "epoch": 0.8967714783863404, "flos": 23370916406400.0, "grad_norm": 1.946244636229548, "language_loss": 0.85356247, "learning_rate": 1.1064007126351537e-07, "loss": 0.87605989, "num_input_tokens_seen": 161233595, "step": 7458, "time_per_iteration": 2.6650357246398926 }, { "auxiliary_loss_clip": 0.01268553, "auxiliary_loss_mlp": 0.01029939, "balance_loss_clip": 1.04831374, "balance_loss_mlp": 1.02261615, "epoch": 0.8968917212769795, "flos": 24535175938560.0, "grad_norm": 2.271810247489334, "language_loss": 0.76382416, "learning_rate": 1.1038471690588003e-07, "loss": 0.78680909, "num_input_tokens_seen": 161252740, "step": 7459, "time_per_iteration": 2.730560541152954 }, { "auxiliary_loss_clip": 0.01376834, "auxiliary_loss_mlp": 0.01024605, "balance_loss_clip": 1.04808187, "balance_loss_mlp": 1.01780701, "epoch": 0.8970119641676186, "flos": 23475416048640.0, "grad_norm": 2.1198183809265445, "language_loss": 0.79906881, "learning_rate": 1.1012964920138145e-07, "loss": 0.82308328, "num_input_tokens_seen": 161272325, "step": 7460, "time_per_iteration": 2.7862727642059326 }, { "auxiliary_loss_clip": 0.0126908, "auxiliary_loss_mlp": 0.01025106, "balance_loss_clip": 1.04215586, "balance_loss_mlp": 1.01826978, "epoch": 0.8971322070582577, "flos": 24538192680960.0, "grad_norm": 1.99096799982235, "language_loss": 0.759341, "learning_rate": 1.0987486818871205e-07, "loss": 0.78228283, "num_input_tokens_seen": 161295915, "step": 7461, "time_per_iteration": 2.7263381481170654 }, { "auxiliary_loss_clip": 0.01224118, "auxiliary_loss_mlp": 0.02568371, "balance_loss_clip": 1.05024731, "balance_loss_mlp": 0.99993587, "epoch": 0.8972524499488967, "flos": 21797454159360.0, "grad_norm": 2.2555295089053096, "language_loss": 0.73140419, "learning_rate": 1.0962037390652245e-07, "loss": 0.76932907, "num_input_tokens_seen": 161314935, "step": 7462, "time_per_iteration": 2.6176726818084717 }, { "auxiliary_loss_clip": 0.01279495, "auxiliary_loss_mlp": 0.01023731, "balance_loss_clip": 1.05000925, "balance_loss_mlp": 1.01652527, "epoch": 0.8973726928395359, "flos": 21726243446400.0, "grad_norm": 1.865926534015747, "language_loss": 0.71854395, "learning_rate": 1.0936616639341911e-07, "loss": 0.74157619, "num_input_tokens_seen": 161335225, "step": 7463, "time_per_iteration": 3.650050401687622 }, { "auxiliary_loss_clip": 0.0110757, "auxiliary_loss_mlp": 0.01002146, "balance_loss_clip": 1.00658059, "balance_loss_mlp": 1.00127554, "epoch": 0.897492935730175, "flos": 53837100097920.0, "grad_norm": 0.7515293860408219, "language_loss": 0.5469805, "learning_rate": 1.0911224568796473e-07, "loss": 0.56807768, "num_input_tokens_seen": 161393420, "step": 7464, "time_per_iteration": 3.2196896076202393 }, { "auxiliary_loss_clip": 0.01220109, "auxiliary_loss_mlp": 0.01028935, "balance_loss_clip": 1.04888582, "balance_loss_mlp": 1.02193773, "epoch": 0.897613178620814, "flos": 18290346036480.0, "grad_norm": 2.105001595506374, "language_loss": 0.7086823, "learning_rate": 1.0885861182867984e-07, "loss": 0.73117268, "num_input_tokens_seen": 161411525, "step": 7465, "time_per_iteration": 3.6072964668273926 }, { "auxiliary_loss_clip": 0.01274021, "auxiliary_loss_mlp": 0.01023225, "balance_loss_clip": 1.0458889, "balance_loss_mlp": 1.0162816, "epoch": 0.8977334215114532, "flos": 32993718059520.0, "grad_norm": 6.704834944609961, "language_loss": 0.70965993, "learning_rate": 1.0860526485403942e-07, "loss": 0.7326324, "num_input_tokens_seen": 161432800, "step": 7466, "time_per_iteration": 2.7508723735809326 }, { "auxiliary_loss_clip": 0.01172502, "auxiliary_loss_mlp": 0.01025707, "balance_loss_clip": 1.04863405, "balance_loss_mlp": 1.01880217, "epoch": 0.8978536644020922, "flos": 15195636938880.0, "grad_norm": 1.853415290471086, "language_loss": 0.7722007, "learning_rate": 1.0835220480247675e-07, "loss": 0.79418278, "num_input_tokens_seen": 161451295, "step": 7467, "time_per_iteration": 3.4982364177703857 }, { "auxiliary_loss_clip": 0.01268937, "auxiliary_loss_mlp": 0.01031744, "balance_loss_clip": 1.04581821, "balance_loss_mlp": 1.024544, "epoch": 0.8979739072927313, "flos": 18004389863040.0, "grad_norm": 2.6024824638014614, "language_loss": 0.83871281, "learning_rate": 1.0809943171238067e-07, "loss": 0.86171961, "num_input_tokens_seen": 161469220, "step": 7468, "time_per_iteration": 2.6138577461242676 }, { "auxiliary_loss_clip": 0.01280853, "auxiliary_loss_mlp": 0.01029097, "balance_loss_clip": 1.04820573, "balance_loss_mlp": 1.02118993, "epoch": 0.8980941501833704, "flos": 22271546793600.0, "grad_norm": 2.6663712577394048, "language_loss": 0.63135219, "learning_rate": 1.078469456220965e-07, "loss": 0.65445167, "num_input_tokens_seen": 161489375, "step": 7469, "time_per_iteration": 2.682008981704712 }, { "auxiliary_loss_clip": 0.01122736, "auxiliary_loss_mlp": 0.01027548, "balance_loss_clip": 1.0467689, "balance_loss_mlp": 1.02028561, "epoch": 0.8982143930740095, "flos": 37560729726720.0, "grad_norm": 1.8064353644501332, "language_loss": 0.69543916, "learning_rate": 1.0759474656992606e-07, "loss": 0.71694201, "num_input_tokens_seen": 161512145, "step": 7470, "time_per_iteration": 3.6774489879608154 }, { "auxiliary_loss_clip": 0.01274846, "auxiliary_loss_mlp": 0.01024037, "balance_loss_clip": 1.0433954, "balance_loss_mlp": 1.01627624, "epoch": 0.8983346359646486, "flos": 18076893465600.0, "grad_norm": 2.5389913183402952, "language_loss": 0.77677435, "learning_rate": 1.0734283459412785e-07, "loss": 0.7997632, "num_input_tokens_seen": 161528995, "step": 7471, "time_per_iteration": 2.635111093521118 }, { "auxiliary_loss_clip": 0.01374277, "auxiliary_loss_mlp": 0.01022286, "balance_loss_clip": 1.04308391, "balance_loss_mlp": 1.0146805, "epoch": 0.8984548788552876, "flos": 20558895344640.0, "grad_norm": 1.634634097166387, "language_loss": 0.80165792, "learning_rate": 1.0709120973291707e-07, "loss": 0.82562363, "num_input_tokens_seen": 161548775, "step": 7472, "time_per_iteration": 2.7310078144073486 }, { "auxiliary_loss_clip": 0.01174979, "auxiliary_loss_mlp": 0.01027048, "balance_loss_clip": 1.0497508, "balance_loss_mlp": 1.02002656, "epoch": 0.8985751217459268, "flos": 17785442511360.0, "grad_norm": 3.6204237901506695, "language_loss": 0.77568388, "learning_rate": 1.0683987202446475e-07, "loss": 0.79770416, "num_input_tokens_seen": 161566960, "step": 7473, "time_per_iteration": 2.5579640865325928 }, { "auxiliary_loss_clip": 0.01227096, "auxiliary_loss_mlp": 0.01027612, "balance_loss_clip": 1.04763186, "balance_loss_mlp": 1.02024531, "epoch": 0.8986953646365659, "flos": 21617003208960.0, "grad_norm": 2.3505575453807133, "language_loss": 0.69899327, "learning_rate": 1.0658882150689862e-07, "loss": 0.72154033, "num_input_tokens_seen": 161585820, "step": 7474, "time_per_iteration": 2.6168203353881836 }, { "auxiliary_loss_clip": 0.01322075, "auxiliary_loss_mlp": 0.01021835, "balance_loss_clip": 1.04575276, "balance_loss_mlp": 1.01477814, "epoch": 0.8988156075272049, "flos": 14027355083520.0, "grad_norm": 2.454648144626863, "language_loss": 0.78675234, "learning_rate": 1.0633805821830288e-07, "loss": 0.81019151, "num_input_tokens_seen": 161602505, "step": 7475, "time_per_iteration": 2.6135222911834717 }, { "auxiliary_loss_clip": 0.01274225, "auxiliary_loss_mlp": 0.01023593, "balance_loss_clip": 1.04665256, "balance_loss_mlp": 1.0167861, "epoch": 0.8989358504178441, "flos": 29059202004480.0, "grad_norm": 4.106360224198777, "language_loss": 0.83086103, "learning_rate": 1.0608758219671753e-07, "loss": 0.85383922, "num_input_tokens_seen": 161621545, "step": 7476, "time_per_iteration": 2.738093852996826 }, { "auxiliary_loss_clip": 0.01278029, "auxiliary_loss_mlp": 0.01027867, "balance_loss_clip": 1.04658771, "balance_loss_mlp": 1.02100646, "epoch": 0.8990560933084831, "flos": 20230420446720.0, "grad_norm": 1.5428873629979003, "language_loss": 0.7042864, "learning_rate": 1.0583739348014065e-07, "loss": 0.72734535, "num_input_tokens_seen": 161642630, "step": 7477, "time_per_iteration": 2.6709177494049072 }, { "auxiliary_loss_clip": 0.01176369, "auxiliary_loss_mlp": 0.01030368, "balance_loss_clip": 1.0514394, "balance_loss_mlp": 1.02276576, "epoch": 0.8991763361991222, "flos": 25520672459520.0, "grad_norm": 2.29154686661569, "language_loss": 0.84668732, "learning_rate": 1.0558749210652518e-07, "loss": 0.86875468, "num_input_tokens_seen": 161662560, "step": 7478, "time_per_iteration": 2.8208038806915283 }, { "auxiliary_loss_clip": 0.01324278, "auxiliary_loss_mlp": 0.01031149, "balance_loss_clip": 1.0448091, "balance_loss_mlp": 1.02405643, "epoch": 0.8992965790897613, "flos": 25119191168640.0, "grad_norm": 1.736854734918298, "language_loss": 0.85555029, "learning_rate": 1.053378781137808e-07, "loss": 0.87910461, "num_input_tokens_seen": 161683480, "step": 7479, "time_per_iteration": 2.7536044120788574 }, { "auxiliary_loss_clip": 0.01173407, "auxiliary_loss_mlp": 0.01030559, "balance_loss_clip": 1.04563189, "balance_loss_mlp": 1.02324831, "epoch": 0.8994168219804004, "flos": 16070815814400.0, "grad_norm": 1.9355726600668008, "language_loss": 0.77589869, "learning_rate": 1.0508855153977392e-07, "loss": 0.79793835, "num_input_tokens_seen": 161699945, "step": 7480, "time_per_iteration": 2.6563968658447266 }, { "auxiliary_loss_clip": 0.01221793, "auxiliary_loss_mlp": 0.01022343, "balance_loss_clip": 1.04486907, "balance_loss_mlp": 1.01520276, "epoch": 0.8995370648710395, "flos": 24825764966400.0, "grad_norm": 2.8920908665007956, "language_loss": 0.67190742, "learning_rate": 1.0483951242232669e-07, "loss": 0.69434875, "num_input_tokens_seen": 161720420, "step": 7481, "time_per_iteration": 2.628389596939087 }, { "auxiliary_loss_clip": 0.01060649, "auxiliary_loss_mlp": 0.00997951, "balance_loss_clip": 1.00633025, "balance_loss_mlp": 0.99707472, "epoch": 0.8996573077616786, "flos": 63116238378240.0, "grad_norm": 0.9696953615608013, "language_loss": 0.57700312, "learning_rate": 1.0459076079921936e-07, "loss": 0.59758902, "num_input_tokens_seen": 161773080, "step": 7482, "time_per_iteration": 3.224740982055664 }, { "auxiliary_loss_clip": 0.0126797, "auxiliary_loss_mlp": 0.01028726, "balance_loss_clip": 1.04575109, "balance_loss_mlp": 1.02119517, "epoch": 0.8997775506523177, "flos": 18219674027520.0, "grad_norm": 2.481346458767828, "language_loss": 0.85167187, "learning_rate": 1.0434229670818618e-07, "loss": 0.87463892, "num_input_tokens_seen": 161789755, "step": 7483, "time_per_iteration": 2.6706936359405518 }, { "auxiliary_loss_clip": 0.01265196, "auxiliary_loss_mlp": 0.01028356, "balance_loss_clip": 1.04362881, "balance_loss_mlp": 1.02185631, "epoch": 0.8998977935429567, "flos": 24166768095360.0, "grad_norm": 2.369761021471158, "language_loss": 0.8015337, "learning_rate": 1.0409412018691944e-07, "loss": 0.82446927, "num_input_tokens_seen": 161810220, "step": 7484, "time_per_iteration": 2.6955106258392334 }, { "auxiliary_loss_clip": 0.01270777, "auxiliary_loss_mlp": 0.01031502, "balance_loss_clip": 1.04529834, "balance_loss_mlp": 1.02454019, "epoch": 0.9000180364335959, "flos": 20773030273920.0, "grad_norm": 1.8416445492711069, "language_loss": 0.75114036, "learning_rate": 1.0384623127306724e-07, "loss": 0.77416313, "num_input_tokens_seen": 161827565, "step": 7485, "time_per_iteration": 2.709456443786621 }, { "auxiliary_loss_clip": 0.01320474, "auxiliary_loss_mlp": 0.0102721, "balance_loss_clip": 1.0423305, "balance_loss_mlp": 1.02038848, "epoch": 0.900138279324235, "flos": 19205745166080.0, "grad_norm": 2.0439805831730005, "language_loss": 0.79504788, "learning_rate": 1.0359863000423397e-07, "loss": 0.81852472, "num_input_tokens_seen": 161845700, "step": 7486, "time_per_iteration": 2.7102298736572266 }, { "auxiliary_loss_clip": 0.01173825, "auxiliary_loss_mlp": 0.0102376, "balance_loss_clip": 1.04847646, "balance_loss_mlp": 1.01726627, "epoch": 0.900258522214874, "flos": 28731158069760.0, "grad_norm": 1.5551303531018585, "language_loss": 0.7188834, "learning_rate": 1.0335131641798112e-07, "loss": 0.74085927, "num_input_tokens_seen": 161867660, "step": 7487, "time_per_iteration": 2.7350993156433105 }, { "auxiliary_loss_clip": 0.01165521, "auxiliary_loss_mlp": 0.01002408, "balance_loss_clip": 1.00582445, "balance_loss_mlp": 1.0015738, "epoch": 0.9003787651055132, "flos": 58280685655680.0, "grad_norm": 0.8358813554376188, "language_loss": 0.55542231, "learning_rate": 1.0310429055182512e-07, "loss": 0.57710159, "num_input_tokens_seen": 161921980, "step": 7488, "time_per_iteration": 3.0950400829315186 }, { "auxiliary_loss_clip": 0.01321466, "auxiliary_loss_mlp": 0.01032557, "balance_loss_clip": 1.044451, "balance_loss_mlp": 1.02547002, "epoch": 0.9004990079961522, "flos": 25556475340800.0, "grad_norm": 1.8380410420872626, "language_loss": 0.73928571, "learning_rate": 1.0285755244324024e-07, "loss": 0.76282597, "num_input_tokens_seen": 161942725, "step": 7489, "time_per_iteration": 2.766084909439087 }, { "auxiliary_loss_clip": 0.01272954, "auxiliary_loss_mlp": 0.0256223, "balance_loss_clip": 1.0436542, "balance_loss_mlp": 0.99991298, "epoch": 0.9006192508867913, "flos": 23335185352320.0, "grad_norm": 1.6905846878227084, "language_loss": 0.69032073, "learning_rate": 1.0261110212965629e-07, "loss": 0.72867262, "num_input_tokens_seen": 161964520, "step": 7490, "time_per_iteration": 3.6712255477905273 }, { "auxiliary_loss_clip": 0.01271008, "auxiliary_loss_mlp": 0.01028271, "balance_loss_clip": 1.0458647, "balance_loss_mlp": 1.021613, "epoch": 0.9007394937774305, "flos": 18040300485120.0, "grad_norm": 2.047707951794858, "language_loss": 0.79010326, "learning_rate": 1.023649396484596e-07, "loss": 0.81309605, "num_input_tokens_seen": 161983575, "step": 7491, "time_per_iteration": 3.613166570663452 }, { "auxiliary_loss_clip": 0.01173823, "auxiliary_loss_mlp": 0.01023711, "balance_loss_clip": 1.04942274, "balance_loss_mlp": 1.01693368, "epoch": 0.9008597366680695, "flos": 43068456633600.0, "grad_norm": 3.8193669455907067, "language_loss": 0.67765427, "learning_rate": 1.0211906503699275e-07, "loss": 0.69962966, "num_input_tokens_seen": 162006550, "step": 7492, "time_per_iteration": 3.7057671546936035 }, { "auxiliary_loss_clip": 0.0122452, "auxiliary_loss_mlp": 0.01029053, "balance_loss_clip": 1.05028403, "balance_loss_mlp": 1.02162313, "epoch": 0.9009799795587086, "flos": 14939055112320.0, "grad_norm": 3.1905690613604545, "language_loss": 0.82415223, "learning_rate": 1.0187347833255455e-07, "loss": 0.84668791, "num_input_tokens_seen": 162022455, "step": 7493, "time_per_iteration": 2.625375270843506 }, { "auxiliary_loss_clip": 0.01171429, "auxiliary_loss_mlp": 0.01031196, "balance_loss_clip": 1.04959643, "balance_loss_mlp": 1.02459192, "epoch": 0.9011002224493477, "flos": 21579584215680.0, "grad_norm": 1.8358632121547218, "language_loss": 0.7933234, "learning_rate": 1.0162817957240056e-07, "loss": 0.8153497, "num_input_tokens_seen": 162042350, "step": 7494, "time_per_iteration": 2.580335855484009 }, { "auxiliary_loss_clip": 0.01114411, "auxiliary_loss_mlp": 0.00998736, "balance_loss_clip": 1.00681853, "balance_loss_mlp": 0.9978773, "epoch": 0.9012204653399868, "flos": 71166367883520.0, "grad_norm": 0.886071762828719, "language_loss": 0.63015676, "learning_rate": 1.0138316879374253e-07, "loss": 0.65128821, "num_input_tokens_seen": 162111640, "step": 7495, "time_per_iteration": 3.3735499382019043 }, { "auxiliary_loss_clip": 0.01270135, "auxiliary_loss_mlp": 0.01030145, "balance_loss_clip": 1.04762268, "balance_loss_mlp": 1.02316213, "epoch": 0.9013407082306258, "flos": 15594963413760.0, "grad_norm": 2.202398254874286, "language_loss": 0.74664831, "learning_rate": 1.0113844603374833e-07, "loss": 0.76965117, "num_input_tokens_seen": 162128165, "step": 7496, "time_per_iteration": 2.5921730995178223 }, { "auxiliary_loss_clip": 0.01272149, "auxiliary_loss_mlp": 0.01027984, "balance_loss_clip": 1.04500937, "balance_loss_mlp": 1.02024722, "epoch": 0.901460951121265, "flos": 15049157276160.0, "grad_norm": 2.790567710390583, "language_loss": 0.72166419, "learning_rate": 1.0089401132954178e-07, "loss": 0.7446655, "num_input_tokens_seen": 162146145, "step": 7497, "time_per_iteration": 3.555449962615967 }, { "auxiliary_loss_clip": 0.01270994, "auxiliary_loss_mlp": 0.01025916, "balance_loss_clip": 1.04588842, "balance_loss_mlp": 1.01949978, "epoch": 0.9015811940119041, "flos": 22236857233920.0, "grad_norm": 1.9105340086231528, "language_loss": 0.72360879, "learning_rate": 1.006498647182037e-07, "loss": 0.74657786, "num_input_tokens_seen": 162164800, "step": 7498, "time_per_iteration": 2.696904420852661 }, { "auxiliary_loss_clip": 0.01421118, "auxiliary_loss_mlp": 0.01031219, "balance_loss_clip": 1.03952694, "balance_loss_mlp": 1.02384579, "epoch": 0.9017014369025431, "flos": 24973824827520.0, "grad_norm": 2.341608087218323, "language_loss": 0.71763724, "learning_rate": 1.004060062367713e-07, "loss": 0.74216062, "num_input_tokens_seen": 162185895, "step": 7499, "time_per_iteration": 2.809385299682617 }, { "auxiliary_loss_clip": 0.0122483, "auxiliary_loss_mlp": 0.01030025, "balance_loss_clip": 1.04810238, "balance_loss_mlp": 1.02252114, "epoch": 0.9018216797931822, "flos": 18114168804480.0, "grad_norm": 1.8700754611425907, "language_loss": 0.69837832, "learning_rate": 1.0016243592223728e-07, "loss": 0.72092688, "num_input_tokens_seen": 162206295, "step": 7500, "time_per_iteration": 2.6733274459838867 }, { "auxiliary_loss_clip": 0.01416003, "auxiliary_loss_mlp": 0.01023758, "balance_loss_clip": 1.04120326, "balance_loss_mlp": 1.01676369, "epoch": 0.9019419226838213, "flos": 37268452759680.0, "grad_norm": 2.0774841038783625, "language_loss": 0.65575558, "learning_rate": 9.991915381155114e-08, "loss": 0.68015319, "num_input_tokens_seen": 162229275, "step": 7501, "time_per_iteration": 2.8910129070281982 }, { "auxiliary_loss_clip": 0.01226024, "auxiliary_loss_mlp": 0.0103036, "balance_loss_clip": 1.04775643, "balance_loss_mlp": 1.02271295, "epoch": 0.9020621655744604, "flos": 23441121538560.0, "grad_norm": 2.484042692348454, "language_loss": 0.75234711, "learning_rate": 9.967615994161871e-08, "loss": 0.77491093, "num_input_tokens_seen": 162248935, "step": 7502, "time_per_iteration": 2.695986747741699 }, { "auxiliary_loss_clip": 0.0117186, "auxiliary_loss_mlp": 0.01028289, "balance_loss_clip": 1.04894328, "balance_loss_mlp": 1.02152956, "epoch": 0.9021824084650995, "flos": 22857465444480.0, "grad_norm": 1.8063508278906053, "language_loss": 0.78607059, "learning_rate": 9.943345434930161e-08, "loss": 0.80807209, "num_input_tokens_seen": 162269185, "step": 7503, "time_per_iteration": 2.640256643295288 }, { "auxiliary_loss_clip": 0.01320915, "auxiliary_loss_mlp": 0.01027645, "balance_loss_clip": 1.04855204, "balance_loss_mlp": 1.02068353, "epoch": 0.9023026513557386, "flos": 22127581082880.0, "grad_norm": 4.223339988793139, "language_loss": 0.69531602, "learning_rate": 9.919103707141885e-08, "loss": 0.71880162, "num_input_tokens_seen": 162288065, "step": 7504, "time_per_iteration": 2.8407580852508545 }, { "auxiliary_loss_clip": 0.01224573, "auxiliary_loss_mlp": 0.01022137, "balance_loss_clip": 1.04934037, "balance_loss_mlp": 1.01476097, "epoch": 0.9024228942463777, "flos": 24199087357440.0, "grad_norm": 2.1875777265237044, "language_loss": 0.76885998, "learning_rate": 9.89489081447441e-08, "loss": 0.79132712, "num_input_tokens_seen": 162305265, "step": 7505, "time_per_iteration": 2.5926425457000732 }, { "auxiliary_loss_clip": 0.01269371, "auxiliary_loss_mlp": 0.01026283, "balance_loss_clip": 1.0434649, "balance_loss_mlp": 1.01894617, "epoch": 0.9025431371370167, "flos": 25008262992000.0, "grad_norm": 1.8559899479002695, "language_loss": 0.83010793, "learning_rate": 9.870706760600844e-08, "loss": 0.85306448, "num_input_tokens_seen": 162325215, "step": 7506, "time_per_iteration": 2.7051470279693604 }, { "auxiliary_loss_clip": 0.0127367, "auxiliary_loss_mlp": 0.01030707, "balance_loss_clip": 1.04827189, "balance_loss_mlp": 1.02350426, "epoch": 0.9026633800276559, "flos": 18952862440320.0, "grad_norm": 1.9146514529508956, "language_loss": 0.72864532, "learning_rate": 9.846551549189918e-08, "loss": 0.75168914, "num_input_tokens_seen": 162344820, "step": 7507, "time_per_iteration": 2.7136874198913574 }, { "auxiliary_loss_clip": 0.01268434, "auxiliary_loss_mlp": 0.01026358, "balance_loss_clip": 1.04670918, "balance_loss_mlp": 1.01930678, "epoch": 0.902783622918295, "flos": 32416059536640.0, "grad_norm": 3.5028851586095753, "language_loss": 0.68788987, "learning_rate": 9.822425183905902e-08, "loss": 0.71083772, "num_input_tokens_seen": 162365345, "step": 7508, "time_per_iteration": 2.733797550201416 }, { "auxiliary_loss_clip": 0.01219844, "auxiliary_loss_mlp": 0.01001584, "balance_loss_clip": 1.00651741, "balance_loss_mlp": 1.0006901, "epoch": 0.902903865808934, "flos": 63717453244800.0, "grad_norm": 0.9199189282899558, "language_loss": 0.75053149, "learning_rate": 9.798327668408823e-08, "loss": 0.77274579, "num_input_tokens_seen": 162426980, "step": 7509, "time_per_iteration": 3.4521212577819824 }, { "auxiliary_loss_clip": 0.01175083, "auxiliary_loss_mlp": 0.01024423, "balance_loss_clip": 1.04833353, "balance_loss_mlp": 1.01678717, "epoch": 0.9030241086995732, "flos": 23804034600960.0, "grad_norm": 2.0839712458436406, "language_loss": 0.68922776, "learning_rate": 9.774259006354158e-08, "loss": 0.71122277, "num_input_tokens_seen": 162447050, "step": 7510, "time_per_iteration": 2.569611072540283 }, { "auxiliary_loss_clip": 0.01276853, "auxiliary_loss_mlp": 0.01024999, "balance_loss_clip": 1.04596734, "balance_loss_mlp": 1.01830554, "epoch": 0.9031443515902122, "flos": 26395887248640.0, "grad_norm": 3.3142292181327955, "language_loss": 0.76575935, "learning_rate": 9.750219201393184e-08, "loss": 0.78877783, "num_input_tokens_seen": 162467015, "step": 7511, "time_per_iteration": 2.7352912425994873 }, { "auxiliary_loss_clip": 0.01221613, "auxiliary_loss_mlp": 0.0103017, "balance_loss_clip": 1.04768825, "balance_loss_mlp": 1.02313066, "epoch": 0.9032645944808513, "flos": 24939350749440.0, "grad_norm": 2.267444862976855, "language_loss": 0.7820434, "learning_rate": 9.726208257172697e-08, "loss": 0.80456126, "num_input_tokens_seen": 162488710, "step": 7512, "time_per_iteration": 2.672280788421631 }, { "auxiliary_loss_clip": 0.01172341, "auxiliary_loss_mlp": 0.01026582, "balance_loss_clip": 1.04909003, "balance_loss_mlp": 1.01955509, "epoch": 0.9033848373714904, "flos": 21178821196800.0, "grad_norm": 2.0059591334146423, "language_loss": 0.74614179, "learning_rate": 9.702226177335115e-08, "loss": 0.76813108, "num_input_tokens_seen": 162507205, "step": 7513, "time_per_iteration": 2.6243467330932617 }, { "auxiliary_loss_clip": 0.01272383, "auxiliary_loss_mlp": 0.01029451, "balance_loss_clip": 1.04838562, "balance_loss_mlp": 1.02253723, "epoch": 0.9035050802621295, "flos": 26286359702400.0, "grad_norm": 1.6510426508389227, "language_loss": 0.7270925, "learning_rate": 9.67827296551853e-08, "loss": 0.75011086, "num_input_tokens_seen": 162528490, "step": 7514, "time_per_iteration": 2.667962074279785 }, { "auxiliary_loss_clip": 0.01267996, "auxiliary_loss_mlp": 0.02563967, "balance_loss_clip": 1.04453218, "balance_loss_mlp": 0.99993706, "epoch": 0.9036253231527686, "flos": 24204546224640.0, "grad_norm": 1.926111310567, "language_loss": 0.68498391, "learning_rate": 9.65434862535659e-08, "loss": 0.7233035, "num_input_tokens_seen": 162547860, "step": 7515, "time_per_iteration": 3.6485180854797363 }, { "auxiliary_loss_clip": 0.01278333, "auxiliary_loss_mlp": 0.01032321, "balance_loss_clip": 1.04795146, "balance_loss_mlp": 1.02523732, "epoch": 0.9037455660434077, "flos": 18072655660800.0, "grad_norm": 3.258028218521879, "language_loss": 0.65159637, "learning_rate": 9.630453160478635e-08, "loss": 0.67470294, "num_input_tokens_seen": 162563215, "step": 7516, "time_per_iteration": 2.614276885986328 }, { "auxiliary_loss_clip": 0.01364258, "auxiliary_loss_mlp": 0.01025226, "balance_loss_clip": 1.0404532, "balance_loss_mlp": 1.01873195, "epoch": 0.9038658089340468, "flos": 24060795995520.0, "grad_norm": 2.2533589307828583, "language_loss": 0.82512647, "learning_rate": 9.60658657450959e-08, "loss": 0.84902132, "num_input_tokens_seen": 162583515, "step": 7517, "time_per_iteration": 3.716310977935791 }, { "auxiliary_loss_clip": 0.01261956, "auxiliary_loss_mlp": 0.01023044, "balance_loss_clip": 1.04209471, "balance_loss_mlp": 1.01663923, "epoch": 0.9039860518246858, "flos": 21834298535040.0, "grad_norm": 1.7407568061763332, "language_loss": 0.79613137, "learning_rate": 9.582748871069979e-08, "loss": 0.81898141, "num_input_tokens_seen": 162602955, "step": 7518, "time_per_iteration": 3.6157875061035156 }, { "auxiliary_loss_clip": 0.0127528, "auxiliary_loss_mlp": 0.02562681, "balance_loss_clip": 1.04572988, "balance_loss_mlp": 0.99992156, "epoch": 0.904106294715325, "flos": 26614870513920.0, "grad_norm": 2.0423994017439346, "language_loss": 0.8309468, "learning_rate": 9.558940053775954e-08, "loss": 0.86932635, "num_input_tokens_seen": 162621595, "step": 7519, "time_per_iteration": 2.762542247772217 }, { "auxiliary_loss_clip": 0.01220029, "auxiliary_loss_mlp": 0.01026948, "balance_loss_clip": 1.04784071, "balance_loss_mlp": 1.0196104, "epoch": 0.904226537605964, "flos": 17785693906560.0, "grad_norm": 1.8162696981499886, "language_loss": 0.68008351, "learning_rate": 9.535160126239294e-08, "loss": 0.70255327, "num_input_tokens_seen": 162638220, "step": 7520, "time_per_iteration": 2.566317081451416 }, { "auxiliary_loss_clip": 0.01220145, "auxiliary_loss_mlp": 0.01024635, "balance_loss_clip": 1.04828095, "balance_loss_mlp": 1.01810551, "epoch": 0.9043467804966031, "flos": 24790428961920.0, "grad_norm": 1.616475689565195, "language_loss": 0.70962441, "learning_rate": 9.511409092067424e-08, "loss": 0.73207223, "num_input_tokens_seen": 162658575, "step": 7521, "time_per_iteration": 2.6775503158569336 }, { "auxiliary_loss_clip": 0.0126808, "auxiliary_loss_mlp": 0.01027256, "balance_loss_clip": 1.04801273, "balance_loss_mlp": 1.02000213, "epoch": 0.9044670233872423, "flos": 22632125472000.0, "grad_norm": 1.9555799208537195, "language_loss": 0.67362118, "learning_rate": 9.487686954863327e-08, "loss": 0.69657457, "num_input_tokens_seen": 162678295, "step": 7522, "time_per_iteration": 2.6533730030059814 }, { "auxiliary_loss_clip": 0.01218324, "auxiliary_loss_mlp": 0.01026586, "balance_loss_clip": 1.04736614, "balance_loss_mlp": 1.01982069, "epoch": 0.9045872662778813, "flos": 23771320289280.0, "grad_norm": 4.1337302987647195, "language_loss": 0.77862471, "learning_rate": 9.46399371822566e-08, "loss": 0.80107379, "num_input_tokens_seen": 162698070, "step": 7523, "time_per_iteration": 3.852790355682373 }, { "auxiliary_loss_clip": 0.0117365, "auxiliary_loss_mlp": 0.01026104, "balance_loss_clip": 1.04917502, "balance_loss_mlp": 1.0189724, "epoch": 0.9047075091685204, "flos": 15191039998080.0, "grad_norm": 2.3770772638479345, "language_loss": 0.72253102, "learning_rate": 9.440329385748657e-08, "loss": 0.74452853, "num_input_tokens_seen": 162715140, "step": 7524, "time_per_iteration": 2.613947629928589 }, { "auxiliary_loss_clip": 0.01320628, "auxiliary_loss_mlp": 0.01018819, "balance_loss_clip": 1.0473876, "balance_loss_mlp": 1.01230681, "epoch": 0.9048277520591596, "flos": 18003707504640.0, "grad_norm": 1.7895748133024145, "language_loss": 0.70810604, "learning_rate": 9.416693961022137e-08, "loss": 0.73150051, "num_input_tokens_seen": 162733390, "step": 7525, "time_per_iteration": 2.6340103149414062 }, { "auxiliary_loss_clip": 0.0140844, "auxiliary_loss_mlp": 0.01024267, "balance_loss_clip": 1.03884923, "balance_loss_mlp": 1.01777649, "epoch": 0.9049479949497986, "flos": 21872471713920.0, "grad_norm": 1.7101327101743922, "language_loss": 0.77054024, "learning_rate": 9.393087447631654e-08, "loss": 0.79486734, "num_input_tokens_seen": 162751670, "step": 7526, "time_per_iteration": 2.783454179763794 }, { "auxiliary_loss_clip": 0.01170557, "auxiliary_loss_mlp": 0.01024528, "balance_loss_clip": 1.04522061, "balance_loss_mlp": 1.01833785, "epoch": 0.9050682378404377, "flos": 20773928113920.0, "grad_norm": 1.6104582193015373, "language_loss": 0.73013073, "learning_rate": 9.36950984915823e-08, "loss": 0.75208151, "num_input_tokens_seen": 162770025, "step": 7527, "time_per_iteration": 2.612396240234375 }, { "auxiliary_loss_clip": 0.01172889, "auxiliary_loss_mlp": 0.01023577, "balance_loss_clip": 1.04921722, "balance_loss_mlp": 1.01656771, "epoch": 0.9051884807310768, "flos": 21580015178880.0, "grad_norm": 2.0611069313049315, "language_loss": 0.6941216, "learning_rate": 9.345961169178607e-08, "loss": 0.71608627, "num_input_tokens_seen": 162789710, "step": 7528, "time_per_iteration": 2.6093201637268066 }, { "auxiliary_loss_clip": 0.01321558, "auxiliary_loss_mlp": 0.01025123, "balance_loss_clip": 1.05115247, "balance_loss_mlp": 1.01768184, "epoch": 0.9053087236217159, "flos": 21908059113600.0, "grad_norm": 1.470980672708247, "language_loss": 0.72562009, "learning_rate": 9.322441411265081e-08, "loss": 0.74908692, "num_input_tokens_seen": 162810695, "step": 7529, "time_per_iteration": 2.702442169189453 }, { "auxiliary_loss_clip": 0.01271687, "auxiliary_loss_mlp": 0.0102733, "balance_loss_clip": 1.04596186, "balance_loss_mlp": 1.02031446, "epoch": 0.9054289665123549, "flos": 17055809544960.0, "grad_norm": 1.9431566291543405, "language_loss": 0.73084533, "learning_rate": 9.298950578985554e-08, "loss": 0.75383544, "num_input_tokens_seen": 162827770, "step": 7530, "time_per_iteration": 2.749070405960083 }, { "auxiliary_loss_clip": 0.0122059, "auxiliary_loss_mlp": 0.02567315, "balance_loss_clip": 1.04880631, "balance_loss_mlp": 0.99987406, "epoch": 0.905549209402994, "flos": 20777268078720.0, "grad_norm": 1.6875204207844825, "language_loss": 0.70667458, "learning_rate": 9.275488675903665e-08, "loss": 0.74455369, "num_input_tokens_seen": 162846715, "step": 7531, "time_per_iteration": 2.6348352432250977 }, { "auxiliary_loss_clip": 0.01374524, "auxiliary_loss_mlp": 0.0102455, "balance_loss_clip": 1.04560602, "balance_loss_mlp": 1.01750445, "epoch": 0.9056694522936332, "flos": 21686813291520.0, "grad_norm": 4.383111815625869, "language_loss": 0.73818779, "learning_rate": 9.252055705578454e-08, "loss": 0.76217854, "num_input_tokens_seen": 162866215, "step": 7532, "time_per_iteration": 2.759201765060425 }, { "auxiliary_loss_clip": 0.01222197, "auxiliary_loss_mlp": 0.01023092, "balance_loss_clip": 1.04643273, "balance_loss_mlp": 1.01691997, "epoch": 0.9057896951842722, "flos": 29569133433600.0, "grad_norm": 2.03275959456739, "language_loss": 0.72189224, "learning_rate": 9.228651671564747e-08, "loss": 0.74434519, "num_input_tokens_seen": 162888245, "step": 7533, "time_per_iteration": 2.6896867752075195 }, { "auxiliary_loss_clip": 0.01365183, "auxiliary_loss_mlp": 0.01025712, "balance_loss_clip": 1.04592824, "balance_loss_mlp": 1.01929235, "epoch": 0.9059099380749113, "flos": 27892248952320.0, "grad_norm": 1.5411700860182593, "language_loss": 0.78083223, "learning_rate": 9.205276577412901e-08, "loss": 0.80474114, "num_input_tokens_seen": 162911025, "step": 7534, "time_per_iteration": 2.7731142044067383 }, { "auxiliary_loss_clip": 0.01278015, "auxiliary_loss_mlp": 0.02566229, "balance_loss_clip": 1.04577529, "balance_loss_mlp": 0.99991494, "epoch": 0.9060301809655504, "flos": 17748993185280.0, "grad_norm": 2.912690853902717, "language_loss": 0.7669524, "learning_rate": 9.181930426668905e-08, "loss": 0.80539489, "num_input_tokens_seen": 162927820, "step": 7535, "time_per_iteration": 2.662313461303711 }, { "auxiliary_loss_clip": 0.01274775, "auxiliary_loss_mlp": 0.01026629, "balance_loss_clip": 1.04468393, "balance_loss_mlp": 1.0198189, "epoch": 0.9061504238561895, "flos": 31759432963200.0, "grad_norm": 1.561676719377741, "language_loss": 0.67874438, "learning_rate": 9.158613222874346e-08, "loss": 0.70175838, "num_input_tokens_seen": 162949445, "step": 7536, "time_per_iteration": 2.8976027965545654 }, { "auxiliary_loss_clip": 0.01269385, "auxiliary_loss_mlp": 0.01021929, "balance_loss_clip": 1.0441885, "balance_loss_mlp": 1.01536047, "epoch": 0.9062706667468285, "flos": 20048066075520.0, "grad_norm": 1.622584414445404, "language_loss": 0.82166004, "learning_rate": 9.135324969566394e-08, "loss": 0.84457326, "num_input_tokens_seen": 162968945, "step": 7537, "time_per_iteration": 2.9210472106933594 }, { "auxiliary_loss_clip": 0.01228134, "auxiliary_loss_mlp": 0.01025804, "balance_loss_clip": 1.05016541, "balance_loss_mlp": 1.01877403, "epoch": 0.9063909096374677, "flos": 18437292576000.0, "grad_norm": 1.7968038019788235, "language_loss": 0.75206029, "learning_rate": 9.112065670277913e-08, "loss": 0.77459967, "num_input_tokens_seen": 162985310, "step": 7538, "time_per_iteration": 2.6140897274017334 }, { "auxiliary_loss_clip": 0.0126993, "auxiliary_loss_mlp": 0.01024484, "balance_loss_clip": 1.04369044, "balance_loss_mlp": 1.01828527, "epoch": 0.9065111525281068, "flos": 33547353361920.0, "grad_norm": 1.8906081070582756, "language_loss": 0.72817999, "learning_rate": 9.088835328537303e-08, "loss": 0.75112402, "num_input_tokens_seen": 163006900, "step": 7539, "time_per_iteration": 2.768148899078369 }, { "auxiliary_loss_clip": 0.01273126, "auxiliary_loss_mlp": 0.01027882, "balance_loss_clip": 1.04627967, "balance_loss_mlp": 1.02052987, "epoch": 0.9066313954187458, "flos": 23367863750400.0, "grad_norm": 3.851998002201823, "language_loss": 0.71569967, "learning_rate": 9.065633947868568e-08, "loss": 0.73870975, "num_input_tokens_seen": 163026505, "step": 7540, "time_per_iteration": 2.661752700805664 }, { "auxiliary_loss_clip": 0.01316493, "auxiliary_loss_mlp": 0.02562514, "balance_loss_clip": 1.04697859, "balance_loss_mlp": 0.99989033, "epoch": 0.906751638309385, "flos": 26249623067520.0, "grad_norm": 2.184165836940251, "language_loss": 0.80139935, "learning_rate": 9.042461531791379e-08, "loss": 0.84018946, "num_input_tokens_seen": 163044925, "step": 7541, "time_per_iteration": 3.6434593200683594 }, { "auxiliary_loss_clip": 0.01168725, "auxiliary_loss_mlp": 0.01022461, "balance_loss_clip": 1.04812527, "balance_loss_mlp": 1.01584768, "epoch": 0.906871881200024, "flos": 16544477485440.0, "grad_norm": 1.7210103265438566, "language_loss": 0.7800318, "learning_rate": 9.019318083820903e-08, "loss": 0.80194366, "num_input_tokens_seen": 163063505, "step": 7542, "time_per_iteration": 2.6327402591705322 }, { "auxiliary_loss_clip": 0.01220124, "auxiliary_loss_mlp": 0.01025125, "balance_loss_clip": 1.04714084, "balance_loss_mlp": 1.01820779, "epoch": 0.9069921240906631, "flos": 24605129675520.0, "grad_norm": 1.735459062756827, "language_loss": 0.84965932, "learning_rate": 8.996203607468045e-08, "loss": 0.8721118, "num_input_tokens_seen": 163082505, "step": 7543, "time_per_iteration": 3.666023015975952 }, { "auxiliary_loss_clip": 0.01219714, "auxiliary_loss_mlp": 0.01024743, "balance_loss_clip": 1.0433743, "balance_loss_mlp": 1.01756048, "epoch": 0.9071123669813023, "flos": 25374731500800.0, "grad_norm": 1.4884574503709511, "language_loss": 0.7543931, "learning_rate": 8.973118106239241e-08, "loss": 0.77683771, "num_input_tokens_seen": 163105110, "step": 7544, "time_per_iteration": 2.736302137374878 }, { "auxiliary_loss_clip": 0.01422947, "auxiliary_loss_mlp": 0.01024403, "balance_loss_clip": 1.03852344, "balance_loss_mlp": 1.01784635, "epoch": 0.9072326098719413, "flos": 26725798690560.0, "grad_norm": 1.9766408173081598, "language_loss": 0.94701767, "learning_rate": 8.95006158363656e-08, "loss": 0.97149122, "num_input_tokens_seen": 163125295, "step": 7545, "time_per_iteration": 3.7341220378875732 }, { "auxiliary_loss_clip": 0.01229075, "auxiliary_loss_mlp": 0.01029251, "balance_loss_clip": 1.05173576, "balance_loss_mlp": 1.0213834, "epoch": 0.9073528527625804, "flos": 23878800760320.0, "grad_norm": 2.237316622818048, "language_loss": 0.77604473, "learning_rate": 8.9270340431576e-08, "loss": 0.79862809, "num_input_tokens_seen": 163144385, "step": 7546, "time_per_iteration": 2.614132881164551 }, { "auxiliary_loss_clip": 0.012232, "auxiliary_loss_mlp": 0.01023188, "balance_loss_clip": 1.04672861, "balance_loss_mlp": 1.01600885, "epoch": 0.9074730956532195, "flos": 37852144767360.0, "grad_norm": 6.2876781967164685, "language_loss": 0.73682415, "learning_rate": 8.904035488295658e-08, "loss": 0.75928807, "num_input_tokens_seen": 163163885, "step": 7547, "time_per_iteration": 2.7679173946380615 }, { "auxiliary_loss_clip": 0.01113774, "auxiliary_loss_mlp": 0.02505062, "balance_loss_clip": 1.00644636, "balance_loss_mlp": 0.99993652, "epoch": 0.9075933385438586, "flos": 65173307385600.0, "grad_norm": 0.6749293508983584, "language_loss": 0.53234601, "learning_rate": 8.881065922539632e-08, "loss": 0.56853437, "num_input_tokens_seen": 163224325, "step": 7548, "time_per_iteration": 3.128706932067871 }, { "auxiliary_loss_clip": 0.01313854, "auxiliary_loss_mlp": 0.01025462, "balance_loss_clip": 1.04576433, "balance_loss_mlp": 1.01911664, "epoch": 0.9077135814344977, "flos": 19931571290880.0, "grad_norm": 2.0164824056689676, "language_loss": 0.73640478, "learning_rate": 8.85812534937389e-08, "loss": 0.75979793, "num_input_tokens_seen": 163242425, "step": 7549, "time_per_iteration": 3.5682995319366455 }, { "auxiliary_loss_clip": 0.01129517, "auxiliary_loss_mlp": 0.01032331, "balance_loss_clip": 1.05081034, "balance_loss_mlp": 1.02491367, "epoch": 0.9078338243251368, "flos": 17529650784000.0, "grad_norm": 2.6455919073185403, "language_loss": 0.67957497, "learning_rate": 8.835213772278583e-08, "loss": 0.70119351, "num_input_tokens_seen": 163259280, "step": 7550, "time_per_iteration": 2.634061813354492 }, { "auxiliary_loss_clip": 0.01317977, "auxiliary_loss_mlp": 0.01020371, "balance_loss_clip": 1.04795575, "balance_loss_mlp": 1.01370764, "epoch": 0.9079540672157759, "flos": 28803410277120.0, "grad_norm": 1.9942270889796516, "language_loss": 0.78826094, "learning_rate": 8.812331194729373e-08, "loss": 0.81164443, "num_input_tokens_seen": 163278925, "step": 7551, "time_per_iteration": 2.731265068054199 }, { "auxiliary_loss_clip": 0.01179608, "auxiliary_loss_mlp": 0.01032637, "balance_loss_clip": 1.05346274, "balance_loss_mlp": 1.02544546, "epoch": 0.9080743101064149, "flos": 23513840622720.0, "grad_norm": 1.9572778563972915, "language_loss": 0.72511387, "learning_rate": 8.789477620197461e-08, "loss": 0.74723637, "num_input_tokens_seen": 163298450, "step": 7552, "time_per_iteration": 2.718407392501831 }, { "auxiliary_loss_clip": 0.0127123, "auxiliary_loss_mlp": 0.01024389, "balance_loss_clip": 1.04592037, "balance_loss_mlp": 1.01729333, "epoch": 0.9081945529970541, "flos": 22778102344320.0, "grad_norm": 3.2894664677818555, "language_loss": 0.78862953, "learning_rate": 8.766653052149831e-08, "loss": 0.81158578, "num_input_tokens_seen": 163313635, "step": 7553, "time_per_iteration": 2.6193885803222656 }, { "auxiliary_loss_clip": 0.01267607, "auxiliary_loss_mlp": 0.01027356, "balance_loss_clip": 1.04634404, "balance_loss_mlp": 1.02039158, "epoch": 0.9083147958876931, "flos": 18873714821760.0, "grad_norm": 2.638039523908064, "language_loss": 0.74514705, "learning_rate": 8.743857494048823e-08, "loss": 0.76809669, "num_input_tokens_seen": 163330450, "step": 7554, "time_per_iteration": 2.6555545330047607 }, { "auxiliary_loss_clip": 0.01322544, "auxiliary_loss_mlp": 0.01029135, "balance_loss_clip": 1.04556346, "balance_loss_mlp": 1.02213192, "epoch": 0.9084350387783322, "flos": 18909374048640.0, "grad_norm": 2.312431465750946, "language_loss": 0.62570572, "learning_rate": 8.721090949352605e-08, "loss": 0.64922249, "num_input_tokens_seen": 163346690, "step": 7555, "time_per_iteration": 2.6748809814453125 }, { "auxiliary_loss_clip": 0.01132511, "auxiliary_loss_mlp": 0.01027904, "balance_loss_clip": 1.05053866, "balance_loss_mlp": 1.01997089, "epoch": 0.9085552816689714, "flos": 20595488325120.0, "grad_norm": 3.316448742562777, "language_loss": 0.73387957, "learning_rate": 8.698353421514793e-08, "loss": 0.75548375, "num_input_tokens_seen": 163365065, "step": 7556, "time_per_iteration": 2.62380313873291 }, { "auxiliary_loss_clip": 0.01219026, "auxiliary_loss_mlp": 0.01028861, "balance_loss_clip": 1.04772437, "balance_loss_mlp": 1.02256036, "epoch": 0.9086755245596104, "flos": 18113163223680.0, "grad_norm": 3.6852032908939534, "language_loss": 0.80065662, "learning_rate": 8.67564491398467e-08, "loss": 0.8231355, "num_input_tokens_seen": 163382070, "step": 7557, "time_per_iteration": 2.596177816390991 }, { "auxiliary_loss_clip": 0.01226796, "auxiliary_loss_mlp": 0.01024392, "balance_loss_clip": 1.04792023, "balance_loss_mlp": 1.01774907, "epoch": 0.9087957674502495, "flos": 19129793857920.0, "grad_norm": 1.746330257033229, "language_loss": 0.74076021, "learning_rate": 8.652965430207104e-08, "loss": 0.76327211, "num_input_tokens_seen": 163399975, "step": 7558, "time_per_iteration": 2.6998112201690674 }, { "auxiliary_loss_clip": 0.01224474, "auxiliary_loss_mlp": 0.01025148, "balance_loss_clip": 1.04616725, "balance_loss_mlp": 1.01824021, "epoch": 0.9089160103408886, "flos": 18109930999680.0, "grad_norm": 2.1652503606870024, "language_loss": 0.65710032, "learning_rate": 8.630314973622521e-08, "loss": 0.67959654, "num_input_tokens_seen": 163417520, "step": 7559, "time_per_iteration": 2.5725057125091553 }, { "auxiliary_loss_clip": 0.01217325, "auxiliary_loss_mlp": 0.01020826, "balance_loss_clip": 1.04820657, "balance_loss_mlp": 1.01443005, "epoch": 0.9090362532315277, "flos": 33364855336320.0, "grad_norm": 2.2282944982148742, "language_loss": 0.7084614, "learning_rate": 8.607693547666995e-08, "loss": 0.73084289, "num_input_tokens_seen": 163440060, "step": 7560, "time_per_iteration": 2.7540194988250732 }, { "auxiliary_loss_clip": 0.01223674, "auxiliary_loss_mlp": 0.00998436, "balance_loss_clip": 1.00601661, "balance_loss_mlp": 0.9975422, "epoch": 0.9091564961221668, "flos": 71480585082240.0, "grad_norm": 0.8753520999168358, "language_loss": 0.57909131, "learning_rate": 8.585101155772201e-08, "loss": 0.6013124, "num_input_tokens_seen": 163502180, "step": 7561, "time_per_iteration": 3.3690836429595947 }, { "auxiliary_loss_clip": 0.01268782, "auxiliary_loss_mlp": 0.01028951, "balance_loss_clip": 1.04199398, "balance_loss_mlp": 1.0218966, "epoch": 0.9092767390128058, "flos": 24712574232960.0, "grad_norm": 1.8271349804239712, "language_loss": 0.68634492, "learning_rate": 8.562537801365377e-08, "loss": 0.70932233, "num_input_tokens_seen": 163521915, "step": 7562, "time_per_iteration": 2.684295892715454 }, { "auxiliary_loss_clip": 0.01175099, "auxiliary_loss_mlp": 0.01025681, "balance_loss_clip": 1.04906631, "balance_loss_mlp": 1.01854038, "epoch": 0.909396981903445, "flos": 23586487879680.0, "grad_norm": 1.8996454409213293, "language_loss": 0.70146048, "learning_rate": 8.540003487869362e-08, "loss": 0.7234683, "num_input_tokens_seen": 163543585, "step": 7563, "time_per_iteration": 2.6528258323669434 }, { "auxiliary_loss_clip": 0.0130819, "auxiliary_loss_mlp": 0.01024826, "balance_loss_clip": 1.04217672, "balance_loss_mlp": 1.01728308, "epoch": 0.909517224794084, "flos": 23404169422080.0, "grad_norm": 2.6679176820698154, "language_loss": 0.79693192, "learning_rate": 8.517498218702557e-08, "loss": 0.82026207, "num_input_tokens_seen": 163561515, "step": 7564, "time_per_iteration": 2.718322277069092 }, { "auxiliary_loss_clip": 0.01318145, "auxiliary_loss_mlp": 0.01027685, "balance_loss_clip": 1.0426718, "balance_loss_mlp": 1.02080667, "epoch": 0.9096374676847231, "flos": 19208618254080.0, "grad_norm": 1.8253110696969452, "language_loss": 0.69941449, "learning_rate": 8.49502199727905e-08, "loss": 0.72287273, "num_input_tokens_seen": 163579540, "step": 7565, "time_per_iteration": 2.7079927921295166 }, { "auxiliary_loss_clip": 0.01218919, "auxiliary_loss_mlp": 0.0102565, "balance_loss_clip": 1.04411864, "balance_loss_mlp": 1.01868796, "epoch": 0.9097577105753623, "flos": 33292495388160.0, "grad_norm": 5.409002431113223, "language_loss": 0.66274464, "learning_rate": 8.472574827008428e-08, "loss": 0.68519038, "num_input_tokens_seen": 163600425, "step": 7566, "time_per_iteration": 2.705028533935547 }, { "auxiliary_loss_clip": 0.01221618, "auxiliary_loss_mlp": 0.01023114, "balance_loss_clip": 1.0463016, "balance_loss_mlp": 1.01636922, "epoch": 0.9098779534660013, "flos": 21906443001600.0, "grad_norm": 1.7284709362899893, "language_loss": 0.8378911, "learning_rate": 8.450156711295942e-08, "loss": 0.86033845, "num_input_tokens_seen": 163620595, "step": 7567, "time_per_iteration": 3.4975571632385254 }, { "auxiliary_loss_clip": 0.01270807, "auxiliary_loss_mlp": 0.01026162, "balance_loss_clip": 1.04833293, "balance_loss_mlp": 1.01934922, "epoch": 0.9099981963566404, "flos": 25730354102400.0, "grad_norm": 2.6023457574424933, "language_loss": 0.86574459, "learning_rate": 8.427767653542383e-08, "loss": 0.88871431, "num_input_tokens_seen": 163635765, "step": 7568, "time_per_iteration": 2.7338473796844482 }, { "auxiliary_loss_clip": 0.01364454, "auxiliary_loss_mlp": 0.01021988, "balance_loss_clip": 1.04014897, "balance_loss_mlp": 1.01549399, "epoch": 0.9101184392472795, "flos": 21069437304960.0, "grad_norm": 1.8365415681018058, "language_loss": 0.70247614, "learning_rate": 8.405407657144125e-08, "loss": 0.72634053, "num_input_tokens_seen": 163654925, "step": 7569, "time_per_iteration": 3.701667308807373 }, { "auxiliary_loss_clip": 0.0126646, "auxiliary_loss_mlp": 0.01027003, "balance_loss_clip": 1.04488301, "balance_loss_mlp": 1.02087891, "epoch": 0.9102386821379186, "flos": 24752614919040.0, "grad_norm": 1.9026518190379227, "language_loss": 0.72676343, "learning_rate": 8.383076725493232e-08, "loss": 0.74969804, "num_input_tokens_seen": 163672245, "step": 7570, "time_per_iteration": 2.7026431560516357 }, { "auxiliary_loss_clip": 0.01223504, "auxiliary_loss_mlp": 0.0103187, "balance_loss_clip": 1.04710507, "balance_loss_mlp": 1.02467322, "epoch": 0.9103589250285576, "flos": 22562818179840.0, "grad_norm": 1.8950795801555236, "language_loss": 0.67913073, "learning_rate": 8.360774861977216e-08, "loss": 0.70168447, "num_input_tokens_seen": 163691365, "step": 7571, "time_per_iteration": 3.508399248123169 }, { "auxiliary_loss_clip": 0.01270709, "auxiliary_loss_mlp": 0.01024657, "balance_loss_clip": 1.04189897, "balance_loss_mlp": 1.01797831, "epoch": 0.9104791679191968, "flos": 25373474524800.0, "grad_norm": 1.8796840095762815, "language_loss": 0.74920994, "learning_rate": 8.338502069979281e-08, "loss": 0.77216363, "num_input_tokens_seen": 163711675, "step": 7572, "time_per_iteration": 2.721954584121704 }, { "auxiliary_loss_clip": 0.01222514, "auxiliary_loss_mlp": 0.01020859, "balance_loss_clip": 1.0442766, "balance_loss_mlp": 1.01397419, "epoch": 0.9105994108098359, "flos": 14426681558400.0, "grad_norm": 2.9587753239208303, "language_loss": 0.79728049, "learning_rate": 8.316258352878214e-08, "loss": 0.81971425, "num_input_tokens_seen": 163728095, "step": 7573, "time_per_iteration": 2.5853769779205322 }, { "auxiliary_loss_clip": 0.01228134, "auxiliary_loss_mlp": 0.01027036, "balance_loss_clip": 1.04793572, "balance_loss_mlp": 1.01939499, "epoch": 0.9107196537004749, "flos": 26718292748160.0, "grad_norm": 2.198098666131195, "language_loss": 0.71420968, "learning_rate": 8.294043714048338e-08, "loss": 0.73676133, "num_input_tokens_seen": 163747175, "step": 7574, "time_per_iteration": 2.6365127563476562 }, { "auxiliary_loss_clip": 0.01166723, "auxiliary_loss_mlp": 0.01002299, "balance_loss_clip": 1.00583196, "balance_loss_mlp": 1.00139904, "epoch": 0.9108398965911141, "flos": 66532634703360.0, "grad_norm": 0.7523857416289549, "language_loss": 0.60413384, "learning_rate": 8.271858156859624e-08, "loss": 0.62582409, "num_input_tokens_seen": 163812545, "step": 7575, "time_per_iteration": 4.1903674602508545 }, { "auxiliary_loss_clip": 0.01170997, "auxiliary_loss_mlp": 0.01025543, "balance_loss_clip": 1.04779804, "balance_loss_mlp": 1.01871836, "epoch": 0.9109601394817531, "flos": 25411073086080.0, "grad_norm": 1.7321912407217455, "language_loss": 0.7372154, "learning_rate": 8.249701684677557e-08, "loss": 0.75918078, "num_input_tokens_seen": 163833870, "step": 7576, "time_per_iteration": 2.6117520332336426 }, { "auxiliary_loss_clip": 0.01219749, "auxiliary_loss_mlp": 0.01026075, "balance_loss_clip": 1.04853559, "balance_loss_mlp": 1.01942623, "epoch": 0.9110803823723922, "flos": 22747794243840.0, "grad_norm": 2.148220495032876, "language_loss": 0.81175327, "learning_rate": 8.227574300863294e-08, "loss": 0.83421153, "num_input_tokens_seen": 163854040, "step": 7577, "time_per_iteration": 2.799767017364502 }, { "auxiliary_loss_clip": 0.0127621, "auxiliary_loss_mlp": 0.01021739, "balance_loss_clip": 1.0472393, "balance_loss_mlp": 1.01436865, "epoch": 0.9112006252630314, "flos": 48469924131840.0, "grad_norm": 1.7632201900926827, "language_loss": 0.69543993, "learning_rate": 8.205476008773548e-08, "loss": 0.71841943, "num_input_tokens_seen": 163878040, "step": 7578, "time_per_iteration": 2.939774751663208 }, { "auxiliary_loss_clip": 0.0131313, "auxiliary_loss_mlp": 0.01023895, "balance_loss_clip": 1.04533935, "balance_loss_mlp": 1.01738286, "epoch": 0.9113208681536704, "flos": 30009649829760.0, "grad_norm": 2.3702936812089965, "language_loss": 0.82174873, "learning_rate": 8.183406811760596e-08, "loss": 0.84511894, "num_input_tokens_seen": 163897770, "step": 7579, "time_per_iteration": 2.7141034603118896 }, { "auxiliary_loss_clip": 0.01313498, "auxiliary_loss_mlp": 0.01028268, "balance_loss_clip": 1.04076576, "balance_loss_mlp": 1.0216434, "epoch": 0.9114411110443095, "flos": 25594971742080.0, "grad_norm": 1.7151870792096728, "language_loss": 0.73817432, "learning_rate": 8.161366713172313e-08, "loss": 0.76159203, "num_input_tokens_seen": 163920160, "step": 7580, "time_per_iteration": 2.7537527084350586 }, { "auxiliary_loss_clip": 0.01328735, "auxiliary_loss_mlp": 0.01030544, "balance_loss_clip": 1.04292941, "balance_loss_mlp": 1.02293861, "epoch": 0.9115613539349486, "flos": 18399729928320.0, "grad_norm": 2.8179307646447134, "language_loss": 0.843714, "learning_rate": 8.139355716352137e-08, "loss": 0.86730683, "num_input_tokens_seen": 163935000, "step": 7581, "time_per_iteration": 2.6194937229156494 }, { "auxiliary_loss_clip": 0.01177247, "auxiliary_loss_mlp": 0.01025168, "balance_loss_clip": 1.04569149, "balance_loss_mlp": 1.0182178, "epoch": 0.9116815968255877, "flos": 21726171619200.0, "grad_norm": 2.231809196817548, "language_loss": 0.70172942, "learning_rate": 8.117373824639196e-08, "loss": 0.72375357, "num_input_tokens_seen": 163955265, "step": 7582, "time_per_iteration": 2.8070926666259766 }, { "auxiliary_loss_clip": 0.0106029, "auxiliary_loss_mlp": 0.01000923, "balance_loss_clip": 1.00628459, "balance_loss_mlp": 1.00001085, "epoch": 0.9118018397162267, "flos": 65363526835200.0, "grad_norm": 0.722798772391123, "language_loss": 0.59256959, "learning_rate": 8.095421041368067e-08, "loss": 0.61318171, "num_input_tokens_seen": 164014680, "step": 7583, "time_per_iteration": 3.074864149093628 }, { "auxiliary_loss_clip": 0.01266702, "auxiliary_loss_mlp": 0.02565753, "balance_loss_clip": 1.04652286, "balance_loss_mlp": 0.99993289, "epoch": 0.9119220826068659, "flos": 20922885815040.0, "grad_norm": 2.793469101684912, "language_loss": 0.70766878, "learning_rate": 8.073497369868999e-08, "loss": 0.74599332, "num_input_tokens_seen": 164033140, "step": 7584, "time_per_iteration": 2.6740458011627197 }, { "auxiliary_loss_clip": 0.01279687, "auxiliary_loss_mlp": 0.01031242, "balance_loss_clip": 1.0461117, "balance_loss_mlp": 1.02388096, "epoch": 0.912042325497505, "flos": 28366449327360.0, "grad_norm": 1.6338775113822197, "language_loss": 0.75442481, "learning_rate": 8.051602813467772e-08, "loss": 0.77753413, "num_input_tokens_seen": 164054995, "step": 7585, "time_per_iteration": 2.680936336517334 }, { "auxiliary_loss_clip": 0.01225294, "auxiliary_loss_mlp": 0.01025821, "balance_loss_clip": 1.04777169, "balance_loss_mlp": 1.01919246, "epoch": 0.912162568388144, "flos": 17566782468480.0, "grad_norm": 1.7443691225399316, "language_loss": 0.71268517, "learning_rate": 8.029737375485756e-08, "loss": 0.73519635, "num_input_tokens_seen": 164074225, "step": 7586, "time_per_iteration": 2.652338743209839 }, { "auxiliary_loss_clip": 0.01174135, "auxiliary_loss_mlp": 0.01024755, "balance_loss_clip": 1.04968262, "balance_loss_mlp": 1.01773024, "epoch": 0.9122828112787832, "flos": 19827897661440.0, "grad_norm": 1.8193009354767153, "language_loss": 0.72410667, "learning_rate": 8.007901059239986e-08, "loss": 0.74609554, "num_input_tokens_seen": 164093505, "step": 7587, "time_per_iteration": 2.5703210830688477 }, { "auxiliary_loss_clip": 0.0126985, "auxiliary_loss_mlp": 0.01024699, "balance_loss_clip": 1.04460573, "balance_loss_mlp": 1.01810646, "epoch": 0.9124030541694222, "flos": 20813789232000.0, "grad_norm": 1.863465015382947, "language_loss": 0.80414486, "learning_rate": 7.986093868042964e-08, "loss": 0.82709032, "num_input_tokens_seen": 164113750, "step": 7588, "time_per_iteration": 2.7107160091400146 }, { "auxiliary_loss_clip": 0.01216704, "auxiliary_loss_mlp": 0.01023968, "balance_loss_clip": 1.045524, "balance_loss_mlp": 1.01763558, "epoch": 0.9125232970600613, "flos": 25192305302400.0, "grad_norm": 2.23688370226302, "language_loss": 0.67891347, "learning_rate": 7.964315805202826e-08, "loss": 0.70132017, "num_input_tokens_seen": 164134330, "step": 7589, "time_per_iteration": 2.6821858882904053 }, { "auxiliary_loss_clip": 0.01277922, "auxiliary_loss_mlp": 0.01031071, "balance_loss_clip": 1.04904437, "balance_loss_mlp": 1.02395105, "epoch": 0.9126435399507005, "flos": 19719591177600.0, "grad_norm": 2.3722250203650734, "language_loss": 0.73303896, "learning_rate": 7.942566874023304e-08, "loss": 0.75612891, "num_input_tokens_seen": 164153515, "step": 7590, "time_per_iteration": 2.7011170387268066 }, { "auxiliary_loss_clip": 0.01171452, "auxiliary_loss_mlp": 0.01022796, "balance_loss_clip": 1.04415214, "balance_loss_mlp": 1.01564896, "epoch": 0.9127637828413395, "flos": 19573614305280.0, "grad_norm": 2.206756551964659, "language_loss": 0.69669676, "learning_rate": 7.920847077803649e-08, "loss": 0.71863925, "num_input_tokens_seen": 164171305, "step": 7591, "time_per_iteration": 2.661616325378418 }, { "auxiliary_loss_clip": 0.0136406, "auxiliary_loss_mlp": 0.01022923, "balance_loss_clip": 1.03674638, "balance_loss_mlp": 1.01576447, "epoch": 0.9128840257319786, "flos": 20230635928320.0, "grad_norm": 1.8607159988264674, "language_loss": 0.82206601, "learning_rate": 7.899156419838826e-08, "loss": 0.84593582, "num_input_tokens_seen": 164190275, "step": 7592, "time_per_iteration": 2.684455156326294 }, { "auxiliary_loss_clip": 0.0132002, "auxiliary_loss_mlp": 0.01028305, "balance_loss_clip": 1.04515386, "balance_loss_mlp": 1.02085483, "epoch": 0.9130042686226177, "flos": 24858658846080.0, "grad_norm": 2.2945439229076503, "language_loss": 0.65886915, "learning_rate": 7.87749490341918e-08, "loss": 0.68235242, "num_input_tokens_seen": 164210550, "step": 7593, "time_per_iteration": 3.6518964767456055 }, { "auxiliary_loss_clip": 0.01177561, "auxiliary_loss_mlp": 0.01024168, "balance_loss_clip": 1.0508132, "balance_loss_mlp": 1.01685464, "epoch": 0.9131245115132568, "flos": 23581747284480.0, "grad_norm": 2.3246822823315876, "language_loss": 0.8341319, "learning_rate": 7.855862531830836e-08, "loss": 0.8561492, "num_input_tokens_seen": 164226660, "step": 7594, "time_per_iteration": 2.5909533500671387 }, { "auxiliary_loss_clip": 0.01223324, "auxiliary_loss_mlp": 0.01022077, "balance_loss_clip": 1.04658842, "balance_loss_mlp": 1.01484668, "epoch": 0.9132447544038959, "flos": 19931607204480.0, "grad_norm": 1.6112898141343444, "language_loss": 0.72555327, "learning_rate": 7.834259308355373e-08, "loss": 0.7480073, "num_input_tokens_seen": 164245425, "step": 7595, "time_per_iteration": 3.569679021835327 }, { "auxiliary_loss_clip": 0.01406309, "auxiliary_loss_mlp": 0.01026775, "balance_loss_clip": 1.03910446, "balance_loss_mlp": 1.01967907, "epoch": 0.9133649972945349, "flos": 21981747864960.0, "grad_norm": 2.4725091642506385, "language_loss": 0.75350952, "learning_rate": 7.812685236269989e-08, "loss": 0.77784038, "num_input_tokens_seen": 164264085, "step": 7596, "time_per_iteration": 2.7518177032470703 }, { "auxiliary_loss_clip": 0.01215967, "auxiliary_loss_mlp": 0.01003579, "balance_loss_clip": 1.00674522, "balance_loss_mlp": 1.00267911, "epoch": 0.9134852401851741, "flos": 71240523511680.0, "grad_norm": 0.7914697998193851, "language_loss": 0.58644938, "learning_rate": 7.791140318847445e-08, "loss": 0.60864484, "num_input_tokens_seen": 164322220, "step": 7597, "time_per_iteration": 4.187063932418823 }, { "auxiliary_loss_clip": 0.01271561, "auxiliary_loss_mlp": 0.01025065, "balance_loss_clip": 1.04943204, "balance_loss_mlp": 1.0185535, "epoch": 0.9136054830758131, "flos": 23626923615360.0, "grad_norm": 1.4269760798260303, "language_loss": 0.80353773, "learning_rate": 7.769624559356081e-08, "loss": 0.82650399, "num_input_tokens_seen": 164345615, "step": 7598, "time_per_iteration": 2.7301881313323975 }, { "auxiliary_loss_clip": 0.01224185, "auxiliary_loss_mlp": 0.01027511, "balance_loss_clip": 1.0500071, "balance_loss_mlp": 1.02021241, "epoch": 0.9137257259664522, "flos": 23438858981760.0, "grad_norm": 2.722172589798478, "language_loss": 0.75797731, "learning_rate": 7.748137961059842e-08, "loss": 0.78049421, "num_input_tokens_seen": 164359595, "step": 7599, "time_per_iteration": 2.625005006790161 }, { "auxiliary_loss_clip": 0.01170589, "auxiliary_loss_mlp": 0.01023943, "balance_loss_clip": 1.04934156, "balance_loss_mlp": 1.0173533, "epoch": 0.9138459688570914, "flos": 19127854523520.0, "grad_norm": 5.965510157326781, "language_loss": 0.6555562, "learning_rate": 7.726680527218211e-08, "loss": 0.67750156, "num_input_tokens_seen": 164376635, "step": 7600, "time_per_iteration": 2.5623602867126465 }, { "auxiliary_loss_clip": 0.01170545, "auxiliary_loss_mlp": 0.01027772, "balance_loss_clip": 1.04601717, "balance_loss_mlp": 1.02122736, "epoch": 0.9139662117477304, "flos": 46281240714240.0, "grad_norm": 1.7458680156916204, "language_loss": 0.75511897, "learning_rate": 7.70525226108627e-08, "loss": 0.77710217, "num_input_tokens_seen": 164400305, "step": 7601, "time_per_iteration": 3.6371474266052246 }, { "auxiliary_loss_clip": 0.01226283, "auxiliary_loss_mlp": 0.01021644, "balance_loss_clip": 1.05041134, "balance_loss_mlp": 1.01469684, "epoch": 0.9140864546383695, "flos": 22273198819200.0, "grad_norm": 1.7951408337135109, "language_loss": 0.79891354, "learning_rate": 7.683853165914666e-08, "loss": 0.82139277, "num_input_tokens_seen": 164418075, "step": 7602, "time_per_iteration": 2.585395097732544 }, { "auxiliary_loss_clip": 0.01370677, "auxiliary_loss_mlp": 0.01024552, "balance_loss_clip": 1.04246283, "balance_loss_mlp": 1.01766753, "epoch": 0.9142066975290086, "flos": 17530009920000.0, "grad_norm": 1.9368947697342884, "language_loss": 0.77466702, "learning_rate": 7.662483244949602e-08, "loss": 0.79861927, "num_input_tokens_seen": 164435335, "step": 7603, "time_per_iteration": 2.7954394817352295 }, { "auxiliary_loss_clip": 0.01308838, "auxiliary_loss_mlp": 0.01025001, "balance_loss_clip": 1.04317975, "balance_loss_mlp": 1.0179584, "epoch": 0.9143269404196477, "flos": 17712148809600.0, "grad_norm": 2.3273893559971244, "language_loss": 0.80810618, "learning_rate": 7.641142501432951e-08, "loss": 0.83144456, "num_input_tokens_seen": 164451530, "step": 7604, "time_per_iteration": 2.7048652172088623 }, { "auxiliary_loss_clip": 0.01265099, "auxiliary_loss_mlp": 0.01024582, "balance_loss_clip": 1.04204416, "balance_loss_mlp": 1.01786733, "epoch": 0.9144471833102867, "flos": 33323414019840.0, "grad_norm": 1.8161032181983994, "language_loss": 0.73622668, "learning_rate": 7.619830938602013e-08, "loss": 0.75912356, "num_input_tokens_seen": 164472755, "step": 7605, "time_per_iteration": 2.8178369998931885 }, { "auxiliary_loss_clip": 0.01221206, "auxiliary_loss_mlp": 0.01026796, "balance_loss_clip": 1.04672039, "balance_loss_mlp": 1.01995647, "epoch": 0.9145674262009259, "flos": 21068970428160.0, "grad_norm": 2.15143373880573, "language_loss": 0.82386214, "learning_rate": 7.598548559689777e-08, "loss": 0.84634221, "num_input_tokens_seen": 164491155, "step": 7606, "time_per_iteration": 2.6555545330047607 }, { "auxiliary_loss_clip": 0.01314776, "auxiliary_loss_mlp": 0.01023959, "balance_loss_clip": 1.0421927, "balance_loss_mlp": 1.01727104, "epoch": 0.914687669091565, "flos": 16800269212800.0, "grad_norm": 3.297006619404753, "language_loss": 0.81293416, "learning_rate": 7.577295367924751e-08, "loss": 0.83632147, "num_input_tokens_seen": 164507555, "step": 7607, "time_per_iteration": 2.652890205383301 }, { "auxiliary_loss_clip": 0.01276852, "auxiliary_loss_mlp": 0.01027591, "balance_loss_clip": 1.04977846, "balance_loss_mlp": 1.02049839, "epoch": 0.914807911982204, "flos": 25773627012480.0, "grad_norm": 1.6464482731844308, "language_loss": 0.82236636, "learning_rate": 7.556071366531002e-08, "loss": 0.84541082, "num_input_tokens_seen": 164528525, "step": 7608, "time_per_iteration": 2.7772531509399414 }, { "auxiliary_loss_clip": 0.01221777, "auxiliary_loss_mlp": 0.01026886, "balance_loss_clip": 1.04893374, "balance_loss_mlp": 1.01985228, "epoch": 0.9149281548728432, "flos": 19208043636480.0, "grad_norm": 2.174642256299152, "language_loss": 0.78717339, "learning_rate": 7.53487655872822e-08, "loss": 0.80966002, "num_input_tokens_seen": 164547695, "step": 7609, "time_per_iteration": 2.59081768989563 }, { "auxiliary_loss_clip": 0.01374214, "auxiliary_loss_mlp": 0.01025727, "balance_loss_clip": 1.04117811, "balance_loss_mlp": 1.01899457, "epoch": 0.9150483977634822, "flos": 26870554500480.0, "grad_norm": 1.862325600411536, "language_loss": 0.74073219, "learning_rate": 7.513710947731656e-08, "loss": 0.76473165, "num_input_tokens_seen": 164568905, "step": 7610, "time_per_iteration": 2.803830623626709 }, { "auxiliary_loss_clip": 0.01266132, "auxiliary_loss_mlp": 0.0103444, "balance_loss_clip": 1.04510164, "balance_loss_mlp": 1.0274632, "epoch": 0.9151686406541213, "flos": 21908956953600.0, "grad_norm": 1.6853970045568514, "language_loss": 0.85172498, "learning_rate": 7.492574536752095e-08, "loss": 0.87473071, "num_input_tokens_seen": 164588895, "step": 7611, "time_per_iteration": 2.647684335708618 }, { "auxiliary_loss_clip": 0.01217942, "auxiliary_loss_mlp": 0.01025701, "balance_loss_clip": 1.04822779, "balance_loss_mlp": 1.018888, "epoch": 0.9152888835447605, "flos": 27308556944640.0, "grad_norm": 2.256371791795798, "language_loss": 0.78216332, "learning_rate": 7.471467328995907e-08, "loss": 0.80459976, "num_input_tokens_seen": 164607705, "step": 7612, "time_per_iteration": 2.688703775405884 }, { "auxiliary_loss_clip": 0.01423659, "auxiliary_loss_mlp": 0.01023334, "balance_loss_clip": 1.03873563, "balance_loss_mlp": 1.01669741, "epoch": 0.9154091264353995, "flos": 13370728510080.0, "grad_norm": 2.8761015641131626, "language_loss": 0.60217571, "learning_rate": 7.450389327665018e-08, "loss": 0.62664557, "num_input_tokens_seen": 164625540, "step": 7613, "time_per_iteration": 2.875971794128418 }, { "auxiliary_loss_clip": 0.01329267, "auxiliary_loss_mlp": 0.01027098, "balance_loss_clip": 1.05176497, "balance_loss_mlp": 1.02010345, "epoch": 0.9155293693260386, "flos": 20193037367040.0, "grad_norm": 2.3366484079259062, "language_loss": 0.67810369, "learning_rate": 7.429340535957029e-08, "loss": 0.70166731, "num_input_tokens_seen": 164640735, "step": 7614, "time_per_iteration": 3.444566011428833 }, { "auxiliary_loss_clip": 0.01273595, "auxiliary_loss_mlp": 0.01022533, "balance_loss_clip": 1.04636383, "balance_loss_mlp": 1.01592028, "epoch": 0.9156496122166777, "flos": 19354990176000.0, "grad_norm": 6.301799501942848, "language_loss": 0.7108407, "learning_rate": 7.40832095706494e-08, "loss": 0.73380196, "num_input_tokens_seen": 164657430, "step": 7615, "time_per_iteration": 2.6831982135772705 }, { "auxiliary_loss_clip": 0.01324428, "auxiliary_loss_mlp": 0.01029485, "balance_loss_clip": 1.0449338, "balance_loss_mlp": 1.02277327, "epoch": 0.9157698551073168, "flos": 21107287261440.0, "grad_norm": 2.134890897511472, "language_loss": 0.80454981, "learning_rate": 7.387330594177443e-08, "loss": 0.82808888, "num_input_tokens_seen": 164679505, "step": 7616, "time_per_iteration": 2.6993863582611084 }, { "auxiliary_loss_clip": 0.01318063, "auxiliary_loss_mlp": 0.01021223, "balance_loss_clip": 1.04342687, "balance_loss_mlp": 1.01438308, "epoch": 0.9158900979979558, "flos": 25193167228800.0, "grad_norm": 2.1260339494485607, "language_loss": 0.79404509, "learning_rate": 7.366369450478749e-08, "loss": 0.81743801, "num_input_tokens_seen": 164700615, "step": 7617, "time_per_iteration": 2.7368505001068115 }, { "auxiliary_loss_clip": 0.01316713, "auxiliary_loss_mlp": 0.01026839, "balance_loss_clip": 1.04425704, "balance_loss_mlp": 1.02024627, "epoch": 0.916010340888595, "flos": 30146648302080.0, "grad_norm": 2.948781655402579, "language_loss": 0.66694444, "learning_rate": 7.345437529148646e-08, "loss": 0.69037998, "num_input_tokens_seen": 164719625, "step": 7618, "time_per_iteration": 2.7175252437591553 }, { "auxiliary_loss_clip": 0.01321973, "auxiliary_loss_mlp": 0.01022858, "balance_loss_clip": 1.0441258, "balance_loss_mlp": 1.01638806, "epoch": 0.9161305837792341, "flos": 17091827907840.0, "grad_norm": 1.9744831754307444, "language_loss": 0.72649473, "learning_rate": 7.324534833362483e-08, "loss": 0.74994302, "num_input_tokens_seen": 164737200, "step": 7619, "time_per_iteration": 2.7575128078460693 }, { "auxiliary_loss_clip": 0.01269583, "auxiliary_loss_mlp": 0.01027513, "balance_loss_clip": 1.04737139, "balance_loss_mlp": 1.02086377, "epoch": 0.9162508266698731, "flos": 22893699288960.0, "grad_norm": 1.7112773577309563, "language_loss": 0.68870497, "learning_rate": 7.303661366291192e-08, "loss": 0.71167588, "num_input_tokens_seen": 164757870, "step": 7620, "time_per_iteration": 3.8878109455108643 }, { "auxiliary_loss_clip": 0.01372102, "auxiliary_loss_mlp": 0.01031197, "balance_loss_clip": 1.04147673, "balance_loss_mlp": 1.02438462, "epoch": 0.9163710695605123, "flos": 19974808287360.0, "grad_norm": 1.6767195427056085, "language_loss": 0.81866676, "learning_rate": 7.28281713110126e-08, "loss": 0.84269977, "num_input_tokens_seen": 164775945, "step": 7621, "time_per_iteration": 3.678748846054077 }, { "auxiliary_loss_clip": 0.01274193, "auxiliary_loss_mlp": 0.0102691, "balance_loss_clip": 1.04878068, "balance_loss_mlp": 1.0201118, "epoch": 0.9164913124511513, "flos": 22783812606720.0, "grad_norm": 2.1156315246112043, "language_loss": 0.77235758, "learning_rate": 7.262002130954759e-08, "loss": 0.79536867, "num_input_tokens_seen": 164794400, "step": 7622, "time_per_iteration": 2.7222959995269775 }, { "auxiliary_loss_clip": 0.01375073, "auxiliary_loss_mlp": 0.01026671, "balance_loss_clip": 1.04267216, "balance_loss_mlp": 1.02001905, "epoch": 0.9166115553417904, "flos": 24900854348160.0, "grad_norm": 1.7454492833268143, "language_loss": 0.78861547, "learning_rate": 7.241216369009296e-08, "loss": 0.81263292, "num_input_tokens_seen": 164814585, "step": 7623, "time_per_iteration": 3.6829864978790283 }, { "auxiliary_loss_clip": 0.01172497, "auxiliary_loss_mlp": 0.01031247, "balance_loss_clip": 1.04746616, "balance_loss_mlp": 1.02421403, "epoch": 0.9167317982324296, "flos": 25702919089920.0, "grad_norm": 1.9275015611482877, "language_loss": 0.66671312, "learning_rate": 7.220459848418037e-08, "loss": 0.68875057, "num_input_tokens_seen": 164834660, "step": 7624, "time_per_iteration": 2.61224627494812 }, { "auxiliary_loss_clip": 0.01172284, "auxiliary_loss_mlp": 0.01026252, "balance_loss_clip": 1.05051088, "balance_loss_mlp": 1.01933455, "epoch": 0.9168520411230686, "flos": 15632813370240.0, "grad_norm": 2.1029756957926056, "language_loss": 0.79768789, "learning_rate": 7.199732572329708e-08, "loss": 0.81967324, "num_input_tokens_seen": 164852560, "step": 7625, "time_per_iteration": 2.562105417251587 }, { "auxiliary_loss_clip": 0.01223317, "auxiliary_loss_mlp": 0.01025809, "balance_loss_clip": 1.04381323, "balance_loss_mlp": 1.01910615, "epoch": 0.9169722840137077, "flos": 30258151096320.0, "grad_norm": 2.760964531289636, "language_loss": 0.76240265, "learning_rate": 7.179034543888684e-08, "loss": 0.78489393, "num_input_tokens_seen": 164872065, "step": 7626, "time_per_iteration": 3.655902862548828 }, { "auxiliary_loss_clip": 0.01226556, "auxiliary_loss_mlp": 0.01025046, "balance_loss_clip": 1.04811311, "balance_loss_mlp": 1.01861513, "epoch": 0.9170925269043467, "flos": 22491643380480.0, "grad_norm": 2.1816700372421667, "language_loss": 0.77638471, "learning_rate": 7.158365766234808e-08, "loss": 0.79890072, "num_input_tokens_seen": 164890915, "step": 7627, "time_per_iteration": 2.6988086700439453 }, { "auxiliary_loss_clip": 0.01316525, "auxiliary_loss_mlp": 0.01027964, "balance_loss_clip": 1.04022062, "balance_loss_mlp": 1.02035594, "epoch": 0.9172127697949859, "flos": 22893914770560.0, "grad_norm": 2.247379273378707, "language_loss": 0.72328025, "learning_rate": 7.137726242503527e-08, "loss": 0.74672508, "num_input_tokens_seen": 164909835, "step": 7628, "time_per_iteration": 2.660611152648926 }, { "auxiliary_loss_clip": 0.01221647, "auxiliary_loss_mlp": 0.02567981, "balance_loss_clip": 1.0487113, "balance_loss_mlp": 0.99992299, "epoch": 0.917333012685625, "flos": 17451867882240.0, "grad_norm": 6.452056485032234, "language_loss": 0.78294754, "learning_rate": 7.11711597582585e-08, "loss": 0.82084382, "num_input_tokens_seen": 164927195, "step": 7629, "time_per_iteration": 2.649448871612549 }, { "auxiliary_loss_clip": 0.01320037, "auxiliary_loss_mlp": 0.01024766, "balance_loss_clip": 1.04012525, "balance_loss_mlp": 1.01815891, "epoch": 0.917453255576264, "flos": 14318949692160.0, "grad_norm": 2.9769075127472857, "language_loss": 0.80159128, "learning_rate": 7.096534969328271e-08, "loss": 0.82503933, "num_input_tokens_seen": 164944640, "step": 7630, "time_per_iteration": 2.6509616374969482 }, { "auxiliary_loss_clip": 0.0127458, "auxiliary_loss_mlp": 0.01022663, "balance_loss_clip": 1.04336071, "balance_loss_mlp": 1.01592445, "epoch": 0.9175734984669032, "flos": 20741177888640.0, "grad_norm": 1.9158430202860188, "language_loss": 0.8369109, "learning_rate": 7.075983226132987e-08, "loss": 0.85988331, "num_input_tokens_seen": 164963570, "step": 7631, "time_per_iteration": 2.7028024196624756 }, { "auxiliary_loss_clip": 0.01277672, "auxiliary_loss_mlp": 0.02569206, "balance_loss_clip": 1.04585576, "balance_loss_mlp": 0.99991721, "epoch": 0.9176937413575422, "flos": 14830497233280.0, "grad_norm": 3.233619369603297, "language_loss": 0.79308569, "learning_rate": 7.055460749357656e-08, "loss": 0.83155453, "num_input_tokens_seen": 164979850, "step": 7632, "time_per_iteration": 2.7047464847564697 }, { "auxiliary_loss_clip": 0.01268613, "auxiliary_loss_mlp": 0.01025657, "balance_loss_clip": 1.04753482, "balance_loss_mlp": 1.01814055, "epoch": 0.9178139842481813, "flos": 18474603828480.0, "grad_norm": 3.165785231860029, "language_loss": 0.7017293, "learning_rate": 7.034967542115521e-08, "loss": 0.72467196, "num_input_tokens_seen": 164998115, "step": 7633, "time_per_iteration": 2.616337537765503 }, { "auxiliary_loss_clip": 0.01218409, "auxiliary_loss_mlp": 0.02565981, "balance_loss_clip": 1.04640913, "balance_loss_mlp": 0.99993908, "epoch": 0.9179342271388204, "flos": 20047455544320.0, "grad_norm": 2.6501339691547976, "language_loss": 0.75613189, "learning_rate": 7.014503607515388e-08, "loss": 0.79397583, "num_input_tokens_seen": 165017420, "step": 7634, "time_per_iteration": 2.6475989818573 }, { "auxiliary_loss_clip": 0.01269648, "auxiliary_loss_mlp": 0.01027945, "balance_loss_clip": 1.04885173, "balance_loss_mlp": 1.02063775, "epoch": 0.9180544700294595, "flos": 24676232647680.0, "grad_norm": 3.1631165642956525, "language_loss": 0.68382359, "learning_rate": 6.994068948661592e-08, "loss": 0.70679951, "num_input_tokens_seen": 165035575, "step": 7635, "time_per_iteration": 2.6665782928466797 }, { "auxiliary_loss_clip": 0.01223024, "auxiliary_loss_mlp": 0.01030009, "balance_loss_clip": 1.04875612, "balance_loss_mlp": 1.02227867, "epoch": 0.9181747129200986, "flos": 16727478301440.0, "grad_norm": 1.985921368413001, "language_loss": 0.76900768, "learning_rate": 6.973663568654142e-08, "loss": 0.791538, "num_input_tokens_seen": 165053280, "step": 7636, "time_per_iteration": 2.709414005279541 }, { "auxiliary_loss_clip": 0.01174557, "auxiliary_loss_mlp": 0.01027371, "balance_loss_clip": 1.05110741, "balance_loss_mlp": 1.02047133, "epoch": 0.9182949558107377, "flos": 24271626873600.0, "grad_norm": 2.2495263401601835, "language_loss": 0.65652144, "learning_rate": 6.953287470588386e-08, "loss": 0.67854071, "num_input_tokens_seen": 165071235, "step": 7637, "time_per_iteration": 2.584444999694824 }, { "auxiliary_loss_clip": 0.01224121, "auxiliary_loss_mlp": 0.01025254, "balance_loss_clip": 1.04515314, "balance_loss_mlp": 1.01796174, "epoch": 0.9184151987013768, "flos": 22082117443200.0, "grad_norm": 2.292178211436983, "language_loss": 0.86096376, "learning_rate": 6.932940657555452e-08, "loss": 0.88345748, "num_input_tokens_seen": 165087365, "step": 7638, "time_per_iteration": 2.619983196258545 }, { "auxiliary_loss_clip": 0.01167845, "auxiliary_loss_mlp": 0.01021876, "balance_loss_clip": 1.0478313, "balance_loss_mlp": 1.01545024, "epoch": 0.9185354415920158, "flos": 32166732257280.0, "grad_norm": 1.4853165154290404, "language_loss": 0.76617765, "learning_rate": 6.912623132641938e-08, "loss": 0.78807485, "num_input_tokens_seen": 165112455, "step": 7639, "time_per_iteration": 2.720280408859253 }, { "auxiliary_loss_clip": 0.01275896, "auxiliary_loss_mlp": 0.01026907, "balance_loss_clip": 1.04598534, "balance_loss_mlp": 1.01965356, "epoch": 0.918655684482655, "flos": 20997831542400.0, "grad_norm": 2.0311636903400476, "language_loss": 0.76998258, "learning_rate": 6.892334898929952e-08, "loss": 0.79301065, "num_input_tokens_seen": 165132700, "step": 7640, "time_per_iteration": 2.728938579559326 }, { "auxiliary_loss_clip": 0.01217613, "auxiliary_loss_mlp": 0.01023944, "balance_loss_clip": 1.04508507, "balance_loss_mlp": 1.01737547, "epoch": 0.918775927373294, "flos": 15560704817280.0, "grad_norm": 2.0202243670884448, "language_loss": 0.84753656, "learning_rate": 6.872075959497236e-08, "loss": 0.86995208, "num_input_tokens_seen": 165151475, "step": 7641, "time_per_iteration": 2.5756380558013916 }, { "auxiliary_loss_clip": 0.01122694, "auxiliary_loss_mlp": 0.01024638, "balance_loss_clip": 1.04605556, "balance_loss_mlp": 1.01780105, "epoch": 0.9188961702639331, "flos": 29934057657600.0, "grad_norm": 4.817342337260383, "language_loss": 0.82826632, "learning_rate": 6.85184631741702e-08, "loss": 0.84973967, "num_input_tokens_seen": 165172040, "step": 7642, "time_per_iteration": 2.683485269546509 }, { "auxiliary_loss_clip": 0.01216449, "auxiliary_loss_mlp": 0.01027028, "balance_loss_clip": 1.04508853, "balance_loss_mlp": 1.0200218, "epoch": 0.9190164131545723, "flos": 20701244943360.0, "grad_norm": 2.0683013592185313, "language_loss": 0.77649605, "learning_rate": 6.831645975758161e-08, "loss": 0.79893082, "num_input_tokens_seen": 165189980, "step": 7643, "time_per_iteration": 2.5841498374938965 }, { "auxiliary_loss_clip": 0.01267739, "auxiliary_loss_mlp": 0.01028041, "balance_loss_clip": 1.04525185, "balance_loss_mlp": 1.02057886, "epoch": 0.9191366560452113, "flos": 25629912696960.0, "grad_norm": 5.9666980166046, "language_loss": 0.67322898, "learning_rate": 6.811474937585026e-08, "loss": 0.69618678, "num_input_tokens_seen": 165209770, "step": 7644, "time_per_iteration": 2.7142460346221924 }, { "auxiliary_loss_clip": 0.01318085, "auxiliary_loss_mlp": 0.01025401, "balance_loss_clip": 1.04414177, "balance_loss_mlp": 1.01923442, "epoch": 0.9192568989358504, "flos": 21434325615360.0, "grad_norm": 1.6448751413726141, "language_loss": 0.79166591, "learning_rate": 6.79133320595755e-08, "loss": 0.81510079, "num_input_tokens_seen": 165229690, "step": 7645, "time_per_iteration": 2.6831910610198975 }, { "auxiliary_loss_clip": 0.01273314, "auxiliary_loss_mlp": 0.0102307, "balance_loss_clip": 1.04812002, "balance_loss_mlp": 1.01647806, "epoch": 0.9193771418264896, "flos": 23185078416000.0, "grad_norm": 1.8958644704220184, "language_loss": 0.75231707, "learning_rate": 6.771220783931198e-08, "loss": 0.77528089, "num_input_tokens_seen": 165249850, "step": 7646, "time_per_iteration": 4.47619104385376 }, { "auxiliary_loss_clip": 0.01460073, "auxiliary_loss_mlp": 0.02506179, "balance_loss_clip": 1.03284943, "balance_loss_mlp": 0.99990672, "epoch": 0.9194973847171286, "flos": 70582963184640.0, "grad_norm": 0.8501237492570174, "language_loss": 0.64562428, "learning_rate": 6.751137674556994e-08, "loss": 0.68528676, "num_input_tokens_seen": 165310235, "step": 7647, "time_per_iteration": 3.8085343837738037 }, { "auxiliary_loss_clip": 0.01225331, "auxiliary_loss_mlp": 0.01026706, "balance_loss_clip": 1.04545951, "balance_loss_mlp": 1.01982141, "epoch": 0.9196176276077677, "flos": 14720682378240.0, "grad_norm": 2.14933106354972, "language_loss": 0.77659512, "learning_rate": 6.731083880881572e-08, "loss": 0.79911542, "num_input_tokens_seen": 165326455, "step": 7648, "time_per_iteration": 3.0257620811462402 }, { "auxiliary_loss_clip": 0.01272112, "auxiliary_loss_mlp": 0.01025692, "balance_loss_clip": 1.04872465, "balance_loss_mlp": 1.01901007, "epoch": 0.9197378704984068, "flos": 23294893271040.0, "grad_norm": 2.1587517348349694, "language_loss": 0.80774486, "learning_rate": 6.711059405947072e-08, "loss": 0.83072287, "num_input_tokens_seen": 165344645, "step": 7649, "time_per_iteration": 3.5893969535827637 }, { "auxiliary_loss_clip": 0.01312697, "auxiliary_loss_mlp": 0.0102229, "balance_loss_clip": 1.04435229, "balance_loss_mlp": 1.01560521, "epoch": 0.9198581133890459, "flos": 20302564913280.0, "grad_norm": 2.3922156479850587, "language_loss": 0.77253211, "learning_rate": 6.691064252791156e-08, "loss": 0.79588199, "num_input_tokens_seen": 165364120, "step": 7650, "time_per_iteration": 2.722959518432617 }, { "auxiliary_loss_clip": 0.01358275, "auxiliary_loss_mlp": 0.01026407, "balance_loss_clip": 1.04144871, "balance_loss_mlp": 1.01940358, "epoch": 0.9199783562796849, "flos": 17675663569920.0, "grad_norm": 1.6051689981763821, "language_loss": 0.77922815, "learning_rate": 6.67109842444713e-08, "loss": 0.80307496, "num_input_tokens_seen": 165383050, "step": 7651, "time_per_iteration": 2.776198148727417 }, { "auxiliary_loss_clip": 0.01227318, "auxiliary_loss_mlp": 0.02566657, "balance_loss_clip": 1.0515759, "balance_loss_mlp": 0.99993849, "epoch": 0.9200985991703241, "flos": 17676022705920.0, "grad_norm": 1.998464192675731, "language_loss": 0.76622158, "learning_rate": 6.651161923943704e-08, "loss": 0.80416131, "num_input_tokens_seen": 165400955, "step": 7652, "time_per_iteration": 4.059560060501099 }, { "auxiliary_loss_clip": 0.01219453, "auxiliary_loss_mlp": 0.01026428, "balance_loss_clip": 1.0447433, "balance_loss_mlp": 1.01881337, "epoch": 0.9202188420609632, "flos": 20996574566400.0, "grad_norm": 1.67228467511947, "language_loss": 0.76783705, "learning_rate": 6.631254754305326e-08, "loss": 0.79029584, "num_input_tokens_seen": 165420415, "step": 7653, "time_per_iteration": 2.6930644512176514 }, { "auxiliary_loss_clip": 0.01172676, "auxiliary_loss_mlp": 0.0102129, "balance_loss_clip": 1.0479728, "balance_loss_mlp": 1.01464367, "epoch": 0.9203390849516022, "flos": 13918222586880.0, "grad_norm": 2.26555383502116, "language_loss": 0.78172702, "learning_rate": 6.611376918551848e-08, "loss": 0.80366665, "num_input_tokens_seen": 165439200, "step": 7654, "time_per_iteration": 2.580793619155884 }, { "auxiliary_loss_clip": 0.01218444, "auxiliary_loss_mlp": 0.02567134, "balance_loss_clip": 1.0421176, "balance_loss_mlp": 0.99988937, "epoch": 0.9204593278422414, "flos": 21175912195200.0, "grad_norm": 3.0334438680721822, "language_loss": 0.79789042, "learning_rate": 6.591528419698744e-08, "loss": 0.83574617, "num_input_tokens_seen": 165458985, "step": 7655, "time_per_iteration": 2.6849617958068848 }, { "auxiliary_loss_clip": 0.01272939, "auxiliary_loss_mlp": 0.01025126, "balance_loss_clip": 1.04387188, "balance_loss_mlp": 1.01866531, "epoch": 0.9205795707328804, "flos": 14501375890560.0, "grad_norm": 2.2706713730159582, "language_loss": 0.83580172, "learning_rate": 6.571709260756986e-08, "loss": 0.85878241, "num_input_tokens_seen": 165475630, "step": 7656, "time_per_iteration": 2.634756326675415 }, { "auxiliary_loss_clip": 0.0122522, "auxiliary_loss_mlp": 0.01025291, "balance_loss_clip": 1.05240607, "balance_loss_mlp": 1.01831436, "epoch": 0.9206998136235195, "flos": 22417559579520.0, "grad_norm": 15.807396454302408, "language_loss": 0.76015913, "learning_rate": 6.551919444733122e-08, "loss": 0.7826643, "num_input_tokens_seen": 165493445, "step": 7657, "time_per_iteration": 2.733100414276123 }, { "auxiliary_loss_clip": 0.01271098, "auxiliary_loss_mlp": 0.01026349, "balance_loss_clip": 1.04711115, "balance_loss_mlp": 1.01949143, "epoch": 0.9208200565141585, "flos": 53358407544960.0, "grad_norm": 2.1112556293668585, "language_loss": 0.66032505, "learning_rate": 6.53215897462931e-08, "loss": 0.68329954, "num_input_tokens_seen": 165517200, "step": 7658, "time_per_iteration": 3.0016939640045166 }, { "auxiliary_loss_clip": 0.01220283, "auxiliary_loss_mlp": 0.0102846, "balance_loss_clip": 1.04574156, "balance_loss_mlp": 1.02156711, "epoch": 0.9209402994047977, "flos": 30589139946240.0, "grad_norm": 2.998306283332992, "language_loss": 0.74829656, "learning_rate": 6.512427853443103e-08, "loss": 0.77078402, "num_input_tokens_seen": 165539280, "step": 7659, "time_per_iteration": 2.6631088256835938 }, { "auxiliary_loss_clip": 0.01226243, "auxiliary_loss_mlp": 0.01022481, "balance_loss_clip": 1.04753995, "balance_loss_mlp": 1.01593387, "epoch": 0.9210605422954368, "flos": 29132711187840.0, "grad_norm": 1.6626931397643498, "language_loss": 0.7600981, "learning_rate": 6.492726084167799e-08, "loss": 0.78258538, "num_input_tokens_seen": 165561395, "step": 7660, "time_per_iteration": 2.7732231616973877 }, { "auxiliary_loss_clip": 0.01059711, "auxiliary_loss_mlp": 0.00999598, "balance_loss_clip": 1.00574422, "balance_loss_mlp": 0.99872828, "epoch": 0.9211807851860758, "flos": 54853838472960.0, "grad_norm": 0.7738463302921195, "language_loss": 0.57456428, "learning_rate": 6.473053669792072e-08, "loss": 0.59515738, "num_input_tokens_seen": 165616085, "step": 7661, "time_per_iteration": 3.025599241256714 }, { "auxiliary_loss_clip": 0.01223452, "auxiliary_loss_mlp": 0.01025222, "balance_loss_clip": 1.04711771, "balance_loss_mlp": 1.01848662, "epoch": 0.921301028076715, "flos": 19201974238080.0, "grad_norm": 3.470860934129358, "language_loss": 0.72978812, "learning_rate": 6.453410613300248e-08, "loss": 0.75227487, "num_input_tokens_seen": 165634015, "step": 7662, "time_per_iteration": 2.687568187713623 }, { "auxiliary_loss_clip": 0.01418205, "auxiliary_loss_mlp": 0.01022506, "balance_loss_clip": 1.04125524, "balance_loss_mlp": 1.01592016, "epoch": 0.921421270967354, "flos": 27526893765120.0, "grad_norm": 1.9068201663238014, "language_loss": 0.58195066, "learning_rate": 6.43379691767214e-08, "loss": 0.60635781, "num_input_tokens_seen": 165653220, "step": 7663, "time_per_iteration": 2.7942821979522705 }, { "auxiliary_loss_clip": 0.01274428, "auxiliary_loss_mlp": 0.00999458, "balance_loss_clip": 1.00766158, "balance_loss_mlp": 0.99861139, "epoch": 0.9215415138579931, "flos": 70209311955840.0, "grad_norm": 0.7202495577259429, "language_loss": 0.55046725, "learning_rate": 6.414212585883105e-08, "loss": 0.57320619, "num_input_tokens_seen": 165715850, "step": 7664, "time_per_iteration": 3.3678667545318604 }, { "auxiliary_loss_clip": 0.01274705, "auxiliary_loss_mlp": 0.01029629, "balance_loss_clip": 1.04661083, "balance_loss_mlp": 1.02262831, "epoch": 0.9216617567486323, "flos": 35553107790720.0, "grad_norm": 1.6195241331758947, "language_loss": 0.69930482, "learning_rate": 6.394657620904143e-08, "loss": 0.72234809, "num_input_tokens_seen": 165738960, "step": 7665, "time_per_iteration": 2.802504777908325 }, { "auxiliary_loss_clip": 0.01175245, "auxiliary_loss_mlp": 0.01025739, "balance_loss_clip": 1.0489397, "balance_loss_mlp": 1.01874495, "epoch": 0.9217819996392713, "flos": 29533330552320.0, "grad_norm": 1.8995984746531938, "language_loss": 0.71318138, "learning_rate": 6.375132025701657e-08, "loss": 0.73519117, "num_input_tokens_seen": 165761260, "step": 7666, "time_per_iteration": 2.6561758518218994 }, { "auxiliary_loss_clip": 0.01178049, "auxiliary_loss_mlp": 0.01028017, "balance_loss_clip": 1.05194199, "balance_loss_mlp": 1.0209713, "epoch": 0.9219022425299104, "flos": 14574669592320.0, "grad_norm": 4.254408088398737, "language_loss": 0.69506627, "learning_rate": 6.355635803237724e-08, "loss": 0.71712697, "num_input_tokens_seen": 165776960, "step": 7667, "time_per_iteration": 2.629960298538208 }, { "auxiliary_loss_clip": 0.01223089, "auxiliary_loss_mlp": 0.01026641, "balance_loss_clip": 1.04634476, "balance_loss_mlp": 1.01961088, "epoch": 0.9220224854205495, "flos": 18077503996800.0, "grad_norm": 1.9310618555927483, "language_loss": 0.799523, "learning_rate": 6.336168956469867e-08, "loss": 0.82202035, "num_input_tokens_seen": 165795435, "step": 7668, "time_per_iteration": 2.6667723655700684 }, { "auxiliary_loss_clip": 0.01265709, "auxiliary_loss_mlp": 0.01024111, "balance_loss_clip": 1.04516649, "balance_loss_mlp": 1.01749182, "epoch": 0.9221427283111886, "flos": 24790464875520.0, "grad_norm": 1.6411523776650996, "language_loss": 0.7191903, "learning_rate": 6.316731488351168e-08, "loss": 0.74208844, "num_input_tokens_seen": 165816625, "step": 7669, "time_per_iteration": 2.7147057056427 }, { "auxiliary_loss_clip": 0.0122288, "auxiliary_loss_mlp": 0.01025078, "balance_loss_clip": 1.0485636, "balance_loss_mlp": 1.01795268, "epoch": 0.9222629712018277, "flos": 13845036625920.0, "grad_norm": 1.7786618687253717, "language_loss": 0.63423848, "learning_rate": 6.297323401830334e-08, "loss": 0.65671808, "num_input_tokens_seen": 165835410, "step": 7670, "time_per_iteration": 2.619178056716919 }, { "auxiliary_loss_clip": 0.01223129, "auxiliary_loss_mlp": 0.01020799, "balance_loss_clip": 1.04679465, "balance_loss_mlp": 1.01378095, "epoch": 0.9223832140924668, "flos": 21616177196160.0, "grad_norm": 2.2254605605740645, "language_loss": 0.68947113, "learning_rate": 6.277944699851523e-08, "loss": 0.71191043, "num_input_tokens_seen": 165854930, "step": 7671, "time_per_iteration": 3.8282272815704346 }, { "auxiliary_loss_clip": 0.01173528, "auxiliary_loss_mlp": 0.01023101, "balance_loss_clip": 1.04925799, "balance_loss_mlp": 1.01594806, "epoch": 0.9225034569831059, "flos": 21142084561920.0, "grad_norm": 1.9463046491098237, "language_loss": 0.73851609, "learning_rate": 6.25859538535447e-08, "loss": 0.76048237, "num_input_tokens_seen": 165875725, "step": 7672, "time_per_iteration": 2.637486219406128 }, { "auxiliary_loss_clip": 0.01266246, "auxiliary_loss_mlp": 0.01025176, "balance_loss_clip": 1.04543066, "balance_loss_mlp": 1.01812458, "epoch": 0.9226236998737449, "flos": 12495046844160.0, "grad_norm": 14.046082153230564, "language_loss": 0.77396375, "learning_rate": 6.239275461274474e-08, "loss": 0.79687798, "num_input_tokens_seen": 165892100, "step": 7673, "time_per_iteration": 3.6801514625549316 }, { "auxiliary_loss_clip": 0.01221687, "auxiliary_loss_mlp": 0.01024508, "balance_loss_clip": 1.04797602, "balance_loss_mlp": 1.0176388, "epoch": 0.9227439427643841, "flos": 26214071581440.0, "grad_norm": 1.9502130959820356, "language_loss": 0.85965276, "learning_rate": 6.219984930542299e-08, "loss": 0.88211465, "num_input_tokens_seen": 165912840, "step": 7674, "time_per_iteration": 2.7060203552246094 }, { "auxiliary_loss_clip": 0.01225628, "auxiliary_loss_mlp": 0.01027493, "balance_loss_clip": 1.04725337, "balance_loss_mlp": 1.02113593, "epoch": 0.9228641856550232, "flos": 17967581400960.0, "grad_norm": 2.186827642523092, "language_loss": 0.75872844, "learning_rate": 6.200723796084383e-08, "loss": 0.78125966, "num_input_tokens_seen": 165930935, "step": 7675, "time_per_iteration": 3.6751155853271484 }, { "auxiliary_loss_clip": 0.01227925, "auxiliary_loss_mlp": 0.00998769, "balance_loss_clip": 1.00637376, "balance_loss_mlp": 0.99780953, "epoch": 0.9229844285456622, "flos": 70420609710720.0, "grad_norm": 0.7616374138038766, "language_loss": 0.62986958, "learning_rate": 6.181492060822546e-08, "loss": 0.65213656, "num_input_tokens_seen": 165991110, "step": 7676, "time_per_iteration": 3.1840269565582275 }, { "auxiliary_loss_clip": 0.01371811, "auxiliary_loss_mlp": 0.010305, "balance_loss_clip": 1.04186141, "balance_loss_mlp": 1.0233053, "epoch": 0.9231046714363014, "flos": 17967832796160.0, "grad_norm": 2.0612843727714836, "language_loss": 0.82026619, "learning_rate": 6.162289727674274e-08, "loss": 0.8442893, "num_input_tokens_seen": 166008790, "step": 7677, "time_per_iteration": 2.7368199825286865 }, { "auxiliary_loss_clip": 0.01320509, "auxiliary_loss_mlp": 0.01025488, "balance_loss_clip": 1.0452323, "balance_loss_mlp": 1.01947117, "epoch": 0.9232249143269404, "flos": 17858233422720.0, "grad_norm": 2.4349435396949644, "language_loss": 0.87797487, "learning_rate": 6.143116799552527e-08, "loss": 0.9014349, "num_input_tokens_seen": 166025035, "step": 7678, "time_per_iteration": 3.6829094886779785 }, { "auxiliary_loss_clip": 0.01226006, "auxiliary_loss_mlp": 0.01026091, "balance_loss_clip": 1.04879463, "balance_loss_mlp": 1.01928425, "epoch": 0.9233451572175795, "flos": 23404384903680.0, "grad_norm": 2.247363750575837, "language_loss": 0.56475681, "learning_rate": 6.123973279365802e-08, "loss": 0.58727777, "num_input_tokens_seen": 166044010, "step": 7679, "time_per_iteration": 2.716836929321289 }, { "auxiliary_loss_clip": 0.01225648, "auxiliary_loss_mlp": 0.01027205, "balance_loss_clip": 1.04998899, "balance_loss_mlp": 1.02071738, "epoch": 0.9234654001082186, "flos": 17999326045440.0, "grad_norm": 1.807401491478104, "language_loss": 0.77886069, "learning_rate": 6.10485917001824e-08, "loss": 0.80138922, "num_input_tokens_seen": 166061865, "step": 7680, "time_per_iteration": 2.6723430156707764 }, { "auxiliary_loss_clip": 0.01175114, "auxiliary_loss_mlp": 0.01027006, "balance_loss_clip": 1.04453993, "balance_loss_mlp": 1.02065539, "epoch": 0.9235856429988577, "flos": 24750747411840.0, "grad_norm": 1.6487303034629541, "language_loss": 0.81110525, "learning_rate": 6.085774474409322e-08, "loss": 0.83312643, "num_input_tokens_seen": 166082425, "step": 7681, "time_per_iteration": 2.6461377143859863 }, { "auxiliary_loss_clip": 0.01269146, "auxiliary_loss_mlp": 0.01022053, "balance_loss_clip": 1.04811537, "balance_loss_mlp": 1.01548481, "epoch": 0.9237058858894968, "flos": 14099894599680.0, "grad_norm": 2.1838280236195033, "language_loss": 0.70045346, "learning_rate": 6.066719195434267e-08, "loss": 0.72336543, "num_input_tokens_seen": 166100225, "step": 7682, "time_per_iteration": 2.7047085762023926 }, { "auxiliary_loss_clip": 0.01224989, "auxiliary_loss_mlp": 0.01022807, "balance_loss_clip": 1.04950988, "balance_loss_mlp": 1.0156306, "epoch": 0.9238261287801359, "flos": 28694529175680.0, "grad_norm": 2.0082459652859095, "language_loss": 0.66301, "learning_rate": 6.047693335983717e-08, "loss": 0.68548799, "num_input_tokens_seen": 166122570, "step": 7683, "time_per_iteration": 2.776366710662842 }, { "auxiliary_loss_clip": 0.01224373, "auxiliary_loss_mlp": 0.01027985, "balance_loss_clip": 1.04665995, "balance_loss_mlp": 1.02071297, "epoch": 0.923946371670775, "flos": 23111856541440.0, "grad_norm": 2.3833625705966797, "language_loss": 0.82276267, "learning_rate": 6.028696898943853e-08, "loss": 0.84528625, "num_input_tokens_seen": 166141630, "step": 7684, "time_per_iteration": 2.6630938053131104 }, { "auxiliary_loss_clip": 0.01272297, "auxiliary_loss_mlp": 0.0256893, "balance_loss_clip": 1.04495859, "balance_loss_mlp": 0.99992216, "epoch": 0.924066614561414, "flos": 21867120587520.0, "grad_norm": 2.6064861629315335, "language_loss": 0.70919275, "learning_rate": 6.00972988719648e-08, "loss": 0.74760503, "num_input_tokens_seen": 166159865, "step": 7685, "time_per_iteration": 2.6866466999053955 }, { "auxiliary_loss_clip": 0.01321234, "auxiliary_loss_mlp": 0.02568285, "balance_loss_clip": 1.04440653, "balance_loss_mlp": 0.99991548, "epoch": 0.9241868574520532, "flos": 28511887495680.0, "grad_norm": 2.6547624975412485, "language_loss": 0.70908141, "learning_rate": 5.990792303618807e-08, "loss": 0.74797654, "num_input_tokens_seen": 166179445, "step": 7686, "time_per_iteration": 2.7846834659576416 }, { "auxiliary_loss_clip": 0.01327692, "auxiliary_loss_mlp": 0.01025936, "balance_loss_clip": 1.04992151, "balance_loss_mlp": 1.01931143, "epoch": 0.9243071003426923, "flos": 30518324282880.0, "grad_norm": 1.8592493412049176, "language_loss": 0.69803941, "learning_rate": 5.971884151083695e-08, "loss": 0.72157568, "num_input_tokens_seen": 166201855, "step": 7687, "time_per_iteration": 2.7674665451049805 }, { "auxiliary_loss_clip": 0.0127535, "auxiliary_loss_mlp": 0.01023945, "balance_loss_clip": 1.04695463, "balance_loss_mlp": 1.01702785, "epoch": 0.9244273432333313, "flos": 28658331244800.0, "grad_norm": 1.8626296885726128, "language_loss": 0.74090058, "learning_rate": 5.9530054324595124e-08, "loss": 0.76389354, "num_input_tokens_seen": 166221970, "step": 7688, "time_per_iteration": 2.788607358932495 }, { "auxiliary_loss_clip": 0.01107862, "auxiliary_loss_mlp": 0.02504532, "balance_loss_clip": 1.00681317, "balance_loss_mlp": 0.99981117, "epoch": 0.9245475861239704, "flos": 66230589237120.0, "grad_norm": 0.7184408929410301, "language_loss": 0.57520521, "learning_rate": 5.934156150610103e-08, "loss": 0.61132908, "num_input_tokens_seen": 166279335, "step": 7689, "time_per_iteration": 3.239762544631958 }, { "auxiliary_loss_clip": 0.01271996, "auxiliary_loss_mlp": 0.01031694, "balance_loss_clip": 1.04581785, "balance_loss_mlp": 1.02492893, "epoch": 0.9246678290146095, "flos": 24239918142720.0, "grad_norm": 2.55743660460017, "language_loss": 0.79188991, "learning_rate": 5.915336308394914e-08, "loss": 0.81492686, "num_input_tokens_seen": 166298170, "step": 7690, "time_per_iteration": 2.6788175106048584 }, { "auxiliary_loss_clip": 0.01215798, "auxiliary_loss_mlp": 0.01023749, "balance_loss_clip": 1.04647422, "balance_loss_mlp": 1.01692438, "epoch": 0.9247880719052486, "flos": 18988808976000.0, "grad_norm": 1.796220889811145, "language_loss": 0.7702952, "learning_rate": 5.89654590866886e-08, "loss": 0.79269063, "num_input_tokens_seen": 166317670, "step": 7691, "time_per_iteration": 2.6483120918273926 }, { "auxiliary_loss_clip": 0.01427152, "auxiliary_loss_mlp": 0.010236, "balance_loss_clip": 1.04632497, "balance_loss_mlp": 1.01610219, "epoch": 0.9249083147958876, "flos": 24024095274240.0, "grad_norm": 2.161649314901735, "language_loss": 0.88450336, "learning_rate": 5.877784954282483e-08, "loss": 0.90901089, "num_input_tokens_seen": 166337010, "step": 7692, "time_per_iteration": 2.784393072128296 }, { "auxiliary_loss_clip": 0.01227543, "auxiliary_loss_mlp": 0.01027757, "balance_loss_clip": 1.04848897, "balance_loss_mlp": 1.02061307, "epoch": 0.9250285576865268, "flos": 30773972355840.0, "grad_norm": 2.7499012123479307, "language_loss": 0.72801268, "learning_rate": 5.8590534480817963e-08, "loss": 0.75056565, "num_input_tokens_seen": 166358735, "step": 7693, "time_per_iteration": 2.9204342365264893 }, { "auxiliary_loss_clip": 0.0117501, "auxiliary_loss_mlp": 0.01025117, "balance_loss_clip": 1.05112696, "balance_loss_mlp": 1.01834917, "epoch": 0.9251488005771659, "flos": 10633581348480.0, "grad_norm": 2.300030991527989, "language_loss": 0.72775251, "learning_rate": 5.840351392908349e-08, "loss": 0.74975377, "num_input_tokens_seen": 166374455, "step": 7694, "time_per_iteration": 2.528831958770752 }, { "auxiliary_loss_clip": 0.01279563, "auxiliary_loss_mlp": 0.02565513, "balance_loss_clip": 1.04625916, "balance_loss_mlp": 0.99990177, "epoch": 0.9252690434678049, "flos": 23586416052480.0, "grad_norm": 3.443298946856316, "language_loss": 0.71001655, "learning_rate": 5.821678791599205e-08, "loss": 0.74846733, "num_input_tokens_seen": 166393900, "step": 7695, "time_per_iteration": 2.7325503826141357 }, { "auxiliary_loss_clip": 0.01271301, "auxiliary_loss_mlp": 0.01026751, "balance_loss_clip": 1.0476346, "balance_loss_mlp": 1.0200243, "epoch": 0.9253892863584441, "flos": 21469158829440.0, "grad_norm": 1.8177289632942266, "language_loss": 0.80650729, "learning_rate": 5.803035646986965e-08, "loss": 0.8294878, "num_input_tokens_seen": 166413235, "step": 7696, "time_per_iteration": 2.675334930419922 }, { "auxiliary_loss_clip": 0.01175136, "auxiliary_loss_mlp": 0.01028389, "balance_loss_clip": 1.04827237, "balance_loss_mlp": 1.02123094, "epoch": 0.9255095292490831, "flos": 17456680304640.0, "grad_norm": 2.785506813784668, "language_loss": 0.67396724, "learning_rate": 5.7844219618998766e-08, "loss": 0.69600248, "num_input_tokens_seen": 166427560, "step": 7697, "time_per_iteration": 3.4817934036254883 }, { "auxiliary_loss_clip": 0.01313765, "auxiliary_loss_mlp": 0.0102423, "balance_loss_clip": 1.04071307, "balance_loss_mlp": 1.01760817, "epoch": 0.9256297721397222, "flos": 24750675584640.0, "grad_norm": 1.7721496804453651, "language_loss": 0.71958768, "learning_rate": 5.765837739161505e-08, "loss": 0.74296767, "num_input_tokens_seen": 166446680, "step": 7698, "time_per_iteration": 3.6690096855163574 }, { "auxiliary_loss_clip": 0.01322025, "auxiliary_loss_mlp": 0.01025278, "balance_loss_clip": 1.04599643, "balance_loss_mlp": 1.01873946, "epoch": 0.9257500150303614, "flos": 23112215677440.0, "grad_norm": 1.666850578159443, "language_loss": 0.74326801, "learning_rate": 5.7472829815911504e-08, "loss": 0.76674104, "num_input_tokens_seen": 166465505, "step": 7699, "time_per_iteration": 2.691148042678833 }, { "auxiliary_loss_clip": 0.01270068, "auxiliary_loss_mlp": 0.01021599, "balance_loss_clip": 1.04485393, "balance_loss_mlp": 1.01441991, "epoch": 0.9258702579210004, "flos": 22564685687040.0, "grad_norm": 1.8451428570727484, "language_loss": 0.82048154, "learning_rate": 5.7287576920035164e-08, "loss": 0.84339821, "num_input_tokens_seen": 166484520, "step": 7700, "time_per_iteration": 2.690112829208374 }, { "auxiliary_loss_clip": 0.01313652, "auxiliary_loss_mlp": 0.01023651, "balance_loss_clip": 1.04554152, "balance_loss_mlp": 1.01728487, "epoch": 0.9259905008116395, "flos": 30004298703360.0, "grad_norm": 1.7864136972950835, "language_loss": 0.76322836, "learning_rate": 5.7102618732088435e-08, "loss": 0.78660136, "num_input_tokens_seen": 166503850, "step": 7701, "time_per_iteration": 3.7211196422576904 }, { "auxiliary_loss_clip": 0.01177097, "auxiliary_loss_mlp": 0.01029833, "balance_loss_clip": 1.04733562, "balance_loss_mlp": 1.02318716, "epoch": 0.9261107437022786, "flos": 24572128055040.0, "grad_norm": 1.7089122409412214, "language_loss": 0.74632955, "learning_rate": 5.6917955280130216e-08, "loss": 0.76839888, "num_input_tokens_seen": 166525330, "step": 7702, "time_per_iteration": 2.6704318523406982 }, { "auxiliary_loss_clip": 0.01220168, "auxiliary_loss_mlp": 0.01024197, "balance_loss_clip": 1.04800177, "balance_loss_mlp": 1.01700854, "epoch": 0.9262309865929177, "flos": 22018448586240.0, "grad_norm": 2.3304280556542722, "language_loss": 0.72060859, "learning_rate": 5.6733586592172755e-08, "loss": 0.74305224, "num_input_tokens_seen": 166544825, "step": 7703, "time_per_iteration": 2.6581056118011475 }, { "auxiliary_loss_clip": 0.01264592, "auxiliary_loss_mlp": 0.0256079, "balance_loss_clip": 1.04283381, "balance_loss_mlp": 0.99991059, "epoch": 0.9263512294835567, "flos": 20339481116160.0, "grad_norm": 1.753744912981716, "language_loss": 0.80246782, "learning_rate": 5.6549512696185244e-08, "loss": 0.84072173, "num_input_tokens_seen": 166563325, "step": 7704, "time_per_iteration": 2.650956630706787 }, { "auxiliary_loss_clip": 0.01171871, "auxiliary_loss_mlp": 0.01026943, "balance_loss_clip": 1.05033946, "balance_loss_mlp": 1.02003217, "epoch": 0.9264714723741959, "flos": 21215378263680.0, "grad_norm": 1.625601258765649, "language_loss": 0.68217891, "learning_rate": 5.636573362009156e-08, "loss": 0.70416701, "num_input_tokens_seen": 166583385, "step": 7705, "time_per_iteration": 3.528310775756836 }, { "auxiliary_loss_clip": 0.01176181, "auxiliary_loss_mlp": 0.01028738, "balance_loss_clip": 1.04870892, "balance_loss_mlp": 1.02084672, "epoch": 0.926591715264835, "flos": 18004964480640.0, "grad_norm": 2.8554600777243797, "language_loss": 0.76712757, "learning_rate": 5.618224939177074e-08, "loss": 0.78917676, "num_input_tokens_seen": 166601290, "step": 7706, "time_per_iteration": 2.587123155593872 }, { "auxiliary_loss_clip": 0.01266142, "auxiliary_loss_mlp": 0.01021581, "balance_loss_clip": 1.04516387, "balance_loss_mlp": 1.01506579, "epoch": 0.926711958155474, "flos": 36167969825280.0, "grad_norm": 1.8130260201248998, "language_loss": 0.70227039, "learning_rate": 5.599906003905719e-08, "loss": 0.7251476, "num_input_tokens_seen": 166623835, "step": 7707, "time_per_iteration": 2.745382070541382 }, { "auxiliary_loss_clip": 0.01225014, "auxiliary_loss_mlp": 0.01028796, "balance_loss_clip": 1.05202866, "balance_loss_mlp": 1.02149189, "epoch": 0.9268322010461132, "flos": 21032736583680.0, "grad_norm": 2.6685322764308417, "language_loss": 0.81577253, "learning_rate": 5.581616558974023e-08, "loss": 0.8383106, "num_input_tokens_seen": 166642400, "step": 7708, "time_per_iteration": 2.6579818725585938 }, { "auxiliary_loss_clip": 0.01229982, "auxiliary_loss_mlp": 0.02567561, "balance_loss_clip": 1.04852152, "balance_loss_mlp": 0.99989283, "epoch": 0.9269524439367522, "flos": 22964838174720.0, "grad_norm": 1.7514163569503698, "language_loss": 0.79663223, "learning_rate": 5.5633566071565444e-08, "loss": 0.8346076, "num_input_tokens_seen": 166661640, "step": 7709, "time_per_iteration": 2.6408329010009766 }, { "auxiliary_loss_clip": 0.01423549, "auxiliary_loss_mlp": 0.01024015, "balance_loss_clip": 1.04381824, "balance_loss_mlp": 1.01750636, "epoch": 0.9270726868273913, "flos": 41975551468800.0, "grad_norm": 2.508073046011233, "language_loss": 0.70424289, "learning_rate": 5.5451261512232896e-08, "loss": 0.72871852, "num_input_tokens_seen": 166684320, "step": 7710, "time_per_iteration": 3.0209579467773438 }, { "auxiliary_loss_clip": 0.01224929, "auxiliary_loss_mlp": 0.01026881, "balance_loss_clip": 1.04506755, "balance_loss_mlp": 1.02006269, "epoch": 0.9271929297180305, "flos": 19791771557760.0, "grad_norm": 3.5500058418484226, "language_loss": 0.62538803, "learning_rate": 5.5269251939397576e-08, "loss": 0.64790612, "num_input_tokens_seen": 166703835, "step": 7711, "time_per_iteration": 3.066128730773926 }, { "auxiliary_loss_clip": 0.01321686, "auxiliary_loss_mlp": 0.01023266, "balance_loss_clip": 1.04141986, "balance_loss_mlp": 1.01628351, "epoch": 0.9273131726086695, "flos": 19968343839360.0, "grad_norm": 2.0488016804574882, "language_loss": 0.7671141, "learning_rate": 5.508753738067073e-08, "loss": 0.79056358, "num_input_tokens_seen": 166723375, "step": 7712, "time_per_iteration": 2.778892755508423 }, { "auxiliary_loss_clip": 0.01225083, "auxiliary_loss_mlp": 0.01019247, "balance_loss_clip": 1.04625845, "balance_loss_mlp": 1.01238036, "epoch": 0.9274334154993086, "flos": 23258587599360.0, "grad_norm": 1.9453749902112667, "language_loss": 0.79160124, "learning_rate": 5.4906117863617875e-08, "loss": 0.81404448, "num_input_tokens_seen": 166742760, "step": 7713, "time_per_iteration": 2.7032482624053955 }, { "auxiliary_loss_clip": 0.01317228, "auxiliary_loss_mlp": 0.01024759, "balance_loss_clip": 1.04257774, "balance_loss_mlp": 1.0183903, "epoch": 0.9275536583899477, "flos": 31795343585280.0, "grad_norm": 1.7803058715346476, "language_loss": 0.78010195, "learning_rate": 5.4724993415760533e-08, "loss": 0.80352187, "num_input_tokens_seen": 166761115, "step": 7714, "time_per_iteration": 2.7863426208496094 }, { "auxiliary_loss_clip": 0.01329858, "auxiliary_loss_mlp": 0.02565364, "balance_loss_clip": 1.04357886, "balance_loss_mlp": 0.99991763, "epoch": 0.9276739012805868, "flos": 18696998885760.0, "grad_norm": 2.1805709756597795, "language_loss": 0.74868041, "learning_rate": 5.454416406457496e-08, "loss": 0.78763264, "num_input_tokens_seen": 166780210, "step": 7715, "time_per_iteration": 2.702840566635132 }, { "auxiliary_loss_clip": 0.01221701, "auxiliary_loss_mlp": 0.01023098, "balance_loss_clip": 1.0466491, "balance_loss_mlp": 1.01637113, "epoch": 0.9277941441712259, "flos": 13879079740800.0, "grad_norm": 2.5938394828002616, "language_loss": 0.7428754, "learning_rate": 5.436362983749299e-08, "loss": 0.76532334, "num_input_tokens_seen": 166795380, "step": 7716, "time_per_iteration": 2.575099229812622 }, { "auxiliary_loss_clip": 0.01313794, "auxiliary_loss_mlp": 0.01018379, "balance_loss_clip": 1.0479629, "balance_loss_mlp": 1.01189685, "epoch": 0.927914387061865, "flos": 23258659426560.0, "grad_norm": 2.1566773996243525, "language_loss": 0.64545077, "learning_rate": 5.418339076190137e-08, "loss": 0.66877252, "num_input_tokens_seen": 166814890, "step": 7717, "time_per_iteration": 2.7410027980804443 }, { "auxiliary_loss_clip": 0.01268234, "auxiliary_loss_mlp": 0.01028123, "balance_loss_clip": 1.0455997, "balance_loss_mlp": 1.02144992, "epoch": 0.9280346299525041, "flos": 18073733068800.0, "grad_norm": 2.12654806262134, "language_loss": 0.88661242, "learning_rate": 5.400344686514202e-08, "loss": 0.90957606, "num_input_tokens_seen": 166832475, "step": 7718, "time_per_iteration": 2.678560495376587 }, { "auxiliary_loss_clip": 0.01219991, "auxiliary_loss_mlp": 0.01023248, "balance_loss_clip": 1.0477854, "balance_loss_mlp": 1.01670039, "epoch": 0.9281548728431431, "flos": 22342901160960.0, "grad_norm": 1.8221807449596406, "language_loss": 0.66893148, "learning_rate": 5.38237981745131e-08, "loss": 0.69136393, "num_input_tokens_seen": 166850590, "step": 7719, "time_per_iteration": 2.6776480674743652 }, { "auxiliary_loss_clip": 0.01224703, "auxiliary_loss_mlp": 0.02564967, "balance_loss_clip": 1.04648209, "balance_loss_mlp": 0.99989748, "epoch": 0.9282751157337822, "flos": 18843765857280.0, "grad_norm": 2.0051933085365383, "language_loss": 0.81323123, "learning_rate": 5.364444471726592e-08, "loss": 0.85112786, "num_input_tokens_seen": 166869795, "step": 7720, "time_per_iteration": 2.618046760559082 }, { "auxiliary_loss_clip": 0.01221729, "auxiliary_loss_mlp": 0.01026867, "balance_loss_clip": 1.04712105, "balance_loss_mlp": 1.01993823, "epoch": 0.9283953586244214, "flos": 25556834476800.0, "grad_norm": 3.4683001619516443, "language_loss": 0.80351269, "learning_rate": 5.346538652060939e-08, "loss": 0.8259986, "num_input_tokens_seen": 166891150, "step": 7721, "time_per_iteration": 2.670834541320801 }, { "auxiliary_loss_clip": 0.01269872, "auxiliary_loss_mlp": 0.01025931, "balance_loss_clip": 1.04778016, "balance_loss_mlp": 1.01957107, "epoch": 0.9285156015150604, "flos": 18223480869120.0, "grad_norm": 2.0176861073747885, "language_loss": 0.70098865, "learning_rate": 5.3286623611705994e-08, "loss": 0.72394663, "num_input_tokens_seen": 166909195, "step": 7722, "time_per_iteration": 2.6197986602783203 }, { "auxiliary_loss_clip": 0.01059431, "auxiliary_loss_mlp": 0.01002249, "balance_loss_clip": 1.00563264, "balance_loss_mlp": 1.00140905, "epoch": 0.9286358444056995, "flos": 66400017690240.0, "grad_norm": 0.8203169183894441, "language_loss": 0.60614705, "learning_rate": 5.3108156017673824e-08, "loss": 0.62676388, "num_input_tokens_seen": 166970955, "step": 7723, "time_per_iteration": 4.4203643798828125 }, { "auxiliary_loss_clip": 0.01177611, "auxiliary_loss_mlp": 0.01029189, "balance_loss_clip": 1.04798973, "balance_loss_mlp": 1.02184546, "epoch": 0.9287560872963386, "flos": 22345630594560.0, "grad_norm": 1.7878314897411958, "language_loss": 0.71651495, "learning_rate": 5.2929983765586775e-08, "loss": 0.73858297, "num_input_tokens_seen": 166989735, "step": 7724, "time_per_iteration": 3.6032068729400635 }, { "auxiliary_loss_clip": 0.01171192, "auxiliary_loss_mlp": 0.01022495, "balance_loss_clip": 1.04838634, "balance_loss_mlp": 1.01603115, "epoch": 0.9288763301869777, "flos": 25700225569920.0, "grad_norm": 1.7861405759556255, "language_loss": 0.6268549, "learning_rate": 5.275210688247278e-08, "loss": 0.64879179, "num_input_tokens_seen": 167010060, "step": 7725, "time_per_iteration": 2.586944341659546 }, { "auxiliary_loss_clip": 0.01374866, "auxiliary_loss_mlp": 0.01024251, "balance_loss_clip": 1.04728913, "balance_loss_mlp": 1.01756907, "epoch": 0.9289965730776167, "flos": 12312046028160.0, "grad_norm": 1.9533391621767997, "language_loss": 0.85105193, "learning_rate": 5.257452539531604e-08, "loss": 0.87504315, "num_input_tokens_seen": 167027130, "step": 7726, "time_per_iteration": 2.7508585453033447 }, { "auxiliary_loss_clip": 0.01222091, "auxiliary_loss_mlp": 0.01029814, "balance_loss_clip": 1.04517078, "balance_loss_mlp": 1.02309024, "epoch": 0.9291168159682559, "flos": 26685973486080.0, "grad_norm": 1.7414692614400629, "language_loss": 0.68328297, "learning_rate": 5.2397239331055445e-08, "loss": 0.70580202, "num_input_tokens_seen": 167049130, "step": 7727, "time_per_iteration": 3.636845588684082 }, { "auxiliary_loss_clip": 0.01264761, "auxiliary_loss_mlp": 0.01024482, "balance_loss_clip": 1.04727006, "balance_loss_mlp": 1.01733851, "epoch": 0.929237058858895, "flos": 14538256179840.0, "grad_norm": 2.1403804069176946, "language_loss": 0.80863726, "learning_rate": 5.2220248716585036e-08, "loss": 0.83152974, "num_input_tokens_seen": 167066810, "step": 7728, "time_per_iteration": 2.662332057952881 }, { "auxiliary_loss_clip": 0.01217327, "auxiliary_loss_mlp": 0.01025512, "balance_loss_clip": 1.04424238, "balance_loss_mlp": 1.01892579, "epoch": 0.929357301749534, "flos": 23835456023040.0, "grad_norm": 2.0937526968173863, "language_loss": 0.75596941, "learning_rate": 5.204355357875445e-08, "loss": 0.7783978, "num_input_tokens_seen": 167085155, "step": 7729, "time_per_iteration": 2.675457239151001 }, { "auxiliary_loss_clip": 0.01273035, "auxiliary_loss_mlp": 0.01030506, "balance_loss_clip": 1.04443431, "balance_loss_mlp": 1.02364004, "epoch": 0.9294775446401732, "flos": 12969319046400.0, "grad_norm": 2.2008735155387003, "language_loss": 0.70542848, "learning_rate": 5.1867153944367584e-08, "loss": 0.72846383, "num_input_tokens_seen": 167101545, "step": 7730, "time_per_iteration": 2.610868215560913 }, { "auxiliary_loss_clip": 0.01325017, "auxiliary_loss_mlp": 0.01025047, "balance_loss_clip": 1.04429531, "balance_loss_mlp": 1.01812053, "epoch": 0.9295977875308122, "flos": 26211809024640.0, "grad_norm": 1.7334318232814747, "language_loss": 0.7362836, "learning_rate": 5.16910498401848e-08, "loss": 0.75978422, "num_input_tokens_seen": 167120995, "step": 7731, "time_per_iteration": 2.771665334701538 }, { "auxiliary_loss_clip": 0.01172869, "auxiliary_loss_mlp": 0.01021556, "balance_loss_clip": 1.05067849, "balance_loss_mlp": 1.01549745, "epoch": 0.9297180304214513, "flos": 16472297105280.0, "grad_norm": 2.0434319159245296, "language_loss": 0.83635128, "learning_rate": 5.151524129292073e-08, "loss": 0.85829556, "num_input_tokens_seen": 167138890, "step": 7732, "time_per_iteration": 3.506770133972168 }, { "auxiliary_loss_clip": 0.01222977, "auxiliary_loss_mlp": 0.0102732, "balance_loss_clip": 1.04901648, "balance_loss_mlp": 1.02022147, "epoch": 0.9298382733120905, "flos": 24060436859520.0, "grad_norm": 2.0872207667755673, "language_loss": 0.66803205, "learning_rate": 5.1339728329245155e-08, "loss": 0.69053507, "num_input_tokens_seen": 167159455, "step": 7733, "time_per_iteration": 2.6235122680664062 }, { "auxiliary_loss_clip": 0.01177897, "auxiliary_loss_mlp": 0.01024902, "balance_loss_clip": 1.04924893, "balance_loss_mlp": 1.01811862, "epoch": 0.9299585162027295, "flos": 22127652910080.0, "grad_norm": 2.2172447950387744, "language_loss": 0.78790748, "learning_rate": 5.116451097578367e-08, "loss": 0.80993557, "num_input_tokens_seen": 167178495, "step": 7734, "time_per_iteration": 2.6340863704681396 }, { "auxiliary_loss_clip": 0.01321402, "auxiliary_loss_mlp": 0.01024865, "balance_loss_clip": 1.04301131, "balance_loss_mlp": 1.01831102, "epoch": 0.9300787590933686, "flos": 21471780522240.0, "grad_norm": 2.347249812006517, "language_loss": 0.74301648, "learning_rate": 5.0989589259115895e-08, "loss": 0.76647919, "num_input_tokens_seen": 167199380, "step": 7735, "time_per_iteration": 2.687514066696167 }, { "auxiliary_loss_clip": 0.01220767, "auxiliary_loss_mlp": 0.01032752, "balance_loss_clip": 1.04486895, "balance_loss_mlp": 1.02471697, "epoch": 0.9301990019840077, "flos": 17779588594560.0, "grad_norm": 2.208767087641274, "language_loss": 0.71731472, "learning_rate": 5.081496320577816e-08, "loss": 0.73984993, "num_input_tokens_seen": 167216500, "step": 7736, "time_per_iteration": 2.640324831008911 }, { "auxiliary_loss_clip": 0.01181964, "auxiliary_loss_mlp": 0.0099898, "balance_loss_clip": 1.02005267, "balance_loss_mlp": 0.99801409, "epoch": 0.9303192448746468, "flos": 58896122307840.0, "grad_norm": 0.9186404950393257, "language_loss": 0.6118896, "learning_rate": 5.0640632842260835e-08, "loss": 0.63369906, "num_input_tokens_seen": 167276760, "step": 7737, "time_per_iteration": 3.3043155670166016 }, { "auxiliary_loss_clip": 0.01316622, "auxiliary_loss_mlp": 0.02563536, "balance_loss_clip": 1.04582858, "balance_loss_mlp": 0.99990976, "epoch": 0.9304394877652858, "flos": 57663522172800.0, "grad_norm": 1.572472575035379, "language_loss": 0.72719574, "learning_rate": 5.0466598195009426e-08, "loss": 0.76599729, "num_input_tokens_seen": 167303630, "step": 7738, "time_per_iteration": 3.0005831718444824 }, { "auxiliary_loss_clip": 0.01220606, "auxiliary_loss_mlp": 0.01021906, "balance_loss_clip": 1.04490709, "balance_loss_mlp": 1.01566827, "epoch": 0.930559730655925, "flos": 20996143603200.0, "grad_norm": 1.9282547649349842, "language_loss": 0.70181489, "learning_rate": 5.0292859290425036e-08, "loss": 0.72424001, "num_input_tokens_seen": 167321500, "step": 7739, "time_per_iteration": 2.6587905883789062 }, { "auxiliary_loss_clip": 0.01172124, "auxiliary_loss_mlp": 0.01025277, "balance_loss_clip": 1.04951644, "balance_loss_mlp": 1.01901817, "epoch": 0.9306799735465641, "flos": 23258264376960.0, "grad_norm": 1.9860508867025712, "language_loss": 0.77873456, "learning_rate": 5.011941615486348e-08, "loss": 0.80070853, "num_input_tokens_seen": 167340615, "step": 7740, "time_per_iteration": 2.6158289909362793 }, { "auxiliary_loss_clip": 0.01171406, "auxiliary_loss_mlp": 0.01022432, "balance_loss_clip": 1.04799473, "balance_loss_mlp": 1.01613522, "epoch": 0.9308002164372031, "flos": 15231547560960.0, "grad_norm": 2.038649066424119, "language_loss": 0.84596819, "learning_rate": 4.994626881463659e-08, "loss": 0.86790657, "num_input_tokens_seen": 167356870, "step": 7741, "time_per_iteration": 2.552450656890869 }, { "auxiliary_loss_clip": 0.01360612, "auxiliary_loss_mlp": 0.01023171, "balance_loss_clip": 1.04058254, "balance_loss_mlp": 1.016361, "epoch": 0.9309204593278423, "flos": 30847481539200.0, "grad_norm": 4.693994822726858, "language_loss": 0.71296227, "learning_rate": 4.9773417296009814e-08, "loss": 0.73680013, "num_input_tokens_seen": 167378390, "step": 7742, "time_per_iteration": 2.858205556869507 }, { "auxiliary_loss_clip": 0.01228929, "auxiliary_loss_mlp": 0.01028108, "balance_loss_clip": 1.0490495, "balance_loss_mlp": 1.02082455, "epoch": 0.9310407022184813, "flos": 23037269950080.0, "grad_norm": 1.71489651736961, "language_loss": 0.65754515, "learning_rate": 4.960086162520527e-08, "loss": 0.68011552, "num_input_tokens_seen": 167398480, "step": 7743, "time_per_iteration": 2.693136215209961 }, { "auxiliary_loss_clip": 0.01274115, "auxiliary_loss_mlp": 0.01025491, "balance_loss_clip": 1.04295015, "balance_loss_mlp": 1.01898229, "epoch": 0.9311609451091204, "flos": 22127976132480.0, "grad_norm": 1.9201485035042742, "language_loss": 0.82350516, "learning_rate": 4.942860182839936e-08, "loss": 0.84650123, "num_input_tokens_seen": 167416825, "step": 7744, "time_per_iteration": 2.7034459114074707 }, { "auxiliary_loss_clip": 0.01271115, "auxiliary_loss_mlp": 0.01026163, "balance_loss_clip": 1.04569507, "balance_loss_mlp": 1.01926708, "epoch": 0.9312811879997596, "flos": 21099206701440.0, "grad_norm": 2.0036338772841704, "language_loss": 0.79605103, "learning_rate": 4.925663793172341e-08, "loss": 0.81902385, "num_input_tokens_seen": 167434785, "step": 7745, "time_per_iteration": 2.6840360164642334 }, { "auxiliary_loss_clip": 0.01161725, "auxiliary_loss_mlp": 0.02504905, "balance_loss_clip": 1.00712192, "balance_loss_mlp": 0.99985296, "epoch": 0.9314014308903986, "flos": 67148179096320.0, "grad_norm": 0.7842745074283981, "language_loss": 0.5652535, "learning_rate": 4.908496996126477e-08, "loss": 0.60191977, "num_input_tokens_seen": 167498245, "step": 7746, "time_per_iteration": 3.2649669647216797 }, { "auxiliary_loss_clip": 0.01232872, "auxiliary_loss_mlp": 0.01024979, "balance_loss_clip": 1.05675828, "balance_loss_mlp": 1.01802611, "epoch": 0.9315216737810377, "flos": 22565583527040.0, "grad_norm": 2.340446922808765, "language_loss": 0.76372683, "learning_rate": 4.89135979430646e-08, "loss": 0.78630531, "num_input_tokens_seen": 167518290, "step": 7747, "time_per_iteration": 2.6783008575439453 }, { "auxiliary_loss_clip": 0.01174334, "auxiliary_loss_mlp": 0.0103069, "balance_loss_clip": 1.05075479, "balance_loss_mlp": 1.02345133, "epoch": 0.9316419166716768, "flos": 23984054588160.0, "grad_norm": 4.125410637833837, "language_loss": 0.85510969, "learning_rate": 4.874252190312078e-08, "loss": 0.87715995, "num_input_tokens_seen": 167538675, "step": 7748, "time_per_iteration": 3.5444211959838867 }, { "auxiliary_loss_clip": 0.0112378, "auxiliary_loss_mlp": 0.01028255, "balance_loss_clip": 1.0460248, "balance_loss_mlp": 1.02099264, "epoch": 0.9317621595623159, "flos": 30230464688640.0, "grad_norm": 1.9517312534852662, "language_loss": 0.65004373, "learning_rate": 4.857174186738477e-08, "loss": 0.6715641, "num_input_tokens_seen": 167562025, "step": 7749, "time_per_iteration": 2.729975700378418 }, { "auxiliary_loss_clip": 0.01175479, "auxiliary_loss_mlp": 0.01020914, "balance_loss_clip": 1.05149364, "balance_loss_mlp": 1.01403892, "epoch": 0.931882402452955, "flos": 15742735966080.0, "grad_norm": 4.700425355625792, "language_loss": 0.73036432, "learning_rate": 4.840125786176408e-08, "loss": 0.75232828, "num_input_tokens_seen": 167578230, "step": 7750, "time_per_iteration": 3.4785571098327637 }, { "auxiliary_loss_clip": 0.01272199, "auxiliary_loss_mlp": 0.01029593, "balance_loss_clip": 1.04703259, "balance_loss_mlp": 1.02240777, "epoch": 0.932002645343594, "flos": 28366521154560.0, "grad_norm": 1.8225008428527856, "language_loss": 0.77340567, "learning_rate": 4.823106991212067e-08, "loss": 0.79642361, "num_input_tokens_seen": 167597470, "step": 7751, "time_per_iteration": 2.6980156898498535 }, { "auxiliary_loss_clip": 0.012234, "auxiliary_loss_mlp": 0.01021336, "balance_loss_clip": 1.04703736, "balance_loss_mlp": 1.0146544, "epoch": 0.9321228882342332, "flos": 15341146934400.0, "grad_norm": 2.120786092795345, "language_loss": 0.83541811, "learning_rate": 4.806117804427212e-08, "loss": 0.85786545, "num_input_tokens_seen": 167615405, "step": 7752, "time_per_iteration": 2.644279718399048 }, { "auxiliary_loss_clip": 0.01220847, "auxiliary_loss_mlp": 0.01021093, "balance_loss_clip": 1.04563701, "balance_loss_mlp": 1.01469469, "epoch": 0.9322431311248722, "flos": 17895365107200.0, "grad_norm": 1.903726425000934, "language_loss": 0.64602953, "learning_rate": 4.7891582283990926e-08, "loss": 0.66844893, "num_input_tokens_seen": 167634130, "step": 7753, "time_per_iteration": 3.539407968521118 }, { "auxiliary_loss_clip": 0.01218693, "auxiliary_loss_mlp": 0.01027567, "balance_loss_clip": 1.04266763, "balance_loss_mlp": 1.02107942, "epoch": 0.9323633740155113, "flos": 24169713010560.0, "grad_norm": 1.6476462940931886, "language_loss": 0.72739983, "learning_rate": 4.772228265700473e-08, "loss": 0.74986249, "num_input_tokens_seen": 167654990, "step": 7754, "time_per_iteration": 2.7077977657318115 }, { "auxiliary_loss_clip": 0.01227755, "auxiliary_loss_mlp": 0.01021379, "balance_loss_clip": 1.04868281, "balance_loss_mlp": 1.01457834, "epoch": 0.9324836169061504, "flos": 15043482927360.0, "grad_norm": 2.317424078870103, "language_loss": 0.75901079, "learning_rate": 4.75532791889961e-08, "loss": 0.78150213, "num_input_tokens_seen": 167671690, "step": 7755, "time_per_iteration": 2.591948986053467 }, { "auxiliary_loss_clip": 0.01221653, "auxiliary_loss_mlp": 0.01024032, "balance_loss_clip": 1.04488659, "balance_loss_mlp": 1.0169661, "epoch": 0.9326038597967895, "flos": 18624890332800.0, "grad_norm": 1.8911021387323868, "language_loss": 0.65782601, "learning_rate": 4.738457190560252e-08, "loss": 0.68028289, "num_input_tokens_seen": 167690800, "step": 7756, "time_per_iteration": 2.7287280559539795 }, { "auxiliary_loss_clip": 0.01367111, "auxiliary_loss_mlp": 0.01023799, "balance_loss_clip": 1.0470264, "balance_loss_mlp": 1.01704264, "epoch": 0.9327241026874286, "flos": 18952646958720.0, "grad_norm": 2.362074826983453, "language_loss": 0.79086751, "learning_rate": 4.721616083241664e-08, "loss": 0.8147766, "num_input_tokens_seen": 167709055, "step": 7757, "time_per_iteration": 3.462134838104248 }, { "auxiliary_loss_clip": 0.0122002, "auxiliary_loss_mlp": 0.01027038, "balance_loss_clip": 1.04602396, "balance_loss_mlp": 1.02006745, "epoch": 0.9328443455780677, "flos": 29570282668800.0, "grad_norm": 3.0052127375951327, "language_loss": 0.77693874, "learning_rate": 4.7048045994986684e-08, "loss": 0.79940933, "num_input_tokens_seen": 167729915, "step": 7758, "time_per_iteration": 2.593822956085205 }, { "auxiliary_loss_clip": 0.0112722, "auxiliary_loss_mlp": 0.01024469, "balance_loss_clip": 1.04851532, "balance_loss_mlp": 1.01761734, "epoch": 0.9329645884687068, "flos": 30081722469120.0, "grad_norm": 2.0386099466548244, "language_loss": 0.91214001, "learning_rate": 4.688022741881559e-08, "loss": 0.93365693, "num_input_tokens_seen": 167750440, "step": 7759, "time_per_iteration": 2.599095582962036 }, { "auxiliary_loss_clip": 0.01218986, "auxiliary_loss_mlp": 0.01022354, "balance_loss_clip": 1.0469023, "balance_loss_mlp": 1.01578259, "epoch": 0.9330848313593458, "flos": 21867982513920.0, "grad_norm": 3.355303747181579, "language_loss": 0.7510674, "learning_rate": 4.671270512936076e-08, "loss": 0.77348077, "num_input_tokens_seen": 167769600, "step": 7760, "time_per_iteration": 2.4895782470703125 }, { "auxiliary_loss_clip": 0.01317739, "auxiliary_loss_mlp": 0.01021269, "balance_loss_clip": 1.04397714, "balance_loss_mlp": 1.01508474, "epoch": 0.933205074249985, "flos": 22127221946880.0, "grad_norm": 1.8623749035013923, "language_loss": 0.82931304, "learning_rate": 4.6545479152035884e-08, "loss": 0.85270309, "num_input_tokens_seen": 167788770, "step": 7761, "time_per_iteration": 2.703263521194458 }, { "auxiliary_loss_clip": 0.01224012, "auxiliary_loss_mlp": 0.01023426, "balance_loss_clip": 1.04825139, "balance_loss_mlp": 1.01714015, "epoch": 0.9333253171406241, "flos": 15341254675200.0, "grad_norm": 1.956444068680892, "language_loss": 0.76125681, "learning_rate": 4.637854951220821e-08, "loss": 0.78373116, "num_input_tokens_seen": 167805555, "step": 7762, "time_per_iteration": 2.5428106784820557 }, { "auxiliary_loss_clip": 0.0131546, "auxiliary_loss_mlp": 0.01028576, "balance_loss_clip": 1.04364395, "balance_loss_mlp": 1.02182293, "epoch": 0.9334455600312631, "flos": 15706142985600.0, "grad_norm": 1.9025363427225546, "language_loss": 0.74947953, "learning_rate": 4.621191623520171e-08, "loss": 0.77291989, "num_input_tokens_seen": 167823985, "step": 7763, "time_per_iteration": 2.696183204650879 }, { "auxiliary_loss_clip": 0.01331765, "auxiliary_loss_mlp": 0.01027887, "balance_loss_clip": 1.04385591, "balance_loss_mlp": 1.02062094, "epoch": 0.9335658029219023, "flos": 22163563532160.0, "grad_norm": 2.3644031001433743, "language_loss": 0.85183918, "learning_rate": 4.604557934629372e-08, "loss": 0.87543571, "num_input_tokens_seen": 167843060, "step": 7764, "time_per_iteration": 2.9611563682556152 }, { "auxiliary_loss_clip": 0.01265356, "auxiliary_loss_mlp": 0.01024153, "balance_loss_clip": 1.0466764, "balance_loss_mlp": 1.01765001, "epoch": 0.9336860458125413, "flos": 20266833859200.0, "grad_norm": 1.8282747930680068, "language_loss": 0.80487716, "learning_rate": 4.587953887071805e-08, "loss": 0.82777226, "num_input_tokens_seen": 167862880, "step": 7765, "time_per_iteration": 2.7727267742156982 }, { "auxiliary_loss_clip": 0.01264855, "auxiliary_loss_mlp": 0.01027334, "balance_loss_clip": 1.04190874, "balance_loss_mlp": 1.02044058, "epoch": 0.9338062887031804, "flos": 20919689504640.0, "grad_norm": 9.51625757219278, "language_loss": 0.86189389, "learning_rate": 4.5713794833662554e-08, "loss": 0.88481581, "num_input_tokens_seen": 167882095, "step": 7766, "time_per_iteration": 2.6742160320281982 }, { "auxiliary_loss_clip": 0.01173065, "auxiliary_loss_mlp": 0.01025121, "balance_loss_clip": 1.04780936, "balance_loss_mlp": 1.01788461, "epoch": 0.9339265315938196, "flos": 23221635482880.0, "grad_norm": 1.7856242478746225, "language_loss": 0.63259321, "learning_rate": 4.5548347260270236e-08, "loss": 0.65457511, "num_input_tokens_seen": 167901385, "step": 7767, "time_per_iteration": 2.5892162322998047 }, { "auxiliary_loss_clip": 0.01315702, "auxiliary_loss_mlp": 0.0102623, "balance_loss_clip": 1.04260027, "balance_loss_mlp": 1.01901531, "epoch": 0.9340467744844586, "flos": 22820261932800.0, "grad_norm": 2.5837787685161073, "language_loss": 0.69617593, "learning_rate": 4.538319617564012e-08, "loss": 0.71959531, "num_input_tokens_seen": 167920405, "step": 7768, "time_per_iteration": 2.7438905239105225 }, { "auxiliary_loss_clip": 0.01272277, "auxiliary_loss_mlp": 0.01025153, "balance_loss_clip": 1.04426765, "balance_loss_mlp": 1.01836097, "epoch": 0.9341670173750977, "flos": 23660428026240.0, "grad_norm": 2.1079067109164455, "language_loss": 0.74963152, "learning_rate": 4.521834160482485e-08, "loss": 0.77260584, "num_input_tokens_seen": 167939145, "step": 7769, "time_per_iteration": 2.6656606197357178 }, { "auxiliary_loss_clip": 0.01225618, "auxiliary_loss_mlp": 0.01025201, "balance_loss_clip": 1.04831147, "balance_loss_mlp": 1.01847172, "epoch": 0.9342872602657368, "flos": 24824256595200.0, "grad_norm": 2.5302708216246845, "language_loss": 0.82187676, "learning_rate": 4.5053783572832846e-08, "loss": 0.84438497, "num_input_tokens_seen": 167959325, "step": 7770, "time_per_iteration": 2.726696491241455 }, { "auxiliary_loss_clip": 0.01218898, "auxiliary_loss_mlp": 0.01024147, "balance_loss_clip": 1.0473696, "balance_loss_mlp": 1.01740551, "epoch": 0.9344075031563759, "flos": 25771831332480.0, "grad_norm": 1.782350678596645, "language_loss": 0.76269865, "learning_rate": 4.488952210462771e-08, "loss": 0.78512907, "num_input_tokens_seen": 167979530, "step": 7771, "time_per_iteration": 2.6394431591033936 }, { "auxiliary_loss_clip": 0.01171833, "auxiliary_loss_mlp": 0.0102589, "balance_loss_clip": 1.04935408, "balance_loss_mlp": 1.01884174, "epoch": 0.9345277460470149, "flos": 25551303782400.0, "grad_norm": 2.0924657086736986, "language_loss": 0.85650635, "learning_rate": 4.4725557225127495e-08, "loss": 0.87848365, "num_input_tokens_seen": 167997870, "step": 7772, "time_per_iteration": 2.6760668754577637 }, { "auxiliary_loss_clip": 0.01221689, "auxiliary_loss_mlp": 0.01028934, "balance_loss_clip": 1.04760158, "balance_loss_mlp": 1.0225563, "epoch": 0.9346479889376541, "flos": 34313112432000.0, "grad_norm": 1.7150973678870693, "language_loss": 0.79342103, "learning_rate": 4.456188895920565e-08, "loss": 0.81592727, "num_input_tokens_seen": 168019625, "step": 7773, "time_per_iteration": 2.69814395904541 }, { "auxiliary_loss_clip": 0.01173158, "auxiliary_loss_mlp": 0.01026089, "balance_loss_clip": 1.04935205, "balance_loss_mlp": 1.01902556, "epoch": 0.9347682318282932, "flos": 19093739581440.0, "grad_norm": 2.3605814368180456, "language_loss": 0.85379821, "learning_rate": 4.439851733169031e-08, "loss": 0.8757906, "num_input_tokens_seen": 168037415, "step": 7774, "time_per_iteration": 3.4479074478149414 }, { "auxiliary_loss_clip": 0.01318068, "auxiliary_loss_mlp": 0.01021314, "balance_loss_clip": 1.04437232, "balance_loss_mlp": 1.01476932, "epoch": 0.9348884747189322, "flos": 26249587153920.0, "grad_norm": 2.4016968483513974, "language_loss": 0.69679999, "learning_rate": 4.4235442367365204e-08, "loss": 0.7201938, "num_input_tokens_seen": 168057725, "step": 7775, "time_per_iteration": 3.7420363426208496 }, { "auxiliary_loss_clip": 0.01271751, "auxiliary_loss_mlp": 0.01023846, "balance_loss_clip": 1.04252326, "balance_loss_mlp": 1.01707458, "epoch": 0.9350087176095714, "flos": 18333080242560.0, "grad_norm": 2.5820126061632265, "language_loss": 0.79612839, "learning_rate": 4.4072664090968545e-08, "loss": 0.81908435, "num_input_tokens_seen": 168076110, "step": 7776, "time_per_iteration": 2.667287826538086 }, { "auxiliary_loss_clip": 0.01273926, "auxiliary_loss_mlp": 0.01023685, "balance_loss_clip": 1.04359221, "balance_loss_mlp": 1.01661313, "epoch": 0.9351289605002104, "flos": 19318253541120.0, "grad_norm": 1.8720929691893313, "language_loss": 0.84638, "learning_rate": 4.391018252719347e-08, "loss": 0.8693561, "num_input_tokens_seen": 168095905, "step": 7777, "time_per_iteration": 2.6974427700042725 }, { "auxiliary_loss_clip": 0.01275633, "auxiliary_loss_mlp": 0.01022656, "balance_loss_clip": 1.04490352, "balance_loss_mlp": 1.01567292, "epoch": 0.9352492033908495, "flos": 18799990156800.0, "grad_norm": 1.6876943433302813, "language_loss": 0.69254941, "learning_rate": 4.374799770068849e-08, "loss": 0.7155323, "num_input_tokens_seen": 168112580, "step": 7778, "time_per_iteration": 2.618621826171875 }, { "auxiliary_loss_clip": 0.01218463, "auxiliary_loss_mlp": 0.0102284, "balance_loss_clip": 1.04731607, "balance_loss_mlp": 1.0155952, "epoch": 0.9353694462814887, "flos": 29530134241920.0, "grad_norm": 2.5105608486700133, "language_loss": 0.74953413, "learning_rate": 4.358610963605658e-08, "loss": 0.77194715, "num_input_tokens_seen": 168133030, "step": 7779, "time_per_iteration": 3.5370163917541504 }, { "auxiliary_loss_clip": 0.0117651, "auxiliary_loss_mlp": 0.0102555, "balance_loss_clip": 1.04999924, "balance_loss_mlp": 1.01869512, "epoch": 0.9354896891721277, "flos": 30665450390400.0, "grad_norm": 4.187774208764383, "language_loss": 0.68856889, "learning_rate": 4.342451835785677e-08, "loss": 0.71058953, "num_input_tokens_seen": 168153940, "step": 7780, "time_per_iteration": 2.686305046081543 }, { "auxiliary_loss_clip": 0.01270228, "auxiliary_loss_mlp": 0.01027973, "balance_loss_clip": 1.04636955, "balance_loss_mlp": 1.02177715, "epoch": 0.9356099320627668, "flos": 19463907191040.0, "grad_norm": 3.0600037164696916, "language_loss": 0.75352693, "learning_rate": 4.3263223890601665e-08, "loss": 0.77650893, "num_input_tokens_seen": 168172650, "step": 7781, "time_per_iteration": 2.6157405376434326 }, { "auxiliary_loss_clip": 0.01219177, "auxiliary_loss_mlp": 0.02562922, "balance_loss_clip": 1.0495261, "balance_loss_mlp": 0.99989647, "epoch": 0.9357301749534058, "flos": 19098156954240.0, "grad_norm": 1.83192166448584, "language_loss": 0.7974298, "learning_rate": 4.31022262587597e-08, "loss": 0.83525085, "num_input_tokens_seen": 168191325, "step": 7782, "time_per_iteration": 2.709728479385376 }, { "auxiliary_loss_clip": 0.01224674, "auxiliary_loss_mlp": 0.01026235, "balance_loss_clip": 1.05092525, "balance_loss_mlp": 1.01882946, "epoch": 0.935850417844045, "flos": 23550361776000.0, "grad_norm": 1.6746965565345462, "language_loss": 0.6603688, "learning_rate": 4.2941525486754225e-08, "loss": 0.6828779, "num_input_tokens_seen": 168211645, "step": 7783, "time_per_iteration": 3.5759010314941406 }, { "auxiliary_loss_clip": 0.01316348, "auxiliary_loss_mlp": 0.01020384, "balance_loss_clip": 1.0461247, "balance_loss_mlp": 1.01427436, "epoch": 0.935970660734684, "flos": 18588333265920.0, "grad_norm": 2.250218890259956, "language_loss": 0.7964431, "learning_rate": 4.278112159896286e-08, "loss": 0.81981039, "num_input_tokens_seen": 168229485, "step": 7784, "time_per_iteration": 2.674241542816162 }, { "auxiliary_loss_clip": 0.01265212, "auxiliary_loss_mlp": 0.01022713, "balance_loss_clip": 1.04024446, "balance_loss_mlp": 1.01632059, "epoch": 0.9360909036253231, "flos": 20631255292800.0, "grad_norm": 1.6973182146331591, "language_loss": 0.67477763, "learning_rate": 4.2621014619719896e-08, "loss": 0.69765687, "num_input_tokens_seen": 168247250, "step": 7785, "time_per_iteration": 2.725257635116577 }, { "auxiliary_loss_clip": 0.01168919, "auxiliary_loss_mlp": 0.00999843, "balance_loss_clip": 1.00548077, "balance_loss_mlp": 0.99890101, "epoch": 0.9362111465159623, "flos": 61791421052160.0, "grad_norm": 0.720324846069768, "language_loss": 0.58613384, "learning_rate": 4.246120457331215e-08, "loss": 0.60782146, "num_input_tokens_seen": 168309425, "step": 7786, "time_per_iteration": 3.257882833480835 }, { "auxiliary_loss_clip": 0.01270294, "auxiliary_loss_mlp": 0.0102828, "balance_loss_clip": 1.04698217, "balance_loss_mlp": 1.02130651, "epoch": 0.9363313894066013, "flos": 24170395368960.0, "grad_norm": 1.9288290804178072, "language_loss": 0.72089636, "learning_rate": 4.2301691483983325e-08, "loss": 0.743882, "num_input_tokens_seen": 168329545, "step": 7787, "time_per_iteration": 2.695051670074463 }, { "auxiliary_loss_clip": 0.01226197, "auxiliary_loss_mlp": 0.01030472, "balance_loss_clip": 1.04839015, "balance_loss_mlp": 1.02353668, "epoch": 0.9364516322972404, "flos": 20120354196480.0, "grad_norm": 1.9632626053935698, "language_loss": 0.76130056, "learning_rate": 4.214247537593163e-08, "loss": 0.78386724, "num_input_tokens_seen": 168348795, "step": 7788, "time_per_iteration": 2.6557021141052246 }, { "auxiliary_loss_clip": 0.01272414, "auxiliary_loss_mlp": 0.01026128, "balance_loss_clip": 1.04378617, "balance_loss_mlp": 1.01988447, "epoch": 0.9365718751878795, "flos": 20703758895360.0, "grad_norm": 1.9265344570949134, "language_loss": 0.80387914, "learning_rate": 4.1983556273309293e-08, "loss": 0.82686454, "num_input_tokens_seen": 168367545, "step": 7789, "time_per_iteration": 2.6794092655181885 }, { "auxiliary_loss_clip": 0.01175808, "auxiliary_loss_mlp": 0.0102742, "balance_loss_clip": 1.0497725, "balance_loss_mlp": 1.02011228, "epoch": 0.9366921180785186, "flos": 18655270260480.0, "grad_norm": 2.6843477379406973, "language_loss": 0.68990958, "learning_rate": 4.182493420022526e-08, "loss": 0.71194178, "num_input_tokens_seen": 168383215, "step": 7790, "time_per_iteration": 2.5598158836364746 }, { "auxiliary_loss_clip": 0.01225401, "auxiliary_loss_mlp": 0.0102473, "balance_loss_clip": 1.04600644, "balance_loss_mlp": 1.01828051, "epoch": 0.9368123609691577, "flos": 25774955815680.0, "grad_norm": 3.7622765839679975, "language_loss": 0.78405237, "learning_rate": 4.166660918074139e-08, "loss": 0.80655366, "num_input_tokens_seen": 168403120, "step": 7791, "time_per_iteration": 2.7598819732666016 }, { "auxiliary_loss_clip": 0.01317939, "auxiliary_loss_mlp": 0.01021718, "balance_loss_clip": 1.04486144, "balance_loss_mlp": 1.01510441, "epoch": 0.9369326038597968, "flos": 25553386771200.0, "grad_norm": 1.4095714046721284, "language_loss": 0.73268402, "learning_rate": 4.15085812388758e-08, "loss": 0.75608063, "num_input_tokens_seen": 168425340, "step": 7792, "time_per_iteration": 2.709152936935425 }, { "auxiliary_loss_clip": 0.01269265, "auxiliary_loss_mlp": 0.01027831, "balance_loss_clip": 1.0458827, "balance_loss_mlp": 1.02075267, "epoch": 0.9370528467504359, "flos": 23220019370880.0, "grad_norm": 2.030050301832805, "language_loss": 0.78865612, "learning_rate": 4.135085039860153e-08, "loss": 0.81162709, "num_input_tokens_seen": 168444740, "step": 7793, "time_per_iteration": 2.705562114715576 }, { "auxiliary_loss_clip": 0.01266614, "auxiliary_loss_mlp": 0.01023462, "balance_loss_clip": 1.04692197, "balance_loss_mlp": 1.01596951, "epoch": 0.9371730896410749, "flos": 24967468120320.0, "grad_norm": 2.1559037928504803, "language_loss": 0.78582972, "learning_rate": 4.1193416683845906e-08, "loss": 0.80873042, "num_input_tokens_seen": 168463670, "step": 7794, "time_per_iteration": 2.7035701274871826 }, { "auxiliary_loss_clip": 0.01319772, "auxiliary_loss_mlp": 0.01024646, "balance_loss_clip": 1.04557753, "balance_loss_mlp": 1.0184145, "epoch": 0.9372933325317141, "flos": 15553091134080.0, "grad_norm": 4.1954482502577, "language_loss": 0.83661544, "learning_rate": 4.103628011849136e-08, "loss": 0.86005968, "num_input_tokens_seen": 168479030, "step": 7795, "time_per_iteration": 2.6687703132629395 }, { "auxiliary_loss_clip": 0.01274392, "auxiliary_loss_mlp": 0.01024992, "balance_loss_clip": 1.04718637, "balance_loss_mlp": 1.01754773, "epoch": 0.9374135754223532, "flos": 21871861182720.0, "grad_norm": 1.9418738709170267, "language_loss": 0.75851369, "learning_rate": 4.0879440726375506e-08, "loss": 0.78150755, "num_input_tokens_seen": 168496815, "step": 7796, "time_per_iteration": 2.727123737335205 }, { "auxiliary_loss_clip": 0.01271665, "auxiliary_loss_mlp": 0.01025332, "balance_loss_clip": 1.04327416, "balance_loss_mlp": 1.01830173, "epoch": 0.9375338183129922, "flos": 22631048064000.0, "grad_norm": 2.6464163581679965, "language_loss": 0.56367534, "learning_rate": 4.0722898531291074e-08, "loss": 0.58664531, "num_input_tokens_seen": 168514055, "step": 7797, "time_per_iteration": 2.7609939575195312 }, { "auxiliary_loss_clip": 0.01276988, "auxiliary_loss_mlp": 0.0102868, "balance_loss_clip": 1.0477314, "balance_loss_mlp": 1.02160478, "epoch": 0.9376540612036314, "flos": 26104292640000.0, "grad_norm": 1.8614383585557865, "language_loss": 0.7693063, "learning_rate": 4.0566653556985295e-08, "loss": 0.79236293, "num_input_tokens_seen": 168534600, "step": 7798, "time_per_iteration": 2.9020912647247314 }, { "auxiliary_loss_clip": 0.01470398, "auxiliary_loss_mlp": 0.01025951, "balance_loss_clip": 1.04192376, "balance_loss_mlp": 1.01869702, "epoch": 0.9377743040942704, "flos": 19717580016000.0, "grad_norm": 3.5236179013408884, "language_loss": 0.81386608, "learning_rate": 4.0410705827159886e-08, "loss": 0.83882958, "num_input_tokens_seen": 168551895, "step": 7799, "time_per_iteration": 3.9069552421569824 }, { "auxiliary_loss_clip": 0.0127221, "auxiliary_loss_mlp": 0.01023203, "balance_loss_clip": 1.04274273, "balance_loss_mlp": 1.01598477, "epoch": 0.9378945469849095, "flos": 15267530010240.0, "grad_norm": 3.0783342080545824, "language_loss": 0.71168971, "learning_rate": 4.0255055365472356e-08, "loss": 0.73464382, "num_input_tokens_seen": 168569990, "step": 7800, "time_per_iteration": 3.121744394302368 }, { "auxiliary_loss_clip": 0.01420758, "auxiliary_loss_mlp": 0.01030277, "balance_loss_clip": 1.03855026, "balance_loss_mlp": 1.02357149, "epoch": 0.9380147898755486, "flos": 20591394174720.0, "grad_norm": 3.03388606472478, "language_loss": 0.75337321, "learning_rate": 4.009970219553471e-08, "loss": 0.77788359, "num_input_tokens_seen": 168586940, "step": 7801, "time_per_iteration": 3.9288625717163086 }, { "auxiliary_loss_clip": 0.01226171, "auxiliary_loss_mlp": 0.01021736, "balance_loss_clip": 1.04858053, "balance_loss_mlp": 1.01460111, "epoch": 0.9381350327661877, "flos": 26281116316800.0, "grad_norm": 3.2015040629572686, "language_loss": 0.76808023, "learning_rate": 3.99446463409141e-08, "loss": 0.79055929, "num_input_tokens_seen": 168604795, "step": 7802, "time_per_iteration": 2.6398537158966064 }, { "auxiliary_loss_clip": 0.01229548, "auxiliary_loss_mlp": 0.01028964, "balance_loss_clip": 1.04737353, "balance_loss_mlp": 1.02187085, "epoch": 0.9382552756568268, "flos": 23586344225280.0, "grad_norm": 2.8047452198016125, "language_loss": 0.6903255, "learning_rate": 3.978988782513215e-08, "loss": 0.71291065, "num_input_tokens_seen": 168622290, "step": 7803, "time_per_iteration": 2.652686834335327 }, { "auxiliary_loss_clip": 0.01226201, "auxiliary_loss_mlp": 0.01028079, "balance_loss_clip": 1.04719257, "balance_loss_mlp": 1.02131987, "epoch": 0.9383755185474659, "flos": 28438809275520.0, "grad_norm": 1.799681124036644, "language_loss": 0.76447511, "learning_rate": 3.963542667166586e-08, "loss": 0.78701788, "num_input_tokens_seen": 168642395, "step": 7804, "time_per_iteration": 3.621711492538452 }, { "auxiliary_loss_clip": 0.01219703, "auxiliary_loss_mlp": 0.01024717, "balance_loss_clip": 1.04652643, "balance_loss_mlp": 1.01805639, "epoch": 0.938495761438105, "flos": 20449583280000.0, "grad_norm": 6.104763460693421, "language_loss": 0.68092996, "learning_rate": 3.9481262903946486e-08, "loss": 0.70337415, "num_input_tokens_seen": 168661840, "step": 7805, "time_per_iteration": 2.6802186965942383 }, { "auxiliary_loss_clip": 0.01280349, "auxiliary_loss_mlp": 0.00999109, "balance_loss_clip": 1.00625229, "balance_loss_mlp": 0.99820852, "epoch": 0.938616004328744, "flos": 69302711658240.0, "grad_norm": 0.7788715424616803, "language_loss": 0.54444706, "learning_rate": 3.932739654536066e-08, "loss": 0.56724161, "num_input_tokens_seen": 168724540, "step": 7806, "time_per_iteration": 3.2498064041137695 }, { "auxiliary_loss_clip": 0.01219561, "auxiliary_loss_mlp": 0.01022451, "balance_loss_clip": 1.04741836, "balance_loss_mlp": 1.01575398, "epoch": 0.9387362472193832, "flos": 18911636605440.0, "grad_norm": 2.265981059557103, "language_loss": 0.74418378, "learning_rate": 3.917382761925014e-08, "loss": 0.76660395, "num_input_tokens_seen": 168740375, "step": 7807, "time_per_iteration": 2.5700900554656982 }, { "auxiliary_loss_clip": 0.01216816, "auxiliary_loss_mlp": 0.01024341, "balance_loss_clip": 1.04724598, "balance_loss_mlp": 1.01764405, "epoch": 0.9388564901100223, "flos": 26501967089280.0, "grad_norm": 1.7986561298312898, "language_loss": 0.78901815, "learning_rate": 3.9020556148910754e-08, "loss": 0.81142974, "num_input_tokens_seen": 168759730, "step": 7808, "time_per_iteration": 2.6741905212402344 }, { "auxiliary_loss_clip": 0.01169776, "auxiliary_loss_mlp": 0.00997922, "balance_loss_clip": 1.00610459, "balance_loss_mlp": 0.99708128, "epoch": 0.9389767330006613, "flos": 58941083157120.0, "grad_norm": 0.7063008486423541, "language_loss": 0.56624949, "learning_rate": 3.8867582157593895e-08, "loss": 0.58792651, "num_input_tokens_seen": 168813935, "step": 7809, "time_per_iteration": 4.010875701904297 }, { "auxiliary_loss_clip": 0.01219182, "auxiliary_loss_mlp": 0.01022203, "balance_loss_clip": 1.0497309, "balance_loss_mlp": 1.0156523, "epoch": 0.9390969758913005, "flos": 31102554994560.0, "grad_norm": 2.4822216211766635, "language_loss": 0.76758981, "learning_rate": 3.871490566850544e-08, "loss": 0.79000378, "num_input_tokens_seen": 168838145, "step": 7810, "time_per_iteration": 2.728760004043579 }, { "auxiliary_loss_clip": 0.01269127, "auxiliary_loss_mlp": 0.01020167, "balance_loss_clip": 1.04534805, "balance_loss_mlp": 1.01340497, "epoch": 0.9392172187819395, "flos": 22419391173120.0, "grad_norm": 1.7776844108076166, "language_loss": 0.70651203, "learning_rate": 3.856252670480642e-08, "loss": 0.72940499, "num_input_tokens_seen": 168856805, "step": 7811, "time_per_iteration": 2.726381778717041 }, { "auxiliary_loss_clip": 0.01271443, "auxiliary_loss_mlp": 0.01022836, "balance_loss_clip": 1.04223049, "balance_loss_mlp": 1.01585054, "epoch": 0.9393374616725786, "flos": 19719483436800.0, "grad_norm": 2.1429204283715806, "language_loss": 0.81527936, "learning_rate": 3.841044528961279e-08, "loss": 0.83822221, "num_input_tokens_seen": 168874600, "step": 7812, "time_per_iteration": 2.626502752304077 }, { "auxiliary_loss_clip": 0.01172788, "auxiliary_loss_mlp": 0.0102625, "balance_loss_clip": 1.04717743, "balance_loss_mlp": 1.01870716, "epoch": 0.9394577045632178, "flos": 24170215800960.0, "grad_norm": 1.8067567426507496, "language_loss": 0.78877449, "learning_rate": 3.825866144599477e-08, "loss": 0.81076485, "num_input_tokens_seen": 168893655, "step": 7813, "time_per_iteration": 2.618201494216919 }, { "auxiliary_loss_clip": 0.01270754, "auxiliary_loss_mlp": 0.01025046, "balance_loss_clip": 1.04552758, "balance_loss_mlp": 1.01790559, "epoch": 0.9395779474538568, "flos": 19023929498880.0, "grad_norm": 2.367181984966003, "language_loss": 0.7547012, "learning_rate": 3.8107175196978145e-08, "loss": 0.77765918, "num_input_tokens_seen": 168909960, "step": 7814, "time_per_iteration": 2.6070165634155273 }, { "auxiliary_loss_clip": 0.01328002, "auxiliary_loss_mlp": 0.01025329, "balance_loss_clip": 1.04826546, "balance_loss_mlp": 1.01895475, "epoch": 0.9396981903444959, "flos": 14319129260160.0, "grad_norm": 2.3042926569760755, "language_loss": 0.77119911, "learning_rate": 3.7955986565542996e-08, "loss": 0.79473239, "num_input_tokens_seen": 168928040, "step": 7815, "time_per_iteration": 2.6926443576812744 }, { "auxiliary_loss_clip": 0.01319879, "auxiliary_loss_mlp": 0.01020181, "balance_loss_clip": 1.04310584, "balance_loss_mlp": 1.01343131, "epoch": 0.9398184332351349, "flos": 34787564202240.0, "grad_norm": 2.2395194249864976, "language_loss": 0.68358821, "learning_rate": 3.780509557462497e-08, "loss": 0.70698881, "num_input_tokens_seen": 168948240, "step": 7816, "time_per_iteration": 2.8270039558410645 }, { "auxiliary_loss_clip": 0.01270014, "auxiliary_loss_mlp": 0.01028195, "balance_loss_clip": 1.04338765, "balance_loss_mlp": 1.02107251, "epoch": 0.9399386761257741, "flos": 25372253462400.0, "grad_norm": 1.589681271638446, "language_loss": 0.75704533, "learning_rate": 3.765450224711375e-08, "loss": 0.78002739, "num_input_tokens_seen": 168968745, "step": 7817, "time_per_iteration": 2.634904623031616 }, { "auxiliary_loss_clip": 0.01272823, "auxiliary_loss_mlp": 0.0102, "balance_loss_clip": 1.04936361, "balance_loss_mlp": 1.01334786, "epoch": 0.9400589190164131, "flos": 27304965584640.0, "grad_norm": 2.1592043760598947, "language_loss": 0.80082047, "learning_rate": 3.750420660585396e-08, "loss": 0.82374871, "num_input_tokens_seen": 168990685, "step": 7818, "time_per_iteration": 2.65928316116333 }, { "auxiliary_loss_clip": 0.01171334, "auxiliary_loss_mlp": 0.01024482, "balance_loss_clip": 1.04975748, "balance_loss_mlp": 1.01773524, "epoch": 0.9401791619070522, "flos": 23399859790080.0, "grad_norm": 1.958116767396989, "language_loss": 0.80092776, "learning_rate": 3.735420867364603e-08, "loss": 0.82288587, "num_input_tokens_seen": 169011665, "step": 7819, "time_per_iteration": 2.587510347366333 }, { "auxiliary_loss_clip": 0.01415847, "auxiliary_loss_mlp": 0.01021489, "balance_loss_clip": 1.03825879, "balance_loss_mlp": 1.01539469, "epoch": 0.9402994047976914, "flos": 35881403120640.0, "grad_norm": 1.748219621314816, "language_loss": 0.61773652, "learning_rate": 3.7204508473244186e-08, "loss": 0.64210987, "num_input_tokens_seen": 169035290, "step": 7820, "time_per_iteration": 2.907794952392578 }, { "auxiliary_loss_clip": 0.01459272, "auxiliary_loss_mlp": 0.01028242, "balance_loss_clip": 1.03992414, "balance_loss_mlp": 1.02145338, "epoch": 0.9404196476883304, "flos": 22236821320320.0, "grad_norm": 1.6804941775891205, "language_loss": 0.69470435, "learning_rate": 3.7055106027357395e-08, "loss": 0.71957952, "num_input_tokens_seen": 169055155, "step": 7821, "time_per_iteration": 2.8467531204223633 }, { "auxiliary_loss_clip": 0.01215148, "auxiliary_loss_mlp": 0.01019839, "balance_loss_clip": 1.04449487, "balance_loss_mlp": 1.01313639, "epoch": 0.9405398905789695, "flos": 18915802583040.0, "grad_norm": 2.1026785217184663, "language_loss": 0.71671307, "learning_rate": 3.690600135865063e-08, "loss": 0.73906296, "num_input_tokens_seen": 169072080, "step": 7822, "time_per_iteration": 2.805140495300293 }, { "auxiliary_loss_clip": 0.01169929, "auxiliary_loss_mlp": 0.00997939, "balance_loss_clip": 1.00693977, "balance_loss_mlp": 0.99706328, "epoch": 0.9406601334696086, "flos": 70274130048000.0, "grad_norm": 0.78395613323091, "language_loss": 0.58023262, "learning_rate": 3.675719448974246e-08, "loss": 0.60191131, "num_input_tokens_seen": 169137170, "step": 7823, "time_per_iteration": 3.3798115253448486 }, { "auxiliary_loss_clip": 0.0137335, "auxiliary_loss_mlp": 0.02565756, "balance_loss_clip": 1.04512489, "balance_loss_mlp": 0.99993646, "epoch": 0.9407803763602477, "flos": 22165071903360.0, "grad_norm": 2.10487629180476, "language_loss": 0.60542142, "learning_rate": 3.6608685443207054e-08, "loss": 0.64481246, "num_input_tokens_seen": 169156320, "step": 7824, "time_per_iteration": 2.754261016845703 }, { "auxiliary_loss_clip": 0.01320631, "auxiliary_loss_mlp": 0.01022922, "balance_loss_clip": 1.04309607, "balance_loss_mlp": 1.0162853, "epoch": 0.9409006192508867, "flos": 18879496911360.0, "grad_norm": 2.326311077982442, "language_loss": 0.66801107, "learning_rate": 3.646047424157306e-08, "loss": 0.69144654, "num_input_tokens_seen": 169173295, "step": 7825, "time_per_iteration": 3.8016135692596436 }, { "auxiliary_loss_clip": 0.01273035, "auxiliary_loss_mlp": 0.0102479, "balance_loss_clip": 1.0490278, "balance_loss_mlp": 1.01715446, "epoch": 0.9410208621415259, "flos": 23368258800000.0, "grad_norm": 3.960701598183781, "language_loss": 0.68926591, "learning_rate": 3.631256090732382e-08, "loss": 0.71224415, "num_input_tokens_seen": 169193755, "step": 7826, "time_per_iteration": 2.668837308883667 }, { "auxiliary_loss_clip": 0.01322823, "auxiliary_loss_mlp": 0.01026256, "balance_loss_clip": 1.04630899, "balance_loss_mlp": 1.01962245, "epoch": 0.941141105032165, "flos": 22742227635840.0, "grad_norm": 1.6440466404120444, "language_loss": 0.83175302, "learning_rate": 3.6164945462897833e-08, "loss": 0.8552438, "num_input_tokens_seen": 169213045, "step": 7827, "time_per_iteration": 3.650991201400757 }, { "auxiliary_loss_clip": 0.01220038, "auxiliary_loss_mlp": 0.02562405, "balance_loss_clip": 1.0487771, "balance_loss_mlp": 0.99990612, "epoch": 0.941261347922804, "flos": 20704908130560.0, "grad_norm": 2.3523004645171706, "language_loss": 0.7577306, "learning_rate": 3.6017627930687856e-08, "loss": 0.795555, "num_input_tokens_seen": 169232870, "step": 7828, "time_per_iteration": 2.6554887294769287 }, { "auxiliary_loss_clip": 0.01367304, "auxiliary_loss_mlp": 0.01021456, "balance_loss_clip": 1.04015541, "balance_loss_mlp": 1.0144614, "epoch": 0.9413815908134432, "flos": 19421998997760.0, "grad_norm": 3.243715009580539, "language_loss": 0.773794, "learning_rate": 3.587060833304267e-08, "loss": 0.79768157, "num_input_tokens_seen": 169251060, "step": 7829, "time_per_iteration": 2.6814963817596436 }, { "auxiliary_loss_clip": 0.01225168, "auxiliary_loss_mlp": 0.01030603, "balance_loss_clip": 1.04892421, "balance_loss_mlp": 1.02353358, "epoch": 0.9415018337040822, "flos": 17493452853120.0, "grad_norm": 2.301280753462474, "language_loss": 0.64281952, "learning_rate": 3.5723886692264225e-08, "loss": 0.6653772, "num_input_tokens_seen": 169268600, "step": 7830, "time_per_iteration": 3.5092315673828125 }, { "auxiliary_loss_clip": 0.01268346, "auxiliary_loss_mlp": 0.01024634, "balance_loss_clip": 1.04392695, "balance_loss_mlp": 1.01852441, "epoch": 0.9416220765947213, "flos": 31831613343360.0, "grad_norm": 3.7575859321750946, "language_loss": 0.61906224, "learning_rate": 3.557746303061071e-08, "loss": 0.64199209, "num_input_tokens_seen": 169290355, "step": 7831, "time_per_iteration": 2.70263934135437 }, { "auxiliary_loss_clip": 0.01269309, "auxiliary_loss_mlp": 0.01025065, "balance_loss_clip": 1.04495919, "balance_loss_mlp": 1.01900661, "epoch": 0.9417423194853605, "flos": 23511973115520.0, "grad_norm": 1.6795847693451915, "language_loss": 0.72507656, "learning_rate": 3.543133737029391e-08, "loss": 0.74802029, "num_input_tokens_seen": 169310865, "step": 7832, "time_per_iteration": 2.7003724575042725 }, { "auxiliary_loss_clip": 0.01227407, "auxiliary_loss_mlp": 0.01022803, "balance_loss_clip": 1.04822087, "balance_loss_mlp": 1.01554942, "epoch": 0.9418625623759995, "flos": 23915106432000.0, "grad_norm": 2.060320809966886, "language_loss": 0.69341028, "learning_rate": 3.5285509733481214e-08, "loss": 0.71591234, "num_input_tokens_seen": 169330590, "step": 7833, "time_per_iteration": 2.66176700592041 }, { "auxiliary_loss_clip": 0.01224413, "auxiliary_loss_mlp": 0.01027567, "balance_loss_clip": 1.04712319, "balance_loss_mlp": 1.02020931, "epoch": 0.9419828052666386, "flos": 18076965292800.0, "grad_norm": 1.6300509161129637, "language_loss": 0.76369846, "learning_rate": 3.513998014229469e-08, "loss": 0.78621823, "num_input_tokens_seen": 169349540, "step": 7834, "time_per_iteration": 2.6615893840789795 }, { "auxiliary_loss_clip": 0.01169182, "auxiliary_loss_mlp": 0.01024285, "balance_loss_clip": 1.04572344, "balance_loss_mlp": 1.01744795, "epoch": 0.9421030481572777, "flos": 17712328377600.0, "grad_norm": 8.719509777245694, "language_loss": 0.86428106, "learning_rate": 3.499474861881069e-08, "loss": 0.88621581, "num_input_tokens_seen": 169366765, "step": 7835, "time_per_iteration": 3.608135938644409 }, { "auxiliary_loss_clip": 0.01419382, "auxiliary_loss_mlp": 0.01026209, "balance_loss_clip": 1.04301155, "balance_loss_mlp": 1.0189786, "epoch": 0.9422232910479168, "flos": 20194114775040.0, "grad_norm": 1.8393513058277469, "language_loss": 0.67939782, "learning_rate": 3.4849815185061136e-08, "loss": 0.70385367, "num_input_tokens_seen": 169386655, "step": 7836, "time_per_iteration": 2.727985382080078 }, { "auxiliary_loss_clip": 0.0122124, "auxiliary_loss_mlp": 0.01023385, "balance_loss_clip": 1.04568779, "balance_loss_mlp": 1.0168258, "epoch": 0.9423435339385559, "flos": 18442571875200.0, "grad_norm": 2.0789199579757143, "language_loss": 0.76339692, "learning_rate": 3.470517986303223e-08, "loss": 0.78584313, "num_input_tokens_seen": 169405640, "step": 7837, "time_per_iteration": 2.6368422508239746 }, { "auxiliary_loss_clip": 0.01319784, "auxiliary_loss_mlp": 0.01024361, "balance_loss_clip": 1.04650593, "balance_loss_mlp": 1.01752758, "epoch": 0.942463776829195, "flos": 20080636732800.0, "grad_norm": 1.6079317276356988, "language_loss": 0.79236639, "learning_rate": 3.4560842674664856e-08, "loss": 0.81580782, "num_input_tokens_seen": 169424155, "step": 7838, "time_per_iteration": 2.6587727069854736 }, { "auxiliary_loss_clip": 0.01221683, "auxiliary_loss_mlp": 0.01022222, "balance_loss_clip": 1.04411829, "balance_loss_mlp": 1.01571298, "epoch": 0.9425840197198341, "flos": 22636255536000.0, "grad_norm": 1.9438233937997522, "language_loss": 0.75275016, "learning_rate": 3.441680364185506e-08, "loss": 0.77518922, "num_input_tokens_seen": 169444025, "step": 7839, "time_per_iteration": 2.670668125152588 }, { "auxiliary_loss_clip": 0.01274583, "auxiliary_loss_mlp": 0.01028136, "balance_loss_clip": 1.0475347, "balance_loss_mlp": 1.02091789, "epoch": 0.9427042626104731, "flos": 19937892084480.0, "grad_norm": 3.0839895707673954, "language_loss": 0.74857801, "learning_rate": 3.427306278645314e-08, "loss": 0.77160525, "num_input_tokens_seen": 169462480, "step": 7840, "time_per_iteration": 2.639906406402588 }, { "auxiliary_loss_clip": 0.01365411, "auxiliary_loss_mlp": 0.01024738, "balance_loss_clip": 1.0434103, "balance_loss_mlp": 1.01793408, "epoch": 0.9428245055011123, "flos": 22856998567680.0, "grad_norm": 1.7125817416605325, "language_loss": 0.73373443, "learning_rate": 3.4129620130264767e-08, "loss": 0.75763589, "num_input_tokens_seen": 169480840, "step": 7841, "time_per_iteration": 2.7568180561065674 }, { "auxiliary_loss_clip": 0.01274751, "auxiliary_loss_mlp": 0.02564972, "balance_loss_clip": 1.04660392, "balance_loss_mlp": 0.99990416, "epoch": 0.9429447483917514, "flos": 20951757371520.0, "grad_norm": 2.3163433994749445, "language_loss": 0.77774084, "learning_rate": 3.398647569505009e-08, "loss": 0.81613809, "num_input_tokens_seen": 169498265, "step": 7842, "time_per_iteration": 2.611524820327759 }, { "auxiliary_loss_clip": 0.01321634, "auxiliary_loss_mlp": 0.01023634, "balance_loss_clip": 1.04377842, "balance_loss_mlp": 1.01679754, "epoch": 0.9430649912823904, "flos": 18843658116480.0, "grad_norm": 2.7487274902195398, "language_loss": 0.75006974, "learning_rate": 3.384362950252373e-08, "loss": 0.77352238, "num_input_tokens_seen": 169515235, "step": 7843, "time_per_iteration": 2.678293228149414 }, { "auxiliary_loss_clip": 0.01268676, "auxiliary_loss_mlp": 0.01023088, "balance_loss_clip": 1.04428387, "balance_loss_mlp": 1.01657665, "epoch": 0.9431852341730296, "flos": 32556038837760.0, "grad_norm": 1.8978414734320195, "language_loss": 0.57322311, "learning_rate": 3.3701081574355473e-08, "loss": 0.59614068, "num_input_tokens_seen": 169537195, "step": 7844, "time_per_iteration": 2.7819316387176514 }, { "auxiliary_loss_clip": 0.01067173, "auxiliary_loss_mlp": 0.00999418, "balance_loss_clip": 1.00690317, "balance_loss_mlp": 0.9985652, "epoch": 0.9433054770636686, "flos": 66904490252160.0, "grad_norm": 0.6405429864430534, "language_loss": 0.51653421, "learning_rate": 3.3558831932169796e-08, "loss": 0.53720009, "num_input_tokens_seen": 169605865, "step": 7845, "time_per_iteration": 3.317901372909546 }, { "auxiliary_loss_clip": 0.01219562, "auxiliary_loss_mlp": 0.01026687, "balance_loss_clip": 1.04838777, "balance_loss_mlp": 1.01979423, "epoch": 0.9434257199543077, "flos": 26140346916480.0, "grad_norm": 1.871089389278832, "language_loss": 0.88656461, "learning_rate": 3.341688059754588e-08, "loss": 0.9090271, "num_input_tokens_seen": 169621520, "step": 7846, "time_per_iteration": 2.6857049465179443 }, { "auxiliary_loss_clip": 0.01329309, "auxiliary_loss_mlp": 0.02565364, "balance_loss_clip": 1.04420447, "balance_loss_mlp": 0.99993145, "epoch": 0.9435459628449467, "flos": 25003486483200.0, "grad_norm": 2.4618544063761805, "language_loss": 0.7784276, "learning_rate": 3.327522759201762e-08, "loss": 0.81737435, "num_input_tokens_seen": 169641390, "step": 7847, "time_per_iteration": 2.673247814178467 }, { "auxiliary_loss_clip": 0.01321898, "auxiliary_loss_mlp": 0.01025852, "balance_loss_clip": 1.04608512, "balance_loss_mlp": 1.01886392, "epoch": 0.9436662057355859, "flos": 22163240309760.0, "grad_norm": 2.3302821373845473, "language_loss": 0.67467129, "learning_rate": 3.313387293707359e-08, "loss": 0.69814879, "num_input_tokens_seen": 169660095, "step": 7848, "time_per_iteration": 2.7527167797088623 }, { "auxiliary_loss_clip": 0.01323081, "auxiliary_loss_mlp": 0.01024219, "balance_loss_clip": 1.04766846, "balance_loss_mlp": 1.01698899, "epoch": 0.943786448626225, "flos": 20118522602880.0, "grad_norm": 2.1800263765041996, "language_loss": 0.68424505, "learning_rate": 3.29928166541571e-08, "loss": 0.70771801, "num_input_tokens_seen": 169679050, "step": 7849, "time_per_iteration": 2.657149314880371 }, { "auxiliary_loss_clip": 0.01265899, "auxiliary_loss_mlp": 0.01026194, "balance_loss_clip": 1.04416239, "balance_loss_mlp": 1.01892185, "epoch": 0.943906691516864, "flos": 22090808534400.0, "grad_norm": 2.1590195324876826, "language_loss": 0.80777359, "learning_rate": 3.2852058764666346e-08, "loss": 0.83069444, "num_input_tokens_seen": 169698150, "step": 7850, "time_per_iteration": 2.6995131969451904 }, { "auxiliary_loss_clip": 0.01308793, "auxiliary_loss_mlp": 0.01022033, "balance_loss_clip": 1.04658437, "balance_loss_mlp": 1.01579285, "epoch": 0.9440269344075032, "flos": 35298501212160.0, "grad_norm": 2.6663616432996124, "language_loss": 0.68594193, "learning_rate": 3.2711599289954264e-08, "loss": 0.70925021, "num_input_tokens_seen": 169722185, "step": 7851, "time_per_iteration": 3.7326853275299072 }, { "auxiliary_loss_clip": 0.01418399, "auxiliary_loss_mlp": 0.01024866, "balance_loss_clip": 1.04174972, "balance_loss_mlp": 1.01798177, "epoch": 0.9441471772981422, "flos": 19238136255360.0, "grad_norm": 2.055785699784817, "language_loss": 0.78081656, "learning_rate": 3.257143825132847e-08, "loss": 0.80524921, "num_input_tokens_seen": 169740355, "step": 7852, "time_per_iteration": 2.7102925777435303 }, { "auxiliary_loss_clip": 0.0127095, "auxiliary_loss_mlp": 0.01022759, "balance_loss_clip": 1.0450865, "balance_loss_mlp": 1.01676571, "epoch": 0.9442674201887813, "flos": 25739799379200.0, "grad_norm": 1.9179132295994927, "language_loss": 0.76189929, "learning_rate": 3.243157567005106e-08, "loss": 0.78483641, "num_input_tokens_seen": 169758535, "step": 7853, "time_per_iteration": 3.569077968597412 }, { "auxiliary_loss_clip": 0.01179717, "auxiliary_loss_mlp": 0.01030882, "balance_loss_clip": 1.05430436, "balance_loss_mlp": 1.02340734, "epoch": 0.9443876630794205, "flos": 15523321737600.0, "grad_norm": 2.4052174926809893, "language_loss": 0.64054978, "learning_rate": 3.2292011567339296e-08, "loss": 0.66265583, "num_input_tokens_seen": 169776340, "step": 7854, "time_per_iteration": 2.5195908546447754 }, { "auxiliary_loss_clip": 0.01223622, "auxiliary_loss_mlp": 0.02563056, "balance_loss_clip": 1.04680824, "balance_loss_mlp": 0.99991298, "epoch": 0.9445079059700595, "flos": 13400821128960.0, "grad_norm": 2.0958728095011443, "language_loss": 0.56007719, "learning_rate": 3.21527459643649e-08, "loss": 0.59794402, "num_input_tokens_seen": 169793225, "step": 7855, "time_per_iteration": 3.5367391109466553 }, { "auxiliary_loss_clip": 0.01226882, "auxiliary_loss_mlp": 0.01023286, "balance_loss_clip": 1.04971647, "balance_loss_mlp": 1.01647902, "epoch": 0.9446281488606986, "flos": 23659242877440.0, "grad_norm": 2.4058220870223255, "language_loss": 0.74361205, "learning_rate": 3.2013778882254536e-08, "loss": 0.7661137, "num_input_tokens_seen": 169812020, "step": 7856, "time_per_iteration": 2.6752138137817383 }, { "auxiliary_loss_clip": 0.01219509, "auxiliary_loss_mlp": 0.01022638, "balance_loss_clip": 1.04730034, "balance_loss_mlp": 1.01594472, "epoch": 0.9447483917513377, "flos": 25557337267200.0, "grad_norm": 1.6800690916219647, "language_loss": 0.75645727, "learning_rate": 3.1875110342088676e-08, "loss": 0.77887875, "num_input_tokens_seen": 169833470, "step": 7857, "time_per_iteration": 2.6489193439483643 }, { "auxiliary_loss_clip": 0.01265194, "auxiliary_loss_mlp": 0.01027365, "balance_loss_clip": 1.04606771, "balance_loss_mlp": 1.02033794, "epoch": 0.9448686346419768, "flos": 24535463247360.0, "grad_norm": 1.8506240206849007, "language_loss": 0.6555438, "learning_rate": 3.1736740364904035e-08, "loss": 0.67846942, "num_input_tokens_seen": 169854000, "step": 7858, "time_per_iteration": 2.6844778060913086 }, { "auxiliary_loss_clip": 0.01368981, "auxiliary_loss_mlp": 0.02563974, "balance_loss_clip": 1.04160547, "balance_loss_mlp": 0.99993008, "epoch": 0.9449888775326158, "flos": 14721256995840.0, "grad_norm": 2.296228228810353, "language_loss": 0.77255499, "learning_rate": 3.159866897169094e-08, "loss": 0.81188452, "num_input_tokens_seen": 169872200, "step": 7859, "time_per_iteration": 2.693107843399048 }, { "auxiliary_loss_clip": 0.01225529, "auxiliary_loss_mlp": 0.01026047, "balance_loss_clip": 1.0456233, "balance_loss_mlp": 1.01910281, "epoch": 0.945109120423255, "flos": 15447873219840.0, "grad_norm": 2.0775856967077826, "language_loss": 0.76278675, "learning_rate": 3.146089618339487e-08, "loss": 0.78530246, "num_input_tokens_seen": 169889055, "step": 7860, "time_per_iteration": 2.6582260131835938 }, { "auxiliary_loss_clip": 0.01325601, "auxiliary_loss_mlp": 0.01026294, "balance_loss_clip": 1.04599559, "balance_loss_mlp": 1.01881397, "epoch": 0.9452293633138941, "flos": 25448097029760.0, "grad_norm": 1.8952901052977706, "language_loss": 0.6800763, "learning_rate": 3.132342202091554e-08, "loss": 0.70359528, "num_input_tokens_seen": 169909280, "step": 7861, "time_per_iteration": 3.662045478820801 }, { "auxiliary_loss_clip": 0.0117579, "auxiliary_loss_mlp": 0.01027808, "balance_loss_clip": 1.04891908, "balance_loss_mlp": 1.02076912, "epoch": 0.9453496062045331, "flos": 21215342350080.0, "grad_norm": 2.1051376048129407, "language_loss": 0.67919689, "learning_rate": 3.1186246505107595e-08, "loss": 0.70123291, "num_input_tokens_seen": 169928420, "step": 7862, "time_per_iteration": 2.5810420513153076 }, { "auxiliary_loss_clip": 0.01219906, "auxiliary_loss_mlp": 0.01031612, "balance_loss_clip": 1.04835796, "balance_loss_mlp": 1.02459991, "epoch": 0.9454698490951723, "flos": 20010898477440.0, "grad_norm": 1.9332771472019947, "language_loss": 0.83665621, "learning_rate": 3.104936965678084e-08, "loss": 0.85917139, "num_input_tokens_seen": 169946750, "step": 7863, "time_per_iteration": 2.650239944458008 }, { "auxiliary_loss_clip": 0.01221377, "auxiliary_loss_mlp": 0.01026727, "balance_loss_clip": 1.04652071, "balance_loss_mlp": 1.02007556, "epoch": 0.9455900919858113, "flos": 21069652786560.0, "grad_norm": 2.0257643891388684, "language_loss": 0.81814027, "learning_rate": 3.091279149669956e-08, "loss": 0.84062123, "num_input_tokens_seen": 169965540, "step": 7864, "time_per_iteration": 2.6025078296661377 }, { "auxiliary_loss_clip": 0.01221968, "auxiliary_loss_mlp": 0.02564481, "balance_loss_clip": 1.04851389, "balance_loss_mlp": 0.99992371, "epoch": 0.9457103348764504, "flos": 20740854666240.0, "grad_norm": 1.9408365011303668, "language_loss": 0.73954463, "learning_rate": 3.0776512045581624e-08, "loss": 0.77740908, "num_input_tokens_seen": 169984330, "step": 7865, "time_per_iteration": 2.6383557319641113 }, { "auxiliary_loss_clip": 0.01268194, "auxiliary_loss_mlp": 0.01028721, "balance_loss_clip": 1.04575849, "balance_loss_mlp": 1.02084732, "epoch": 0.9458305777670896, "flos": 21428363957760.0, "grad_norm": 1.9249477057142563, "language_loss": 0.77646059, "learning_rate": 3.0640531324101384e-08, "loss": 0.79942977, "num_input_tokens_seen": 170002095, "step": 7866, "time_per_iteration": 2.6450350284576416 }, { "auxiliary_loss_clip": 0.01222164, "auxiliary_loss_mlp": 0.01030046, "balance_loss_clip": 1.05060375, "balance_loss_mlp": 1.02277124, "epoch": 0.9459508206577286, "flos": 20011185786240.0, "grad_norm": 1.818548218700609, "language_loss": 0.75988615, "learning_rate": 3.0504849352886554e-08, "loss": 0.78240824, "num_input_tokens_seen": 170020240, "step": 7867, "time_per_iteration": 2.662421226501465 }, { "auxiliary_loss_clip": 0.01223206, "auxiliary_loss_mlp": 0.01025007, "balance_loss_clip": 1.04812562, "balance_loss_mlp": 1.01808429, "epoch": 0.9460710635483677, "flos": 12166428291840.0, "grad_norm": 2.8537368757391532, "language_loss": 0.72059405, "learning_rate": 3.036946615252023e-08, "loss": 0.74307621, "num_input_tokens_seen": 170035770, "step": 7868, "time_per_iteration": 2.5725772380828857 }, { "auxiliary_loss_clip": 0.01177434, "auxiliary_loss_mlp": 0.01022352, "balance_loss_clip": 1.0472343, "balance_loss_mlp": 1.01544726, "epoch": 0.9461913064390068, "flos": 34276196229120.0, "grad_norm": 2.1846210336437215, "language_loss": 0.67007077, "learning_rate": 3.0234381743539984e-08, "loss": 0.69206864, "num_input_tokens_seen": 170053385, "step": 7869, "time_per_iteration": 2.758647918701172 }, { "auxiliary_loss_clip": 0.01277295, "auxiliary_loss_mlp": 0.01026253, "balance_loss_clip": 1.04565883, "balance_loss_mlp": 1.01904082, "epoch": 0.9463115493296459, "flos": 19463763536640.0, "grad_norm": 1.9934296615922622, "language_loss": 0.80276543, "learning_rate": 3.0099596146437863e-08, "loss": 0.8258009, "num_input_tokens_seen": 170070490, "step": 7870, "time_per_iteration": 2.586824655532837 }, { "auxiliary_loss_clip": 0.010596, "auxiliary_loss_mlp": 0.01001575, "balance_loss_clip": 1.00574529, "balance_loss_mlp": 1.00071716, "epoch": 0.946431792220285, "flos": 70570824387840.0, "grad_norm": 0.7795799464668808, "language_loss": 0.59991312, "learning_rate": 2.996510938166086e-08, "loss": 0.62052482, "num_input_tokens_seen": 170133465, "step": 7871, "time_per_iteration": 3.206479787826538 }, { "auxiliary_loss_clip": 0.0122036, "auxiliary_loss_mlp": 0.01024905, "balance_loss_clip": 1.04927099, "balance_loss_mlp": 1.01827717, "epoch": 0.9465520351109241, "flos": 18947906363520.0, "grad_norm": 3.36107724233894, "language_loss": 0.74096745, "learning_rate": 2.983092146960997e-08, "loss": 0.7634201, "num_input_tokens_seen": 170150810, "step": 7872, "time_per_iteration": 2.5966572761535645 }, { "auxiliary_loss_clip": 0.01274833, "auxiliary_loss_mlp": 0.01025068, "balance_loss_clip": 1.04297829, "balance_loss_mlp": 1.0180347, "epoch": 0.9466722780015632, "flos": 19135647774720.0, "grad_norm": 1.9938572455152703, "language_loss": 0.80164742, "learning_rate": 2.9697032430642256e-08, "loss": 0.82464635, "num_input_tokens_seen": 170169025, "step": 7873, "time_per_iteration": 2.6297965049743652 }, { "auxiliary_loss_clip": 0.01167886, "auxiliary_loss_mlp": 0.01022138, "balance_loss_clip": 1.04689872, "balance_loss_mlp": 1.01518822, "epoch": 0.9467925208922022, "flos": 17237912520960.0, "grad_norm": 3.267085958774322, "language_loss": 0.7383495, "learning_rate": 2.9563442285067906e-08, "loss": 0.76024973, "num_input_tokens_seen": 170186070, "step": 7874, "time_per_iteration": 2.6666948795318604 }, { "auxiliary_loss_clip": 0.01222639, "auxiliary_loss_mlp": 0.01035629, "balance_loss_clip": 1.04811096, "balance_loss_mlp": 1.02852404, "epoch": 0.9469127637828414, "flos": 29169016859520.0, "grad_norm": 1.7933177635014923, "language_loss": 0.80022699, "learning_rate": 2.943015105315294e-08, "loss": 0.8228097, "num_input_tokens_seen": 170206265, "step": 7875, "time_per_iteration": 2.6828339099884033 }, { "auxiliary_loss_clip": 0.01372469, "auxiliary_loss_mlp": 0.01027828, "balance_loss_clip": 1.04265201, "balance_loss_mlp": 1.02061582, "epoch": 0.9470330066734804, "flos": 26030460234240.0, "grad_norm": 2.725456587571462, "language_loss": 0.66334653, "learning_rate": 2.929715875511718e-08, "loss": 0.68734944, "num_input_tokens_seen": 170225300, "step": 7876, "time_per_iteration": 2.741497039794922 }, { "auxiliary_loss_clip": 0.01222608, "auxiliary_loss_mlp": 0.01023897, "balance_loss_clip": 1.04499495, "balance_loss_mlp": 1.01696455, "epoch": 0.9471532495641195, "flos": 23440906056960.0, "grad_norm": 2.110084012498291, "language_loss": 0.70110178, "learning_rate": 2.9164465411135375e-08, "loss": 0.72356689, "num_input_tokens_seen": 170245070, "step": 7877, "time_per_iteration": 3.585805654525757 }, { "auxiliary_loss_clip": 0.01221275, "auxiliary_loss_mlp": 0.01032661, "balance_loss_clip": 1.05041337, "balance_loss_mlp": 1.0254097, "epoch": 0.9472734924547586, "flos": 15815850099840.0, "grad_norm": 2.224166685672258, "language_loss": 0.81181228, "learning_rate": 2.9032071041337426e-08, "loss": 0.83435166, "num_input_tokens_seen": 170263305, "step": 7878, "time_per_iteration": 2.6328377723693848 }, { "auxiliary_loss_clip": 0.01264718, "auxiliary_loss_mlp": 0.01023312, "balance_loss_clip": 1.04361176, "balance_loss_mlp": 1.01685965, "epoch": 0.9473937353453977, "flos": 11181793697280.0, "grad_norm": 2.2409133339159846, "language_loss": 0.72939217, "learning_rate": 2.889997566580704e-08, "loss": 0.75227249, "num_input_tokens_seen": 170281460, "step": 7879, "time_per_iteration": 3.561384439468384 }, { "auxiliary_loss_clip": 0.01175068, "auxiliary_loss_mlp": 0.01028103, "balance_loss_clip": 1.04833913, "balance_loss_mlp": 1.0204761, "epoch": 0.9475139782360368, "flos": 25775530433280.0, "grad_norm": 1.8065299252391038, "language_loss": 0.70417809, "learning_rate": 2.8768179304583086e-08, "loss": 0.72620988, "num_input_tokens_seen": 170303515, "step": 7880, "time_per_iteration": 2.647545337677002 }, { "auxiliary_loss_clip": 0.01318601, "auxiliary_loss_mlp": 0.0102394, "balance_loss_clip": 1.04560256, "balance_loss_mlp": 1.01745796, "epoch": 0.9476342211266758, "flos": 22820046451200.0, "grad_norm": 1.8350941180393248, "language_loss": 0.73656905, "learning_rate": 2.8636681977659117e-08, "loss": 0.75999451, "num_input_tokens_seen": 170323165, "step": 7881, "time_per_iteration": 2.724052667617798 }, { "auxiliary_loss_clip": 0.01369096, "auxiliary_loss_mlp": 0.01028184, "balance_loss_clip": 1.04587412, "balance_loss_mlp": 1.02092671, "epoch": 0.947754464017315, "flos": 20193611984640.0, "grad_norm": 2.4209632629188653, "language_loss": 0.78264689, "learning_rate": 2.850548370498318e-08, "loss": 0.8066197, "num_input_tokens_seen": 170341005, "step": 7882, "time_per_iteration": 3.594080686569214 }, { "auxiliary_loss_clip": 0.01220236, "auxiliary_loss_mlp": 0.01022199, "balance_loss_clip": 1.04519057, "balance_loss_mlp": 1.01566362, "epoch": 0.9478747069079541, "flos": 24717925359360.0, "grad_norm": 1.71743451038497, "language_loss": 0.71584147, "learning_rate": 2.8374584506457798e-08, "loss": 0.73826575, "num_input_tokens_seen": 170362280, "step": 7883, "time_per_iteration": 2.675935745239258 }, { "auxiliary_loss_clip": 0.01270744, "auxiliary_loss_mlp": 0.01021027, "balance_loss_clip": 1.04866648, "balance_loss_mlp": 1.01424062, "epoch": 0.9479949497985931, "flos": 21361355136000.0, "grad_norm": 3.917139735036052, "language_loss": 0.67144263, "learning_rate": 2.824398440193998e-08, "loss": 0.69436026, "num_input_tokens_seen": 170381080, "step": 7884, "time_per_iteration": 2.614469051361084 }, { "auxiliary_loss_clip": 0.01371271, "auxiliary_loss_mlp": 0.01025114, "balance_loss_clip": 1.04217911, "balance_loss_mlp": 1.01728213, "epoch": 0.9481151926892323, "flos": 18148606968960.0, "grad_norm": 1.9554909252378858, "language_loss": 0.71505862, "learning_rate": 2.811368341124232e-08, "loss": 0.73902249, "num_input_tokens_seen": 170400150, "step": 7885, "time_per_iteration": 2.723264694213867 }, { "auxiliary_loss_clip": 0.01273199, "auxiliary_loss_mlp": 0.01025215, "balance_loss_clip": 1.04563808, "balance_loss_mlp": 1.01862597, "epoch": 0.9482354355798713, "flos": 22128012046080.0, "grad_norm": 5.803291532243222, "language_loss": 0.68115336, "learning_rate": 2.7983681554131222e-08, "loss": 0.70413744, "num_input_tokens_seen": 170420410, "step": 7886, "time_per_iteration": 2.6504855155944824 }, { "auxiliary_loss_clip": 0.01274697, "auxiliary_loss_mlp": 0.01022602, "balance_loss_clip": 1.0451535, "balance_loss_mlp": 1.01576829, "epoch": 0.9483556784705104, "flos": 19063072344960.0, "grad_norm": 2.3108124162063635, "language_loss": 0.70401692, "learning_rate": 2.7853978850327365e-08, "loss": 0.72698998, "num_input_tokens_seen": 170439580, "step": 7887, "time_per_iteration": 3.6407430171966553 }, { "auxiliary_loss_clip": 0.01319097, "auxiliary_loss_mlp": 0.01024139, "balance_loss_clip": 1.04932737, "balance_loss_mlp": 1.01741552, "epoch": 0.9484759213611496, "flos": 25777110631680.0, "grad_norm": 2.155835176809916, "language_loss": 0.87402797, "learning_rate": 2.7724575319507225e-08, "loss": 0.89746028, "num_input_tokens_seen": 170459290, "step": 7888, "time_per_iteration": 2.8227415084838867 }, { "auxiliary_loss_clip": 0.01220653, "auxiliary_loss_mlp": 0.010182, "balance_loss_clip": 1.04421604, "balance_loss_mlp": 1.011235, "epoch": 0.9485961642517886, "flos": 20667740532480.0, "grad_norm": 1.9814881923355818, "language_loss": 0.77128732, "learning_rate": 2.759547098130044e-08, "loss": 0.79367578, "num_input_tokens_seen": 170478020, "step": 7889, "time_per_iteration": 2.700204610824585 }, { "auxiliary_loss_clip": 0.0117132, "auxiliary_loss_mlp": 0.01023704, "balance_loss_clip": 1.04841793, "balance_loss_mlp": 1.01715398, "epoch": 0.9487164071424277, "flos": 22674069578880.0, "grad_norm": 2.1136480140116727, "language_loss": 0.7673136, "learning_rate": 2.746666585529267e-08, "loss": 0.78926384, "num_input_tokens_seen": 170498295, "step": 7890, "time_per_iteration": 2.636631965637207 }, { "auxiliary_loss_clip": 0.01218355, "auxiliary_loss_mlp": 0.01029091, "balance_loss_clip": 1.04580283, "balance_loss_mlp": 1.02177143, "epoch": 0.9488366500330668, "flos": 38726461716480.0, "grad_norm": 2.7798965759248175, "language_loss": 0.74650878, "learning_rate": 2.73381599610234e-08, "loss": 0.76898324, "num_input_tokens_seen": 170518695, "step": 7891, "time_per_iteration": 2.721801519393921 }, { "auxiliary_loss_clip": 0.01217826, "auxiliary_loss_mlp": 0.01025311, "balance_loss_clip": 1.04457283, "balance_loss_mlp": 1.01848078, "epoch": 0.9489568929237059, "flos": 27890920149120.0, "grad_norm": 2.035437326630357, "language_loss": 0.7185967, "learning_rate": 2.7209953317987033e-08, "loss": 0.74102813, "num_input_tokens_seen": 170539735, "step": 7892, "time_per_iteration": 2.682130813598633 }, { "auxiliary_loss_clip": 0.01224587, "auxiliary_loss_mlp": 0.01023583, "balance_loss_clip": 1.04892945, "balance_loss_mlp": 1.016729, "epoch": 0.9490771358143449, "flos": 33580642291200.0, "grad_norm": 2.3903012948024975, "language_loss": 0.78453505, "learning_rate": 2.7082045945631793e-08, "loss": 0.80701679, "num_input_tokens_seen": 170561950, "step": 7893, "time_per_iteration": 2.7091119289398193 }, { "auxiliary_loss_clip": 0.01311631, "auxiliary_loss_mlp": 0.01025593, "balance_loss_clip": 1.04422057, "balance_loss_mlp": 1.01870584, "epoch": 0.9491973787049841, "flos": 14793796512000.0, "grad_norm": 2.10284918361491, "language_loss": 0.69594646, "learning_rate": 2.6954437863361712e-08, "loss": 0.71931869, "num_input_tokens_seen": 170579865, "step": 7894, "time_per_iteration": 2.674400806427002 }, { "auxiliary_loss_clip": 0.01421787, "auxiliary_loss_mlp": 0.01025581, "balance_loss_clip": 1.04365492, "balance_loss_mlp": 1.01903903, "epoch": 0.9493176215956232, "flos": 25332535998720.0, "grad_norm": 2.164929149028275, "language_loss": 0.70719212, "learning_rate": 2.6827129090534862e-08, "loss": 0.73166579, "num_input_tokens_seen": 170600165, "step": 7895, "time_per_iteration": 2.807718276977539 }, { "auxiliary_loss_clip": 0.01275286, "auxiliary_loss_mlp": 0.01023531, "balance_loss_clip": 1.04964709, "balance_loss_mlp": 1.0167743, "epoch": 0.9494378644862622, "flos": 21029971236480.0, "grad_norm": 1.9203535620428298, "language_loss": 0.77826798, "learning_rate": 2.670011964646335e-08, "loss": 0.80125618, "num_input_tokens_seen": 170618845, "step": 7896, "time_per_iteration": 2.679957389831543 }, { "auxiliary_loss_clip": 0.01477654, "auxiliary_loss_mlp": 0.01030384, "balance_loss_clip": 1.0352515, "balance_loss_mlp": 1.02293372, "epoch": 0.9495581073769014, "flos": 15195134148480.0, "grad_norm": 2.5846020874473465, "language_loss": 0.68311596, "learning_rate": 2.657340955041487e-08, "loss": 0.7081964, "num_input_tokens_seen": 170637620, "step": 7897, "time_per_iteration": 2.8025341033935547 }, { "auxiliary_loss_clip": 0.01277532, "auxiliary_loss_mlp": 0.01026753, "balance_loss_clip": 1.05245245, "balance_loss_mlp": 1.01939142, "epoch": 0.9496783502675404, "flos": 28616566705920.0, "grad_norm": 1.9623904334750848, "language_loss": 0.71867549, "learning_rate": 2.6446998821611167e-08, "loss": 0.74171841, "num_input_tokens_seen": 170657815, "step": 7898, "time_per_iteration": 2.7200379371643066 }, { "auxiliary_loss_clip": 0.01372171, "auxiliary_loss_mlp": 0.01029088, "balance_loss_clip": 1.04316378, "balance_loss_mlp": 1.02160764, "epoch": 0.9497985931581795, "flos": 14866874732160.0, "grad_norm": 18.73521421515372, "language_loss": 0.71612591, "learning_rate": 2.6320887479228228e-08, "loss": 0.74013853, "num_input_tokens_seen": 170674415, "step": 7899, "time_per_iteration": 2.73938250541687 }, { "auxiliary_loss_clip": 0.01273674, "auxiliary_loss_mlp": 0.01024803, "balance_loss_clip": 1.04567158, "balance_loss_mlp": 1.01813293, "epoch": 0.9499188360488187, "flos": 27193319136000.0, "grad_norm": 2.293531199432286, "language_loss": 0.72567511, "learning_rate": 2.619507554239786e-08, "loss": 0.74865985, "num_input_tokens_seen": 170692975, "step": 7900, "time_per_iteration": 2.693652391433716 }, { "auxiliary_loss_clip": 0.01275535, "auxiliary_loss_mlp": 0.0102618, "balance_loss_clip": 1.04799962, "balance_loss_mlp": 1.01905131, "epoch": 0.9500390789394577, "flos": 24316479982080.0, "grad_norm": 1.6069591778442895, "language_loss": 0.69780928, "learning_rate": 2.606956303020502e-08, "loss": 0.72082651, "num_input_tokens_seen": 170713780, "step": 7901, "time_per_iteration": 2.698528528213501 }, { "auxiliary_loss_clip": 0.01222702, "auxiliary_loss_mlp": 0.01027249, "balance_loss_clip": 1.04958248, "balance_loss_mlp": 1.02072251, "epoch": 0.9501593218300968, "flos": 14354752573440.0, "grad_norm": 4.0684661698936075, "language_loss": 0.84074163, "learning_rate": 2.5944349961690036e-08, "loss": 0.86324114, "num_input_tokens_seen": 170730800, "step": 7902, "time_per_iteration": 2.7622976303100586 }, { "auxiliary_loss_clip": 0.0131949, "auxiliary_loss_mlp": 0.01024244, "balance_loss_clip": 1.04549432, "balance_loss_mlp": 1.01766336, "epoch": 0.9502795647207359, "flos": 38728113742080.0, "grad_norm": 1.7480365927912171, "language_loss": 0.73392051, "learning_rate": 2.581943635584749e-08, "loss": 0.75735784, "num_input_tokens_seen": 170753630, "step": 7903, "time_per_iteration": 3.7062292098999023 }, { "auxiliary_loss_clip": 0.01262777, "auxiliary_loss_mlp": 0.01022271, "balance_loss_clip": 1.04347229, "balance_loss_mlp": 1.01587248, "epoch": 0.950399807611375, "flos": 40808023799040.0, "grad_norm": 2.9015903742344076, "language_loss": 0.65552723, "learning_rate": 2.569482223162689e-08, "loss": 0.67837763, "num_input_tokens_seen": 170777605, "step": 7904, "time_per_iteration": 2.828725814819336 }, { "auxiliary_loss_clip": 0.01222535, "auxiliary_loss_mlp": 0.0102241, "balance_loss_clip": 1.04670858, "balance_loss_mlp": 1.01538837, "epoch": 0.950520050502014, "flos": 23440403266560.0, "grad_norm": 1.856456565987711, "language_loss": 0.72384095, "learning_rate": 2.5570507607932e-08, "loss": 0.74629045, "num_input_tokens_seen": 170797520, "step": 7905, "time_per_iteration": 3.6076018810272217 }, { "auxiliary_loss_clip": 0.01227632, "auxiliary_loss_mlp": 0.01026788, "balance_loss_clip": 1.04766762, "balance_loss_mlp": 1.01945925, "epoch": 0.9506402933926532, "flos": 17783718658560.0, "grad_norm": 3.4274385944112695, "language_loss": 0.63471371, "learning_rate": 2.54464925036213e-08, "loss": 0.65725791, "num_input_tokens_seen": 170814810, "step": 7906, "time_per_iteration": 2.607809066772461 }, { "auxiliary_loss_clip": 0.01219937, "auxiliary_loss_mlp": 0.0102263, "balance_loss_clip": 1.04682517, "balance_loss_mlp": 1.01628482, "epoch": 0.9507605362832923, "flos": 32561928668160.0, "grad_norm": 1.8277544952463367, "language_loss": 0.60820353, "learning_rate": 2.532277693750773e-08, "loss": 0.63062912, "num_input_tokens_seen": 170835735, "step": 7907, "time_per_iteration": 3.628143787384033 }, { "auxiliary_loss_clip": 0.01374904, "auxiliary_loss_mlp": 0.01027085, "balance_loss_clip": 1.04717755, "balance_loss_mlp": 1.02037978, "epoch": 0.9508807791739313, "flos": 19602054898560.0, "grad_norm": 2.71923818864093, "language_loss": 0.7572515, "learning_rate": 2.5199360928358948e-08, "loss": 0.78127134, "num_input_tokens_seen": 170852970, "step": 7908, "time_per_iteration": 2.6756675243377686 }, { "auxiliary_loss_clip": 0.01215721, "auxiliary_loss_mlp": 0.02561633, "balance_loss_clip": 1.04400086, "balance_loss_mlp": 0.99991447, "epoch": 0.9510010220645704, "flos": 21471852349440.0, "grad_norm": 1.7698988133398943, "language_loss": 0.87494451, "learning_rate": 2.507624449489665e-08, "loss": 0.91271806, "num_input_tokens_seen": 170871600, "step": 7909, "time_per_iteration": 2.6500160694122314 }, { "auxiliary_loss_clip": 0.0126941, "auxiliary_loss_mlp": 0.01025027, "balance_loss_clip": 1.04702377, "balance_loss_mlp": 1.01730216, "epoch": 0.9511212649552095, "flos": 18879999701760.0, "grad_norm": 1.7743467113676687, "language_loss": 0.65046, "learning_rate": 2.495342765579811e-08, "loss": 0.67340446, "num_input_tokens_seen": 170890260, "step": 7910, "time_per_iteration": 2.6095948219299316 }, { "auxiliary_loss_clip": 0.01371001, "auxiliary_loss_mlp": 0.01029789, "balance_loss_clip": 1.04603624, "balance_loss_mlp": 1.02281773, "epoch": 0.9512415078458486, "flos": 20810521094400.0, "grad_norm": 7.308141005900582, "language_loss": 0.71351987, "learning_rate": 2.4830910429693984e-08, "loss": 0.73752773, "num_input_tokens_seen": 170910220, "step": 7911, "time_per_iteration": 2.7559518814086914 }, { "auxiliary_loss_clip": 0.01170913, "auxiliary_loss_mlp": 0.01021878, "balance_loss_clip": 1.04618669, "balance_loss_mlp": 1.01497269, "epoch": 0.9513617507364877, "flos": 18369565482240.0, "grad_norm": 1.9262877184407075, "language_loss": 0.79396713, "learning_rate": 2.470869283517052e-08, "loss": 0.81589508, "num_input_tokens_seen": 170928255, "step": 7912, "time_per_iteration": 3.444391965866089 }, { "auxiliary_loss_clip": 0.01215329, "auxiliary_loss_mlp": 0.01027571, "balance_loss_clip": 1.04349864, "balance_loss_mlp": 1.02096641, "epoch": 0.9514819936271268, "flos": 25010166412800.0, "grad_norm": 1.6213556531067619, "language_loss": 0.77215266, "learning_rate": 2.458677489076777e-08, "loss": 0.79458165, "num_input_tokens_seen": 170949265, "step": 7913, "time_per_iteration": 2.641322612762451 }, { "auxiliary_loss_clip": 0.01216199, "auxiliary_loss_mlp": 0.01020546, "balance_loss_clip": 1.04479682, "balance_loss_mlp": 1.01373959, "epoch": 0.9516022365177659, "flos": 18662129758080.0, "grad_norm": 1.6554421180715795, "language_loss": 0.82773453, "learning_rate": 2.446515661498072e-08, "loss": 0.85010195, "num_input_tokens_seen": 170968595, "step": 7914, "time_per_iteration": 2.5824618339538574 }, { "auxiliary_loss_clip": 0.01425367, "auxiliary_loss_mlp": 0.01023413, "balance_loss_clip": 1.04477775, "balance_loss_mlp": 1.01684737, "epoch": 0.9517224794084049, "flos": 25372109808000.0, "grad_norm": 2.1672663900505538, "language_loss": 0.74549663, "learning_rate": 2.434383802625861e-08, "loss": 0.76998436, "num_input_tokens_seen": 170987550, "step": 7915, "time_per_iteration": 2.825965404510498 }, { "auxiliary_loss_clip": 0.01321283, "auxiliary_loss_mlp": 0.01027456, "balance_loss_clip": 1.04230082, "balance_loss_mlp": 1.02080762, "epoch": 0.9518427222990441, "flos": 21470918595840.0, "grad_norm": 15.44718959070033, "language_loss": 0.74151647, "learning_rate": 2.4222819143005168e-08, "loss": 0.76500386, "num_input_tokens_seen": 171007145, "step": 7916, "time_per_iteration": 2.6974971294403076 }, { "auxiliary_loss_clip": 0.01171261, "auxiliary_loss_mlp": 0.01026512, "balance_loss_clip": 1.05067003, "balance_loss_mlp": 1.01958013, "epoch": 0.9519629651896832, "flos": 21033634423680.0, "grad_norm": 1.966315916075314, "language_loss": 0.80973279, "learning_rate": 2.4102099983579706e-08, "loss": 0.83171052, "num_input_tokens_seen": 171026295, "step": 7917, "time_per_iteration": 2.552006244659424 }, { "auxiliary_loss_clip": 0.01223016, "auxiliary_loss_mlp": 0.01022263, "balance_loss_clip": 1.04622722, "balance_loss_mlp": 1.01490748, "epoch": 0.9520832080803222, "flos": 21689219502720.0, "grad_norm": 1.6309664772230155, "language_loss": 0.77325058, "learning_rate": 2.3981680566294236e-08, "loss": 0.79570335, "num_input_tokens_seen": 171045895, "step": 7918, "time_per_iteration": 2.630385398864746 }, { "auxiliary_loss_clip": 0.01170223, "auxiliary_loss_mlp": 0.01024471, "balance_loss_clip": 1.04863572, "balance_loss_mlp": 1.01794159, "epoch": 0.9522034509709614, "flos": 23145289125120.0, "grad_norm": 1.7914071044664213, "language_loss": 0.73269314, "learning_rate": 2.3861560909416822e-08, "loss": 0.7546401, "num_input_tokens_seen": 171065445, "step": 7919, "time_per_iteration": 2.59612774848938 }, { "auxiliary_loss_clip": 0.01272861, "auxiliary_loss_mlp": 0.01024652, "balance_loss_clip": 1.04447818, "balance_loss_mlp": 1.01809502, "epoch": 0.9523236938616004, "flos": 24679428958080.0, "grad_norm": 3.1084448742888875, "language_loss": 0.82521987, "learning_rate": 2.3741741031169325e-08, "loss": 0.84819496, "num_input_tokens_seen": 171085015, "step": 7920, "time_per_iteration": 2.7746214866638184 }, { "auxiliary_loss_clip": 0.01362921, "auxiliary_loss_mlp": 0.0102537, "balance_loss_clip": 1.04054141, "balance_loss_mlp": 1.01882195, "epoch": 0.9524439367522395, "flos": 22672309812480.0, "grad_norm": 1.858111912452729, "language_loss": 0.71091688, "learning_rate": 2.3622220949728544e-08, "loss": 0.73479974, "num_input_tokens_seen": 171103900, "step": 7921, "time_per_iteration": 2.7585113048553467 }, { "auxiliary_loss_clip": 0.01218101, "auxiliary_loss_mlp": 0.01025225, "balance_loss_clip": 1.04448175, "balance_loss_mlp": 1.01817942, "epoch": 0.9525641796428787, "flos": 34055525024640.0, "grad_norm": 3.229541052412379, "language_loss": 0.61558485, "learning_rate": 2.3503000683225526e-08, "loss": 0.63801813, "num_input_tokens_seen": 171121615, "step": 7922, "time_per_iteration": 2.7931573390960693 }, { "auxiliary_loss_clip": 0.01170669, "auxiliary_loss_mlp": 0.01025463, "balance_loss_clip": 1.04665995, "balance_loss_mlp": 1.01813149, "epoch": 0.9526844225335177, "flos": 16727083251840.0, "grad_norm": 2.3188323094512064, "language_loss": 0.84627229, "learning_rate": 2.3384080249745585e-08, "loss": 0.86823356, "num_input_tokens_seen": 171139505, "step": 7923, "time_per_iteration": 2.5449109077453613 }, { "auxiliary_loss_clip": 0.01375141, "auxiliary_loss_mlp": 0.01026528, "balance_loss_clip": 1.04254711, "balance_loss_mlp": 1.0195868, "epoch": 0.9528046654241568, "flos": 36939367330560.0, "grad_norm": 3.8208314481889767, "language_loss": 0.83132696, "learning_rate": 2.3265459667329178e-08, "loss": 0.85534358, "num_input_tokens_seen": 171158995, "step": 7924, "time_per_iteration": 2.8333001136779785 }, { "auxiliary_loss_clip": 0.01269711, "auxiliary_loss_mlp": 0.01024728, "balance_loss_clip": 1.04621911, "balance_loss_mlp": 1.01821661, "epoch": 0.9529249083147959, "flos": 18255010032000.0, "grad_norm": 2.070232717613571, "language_loss": 0.86406279, "learning_rate": 2.31471389539708e-08, "loss": 0.88700724, "num_input_tokens_seen": 171176120, "step": 7925, "time_per_iteration": 2.6046979427337646 }, { "auxiliary_loss_clip": 0.0122482, "auxiliary_loss_mlp": 0.02561092, "balance_loss_clip": 1.04939413, "balance_loss_mlp": 0.99989629, "epoch": 0.953045151205435, "flos": 28658438985600.0, "grad_norm": 2.2810645191404544, "language_loss": 0.72885001, "learning_rate": 2.3029118127619872e-08, "loss": 0.76670909, "num_input_tokens_seen": 171195835, "step": 7926, "time_per_iteration": 2.717371940612793 }, { "auxiliary_loss_clip": 0.01267306, "auxiliary_loss_mlp": 0.01024566, "balance_loss_clip": 1.04609084, "balance_loss_mlp": 1.01751482, "epoch": 0.953165394096074, "flos": 21835232288640.0, "grad_norm": 2.328240085517227, "language_loss": 0.86628079, "learning_rate": 2.2911397206179628e-08, "loss": 0.8891995, "num_input_tokens_seen": 171212585, "step": 7927, "time_per_iteration": 2.67322039604187 }, { "auxiliary_loss_clip": 0.01172927, "auxiliary_loss_mlp": 0.01024356, "balance_loss_clip": 1.0502454, "balance_loss_mlp": 1.01686084, "epoch": 0.9532856369867132, "flos": 19975059682560.0, "grad_norm": 2.6586155641863285, "language_loss": 0.62984848, "learning_rate": 2.279397620750845e-08, "loss": 0.65182132, "num_input_tokens_seen": 171231630, "step": 7928, "time_per_iteration": 3.5344791412353516 }, { "auxiliary_loss_clip": 0.01263898, "auxiliary_loss_mlp": 0.01023161, "balance_loss_clip": 1.04301143, "balance_loss_mlp": 1.01653314, "epoch": 0.9534058798773523, "flos": 15049588239360.0, "grad_norm": 2.7181276444157847, "language_loss": 0.78817028, "learning_rate": 2.2676855149419195e-08, "loss": 0.81104088, "num_input_tokens_seen": 171248800, "step": 7929, "time_per_iteration": 2.615699052810669 }, { "auxiliary_loss_clip": 0.01271609, "auxiliary_loss_mlp": 0.01026099, "balance_loss_clip": 1.05065012, "balance_loss_mlp": 1.0193578, "epoch": 0.9535261227679913, "flos": 17602800831360.0, "grad_norm": 2.4145679829898268, "language_loss": 0.7563861, "learning_rate": 2.2560034049678988e-08, "loss": 0.77936327, "num_input_tokens_seen": 171263150, "step": 7930, "time_per_iteration": 2.580643892288208 }, { "auxiliary_loss_clip": 0.01176472, "auxiliary_loss_mlp": 0.01024285, "balance_loss_clip": 1.05094743, "balance_loss_mlp": 1.01699781, "epoch": 0.9536463656586305, "flos": 23142954741120.0, "grad_norm": 1.8183434837482446, "language_loss": 0.75371516, "learning_rate": 2.2443512926008988e-08, "loss": 0.77572274, "num_input_tokens_seen": 171282480, "step": 7931, "time_per_iteration": 3.539654493331909 }, { "auxiliary_loss_clip": 0.01324487, "auxiliary_loss_mlp": 0.01022698, "balance_loss_clip": 1.04362142, "balance_loss_mlp": 1.01606715, "epoch": 0.9537666085492695, "flos": 18625033987200.0, "grad_norm": 2.7826062644086433, "language_loss": 0.69903219, "learning_rate": 2.2327291796085946e-08, "loss": 0.72250408, "num_input_tokens_seen": 171300840, "step": 7932, "time_per_iteration": 2.6827640533447266 }, { "auxiliary_loss_clip": 0.0117537, "auxiliary_loss_mlp": 0.01025378, "balance_loss_clip": 1.04964125, "balance_loss_mlp": 1.01810634, "epoch": 0.9538868514399086, "flos": 18989347680000.0, "grad_norm": 3.078576226361324, "language_loss": 0.7685715, "learning_rate": 2.2211370677540197e-08, "loss": 0.79057896, "num_input_tokens_seen": 171317365, "step": 7933, "time_per_iteration": 3.539621114730835 }, { "auxiliary_loss_clip": 0.01173992, "auxiliary_loss_mlp": 0.01027507, "balance_loss_clip": 1.04940724, "balance_loss_mlp": 1.02063775, "epoch": 0.9540070943305478, "flos": 16800556521600.0, "grad_norm": 2.3741341954905617, "language_loss": 0.78162867, "learning_rate": 2.2095749587957012e-08, "loss": 0.8036437, "num_input_tokens_seen": 171335270, "step": 7934, "time_per_iteration": 2.579008102416992 }, { "auxiliary_loss_clip": 0.01270751, "auxiliary_loss_mlp": 0.01024802, "balance_loss_clip": 1.04317546, "balance_loss_mlp": 1.01757789, "epoch": 0.9541273372211868, "flos": 20156911263360.0, "grad_norm": 2.004230762694471, "language_loss": 0.69621235, "learning_rate": 2.1980428544876138e-08, "loss": 0.71916795, "num_input_tokens_seen": 171353910, "step": 7935, "time_per_iteration": 2.618901252746582 }, { "auxiliary_loss_clip": 0.01370908, "auxiliary_loss_mlp": 0.01023657, "balance_loss_clip": 1.03809333, "balance_loss_mlp": 1.01668322, "epoch": 0.9542475801118259, "flos": 26725511381760.0, "grad_norm": 1.8025924355702376, "language_loss": 0.74391669, "learning_rate": 2.1865407565791584e-08, "loss": 0.76786232, "num_input_tokens_seen": 171375480, "step": 7936, "time_per_iteration": 2.7850942611694336 }, { "auxiliary_loss_clip": 0.0127219, "auxiliary_loss_mlp": 0.01022185, "balance_loss_clip": 1.04483354, "balance_loss_mlp": 1.01479101, "epoch": 0.954367823002465, "flos": 23330911633920.0, "grad_norm": 3.3679451855893126, "language_loss": 0.77284825, "learning_rate": 2.175068666815183e-08, "loss": 0.79579198, "num_input_tokens_seen": 171396320, "step": 7937, "time_per_iteration": 2.660050868988037 }, { "auxiliary_loss_clip": 0.01323569, "auxiliary_loss_mlp": 0.01030816, "balance_loss_clip": 1.04349697, "balance_loss_mlp": 1.02395594, "epoch": 0.9544880658931041, "flos": 14902713527040.0, "grad_norm": 2.828183917223483, "language_loss": 0.79406023, "learning_rate": 2.163626586935985e-08, "loss": 0.81760412, "num_input_tokens_seen": 171412860, "step": 7938, "time_per_iteration": 2.6824791431427 }, { "auxiliary_loss_clip": 0.01223201, "auxiliary_loss_mlp": 0.01024861, "balance_loss_clip": 1.04666686, "balance_loss_mlp": 1.0178659, "epoch": 0.9546083087837431, "flos": 29095902725760.0, "grad_norm": 3.2781531495277343, "language_loss": 0.62692094, "learning_rate": 2.1522145186773755e-08, "loss": 0.64940155, "num_input_tokens_seen": 171431780, "step": 7939, "time_per_iteration": 3.578819751739502 }, { "auxiliary_loss_clip": 0.0126757, "auxiliary_loss_mlp": 0.01028958, "balance_loss_clip": 1.04460263, "balance_loss_mlp": 1.02252078, "epoch": 0.9547285516743822, "flos": 21142335957120.0, "grad_norm": 1.8803747063053404, "language_loss": 0.85850632, "learning_rate": 2.140832463770481e-08, "loss": 0.88147157, "num_input_tokens_seen": 171450975, "step": 7940, "time_per_iteration": 2.6643025875091553 }, { "auxiliary_loss_clip": 0.01276131, "auxiliary_loss_mlp": 0.01023158, "balance_loss_clip": 1.04561913, "balance_loss_mlp": 1.01596689, "epoch": 0.9548487945650214, "flos": 27490157130240.0, "grad_norm": 2.4508610854659976, "language_loss": 0.75941831, "learning_rate": 2.129480423941987e-08, "loss": 0.78241122, "num_input_tokens_seen": 171467645, "step": 7941, "time_per_iteration": 2.754242181777954 }, { "auxiliary_loss_clip": 0.01172257, "auxiliary_loss_mlp": 0.0102419, "balance_loss_clip": 1.04590189, "balance_loss_mlp": 1.01758003, "epoch": 0.9549690374556604, "flos": 22273198819200.0, "grad_norm": 2.4006556428520827, "language_loss": 0.80193555, "learning_rate": 2.1181584009140052e-08, "loss": 0.8239001, "num_input_tokens_seen": 171487185, "step": 7942, "time_per_iteration": 2.6505181789398193 }, { "auxiliary_loss_clip": 0.01329253, "auxiliary_loss_mlp": 0.01025038, "balance_loss_clip": 1.04531503, "balance_loss_mlp": 1.01796031, "epoch": 0.9550892803462995, "flos": 17595294888960.0, "grad_norm": 2.1299278025103843, "language_loss": 0.83743197, "learning_rate": 2.10686639640405e-08, "loss": 0.86097491, "num_input_tokens_seen": 171501275, "step": 7943, "time_per_iteration": 2.7308623790740967 }, { "auxiliary_loss_clip": 0.01124114, "auxiliary_loss_mlp": 0.0102094, "balance_loss_clip": 1.047683, "balance_loss_mlp": 1.01443172, "epoch": 0.9552095232369386, "flos": 24353144789760.0, "grad_norm": 1.8452083359676565, "language_loss": 0.81117976, "learning_rate": 2.0956044121251294e-08, "loss": 0.83263028, "num_input_tokens_seen": 171520060, "step": 7944, "time_per_iteration": 2.6003708839416504 }, { "auxiliary_loss_clip": 0.01323037, "auxiliary_loss_mlp": 0.01024578, "balance_loss_clip": 1.04966831, "balance_loss_mlp": 1.01725304, "epoch": 0.9553297661275777, "flos": 22746860490240.0, "grad_norm": 2.0094512220159535, "language_loss": 0.80923295, "learning_rate": 2.084372449785654e-08, "loss": 0.83270907, "num_input_tokens_seen": 171539895, "step": 7945, "time_per_iteration": 2.7282297611236572 }, { "auxiliary_loss_clip": 0.01271871, "auxiliary_loss_mlp": 0.01023351, "balance_loss_clip": 1.04399586, "balance_loss_mlp": 1.0163058, "epoch": 0.9554500090182168, "flos": 15413866018560.0, "grad_norm": 1.7116244930347377, "language_loss": 0.68821818, "learning_rate": 2.0731705110895282e-08, "loss": 0.71117032, "num_input_tokens_seen": 171557385, "step": 7946, "time_per_iteration": 2.618002414703369 }, { "auxiliary_loss_clip": 0.01225884, "auxiliary_loss_mlp": 0.01029316, "balance_loss_clip": 1.04941797, "balance_loss_mlp": 1.02158809, "epoch": 0.9555702519088559, "flos": 23513517400320.0, "grad_norm": 1.9829492131995907, "language_loss": 0.86783206, "learning_rate": 2.0619985977360587e-08, "loss": 0.89038408, "num_input_tokens_seen": 171575705, "step": 7947, "time_per_iteration": 2.612980842590332 }, { "auxiliary_loss_clip": 0.0136928, "auxiliary_loss_mlp": 0.01023749, "balance_loss_clip": 1.03939402, "balance_loss_mlp": 1.01715946, "epoch": 0.955690494799495, "flos": 22962072827520.0, "grad_norm": 1.8278700797881262, "language_loss": 0.76770216, "learning_rate": 2.0508567114200237e-08, "loss": 0.79163241, "num_input_tokens_seen": 171595620, "step": 7948, "time_per_iteration": 2.8509252071380615 }, { "auxiliary_loss_clip": 0.01274131, "auxiliary_loss_mlp": 0.01021024, "balance_loss_clip": 1.0457741, "balance_loss_mlp": 1.01446486, "epoch": 0.955810737690134, "flos": 26031250333440.0, "grad_norm": 2.5841543967902316, "language_loss": 0.79247302, "learning_rate": 2.0397448538316485e-08, "loss": 0.81542456, "num_input_tokens_seen": 171616660, "step": 7949, "time_per_iteration": 2.7303056716918945 }, { "auxiliary_loss_clip": 0.01320368, "auxiliary_loss_mlp": 0.01027084, "balance_loss_clip": 1.04492891, "balance_loss_mlp": 1.02008617, "epoch": 0.9559309805807732, "flos": 20849951249280.0, "grad_norm": 2.0232244568218745, "language_loss": 0.66766703, "learning_rate": 2.028663026656563e-08, "loss": 0.69114155, "num_input_tokens_seen": 171635515, "step": 7950, "time_per_iteration": 2.6849939823150635 }, { "auxiliary_loss_clip": 0.01172579, "auxiliary_loss_mlp": 0.02565301, "balance_loss_clip": 1.04995632, "balance_loss_mlp": 0.9999162, "epoch": 0.9560512234714122, "flos": 21578219498880.0, "grad_norm": 1.9189074658067655, "language_loss": 0.71874201, "learning_rate": 2.0176112315758885e-08, "loss": 0.7561208, "num_input_tokens_seen": 171653305, "step": 7951, "time_per_iteration": 2.574220895767212 }, { "auxiliary_loss_clip": 0.01375177, "auxiliary_loss_mlp": 0.01025254, "balance_loss_clip": 1.04576087, "balance_loss_mlp": 1.01793408, "epoch": 0.9561714663620513, "flos": 17450144029440.0, "grad_norm": 4.5189985138413284, "language_loss": 0.69003671, "learning_rate": 2.0065894702661957e-08, "loss": 0.71404099, "num_input_tokens_seen": 171669980, "step": 7952, "time_per_iteration": 2.7664084434509277 }, { "auxiliary_loss_clip": 0.01317206, "auxiliary_loss_mlp": 0.02567385, "balance_loss_clip": 1.04134786, "balance_loss_mlp": 0.99991977, "epoch": 0.9562917092526905, "flos": 26098510550400.0, "grad_norm": 1.7798972250302554, "language_loss": 0.78303099, "learning_rate": 1.9955977443994577e-08, "loss": 0.82187694, "num_input_tokens_seen": 171689970, "step": 7953, "time_per_iteration": 2.7336721420288086 }, { "auxiliary_loss_clip": 0.01277228, "auxiliary_loss_mlp": 0.01029587, "balance_loss_clip": 1.04718661, "balance_loss_mlp": 1.02148032, "epoch": 0.9564119521433295, "flos": 24096742531200.0, "grad_norm": 2.5739729363265242, "language_loss": 0.62341881, "learning_rate": 1.9846360556430965e-08, "loss": 0.64648694, "num_input_tokens_seen": 171708270, "step": 7954, "time_per_iteration": 2.8120670318603516 }, { "auxiliary_loss_clip": 0.01172157, "auxiliary_loss_mlp": 0.0102351, "balance_loss_clip": 1.04915905, "balance_loss_mlp": 1.0168283, "epoch": 0.9565321950339686, "flos": 32008903896960.0, "grad_norm": 2.2150691759150583, "language_loss": 0.61562878, "learning_rate": 1.973704405660004e-08, "loss": 0.63758546, "num_input_tokens_seen": 171729385, "step": 7955, "time_per_iteration": 3.568995475769043 }, { "auxiliary_loss_clip": 0.01416375, "auxiliary_loss_mlp": 0.01027409, "balance_loss_clip": 1.04120994, "balance_loss_mlp": 1.02073884, "epoch": 0.9566524379246077, "flos": 23588642695680.0, "grad_norm": 1.9936397015207787, "language_loss": 0.78086323, "learning_rate": 1.9628027961085203e-08, "loss": 0.80530107, "num_input_tokens_seen": 171752615, "step": 7956, "time_per_iteration": 3.741464138031006 }, { "auxiliary_loss_clip": 0.01313068, "auxiliary_loss_mlp": 0.01025221, "balance_loss_clip": 1.03936601, "balance_loss_mlp": 1.01819658, "epoch": 0.9567726808152468, "flos": 38067716240640.0, "grad_norm": 1.9503820050021772, "language_loss": 0.8365128, "learning_rate": 1.9519312286423894e-08, "loss": 0.85989571, "num_input_tokens_seen": 171775810, "step": 7957, "time_per_iteration": 2.838115692138672 }, { "auxiliary_loss_clip": 0.01216621, "auxiliary_loss_mlp": 0.01023559, "balance_loss_clip": 1.04634511, "balance_loss_mlp": 1.01689875, "epoch": 0.9568929237058859, "flos": 22744059229440.0, "grad_norm": 1.6024066104172787, "language_loss": 0.77930462, "learning_rate": 1.9410897049108255e-08, "loss": 0.80170643, "num_input_tokens_seen": 171795090, "step": 7958, "time_per_iteration": 2.583789825439453 }, { "auxiliary_loss_clip": 0.01178101, "auxiliary_loss_mlp": 0.01029882, "balance_loss_clip": 1.0518465, "balance_loss_mlp": 1.02236545, "epoch": 0.957013166596525, "flos": 23841633162240.0, "grad_norm": 1.8004896964857606, "language_loss": 0.91047442, "learning_rate": 1.9302782265584905e-08, "loss": 0.93255419, "num_input_tokens_seen": 171815755, "step": 7959, "time_per_iteration": 3.5316457748413086 }, { "auxiliary_loss_clip": 0.01365997, "auxiliary_loss_mlp": 0.01027599, "balance_loss_clip": 1.04448295, "balance_loss_mlp": 1.02085435, "epoch": 0.9571334094871641, "flos": 17639286071040.0, "grad_norm": 2.776038993437981, "language_loss": 0.87115633, "learning_rate": 1.9194967952254282e-08, "loss": 0.89509225, "num_input_tokens_seen": 171834330, "step": 7960, "time_per_iteration": 2.677776336669922 }, { "auxiliary_loss_clip": 0.01220232, "auxiliary_loss_mlp": 0.01030935, "balance_loss_clip": 1.04889345, "balance_loss_mlp": 1.02444363, "epoch": 0.9572536523778031, "flos": 15369623441280.0, "grad_norm": 3.3833993752261358, "language_loss": 0.80833483, "learning_rate": 1.9087454125472635e-08, "loss": 0.83084649, "num_input_tokens_seen": 171848805, "step": 7961, "time_per_iteration": 2.5659666061401367 }, { "auxiliary_loss_clip": 0.01172724, "auxiliary_loss_mlp": 0.01027047, "balance_loss_clip": 1.0486362, "balance_loss_mlp": 1.02012646, "epoch": 0.9573738952684423, "flos": 24969838417920.0, "grad_norm": 1.8408963451253924, "language_loss": 0.78196877, "learning_rate": 1.8980240801548696e-08, "loss": 0.80396652, "num_input_tokens_seen": 171867995, "step": 7962, "time_per_iteration": 2.660219430923462 }, { "auxiliary_loss_clip": 0.01266735, "auxiliary_loss_mlp": 0.01030635, "balance_loss_clip": 1.04788876, "balance_loss_mlp": 1.02432585, "epoch": 0.9574941381590814, "flos": 25769461034880.0, "grad_norm": 1.963082046522566, "language_loss": 0.74171257, "learning_rate": 1.8873327996747458e-08, "loss": 0.76468635, "num_input_tokens_seen": 171886495, "step": 7963, "time_per_iteration": 2.686218023300171 }, { "auxiliary_loss_clip": 0.01224351, "auxiliary_loss_mlp": 0.01025823, "balance_loss_clip": 1.04609573, "balance_loss_mlp": 1.01889122, "epoch": 0.9576143810497204, "flos": 32307178435200.0, "grad_norm": 2.146396767234672, "language_loss": 0.65878403, "learning_rate": 1.8766715727287053e-08, "loss": 0.68128574, "num_input_tokens_seen": 171908200, "step": 7964, "time_per_iteration": 3.5874691009521484 }, { "auxiliary_loss_clip": 0.0122724, "auxiliary_loss_mlp": 0.0256887, "balance_loss_clip": 1.04852915, "balance_loss_mlp": 0.99992514, "epoch": 0.9577346239403596, "flos": 27745733376000.0, "grad_norm": 1.8152557456259737, "language_loss": 0.79574621, "learning_rate": 1.8660404009340546e-08, "loss": 0.83370721, "num_input_tokens_seen": 171928650, "step": 7965, "time_per_iteration": 2.658733367919922 }, { "auxiliary_loss_clip": 0.0111496, "auxiliary_loss_mlp": 0.01000115, "balance_loss_clip": 1.00538397, "balance_loss_mlp": 0.99931067, "epoch": 0.9578548668309986, "flos": 57468313710720.0, "grad_norm": 0.870600881988479, "language_loss": 0.59546304, "learning_rate": 1.8554392859035485e-08, "loss": 0.61661375, "num_input_tokens_seen": 171986400, "step": 7966, "time_per_iteration": 3.1773815155029297 }, { "auxiliary_loss_clip": 0.01469028, "auxiliary_loss_mlp": 0.01027913, "balance_loss_clip": 1.03913295, "balance_loss_mlp": 1.0210644, "epoch": 0.9579751097216377, "flos": 19756040503680.0, "grad_norm": 1.9166694192507059, "language_loss": 0.79193997, "learning_rate": 1.8448682292453444e-08, "loss": 0.81690937, "num_input_tokens_seen": 172005475, "step": 7967, "time_per_iteration": 2.807110548019409 }, { "auxiliary_loss_clip": 0.01172925, "auxiliary_loss_mlp": 0.01026089, "balance_loss_clip": 1.04909658, "balance_loss_mlp": 1.01927066, "epoch": 0.9580953526122769, "flos": 18041270152320.0, "grad_norm": 1.7620294599513378, "language_loss": 0.65997505, "learning_rate": 1.8343272325631154e-08, "loss": 0.68196511, "num_input_tokens_seen": 172024420, "step": 7968, "time_per_iteration": 2.565525770187378 }, { "auxiliary_loss_clip": 0.01475177, "auxiliary_loss_mlp": 0.02568875, "balance_loss_clip": 1.04333401, "balance_loss_mlp": 0.99990082, "epoch": 0.9582155955029159, "flos": 24270154416000.0, "grad_norm": 2.3615611582880778, "language_loss": 0.7788074, "learning_rate": 1.8238162974558492e-08, "loss": 0.81924796, "num_input_tokens_seen": 172038350, "step": 7969, "time_per_iteration": 2.828733205795288 }, { "auxiliary_loss_clip": 0.01267304, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 1.04611993, "balance_loss_mlp": 1.02402925, "epoch": 0.958335838393555, "flos": 22783309816320.0, "grad_norm": 2.6866496422184447, "language_loss": 0.74985814, "learning_rate": 1.8133354255181144e-08, "loss": 0.77283728, "num_input_tokens_seen": 172058665, "step": 7970, "time_per_iteration": 2.6623237133026123 }, { "auxiliary_loss_clip": 0.01221893, "auxiliary_loss_mlp": 0.01023272, "balance_loss_clip": 1.04614902, "balance_loss_mlp": 1.01633441, "epoch": 0.958456081284194, "flos": 16911484698240.0, "grad_norm": 1.7563807989600058, "language_loss": 0.74637592, "learning_rate": 1.802884618339795e-08, "loss": 0.76882762, "num_input_tokens_seen": 172077470, "step": 7971, "time_per_iteration": 2.6214537620544434 }, { "auxiliary_loss_clip": 0.01223678, "auxiliary_loss_mlp": 0.01026584, "balance_loss_clip": 1.05054355, "balance_loss_mlp": 1.01938367, "epoch": 0.9585763241748332, "flos": 19974951941760.0, "grad_norm": 3.1293283736446917, "language_loss": 0.80873692, "learning_rate": 1.7924638775062894e-08, "loss": 0.83123958, "num_input_tokens_seen": 172096590, "step": 7972, "time_per_iteration": 2.6076314449310303 }, { "auxiliary_loss_clip": 0.01311484, "auxiliary_loss_mlp": 0.01026442, "balance_loss_clip": 1.04500771, "balance_loss_mlp": 1.02021635, "epoch": 0.9586965670654722, "flos": 21395649646080.0, "grad_norm": 2.1519518299768423, "language_loss": 0.8161037, "learning_rate": 1.7820732045984444e-08, "loss": 0.83948296, "num_input_tokens_seen": 172116735, "step": 7973, "time_per_iteration": 2.7457802295684814 }, { "auxiliary_loss_clip": 0.01222438, "auxiliary_loss_mlp": 0.01023654, "balance_loss_clip": 1.04676139, "balance_loss_mlp": 1.01638556, "epoch": 0.9588168099561113, "flos": 21435115714560.0, "grad_norm": 2.244986768321738, "language_loss": 0.74209517, "learning_rate": 1.7717126011924655e-08, "loss": 0.76455611, "num_input_tokens_seen": 172138320, "step": 7974, "time_per_iteration": 2.6108946800231934 }, { "auxiliary_loss_clip": 0.01366353, "auxiliary_loss_mlp": 0.01025562, "balance_loss_clip": 1.03825676, "balance_loss_mlp": 1.018466, "epoch": 0.9589370528467505, "flos": 11763761852160.0, "grad_norm": 2.8869007192551703, "language_loss": 0.76417172, "learning_rate": 1.7613820688600957e-08, "loss": 0.78809083, "num_input_tokens_seen": 172154225, "step": 7975, "time_per_iteration": 2.7022299766540527 }, { "auxiliary_loss_clip": 0.01178862, "auxiliary_loss_mlp": 0.01031201, "balance_loss_clip": 1.04684937, "balance_loss_mlp": 1.02387571, "epoch": 0.9590572957373895, "flos": 23441516588160.0, "grad_norm": 1.9426772540590664, "language_loss": 0.78854054, "learning_rate": 1.7510816091684588e-08, "loss": 0.81064117, "num_input_tokens_seen": 172174150, "step": 7976, "time_per_iteration": 2.6314632892608643 }, { "auxiliary_loss_clip": 0.01271952, "auxiliary_loss_mlp": 0.01022258, "balance_loss_clip": 1.04812622, "balance_loss_mlp": 1.01522183, "epoch": 0.9591775386280286, "flos": 22528272274560.0, "grad_norm": 3.410570350806108, "language_loss": 0.78940141, "learning_rate": 1.740811223680083e-08, "loss": 0.81234348, "num_input_tokens_seen": 172191005, "step": 7977, "time_per_iteration": 2.6872122287750244 }, { "auxiliary_loss_clip": 0.01173475, "auxiliary_loss_mlp": 0.01027805, "balance_loss_clip": 1.04912591, "balance_loss_mlp": 1.02078021, "epoch": 0.9592977815186677, "flos": 18186959715840.0, "grad_norm": 3.138113284048057, "language_loss": 0.74312019, "learning_rate": 1.7305709139530334e-08, "loss": 0.76513296, "num_input_tokens_seen": 172209785, "step": 7978, "time_per_iteration": 2.613074779510498 }, { "auxiliary_loss_clip": 0.01219521, "auxiliary_loss_mlp": 0.01027225, "balance_loss_clip": 1.04529965, "balance_loss_mlp": 1.02054, "epoch": 0.9594180244093068, "flos": 16537797555840.0, "grad_norm": 2.3431478704037905, "language_loss": 0.74459267, "learning_rate": 1.7203606815407334e-08, "loss": 0.7670601, "num_input_tokens_seen": 172224380, "step": 7979, "time_per_iteration": 2.5804033279418945 }, { "auxiliary_loss_clip": 0.01278109, "auxiliary_loss_mlp": 0.01026058, "balance_loss_clip": 1.04837084, "balance_loss_mlp": 1.01824999, "epoch": 0.9595382672999458, "flos": 20554334317440.0, "grad_norm": 1.899751300715834, "language_loss": 0.79582465, "learning_rate": 1.7101805279920557e-08, "loss": 0.81886631, "num_input_tokens_seen": 172242540, "step": 7980, "time_per_iteration": 3.511331081390381 }, { "auxiliary_loss_clip": 0.01174965, "auxiliary_loss_mlp": 0.01028818, "balance_loss_clip": 1.05115008, "balance_loss_mlp": 1.021415, "epoch": 0.959658510190585, "flos": 22638266697600.0, "grad_norm": 2.4891130123881515, "language_loss": 0.81137204, "learning_rate": 1.7000304548513643e-08, "loss": 0.83340991, "num_input_tokens_seen": 172262645, "step": 7981, "time_per_iteration": 2.600139617919922 }, { "auxiliary_loss_clip": 0.01321368, "auxiliary_loss_mlp": 0.01028926, "balance_loss_clip": 1.04378831, "balance_loss_mlp": 1.02238762, "epoch": 0.9597787530812241, "flos": 19135252725120.0, "grad_norm": 2.7033997662228546, "language_loss": 0.82428539, "learning_rate": 1.6899104636583394e-08, "loss": 0.84778833, "num_input_tokens_seen": 172280695, "step": 7982, "time_per_iteration": 2.6944420337677 }, { "auxiliary_loss_clip": 0.0111595, "auxiliary_loss_mlp": 0.01003074, "balance_loss_clip": 1.0057627, "balance_loss_mlp": 1.00222778, "epoch": 0.9598989959718631, "flos": 60098124055680.0, "grad_norm": 0.7314755389977237, "language_loss": 0.61875826, "learning_rate": 1.6798205559482638e-08, "loss": 0.63994849, "num_input_tokens_seen": 172343075, "step": 7983, "time_per_iteration": 4.220173120498657 }, { "auxiliary_loss_clip": 0.01328621, "auxiliary_loss_mlp": 0.01025317, "balance_loss_clip": 1.04870319, "balance_loss_mlp": 1.01852489, "epoch": 0.9600192388625023, "flos": 20886795624960.0, "grad_norm": 1.8707217500142637, "language_loss": 0.76362097, "learning_rate": 1.669760733251713e-08, "loss": 0.78716034, "num_input_tokens_seen": 172361950, "step": 7984, "time_per_iteration": 2.712400436401367 }, { "auxiliary_loss_clip": 0.01321108, "auxiliary_loss_mlp": 0.01025966, "balance_loss_clip": 1.040887, "balance_loss_mlp": 1.01955605, "epoch": 0.9601394817531413, "flos": 20445740524800.0, "grad_norm": 1.8280571409262683, "language_loss": 0.82260835, "learning_rate": 1.659730997094755e-08, "loss": 0.84607911, "num_input_tokens_seen": 172380440, "step": 7985, "time_per_iteration": 3.6606149673461914 }, { "auxiliary_loss_clip": 0.01217083, "auxiliary_loss_mlp": 0.01022707, "balance_loss_clip": 1.04581881, "balance_loss_mlp": 1.01635575, "epoch": 0.9602597246437804, "flos": 21507152440320.0, "grad_norm": 2.467923902149448, "language_loss": 0.62149954, "learning_rate": 1.6497313489989283e-08, "loss": 0.64389741, "num_input_tokens_seen": 172400265, "step": 7986, "time_per_iteration": 2.617973566055298 }, { "auxiliary_loss_clip": 0.01372902, "auxiliary_loss_mlp": 0.01020924, "balance_loss_clip": 1.03690743, "balance_loss_mlp": 1.01411414, "epoch": 0.9603799675344196, "flos": 29935099152000.0, "grad_norm": 4.234284893997262, "language_loss": 0.70179027, "learning_rate": 1.639761790481131e-08, "loss": 0.72572851, "num_input_tokens_seen": 172421145, "step": 7987, "time_per_iteration": 2.77366042137146 }, { "auxiliary_loss_clip": 0.01123781, "auxiliary_loss_mlp": 0.0102747, "balance_loss_clip": 1.04923379, "balance_loss_mlp": 1.02073467, "epoch": 0.9605002104250586, "flos": 28001525103360.0, "grad_norm": 1.9775954121737436, "language_loss": 0.79359329, "learning_rate": 1.6298223230537754e-08, "loss": 0.81510574, "num_input_tokens_seen": 172438945, "step": 7988, "time_per_iteration": 2.6222193241119385 }, { "auxiliary_loss_clip": 0.01270103, "auxiliary_loss_mlp": 0.02568408, "balance_loss_clip": 1.04674983, "balance_loss_mlp": 0.99990034, "epoch": 0.9606204533156977, "flos": 35590490870400.0, "grad_norm": 2.148295110983144, "language_loss": 0.6946851, "learning_rate": 1.619912948224611e-08, "loss": 0.73307025, "num_input_tokens_seen": 172460150, "step": 7989, "time_per_iteration": 2.80619215965271 }, { "auxiliary_loss_clip": 0.01320112, "auxiliary_loss_mlp": 0.01025915, "balance_loss_clip": 1.04422688, "balance_loss_mlp": 1.01923048, "epoch": 0.9607406962063368, "flos": 26574614346240.0, "grad_norm": 2.906249277793569, "language_loss": 0.60873705, "learning_rate": 1.6100336674969682e-08, "loss": 0.63219726, "num_input_tokens_seen": 172478990, "step": 7990, "time_per_iteration": 3.648334264755249 }, { "auxiliary_loss_clip": 0.01272081, "auxiliary_loss_mlp": 0.01026188, "balance_loss_clip": 1.03987813, "balance_loss_mlp": 1.01944935, "epoch": 0.9608609390969759, "flos": 25331781813120.0, "grad_norm": 1.8832657676782472, "language_loss": 0.76713121, "learning_rate": 1.600184482369449e-08, "loss": 0.79011393, "num_input_tokens_seen": 172498905, "step": 7991, "time_per_iteration": 2.7324020862579346 }, { "auxiliary_loss_clip": 0.01322214, "auxiliary_loss_mlp": 0.01027494, "balance_loss_clip": 1.04382849, "balance_loss_mlp": 1.02037454, "epoch": 0.960981181987615, "flos": 21069114082560.0, "grad_norm": 2.5458002733173832, "language_loss": 0.89116263, "learning_rate": 1.5903653943362126e-08, "loss": 0.91465974, "num_input_tokens_seen": 172517900, "step": 7992, "time_per_iteration": 2.7397284507751465 }, { "auxiliary_loss_clip": 0.01273453, "auxiliary_loss_mlp": 0.010284, "balance_loss_clip": 1.04776049, "balance_loss_mlp": 1.02159894, "epoch": 0.9611014248782541, "flos": 17823256554240.0, "grad_norm": 2.231809182616771, "language_loss": 0.77018934, "learning_rate": 1.580576404886802e-08, "loss": 0.79320782, "num_input_tokens_seen": 172536430, "step": 7993, "time_per_iteration": 2.669149160385132 }, { "auxiliary_loss_clip": 0.01222896, "auxiliary_loss_mlp": 0.0102652, "balance_loss_clip": 1.047822, "balance_loss_mlp": 1.01978135, "epoch": 0.9612216677688932, "flos": 19354631040000.0, "grad_norm": 2.2713394546658985, "language_loss": 0.79782224, "learning_rate": 1.570817515506162e-08, "loss": 0.82031643, "num_input_tokens_seen": 172555120, "step": 7994, "time_per_iteration": 2.6409096717834473 }, { "auxiliary_loss_clip": 0.01172021, "auxiliary_loss_mlp": 0.01027268, "balance_loss_clip": 1.04945958, "balance_loss_mlp": 1.02120042, "epoch": 0.9613419106595322, "flos": 15808739207040.0, "grad_norm": 2.1518083322673114, "language_loss": 0.81457639, "learning_rate": 1.561088727674753e-08, "loss": 0.83656925, "num_input_tokens_seen": 172569330, "step": 7995, "time_per_iteration": 2.552351713180542 }, { "auxiliary_loss_clip": 0.01384623, "auxiliary_loss_mlp": 0.01028097, "balance_loss_clip": 1.04476237, "balance_loss_mlp": 1.02058411, "epoch": 0.9614621535501714, "flos": 25702488126720.0, "grad_norm": 2.604159877640764, "language_loss": 0.71319067, "learning_rate": 1.551390042868417e-08, "loss": 0.73731786, "num_input_tokens_seen": 172591100, "step": 7996, "time_per_iteration": 2.7985424995422363 }, { "auxiliary_loss_clip": 0.01224724, "auxiliary_loss_mlp": 0.01026884, "balance_loss_clip": 1.04973245, "balance_loss_mlp": 1.02047086, "epoch": 0.9615823964408104, "flos": 17819054663040.0, "grad_norm": 2.026286008614578, "language_loss": 0.71022904, "learning_rate": 1.5417214625584207e-08, "loss": 0.73274505, "num_input_tokens_seen": 172608755, "step": 7997, "time_per_iteration": 2.606011390686035 }, { "auxiliary_loss_clip": 0.01219157, "auxiliary_loss_mlp": 0.01031431, "balance_loss_clip": 1.04491603, "balance_loss_mlp": 1.02468967, "epoch": 0.9617026393314495, "flos": 20190020624640.0, "grad_norm": 1.6580307327368407, "language_loss": 0.85405791, "learning_rate": 1.5320829882114806e-08, "loss": 0.87656379, "num_input_tokens_seen": 172626830, "step": 7998, "time_per_iteration": 2.6332411766052246 }, { "auxiliary_loss_clip": 0.01171859, "auxiliary_loss_mlp": 0.01023888, "balance_loss_clip": 1.04655695, "balance_loss_mlp": 1.01694369, "epoch": 0.9618228822220887, "flos": 20267013427200.0, "grad_norm": 1.9960147379831428, "language_loss": 0.79073739, "learning_rate": 1.5224746212897378e-08, "loss": 0.81269485, "num_input_tokens_seen": 172646125, "step": 7999, "time_per_iteration": 2.5939931869506836 }, { "auxiliary_loss_clip": 0.01168634, "auxiliary_loss_mlp": 0.0102404, "balance_loss_clip": 1.04793453, "balance_loss_mlp": 1.01762629, "epoch": 0.9619431251127277, "flos": 21031300039680.0, "grad_norm": 1.9827202244288054, "language_loss": 0.77409637, "learning_rate": 1.512896363250804e-08, "loss": 0.79602307, "num_input_tokens_seen": 172666235, "step": 8000, "time_per_iteration": 2.635939121246338 }, { "auxiliary_loss_clip": 0.01222893, "auxiliary_loss_mlp": 0.01024553, "balance_loss_clip": 1.04572463, "balance_loss_mlp": 1.01771343, "epoch": 0.9620633680033668, "flos": 22382654538240.0, "grad_norm": 1.912805621488122, "language_loss": 0.76038957, "learning_rate": 1.503348215547673e-08, "loss": 0.78286409, "num_input_tokens_seen": 172687325, "step": 8001, "time_per_iteration": 2.6445906162261963 }, { "auxiliary_loss_clip": 0.01265153, "auxiliary_loss_mlp": 0.01021875, "balance_loss_clip": 1.04406977, "balance_loss_mlp": 1.01511288, "epoch": 0.962183610894006, "flos": 18471730740480.0, "grad_norm": 3.128259983882231, "language_loss": 0.80799884, "learning_rate": 1.4938301796288078e-08, "loss": 0.83086908, "num_input_tokens_seen": 172703895, "step": 8002, "time_per_iteration": 2.6102638244628906 }, { "auxiliary_loss_clip": 0.01174433, "auxiliary_loss_mlp": 0.01026238, "balance_loss_clip": 1.05022836, "balance_loss_mlp": 1.01937759, "epoch": 0.962303853784645, "flos": 18435245500800.0, "grad_norm": 4.5197700017498015, "language_loss": 0.81570733, "learning_rate": 1.4843422569380537e-08, "loss": 0.83771402, "num_input_tokens_seen": 172720650, "step": 8003, "time_per_iteration": 2.572871685028076 }, { "auxiliary_loss_clip": 0.0126705, "auxiliary_loss_mlp": 0.01024512, "balance_loss_clip": 1.0395236, "balance_loss_mlp": 1.01757371, "epoch": 0.9624240966752841, "flos": 26391074826240.0, "grad_norm": 1.9145786323281435, "language_loss": 0.82879144, "learning_rate": 1.4748844489147483e-08, "loss": 0.8517071, "num_input_tokens_seen": 172737640, "step": 8004, "time_per_iteration": 2.7164347171783447 }, { "auxiliary_loss_clip": 0.0126845, "auxiliary_loss_mlp": 0.01021568, "balance_loss_clip": 1.04453063, "balance_loss_mlp": 1.01500273, "epoch": 0.9625443395659231, "flos": 14647675985280.0, "grad_norm": 1.8409563010984247, "language_loss": 0.71444297, "learning_rate": 1.4654567569936326e-08, "loss": 0.73734307, "num_input_tokens_seen": 172755215, "step": 8005, "time_per_iteration": 2.6631295680999756 }, { "auxiliary_loss_clip": 0.01365608, "auxiliary_loss_mlp": 0.01025249, "balance_loss_clip": 1.04230797, "balance_loss_mlp": 1.01822424, "epoch": 0.9626645824565623, "flos": 18367626147840.0, "grad_norm": 2.5744475587937816, "language_loss": 0.82988107, "learning_rate": 1.456059182604874e-08, "loss": 0.85378969, "num_input_tokens_seen": 172774020, "step": 8006, "time_per_iteration": 3.638010025024414 }, { "auxiliary_loss_clip": 0.01173285, "auxiliary_loss_mlp": 0.01024243, "balance_loss_clip": 1.04921007, "balance_loss_mlp": 1.01707578, "epoch": 0.9627848253472013, "flos": 16580424021120.0, "grad_norm": 2.0750819151882096, "language_loss": 0.76562893, "learning_rate": 1.4466917271740653e-08, "loss": 0.78760421, "num_input_tokens_seen": 172792220, "step": 8007, "time_per_iteration": 2.692897081375122 }, { "auxiliary_loss_clip": 0.01270082, "auxiliary_loss_mlp": 0.01025778, "balance_loss_clip": 1.04458308, "balance_loss_mlp": 1.01754618, "epoch": 0.9629050682378404, "flos": 20886867452160.0, "grad_norm": 2.8519348277920207, "language_loss": 0.67852074, "learning_rate": 1.4373543921222697e-08, "loss": 0.70147932, "num_input_tokens_seen": 172811805, "step": 8008, "time_per_iteration": 2.6505165100097656 }, { "auxiliary_loss_clip": 0.01267027, "auxiliary_loss_mlp": 0.01023253, "balance_loss_clip": 1.04563951, "balance_loss_mlp": 1.01634443, "epoch": 0.9630253111284796, "flos": 17019252478080.0, "grad_norm": 1.9399620880679498, "language_loss": 0.77914882, "learning_rate": 1.428047178865932e-08, "loss": 0.8020516, "num_input_tokens_seen": 172828595, "step": 8009, "time_per_iteration": 3.584616184234619 }, { "auxiliary_loss_clip": 0.01273137, "auxiliary_loss_mlp": 0.01022775, "balance_loss_clip": 1.04342496, "balance_loss_mlp": 1.01581621, "epoch": 0.9631455540191186, "flos": 20338942412160.0, "grad_norm": 1.9380920742761807, "language_loss": 0.74800581, "learning_rate": 1.4187700888169451e-08, "loss": 0.77096498, "num_input_tokens_seen": 172847770, "step": 8010, "time_per_iteration": 2.695824384689331 }, { "auxiliary_loss_clip": 0.01109894, "auxiliary_loss_mlp": 0.01001452, "balance_loss_clip": 1.00577116, "balance_loss_mlp": 1.00062358, "epoch": 0.9632657969097577, "flos": 65956700033280.0, "grad_norm": 0.7580303290131469, "language_loss": 0.57006705, "learning_rate": 1.40952312338265e-08, "loss": 0.59118056, "num_input_tokens_seen": 172912415, "step": 8011, "time_per_iteration": 4.156641960144043 }, { "auxiliary_loss_clip": 0.01321676, "auxiliary_loss_mlp": 0.01026589, "balance_loss_clip": 1.0438062, "balance_loss_mlp": 1.02005076, "epoch": 0.9633860398003968, "flos": 44419523823360.0, "grad_norm": 2.4013239899865155, "language_loss": 0.68457973, "learning_rate": 1.4003062839657909e-08, "loss": 0.70806241, "num_input_tokens_seen": 172934895, "step": 8012, "time_per_iteration": 2.900498867034912 }, { "auxiliary_loss_clip": 0.0122222, "auxiliary_loss_mlp": 0.01022332, "balance_loss_clip": 1.04465151, "balance_loss_mlp": 1.01600158, "epoch": 0.9635062826910359, "flos": 24827704300800.0, "grad_norm": 1.7812788110584818, "language_loss": 0.80174255, "learning_rate": 1.391119571964583e-08, "loss": 0.82418811, "num_input_tokens_seen": 172955835, "step": 8013, "time_per_iteration": 2.730473518371582 }, { "auxiliary_loss_clip": 0.0121724, "auxiliary_loss_mlp": 0.01027246, "balance_loss_clip": 1.04601717, "balance_loss_mlp": 1.02059102, "epoch": 0.9636265255816749, "flos": 15961360095360.0, "grad_norm": 2.0479436791996877, "language_loss": 0.73074442, "learning_rate": 1.3819629887726225e-08, "loss": 0.75318933, "num_input_tokens_seen": 172973925, "step": 8014, "time_per_iteration": 2.6018919944763184 }, { "auxiliary_loss_clip": 0.0127583, "auxiliary_loss_mlp": 0.01028437, "balance_loss_clip": 1.04663873, "balance_loss_mlp": 1.02052426, "epoch": 0.9637467684723141, "flos": 22601781457920.0, "grad_norm": 1.8185009793684477, "language_loss": 0.76689059, "learning_rate": 1.3728365357789317e-08, "loss": 0.7899332, "num_input_tokens_seen": 172993290, "step": 8015, "time_per_iteration": 2.670301914215088 }, { "auxiliary_loss_clip": 0.01310307, "auxiliary_loss_mlp": 0.01025472, "balance_loss_clip": 1.03954709, "balance_loss_mlp": 1.01836979, "epoch": 0.9638670113629532, "flos": 17565812801280.0, "grad_norm": 2.731551973201352, "language_loss": 0.76460171, "learning_rate": 1.3637402143680254e-08, "loss": 0.78795946, "num_input_tokens_seen": 173008190, "step": 8016, "time_per_iteration": 3.6302926540374756 }, { "auxiliary_loss_clip": 0.01113605, "auxiliary_loss_mlp": 0.01002642, "balance_loss_clip": 1.00692463, "balance_loss_mlp": 1.00170028, "epoch": 0.9639872542535922, "flos": 55072139379840.0, "grad_norm": 0.7282129443940352, "language_loss": 0.55094671, "learning_rate": 1.3546740259197998e-08, "loss": 0.5721091, "num_input_tokens_seen": 173061000, "step": 8017, "time_per_iteration": 3.2555418014526367 }, { "auxiliary_loss_clip": 0.01271542, "auxiliary_loss_mlp": 0.0102826, "balance_loss_clip": 1.04508042, "balance_loss_mlp": 1.02081549, "epoch": 0.9641074971442314, "flos": 24134484746880.0, "grad_norm": 2.1284800870413965, "language_loss": 0.7004326, "learning_rate": 1.3456379718095989e-08, "loss": 0.72343063, "num_input_tokens_seen": 173081415, "step": 8018, "time_per_iteration": 2.709378957748413 }, { "auxiliary_loss_clip": 0.01166532, "auxiliary_loss_mlp": 0.01000207, "balance_loss_clip": 1.0063076, "balance_loss_mlp": 0.99924129, "epoch": 0.9642277400348704, "flos": 66747416077440.0, "grad_norm": 0.835289741210995, "language_loss": 0.61960506, "learning_rate": 1.3366320534081487e-08, "loss": 0.64127243, "num_input_tokens_seen": 173144095, "step": 8019, "time_per_iteration": 3.245222568511963 }, { "auxiliary_loss_clip": 0.01221218, "auxiliary_loss_mlp": 0.01023296, "balance_loss_clip": 1.04539585, "balance_loss_mlp": 1.01586628, "epoch": 0.9643479829255095, "flos": 30920272450560.0, "grad_norm": 2.6085242201752687, "language_loss": 0.75741947, "learning_rate": 1.3276562720816675e-08, "loss": 0.77986461, "num_input_tokens_seen": 173165605, "step": 8020, "time_per_iteration": 2.762338876724243 }, { "auxiliary_loss_clip": 0.0117211, "auxiliary_loss_mlp": 0.01023186, "balance_loss_clip": 1.04718339, "balance_loss_mlp": 1.01588476, "epoch": 0.9644682258161487, "flos": 20048245643520.0, "grad_norm": 2.691062703249326, "language_loss": 0.82724154, "learning_rate": 1.3187106291917549e-08, "loss": 0.84919447, "num_input_tokens_seen": 173182595, "step": 8021, "time_per_iteration": 2.6079702377319336 }, { "auxiliary_loss_clip": 0.01215886, "auxiliary_loss_mlp": 0.01026931, "balance_loss_clip": 1.0458076, "balance_loss_mlp": 1.02061629, "epoch": 0.9645884687067877, "flos": 21178713456000.0, "grad_norm": 1.8844740063849523, "language_loss": 0.70944238, "learning_rate": 1.309795126095503e-08, "loss": 0.73187047, "num_input_tokens_seen": 173200895, "step": 8022, "time_per_iteration": 2.6149537563323975 }, { "auxiliary_loss_clip": 0.01470972, "auxiliary_loss_mlp": 0.01021786, "balance_loss_clip": 1.04159212, "balance_loss_mlp": 1.01523864, "epoch": 0.9647087115974268, "flos": 18945967029120.0, "grad_norm": 2.803708175335097, "language_loss": 0.80613536, "learning_rate": 1.3009097641453192e-08, "loss": 0.83106291, "num_input_tokens_seen": 173218745, "step": 8023, "time_per_iteration": 2.7755377292633057 }, { "auxiliary_loss_clip": 0.01274529, "auxiliary_loss_mlp": 0.01027598, "balance_loss_clip": 1.04856539, "balance_loss_mlp": 1.02099681, "epoch": 0.9648289544880659, "flos": 16545088016640.0, "grad_norm": 1.9977007762472057, "language_loss": 0.7601856, "learning_rate": 1.2920545446891474e-08, "loss": 0.78320682, "num_input_tokens_seen": 173235465, "step": 8024, "time_per_iteration": 2.6641483306884766 }, { "auxiliary_loss_clip": 0.01271583, "auxiliary_loss_mlp": 0.01031997, "balance_loss_clip": 1.04633355, "balance_loss_mlp": 1.02496994, "epoch": 0.964949197378705, "flos": 24057527857920.0, "grad_norm": 1.8065379681806681, "language_loss": 0.70583916, "learning_rate": 1.2832294690703127e-08, "loss": 0.72887504, "num_input_tokens_seen": 173254440, "step": 8025, "time_per_iteration": 2.6810286045074463 }, { "auxiliary_loss_clip": 0.01221737, "auxiliary_loss_mlp": 0.01026939, "balance_loss_clip": 1.04742575, "balance_loss_mlp": 1.02019143, "epoch": 0.965069440269344, "flos": 23365565280000.0, "grad_norm": 2.007865982775203, "language_loss": 0.77648377, "learning_rate": 1.2744345386275668e-08, "loss": 0.79897052, "num_input_tokens_seen": 173273980, "step": 8026, "time_per_iteration": 2.650714874267578 }, { "auxiliary_loss_clip": 0.01178388, "auxiliary_loss_mlp": 0.01026463, "balance_loss_clip": 1.05037665, "balance_loss_mlp": 1.01935494, "epoch": 0.9651896831599832, "flos": 25374875155200.0, "grad_norm": 1.7239054824452158, "language_loss": 0.78824073, "learning_rate": 1.265669754695109e-08, "loss": 0.81028926, "num_input_tokens_seen": 173293550, "step": 8027, "time_per_iteration": 2.71108341217041 }, { "auxiliary_loss_clip": 0.01429875, "auxiliary_loss_mlp": 0.01024949, "balance_loss_clip": 1.04225469, "balance_loss_mlp": 1.01749253, "epoch": 0.9653099260506223, "flos": 22272875596800.0, "grad_norm": 2.031049845470471, "language_loss": 0.82018495, "learning_rate": 1.2569351186025201e-08, "loss": 0.84473312, "num_input_tokens_seen": 173312005, "step": 8028, "time_per_iteration": 2.798649311065674 }, { "auxiliary_loss_clip": 0.01312469, "auxiliary_loss_mlp": 0.01024096, "balance_loss_clip": 1.04299259, "balance_loss_mlp": 1.01733041, "epoch": 0.9654301689412613, "flos": 26760847386240.0, "grad_norm": 1.8386372917942597, "language_loss": 0.75632179, "learning_rate": 1.2482306316748737e-08, "loss": 0.77968752, "num_input_tokens_seen": 173332450, "step": 8029, "time_per_iteration": 2.7903928756713867 }, { "auxiliary_loss_clip": 0.01227895, "auxiliary_loss_mlp": 0.01025516, "balance_loss_clip": 1.04663944, "balance_loss_mlp": 1.01853967, "epoch": 0.9655504118319005, "flos": 17412689122560.0, "grad_norm": 2.319885902531926, "language_loss": 0.78305197, "learning_rate": 1.2395562952326021e-08, "loss": 0.8055861, "num_input_tokens_seen": 173349610, "step": 8030, "time_per_iteration": 2.599350929260254 }, { "auxiliary_loss_clip": 0.01279813, "auxiliary_loss_mlp": 0.01022881, "balance_loss_clip": 1.04765427, "balance_loss_mlp": 1.01616049, "epoch": 0.9656706547225395, "flos": 22126970551680.0, "grad_norm": 3.2849519855377154, "language_loss": 0.80975688, "learning_rate": 1.2309121105916309e-08, "loss": 0.83278376, "num_input_tokens_seen": 173367900, "step": 8031, "time_per_iteration": 2.7232558727264404 }, { "auxiliary_loss_clip": 0.01225984, "auxiliary_loss_mlp": 0.01025707, "balance_loss_clip": 1.04812646, "balance_loss_mlp": 1.01881647, "epoch": 0.9657908976131786, "flos": 37049289926400.0, "grad_norm": 1.9295563429371245, "language_loss": 0.6926859, "learning_rate": 1.222298079063222e-08, "loss": 0.71520281, "num_input_tokens_seen": 173389040, "step": 8032, "time_per_iteration": 3.664524555206299 }, { "auxiliary_loss_clip": 0.01220205, "auxiliary_loss_mlp": 0.01025249, "balance_loss_clip": 1.04737258, "balance_loss_mlp": 1.01867199, "epoch": 0.9659111405038178, "flos": 24389809597440.0, "grad_norm": 2.490355063281661, "language_loss": 0.72396076, "learning_rate": 1.2137142019541524e-08, "loss": 0.74641526, "num_input_tokens_seen": 173407595, "step": 8033, "time_per_iteration": 2.728853464126587 }, { "auxiliary_loss_clip": 0.0117827, "auxiliary_loss_mlp": 0.01022573, "balance_loss_clip": 1.04614115, "balance_loss_mlp": 1.01588535, "epoch": 0.9660313833944568, "flos": 25009412227200.0, "grad_norm": 12.03225706718188, "language_loss": 0.73627847, "learning_rate": 1.2051604805666027e-08, "loss": 0.75828689, "num_input_tokens_seen": 173424720, "step": 8034, "time_per_iteration": 3.719008684158325 }, { "auxiliary_loss_clip": 0.01173256, "auxiliary_loss_mlp": 0.02563863, "balance_loss_clip": 1.04994988, "balance_loss_mlp": 0.99991363, "epoch": 0.9661516262850959, "flos": 11801575895040.0, "grad_norm": 3.014792055101665, "language_loss": 0.78594548, "learning_rate": 1.196636916198135e-08, "loss": 0.82331669, "num_input_tokens_seen": 173442260, "step": 8035, "time_per_iteration": 2.624998092651367 }, { "auxiliary_loss_clip": 0.01175205, "auxiliary_loss_mlp": 0.01028035, "balance_loss_clip": 1.04939461, "balance_loss_mlp": 1.02124596, "epoch": 0.9662718691757349, "flos": 20047778766720.0, "grad_norm": 4.998653803924958, "language_loss": 0.77007449, "learning_rate": 1.1881435101418036e-08, "loss": 0.79210687, "num_input_tokens_seen": 173461675, "step": 8036, "time_per_iteration": 2.6511096954345703 }, { "auxiliary_loss_clip": 0.01165546, "auxiliary_loss_mlp": 0.01000912, "balance_loss_clip": 1.00743604, "balance_loss_mlp": 1.00007713, "epoch": 0.9663921120663741, "flos": 68027703517440.0, "grad_norm": 0.8309565040197836, "language_loss": 0.65536457, "learning_rate": 1.1796802636860003e-08, "loss": 0.67702919, "num_input_tokens_seen": 173530205, "step": 8037, "time_per_iteration": 4.14655065536499 }, { "auxiliary_loss_clip": 0.01174176, "auxiliary_loss_mlp": 0.01025032, "balance_loss_clip": 1.04876971, "balance_loss_mlp": 1.01816797, "epoch": 0.9665123549570132, "flos": 26322916769280.0, "grad_norm": 2.4813926197855123, "language_loss": 0.742522, "learning_rate": 1.1712471781146316e-08, "loss": 0.76451409, "num_input_tokens_seen": 173549540, "step": 8038, "time_per_iteration": 2.598094940185547 }, { "auxiliary_loss_clip": 0.01171086, "auxiliary_loss_mlp": 0.01023887, "balance_loss_clip": 1.04652286, "balance_loss_mlp": 1.01691663, "epoch": 0.9666325978476522, "flos": 43941121557120.0, "grad_norm": 1.8009121807248005, "language_loss": 0.67362493, "learning_rate": 1.1628442547069628e-08, "loss": 0.69557464, "num_input_tokens_seen": 173571740, "step": 8039, "time_per_iteration": 2.860253095626831 }, { "auxiliary_loss_clip": 0.01225527, "auxiliary_loss_mlp": 0.02569025, "balance_loss_clip": 1.04669929, "balance_loss_mlp": 0.99992919, "epoch": 0.9667528407382914, "flos": 21543422198400.0, "grad_norm": 1.857871092875102, "language_loss": 0.77246845, "learning_rate": 1.1544714947377521e-08, "loss": 0.81041396, "num_input_tokens_seen": 173589425, "step": 8040, "time_per_iteration": 2.625199556350708 }, { "auxiliary_loss_clip": 0.01174835, "auxiliary_loss_mlp": 0.01028001, "balance_loss_clip": 1.05007052, "balance_loss_mlp": 1.02099121, "epoch": 0.9668730836289304, "flos": 23878585278720.0, "grad_norm": 2.0841548490694137, "language_loss": 0.70080316, "learning_rate": 1.1461288994770945e-08, "loss": 0.72283149, "num_input_tokens_seen": 173608500, "step": 8041, "time_per_iteration": 2.641010284423828 }, { "auxiliary_loss_clip": 0.01173848, "auxiliary_loss_mlp": 0.01020308, "balance_loss_clip": 1.04716635, "balance_loss_mlp": 1.01350439, "epoch": 0.9669933265195695, "flos": 28293011971200.0, "grad_norm": 1.793043112607677, "language_loss": 0.77351862, "learning_rate": 1.1378164701906002e-08, "loss": 0.79546022, "num_input_tokens_seen": 173630265, "step": 8042, "time_per_iteration": 3.5909347534179688 }, { "auxiliary_loss_clip": 0.01176626, "auxiliary_loss_mlp": 0.01022163, "balance_loss_clip": 1.05062556, "balance_loss_mlp": 1.01491511, "epoch": 0.9671135694102087, "flos": 22454763091200.0, "grad_norm": 1.8641250072020366, "language_loss": 0.66707027, "learning_rate": 1.1295342081392156e-08, "loss": 0.68905813, "num_input_tokens_seen": 173649625, "step": 8043, "time_per_iteration": 2.5905184745788574 }, { "auxiliary_loss_clip": 0.01170769, "auxiliary_loss_mlp": 0.01024407, "balance_loss_clip": 1.04595518, "balance_loss_mlp": 1.01777029, "epoch": 0.9672338123008477, "flos": 20155941596160.0, "grad_norm": 1.641520468140671, "language_loss": 0.69892949, "learning_rate": 1.1212821145793804e-08, "loss": 0.72088122, "num_input_tokens_seen": 173669240, "step": 8044, "time_per_iteration": 2.692209005355835 }, { "auxiliary_loss_clip": 0.01268084, "auxiliary_loss_mlp": 0.01024125, "balance_loss_clip": 1.04406261, "balance_loss_mlp": 1.01753879, "epoch": 0.9673540551914868, "flos": 16977487939200.0, "grad_norm": 2.1498043369071906, "language_loss": 0.78617084, "learning_rate": 1.1130601907629156e-08, "loss": 0.80909288, "num_input_tokens_seen": 173686970, "step": 8045, "time_per_iteration": 2.6269166469573975 }, { "auxiliary_loss_clip": 0.01118599, "auxiliary_loss_mlp": 0.01000967, "balance_loss_clip": 1.00546825, "balance_loss_mlp": 1.00005555, "epoch": 0.9674742980821259, "flos": 61892903952000.0, "grad_norm": 0.8116654381175274, "language_loss": 0.64719212, "learning_rate": 1.1048684379370899e-08, "loss": 0.66838777, "num_input_tokens_seen": 173747655, "step": 8046, "time_per_iteration": 3.1674466133117676 }, { "auxiliary_loss_clip": 0.01263372, "auxiliary_loss_mlp": 0.01023445, "balance_loss_clip": 1.04480517, "balance_loss_mlp": 1.01668012, "epoch": 0.967594540972765, "flos": 18697824898560.0, "grad_norm": 3.947911547100608, "language_loss": 0.74619585, "learning_rate": 1.0967068573445759e-08, "loss": 0.76906407, "num_input_tokens_seen": 173765140, "step": 8047, "time_per_iteration": 2.6678624153137207 }, { "auxiliary_loss_clip": 0.01269402, "auxiliary_loss_mlp": 0.01021831, "balance_loss_clip": 1.04453385, "balance_loss_mlp": 1.01486576, "epoch": 0.967714783863404, "flos": 20777411733120.0, "grad_norm": 2.73761264754445, "language_loss": 0.65491283, "learning_rate": 1.0885754502234945e-08, "loss": 0.67782521, "num_input_tokens_seen": 173784800, "step": 8048, "time_per_iteration": 2.709085702896118 }, { "auxiliary_loss_clip": 0.01216764, "auxiliary_loss_mlp": 0.01025017, "balance_loss_clip": 1.04380882, "balance_loss_mlp": 1.01834679, "epoch": 0.9678350267540432, "flos": 23185473465600.0, "grad_norm": 1.7384385773953785, "language_loss": 0.77656507, "learning_rate": 1.08047421780737e-08, "loss": 0.79898286, "num_input_tokens_seen": 173803990, "step": 8049, "time_per_iteration": 2.70662260055542 }, { "auxiliary_loss_clip": 0.01277419, "auxiliary_loss_mlp": 0.02566559, "balance_loss_clip": 1.0452491, "balance_loss_mlp": 0.99991512, "epoch": 0.9679552696446823, "flos": 21726063878400.0, "grad_norm": 2.1582994485004545, "language_loss": 0.73777401, "learning_rate": 1.0724031613251305e-08, "loss": 0.77621388, "num_input_tokens_seen": 173821890, "step": 8050, "time_per_iteration": 2.6845579147338867 }, { "auxiliary_loss_clip": 0.01227868, "auxiliary_loss_mlp": 0.01020935, "balance_loss_clip": 1.04757953, "balance_loss_mlp": 1.0139221, "epoch": 0.9680755125353213, "flos": 26869046129280.0, "grad_norm": 2.5593081684382337, "language_loss": 0.66222471, "learning_rate": 1.0643622820011744e-08, "loss": 0.68471277, "num_input_tokens_seen": 173842945, "step": 8051, "time_per_iteration": 2.636754035949707 }, { "auxiliary_loss_clip": 0.01175734, "auxiliary_loss_mlp": 0.01025589, "balance_loss_clip": 1.04795444, "balance_loss_mlp": 1.01833558, "epoch": 0.9681957554259605, "flos": 28325008010880.0, "grad_norm": 2.7550081878871366, "language_loss": 0.67435497, "learning_rate": 1.0563515810552814e-08, "loss": 0.69636822, "num_input_tokens_seen": 173859915, "step": 8052, "time_per_iteration": 2.636791467666626 }, { "auxiliary_loss_clip": 0.01176127, "auxiliary_loss_mlp": 0.01028166, "balance_loss_clip": 1.05177498, "balance_loss_mlp": 1.02129364, "epoch": 0.9683159983165995, "flos": 20557674282240.0, "grad_norm": 1.6594236730144831, "language_loss": 0.73443973, "learning_rate": 1.0483710597026795e-08, "loss": 0.7564826, "num_input_tokens_seen": 173879775, "step": 8053, "time_per_iteration": 2.657710552215576 }, { "auxiliary_loss_clip": 0.01320835, "auxiliary_loss_mlp": 0.01024195, "balance_loss_clip": 1.04490542, "balance_loss_mlp": 1.01782596, "epoch": 0.9684362412072386, "flos": 24207958016640.0, "grad_norm": 2.222342392048259, "language_loss": 0.73996222, "learning_rate": 1.0404207191540227e-08, "loss": 0.76341254, "num_input_tokens_seen": 173900230, "step": 8054, "time_per_iteration": 2.7994613647460938 }, { "auxiliary_loss_clip": 0.01172314, "auxiliary_loss_mlp": 0.01024097, "balance_loss_clip": 1.04891658, "balance_loss_mlp": 1.01751399, "epoch": 0.9685564840978778, "flos": 22346241125760.0, "grad_norm": 1.9640284189325186, "language_loss": 0.74442106, "learning_rate": 1.0325005606153236e-08, "loss": 0.7663852, "num_input_tokens_seen": 173919690, "step": 8055, "time_per_iteration": 2.6557979583740234 }, { "auxiliary_loss_clip": 0.01276763, "auxiliary_loss_mlp": 0.01028888, "balance_loss_clip": 1.04323053, "balance_loss_mlp": 1.0214349, "epoch": 0.9686767269885168, "flos": 14386389477120.0, "grad_norm": 2.589112986932865, "language_loss": 0.79501343, "learning_rate": 1.0246105852881104e-08, "loss": 0.81806993, "num_input_tokens_seen": 173934790, "step": 8056, "time_per_iteration": 2.6910030841827393 }, { "auxiliary_loss_clip": 0.01174553, "auxiliary_loss_mlp": 0.01023622, "balance_loss_clip": 1.04900265, "balance_loss_mlp": 1.0166899, "epoch": 0.9687969698791559, "flos": 21287630471040.0, "grad_norm": 2.1623159046520337, "language_loss": 0.79051566, "learning_rate": 1.0167507943692476e-08, "loss": 0.81249738, "num_input_tokens_seen": 173953875, "step": 8057, "time_per_iteration": 2.598576545715332 }, { "auxiliary_loss_clip": 0.01222178, "auxiliary_loss_mlp": 0.01031032, "balance_loss_clip": 1.04995942, "balance_loss_mlp": 1.02405262, "epoch": 0.968917212769795, "flos": 19828328624640.0, "grad_norm": 2.746410091008232, "language_loss": 0.7153483, "learning_rate": 1.008921189051093e-08, "loss": 0.73788035, "num_input_tokens_seen": 173971220, "step": 8058, "time_per_iteration": 2.6294760704040527 }, { "auxiliary_loss_clip": 0.01175139, "auxiliary_loss_mlp": 0.01028996, "balance_loss_clip": 1.05061531, "balance_loss_mlp": 1.02159286, "epoch": 0.9690374556604341, "flos": 21681749473920.0, "grad_norm": 2.342723055330534, "language_loss": 0.77297932, "learning_rate": 1.0011217705213848e-08, "loss": 0.79502064, "num_input_tokens_seen": 173989095, "step": 8059, "time_per_iteration": 3.4895198345184326 }, { "auxiliary_loss_clip": 0.01220229, "auxiliary_loss_mlp": 0.01024994, "balance_loss_clip": 1.04891276, "balance_loss_mlp": 1.01838064, "epoch": 0.9691576985510731, "flos": 32635437851520.0, "grad_norm": 2.0483492869229885, "language_loss": 0.74750006, "learning_rate": 9.933525399632658e-09, "loss": 0.7699523, "num_input_tokens_seen": 174007330, "step": 8060, "time_per_iteration": 2.695963144302368 }, { "auxiliary_loss_clip": 0.01274578, "auxiliary_loss_mlp": 0.0102967, "balance_loss_clip": 1.04855406, "balance_loss_mlp": 1.0220288, "epoch": 0.9692779414417123, "flos": 35663174040960.0, "grad_norm": 2.221707014412274, "language_loss": 0.65427953, "learning_rate": 9.856134985553488e-09, "loss": 0.67732203, "num_input_tokens_seen": 174027055, "step": 8061, "time_per_iteration": 3.7503786087036133 }, { "auxiliary_loss_clip": 0.01170253, "auxiliary_loss_mlp": 0.01029805, "balance_loss_clip": 1.04753637, "balance_loss_mlp": 1.02270865, "epoch": 0.9693981843323514, "flos": 28366952117760.0, "grad_norm": 1.8209859863779456, "language_loss": 0.73679531, "learning_rate": 9.77904647471628e-09, "loss": 0.75879586, "num_input_tokens_seen": 174050235, "step": 8062, "time_per_iteration": 3.5465800762176514 }, { "auxiliary_loss_clip": 0.01360182, "auxiliary_loss_mlp": 0.01018685, "balance_loss_clip": 1.04184532, "balance_loss_mlp": 1.01242018, "epoch": 0.9695184272229904, "flos": 23622865378560.0, "grad_norm": 2.243483289963047, "language_loss": 0.74176961, "learning_rate": 9.702259878815454e-09, "loss": 0.76555824, "num_input_tokens_seen": 174070560, "step": 8063, "time_per_iteration": 2.7463488578796387 }, { "auxiliary_loss_clip": 0.0122457, "auxiliary_loss_mlp": 0.01027634, "balance_loss_clip": 1.04961872, "balance_loss_mlp": 1.02028739, "epoch": 0.9696386701136296, "flos": 23294677789440.0, "grad_norm": 2.2445158737513786, "language_loss": 0.74112439, "learning_rate": 9.625775209499254e-09, "loss": 0.76364642, "num_input_tokens_seen": 174090565, "step": 8064, "time_per_iteration": 2.6496777534484863 }, { "auxiliary_loss_clip": 0.01313065, "auxiliary_loss_mlp": 0.01026567, "balance_loss_clip": 1.04082966, "balance_loss_mlp": 1.01897883, "epoch": 0.9697589130042686, "flos": 15121876360320.0, "grad_norm": 4.17214859314391, "language_loss": 0.74477655, "learning_rate": 9.549592478370172e-09, "loss": 0.76817286, "num_input_tokens_seen": 174108745, "step": 8065, "time_per_iteration": 2.792856454849243 }, { "auxiliary_loss_clip": 0.01222679, "auxiliary_loss_mlp": 0.01023006, "balance_loss_clip": 1.0456531, "balance_loss_mlp": 1.01650906, "epoch": 0.9698791558949077, "flos": 18879532824960.0, "grad_norm": 1.7745754562034666, "language_loss": 0.79300159, "learning_rate": 9.473711696985632e-09, "loss": 0.81545848, "num_input_tokens_seen": 174128075, "step": 8066, "time_per_iteration": 2.6848390102386475 }, { "auxiliary_loss_clip": 0.01168318, "auxiliary_loss_mlp": 0.01025387, "balance_loss_clip": 1.04480743, "balance_loss_mlp": 1.0183419, "epoch": 0.9699993987855468, "flos": 17931455297280.0, "grad_norm": 2.0580225806043058, "language_loss": 0.76392978, "learning_rate": 9.398132876856201e-09, "loss": 0.78586686, "num_input_tokens_seen": 174147040, "step": 8067, "time_per_iteration": 2.662001371383667 }, { "auxiliary_loss_clip": 0.01267688, "auxiliary_loss_mlp": 0.00999851, "balance_loss_clip": 1.00556421, "balance_loss_mlp": 0.99895108, "epoch": 0.9701196416761859, "flos": 67182186297600.0, "grad_norm": 0.7775726462656665, "language_loss": 0.60780251, "learning_rate": 9.322856029447379e-09, "loss": 0.63047791, "num_input_tokens_seen": 174208225, "step": 8068, "time_per_iteration": 4.136422395706177 }, { "auxiliary_loss_clip": 0.01170418, "auxiliary_loss_mlp": 0.01029436, "balance_loss_clip": 1.04858756, "balance_loss_mlp": 1.02321291, "epoch": 0.970239884566825, "flos": 24277804012800.0, "grad_norm": 2.099567174610099, "language_loss": 0.80512297, "learning_rate": 9.247881166178695e-09, "loss": 0.8271215, "num_input_tokens_seen": 174226935, "step": 8069, "time_per_iteration": 2.603888988494873 }, { "auxiliary_loss_clip": 0.0132883, "auxiliary_loss_mlp": 0.01019421, "balance_loss_clip": 1.0457443, "balance_loss_mlp": 1.01240277, "epoch": 0.970360127457464, "flos": 25301689194240.0, "grad_norm": 2.1321640052720436, "language_loss": 0.76410896, "learning_rate": 9.173208298423274e-09, "loss": 0.78759146, "num_input_tokens_seen": 174248140, "step": 8070, "time_per_iteration": 2.787639617919922 }, { "auxiliary_loss_clip": 0.01274785, "auxiliary_loss_mlp": 0.02564941, "balance_loss_clip": 1.04665685, "balance_loss_mlp": 0.99992526, "epoch": 0.9704803703481032, "flos": 29572473398400.0, "grad_norm": 1.8564581719871136, "language_loss": 0.76400197, "learning_rate": 9.09883743750961e-09, "loss": 0.80239928, "num_input_tokens_seen": 174271030, "step": 8071, "time_per_iteration": 2.825347423553467 }, { "auxiliary_loss_clip": 0.01264719, "auxiliary_loss_mlp": 0.01022444, "balance_loss_clip": 1.04594445, "balance_loss_mlp": 1.01598275, "epoch": 0.9706006132387422, "flos": 17380046638080.0, "grad_norm": 1.6880253693450873, "language_loss": 0.84088731, "learning_rate": 9.024768594719124e-09, "loss": 0.86375892, "num_input_tokens_seen": 174289410, "step": 8072, "time_per_iteration": 2.6846513748168945 }, { "auxiliary_loss_clip": 0.01326721, "auxiliary_loss_mlp": 0.01027764, "balance_loss_clip": 1.04704702, "balance_loss_mlp": 1.02068877, "epoch": 0.9707208561293813, "flos": 18186421011840.0, "grad_norm": 3.9933943582614937, "language_loss": 0.72934031, "learning_rate": 8.95100178128816e-09, "loss": 0.75288522, "num_input_tokens_seen": 174308550, "step": 8073, "time_per_iteration": 2.68218994140625 }, { "auxiliary_loss_clip": 0.01272464, "auxiliary_loss_mlp": 0.0103013, "balance_loss_clip": 1.04469419, "balance_loss_mlp": 1.02281678, "epoch": 0.9708410990200205, "flos": 31248388212480.0, "grad_norm": 2.2485615667602237, "language_loss": 0.7038933, "learning_rate": 8.877537008407321e-09, "loss": 0.72691923, "num_input_tokens_seen": 174328600, "step": 8074, "time_per_iteration": 2.7670435905456543 }, { "auxiliary_loss_clip": 0.01272142, "auxiliary_loss_mlp": 0.01021211, "balance_loss_clip": 1.04497325, "balance_loss_mlp": 1.01455927, "epoch": 0.9709613419106595, "flos": 30554450386560.0, "grad_norm": 1.4928044543411279, "language_loss": 0.68599439, "learning_rate": 8.804374287221028e-09, "loss": 0.70892787, "num_input_tokens_seen": 174349835, "step": 8075, "time_per_iteration": 2.718585968017578 }, { "auxiliary_loss_clip": 0.01315105, "auxiliary_loss_mlp": 0.01022672, "balance_loss_clip": 1.03773594, "balance_loss_mlp": 1.01642191, "epoch": 0.9710815848012986, "flos": 23730166281600.0, "grad_norm": 1.9185330347536966, "language_loss": 0.84149885, "learning_rate": 8.731513628827958e-09, "loss": 0.86487657, "num_input_tokens_seen": 174369200, "step": 8076, "time_per_iteration": 2.8405776023864746 }, { "auxiliary_loss_clip": 0.01222345, "auxiliary_loss_mlp": 0.01021076, "balance_loss_clip": 1.04719925, "balance_loss_mlp": 1.01437354, "epoch": 0.9712018276919377, "flos": 23761875012480.0, "grad_norm": 2.6578701387971124, "language_loss": 0.82519102, "learning_rate": 8.658955044280825e-09, "loss": 0.84762526, "num_input_tokens_seen": 174388125, "step": 8077, "time_per_iteration": 2.6409029960632324 }, { "auxiliary_loss_clip": 0.01225486, "auxiliary_loss_mlp": 0.01025731, "balance_loss_clip": 1.05053616, "balance_loss_mlp": 1.01868916, "epoch": 0.9713220705825768, "flos": 23330983461120.0, "grad_norm": 1.799761148301575, "language_loss": 0.7741133, "learning_rate": 8.586698544587268e-09, "loss": 0.79662549, "num_input_tokens_seen": 174409735, "step": 8078, "time_per_iteration": 2.7247602939605713 }, { "auxiliary_loss_clip": 0.01269128, "auxiliary_loss_mlp": 0.01025593, "balance_loss_clip": 1.0435816, "balance_loss_mlp": 1.01861334, "epoch": 0.9714423134732159, "flos": 22200946611840.0, "grad_norm": 2.059661096480692, "language_loss": 0.73917639, "learning_rate": 8.514744140707853e-09, "loss": 0.76212358, "num_input_tokens_seen": 174428875, "step": 8079, "time_per_iteration": 2.6683847904205322 }, { "auxiliary_loss_clip": 0.01170823, "auxiliary_loss_mlp": 0.01025527, "balance_loss_clip": 1.04808009, "balance_loss_mlp": 1.01894665, "epoch": 0.971562556363855, "flos": 20229917656320.0, "grad_norm": 1.8333373131195876, "language_loss": 0.76256466, "learning_rate": 8.443091843558515e-09, "loss": 0.78452814, "num_input_tokens_seen": 174447960, "step": 8080, "time_per_iteration": 2.6159791946411133 }, { "auxiliary_loss_clip": 0.01271182, "auxiliary_loss_mlp": 0.01028475, "balance_loss_clip": 1.04580021, "balance_loss_mlp": 1.02120614, "epoch": 0.9716827992544941, "flos": 24970197553920.0, "grad_norm": 2.1594355172163726, "language_loss": 0.64745605, "learning_rate": 8.37174166400878e-09, "loss": 0.67045259, "num_input_tokens_seen": 174463535, "step": 8081, "time_per_iteration": 2.7242605686187744 }, { "auxiliary_loss_clip": 0.01175161, "auxiliary_loss_mlp": 0.01025416, "balance_loss_clip": 1.0518496, "balance_loss_mlp": 1.01861835, "epoch": 0.9718030421451331, "flos": 24681476033280.0, "grad_norm": 3.9349581571842736, "language_loss": 0.85379863, "learning_rate": 8.300693612881992e-09, "loss": 0.87580442, "num_input_tokens_seen": 174483600, "step": 8082, "time_per_iteration": 2.6209068298339844 }, { "auxiliary_loss_clip": 0.01220607, "auxiliary_loss_mlp": 0.02567984, "balance_loss_clip": 1.04899859, "balance_loss_mlp": 0.99992162, "epoch": 0.9719232850357723, "flos": 22090700793600.0, "grad_norm": 4.173892651063349, "language_loss": 0.81389475, "learning_rate": 8.22994770095664e-09, "loss": 0.85178071, "num_input_tokens_seen": 174502175, "step": 8083, "time_per_iteration": 2.6583497524261475 }, { "auxiliary_loss_clip": 0.01278595, "auxiliary_loss_mlp": 0.01027425, "balance_loss_clip": 1.05227232, "balance_loss_mlp": 1.01997185, "epoch": 0.9720435279264114, "flos": 23656908493440.0, "grad_norm": 2.1058478498947704, "language_loss": 0.75437373, "learning_rate": 8.159503938964585e-09, "loss": 0.77743393, "num_input_tokens_seen": 174519495, "step": 8084, "time_per_iteration": 3.581129550933838 }, { "auxiliary_loss_clip": 0.01317405, "auxiliary_loss_mlp": 0.01021667, "balance_loss_clip": 1.04387951, "balance_loss_mlp": 1.01511312, "epoch": 0.9721637708170504, "flos": 28365910623360.0, "grad_norm": 1.9835223632426529, "language_loss": 0.70566148, "learning_rate": 8.089362337592164e-09, "loss": 0.72905219, "num_input_tokens_seen": 174543120, "step": 8085, "time_per_iteration": 2.8353919982910156 }, { "auxiliary_loss_clip": 0.01266206, "auxiliary_loss_mlp": 0.01025418, "balance_loss_clip": 1.04677868, "balance_loss_mlp": 1.01862025, "epoch": 0.9722840137076896, "flos": 29130807767040.0, "grad_norm": 1.5685078356174094, "language_loss": 0.72482204, "learning_rate": 8.019522907479536e-09, "loss": 0.74773836, "num_input_tokens_seen": 174563480, "step": 8086, "time_per_iteration": 2.904819965362549 }, { "auxiliary_loss_clip": 0.01224958, "auxiliary_loss_mlp": 0.01021784, "balance_loss_clip": 1.04844141, "balance_loss_mlp": 1.01539779, "epoch": 0.9724042565983286, "flos": 19243954258560.0, "grad_norm": 2.203743715705549, "language_loss": 0.77358413, "learning_rate": 7.949985659221558e-09, "loss": 0.79605156, "num_input_tokens_seen": 174580745, "step": 8087, "time_per_iteration": 3.533684253692627 }, { "auxiliary_loss_clip": 0.01274409, "auxiliary_loss_mlp": 0.01023766, "balance_loss_clip": 1.04587162, "balance_loss_mlp": 1.0171411, "epoch": 0.9725244994889677, "flos": 23039676161280.0, "grad_norm": 2.2094302506806116, "language_loss": 0.79168725, "learning_rate": 7.880750603366904e-09, "loss": 0.81466901, "num_input_tokens_seen": 174599615, "step": 8088, "time_per_iteration": 2.74710750579834 }, { "auxiliary_loss_clip": 0.01330054, "auxiliary_loss_mlp": 0.01026416, "balance_loss_clip": 1.04416013, "balance_loss_mlp": 1.01887035, "epoch": 0.9726447423796069, "flos": 23367468700800.0, "grad_norm": 2.2250387810220285, "language_loss": 0.79522294, "learning_rate": 7.811817750418282e-09, "loss": 0.81878769, "num_input_tokens_seen": 174618375, "step": 8089, "time_per_iteration": 3.603872060775757 }, { "auxiliary_loss_clip": 0.01315217, "auxiliary_loss_mlp": 0.01028178, "balance_loss_clip": 1.04503441, "balance_loss_mlp": 1.02108502, "epoch": 0.9727649852702459, "flos": 26541648639360.0, "grad_norm": 1.5735079734233341, "language_loss": 0.80234194, "learning_rate": 7.743187110833105e-09, "loss": 0.82577586, "num_input_tokens_seen": 174641135, "step": 8090, "time_per_iteration": 2.8274173736572266 }, { "auxiliary_loss_clip": 0.01271954, "auxiliary_loss_mlp": 0.01024299, "balance_loss_clip": 1.04309607, "balance_loss_mlp": 1.01764965, "epoch": 0.972885228160885, "flos": 20522338277760.0, "grad_norm": 1.4741514796056676, "language_loss": 0.80824244, "learning_rate": 7.674858695022602e-09, "loss": 0.83120501, "num_input_tokens_seen": 174659490, "step": 8091, "time_per_iteration": 2.667311429977417 }, { "auxiliary_loss_clip": 0.01175085, "auxiliary_loss_mlp": 0.01024326, "balance_loss_clip": 1.04975629, "balance_loss_mlp": 1.01720297, "epoch": 0.9730054710515241, "flos": 17566064196480.0, "grad_norm": 2.5853046464808886, "language_loss": 0.76385319, "learning_rate": 7.606832513351591e-09, "loss": 0.78584731, "num_input_tokens_seen": 174677440, "step": 8092, "time_per_iteration": 2.6024863719940186 }, { "auxiliary_loss_clip": 0.01059433, "auxiliary_loss_mlp": 0.0250491, "balance_loss_clip": 1.00553203, "balance_loss_mlp": 0.99987334, "epoch": 0.9731257139421632, "flos": 68972010117120.0, "grad_norm": 0.8243247616888904, "language_loss": 0.63899928, "learning_rate": 7.539108576140264e-09, "loss": 0.67464268, "num_input_tokens_seen": 174741550, "step": 8093, "time_per_iteration": 3.257070302963257 }, { "auxiliary_loss_clip": 0.01365756, "auxiliary_loss_mlp": 0.01021974, "balance_loss_clip": 1.04346263, "balance_loss_mlp": 1.01540875, "epoch": 0.9732459568328022, "flos": 18478841633280.0, "grad_norm": 2.064540588734389, "language_loss": 0.70204836, "learning_rate": 7.471686893661732e-09, "loss": 0.72592568, "num_input_tokens_seen": 174759845, "step": 8094, "time_per_iteration": 3.644542694091797 }, { "auxiliary_loss_clip": 0.01270293, "auxiliary_loss_mlp": 0.01023353, "balance_loss_clip": 1.04895413, "balance_loss_mlp": 1.01709199, "epoch": 0.9733661997234414, "flos": 20883886623360.0, "grad_norm": 1.987032716125693, "language_loss": 0.64435399, "learning_rate": 7.4045674761442636e-09, "loss": 0.66729045, "num_input_tokens_seen": 174777175, "step": 8095, "time_per_iteration": 2.6821351051330566 }, { "auxiliary_loss_clip": 0.01172094, "auxiliary_loss_mlp": 0.02565719, "balance_loss_clip": 1.04933703, "balance_loss_mlp": 0.99991035, "epoch": 0.9734864426140805, "flos": 23766795175680.0, "grad_norm": 1.8894643921673775, "language_loss": 0.73856938, "learning_rate": 7.337750333769488e-09, "loss": 0.77594745, "num_input_tokens_seen": 174796980, "step": 8096, "time_per_iteration": 2.580782413482666 }, { "auxiliary_loss_clip": 0.01276015, "auxiliary_loss_mlp": 0.01021324, "balance_loss_clip": 1.04219389, "balance_loss_mlp": 1.0144577, "epoch": 0.9736066855047195, "flos": 35042422176000.0, "grad_norm": 1.8119797976548588, "language_loss": 0.7297368, "learning_rate": 7.2712354766737425e-09, "loss": 0.75271016, "num_input_tokens_seen": 174817310, "step": 8097, "time_per_iteration": 2.7738685607910156 }, { "auxiliary_loss_clip": 0.01321744, "auxiliary_loss_mlp": 0.01026502, "balance_loss_clip": 1.04853201, "balance_loss_mlp": 1.01971006, "epoch": 0.9737269283953586, "flos": 20410620001920.0, "grad_norm": 2.2569761156082744, "language_loss": 0.80883372, "learning_rate": 7.2050229149469565e-09, "loss": 0.83231616, "num_input_tokens_seen": 174837320, "step": 8098, "time_per_iteration": 2.7790071964263916 }, { "auxiliary_loss_clip": 0.01322319, "auxiliary_loss_mlp": 0.0102751, "balance_loss_clip": 1.04011869, "balance_loss_mlp": 1.02114415, "epoch": 0.9738471712859977, "flos": 28911680847360.0, "grad_norm": 2.8687149388324262, "language_loss": 0.63674074, "learning_rate": 7.139112658633984e-09, "loss": 0.66023898, "num_input_tokens_seen": 174857470, "step": 8099, "time_per_iteration": 2.751539707183838 }, { "auxiliary_loss_clip": 0.01317728, "auxiliary_loss_mlp": 0.01020667, "balance_loss_clip": 1.04534495, "balance_loss_mlp": 1.01407754, "epoch": 0.9739674141766368, "flos": 27782326356480.0, "grad_norm": 2.0893943711549023, "language_loss": 0.70336163, "learning_rate": 7.073504717733048e-09, "loss": 0.72674561, "num_input_tokens_seen": 174877035, "step": 8100, "time_per_iteration": 2.77908992767334 }, { "auxiliary_loss_clip": 0.01172908, "auxiliary_loss_mlp": 0.01005029, "balance_loss_clip": 1.00688207, "balance_loss_mlp": 1.00408733, "epoch": 0.9740876570672758, "flos": 68863057188480.0, "grad_norm": 0.8278038706050304, "language_loss": 0.57168984, "learning_rate": 7.008199102196855e-09, "loss": 0.59346926, "num_input_tokens_seen": 174938460, "step": 8101, "time_per_iteration": 3.2262117862701416 }, { "auxiliary_loss_clip": 0.01157486, "auxiliary_loss_mlp": 0.01002725, "balance_loss_clip": 1.00659549, "balance_loss_mlp": 1.0018611, "epoch": 0.974207899957915, "flos": 58236622646400.0, "grad_norm": 0.7988962252651206, "language_loss": 0.58890647, "learning_rate": 6.9431958219321464e-09, "loss": 0.61050856, "num_input_tokens_seen": 174994625, "step": 8102, "time_per_iteration": 3.1899397373199463 }, { "auxiliary_loss_clip": 0.01268683, "auxiliary_loss_mlp": 0.01022234, "balance_loss_clip": 1.04328573, "balance_loss_mlp": 1.01563549, "epoch": 0.9743281428485541, "flos": 22600057605120.0, "grad_norm": 1.5224925279099604, "language_loss": 0.77706081, "learning_rate": 6.878494886800146e-09, "loss": 0.79997003, "num_input_tokens_seen": 175015400, "step": 8103, "time_per_iteration": 2.727565288543701 }, { "auxiliary_loss_clip": 0.01172085, "auxiliary_loss_mlp": 0.0102364, "balance_loss_clip": 1.04633248, "balance_loss_mlp": 1.01668143, "epoch": 0.9744483857391931, "flos": 20008815488640.0, "grad_norm": 2.04076562862349, "language_loss": 0.76383328, "learning_rate": 6.814096306615669e-09, "loss": 0.78579056, "num_input_tokens_seen": 175033540, "step": 8104, "time_per_iteration": 2.690594434738159 }, { "auxiliary_loss_clip": 0.01278807, "auxiliary_loss_mlp": 0.01033929, "balance_loss_clip": 1.04651427, "balance_loss_mlp": 1.02629066, "epoch": 0.9745686286298323, "flos": 17675268520320.0, "grad_norm": 2.886516735077991, "language_loss": 0.65310633, "learning_rate": 6.750000091148011e-09, "loss": 0.67623377, "num_input_tokens_seen": 175050835, "step": 8105, "time_per_iteration": 2.659674644470215 }, { "auxiliary_loss_clip": 0.01173217, "auxiliary_loss_mlp": 0.01030419, "balance_loss_clip": 1.0502317, "balance_loss_mlp": 1.0235641, "epoch": 0.9746888715204713, "flos": 29460252332160.0, "grad_norm": 5.740910309039396, "language_loss": 0.72491604, "learning_rate": 6.686206250120729e-09, "loss": 0.74695241, "num_input_tokens_seen": 175072330, "step": 8106, "time_per_iteration": 2.6384987831115723 }, { "auxiliary_loss_clip": 0.01323857, "auxiliary_loss_mlp": 0.01026275, "balance_loss_clip": 1.0424962, "balance_loss_mlp": 1.01969206, "epoch": 0.9748091144111104, "flos": 18479308510080.0, "grad_norm": 6.671808009581269, "language_loss": 0.74327302, "learning_rate": 6.622714793210749e-09, "loss": 0.7667743, "num_input_tokens_seen": 175091250, "step": 8107, "time_per_iteration": 2.6937525272369385 }, { "auxiliary_loss_clip": 0.01172067, "auxiliary_loss_mlp": 0.01021546, "balance_loss_clip": 1.0482111, "balance_loss_mlp": 1.0150671, "epoch": 0.9749293573017496, "flos": 20665154753280.0, "grad_norm": 1.7627876023356712, "language_loss": 0.78521097, "learning_rate": 6.559525730050364e-09, "loss": 0.80714709, "num_input_tokens_seen": 175111350, "step": 8108, "time_per_iteration": 2.5429210662841797 }, { "auxiliary_loss_clip": 0.01319367, "auxiliary_loss_mlp": 0.01023054, "balance_loss_clip": 1.04529226, "balance_loss_mlp": 1.01638722, "epoch": 0.9750496001923886, "flos": 18478590238080.0, "grad_norm": 1.8790345632150771, "language_loss": 0.76119643, "learning_rate": 6.496639070224574e-09, "loss": 0.78462058, "num_input_tokens_seen": 175129835, "step": 8109, "time_per_iteration": 2.64664888381958 }, { "auxiliary_loss_clip": 0.01225587, "auxiliary_loss_mlp": 0.01028594, "balance_loss_clip": 1.04772305, "balance_loss_mlp": 1.02203763, "epoch": 0.9751698430830277, "flos": 19572967860480.0, "grad_norm": 2.5282871089098293, "language_loss": 0.8384459, "learning_rate": 6.4340548232739714e-09, "loss": 0.86098766, "num_input_tokens_seen": 175146035, "step": 8110, "time_per_iteration": 3.537400245666504 }, { "auxiliary_loss_clip": 0.01327295, "auxiliary_loss_mlp": 0.0102736, "balance_loss_clip": 1.04418004, "balance_loss_mlp": 1.02108693, "epoch": 0.9752900859736668, "flos": 23550325862400.0, "grad_norm": 1.7768572852416236, "language_loss": 0.79411519, "learning_rate": 6.371772998692071e-09, "loss": 0.8176617, "num_input_tokens_seen": 175165290, "step": 8111, "time_per_iteration": 2.74153208732605 }, { "auxiliary_loss_clip": 0.01322445, "auxiliary_loss_mlp": 0.01022688, "balance_loss_clip": 1.04243147, "balance_loss_mlp": 1.01550317, "epoch": 0.9754103288643059, "flos": 20303211358080.0, "grad_norm": 3.301652686147625, "language_loss": 0.64821875, "learning_rate": 6.309793605927094e-09, "loss": 0.67167008, "num_input_tokens_seen": 175183610, "step": 8112, "time_per_iteration": 2.693718671798706 }, { "auxiliary_loss_clip": 0.01176939, "auxiliary_loss_mlp": 0.01027166, "balance_loss_clip": 1.04614913, "balance_loss_mlp": 1.02101231, "epoch": 0.975530571754945, "flos": 19350680544000.0, "grad_norm": 2.148641129515742, "language_loss": 0.80508924, "learning_rate": 6.248116654381297e-09, "loss": 0.82713032, "num_input_tokens_seen": 175202080, "step": 8113, "time_per_iteration": 3.5914552211761475 }, { "auxiliary_loss_clip": 0.01275093, "auxiliary_loss_mlp": 0.01024461, "balance_loss_clip": 1.04424906, "balance_loss_mlp": 1.0181073, "epoch": 0.9756508146455841, "flos": 23583399310080.0, "grad_norm": 2.3031475074120706, "language_loss": 0.72816038, "learning_rate": 6.186742153410751e-09, "loss": 0.75115585, "num_input_tokens_seen": 175221575, "step": 8114, "time_per_iteration": 2.7147908210754395 }, { "auxiliary_loss_clip": 0.01271675, "auxiliary_loss_mlp": 0.01032669, "balance_loss_clip": 1.04536605, "balance_loss_mlp": 1.02557564, "epoch": 0.9757710575362232, "flos": 22966921163520.0, "grad_norm": 6.431900452535017, "language_loss": 0.87507898, "learning_rate": 6.125670112326453e-09, "loss": 0.89812243, "num_input_tokens_seen": 175240835, "step": 8115, "time_per_iteration": 3.5621843338012695 }, { "auxiliary_loss_clip": 0.01226397, "auxiliary_loss_mlp": 0.01024962, "balance_loss_clip": 1.04778028, "balance_loss_mlp": 1.01814318, "epoch": 0.9758913004268622, "flos": 27966009530880.0, "grad_norm": 1.5947281458875502, "language_loss": 0.70410728, "learning_rate": 6.064900540392548e-09, "loss": 0.72662085, "num_input_tokens_seen": 175262930, "step": 8116, "time_per_iteration": 2.7153406143188477 }, { "auxiliary_loss_clip": 0.01265116, "auxiliary_loss_mlp": 0.01027228, "balance_loss_clip": 1.04768705, "balance_loss_mlp": 1.0210886, "epoch": 0.9760115433175014, "flos": 22200156512640.0, "grad_norm": 1.9783632060827394, "language_loss": 0.78825289, "learning_rate": 6.0044334468278835e-09, "loss": 0.81117636, "num_input_tokens_seen": 175282275, "step": 8117, "time_per_iteration": 2.7277421951293945 }, { "auxiliary_loss_clip": 0.01367694, "auxiliary_loss_mlp": 0.01031055, "balance_loss_clip": 1.04208255, "balance_loss_mlp": 1.0233686, "epoch": 0.9761317862081405, "flos": 26250736389120.0, "grad_norm": 1.7978796383149325, "language_loss": 0.7161262, "learning_rate": 5.944268840805345e-09, "loss": 0.74011368, "num_input_tokens_seen": 175303020, "step": 8118, "time_per_iteration": 2.7255361080169678 }, { "auxiliary_loss_clip": 0.01317571, "auxiliary_loss_mlp": 0.0102212, "balance_loss_clip": 1.04298282, "balance_loss_mlp": 1.01615906, "epoch": 0.9762520290987795, "flos": 26575440359040.0, "grad_norm": 4.822984836200634, "language_loss": 0.64116704, "learning_rate": 5.88440673145163e-09, "loss": 0.66456389, "num_input_tokens_seen": 175324070, "step": 8119, "time_per_iteration": 2.7915236949920654 }, { "auxiliary_loss_clip": 0.01220199, "auxiliary_loss_mlp": 0.01026081, "balance_loss_clip": 1.05035663, "balance_loss_mlp": 1.01922047, "epoch": 0.9763722719894187, "flos": 18005036307840.0, "grad_norm": 3.191448174318995, "language_loss": 0.82834876, "learning_rate": 5.824847127848142e-09, "loss": 0.85081154, "num_input_tokens_seen": 175342595, "step": 8120, "time_per_iteration": 3.515204906463623 }, { "auxiliary_loss_clip": 0.01373791, "auxiliary_loss_mlp": 0.0103252, "balance_loss_clip": 1.04596436, "balance_loss_mlp": 1.0256114, "epoch": 0.9764925148800577, "flos": 22455660931200.0, "grad_norm": 1.8335337308634592, "language_loss": 0.78982365, "learning_rate": 5.765590039029433e-09, "loss": 0.81388676, "num_input_tokens_seen": 175361915, "step": 8121, "time_per_iteration": 2.7313811779022217 }, { "auxiliary_loss_clip": 0.01173664, "auxiliary_loss_mlp": 0.01026049, "balance_loss_clip": 1.05110669, "balance_loss_mlp": 1.01927197, "epoch": 0.9766127577706968, "flos": 36757084786560.0, "grad_norm": 2.4584418863121416, "language_loss": 0.7116769, "learning_rate": 5.706635473985422e-09, "loss": 0.73367405, "num_input_tokens_seen": 175385785, "step": 8122, "time_per_iteration": 2.6834144592285156 }, { "auxiliary_loss_clip": 0.01218873, "auxiliary_loss_mlp": 0.01024027, "balance_loss_clip": 1.04488683, "balance_loss_mlp": 1.01692176, "epoch": 0.976733000661336, "flos": 22309971367680.0, "grad_norm": 2.1595264601935886, "language_loss": 0.85083747, "learning_rate": 5.6479834416591764e-09, "loss": 0.87326646, "num_input_tokens_seen": 175405145, "step": 8123, "time_per_iteration": 2.6524124145507812 }, { "auxiliary_loss_clip": 0.01221033, "auxiliary_loss_mlp": 0.02569138, "balance_loss_clip": 1.048069, "balance_loss_mlp": 0.99987769, "epoch": 0.976853243551975, "flos": 25810938264960.0, "grad_norm": 2.5939874355795367, "language_loss": 0.68607068, "learning_rate": 5.589633950947803e-09, "loss": 0.72397232, "num_input_tokens_seen": 175422645, "step": 8124, "time_per_iteration": 2.6467039585113525 }, { "auxiliary_loss_clip": 0.01271243, "auxiliary_loss_mlp": 0.01025784, "balance_loss_clip": 1.04639804, "balance_loss_mlp": 1.01827347, "epoch": 0.9769734864426141, "flos": 21397445326080.0, "grad_norm": 1.9917340242041177, "language_loss": 0.70092523, "learning_rate": 5.5315870107035535e-09, "loss": 0.72389555, "num_input_tokens_seen": 175440695, "step": 8125, "time_per_iteration": 2.6610519886016846 }, { "auxiliary_loss_clip": 0.01272935, "auxiliary_loss_mlp": 0.01025066, "balance_loss_clip": 1.04869461, "balance_loss_mlp": 1.01805639, "epoch": 0.9770937293332532, "flos": 13990977584640.0, "grad_norm": 2.0604930519317213, "language_loss": 0.78960466, "learning_rate": 5.473842629731607e-09, "loss": 0.81258476, "num_input_tokens_seen": 175459195, "step": 8126, "time_per_iteration": 2.7171144485473633 }, { "auxiliary_loss_clip": 0.01279565, "auxiliary_loss_mlp": 0.02568369, "balance_loss_clip": 1.04635072, "balance_loss_mlp": 0.99994457, "epoch": 0.9772139722238923, "flos": 17931994001280.0, "grad_norm": 1.9912109999352892, "language_loss": 0.78261828, "learning_rate": 5.416400816792066e-09, "loss": 0.82109761, "num_input_tokens_seen": 175476710, "step": 8127, "time_per_iteration": 2.6238460540771484 }, { "auxiliary_loss_clip": 0.01171202, "auxiliary_loss_mlp": 0.01025033, "balance_loss_clip": 1.04819536, "balance_loss_mlp": 1.01784492, "epoch": 0.9773342151145313, "flos": 20446171488000.0, "grad_norm": 2.625799802977219, "language_loss": 0.7864815, "learning_rate": 5.359261580598407e-09, "loss": 0.8084439, "num_input_tokens_seen": 175492550, "step": 8128, "time_per_iteration": 2.59236216545105 }, { "auxiliary_loss_clip": 0.01220888, "auxiliary_loss_mlp": 0.01024604, "balance_loss_clip": 1.04707026, "balance_loss_mlp": 1.01672482, "epoch": 0.9774544580051704, "flos": 11837306949120.0, "grad_norm": 2.301133808003227, "language_loss": 0.78066677, "learning_rate": 5.302424929819027e-09, "loss": 0.80312169, "num_input_tokens_seen": 175506560, "step": 8129, "time_per_iteration": 2.575076103210449 }, { "auxiliary_loss_clip": 0.01224774, "auxiliary_loss_mlp": 0.01023147, "balance_loss_clip": 1.04411221, "balance_loss_mlp": 1.01603937, "epoch": 0.9775747008958096, "flos": 13479932833920.0, "grad_norm": 2.3345552991396996, "language_loss": 0.73527825, "learning_rate": 5.24589087307592e-09, "loss": 0.75775743, "num_input_tokens_seen": 175524180, "step": 8130, "time_per_iteration": 2.630694627761841 }, { "auxiliary_loss_clip": 0.01174957, "auxiliary_loss_mlp": 0.01025575, "balance_loss_clip": 1.04823565, "balance_loss_mlp": 1.01838374, "epoch": 0.9776949437864486, "flos": 59532314042880.0, "grad_norm": 2.4688961882876854, "language_loss": 0.6480757, "learning_rate": 5.189659418944891e-09, "loss": 0.67008102, "num_input_tokens_seen": 175554355, "step": 8131, "time_per_iteration": 3.0589349269866943 }, { "auxiliary_loss_clip": 0.01173222, "auxiliary_loss_mlp": 0.01028106, "balance_loss_clip": 1.04964876, "balance_loss_mlp": 1.02104282, "epoch": 0.9778151866770877, "flos": 21178605715200.0, "grad_norm": 1.9564665642922583, "language_loss": 0.78578341, "learning_rate": 5.133730575956674e-09, "loss": 0.80779666, "num_input_tokens_seen": 175574025, "step": 8132, "time_per_iteration": 2.6388168334960938 }, { "auxiliary_loss_clip": 0.01272742, "auxiliary_loss_mlp": 0.01024266, "balance_loss_clip": 1.04585528, "balance_loss_mlp": 1.01774561, "epoch": 0.9779354295677268, "flos": 20886795624960.0, "grad_norm": 2.94121351394592, "language_loss": 0.72344553, "learning_rate": 5.0781043525953696e-09, "loss": 0.74641562, "num_input_tokens_seen": 175592090, "step": 8133, "time_per_iteration": 2.6564722061157227 }, { "auxiliary_loss_clip": 0.01266018, "auxiliary_loss_mlp": 0.01025051, "balance_loss_clip": 1.04704213, "balance_loss_mlp": 1.01853633, "epoch": 0.9780556724583659, "flos": 23440618748160.0, "grad_norm": 1.847890416653465, "language_loss": 0.74091196, "learning_rate": 5.0227807572995605e-09, "loss": 0.76382262, "num_input_tokens_seen": 175614065, "step": 8134, "time_per_iteration": 2.70780873298645 }, { "auxiliary_loss_clip": 0.0127298, "auxiliary_loss_mlp": 0.01024892, "balance_loss_clip": 1.04519677, "balance_loss_mlp": 1.01827574, "epoch": 0.9781759153490049, "flos": 20923244951040.0, "grad_norm": 3.7079227045462817, "language_loss": 0.67633152, "learning_rate": 4.967759798461646e-09, "loss": 0.69931018, "num_input_tokens_seen": 175632410, "step": 8135, "time_per_iteration": 2.638004779815674 }, { "auxiliary_loss_clip": 0.0117154, "auxiliary_loss_mlp": 0.01022112, "balance_loss_clip": 1.04987288, "balance_loss_mlp": 1.01595759, "epoch": 0.9782961582396441, "flos": 28293191539200.0, "grad_norm": 2.051541617860036, "language_loss": 0.75004125, "learning_rate": 4.913041484428282e-09, "loss": 0.77197778, "num_input_tokens_seen": 175652885, "step": 8136, "time_per_iteration": 2.674104690551758 }, { "auxiliary_loss_clip": 0.01225637, "auxiliary_loss_mlp": 0.01024227, "balance_loss_clip": 1.04892349, "balance_loss_mlp": 1.01780462, "epoch": 0.9784164011302832, "flos": 25552955808000.0, "grad_norm": 2.0098225567106742, "language_loss": 0.73827291, "learning_rate": 4.858625823500384e-09, "loss": 0.76077157, "num_input_tokens_seen": 175670585, "step": 8137, "time_per_iteration": 3.595191240310669 }, { "auxiliary_loss_clip": 0.01225292, "auxiliary_loss_mlp": 0.01022951, "balance_loss_clip": 1.04703152, "balance_loss_mlp": 1.01611996, "epoch": 0.9785366440209222, "flos": 29965945956480.0, "grad_norm": 2.2428913617810364, "language_loss": 0.73451686, "learning_rate": 4.80451282393246e-09, "loss": 0.75699925, "num_input_tokens_seen": 175690570, "step": 8138, "time_per_iteration": 3.527982234954834 }, { "auxiliary_loss_clip": 0.01172602, "auxiliary_loss_mlp": 0.01024184, "balance_loss_clip": 1.04644108, "balance_loss_mlp": 1.01725817, "epoch": 0.9786568869115614, "flos": 32343591847680.0, "grad_norm": 3.792971419525503, "language_loss": 0.67405462, "learning_rate": 4.750702493933722e-09, "loss": 0.69602251, "num_input_tokens_seen": 175710455, "step": 8139, "time_per_iteration": 2.748850107192993 }, { "auxiliary_loss_clip": 0.01276676, "auxiliary_loss_mlp": 0.025655, "balance_loss_clip": 1.04895186, "balance_loss_mlp": 0.99993134, "epoch": 0.9787771298022004, "flos": 23331414424320.0, "grad_norm": 2.3537783296508974, "language_loss": 0.8501485, "learning_rate": 4.697194841666974e-09, "loss": 0.88857025, "num_input_tokens_seen": 175729380, "step": 8140, "time_per_iteration": 2.6363420486450195 }, { "auxiliary_loss_clip": 0.01222637, "auxiliary_loss_mlp": 0.01024198, "balance_loss_clip": 1.04742908, "balance_loss_mlp": 1.01653266, "epoch": 0.9788973726928395, "flos": 21468548298240.0, "grad_norm": 3.2536707362221367, "language_loss": 0.82157284, "learning_rate": 4.6439898752492764e-09, "loss": 0.84404123, "num_input_tokens_seen": 175749520, "step": 8141, "time_per_iteration": 3.56811785697937 }, { "auxiliary_loss_clip": 0.01113895, "auxiliary_loss_mlp": 0.02505272, "balance_loss_clip": 1.00646019, "balance_loss_mlp": 0.99989361, "epoch": 0.9790176155834787, "flos": 68897459439360.0, "grad_norm": 0.75043556857342, "language_loss": 0.63661683, "learning_rate": 4.591087602751731e-09, "loss": 0.67280853, "num_input_tokens_seen": 175811380, "step": 8142, "time_per_iteration": 3.3031511306762695 }, { "auxiliary_loss_clip": 0.01218842, "auxiliary_loss_mlp": 0.0102109, "balance_loss_clip": 1.04692125, "balance_loss_mlp": 1.01423287, "epoch": 0.9791378584741177, "flos": 21430877909760.0, "grad_norm": 1.6888868371381534, "language_loss": 0.72147393, "learning_rate": 4.538488032199916e-09, "loss": 0.7438733, "num_input_tokens_seen": 175829480, "step": 8143, "time_per_iteration": 2.6398508548736572 }, { "auxiliary_loss_clip": 0.01226777, "auxiliary_loss_mlp": 0.01030088, "balance_loss_clip": 1.04656982, "balance_loss_mlp": 1.0231024, "epoch": 0.9792581013647568, "flos": 20153032594560.0, "grad_norm": 2.1645152874799054, "language_loss": 0.69227058, "learning_rate": 4.486191171572784e-09, "loss": 0.71483922, "num_input_tokens_seen": 175846750, "step": 8144, "time_per_iteration": 2.670585870742798 }, { "auxiliary_loss_clip": 0.01226561, "auxiliary_loss_mlp": 0.01023141, "balance_loss_clip": 1.04906023, "balance_loss_mlp": 1.01669443, "epoch": 0.9793783442553959, "flos": 23728191033600.0, "grad_norm": 4.948835463403625, "language_loss": 0.77575958, "learning_rate": 4.434197028803766e-09, "loss": 0.79825664, "num_input_tokens_seen": 175865975, "step": 8145, "time_per_iteration": 2.6177213191986084 }, { "auxiliary_loss_clip": 0.01326631, "auxiliary_loss_mlp": 0.01032407, "balance_loss_clip": 1.04319286, "balance_loss_mlp": 1.02567816, "epoch": 0.979498587146035, "flos": 23038742407680.0, "grad_norm": 2.1390658832230183, "language_loss": 0.82168937, "learning_rate": 4.3825056117805514e-09, "loss": 0.84527969, "num_input_tokens_seen": 175881860, "step": 8146, "time_per_iteration": 3.596791982650757 }, { "auxiliary_loss_clip": 0.01173867, "auxiliary_loss_mlp": 0.01030863, "balance_loss_clip": 1.04887772, "balance_loss_mlp": 1.02346587, "epoch": 0.979618830036674, "flos": 14318841951360.0, "grad_norm": 3.4817894796991773, "language_loss": 0.79234946, "learning_rate": 4.331116928344425e-09, "loss": 0.8143968, "num_input_tokens_seen": 175898175, "step": 8147, "time_per_iteration": 2.5755534172058105 }, { "auxiliary_loss_clip": 0.01279424, "auxiliary_loss_mlp": 0.02566897, "balance_loss_clip": 1.04761004, "balance_loss_mlp": 0.99990994, "epoch": 0.9797390729273132, "flos": 16727514215040.0, "grad_norm": 2.162891058742617, "language_loss": 0.62662053, "learning_rate": 4.28003098629115e-09, "loss": 0.66508371, "num_input_tokens_seen": 175914310, "step": 8148, "time_per_iteration": 2.609384298324585 }, { "auxiliary_loss_clip": 0.01319795, "auxiliary_loss_mlp": 0.01020046, "balance_loss_clip": 1.03893626, "balance_loss_mlp": 1.01318526, "epoch": 0.9798593158179523, "flos": 24532661986560.0, "grad_norm": 1.8532458841370534, "language_loss": 0.78794241, "learning_rate": 4.229247793370305e-09, "loss": 0.81134081, "num_input_tokens_seen": 175933435, "step": 8149, "time_per_iteration": 2.759444236755371 }, { "auxiliary_loss_clip": 0.01173106, "auxiliary_loss_mlp": 0.01028326, "balance_loss_clip": 1.04908299, "balance_loss_mlp": 1.02126312, "epoch": 0.9799795587085913, "flos": 27308808339840.0, "grad_norm": 1.9452679776619388, "language_loss": 0.70489651, "learning_rate": 4.178767357285951e-09, "loss": 0.72691089, "num_input_tokens_seen": 175955065, "step": 8150, "time_per_iteration": 2.6126294136047363 }, { "auxiliary_loss_clip": 0.01224681, "auxiliary_loss_mlp": 0.02566124, "balance_loss_clip": 1.04816163, "balance_loss_mlp": 0.99990237, "epoch": 0.9800998015992305, "flos": 26286575184000.0, "grad_norm": 2.719879798257293, "language_loss": 0.71867979, "learning_rate": 4.128589685695516e-09, "loss": 0.75658786, "num_input_tokens_seen": 175975490, "step": 8151, "time_per_iteration": 2.713348150253296 }, { "auxiliary_loss_clip": 0.01172446, "auxiliary_loss_mlp": 0.01027975, "balance_loss_clip": 1.0489521, "balance_loss_mlp": 1.02073312, "epoch": 0.9802200444898695, "flos": 16723635546240.0, "grad_norm": 1.8883844856498497, "language_loss": 0.84246379, "learning_rate": 4.078714786211135e-09, "loss": 0.86446798, "num_input_tokens_seen": 175991340, "step": 8152, "time_per_iteration": 2.515617609024048 }, { "auxiliary_loss_clip": 0.01218018, "auxiliary_loss_mlp": 0.0102296, "balance_loss_clip": 1.04706669, "balance_loss_mlp": 1.0164963, "epoch": 0.9803402873805086, "flos": 24900459298560.0, "grad_norm": 2.20408790809063, "language_loss": 0.76812756, "learning_rate": 4.029142666398977e-09, "loss": 0.79053736, "num_input_tokens_seen": 176011505, "step": 8153, "time_per_iteration": 2.7215042114257812 }, { "auxiliary_loss_clip": 0.01172649, "auxiliary_loss_mlp": 0.01022216, "balance_loss_clip": 1.05108571, "balance_loss_mlp": 1.01610935, "epoch": 0.9804605302711478, "flos": 22564937082240.0, "grad_norm": 2.090763925885501, "language_loss": 0.80400276, "learning_rate": 3.979873333778805e-09, "loss": 0.8259514, "num_input_tokens_seen": 176029680, "step": 8154, "time_per_iteration": 2.5991363525390625 }, { "auxiliary_loss_clip": 0.01276492, "auxiliary_loss_mlp": 0.01028251, "balance_loss_clip": 1.04729414, "balance_loss_mlp": 1.02103329, "epoch": 0.9805807731617868, "flos": 38905368382080.0, "grad_norm": 1.979390765325647, "language_loss": 0.74103242, "learning_rate": 3.930906795824862e-09, "loss": 0.76407987, "num_input_tokens_seen": 176050355, "step": 8155, "time_per_iteration": 2.7860875129699707 }, { "auxiliary_loss_clip": 0.01217515, "auxiliary_loss_mlp": 0.01025229, "balance_loss_clip": 1.04692578, "balance_loss_mlp": 1.01848149, "epoch": 0.9807010160524259, "flos": 17821999578240.0, "grad_norm": 1.9867082688524853, "language_loss": 0.765136, "learning_rate": 3.882243059965207e-09, "loss": 0.78756344, "num_input_tokens_seen": 176068070, "step": 8156, "time_per_iteration": 2.5889415740966797 }, { "auxiliary_loss_clip": 0.012192, "auxiliary_loss_mlp": 0.01023844, "balance_loss_clip": 1.04529333, "balance_loss_mlp": 1.01633632, "epoch": 0.980821258943065, "flos": 13552975140480.0, "grad_norm": 3.5832252652977896, "language_loss": 0.65992737, "learning_rate": 3.833882133582156e-09, "loss": 0.68235779, "num_input_tokens_seen": 176083730, "step": 8157, "time_per_iteration": 2.614975690841675 }, { "auxiliary_loss_clip": 0.01224633, "auxiliary_loss_mlp": 0.01026221, "balance_loss_clip": 1.04654217, "balance_loss_mlp": 1.01933372, "epoch": 0.9809415018337041, "flos": 21689794120320.0, "grad_norm": 2.5789150738741964, "language_loss": 0.78288484, "learning_rate": 3.785824024012285e-09, "loss": 0.80539334, "num_input_tokens_seen": 176102730, "step": 8158, "time_per_iteration": 2.615952491760254 }, { "auxiliary_loss_clip": 0.01265092, "auxiliary_loss_mlp": 0.01022478, "balance_loss_clip": 1.04788446, "balance_loss_mlp": 1.01590693, "epoch": 0.9810617447243432, "flos": 23294857357440.0, "grad_norm": 1.7257324468670343, "language_loss": 0.78469872, "learning_rate": 3.738068738545541e-09, "loss": 0.80757445, "num_input_tokens_seen": 176121815, "step": 8159, "time_per_iteration": 2.6631081104278564 }, { "auxiliary_loss_clip": 0.01226036, "auxiliary_loss_mlp": 0.01025912, "balance_loss_clip": 1.04758382, "balance_loss_mlp": 1.01888514, "epoch": 0.9811819876149822, "flos": 18332038748160.0, "grad_norm": 2.3251792899182764, "language_loss": 0.78414917, "learning_rate": 3.6906162844265733e-09, "loss": 0.80666864, "num_input_tokens_seen": 176138900, "step": 8160, "time_per_iteration": 2.6103334426879883 }, { "auxiliary_loss_clip": 0.01269131, "auxiliary_loss_mlp": 0.01024699, "balance_loss_clip": 1.0437268, "balance_loss_mlp": 1.01753163, "epoch": 0.9813022305056214, "flos": 22601961025920.0, "grad_norm": 1.7736737430572151, "language_loss": 0.70585728, "learning_rate": 3.643466668853845e-09, "loss": 0.72879553, "num_input_tokens_seen": 176156925, "step": 8161, "time_per_iteration": 2.6477913856506348 }, { "auxiliary_loss_clip": 0.01272676, "auxiliary_loss_mlp": 0.01025035, "balance_loss_clip": 1.04368806, "balance_loss_mlp": 1.01808524, "epoch": 0.9814224733962604, "flos": 25413335642880.0, "grad_norm": 2.8394578556735595, "language_loss": 0.75463331, "learning_rate": 3.59661989898008e-09, "loss": 0.77761042, "num_input_tokens_seen": 176177980, "step": 8162, "time_per_iteration": 2.716294527053833 }, { "auxiliary_loss_clip": 0.01312619, "auxiliary_loss_mlp": 0.01025384, "balance_loss_clip": 1.04467058, "balance_loss_mlp": 1.01879442, "epoch": 0.9815427162868995, "flos": 25007185584000.0, "grad_norm": 2.0459836969205134, "language_loss": 0.76412016, "learning_rate": 3.5500759819115934e-09, "loss": 0.78750014, "num_input_tokens_seen": 176198345, "step": 8163, "time_per_iteration": 3.570202112197876 }, { "auxiliary_loss_clip": 0.01173495, "auxiliary_loss_mlp": 0.01031447, "balance_loss_clip": 1.04988694, "balance_loss_mlp": 1.02487254, "epoch": 0.9816629591775387, "flos": 20662604887680.0, "grad_norm": 2.020009534367411, "language_loss": 0.81205851, "learning_rate": 3.5038349247094034e-09, "loss": 0.83410788, "num_input_tokens_seen": 176215605, "step": 8164, "time_per_iteration": 3.4798805713653564 }, { "auxiliary_loss_clip": 0.01273208, "auxiliary_loss_mlp": 0.01025981, "balance_loss_clip": 1.04458666, "balance_loss_mlp": 1.01925468, "epoch": 0.9817832020681777, "flos": 17712220636800.0, "grad_norm": 6.262593811881118, "language_loss": 0.77390009, "learning_rate": 3.4578967343878994e-09, "loss": 0.79689199, "num_input_tokens_seen": 176231810, "step": 8165, "time_per_iteration": 2.6599955558776855 }, { "auxiliary_loss_clip": 0.01269929, "auxiliary_loss_mlp": 0.01022838, "balance_loss_clip": 1.04697299, "balance_loss_mlp": 1.01598358, "epoch": 0.9819034449588168, "flos": 22530032040960.0, "grad_norm": 2.301382984170266, "language_loss": 0.80930525, "learning_rate": 3.4122614179161733e-09, "loss": 0.83223295, "num_input_tokens_seen": 176251770, "step": 8166, "time_per_iteration": 2.638740301132202 }, { "auxiliary_loss_clip": 0.0131764, "auxiliary_loss_mlp": 0.01024889, "balance_loss_clip": 1.04340744, "balance_loss_mlp": 1.01876795, "epoch": 0.9820236878494559, "flos": 20011221699840.0, "grad_norm": 2.184244554571534, "language_loss": 0.78485596, "learning_rate": 3.36692898221691e-09, "loss": 0.8082813, "num_input_tokens_seen": 176270135, "step": 8167, "time_per_iteration": 3.5635814666748047 }, { "auxiliary_loss_clip": 0.01224919, "auxiliary_loss_mlp": 0.01018697, "balance_loss_clip": 1.04802823, "balance_loss_mlp": 1.01261425, "epoch": 0.982143930740095, "flos": 18807316531200.0, "grad_norm": 1.9238882069283803, "language_loss": 0.73705482, "learning_rate": 3.3218994341668305e-09, "loss": 0.75949097, "num_input_tokens_seen": 176289065, "step": 8168, "time_per_iteration": 2.5854713916778564 }, { "auxiliary_loss_clip": 0.01172377, "auxiliary_loss_mlp": 0.01020588, "balance_loss_clip": 1.05139637, "balance_loss_mlp": 1.01421309, "epoch": 0.982264173630734, "flos": 26578026138240.0, "grad_norm": 1.669297662324535, "language_loss": 0.75424314, "learning_rate": 3.2771727805971373e-09, "loss": 0.77617282, "num_input_tokens_seen": 176310450, "step": 8169, "time_per_iteration": 2.6621246337890625 }, { "auxiliary_loss_clip": 0.01367737, "auxiliary_loss_mlp": 0.01023575, "balance_loss_clip": 1.03941703, "balance_loss_mlp": 1.01676238, "epoch": 0.9823844165213732, "flos": 22014462176640.0, "grad_norm": 1.7351309740527119, "language_loss": 0.771083, "learning_rate": 3.232749028292847e-09, "loss": 0.79499614, "num_input_tokens_seen": 176327415, "step": 8170, "time_per_iteration": 2.7080976963043213 }, { "auxiliary_loss_clip": 0.01172868, "auxiliary_loss_mlp": 0.01028055, "balance_loss_clip": 1.04773366, "balance_loss_mlp": 1.0207442, "epoch": 0.9825046594120123, "flos": 21908166854400.0, "grad_norm": 1.7804051480996195, "language_loss": 0.88193125, "learning_rate": 3.188628183992792e-09, "loss": 0.9039405, "num_input_tokens_seen": 176347680, "step": 8171, "time_per_iteration": 2.5987253189086914 }, { "auxiliary_loss_clip": 0.01116465, "auxiliary_loss_mlp": 0.01000301, "balance_loss_clip": 1.00557315, "balance_loss_mlp": 0.99941874, "epoch": 0.9826249023026513, "flos": 59494610718720.0, "grad_norm": 0.7368035878670971, "language_loss": 0.62478185, "learning_rate": 3.1448102543902844e-09, "loss": 0.6459496, "num_input_tokens_seen": 176411595, "step": 8172, "time_per_iteration": 4.1643383502960205 }, { "auxiliary_loss_clip": 0.01268924, "auxiliary_loss_mlp": 0.01025972, "balance_loss_clip": 1.04656959, "balance_loss_mlp": 1.01934719, "epoch": 0.9827451451932905, "flos": 16071031296000.0, "grad_norm": 2.193108206931262, "language_loss": 0.67748189, "learning_rate": 3.1012952461324515e-09, "loss": 0.70043087, "num_input_tokens_seen": 176430570, "step": 8173, "time_per_iteration": 2.6003880500793457 }, { "auxiliary_loss_clip": 0.01218764, "auxiliary_loss_mlp": 0.01025701, "balance_loss_clip": 1.04982734, "balance_loss_mlp": 1.01849174, "epoch": 0.9828653880839295, "flos": 20262775622400.0, "grad_norm": 2.119283976370514, "language_loss": 0.73960018, "learning_rate": 3.0580831658204575e-09, "loss": 0.76204479, "num_input_tokens_seen": 176448150, "step": 8174, "time_per_iteration": 2.596463918685913 }, { "auxiliary_loss_clip": 0.01221367, "auxiliary_loss_mlp": 0.01023407, "balance_loss_clip": 1.04902327, "balance_loss_mlp": 1.01656413, "epoch": 0.9829856309745686, "flos": 21616141282560.0, "grad_norm": 2.9082518850920325, "language_loss": 0.77834213, "learning_rate": 3.015174020009281e-09, "loss": 0.80078983, "num_input_tokens_seen": 176467475, "step": 8175, "time_per_iteration": 2.5999507904052734 }, { "auxiliary_loss_clip": 0.01325648, "auxiliary_loss_mlp": 0.01028973, "balance_loss_clip": 1.04312086, "balance_loss_mlp": 1.02238381, "epoch": 0.9831058738652078, "flos": 23764209396480.0, "grad_norm": 1.8113715005267594, "language_loss": 0.74932551, "learning_rate": 2.9725678152086043e-09, "loss": 0.77287173, "num_input_tokens_seen": 176486045, "step": 8176, "time_per_iteration": 2.6963109970092773 }, { "auxiliary_loss_clip": 0.01263062, "auxiliary_loss_mlp": 0.01023851, "balance_loss_clip": 1.04280055, "balance_loss_mlp": 1.01660585, "epoch": 0.9832261167558468, "flos": 11320911072000.0, "grad_norm": 2.8229668881490704, "language_loss": 0.82422501, "learning_rate": 2.930264557881257e-09, "loss": 0.84709418, "num_input_tokens_seen": 176501230, "step": 8177, "time_per_iteration": 2.648855686187744 }, { "auxiliary_loss_clip": 0.01059521, "auxiliary_loss_mlp": 0.01000028, "balance_loss_clip": 1.00561905, "balance_loss_mlp": 0.99919969, "epoch": 0.9833463596464859, "flos": 60000304343040.0, "grad_norm": 0.8324063270151959, "language_loss": 0.58128452, "learning_rate": 2.8882642544452163e-09, "loss": 0.60188007, "num_input_tokens_seen": 176565955, "step": 8178, "time_per_iteration": 3.167478561401367 }, { "auxiliary_loss_clip": 0.01265823, "auxiliary_loss_mlp": 0.01027173, "balance_loss_clip": 1.04411542, "balance_loss_mlp": 1.02057481, "epoch": 0.983466602537125, "flos": 13626699805440.0, "grad_norm": 3.1350885250536518, "language_loss": 0.74560237, "learning_rate": 2.8465669112716083e-09, "loss": 0.7685324, "num_input_tokens_seen": 176583480, "step": 8179, "time_per_iteration": 2.6301181316375732 }, { "auxiliary_loss_clip": 0.01225502, "auxiliary_loss_mlp": 0.0256667, "balance_loss_clip": 1.04691458, "balance_loss_mlp": 0.99991322, "epoch": 0.9835868454277641, "flos": 22926844563840.0, "grad_norm": 1.9440262614287958, "language_loss": 0.76735222, "learning_rate": 2.8051725346858177e-09, "loss": 0.80527395, "num_input_tokens_seen": 176603740, "step": 8180, "time_per_iteration": 2.6243069171905518 }, { "auxiliary_loss_clip": 0.01171122, "auxiliary_loss_mlp": 0.01022736, "balance_loss_clip": 1.04537868, "balance_loss_mlp": 1.01607251, "epoch": 0.9837070883184031, "flos": 27673409341440.0, "grad_norm": 2.1974190148497064, "language_loss": 0.71184713, "learning_rate": 2.7640811309674883e-09, "loss": 0.73378569, "num_input_tokens_seen": 176623240, "step": 8181, "time_per_iteration": 2.6329071521759033 }, { "auxiliary_loss_clip": 0.01314847, "auxiliary_loss_mlp": 0.01027026, "balance_loss_clip": 1.04355681, "balance_loss_mlp": 1.02018595, "epoch": 0.9838273312090423, "flos": 29241951425280.0, "grad_norm": 1.688612520813857, "language_loss": 0.80811161, "learning_rate": 2.7232927063498557e-09, "loss": 0.83153033, "num_input_tokens_seen": 176643615, "step": 8182, "time_per_iteration": 2.7302498817443848 }, { "auxiliary_loss_clip": 0.01221585, "auxiliary_loss_mlp": 0.0102678, "balance_loss_clip": 1.04752254, "balance_loss_mlp": 1.0196867, "epoch": 0.9839475740996814, "flos": 40110207304320.0, "grad_norm": 5.746203946027384, "language_loss": 0.69310331, "learning_rate": 2.682807267020859e-09, "loss": 0.71558702, "num_input_tokens_seen": 176666375, "step": 8183, "time_per_iteration": 2.773212194442749 }, { "auxiliary_loss_clip": 0.01219006, "auxiliary_loss_mlp": 0.01027458, "balance_loss_clip": 1.04577994, "balance_loss_mlp": 1.02093983, "epoch": 0.9840678169903204, "flos": 24169389788160.0, "grad_norm": 1.7319794494844207, "language_loss": 0.62761998, "learning_rate": 2.642624819121808e-09, "loss": 0.65008461, "num_input_tokens_seen": 176686525, "step": 8184, "time_per_iteration": 2.6370620727539062 }, { "auxiliary_loss_clip": 0.01271645, "auxiliary_loss_mlp": 0.01024344, "balance_loss_clip": 1.04804611, "balance_loss_mlp": 1.01764441, "epoch": 0.9841880598809596, "flos": 14684484447360.0, "grad_norm": 1.9885309791199768, "language_loss": 0.61712414, "learning_rate": 2.6027453687487154e-09, "loss": 0.64008403, "num_input_tokens_seen": 176703615, "step": 8185, "time_per_iteration": 2.6404874324798584 }, { "auxiliary_loss_clip": 0.01269643, "auxiliary_loss_mlp": 0.01027738, "balance_loss_clip": 1.04666805, "balance_loss_mlp": 1.02087498, "epoch": 0.9843083027715986, "flos": 22344768668160.0, "grad_norm": 2.315364890694146, "language_loss": 0.53986239, "learning_rate": 2.5631689219509643e-09, "loss": 0.56283623, "num_input_tokens_seen": 176722295, "step": 8186, "time_per_iteration": 2.628795623779297 }, { "auxiliary_loss_clip": 0.01167284, "auxiliary_loss_mlp": 0.01023346, "balance_loss_clip": 1.04695392, "balance_loss_mlp": 1.01718307, "epoch": 0.9844285456622377, "flos": 21800111765760.0, "grad_norm": 1.7752884578119619, "language_loss": 0.83641064, "learning_rate": 2.523895484732197e-09, "loss": 0.85831696, "num_input_tokens_seen": 176741750, "step": 8187, "time_per_iteration": 2.702096939086914 }, { "auxiliary_loss_clip": 0.0122705, "auxiliary_loss_mlp": 0.01023417, "balance_loss_clip": 1.04641771, "balance_loss_mlp": 1.01613355, "epoch": 0.9845487885528769, "flos": 18035380321920.0, "grad_norm": 2.1843653065248247, "language_loss": 0.74768573, "learning_rate": 2.4849250630505357e-09, "loss": 0.77019036, "num_input_tokens_seen": 176759995, "step": 8188, "time_per_iteration": 2.6488585472106934 }, { "auxiliary_loss_clip": 0.01458816, "auxiliary_loss_mlp": 0.0102396, "balance_loss_clip": 1.03801394, "balance_loss_mlp": 1.0173111, "epoch": 0.9846690314435159, "flos": 25228610974080.0, "grad_norm": 1.7948988955524519, "language_loss": 0.73444366, "learning_rate": 2.4462576628172528e-09, "loss": 0.75927138, "num_input_tokens_seen": 176778625, "step": 8189, "time_per_iteration": 3.644676923751831 }, { "auxiliary_loss_clip": 0.01221307, "auxiliary_loss_mlp": 0.01028987, "balance_loss_clip": 1.04898906, "balance_loss_mlp": 1.02226388, "epoch": 0.984789274334155, "flos": 18552171248640.0, "grad_norm": 3.993905874798312, "language_loss": 0.73859215, "learning_rate": 2.407893289898766e-09, "loss": 0.76109505, "num_input_tokens_seen": 176797655, "step": 8190, "time_per_iteration": 3.5194203853607178 }, { "auxiliary_loss_clip": 0.01314452, "auxiliary_loss_mlp": 0.0102452, "balance_loss_clip": 1.04283881, "balance_loss_mlp": 1.01748919, "epoch": 0.984909517224794, "flos": 27345437233920.0, "grad_norm": 1.9002799080798765, "language_loss": 0.84223616, "learning_rate": 2.3698319501144202e-09, "loss": 0.86562586, "num_input_tokens_seen": 176818640, "step": 8191, "time_per_iteration": 2.688042402267456 }, { "auxiliary_loss_clip": 0.01127644, "auxiliary_loss_mlp": 0.01027416, "balance_loss_clip": 1.04848576, "balance_loss_mlp": 1.02038288, "epoch": 0.9850297601154332, "flos": 18734058743040.0, "grad_norm": 2.6367740969489186, "language_loss": 0.73392749, "learning_rate": 2.3320736492382644e-09, "loss": 0.75547802, "num_input_tokens_seen": 176837475, "step": 8192, "time_per_iteration": 2.6411659717559814 }, { "auxiliary_loss_clip": 0.01171305, "auxiliary_loss_mlp": 0.01024751, "balance_loss_clip": 1.04954433, "balance_loss_mlp": 1.01810467, "epoch": 0.9851500030060723, "flos": 22308247514880.0, "grad_norm": 1.963331951677283, "language_loss": 0.68415773, "learning_rate": 2.29461839299816e-09, "loss": 0.70611823, "num_input_tokens_seen": 176857190, "step": 8193, "time_per_iteration": 2.5515103340148926 }, { "auxiliary_loss_clip": 0.01222865, "auxiliary_loss_mlp": 0.01022837, "balance_loss_clip": 1.04580092, "balance_loss_mlp": 1.0162003, "epoch": 0.9852702458967113, "flos": 26353691746560.0, "grad_norm": 1.6802903589310003, "language_loss": 0.80061376, "learning_rate": 2.257466187076229e-09, "loss": 0.82307082, "num_input_tokens_seen": 176876395, "step": 8194, "time_per_iteration": 3.6386654376983643 }, { "auxiliary_loss_clip": 0.01227516, "auxiliary_loss_mlp": 0.0256735, "balance_loss_clip": 1.04685593, "balance_loss_mlp": 0.99988729, "epoch": 0.9853904887873505, "flos": 20883599314560.0, "grad_norm": 1.8735699581959333, "language_loss": 0.71602541, "learning_rate": 2.2206170371081854e-09, "loss": 0.75397408, "num_input_tokens_seen": 176894980, "step": 8195, "time_per_iteration": 2.6481716632843018 }, { "auxiliary_loss_clip": 0.01277961, "auxiliary_loss_mlp": 0.01028542, "balance_loss_clip": 1.04489827, "balance_loss_mlp": 1.02153552, "epoch": 0.9855107316779895, "flos": 25263444188160.0, "grad_norm": 1.6996012714802455, "language_loss": 0.84916824, "learning_rate": 2.1840709486842247e-09, "loss": 0.87223327, "num_input_tokens_seen": 176914600, "step": 8196, "time_per_iteration": 2.683431386947632 }, { "auxiliary_loss_clip": 0.01266676, "auxiliary_loss_mlp": 0.0102661, "balance_loss_clip": 1.04505968, "balance_loss_mlp": 1.0197556, "epoch": 0.9856309745686286, "flos": 19062102677760.0, "grad_norm": 2.3295398535460765, "language_loss": 0.79216373, "learning_rate": 2.1478279273481335e-09, "loss": 0.81509662, "num_input_tokens_seen": 176933085, "step": 8197, "time_per_iteration": 2.6710562705993652 }, { "auxiliary_loss_clip": 0.01219722, "auxiliary_loss_mlp": 0.01026353, "balance_loss_clip": 1.04716134, "balance_loss_mlp": 1.01950455, "epoch": 0.9857512174592677, "flos": 34130758060800.0, "grad_norm": 2.413742378614482, "language_loss": 0.804075, "learning_rate": 2.1118879785981815e-09, "loss": 0.82653582, "num_input_tokens_seen": 176953225, "step": 8198, "time_per_iteration": 3.593425989151001 }, { "auxiliary_loss_clip": 0.01269376, "auxiliary_loss_mlp": 0.01021922, "balance_loss_clip": 1.04597378, "balance_loss_mlp": 1.01586056, "epoch": 0.9858714603499068, "flos": 25994693266560.0, "grad_norm": 1.7278513234268609, "language_loss": 0.79426163, "learning_rate": 2.0762511078862288e-09, "loss": 0.81717467, "num_input_tokens_seen": 176973570, "step": 8199, "time_per_iteration": 2.7478859424591064 }, { "auxiliary_loss_clip": 0.01177046, "auxiliary_loss_mlp": 0.01024084, "balance_loss_clip": 1.04583216, "balance_loss_mlp": 1.01760209, "epoch": 0.9859917032405459, "flos": 23696230907520.0, "grad_norm": 1.8937516543248345, "language_loss": 0.65087193, "learning_rate": 2.0409173206186183e-09, "loss": 0.67288327, "num_input_tokens_seen": 176992810, "step": 8200, "time_per_iteration": 2.682705879211426 }, { "auxiliary_loss_clip": 0.01316087, "auxiliary_loss_mlp": 0.01024378, "balance_loss_clip": 1.04459035, "balance_loss_mlp": 1.01791143, "epoch": 0.986111946131185, "flos": 19938287134080.0, "grad_norm": 1.9343675961697597, "language_loss": 0.87153578, "learning_rate": 2.0058866221550617e-09, "loss": 0.89494038, "num_input_tokens_seen": 177011050, "step": 8201, "time_per_iteration": 2.669773817062378 }, { "auxiliary_loss_clip": 0.01173268, "auxiliary_loss_mlp": 0.0102707, "balance_loss_clip": 1.04786348, "balance_loss_mlp": 1.02034378, "epoch": 0.9862321890218241, "flos": 19828831415040.0, "grad_norm": 2.0397143325119758, "language_loss": 0.74986255, "learning_rate": 1.971159017809976e-09, "loss": 0.7718659, "num_input_tokens_seen": 177029340, "step": 8202, "time_per_iteration": 2.594998598098755 }, { "auxiliary_loss_clip": 0.01224286, "auxiliary_loss_mlp": 0.01027091, "balance_loss_clip": 1.05027461, "balance_loss_mlp": 1.02037311, "epoch": 0.9863524319124631, "flos": 21652051904640.0, "grad_norm": 2.3700589870879685, "language_loss": 0.77780116, "learning_rate": 1.93673451285159e-09, "loss": 0.8003149, "num_input_tokens_seen": 177048390, "step": 8203, "time_per_iteration": 2.5950088500976562 }, { "auxiliary_loss_clip": 0.01174185, "auxiliary_loss_mlp": 0.0100122, "balance_loss_clip": 1.00547957, "balance_loss_mlp": 1.00028384, "epoch": 0.9864726748031023, "flos": 52769977920000.0, "grad_norm": 0.7312513764108556, "language_loss": 0.56499016, "learning_rate": 1.9026131125019495e-09, "loss": 0.58674431, "num_input_tokens_seen": 177105760, "step": 8204, "time_per_iteration": 3.16135311126709 }, { "auxiliary_loss_clip": 0.01217334, "auxiliary_loss_mlp": 0.01023858, "balance_loss_clip": 1.04685938, "balance_loss_mlp": 1.01694369, "epoch": 0.9865929176937414, "flos": 23364631526400.0, "grad_norm": 1.7939022631613357, "language_loss": 0.87034291, "learning_rate": 1.8687948219371363e-09, "loss": 0.89275479, "num_input_tokens_seen": 177124985, "step": 8205, "time_per_iteration": 2.60935115814209 }, { "auxiliary_loss_clip": 0.01175308, "auxiliary_loss_mlp": 0.01023182, "balance_loss_clip": 1.04719067, "balance_loss_mlp": 1.015764, "epoch": 0.9867131605843804, "flos": 21616679986560.0, "grad_norm": 2.4334043267458174, "language_loss": 0.88678014, "learning_rate": 1.835279646287491e-09, "loss": 0.90876502, "num_input_tokens_seen": 177142995, "step": 8206, "time_per_iteration": 2.7042205333709717 }, { "auxiliary_loss_clip": 0.01230518, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.05004013, "balance_loss_mlp": 1.02509594, "epoch": 0.9868334034750196, "flos": 22271403139200.0, "grad_norm": 1.9427171900058084, "language_loss": 0.76774049, "learning_rate": 1.8020675906371685e-09, "loss": 0.79037476, "num_input_tokens_seen": 177162390, "step": 8207, "time_per_iteration": 2.5905849933624268 }, { "auxiliary_loss_clip": 0.01366996, "auxiliary_loss_mlp": 0.010222, "balance_loss_clip": 1.04358029, "balance_loss_mlp": 1.01529825, "epoch": 0.9869536463656586, "flos": 25809573548160.0, "grad_norm": 2.313334673613612, "language_loss": 0.75281304, "learning_rate": 1.7691586600243612e-09, "loss": 0.77670503, "num_input_tokens_seen": 177181290, "step": 8208, "time_per_iteration": 2.7866811752319336 }, { "auxiliary_loss_clip": 0.01273573, "auxiliary_loss_mlp": 0.0102301, "balance_loss_clip": 1.04987645, "balance_loss_mlp": 1.01612866, "epoch": 0.9870738892562977, "flos": 16398500613120.0, "grad_norm": 2.7345257778663368, "language_loss": 0.86862069, "learning_rate": 1.7365528594415202e-09, "loss": 0.89158654, "num_input_tokens_seen": 177195360, "step": 8209, "time_per_iteration": 2.604391098022461 }, { "auxiliary_loss_clip": 0.01227697, "auxiliary_loss_mlp": 0.02566965, "balance_loss_clip": 1.04884052, "balance_loss_mlp": 0.9998911, "epoch": 0.9871941321469369, "flos": 35481358373760.0, "grad_norm": 2.9401786672086727, "language_loss": 0.67725205, "learning_rate": 1.7042501938346888e-09, "loss": 0.71519864, "num_input_tokens_seen": 177218090, "step": 8210, "time_per_iteration": 2.7910826206207275 }, { "auxiliary_loss_clip": 0.01263259, "auxiliary_loss_mlp": 0.01021993, "balance_loss_clip": 1.03987753, "balance_loss_mlp": 1.01529956, "epoch": 0.9873143750375759, "flos": 21434217874560.0, "grad_norm": 1.8956955233090456, "language_loss": 0.76597416, "learning_rate": 1.6722506681043913e-09, "loss": 0.7888267, "num_input_tokens_seen": 177237050, "step": 8211, "time_per_iteration": 2.629702091217041 }, { "auxiliary_loss_clip": 0.0127307, "auxiliary_loss_mlp": 0.01026078, "balance_loss_clip": 1.04553747, "balance_loss_mlp": 1.01866317, "epoch": 0.987434617928215, "flos": 16326499800960.0, "grad_norm": 2.1066789811790425, "language_loss": 0.69506222, "learning_rate": 1.640554287104745e-09, "loss": 0.7180537, "num_input_tokens_seen": 177255325, "step": 8212, "time_per_iteration": 2.680727005004883 }, { "auxiliary_loss_clip": 0.01324759, "auxiliary_loss_mlp": 0.01025293, "balance_loss_clip": 1.04134464, "balance_loss_mlp": 1.01781297, "epoch": 0.9875548608188541, "flos": 17851984456320.0, "grad_norm": 4.076527286907154, "language_loss": 0.80271655, "learning_rate": 1.609161055644348e-09, "loss": 0.82621706, "num_input_tokens_seen": 177271250, "step": 8213, "time_per_iteration": 2.649904251098633 }, { "auxiliary_loss_clip": 0.01227216, "auxiliary_loss_mlp": 0.0102863, "balance_loss_clip": 1.04597056, "balance_loss_mlp": 1.02095246, "epoch": 0.9876751037094932, "flos": 26132876887680.0, "grad_norm": 2.507946621713477, "language_loss": 0.68489987, "learning_rate": 1.5780709784849467e-09, "loss": 0.70745826, "num_input_tokens_seen": 177288270, "step": 8214, "time_per_iteration": 3.5112483501434326 }, { "auxiliary_loss_clip": 0.01420342, "auxiliary_loss_mlp": 0.01028844, "balance_loss_clip": 1.04600859, "balance_loss_mlp": 1.02177227, "epoch": 0.9877953466001322, "flos": 15991344973440.0, "grad_norm": 1.89778637619767, "language_loss": 0.82993495, "learning_rate": 1.5472840603436565e-09, "loss": 0.85442686, "num_input_tokens_seen": 177305500, "step": 8215, "time_per_iteration": 2.740823268890381 }, { "auxiliary_loss_clip": 0.01171054, "auxiliary_loss_mlp": 0.01024151, "balance_loss_clip": 1.04617429, "balance_loss_mlp": 1.01728106, "epoch": 0.9879155894907714, "flos": 18806777827200.0, "grad_norm": 2.398418260037538, "language_loss": 0.78281039, "learning_rate": 1.5168003058900757e-09, "loss": 0.80476242, "num_input_tokens_seen": 177323500, "step": 8216, "time_per_iteration": 3.566293239593506 }, { "auxiliary_loss_clip": 0.01317684, "auxiliary_loss_mlp": 0.0102703, "balance_loss_clip": 1.04302907, "balance_loss_mlp": 1.02063465, "epoch": 0.9880358323814105, "flos": 22382044007040.0, "grad_norm": 2.100627483710409, "language_loss": 0.92048597, "learning_rate": 1.4866197197491715e-09, "loss": 0.94393313, "num_input_tokens_seen": 177342860, "step": 8217, "time_per_iteration": 2.7280917167663574 }, { "auxiliary_loss_clip": 0.01226903, "auxiliary_loss_mlp": 0.02570217, "balance_loss_clip": 1.04804778, "balance_loss_mlp": 0.99991781, "epoch": 0.9881560752720495, "flos": 15668831733120.0, "grad_norm": 3.985760403283924, "language_loss": 0.78854191, "learning_rate": 1.4567423064988371e-09, "loss": 0.82651317, "num_input_tokens_seen": 177360210, "step": 8218, "time_per_iteration": 2.623809576034546 }, { "auxiliary_loss_clip": 0.01174633, "auxiliary_loss_mlp": 0.0102267, "balance_loss_clip": 1.04905128, "balance_loss_mlp": 1.01551163, "epoch": 0.9882763181626887, "flos": 21500113374720.0, "grad_norm": 2.7320185722078776, "language_loss": 0.78169346, "learning_rate": 1.4271680706718913e-09, "loss": 0.80366647, "num_input_tokens_seen": 177377885, "step": 8219, "time_per_iteration": 3.4504363536834717 }, { "auxiliary_loss_clip": 0.01224163, "auxiliary_loss_mlp": 0.01025758, "balance_loss_clip": 1.04962265, "balance_loss_mlp": 1.018471, "epoch": 0.9883965610533277, "flos": 28034598551040.0, "grad_norm": 1.7771574709593578, "language_loss": 0.82679844, "learning_rate": 1.3978970167543013e-09, "loss": 0.84929764, "num_input_tokens_seen": 177398065, "step": 8220, "time_per_iteration": 2.660292148590088 }, { "auxiliary_loss_clip": 0.01265053, "auxiliary_loss_mlp": 0.01025771, "balance_loss_clip": 1.04522204, "balance_loss_mlp": 1.01868093, "epoch": 0.9885168039439668, "flos": 14098601710080.0, "grad_norm": 2.515813970298919, "language_loss": 0.77623558, "learning_rate": 1.3689291491867372e-09, "loss": 0.79914379, "num_input_tokens_seen": 177416380, "step": 8221, "time_per_iteration": 2.6117310523986816 }, { "auxiliary_loss_clip": 0.01174805, "auxiliary_loss_mlp": 0.01026218, "balance_loss_clip": 1.04826903, "balance_loss_mlp": 1.01888645, "epoch": 0.988637046834606, "flos": 26432013352320.0, "grad_norm": 2.111116640557565, "language_loss": 0.73672062, "learning_rate": 1.3402644723636836e-09, "loss": 0.75873089, "num_input_tokens_seen": 177438410, "step": 8222, "time_per_iteration": 2.669499397277832 }, { "auxiliary_loss_clip": 0.01266656, "auxiliary_loss_mlp": 0.01032064, "balance_loss_clip": 1.04659224, "balance_loss_mlp": 1.02527499, "epoch": 0.988757289725245, "flos": 25229113764480.0, "grad_norm": 3.4725819723061995, "language_loss": 0.83869159, "learning_rate": 1.311902990633218e-09, "loss": 0.86167878, "num_input_tokens_seen": 177457375, "step": 8223, "time_per_iteration": 2.657245635986328 }, { "auxiliary_loss_clip": 0.01266104, "auxiliary_loss_mlp": 0.01021835, "balance_loss_clip": 1.04010022, "balance_loss_mlp": 1.01559734, "epoch": 0.9888775326158841, "flos": 26359042872960.0, "grad_norm": 1.732375790744388, "language_loss": 0.71430099, "learning_rate": 1.2838447082978987e-09, "loss": 0.73718035, "num_input_tokens_seen": 177478530, "step": 8224, "time_per_iteration": 3.694157600402832 }, { "auxiliary_loss_clip": 0.01220423, "auxiliary_loss_mlp": 0.01026045, "balance_loss_clip": 1.04554367, "balance_loss_mlp": 1.0192287, "epoch": 0.9889977755065231, "flos": 24316120846080.0, "grad_norm": 2.3694209103489343, "language_loss": 0.83081496, "learning_rate": 1.2560896296143208e-09, "loss": 0.85327959, "num_input_tokens_seen": 177496995, "step": 8225, "time_per_iteration": 2.6197404861450195 }, { "auxiliary_loss_clip": 0.01173284, "auxiliary_loss_mlp": 0.01029892, "balance_loss_clip": 1.04891682, "balance_loss_mlp": 1.02282929, "epoch": 0.9891180183971623, "flos": 18951066760320.0, "grad_norm": 2.6583911917708973, "language_loss": 0.82342261, "learning_rate": 1.2286377587926722e-09, "loss": 0.84545445, "num_input_tokens_seen": 177513785, "step": 8226, "time_per_iteration": 2.5583908557891846 }, { "auxiliary_loss_clip": 0.01173521, "auxiliary_loss_mlp": 0.01027688, "balance_loss_clip": 1.04880452, "balance_loss_mlp": 1.02108645, "epoch": 0.9892382612878013, "flos": 26176580760960.0, "grad_norm": 1.9897324795743438, "language_loss": 0.74661732, "learning_rate": 1.2014890999973992e-09, "loss": 0.76862943, "num_input_tokens_seen": 177530705, "step": 8227, "time_per_iteration": 2.6416029930114746 }, { "auxiliary_loss_clip": 0.01170308, "auxiliary_loss_mlp": 0.010229, "balance_loss_clip": 1.04706407, "balance_loss_mlp": 1.01615596, "epoch": 0.9893585041784404, "flos": 25449605400960.0, "grad_norm": 1.7743212575479907, "language_loss": 0.78537828, "learning_rate": 1.1746436573472073e-09, "loss": 0.80731034, "num_input_tokens_seen": 177552440, "step": 8228, "time_per_iteration": 2.5901246070861816 }, { "auxiliary_loss_clip": 0.01280113, "auxiliary_loss_mlp": 0.01028452, "balance_loss_clip": 1.04609549, "balance_loss_mlp": 1.02144885, "epoch": 0.9894787470690796, "flos": 20189302352640.0, "grad_norm": 1.8575825202908962, "language_loss": 0.69300544, "learning_rate": 1.1481014349141726e-09, "loss": 0.71609104, "num_input_tokens_seen": 177569660, "step": 8229, "time_per_iteration": 2.67826247215271 }, { "auxiliary_loss_clip": 0.01273137, "auxiliary_loss_mlp": 0.01028946, "balance_loss_clip": 1.04795599, "balance_loss_mlp": 1.02191854, "epoch": 0.9895989899597186, "flos": 24644308435200.0, "grad_norm": 1.8811195682849324, "language_loss": 0.84608221, "learning_rate": 1.121862436724852e-09, "loss": 0.86910307, "num_input_tokens_seen": 177588500, "step": 8230, "time_per_iteration": 2.638256311416626 }, { "auxiliary_loss_clip": 0.01224311, "auxiliary_loss_mlp": 0.01029081, "balance_loss_clip": 1.04841447, "balance_loss_mlp": 1.02241397, "epoch": 0.9897192328503577, "flos": 21799034357760.0, "grad_norm": 1.885545091762791, "language_loss": 0.70507789, "learning_rate": 1.0959266667598388e-09, "loss": 0.7276119, "num_input_tokens_seen": 177607315, "step": 8231, "time_per_iteration": 2.635660409927368 }, { "auxiliary_loss_clip": 0.01322394, "auxiliary_loss_mlp": 0.01031942, "balance_loss_clip": 1.04683506, "balance_loss_mlp": 1.02425313, "epoch": 0.9898394757409968, "flos": 21325229032320.0, "grad_norm": 1.972129997185183, "language_loss": 0.74974751, "learning_rate": 1.0702941289533196e-09, "loss": 0.77329087, "num_input_tokens_seen": 177625990, "step": 8232, "time_per_iteration": 2.6900534629821777 }, { "auxiliary_loss_clip": 0.01316678, "auxiliary_loss_mlp": 0.01021132, "balance_loss_clip": 1.04497766, "balance_loss_mlp": 1.01478076, "epoch": 0.9899597186316359, "flos": 18545024442240.0, "grad_norm": 2.2036394324297115, "language_loss": 0.89015859, "learning_rate": 1.0449648271939615e-09, "loss": 0.91353673, "num_input_tokens_seen": 177642335, "step": 8233, "time_per_iteration": 2.6865315437316895 }, { "auxiliary_loss_clip": 0.0137753, "auxiliary_loss_mlp": 0.02565647, "balance_loss_clip": 1.04720604, "balance_loss_mlp": 0.9998979, "epoch": 0.990079961522275, "flos": 23766723348480.0, "grad_norm": 1.6030436833891195, "language_loss": 0.72988456, "learning_rate": 1.0199387653240243e-09, "loss": 0.76931632, "num_input_tokens_seen": 177662025, "step": 8234, "time_per_iteration": 2.7271647453308105 }, { "auxiliary_loss_clip": 0.01264605, "auxiliary_loss_mlp": 0.01026316, "balance_loss_clip": 1.04368567, "balance_loss_mlp": 1.01963782, "epoch": 0.9902002044129141, "flos": 16399182971520.0, "grad_norm": 1.6319127973955871, "language_loss": 0.70712698, "learning_rate": 9.952159471400267e-10, "loss": 0.7300362, "num_input_tokens_seen": 177679065, "step": 8235, "time_per_iteration": 2.6614911556243896 }, { "auxiliary_loss_clip": 0.01122228, "auxiliary_loss_mlp": 0.02563062, "balance_loss_clip": 1.04588819, "balance_loss_mlp": 0.99990261, "epoch": 0.9903204473035532, "flos": 22559657783040.0, "grad_norm": 1.9563788606392252, "language_loss": 0.84666598, "learning_rate": 9.707963763923022e-10, "loss": 0.88351887, "num_input_tokens_seen": 177698115, "step": 8236, "time_per_iteration": 2.596771240234375 }, { "auxiliary_loss_clip": 0.01271789, "auxiliary_loss_mlp": 0.01025963, "balance_loss_clip": 1.04394782, "balance_loss_mlp": 1.01915932, "epoch": 0.9904406901941922, "flos": 16144001775360.0, "grad_norm": 2.0200909166865495, "language_loss": 0.79420328, "learning_rate": 9.466800567854427e-10, "loss": 0.81718081, "num_input_tokens_seen": 177716715, "step": 8237, "time_per_iteration": 2.6371898651123047 }, { "auxiliary_loss_clip": 0.01224468, "auxiliary_loss_mlp": 0.01026196, "balance_loss_clip": 1.04247534, "balance_loss_mlp": 1.01833367, "epoch": 0.9905609330848314, "flos": 26651499408000.0, "grad_norm": 2.2746768825630195, "language_loss": 0.68423557, "learning_rate": 9.228669919778553e-10, "loss": 0.70674223, "num_input_tokens_seen": 177735640, "step": 8238, "time_per_iteration": 2.7217857837677 }, { "auxiliary_loss_clip": 0.01171497, "auxiliary_loss_mlp": 0.0102755, "balance_loss_clip": 1.0458076, "balance_loss_mlp": 1.02057338, "epoch": 0.9906811759754705, "flos": 23111820627840.0, "grad_norm": 2.3064991145048483, "language_loss": 0.79578, "learning_rate": 8.993571855817617e-10, "loss": 0.81777048, "num_input_tokens_seen": 177754470, "step": 8239, "time_per_iteration": 2.6792516708374023 }, { "auxiliary_loss_clip": 0.01218331, "auxiliary_loss_mlp": 0.01025219, "balance_loss_clip": 1.04624391, "balance_loss_mlp": 1.01821232, "epoch": 0.9908014188661095, "flos": 22090593052800.0, "grad_norm": 1.7849503901281862, "language_loss": 0.74983919, "learning_rate": 8.761506411638642e-10, "loss": 0.77227473, "num_input_tokens_seen": 177773935, "step": 8240, "time_per_iteration": 3.5605368614196777 }, { "auxiliary_loss_clip": 0.01277972, "auxiliary_loss_mlp": 0.01029418, "balance_loss_clip": 1.05011463, "balance_loss_mlp": 1.02218246, "epoch": 0.9909216617567487, "flos": 19242948677760.0, "grad_norm": 1.737419819130022, "language_loss": 0.73721492, "learning_rate": 8.53247362244236e-10, "loss": 0.76028883, "num_input_tokens_seen": 177792745, "step": 8241, "time_per_iteration": 2.6367921829223633 }, { "auxiliary_loss_clip": 0.01271682, "auxiliary_loss_mlp": 0.01027354, "balance_loss_clip": 1.04777193, "balance_loss_mlp": 1.02069008, "epoch": 0.9910419046473877, "flos": 23621213352960.0, "grad_norm": 1.5718640704487163, "language_loss": 0.68035972, "learning_rate": 8.306473522976532e-10, "loss": 0.70335007, "num_input_tokens_seen": 177812150, "step": 8242, "time_per_iteration": 3.6063671112060547 }, { "auxiliary_loss_clip": 0.01173365, "auxiliary_loss_mlp": 0.01026641, "balance_loss_clip": 1.0494498, "balance_loss_mlp": 1.02027822, "epoch": 0.9911621475380268, "flos": 22711380831360.0, "grad_norm": 1.9368527430780065, "language_loss": 0.71724153, "learning_rate": 8.083506147522623e-10, "loss": 0.7392416, "num_input_tokens_seen": 177831545, "step": 8243, "time_per_iteration": 2.5927305221557617 }, { "auxiliary_loss_clip": 0.01217082, "auxiliary_loss_mlp": 0.01027708, "balance_loss_clip": 1.04446566, "balance_loss_mlp": 1.02064514, "epoch": 0.991282390428666, "flos": 13516956777600.0, "grad_norm": 2.4056212426080728, "language_loss": 0.85026175, "learning_rate": 7.863571529906909e-10, "loss": 0.87270963, "num_input_tokens_seen": 177847130, "step": 8244, "time_per_iteration": 2.61842679977417 }, { "auxiliary_loss_clip": 0.01115219, "auxiliary_loss_mlp": 0.00999409, "balance_loss_clip": 1.0056721, "balance_loss_mlp": 0.99857444, "epoch": 0.991402633319305, "flos": 61830492071040.0, "grad_norm": 0.724793159400049, "language_loss": 0.59664649, "learning_rate": 7.646669703489372e-10, "loss": 0.61779279, "num_input_tokens_seen": 177911440, "step": 8245, "time_per_iteration": 4.145562410354614 }, { "auxiliary_loss_clip": 0.01575579, "auxiliary_loss_mlp": 0.010274, "balance_loss_clip": 1.03677368, "balance_loss_mlp": 1.02053654, "epoch": 0.9915228762099441, "flos": 18770148933120.0, "grad_norm": 1.853079492858798, "language_loss": 0.57296336, "learning_rate": 7.432800701177023e-10, "loss": 0.59899318, "num_input_tokens_seen": 177929440, "step": 8246, "time_per_iteration": 2.97361421585083 }, { "auxiliary_loss_clip": 0.01168451, "auxiliary_loss_mlp": 0.00997664, "balance_loss_clip": 1.00769687, "balance_loss_mlp": 0.99684763, "epoch": 0.9916431191005832, "flos": 65936660244480.0, "grad_norm": 0.7930911764136114, "language_loss": 0.57730973, "learning_rate": 7.221964555415017e-10, "loss": 0.59897089, "num_input_tokens_seen": 177989100, "step": 8247, "time_per_iteration": 3.352449417114258 }, { "auxiliary_loss_clip": 0.0126747, "auxiliary_loss_mlp": 0.01022459, "balance_loss_clip": 1.04517436, "balance_loss_mlp": 1.01605403, "epoch": 0.9917633619912223, "flos": 16581573256320.0, "grad_norm": 2.1213030856450126, "language_loss": 0.75026691, "learning_rate": 7.01416129818222e-10, "loss": 0.77316618, "num_input_tokens_seen": 178006720, "step": 8248, "time_per_iteration": 2.6481921672821045 }, { "auxiliary_loss_clip": 0.01330198, "auxiliary_loss_mlp": 0.01028322, "balance_loss_clip": 1.04645324, "balance_loss_mlp": 1.02140498, "epoch": 0.9918836048818613, "flos": 25411108999680.0, "grad_norm": 1.839589443311687, "language_loss": 0.58645129, "learning_rate": 6.809390961006745e-10, "loss": 0.61003649, "num_input_tokens_seen": 178026850, "step": 8249, "time_per_iteration": 2.9813663959503174 }, { "auxiliary_loss_clip": 0.01272382, "auxiliary_loss_mlp": 0.01027807, "balance_loss_clip": 1.04713845, "balance_loss_mlp": 1.02115822, "epoch": 0.9920038477725005, "flos": 25046867134080.0, "grad_norm": 2.5427472222099223, "language_loss": 0.68750244, "learning_rate": 6.607653574948191e-10, "loss": 0.71050429, "num_input_tokens_seen": 178047630, "step": 8250, "time_per_iteration": 3.798618793487549 }, { "auxiliary_loss_clip": 0.0121537, "auxiliary_loss_mlp": 0.01028206, "balance_loss_clip": 1.04439056, "balance_loss_mlp": 1.02193546, "epoch": 0.9921240906631396, "flos": 21829773421440.0, "grad_norm": 5.274102653192486, "language_loss": 0.81994361, "learning_rate": 6.408949170613187e-10, "loss": 0.84237933, "num_input_tokens_seen": 178066895, "step": 8251, "time_per_iteration": 2.674574613571167 }, { "auxiliary_loss_clip": 0.01271978, "auxiliary_loss_mlp": 0.01028859, "balance_loss_clip": 1.04492271, "balance_loss_mlp": 1.02168298, "epoch": 0.9922443335537786, "flos": 24864225454080.0, "grad_norm": 1.6100947657901903, "language_loss": 0.8188374, "learning_rate": 6.213277778144288e-10, "loss": 0.84184575, "num_input_tokens_seen": 178088540, "step": 8252, "time_per_iteration": 2.693537950515747 }, { "auxiliary_loss_clip": 0.01320276, "auxiliary_loss_mlp": 0.01030383, "balance_loss_clip": 1.04041672, "balance_loss_mlp": 1.02333474, "epoch": 0.9923645764444178, "flos": 21613088626560.0, "grad_norm": 2.3849497898534944, "language_loss": 0.67109972, "learning_rate": 6.020639427224416e-10, "loss": 0.6946063, "num_input_tokens_seen": 178106185, "step": 8253, "time_per_iteration": 2.8271055221557617 }, { "auxiliary_loss_clip": 0.01273498, "auxiliary_loss_mlp": 0.01026123, "balance_loss_clip": 1.04859829, "balance_loss_mlp": 1.01957583, "epoch": 0.9924848193350568, "flos": 25001798544000.0, "grad_norm": 2.1366953501704193, "language_loss": 0.72762918, "learning_rate": 5.831034147076864e-10, "loss": 0.75062543, "num_input_tokens_seen": 178123435, "step": 8254, "time_per_iteration": 2.6746108531951904 }, { "auxiliary_loss_clip": 0.01108994, "auxiliary_loss_mlp": 0.01001465, "balance_loss_clip": 1.00664258, "balance_loss_mlp": 1.00064218, "epoch": 0.9926050622256959, "flos": 68912543151360.0, "grad_norm": 0.6874464100316846, "language_loss": 0.5567174, "learning_rate": 5.644461966463065e-10, "loss": 0.57782197, "num_input_tokens_seen": 178191045, "step": 8255, "time_per_iteration": 3.318091869354248 }, { "auxiliary_loss_clip": 0.01268741, "auxiliary_loss_mlp": 0.010272, "balance_loss_clip": 1.04662466, "balance_loss_mlp": 1.02065253, "epoch": 0.9927253051163349, "flos": 20923675914240.0, "grad_norm": 2.0580963218512225, "language_loss": 0.75998747, "learning_rate": 5.460922913687049e-10, "loss": 0.78294694, "num_input_tokens_seen": 178210135, "step": 8256, "time_per_iteration": 2.647670030593872 }, { "auxiliary_loss_clip": 0.01368802, "auxiliary_loss_mlp": 0.02569728, "balance_loss_clip": 1.03998804, "balance_loss_mlp": 0.99993229, "epoch": 0.9928455480069741, "flos": 22308211601280.0, "grad_norm": 2.107610087229513, "language_loss": 0.75877893, "learning_rate": 5.280417016593208e-10, "loss": 0.79816425, "num_input_tokens_seen": 178229925, "step": 8257, "time_per_iteration": 2.6987483501434326 }, { "auxiliary_loss_clip": 0.01220066, "auxiliary_loss_mlp": 0.02565173, "balance_loss_clip": 1.04970539, "balance_loss_mlp": 0.99989796, "epoch": 0.9929657908976132, "flos": 17383889393280.0, "grad_norm": 1.7952312184226038, "language_loss": 0.74766886, "learning_rate": 5.102944302559642e-10, "loss": 0.78552127, "num_input_tokens_seen": 178247420, "step": 8258, "time_per_iteration": 2.6118004322052 }, { "auxiliary_loss_clip": 0.01432986, "auxiliary_loss_mlp": 0.01028194, "balance_loss_clip": 1.04680967, "balance_loss_mlp": 1.02079153, "epoch": 0.9930860337882522, "flos": 22674680110080.0, "grad_norm": 6.961249800784524, "language_loss": 0.79535997, "learning_rate": 4.9285047985137e-10, "loss": 0.81997174, "num_input_tokens_seen": 178266840, "step": 8259, "time_per_iteration": 2.7769243717193604 }, { "auxiliary_loss_clip": 0.01225458, "auxiliary_loss_mlp": 0.01027094, "balance_loss_clip": 1.04714823, "balance_loss_mlp": 1.02017951, "epoch": 0.9932062766788914, "flos": 28147789284480.0, "grad_norm": 1.8475480361653513, "language_loss": 0.74706525, "learning_rate": 4.757098530916436e-10, "loss": 0.76959074, "num_input_tokens_seen": 178287285, "step": 8260, "time_per_iteration": 2.7071943283081055 }, { "auxiliary_loss_clip": 0.01222568, "auxiliary_loss_mlp": 0.01024379, "balance_loss_clip": 1.048684, "balance_loss_mlp": 1.0169847, "epoch": 0.9933265195695304, "flos": 20156659868160.0, "grad_norm": 2.4990950358612647, "language_loss": 0.77614993, "learning_rate": 4.5887255257670563e-10, "loss": 0.79861939, "num_input_tokens_seen": 178304325, "step": 8261, "time_per_iteration": 2.586324453353882 }, { "auxiliary_loss_clip": 0.01171778, "auxiliary_loss_mlp": 0.01029701, "balance_loss_clip": 1.04646397, "balance_loss_mlp": 1.02270365, "epoch": 0.9934467624601695, "flos": 21362037494400.0, "grad_norm": 2.087594577338146, "language_loss": 0.77043825, "learning_rate": 4.4233858086117906e-10, "loss": 0.79245305, "num_input_tokens_seen": 178322850, "step": 8262, "time_per_iteration": 2.575388193130493 }, { "auxiliary_loss_clip": 0.01364587, "auxiliary_loss_mlp": 0.01022373, "balance_loss_clip": 1.0470382, "balance_loss_mlp": 1.01560199, "epoch": 0.9935670053508087, "flos": 19756040503680.0, "grad_norm": 2.1979293126992623, "language_loss": 0.67533958, "learning_rate": 4.261079404528356e-10, "loss": 0.69920921, "num_input_tokens_seen": 178342330, "step": 8263, "time_per_iteration": 2.704423666000366 }, { "auxiliary_loss_clip": 0.01222743, "auxiliary_loss_mlp": 0.0102541, "balance_loss_clip": 1.0470705, "balance_loss_mlp": 1.01859152, "epoch": 0.9936872482414477, "flos": 21978838863360.0, "grad_norm": 1.7723709137403312, "language_loss": 0.69250828, "learning_rate": 4.1018063381437205e-10, "loss": 0.71498978, "num_input_tokens_seen": 178362715, "step": 8264, "time_per_iteration": 2.6404080390930176 }, { "auxiliary_loss_clip": 0.01106696, "auxiliary_loss_mlp": 0.01002037, "balance_loss_clip": 1.00616717, "balance_loss_mlp": 1.00122643, "epoch": 0.9938074911320868, "flos": 69810667839360.0, "grad_norm": 0.8664121817183589, "language_loss": 0.61051381, "learning_rate": 3.9455666336141167e-10, "loss": 0.6316011, "num_input_tokens_seen": 178426495, "step": 8265, "time_per_iteration": 4.189847230911255 }, { "auxiliary_loss_clip": 0.01171756, "auxiliary_loss_mlp": 0.01025891, "balance_loss_clip": 1.049824, "balance_loss_mlp": 1.01834512, "epoch": 0.9939277340227259, "flos": 15084170058240.0, "grad_norm": 2.5981942749163447, "language_loss": 0.83243465, "learning_rate": 3.7923603146450267e-10, "loss": 0.85441113, "num_input_tokens_seen": 178442555, "step": 8266, "time_per_iteration": 2.554323196411133 }, { "auxiliary_loss_clip": 0.0131973, "auxiliary_loss_mlp": 0.0102767, "balance_loss_clip": 1.04146838, "balance_loss_mlp": 1.02066612, "epoch": 0.994047976913365, "flos": 17712364291200.0, "grad_norm": 2.1075153648663907, "language_loss": 0.80755651, "learning_rate": 3.642187404473418e-10, "loss": 0.83103049, "num_input_tokens_seen": 178460715, "step": 8267, "time_per_iteration": 2.6651298999786377 }, { "auxiliary_loss_clip": 0.01224003, "auxiliary_loss_mlp": 0.01021822, "balance_loss_clip": 1.04696393, "balance_loss_mlp": 1.01557803, "epoch": 0.994168219804004, "flos": 19171558396800.0, "grad_norm": 2.202421420730303, "language_loss": 0.85887378, "learning_rate": 3.495047925885508e-10, "loss": 0.88133204, "num_input_tokens_seen": 178479050, "step": 8268, "time_per_iteration": 3.6232926845550537 }, { "auxiliary_loss_clip": 0.01272489, "auxiliary_loss_mlp": 0.01024424, "balance_loss_clip": 1.04426634, "balance_loss_mlp": 1.01785612, "epoch": 0.9942884626946432, "flos": 17851589406720.0, "grad_norm": 2.0403948585528457, "language_loss": 0.82862425, "learning_rate": 3.350941901199e-10, "loss": 0.85159338, "num_input_tokens_seen": 178495970, "step": 8269, "time_per_iteration": 2.6619069576263428 }, { "auxiliary_loss_clip": 0.01276345, "auxiliary_loss_mlp": 0.01025707, "balance_loss_clip": 1.04672098, "balance_loss_mlp": 1.018888, "epoch": 0.9944087055852823, "flos": 18796578364800.0, "grad_norm": 4.381460049079689, "language_loss": 0.83176243, "learning_rate": 3.2098693522764066e-10, "loss": 0.85478294, "num_input_tokens_seen": 178509170, "step": 8270, "time_per_iteration": 2.5806376934051514 }, { "auxiliary_loss_clip": 0.01278876, "auxiliary_loss_mlp": 0.0256693, "balance_loss_clip": 1.04664814, "balance_loss_mlp": 0.9999125, "epoch": 0.9945289484759213, "flos": 20996969616000.0, "grad_norm": 1.9862124639852408, "language_loss": 0.8114925, "learning_rate": 3.071830300516165e-10, "loss": 0.84995055, "num_input_tokens_seen": 178527000, "step": 8271, "time_per_iteration": 3.549713134765625 }, { "auxiliary_loss_clip": 0.01229657, "auxiliary_loss_mlp": 0.01024282, "balance_loss_clip": 1.0484159, "balance_loss_mlp": 1.01682544, "epoch": 0.9946491913665605, "flos": 14756952136320.0, "grad_norm": 1.9923505991223773, "language_loss": 0.70948619, "learning_rate": 2.9368247668615234e-10, "loss": 0.73202562, "num_input_tokens_seen": 178545590, "step": 8272, "time_per_iteration": 2.610046148300171 }, { "auxiliary_loss_clip": 0.0117876, "auxiliary_loss_mlp": 0.01028548, "balance_loss_clip": 1.05091119, "balance_loss_mlp": 1.02103758, "epoch": 0.9947694342571995, "flos": 12669931186560.0, "grad_norm": 2.8938081311334796, "language_loss": 0.61349654, "learning_rate": 2.804852771789434e-10, "loss": 0.63556957, "num_input_tokens_seen": 178558890, "step": 8273, "time_per_iteration": 2.525644302368164 }, { "auxiliary_loss_clip": 0.01172011, "auxiliary_loss_mlp": 0.01024565, "balance_loss_clip": 1.0489403, "balance_loss_mlp": 1.01820481, "epoch": 0.9948896771478386, "flos": 18843442634880.0, "grad_norm": 1.8177024596603115, "language_loss": 0.55789065, "learning_rate": 2.675914335321661e-10, "loss": 0.5798564, "num_input_tokens_seen": 178577645, "step": 8274, "time_per_iteration": 2.5766186714172363 }, { "auxiliary_loss_clip": 0.01226155, "auxiliary_loss_mlp": 0.01026836, "balance_loss_clip": 1.04766548, "balance_loss_mlp": 1.01988912, "epoch": 0.9950099200384778, "flos": 24900207903360.0, "grad_norm": 3.897416602262242, "language_loss": 0.79329693, "learning_rate": 2.550009477018111e-10, "loss": 0.81582689, "num_input_tokens_seen": 178596415, "step": 8275, "time_per_iteration": 2.6283960342407227 }, { "auxiliary_loss_clip": 0.01270895, "auxiliary_loss_mlp": 0.0256862, "balance_loss_clip": 1.04708946, "balance_loss_mlp": 0.99990827, "epoch": 0.9951301629291168, "flos": 23733613987200.0, "grad_norm": 2.5261573803526938, "language_loss": 0.62699085, "learning_rate": 2.4271382159790634e-10, "loss": 0.66538602, "num_input_tokens_seen": 178613845, "step": 8276, "time_per_iteration": 3.6178090572357178 }, { "auxiliary_loss_clip": 0.01331627, "auxiliary_loss_mlp": 0.01027785, "balance_loss_clip": 1.04379201, "balance_loss_mlp": 1.02112114, "epoch": 0.9952504058197559, "flos": 22236893147520.0, "grad_norm": 3.284350914487527, "language_loss": 0.86181271, "learning_rate": 2.3073005708429406e-10, "loss": 0.88540685, "num_input_tokens_seen": 178633490, "step": 8277, "time_per_iteration": 2.7749431133270264 }, { "auxiliary_loss_clip": 0.01317153, "auxiliary_loss_mlp": 0.01027849, "balance_loss_clip": 1.04683495, "balance_loss_mlp": 1.02118218, "epoch": 0.995370648710395, "flos": 21211032718080.0, "grad_norm": 1.7289501268489524, "language_loss": 0.71922004, "learning_rate": 2.190496559788535e-10, "loss": 0.74267006, "num_input_tokens_seen": 178651775, "step": 8278, "time_per_iteration": 2.6886322498321533 }, { "auxiliary_loss_clip": 0.01266364, "auxiliary_loss_mlp": 0.01029764, "balance_loss_clip": 1.04628491, "balance_loss_mlp": 1.02275133, "epoch": 0.9954908916010341, "flos": 14866731077760.0, "grad_norm": 2.5164284264135977, "language_loss": 0.76700217, "learning_rate": 2.0767262005372265e-10, "loss": 0.78996348, "num_input_tokens_seen": 178669290, "step": 8279, "time_per_iteration": 2.626100540161133 }, { "auxiliary_loss_clip": 0.01226965, "auxiliary_loss_mlp": 0.0102771, "balance_loss_clip": 1.04328895, "balance_loss_mlp": 1.02073669, "epoch": 0.9956111344916732, "flos": 19208259118080.0, "grad_norm": 1.835738529140405, "language_loss": 0.75421786, "learning_rate": 1.965989510346322e-10, "loss": 0.77676463, "num_input_tokens_seen": 178688410, "step": 8280, "time_per_iteration": 2.706963062286377 }, { "auxiliary_loss_clip": 0.01366565, "auxiliary_loss_mlp": 0.01021671, "balance_loss_clip": 1.04222035, "balance_loss_mlp": 1.01467967, "epoch": 0.9957313773823123, "flos": 20047060494720.0, "grad_norm": 3.07666879868278, "language_loss": 0.71233273, "learning_rate": 1.8582865060134955e-10, "loss": 0.73621505, "num_input_tokens_seen": 178706600, "step": 8281, "time_per_iteration": 2.7134578227996826 }, { "auxiliary_loss_clip": 0.01059461, "auxiliary_loss_mlp": 0.01000082, "balance_loss_clip": 1.00554872, "balance_loss_mlp": 0.99922991, "epoch": 0.9958516202729514, "flos": 57483253768320.0, "grad_norm": 0.7837749649795689, "language_loss": 0.55701721, "learning_rate": 1.7536172038790098e-10, "loss": 0.57761264, "num_input_tokens_seen": 178766910, "step": 8282, "time_per_iteration": 3.2197062969207764 }, { "auxiliary_loss_clip": 0.01270615, "auxiliary_loss_mlp": 0.01026035, "balance_loss_clip": 1.04886556, "balance_loss_mlp": 1.01929092, "epoch": 0.9959718631635904, "flos": 27782900974080.0, "grad_norm": 2.555805455767541, "language_loss": 0.69221461, "learning_rate": 1.651981619819054e-10, "loss": 0.71518111, "num_input_tokens_seen": 178784060, "step": 8283, "time_per_iteration": 2.7209367752075195 }, { "auxiliary_loss_clip": 0.01371646, "auxiliary_loss_mlp": 0.0102467, "balance_loss_clip": 1.04457593, "balance_loss_mlp": 1.01774073, "epoch": 0.9960921060542296, "flos": 24024095274240.0, "grad_norm": 2.7460937630509363, "language_loss": 0.71049869, "learning_rate": 1.5533797692546257e-10, "loss": 0.73446178, "num_input_tokens_seen": 178802795, "step": 8284, "time_per_iteration": 2.7168235778808594 }, { "auxiliary_loss_clip": 0.01221276, "auxiliary_loss_mlp": 0.01022514, "balance_loss_clip": 1.04583442, "balance_loss_mlp": 1.01430297, "epoch": 0.9962123489448687, "flos": 18697393935360.0, "grad_norm": 2.13330092953392, "language_loss": 0.84591275, "learning_rate": 1.4578116671404296e-10, "loss": 0.86835068, "num_input_tokens_seen": 178821075, "step": 8285, "time_per_iteration": 2.6027705669403076 }, { "auxiliary_loss_clip": 0.01218959, "auxiliary_loss_mlp": 0.01025328, "balance_loss_clip": 1.04902744, "balance_loss_mlp": 1.01898956, "epoch": 0.9963325918355077, "flos": 20010754823040.0, "grad_norm": 2.0591794784701762, "language_loss": 0.71305752, "learning_rate": 1.3652773279759777e-10, "loss": 0.73550045, "num_input_tokens_seen": 178837725, "step": 8286, "time_per_iteration": 2.590189218521118 }, { "auxiliary_loss_clip": 0.0122405, "auxiliary_loss_mlp": 0.01024476, "balance_loss_clip": 1.04729974, "balance_loss_mlp": 1.0173533, "epoch": 0.9964528347261468, "flos": 33108488991360.0, "grad_norm": 2.215568208095501, "language_loss": 0.63308001, "learning_rate": 1.2757767657989305e-10, "loss": 0.65556532, "num_input_tokens_seen": 178861515, "step": 8287, "time_per_iteration": 2.7125871181488037 }, { "auxiliary_loss_clip": 0.01219129, "auxiliary_loss_mlp": 0.01027652, "balance_loss_clip": 1.04864883, "balance_loss_mlp": 1.0209049, "epoch": 0.9965730776167859, "flos": 23109342589440.0, "grad_norm": 2.0780448475358346, "language_loss": 0.87128896, "learning_rate": 1.1893099941850948e-10, "loss": 0.89375675, "num_input_tokens_seen": 178880410, "step": 8288, "time_per_iteration": 2.6276051998138428 }, { "auxiliary_loss_clip": 0.01273331, "auxiliary_loss_mlp": 0.01023768, "balance_loss_clip": 1.04424393, "balance_loss_mlp": 1.01669025, "epoch": 0.996693320507425, "flos": 22965843755520.0, "grad_norm": 2.1174305018533968, "language_loss": 0.77343822, "learning_rate": 1.105877026252866e-10, "loss": 0.79640919, "num_input_tokens_seen": 178898740, "step": 8289, "time_per_iteration": 2.6987154483795166 }, { "auxiliary_loss_clip": 0.01175114, "auxiliary_loss_mlp": 0.01025437, "balance_loss_clip": 1.04835653, "balance_loss_mlp": 1.01870728, "epoch": 0.996813563398064, "flos": 13222740476160.0, "grad_norm": 2.2235566466693273, "language_loss": 0.72044861, "learning_rate": 1.0254778746565663e-10, "loss": 0.74245417, "num_input_tokens_seen": 178914015, "step": 8290, "time_per_iteration": 2.502345323562622 }, { "auxiliary_loss_clip": 0.01317268, "auxiliary_loss_mlp": 0.01024002, "balance_loss_clip": 1.0458169, "balance_loss_mlp": 1.01793671, "epoch": 0.9969338062887032, "flos": 14647855553280.0, "grad_norm": 7.8203287171458475, "language_loss": 0.73555142, "learning_rate": 9.481125515953259e-11, "loss": 0.75896418, "num_input_tokens_seen": 178932075, "step": 8291, "time_per_iteration": 2.6679654121398926 }, { "auxiliary_loss_clip": 0.01374295, "auxiliary_loss_mlp": 0.01025655, "balance_loss_clip": 1.04029775, "balance_loss_mlp": 1.0182426, "epoch": 0.9970540491793423, "flos": 25735741142400.0, "grad_norm": 1.9637415972133963, "language_loss": 0.79970729, "learning_rate": 8.737810688064228e-11, "loss": 0.82370681, "num_input_tokens_seen": 178951910, "step": 8292, "time_per_iteration": 3.8468434810638428 }, { "auxiliary_loss_clip": 0.01325393, "auxiliary_loss_mlp": 0.01028795, "balance_loss_clip": 1.04889655, "balance_loss_mlp": 1.02146935, "epoch": 0.9971742920699813, "flos": 21470236237440.0, "grad_norm": 2.0976999052592067, "language_loss": 0.79346097, "learning_rate": 8.024834375608414e-11, "loss": 0.81700283, "num_input_tokens_seen": 178970500, "step": 8293, "time_per_iteration": 2.699910879135132 }, { "auxiliary_loss_clip": 0.01059351, "auxiliary_loss_mlp": 0.01000988, "balance_loss_clip": 1.00548577, "balance_loss_mlp": 1.0001421, "epoch": 0.9972945349606205, "flos": 72211223629440.0, "grad_norm": 0.8138728572583651, "language_loss": 0.62743378, "learning_rate": 7.342196686788149e-11, "loss": 0.6480372, "num_input_tokens_seen": 179023665, "step": 8294, "time_per_iteration": 4.014256715774536 }, { "auxiliary_loss_clip": 0.01275454, "auxiliary_loss_mlp": 0.01028536, "balance_loss_clip": 1.05133569, "balance_loss_mlp": 1.02086532, "epoch": 0.9974147778512595, "flos": 19678293515520.0, "grad_norm": 2.121645470943436, "language_loss": 0.68304378, "learning_rate": 6.689897725142834e-11, "loss": 0.70608366, "num_input_tokens_seen": 179043140, "step": 8295, "time_per_iteration": 2.6812031269073486 }, { "auxiliary_loss_clip": 0.01273591, "auxiliary_loss_mlp": 0.01024333, "balance_loss_clip": 1.04457998, "balance_loss_mlp": 1.01793146, "epoch": 0.9975350207418986, "flos": 15960821391360.0, "grad_norm": 2.14495053396276, "language_loss": 0.88326216, "learning_rate": 6.067937589615545e-11, "loss": 0.90624142, "num_input_tokens_seen": 179061215, "step": 8296, "time_per_iteration": 2.618645191192627 }, { "auxiliary_loss_clip": 0.01173238, "auxiliary_loss_mlp": 0.009987, "balance_loss_clip": 1.00645947, "balance_loss_mlp": 0.99777621, "epoch": 0.9976552636325378, "flos": 59961879768960.0, "grad_norm": 0.7422660478103639, "language_loss": 0.57662177, "learning_rate": 5.476316374575241e-11, "loss": 0.59834117, "num_input_tokens_seen": 179124700, "step": 8297, "time_per_iteration": 4.100350618362427 }, { "auxiliary_loss_clip": 0.0117807, "auxiliary_loss_mlp": 0.0102415, "balance_loss_clip": 1.0519346, "balance_loss_mlp": 1.01674128, "epoch": 0.9977755065231768, "flos": 22487872452480.0, "grad_norm": 1.8891234748686514, "language_loss": 0.72461593, "learning_rate": 4.9150341697723476e-11, "loss": 0.74663818, "num_input_tokens_seen": 179144590, "step": 8298, "time_per_iteration": 2.5788798332214355 }, { "auxiliary_loss_clip": 0.01273367, "auxiliary_loss_mlp": 0.0102783, "balance_loss_clip": 1.04802251, "balance_loss_mlp": 1.02077579, "epoch": 0.9978957494138159, "flos": 26030280666240.0, "grad_norm": 3.797125607120145, "language_loss": 0.66769445, "learning_rate": 4.384091060338768e-11, "loss": 0.69070637, "num_input_tokens_seen": 179165060, "step": 8299, "time_per_iteration": 2.744431495666504 }, { "auxiliary_loss_clip": 0.01220883, "auxiliary_loss_mlp": 0.010309, "balance_loss_clip": 1.0481689, "balance_loss_mlp": 1.02425373, "epoch": 0.998015992304455, "flos": 22637835734400.0, "grad_norm": 2.1358664017492495, "language_loss": 0.73904645, "learning_rate": 3.883487126810081e-11, "loss": 0.76156431, "num_input_tokens_seen": 179184320, "step": 8300, "time_per_iteration": 2.6785454750061035 }, { "auxiliary_loss_clip": 0.01216942, "auxiliary_loss_mlp": 0.01021535, "balance_loss_clip": 1.04495716, "balance_loss_mlp": 1.01470459, "epoch": 0.9981362351950941, "flos": 18223444955520.0, "grad_norm": 1.7178017199499926, "language_loss": 0.79270673, "learning_rate": 3.41322244516995e-11, "loss": 0.81509149, "num_input_tokens_seen": 179202265, "step": 8301, "time_per_iteration": 2.580578565597534 }, { "auxiliary_loss_clip": 0.01366713, "auxiliary_loss_mlp": 0.01018442, "balance_loss_clip": 1.04535782, "balance_loss_mlp": 1.01188254, "epoch": 0.9982564780857331, "flos": 33474095573760.0, "grad_norm": 2.6372342342755903, "language_loss": 0.63270956, "learning_rate": 2.9732970866946925e-11, "loss": 0.65656114, "num_input_tokens_seen": 179222145, "step": 8302, "time_per_iteration": 3.6992685794830322 }, { "auxiliary_loss_clip": 0.01312546, "auxiliary_loss_mlp": 0.01026738, "balance_loss_clip": 1.03934407, "balance_loss_mlp": 1.01885843, "epoch": 0.9983767209763723, "flos": 15523465392000.0, "grad_norm": 2.4303722289168506, "language_loss": 0.78319311, "learning_rate": 2.563711118175327e-11, "loss": 0.80658597, "num_input_tokens_seen": 179239030, "step": 8303, "time_per_iteration": 2.6560866832733154 }, { "auxiliary_loss_clip": 0.01318774, "auxiliary_loss_mlp": 0.01022629, "balance_loss_clip": 1.04506075, "balance_loss_mlp": 1.01578021, "epoch": 0.9984969638670114, "flos": 19974377324160.0, "grad_norm": 1.7953875248328464, "language_loss": 0.83726948, "learning_rate": 2.184464601717728e-11, "loss": 0.8606835, "num_input_tokens_seen": 179257345, "step": 8304, "time_per_iteration": 2.73249888420105 }, { "auxiliary_loss_clip": 0.01225098, "auxiliary_loss_mlp": 0.01025899, "balance_loss_clip": 1.05012465, "balance_loss_mlp": 1.01913428, "epoch": 0.9986172067576504, "flos": 20375750874240.0, "grad_norm": 3.254746145386755, "language_loss": 0.77854997, "learning_rate": 1.8355575948758585e-11, "loss": 0.80105996, "num_input_tokens_seen": 179275330, "step": 8305, "time_per_iteration": 2.5849852561950684 }, { "auxiliary_loss_clip": 0.01274986, "auxiliary_loss_mlp": 0.01025668, "balance_loss_clip": 1.04343617, "balance_loss_mlp": 1.01828897, "epoch": 0.9987374496482896, "flos": 23727903724800.0, "grad_norm": 2.2461773021851252, "language_loss": 0.73311555, "learning_rate": 1.5169901505407424e-11, "loss": 0.75612211, "num_input_tokens_seen": 179292395, "step": 8306, "time_per_iteration": 2.6472344398498535 }, { "auxiliary_loss_clip": 0.01268792, "auxiliary_loss_mlp": 0.01026516, "balance_loss_clip": 1.0458802, "balance_loss_mlp": 1.01963139, "epoch": 0.9988576925389286, "flos": 25044029959680.0, "grad_norm": 2.3257123070181205, "language_loss": 0.73997712, "learning_rate": 1.228762317073695e-11, "loss": 0.76293015, "num_input_tokens_seen": 179311225, "step": 8307, "time_per_iteration": 2.6951401233673096 }, { "auxiliary_loss_clip": 0.01278376, "auxiliary_loss_mlp": 0.01024341, "balance_loss_clip": 1.05009913, "balance_loss_mlp": 1.01773131, "epoch": 0.9989779354295677, "flos": 31285627637760.0, "grad_norm": 3.7833897835449197, "language_loss": 0.78899413, "learning_rate": 9.70874138195299e-12, "loss": 0.81202132, "num_input_tokens_seen": 179333135, "step": 8308, "time_per_iteration": 2.794603109359741 }, { "auxiliary_loss_clip": 0.0117323, "auxiliary_loss_mlp": 0.0101993, "balance_loss_clip": 1.04802775, "balance_loss_mlp": 1.01255655, "epoch": 0.9990981783202069, "flos": 19573398823680.0, "grad_norm": 1.7461949368624077, "language_loss": 0.74768525, "learning_rate": 7.433256530076093e-12, "loss": 0.7696169, "num_input_tokens_seen": 179353090, "step": 8309, "time_per_iteration": 2.569227933883667 }, { "auxiliary_loss_clip": 0.01376099, "auxiliary_loss_mlp": 0.01024656, "balance_loss_clip": 1.04195631, "balance_loss_mlp": 1.01818347, "epoch": 0.9992184212108459, "flos": 17199667514880.0, "grad_norm": 4.343984170375211, "language_loss": 0.76254237, "learning_rate": 5.46116896038562e-12, "loss": 0.78654993, "num_input_tokens_seen": 179367500, "step": 8310, "time_per_iteration": 2.71580171585083 }, { "auxiliary_loss_clip": 0.01276571, "auxiliary_loss_mlp": 0.0102613, "balance_loss_clip": 1.04806483, "balance_loss_mlp": 1.01929939, "epoch": 0.999338664101485, "flos": 46497853681920.0, "grad_norm": 1.981616554235785, "language_loss": 0.61754906, "learning_rate": 3.792478972197699e-12, "loss": 0.64057606, "num_input_tokens_seen": 179388085, "step": 8311, "time_per_iteration": 2.8900208473205566 }, { "auxiliary_loss_clip": 0.01169913, "auxiliary_loss_mlp": 0.01026209, "balance_loss_clip": 1.04646969, "balance_loss_mlp": 1.01954818, "epoch": 0.9994589069921241, "flos": 15158253859200.0, "grad_norm": 2.1434060406161652, "language_loss": 0.70507061, "learning_rate": 2.4271868181990895e-12, "loss": 0.72703189, "num_input_tokens_seen": 179405250, "step": 8312, "time_per_iteration": 2.543121337890625 }, { "auxiliary_loss_clip": 0.01223641, "auxiliary_loss_mlp": 0.01027369, "balance_loss_clip": 1.04652345, "balance_loss_mlp": 1.0207653, "epoch": 0.9995791498827632, "flos": 12531460256640.0, "grad_norm": 2.2964391667372563, "language_loss": 0.80915999, "learning_rate": 1.3652927060014973e-12, "loss": 0.83167011, "num_input_tokens_seen": 179420845, "step": 8313, "time_per_iteration": 2.5957930088043213 }, { "auxiliary_loss_clip": 0.01329106, "auxiliary_loss_mlp": 0.01024161, "balance_loss_clip": 1.04728079, "balance_loss_mlp": 1.01719308, "epoch": 0.9996993927734023, "flos": 19245175320960.0, "grad_norm": 2.177331923082963, "language_loss": 0.63700873, "learning_rate": 6.067967965872612e-13, "loss": 0.66054142, "num_input_tokens_seen": 179440455, "step": 8314, "time_per_iteration": 2.655853033065796 }, { "auxiliary_loss_clip": 0.01318594, "auxiliary_loss_mlp": 0.01025994, "balance_loss_clip": 1.04592776, "balance_loss_mlp": 1.01957464, "epoch": 0.9998196356640414, "flos": 62952804518400.0, "grad_norm": 1.7899771234736448, "language_loss": 0.77230346, "learning_rate": 1.5169920497548615e-13, "loss": 0.79574937, "num_input_tokens_seen": 179465075, "step": 8315, "time_per_iteration": 3.083948850631714 }, { "auxiliary_loss_clip": 0.01171628, "auxiliary_loss_mlp": 0.01011211, "balance_loss_clip": 1.02671599, "balance_loss_mlp": 1.00722528, "epoch": 0.9999398785546805, "flos": 50922375073920.0, "grad_norm": 1.1213571057662117, "language_loss": 0.54982269, "learning_rate": 0.0, "loss": 0.5716511, "num_input_tokens_seen": 179513955, "step": 8316, "time_per_iteration": 3.182302474975586 }, { "epoch": 0.9999398785546805, "num_input_tokens_seen": 179513955, "step": 8316, "total_flos": 6.996749092776837e+17, "train_loss": 0.7923422268286756, "train_runtime": 25183.1629, "train_samples_per_second": 13.209, "train_steps_per_second": 0.33 } ], "logging_steps": 1.0, "max_steps": 8316, "num_input_tokens_seen": 179513955, "num_train_epochs": 1, "save_steps": 1664, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.996749092776837e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }